featkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. featkit-0.1.0/.github/workflows/ci.yml +70 -0
  2. featkit-0.1.0/.github/workflows/docs.yml +28 -0
  3. featkit-0.1.0/.github/workflows/publish.yml +54 -0
  4. featkit-0.1.0/.gitignore +15 -0
  5. featkit-0.1.0/CHANGELOG.md +8 -0
  6. featkit-0.1.0/LICENSE +21 -0
  7. featkit-0.1.0/PKG-INFO +140 -0
  8. featkit-0.1.0/README.md +75 -0
  9. featkit-0.1.0/docs/.gitkeep +0 -0
  10. featkit-0.1.0/docs/examples.md +245 -0
  11. featkit-0.1.0/docs/general_plan.md +488 -0
  12. featkit-0.1.0/docs/index.md +5 -0
  13. featkit-0.1.0/docs/quickstart.md +94 -0
  14. featkit-0.1.0/mkdocs.yml +42 -0
  15. featkit-0.1.0/pyproject.toml +86 -0
  16. featkit-0.1.0/src/featkit/__init__.py +1 -0
  17. featkit-0.1.0/src/featkit/builders/.gitkeep +0 -0
  18. featkit-0.1.0/src/featkit/builders/__init__.py +0 -0
  19. featkit-0.1.0/src/featkit/builders/distributional_space.py +77 -0
  20. featkit-0.1.0/src/featkit/builders/pivot_space.py +102 -0
  21. featkit-0.1.0/src/featkit/builders/temporal_space.py +86 -0
  22. featkit-0.1.0/src/featkit/config.py +38 -0
  23. featkit-0.1.0/src/featkit/contracts/__init__.py +1 -0
  24. featkit-0.1.0/src/featkit/contracts/measurement/.gitkeep +0 -0
  25. featkit-0.1.0/src/featkit/contracts/measurement/__init__.py +27 -0
  26. featkit-0.1.0/src/featkit/contracts/measurement/base.py +47 -0
  27. featkit-0.1.0/src/featkit/contracts/measurement/defaults.py +117 -0
  28. featkit-0.1.0/src/featkit/contracts/output/.gitkeep +0 -0
  29. featkit-0.1.0/src/featkit/contracts/output/__init__.py +19 -0
  30. featkit-0.1.0/src/featkit/contracts/output/base.py +36 -0
  31. featkit-0.1.0/src/featkit/contracts/output/defaults.py +80 -0
  32. featkit-0.1.0/src/featkit/dataset/.gitkeep +0 -0
  33. featkit-0.1.0/src/featkit/dataset/__init__.py +0 -0
  34. featkit-0.1.0/src/featkit/dataset/base.py +120 -0
  35. featkit-0.1.0/src/featkit/enums.py +110 -0
  36. featkit-0.1.0/src/featkit/fields/.gitkeep +0 -0
  37. featkit-0.1.0/src/featkit/fields/__init__.py +9 -0
  38. featkit-0.1.0/src/featkit/fields/base.py +48 -0
  39. featkit-0.1.0/src/featkit/fields/categorical_field.py +55 -0
  40. featkit-0.1.0/src/featkit/fields/id_field.py +14 -0
  41. featkit-0.1.0/src/featkit/fields/measurement_field.py +42 -0
  42. featkit-0.1.0/src/featkit/fields/time_field.py +43 -0
  43. featkit-0.1.0/src/featkit/generators/__init__.py +0 -0
  44. featkit-0.1.0/src/featkit/generators/base.py +171 -0
  45. featkit-0.1.0/src/featkit/generators/output.py +118 -0
  46. featkit-0.1.0/src/featkit/generators/pyspark/.gitkeep +0 -0
  47. featkit-0.1.0/src/featkit/generators/pyspark/__init__.py +0 -0
  48. featkit-0.1.0/src/featkit/generators/pyspark/databricks.py +448 -0
  49. featkit-0.1.0/src/featkit/generators/sql/.gitkeep +0 -0
  50. featkit-0.1.0/src/featkit/generators/sql/__init__.py +0 -0
  51. featkit-0.1.0/src/featkit/generators/sql/base.py +496 -0
  52. featkit-0.1.0/src/featkit/generators/sql/databricks.py +19 -0
  53. featkit-0.1.0/src/featkit/generators/sql/snowflake.py +19 -0
  54. featkit-0.1.0/src/featkit/generators/sql/spark_sql.py +19 -0
  55. featkit-0.1.0/src/featkit/layer2/.gitkeep +0 -0
  56. featkit-0.1.0/src/featkit/layer2/__init__.py +0 -0
  57. featkit-0.1.0/src/featkit/layer2/base.py +86 -0
  58. featkit-0.1.0/src/featkit/layer2/distributional.py +51 -0
  59. featkit-0.1.0/src/featkit/layer2/pivoted.py +63 -0
  60. featkit-0.1.0/src/featkit/layer3/.gitkeep +0 -0
  61. featkit-0.1.0/src/featkit/layer3/__init__.py +0 -0
  62. featkit-0.1.0/src/featkit/layer3/temporal_feature.py +87 -0
  63. featkit-0.1.0/src/featkit/pipeline.py +63 -0
  64. featkit-0.1.0/tests/__init__.py +0 -0
  65. featkit-0.1.0/tests/test_builders.py +608 -0
  66. featkit-0.1.0/tests/test_contracts.py +173 -0
  67. featkit-0.1.0/tests/test_enums.py +138 -0
  68. featkit-0.1.0/tests/test_fields.py +467 -0
  69. featkit-0.1.0/tests/test_generators/.gitkeep +0 -0
  70. featkit-0.1.0/tests/test_generators/__init__.py +0 -0
  71. featkit-0.1.0/tests/test_generators/test_base.py +432 -0
  72. featkit-0.1.0/tests/test_generators/test_pyspark.py +366 -0
  73. featkit-0.1.0/tests/test_generators/test_sql_databricks.py +316 -0
  74. featkit-0.1.0/tests/test_generators/test_sql_snowflake.py +421 -0
  75. featkit-0.1.0/tests/test_integration.py +512 -0
  76. featkit-0.1.0/tests/test_layer2.py +401 -0
  77. featkit-0.1.0/tests/test_layer3.py +202 -0
  78. featkit-0.1.0/tests/test_output_contracts.py +152 -0
  79. featkit-0.1.0/tests/test_pipeline.py +266 -0
@@ -0,0 +1,70 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ name: Lint
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+
20
+ - name: Install lint dependencies
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install ruff
24
+
25
+ - name: Run Ruff lint
26
+ run: ruff check .
27
+
28
+ - name: Check formatting
29
+ run: ruff format --check .
30
+
31
+ typecheck:
32
+ name: Type Check
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - uses: actions/checkout@v4
36
+
37
+ - uses: actions/setup-python@v5
38
+ with:
39
+ python-version: "3.12"
40
+
41
+ - name: Install dependencies
42
+ run: |
43
+ python -m pip install --upgrade pip
44
+ pip install -e ".[dev]"
45
+
46
+ - name: Run mypy
47
+ run: mypy .
48
+
49
+ test:
50
+ name: Test (Python ${{ matrix.python-version }})
51
+ runs-on: ubuntu-latest
52
+ strategy:
53
+ fail-fast: false
54
+ matrix:
55
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
56
+
57
+ steps:
58
+ - uses: actions/checkout@v4
59
+
60
+ - uses: actions/setup-python@v5
61
+ with:
62
+ python-version: ${{ matrix.python-version }}
63
+
64
+ - name: Install dependencies
65
+ run: |
66
+ python -m pip install --upgrade pip
67
+ pip install -e ".[dev]"
68
+
69
+ - name: Run tests
70
+ run: pytest
@@ -0,0 +1,28 @@
1
+ name: Docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+
7
+ permissions:
8
+ contents: write
9
+
10
+ jobs:
11
+ deploy:
12
+ name: Build and deploy docs
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install -e ".[docs]"
26
+
27
+ - name: Deploy to GitHub Pages
28
+ run: mkdocs gh-deploy --force
@@ -0,0 +1,54 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+
20
+ - name: Install build tools
21
+ run: |
22
+ python -m pip install --upgrade pip
23
+ pip install build twine
24
+
25
+ - name: Build wheel and sdist
26
+ run: python -m build
27
+
28
+ - name: Check distribution
29
+ run: twine check dist/*
30
+
31
+ - name: Upload distribution artifacts
32
+ uses: actions/upload-artifact@v4
33
+ with:
34
+ name: dist
35
+ path: dist/
36
+
37
+ publish:
38
+ name: Publish to PyPI
39
+ needs: build
40
+ runs-on: ubuntu-latest
41
+ environment: pypi
42
+ permissions:
43
+ id-token: write
44
+ contents: read
45
+
46
+ steps:
47
+ - name: Download distribution artifacts
48
+ uses: actions/download-artifact@v4
49
+ with:
50
+ name: dist
51
+ path: dist/
52
+
53
+ - name: Publish to PyPI
54
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,15 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ .venv/
8
+ venv/
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+ .pytest_cache/
12
+ .coverage
13
+ htmlcov/
14
+ site/
15
+ *.log
@@ -0,0 +1,8 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
featkit-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mirko
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
featkit-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,140 @@
1
+ Metadata-Version: 2.4
2
+ Name: featkit
3
+ Version: 0.1.0
4
+ Summary: featkit — automated feature store generation from relational facts tables
5
+ Project-URL: Repository, https://github.com/Mirkiux/featkit
6
+ Project-URL: Documentation, https://mirkiux.github.io/featkit
7
+ Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
8
+ Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
9
+ Author: Mirko
10
+ License: MIT License
11
+
12
+ Copyright (c) 2026 Mirko
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ License-File: LICENSE
32
+ Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
33
+ Classifier: Development Status :: 2 - Pre-Alpha
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Topic :: Scientific/Engineering
43
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
44
+ Requires-Python: >=3.10
45
+ Requires-Dist: sqlglot>=23.0
46
+ Provides-Extra: databricks
47
+ Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
48
+ Provides-Extra: dev
49
+ Requires-Dist: build>=1.0; extra == 'dev'
50
+ Requires-Dist: hatch>=1.9; extra == 'dev'
51
+ Requires-Dist: mypy>=1.0; extra == 'dev'
52
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
53
+ Requires-Dist: pytest>=7.0; extra == 'dev'
54
+ Requires-Dist: ruff>=0.4; extra == 'dev'
55
+ Requires-Dist: twine>=5.0; extra == 'dev'
56
+ Provides-Extra: docs
57
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
58
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
59
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
60
+ Provides-Extra: ibis
61
+ Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
62
+ Provides-Extra: spark
63
+ Requires-Dist: pyspark>=3.4; extra == 'spark'
64
+ Description-Content-Type: text/markdown
65
+
66
+ # featkit
67
+
68
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
69
+
70
+ It implements a three-layer architecture:
71
+
72
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
73
+ - **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
74
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
75
+
76
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
77
+
78
+ ## Key concepts
79
+
80
+ | Layer | What it does |
81
+ |---|---|
82
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
83
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
84
+ | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
85
+
86
+ ## Installation
87
+
88
+ ```bash
89
+ pip install featkit
90
+ ```
91
+
92
+ ## Quickstart
93
+
94
+ ```python
95
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
96
+ from featkit.dataset.base import SimpleDataset
97
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
98
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
99
+ from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
100
+
101
+ # Define schema
102
+ fields = [
103
+ IDField(name="ID_CLIENTE"),
104
+ TimeField(name="PERIODO",
105
+ source_granularity=TimeGranularity.MONTHLY,
106
+ target_granularity=TimeGranularity.MONTHLY),
107
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
108
+ allowed_values=["RETAIL", "CORP", "PYME"]),
109
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
110
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
111
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
112
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
113
+ ]
114
+
115
+ dataset = SimpleDataset(
116
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
117
+ fields=fields,
118
+ )
119
+
120
+ config = FeatureStoreConfig(
121
+ dataset=dataset,
122
+ output_schema="MY_DB.MY_SCHEMA",
123
+ output_table_prefix="FS",
124
+ time_windows=[3, 6, 9, 12],
125
+ )
126
+
127
+ pipeline = FeatureStorePipeline(config).build()
128
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
129
+
130
+ output.save("./output")
131
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
132
+ ```
133
+
134
+ ## Architecture
135
+
136
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
137
+
138
+ ## License
139
+
140
+ MIT
@@ -0,0 +1,75 @@
1
+ # featkit
2
+
3
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
4
+
5
+ It implements a three-layer architecture:
6
+
7
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
8
+ - **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
9
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
10
+
11
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
12
+
13
+ ## Key concepts
14
+
15
+ | Layer | What it does |
16
+ |---|---|
17
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
18
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
19
+ | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install featkit
25
+ ```
26
+
27
+ ## Quickstart
28
+
29
+ ```python
30
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
31
+ from featkit.dataset.base import SimpleDataset
32
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
33
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
34
+ from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
35
+
36
+ # Define schema
37
+ fields = [
38
+ IDField(name="ID_CLIENTE"),
39
+ TimeField(name="PERIODO",
40
+ source_granularity=TimeGranularity.MONTHLY,
41
+ target_granularity=TimeGranularity.MONTHLY),
42
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
43
+ allowed_values=["RETAIL", "CORP", "PYME"]),
44
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
45
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
46
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
47
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
48
+ ]
49
+
50
+ dataset = SimpleDataset(
51
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
52
+ fields=fields,
53
+ )
54
+
55
+ config = FeatureStoreConfig(
56
+ dataset=dataset,
57
+ output_schema="MY_DB.MY_SCHEMA",
58
+ output_table_prefix="FS",
59
+ time_windows=[3, 6, 9, 12],
60
+ )
61
+
62
+ pipeline = FeatureStorePipeline(config).build()
63
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
64
+
65
+ output.save("./output")
66
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
67
+ ```
68
+
69
+ ## Architecture
70
+
71
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
72
+
73
+ ## License
74
+
75
+ MIT
File without changes
@@ -0,0 +1,245 @@
1
+ # Examples
2
+
3
+ ## Example 1 — Pivot categoricals with marginals
4
+
5
+ Generate features for every category value plus the unconditional total (∅ marginal).
6
+
7
+ ```python
8
+ from featkit.config import FeatureStoreConfig
9
+ from featkit.dataset.base import SimpleDataset
10
+ from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
11
+ from featkit.fields.categorical_field import CategoricalField
12
+ from featkit.fields.id_field import IDField
13
+ from featkit.fields.measurement_field import MeasurementField
14
+ from featkit.fields.time_field import TimeField
15
+ from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
16
+ from featkit.pipeline import FeatureStorePipeline
17
+
18
+ ds = SimpleDataset(
19
+ "mydb.silver_sales",
20
+ [
21
+ IDField("client_id"),
22
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
23
+ MeasurementField("amount", MeasurementType.MONTO),
24
+ MeasurementField("txn_count", MeasurementType.CANTIDAD),
25
+ CategoricalField(
26
+ "channel",
27
+ CategoricalTreatment.PIVOT,
28
+ allowed_values=["branch", "online", "mobile"],
29
+ ),
30
+ CategoricalField(
31
+ "product",
32
+ CategoricalTreatment.PIVOT,
33
+ allowed_values=["current_account", "savings", "loan"],
34
+ ),
35
+ ],
36
+ )
37
+
38
+ cfg = FeatureStoreConfig(
39
+ dataset=ds,
40
+ output_schema="analytics",
41
+ output_table_prefix="feat_",
42
+ time_windows=[3, 6, 12],
43
+ include_marginals=True, # include ∅ (unconditional) totals
44
+ )
45
+
46
+ pipeline = FeatureStorePipeline(config=cfg).build()
47
+ print(f"Layer 2A columns : {len(pipeline.layer2a)}")
48
+ print(f"Layer 3 features: {len(pipeline.layer3)}")
49
+
50
+ result = SnowflakeSQLCodeGenerator().generate(pipeline)
51
+ result.save("output/snowflake/")
52
+ ```
53
+
54
+
55
+ ## Example 2 — Distributional metrics
56
+
57
+ Measure concentration, diversity, and dominant value of a categorical over time.
58
+
59
+ ```python
60
+ from featkit.enums import CategoricalTreatment, DistributionalMetric, MeasurementType, TimeGranularity
61
+ from featkit.fields.categorical_field import CategoricalField
62
+
63
+ # All five distributional metrics
64
+ region_field = CategoricalField(
65
+ "region",
66
+ CategoricalTreatment.DISTRIBUTIONAL,
67
+ distributional_metrics=[
68
+ DistributionalMetric.ENTROPY, # diversity index
69
+ DistributionalMetric.HHI, # Herfindahl–Hirschman index
70
+ DistributionalMetric.DOMINANT_PROPORTION, # share of the top category
71
+ DistributionalMetric.MODE, # most frequent category
72
+ DistributionalMetric.COUNT, # number of active categories
73
+ ],
74
+ )
75
+ ```
76
+
77
+ The generated `feat_layer2b` table will have one column per
78
+ `(categorical × measurement × aggregator × metric)` combination.
79
+ For `ENTROPY` and `HHI` the output type is `NUMERIC`; for `MODE` it is
80
+ `CATEGORICAL`, which restricts the set of valid temporal operators
81
+ (`ULT_MES`, `PREV_MES`, `REC` only).
82
+
83
+
84
+ ## Example 3 — Mixed pivot and distributional on the same field
85
+
86
+ Use `BOTH` treatment to get pivot columns *and* distributional metrics from a
87
+ single categorical field.
88
+
89
+ ```python
90
+ from featkit.enums import CategoricalTreatment, DistributionalMetric
91
+
92
+ product_field = CategoricalField(
93
+ "product_type",
94
+ CategoricalTreatment.BOTH,
95
+ allowed_values=["A", "B", "C"],
96
+ distributional_metrics=[DistributionalMetric.ENTROPY, DistributionalMetric.HHI],
97
+ )
98
+ ```
99
+
100
+
101
+ ## Example 4 — Multiple entity keys
102
+
103
+ The pipeline supports composite primary keys. List every ID field; all generated
104
+ `GROUP BY` and join clauses will include all of them.
105
+
106
+ ```python
107
+ from featkit.fields.id_field import IDField
108
+
109
+ ds = SimpleDataset(
110
+ "mydb.silver_transactions",
111
+ [
112
+ IDField("country_code"),
113
+ IDField("client_id"),
114
+ # ... other fields
115
+ ],
116
+ )
117
+ ```
118
+
119
+
120
+ ## Example 5 — Targeting Databricks SQL
121
+
122
+ ```python
123
+ from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
124
+
125
+ result = DatabricksSQLCodeGenerator().generate(pipeline)
126
+
127
+ # Databricks uses backtick quoting; syntax is otherwise identical to Snowflake
128
+ print(result.code.sql[:200])
129
+ result.save("output/databricks/")
130
+ ```
131
+
132
+
133
+ ## Example 6 — Generating a PySpark script
134
+
135
+ ```python
136
+ from featkit.generators.pyspark.databricks import PySparkCodeGenerator
137
+
138
+ result = PySparkCodeGenerator().generate(pipeline)
139
+
140
+ # result.code is a PySparkOutput; .code contains the full Python script
141
+ script = result.code.code
142
+ print(script[:500])
143
+
144
+ # Save — writes script.py instead of script.sql
145
+ result.save("output/pyspark/")
146
+ ```
147
+
148
+ The generated script is a self-contained Python file. Execute it by submitting
149
+ it to a Databricks job or a `spark-submit` invocation:
150
+
151
+ ```bash
152
+ databricks jobs submit --existing-cluster-id <id> --python-file output/pyspark/script.py
153
+ ```
154
+
155
+
156
+ ## Example 7 — Operators override
157
+
158
+ Restrict which temporal operators are applied to each Layer 2 output type.
159
+ Useful when you only need a subset of features.
160
+
161
+ ```python
162
+ from featkit.enums import Layer2OutputType, TemporalOperator
163
+ from featkit.config import FeatureStoreConfig
164
+
165
+ cfg = FeatureStoreConfig(
166
+ dataset=ds,
167
+ output_schema="analytics",
168
+ output_table_prefix="feat_",
169
+ time_windows=[3, 6],
170
+ operators_override={
171
+ # Only rolling averages, rolling sums, and latest-month snapshots for numeric columns
172
+ Layer2OutputType.NUMERIC: [
173
+ TemporalOperator.PROM_U,
174
+ TemporalOperator.SUM_U,
175
+ TemporalOperator.ULT_MES,
176
+ ],
177
+ },
178
+ )
179
+ ```
180
+
181
+
182
+ ## Example 8 — Inspecting the DAG
183
+
184
+ ```python
185
+ import json
186
+
187
+ result = SnowflakeSQLCodeGenerator().generate(pipeline)
188
+
189
+ # Programmatic access
190
+ for node in result.dag.nodes:
191
+ print(f"{node.step_name:35s} depends on: {node.depends_on}")
192
+
193
+ # JSON (suitable for CI artefacts or a lineage tool)
194
+ print(json.dumps(json.loads(result.dag.to_json()), indent=2))
195
+
196
+ # Mermaid diagram (paste into any Mermaid renderer)
197
+ print(result.mermaid)
198
+ ```
199
+
200
+ Expected output of the programmatic-access loop:
201
+
202
+ ```
203
+ facts_table depends on: []
204
+ mob_table depends on: ['facts_table']
205
+ layer2a_pivot depends on: ['facts_table']
206
+ layer2b_distributional_ctes depends on: ['facts_table']
207
+ layer2_join depends on: ['layer2a_pivot', 'layer2b_distributional_ctes']
208
+ layer3_temporal depends on: ['layer2_join', 'mob_table']
209
+ final_output depends on: ['layer2_join', 'layer3_temporal']
210
+ ```
211
+
212
+
213
+ ## Example 9 — Dynamic categorical domain resolution
214
+
215
+ When the categorical domain is not known at configuration time, supply a
216
+ `domain_resolver` callable. featkit will call it at `build()` time; no
217
+ database adapter or executor is built into the framework.
218
+
219
+ ```python
220
+ from featkit.builders.pivot_space import PivotSpaceBuilder
221
+ from featkit.config import FeatureStoreConfig
222
+
223
+ # Provide your own query executor
224
+ def resolve_domain(field):
225
+ return conn.execute(
226
+ f"SELECT DISTINCT {field.name} FROM mydb.silver_transactions"
227
+ ).fetchall()
228
+
229
+ ds = SimpleDataset(
230
+ "mydb.silver_transactions",
231
+ [
232
+ IDField("client_id"),
233
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
234
+ MeasurementField("amount", MeasurementType.MONTO),
235
+ CategoricalField("segment", CategoricalTreatment.PIVOT), # no allowed_values
236
+ ],
237
+ )
238
+
239
+ # Build the pivot space manually with the resolver, then pass to the config
240
+ pivot_cols = PivotSpaceBuilder(
241
+ dataset=ds,
242
+ include_marginals=True,
243
+ domain_resolver=resolve_domain,
244
+ ).build()
245
+ ```