featkit 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {featkit-0.4.1 → featkit-0.4.3}/CHANGELOG.md +12 -0
  2. featkit-0.4.3/PKG-INFO +329 -0
  3. featkit-0.4.3/README.md +261 -0
  4. {featkit-0.4.1 → featkit-0.4.3}/pyproject.toml +2 -2
  5. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/ratio_space.py +24 -6
  6. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/config.py +15 -1
  7. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/enums.py +16 -0
  8. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/databricks.py +5 -6
  9. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/base.py +2 -2
  10. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/pipeline.py +1 -1
  11. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_sql_snowflake.py +53 -0
  12. {featkit-0.4.1 → featkit-0.4.3}/tests/test_ratio.py +119 -0
  13. featkit-0.4.1/PKG-INFO +0 -143
  14. featkit-0.4.1/README.md +0 -75
  15. {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/auto-tag.yml +0 -0
  16. {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/ci.yml +0 -0
  17. {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/docs.yml +0 -0
  18. {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/publish.yml +0 -0
  19. {featkit-0.4.1 → featkit-0.4.3}/.gitignore +0 -0
  20. {featkit-0.4.1 → featkit-0.4.3}/LICENSE +0 -0
  21. {featkit-0.4.1 → featkit-0.4.3}/docs/.gitkeep +0 -0
  22. {featkit-0.4.1 → featkit-0.4.3}/docs/example_databricks_notebook.md +0 -0
  23. {featkit-0.4.1 → featkit-0.4.3}/docs/examples.md +0 -0
  24. {featkit-0.4.1 → featkit-0.4.3}/docs/general_plan.md +0 -0
  25. {featkit-0.4.1 → featkit-0.4.3}/docs/index.md +0 -0
  26. {featkit-0.4.1 → featkit-0.4.3}/docs/quickstart.md +0 -0
  27. {featkit-0.4.1 → featkit-0.4.3}/mkdocs.yml +0 -0
  28. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/__init__.py +0 -0
  29. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/.gitkeep +0 -0
  30. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/__init__.py +0 -0
  31. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/distributional_space.py +0 -0
  32. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/pivot_space.py +0 -0
  33. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/temporal_space.py +0 -0
  34. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/__init__.py +0 -0
  35. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/.gitkeep +0 -0
  36. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/__init__.py +0 -0
  37. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/base.py +0 -0
  38. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/defaults.py +0 -0
  39. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/.gitkeep +0 -0
  40. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/__init__.py +0 -0
  41. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/base.py +0 -0
  42. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/defaults.py +0 -0
  43. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/.gitkeep +0 -0
  44. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/__init__.py +0 -0
  45. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/base.py +0 -0
  46. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/__init__.py +0 -0
  47. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/__init__.py +0 -0
  48. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/base.py +0 -0
  49. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/databricks_adapter.py +0 -0
  50. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/databricks_notebook_adapter.py +0 -0
  51. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/mock_adapter.py +0 -0
  52. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/spark_adapter.py +0 -0
  53. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/sqlalchemy_adapter.py +0 -0
  54. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/domain_resolver.py +0 -0
  55. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/.gitkeep +0 -0
  56. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/__init__.py +0 -0
  57. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/base.py +0 -0
  58. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/categorical_field.py +0 -0
  59. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/id_field.py +0 -0
  60. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/measurement_field.py +0 -0
  61. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/time_field.py +0 -0
  62. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/__init__.py +0 -0
  63. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/base.py +0 -0
  64. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/output.py +0 -0
  65. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/.gitkeep +0 -0
  66. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/__init__.py +0 -0
  67. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/.gitkeep +0 -0
  68. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/__init__.py +0 -0
  69. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/databricks.py +0 -0
  70. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/snowflake.py +0 -0
  71. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/spark_sql.py +0 -0
  72. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/.gitkeep +0 -0
  73. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/__init__.py +0 -0
  74. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/base.py +0 -0
  75. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/distributional.py +0 -0
  76. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/pivoted.py +0 -0
  77. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/ratio.py +0 -0
  78. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/.gitkeep +0 -0
  79. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/__init__.py +0 -0
  80. {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/temporal_feature.py +0 -0
  81. {featkit-0.4.1 → featkit-0.4.3}/tests/__init__.py +0 -0
  82. {featkit-0.4.1 → featkit-0.4.3}/tests/test_builders.py +0 -0
  83. {featkit-0.4.1 → featkit-0.4.3}/tests/test_contracts.py +0 -0
  84. {featkit-0.4.1 → featkit-0.4.3}/tests/test_enums.py +0 -0
  85. {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/__init__.py +0 -0
  86. {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/test_adapters.py +0 -0
  87. {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/test_domain_resolver.py +0 -0
  88. {featkit-0.4.1 → featkit-0.4.3}/tests/test_fields.py +0 -0
  89. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/.gitkeep +0 -0
  90. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/__init__.py +0 -0
  91. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_base.py +0 -0
  92. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_pyspark.py +0 -0
  93. {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_sql_databricks.py +0 -0
  94. {featkit-0.4.1 → featkit-0.4.3}/tests/test_integration.py +0 -0
  95. {featkit-0.4.1 → featkit-0.4.3}/tests/test_layer2.py +0 -0
  96. {featkit-0.4.1 → featkit-0.4.3}/tests/test_layer3.py +0 -0
  97. {featkit-0.4.1 → featkit-0.4.3}/tests/test_output_contracts.py +0 -0
  98. {featkit-0.4.1 → featkit-0.4.3}/tests/test_pipeline.py +0 -0
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.3] - 2026-06-30
11
+
12
+ ### Added
13
+ - `RatioMode` enum with two values: `ALL_PROJECTIONS` (default, existing behaviour) and `GLOBAL_TOTAL` (restricts Layer 2C denominators to the single all-∅ grand-total column, producing one ratio per numerator representing its share of the portfolio total).
14
+ - `FeatureStoreConfig.ratio_mode` parameter (default `RatioMode.ALL_PROJECTIONS`) to select the denominator strategy for `RatioSpaceBuilder`.
15
+
16
+ ## [0.4.2] - 2026-06-30
17
+
18
+ ### Fixed
19
+ - `FREQ` operator now counts only periods where the value is non-null **and strictly greater than 0** (previously counted any non-null value).
20
+ - `XM` operator now returns `1` only when **every** period in the time window has a non-null and strictly positive value, `0` otherwise (previously returned a raw count identical to FREQ). Both the SQL and PySpark generators are updated.
21
+
10
22
  ## [0.4.1] - 2026-06-09
11
23
 
12
24
  ### Fixed
featkit-0.4.3/PKG-INFO ADDED
@@ -0,0 +1,329 @@
1
+ Metadata-Version: 2.4
2
+ Name: featkit
3
+ Version: 0.4.3
4
+ Summary: featkit — automated feature store generation from relational facts tables
5
+ Project-URL: Repository, https://github.com/Mirkiux/featkit
6
+ Project-URL: Documentation, https://mirkiux.github.io/featkit
7
+ Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
8
+ Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
9
+ Author: Mirko
10
+ License: MIT License
11
+
12
+ Copyright (c) 2026 Mirko
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ License-File: LICENSE
32
+ Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
33
+ Classifier: Development Status :: 2 - Pre-Alpha
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Topic :: Scientific/Engineering
43
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
44
+ Requires-Python: >=3.10
45
+ Requires-Dist: sqlglot>=23.0
46
+ Provides-Extra: databricks
47
+ Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
48
+ Provides-Extra: dev
49
+ Requires-Dist: build>=1.0; extra == 'dev'
50
+ Requires-Dist: hatch>=1.9; extra == 'dev'
51
+ Requires-Dist: mypy>=1.0; extra == 'dev'
52
+ Requires-Dist: pandas>=1.5; extra == 'dev'
53
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
54
+ Requires-Dist: pytest>=7.0; extra == 'dev'
55
+ Requires-Dist: ruff>=0.4; extra == 'dev'
56
+ Requires-Dist: twine>=5.0; extra == 'dev'
57
+ Provides-Extra: docs
58
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
59
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
60
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
61
+ Provides-Extra: execution
62
+ Requires-Dist: pandas>=1.5; extra == 'execution'
63
+ Provides-Extra: ibis
64
+ Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
65
+ Provides-Extra: spark
66
+ Requires-Dist: pyspark>=3.4; extra == 'spark'
67
+ Description-Content-Type: text/markdown
68
+
69
+ # featkit
70
+
71
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
72
+
73
+ It implements a three-layer architecture:
74
+
75
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
76
+ - **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and optional ratio features (2C)
77
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
78
+
79
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
80
+
81
+ ## Key concepts
82
+
83
+ | Layer | What it does |
84
+ |---|---|
85
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
86
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
87
+ | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
88
+
89
+ ## Installation
90
+
91
+ ```bash
92
+ pip install featkit
93
+ ```
94
+
95
+ ## Quickstart
96
+
97
+ ```python
98
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
99
+ from featkit.dataset import SimpleDataset
100
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
101
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
102
+ from featkit.generators.sql import SnowflakeSQLCodeGenerator
103
+
104
+ # Define schema
105
+ fields = [
106
+ IDField(name="ID_CLIENTE"),
107
+ TimeField(name="PERIODO",
108
+ source_granularity=TimeGranularity.MONTHLY,
109
+ target_granularity=TimeGranularity.MONTHLY),
110
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
111
+ allowed_values=["RETAIL", "CORP", "PYME"]),
112
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
113
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
114
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
115
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
116
+ ]
117
+
118
+ dataset = SimpleDataset(
119
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
120
+ fields=fields,
121
+ )
122
+
123
+ config = FeatureStoreConfig(
124
+ dataset=dataset,
125
+ output_schema="MY_DB.MY_SCHEMA",
126
+ output_table_prefix="FS",
127
+ time_windows=[3, 6, 9, 12],
128
+ )
129
+
130
+ pipeline = FeatureStorePipeline(config).build()
131
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
132
+
133
+ output.save("./output")
134
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
135
+ ```
136
+
137
+ ## Feature naming anatomy
138
+
139
+ Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
140
+
141
+ There are four families of features, each with its own naming pattern.
142
+
143
+ ---
144
+
145
+ ### Layer 2A — Pivot features
146
+
147
+ **Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
148
+
149
+ | Segment | Source | Example |
150
+ |---|---|---|
151
+ | `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
152
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
153
+ | `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
154
+
155
+ The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
156
+
157
+ | Measurement type | Semantic meaning | Valid `AGG` values |
158
+ |---|---|---|
159
+ | `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
160
+ | `CANTIDAD` | Count / quantity | `SUM` |
161
+ | `TICKET` | Average ticket size | `AVG` |
162
+ | `FLAG` | Binary indicator | `MAX` |
163
+ | `FECHA` | Date / timestamp | `MAX`, `MIN` |
164
+ | `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
165
+ | `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
166
+ | `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
167
+
168
+ Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the name implicitly aggregates over all values of that dimension.
169
+
170
+ ```
171
+ SUM__MTO # global — all sectors, all channels
172
+ SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
173
+ SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
174
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
175
+ SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
176
+ ```
177
+
178
+ ---
179
+
180
+ ### Layer 2B — Distributional features
181
+
182
+ **Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
183
+
184
+ | Segment | Source | Example |
185
+ |---|---|---|
186
+ | `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
187
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO` |
188
+ | `AGG` | `Layer2Aggregator` enum | `SUM` |
189
+ | `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
190
+
191
+ These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
192
+
193
+ | Metric | What it measures |
194
+ |---|---|
195
+ | `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
196
+ | `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
197
+ | `DOMINANT_PROPORTION` | Share of the most common category value |
198
+ | `MODE` | The most frequent category value (output type: categorical) |
199
+ | `COUNT` | Number of distinct observed values |
200
+
201
+ ```
202
+ CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
203
+ SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
204
+ CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
205
+ ```
206
+
207
+ ---
208
+
209
+ ### Layer 2C — Ratio features
210
+
211
+ **Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
212
+
213
+ where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
214
+
215
+ The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
216
+
217
+ Enabled by setting `include_ratios=True` (requires `include_marginals=True`). The `ratio_mode` parameter controls which denominators are paired with each numerator:
218
+
219
+ | `ratio_mode` | Denominators considered | Ratios produced per numerator |
220
+ |---|---|---|
221
+ | `RatioMode.ALL_PROJECTIONS` *(default)* | Every proper marginal projection (partial or fully marginalised) | One per valid denominator |
222
+ | `RatioMode.GLOBAL_TOTAL` | Only the fully-marginalised column (all fields ∅) | Exactly one — the share of the grand total |
223
+
224
+ ```
225
+ # With RatioMode.ALL_PROJECTIONS (default):
226
+ # Numerator: DIGITAL channel + RETAIL sector → three denominators
227
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL # share within RETAIL
228
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL # share within DIGITAL
229
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total
230
+
231
+ # With RatioMode.GLOBAL_TOTAL:
232
+ # Same numerator → only the grand-total denominator
233
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total only
234
+ ```
235
+
236
+ ---
237
+
238
+ ### Layer 3 — Temporal features
239
+
240
+ **Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
241
+
242
+ `L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
243
+
244
+ | Segment | Source | Notes |
245
+ |---|---|---|
246
+ | `OPERATOR` | `TemporalOperator` enum | See table below |
247
+ | `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
248
+ | `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
249
+
250
+ #### Temporal operators
251
+
252
+ | Operator | Type | Description |
253
+ |---|---|---|
254
+ | `PROM_U` | Windowed | Arithmetic mean of the per-period values over the window — each period contributes equally regardless of its volume |
255
+ | `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
256
+ | `SUM_U` | Windowed | Unweighted sum of the per-period values over the window |
257
+ | `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
258
+ | `MIN_U` | Windowed | Minimum value observed in the window |
259
+ | `MAX_U` | Windowed | Maximum value observed in the window |
260
+ | `CREC` | Windowed | Growth rate across the window |
261
+ | `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
262
+ | `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active in every single month in the window) |
263
+ | `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
264
+ | `RATIO` | Windowed (composed) | Ratio of two sub-windows |
265
+ | `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
266
+ | `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
267
+ | `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
268
+
269
+ #### Valid operators per Layer 2 output type
270
+
271
+ | Output type | Valid operators |
272
+ |---|---|
273
+ | `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
274
+ | `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
275
+ | `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
276
+ | `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
277
+
278
+ #### Examples
279
+
280
+ ```
281
+ # Average amount (DIGITAL + RETAIL) over the last 6 months
282
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
283
+
284
+ # Total transaction sum for RETAIL sector in the last 3 months (CANTIDAD → only SUM valid)
285
+ SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
286
+
287
+ # Most recent value of the CANAL entropy (by amount)
288
+ CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
289
+
290
+ # Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
291
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
292
+
293
+ # Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
294
+ CANAL__MTO__SUM__MODE__REC__BACKWARD
295
+ ```
296
+
297
+ ---
298
+
299
+ ### Quick-reference: full name structure
300
+
301
+ ```
302
+ ┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
303
+ │ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
304
+ └───────────────────────────────────────────────────────────────────┘
305
+
306
+ ┌─ Layer 2B distributional ─────────────────────────────────────────┐
307
+ │ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
308
+ └───────────────────────────────────────────────────────────────────┘
309
+
310
+ ┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
311
+ │ {Layer 2A name} __over__ {Layer 2A name} │
312
+ └───────────────────────────────────────────────────────────────────┘
313
+
314
+ ┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
315
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
316
+ └───────────────────────────────────────────────────────────────────┘
317
+
318
+ ┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
319
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
320
+ └───────────────────────────────────────────────────────────────────┘
321
+ ```
322
+
323
+ ## Architecture
324
+
325
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
326
+
327
+ ## License
328
+
329
+ MIT
@@ -0,0 +1,261 @@
1
+ # featkit
2
+
3
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
4
+
5
+ It implements a three-layer architecture:
6
+
7
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
8
+ - **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and optional ratio features (2C)
9
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
10
+
11
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
12
+
13
+ ## Key concepts
14
+
15
+ | Layer | What it does |
16
+ |---|---|
17
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
18
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
19
+ | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install featkit
25
+ ```
26
+
27
+ ## Quickstart
28
+
29
+ ```python
30
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
31
+ from featkit.dataset import SimpleDataset
32
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
33
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
34
+ from featkit.generators.sql import SnowflakeSQLCodeGenerator
35
+
36
+ # Define schema
37
+ fields = [
38
+ IDField(name="ID_CLIENTE"),
39
+ TimeField(name="PERIODO",
40
+ source_granularity=TimeGranularity.MONTHLY,
41
+ target_granularity=TimeGranularity.MONTHLY),
42
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
43
+ allowed_values=["RETAIL", "CORP", "PYME"]),
44
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
45
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
46
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
47
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
48
+ ]
49
+
50
+ dataset = SimpleDataset(
51
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
52
+ fields=fields,
53
+ )
54
+
55
+ config = FeatureStoreConfig(
56
+ dataset=dataset,
57
+ output_schema="MY_DB.MY_SCHEMA",
58
+ output_table_prefix="FS",
59
+ time_windows=[3, 6, 9, 12],
60
+ )
61
+
62
+ pipeline = FeatureStorePipeline(config).build()
63
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
64
+
65
+ output.save("./output")
66
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
67
+ ```
68
+
69
+ ## Feature naming anatomy
70
+
71
+ Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
72
+
73
+ There are four families of features, each with its own naming pattern.
74
+
75
+ ---
76
+
77
+ ### Layer 2A — Pivot features
78
+
79
+ **Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
80
+
81
+ | Segment | Source | Example |
82
+ |---|---|---|
83
+ | `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
84
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
85
+ | `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
86
+
87
+ The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
88
+
89
+ | Measurement type | Semantic meaning | Valid `AGG` values |
90
+ |---|---|---|
91
+ | `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
92
+ | `CANTIDAD` | Count / quantity | `SUM` |
93
+ | `TICKET` | Average ticket size | `AVG` |
94
+ | `FLAG` | Binary indicator | `MAX` |
95
+ | `FECHA` | Date / timestamp | `MAX`, `MIN` |
96
+ | `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
97
+ | `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
98
+ | `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
99
+
100
+ Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the name implicitly aggregates over all values of that dimension.
101
+
102
+ ```
103
+ SUM__MTO # global — all sectors, all channels
104
+ SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
105
+ SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
106
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
107
+ SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
108
+ ```
109
+
110
+ ---
111
+
112
+ ### Layer 2B — Distributional features
113
+
114
+ **Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
115
+
116
+ | Segment | Source | Example |
117
+ |---|---|---|
118
+ | `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
119
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO` |
120
+ | `AGG` | `Layer2Aggregator` enum | `SUM` |
121
+ | `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
122
+
123
+ These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
124
+
125
+ | Metric | What it measures |
126
+ |---|---|
127
+ | `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
128
+ | `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
129
+ | `DOMINANT_PROPORTION` | Share of the most common category value |
130
+ | `MODE` | The most frequent category value (output type: categorical) |
131
+ | `COUNT` | Number of distinct observed values |
132
+
133
+ ```
134
+ CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
135
+ SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
136
+ CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
137
+ ```
138
+
139
+ ---
140
+
141
+ ### Layer 2C — Ratio features
142
+
143
+ **Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
144
+
145
+ where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
146
+
147
+ The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
148
+
149
+ Enabled by setting `include_ratios=True` (requires `include_marginals=True`). The `ratio_mode` parameter controls which denominators are paired with each numerator:
150
+
151
+ | `ratio_mode` | Denominators considered | Ratios produced per numerator |
152
+ |---|---|---|
153
+ | `RatioMode.ALL_PROJECTIONS` *(default)* | Every proper marginal projection (partially or fully marginalised) | One per valid denominator |
154
+ | `RatioMode.GLOBAL_TOTAL` | Only the fully-marginalised column (all fields ∅) | Exactly one — the share of the grand total |
155
+
156
+ ```
157
+ # With RatioMode.ALL_PROJECTIONS (default):
158
+ # Numerator: DIGITAL channel + RETAIL sector → three denominators
159
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL # share within RETAIL
160
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL # share within DIGITAL
161
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total
162
+
163
+ # With RatioMode.GLOBAL_TOTAL:
164
+ # Same numerator → only the grand-total denominator
165
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total only
166
+ ```
167
+
168
+ ---
169
+
170
+ ### Layer 3 — Temporal features
171
+
172
+ **Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
173
+
174
+ `L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
175
+
176
+ | Segment | Source | Notes |
177
+ |---|---|---|
178
+ | `OPERATOR` | `TemporalOperator` enum | See table below |
179
+ | `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
180
+ | `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
181
+
182
+ #### Temporal operators
183
+
184
+ | Operator | Type | Description |
185
+ |---|---|---|
186
+ | `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
187
+ | `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
188
+ | `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
189
+ | `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
190
+ | `MIN_U` | Windowed | Minimum value observed in the window |
191
+ | `MAX_U` | Windowed | Maximum value observed in the window |
192
+ | `CREC` | Windowed | Growth rate across the window |
193
+ | `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
194
+ | `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active in every single month of the window) |
195
+ | `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
196
+ | `RATIO` | Windowed (composed) | Ratio of two sub-windows |
197
+ | `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
198
+ | `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
199
+ | `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
200
+
201
+ #### Valid operators per Layer 2 output type
202
+
203
+ | Output type | Valid operators |
204
+ |---|---|
205
+ | `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
206
+ | `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
207
+ | `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
208
+ | `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
209
+
210
+ #### Examples
211
+
212
+ ```
213
+ # Average amount (DIGITAL + RETAIL) over the last 6 months
214
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
215
+
216
+ # Total transaction count for RETAIL sector over the last 3 months (CANTIDAD → only SUM valid)
217
+ SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
218
+
219
+ # Most recent value of the CANAL entropy (by amount)
220
+ CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
221
+
222
+ # Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
223
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
224
+
225
+ # Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
226
+ CANAL__MTO__SUM__MODE__REC__BACKWARD
227
+ ```
228
+
229
+ ---
230
+
231
+ ### Quick-reference: full name structure
232
+
233
+ ```
234
+ ┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
235
+ │ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
236
+ └───────────────────────────────────────────────────────────────────┘
237
+
238
+ ┌─ Layer 2B distributional ─────────────────────────────────────────┐
239
+ │ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
240
+ └───────────────────────────────────────────────────────────────────┘
241
+
242
+ ┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
243
+ │ {Layer 2A name} __over__ {Layer 2A name} │
244
+ └───────────────────────────────────────────────────────────────────┘
245
+
246
+ ┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
247
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
248
+ └───────────────────────────────────────────────────────────────────┘
249
+
250
+ ┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
251
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
252
+ └───────────────────────────────────────────────────────────────────┘
253
+ ```
254
+
255
+ ## Architecture
256
+
257
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
258
+
259
+ ## License
260
+
261
+ MIT
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "featkit"
7
- version = "0.4.1"
7
+ version = "0.4.3"
8
8
  description = "featkit — automated feature store generation from relational facts tables"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -69,7 +69,7 @@ target-version = "py310"
69
69
  select = ["E", "F", "I", "UP", "B", "SIM"]
70
70
 
71
71
  [tool.mypy]
72
- python_version = "3.10"
72
+ python_version = "3.12"
73
73
  strict = true
74
74
  ignore_missing_imports = true
75
75