featkit 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {featkit-0.4.1 → featkit-0.4.2}/CHANGELOG.md +6 -0
  2. featkit-0.4.2/PKG-INFO +322 -0
  3. featkit-0.4.2/README.md +254 -0
  4. {featkit-0.4.1 → featkit-0.4.2}/pyproject.toml +2 -2
  5. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/pyspark/databricks.py +5 -6
  6. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/base.py +2 -2
  7. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/test_sql_snowflake.py +53 -0
  8. featkit-0.4.1/PKG-INFO +0 -143
  9. featkit-0.4.1/README.md +0 -75
  10. {featkit-0.4.1 → featkit-0.4.2}/.github/workflows/auto-tag.yml +0 -0
  11. {featkit-0.4.1 → featkit-0.4.2}/.github/workflows/ci.yml +0 -0
  12. {featkit-0.4.1 → featkit-0.4.2}/.github/workflows/docs.yml +0 -0
  13. {featkit-0.4.1 → featkit-0.4.2}/.github/workflows/publish.yml +0 -0
  14. {featkit-0.4.1 → featkit-0.4.2}/.gitignore +0 -0
  15. {featkit-0.4.1 → featkit-0.4.2}/LICENSE +0 -0
  16. {featkit-0.4.1 → featkit-0.4.2}/docs/.gitkeep +0 -0
  17. {featkit-0.4.1 → featkit-0.4.2}/docs/example_databricks_notebook.md +0 -0
  18. {featkit-0.4.1 → featkit-0.4.2}/docs/examples.md +0 -0
  19. {featkit-0.4.1 → featkit-0.4.2}/docs/general_plan.md +0 -0
  20. {featkit-0.4.1 → featkit-0.4.2}/docs/index.md +0 -0
  21. {featkit-0.4.1 → featkit-0.4.2}/docs/quickstart.md +0 -0
  22. {featkit-0.4.1 → featkit-0.4.2}/mkdocs.yml +0 -0
  23. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/__init__.py +0 -0
  24. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/.gitkeep +0 -0
  25. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/__init__.py +0 -0
  26. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/distributional_space.py +0 -0
  27. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/pivot_space.py +0 -0
  28. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/ratio_space.py +0 -0
  29. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/builders/temporal_space.py +0 -0
  30. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/config.py +0 -0
  31. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/__init__.py +0 -0
  32. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/measurement/.gitkeep +0 -0
  33. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/measurement/__init__.py +0 -0
  34. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/measurement/base.py +0 -0
  35. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/measurement/defaults.py +0 -0
  36. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/output/.gitkeep +0 -0
  37. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/output/__init__.py +0 -0
  38. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/output/base.py +0 -0
  39. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/contracts/output/defaults.py +0 -0
  40. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/dataset/.gitkeep +0 -0
  41. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/dataset/__init__.py +0 -0
  42. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/dataset/base.py +0 -0
  43. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/enums.py +0 -0
  44. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/__init__.py +0 -0
  45. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/__init__.py +0 -0
  46. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/base.py +0 -0
  47. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/databricks_adapter.py +0 -0
  48. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/databricks_notebook_adapter.py +0 -0
  49. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/mock_adapter.py +0 -0
  50. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/spark_adapter.py +0 -0
  51. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/adapters/sqlalchemy_adapter.py +0 -0
  52. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/execution/domain_resolver.py +0 -0
  53. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/.gitkeep +0 -0
  54. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/__init__.py +0 -0
  55. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/base.py +0 -0
  56. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/categorical_field.py +0 -0
  57. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/id_field.py +0 -0
  58. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/measurement_field.py +0 -0
  59. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/fields/time_field.py +0 -0
  60. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/__init__.py +0 -0
  61. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/base.py +0 -0
  62. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/output.py +0 -0
  63. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/pyspark/.gitkeep +0 -0
  64. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/pyspark/__init__.py +0 -0
  65. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/.gitkeep +0 -0
  66. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/__init__.py +0 -0
  67. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/databricks.py +0 -0
  68. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/snowflake.py +0 -0
  69. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/generators/sql/spark_sql.py +0 -0
  70. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/.gitkeep +0 -0
  71. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/__init__.py +0 -0
  72. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/base.py +0 -0
  73. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/distributional.py +0 -0
  74. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/pivoted.py +0 -0
  75. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer2/ratio.py +0 -0
  76. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer3/.gitkeep +0 -0
  77. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer3/__init__.py +0 -0
  78. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/layer3/temporal_feature.py +0 -0
  79. {featkit-0.4.1 → featkit-0.4.2}/src/featkit/pipeline.py +0 -0
  80. {featkit-0.4.1 → featkit-0.4.2}/tests/__init__.py +0 -0
  81. {featkit-0.4.1 → featkit-0.4.2}/tests/test_builders.py +0 -0
  82. {featkit-0.4.1 → featkit-0.4.2}/tests/test_contracts.py +0 -0
  83. {featkit-0.4.1 → featkit-0.4.2}/tests/test_enums.py +0 -0
  84. {featkit-0.4.1 → featkit-0.4.2}/tests/test_execution/__init__.py +0 -0
  85. {featkit-0.4.1 → featkit-0.4.2}/tests/test_execution/test_adapters.py +0 -0
  86. {featkit-0.4.1 → featkit-0.4.2}/tests/test_execution/test_domain_resolver.py +0 -0
  87. {featkit-0.4.1 → featkit-0.4.2}/tests/test_fields.py +0 -0
  88. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/.gitkeep +0 -0
  89. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/__init__.py +0 -0
  90. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/test_base.py +0 -0
  91. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/test_pyspark.py +0 -0
  92. {featkit-0.4.1 → featkit-0.4.2}/tests/test_generators/test_sql_databricks.py +0 -0
  93. {featkit-0.4.1 → featkit-0.4.2}/tests/test_integration.py +0 -0
  94. {featkit-0.4.1 → featkit-0.4.2}/tests/test_layer2.py +0 -0
  95. {featkit-0.4.1 → featkit-0.4.2}/tests/test_layer3.py +0 -0
  96. {featkit-0.4.1 → featkit-0.4.2}/tests/test_output_contracts.py +0 -0
  97. {featkit-0.4.1 → featkit-0.4.2}/tests/test_pipeline.py +0 -0
  98. {featkit-0.4.1 → featkit-0.4.2}/tests/test_ratio.py +0 -0
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.2] - 2026-06-30
11
+
12
+ ### Fixed
13
+ - `FREQ` operator now counts only periods where the value is non-null **and strictly greater than 0** (previously counted any non-null value).
14
+ - `XM` operator now returns `1` only when **every** period in the time window has a non-null and strictly positive value, `0` otherwise (previously returned a raw count identical to FREQ). Both the SQL and PySpark generators are updated.
15
+
10
16
  ## [0.4.1] - 2026-06-09
11
17
 
12
18
  ### Fixed
featkit-0.4.2/PKG-INFO ADDED
@@ -0,0 +1,322 @@
1
+ Metadata-Version: 2.4
2
+ Name: featkit
3
+ Version: 0.4.2
4
+ Summary: featkit — automated feature store generation from relational facts tables
5
+ Project-URL: Repository, https://github.com/Mirkiux/featkit
6
+ Project-URL: Documentation, https://mirkiux.github.io/featkit
7
+ Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
8
+ Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
9
+ Author: Mirko
10
+ License: MIT License
11
+
12
+ Copyright (c) 2026 Mirko
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ License-File: LICENSE
32
+ Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
33
+ Classifier: Development Status :: 2 - Pre-Alpha
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Programming Language :: Python :: 3.13
42
+ Classifier: Topic :: Scientific/Engineering
43
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
44
+ Requires-Python: >=3.10
45
+ Requires-Dist: sqlglot>=23.0
46
+ Provides-Extra: databricks
47
+ Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
48
+ Provides-Extra: dev
49
+ Requires-Dist: build>=1.0; extra == 'dev'
50
+ Requires-Dist: hatch>=1.9; extra == 'dev'
51
+ Requires-Dist: mypy>=1.0; extra == 'dev'
52
+ Requires-Dist: pandas>=1.5; extra == 'dev'
53
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
54
+ Requires-Dist: pytest>=7.0; extra == 'dev'
55
+ Requires-Dist: ruff>=0.4; extra == 'dev'
56
+ Requires-Dist: twine>=5.0; extra == 'dev'
57
+ Provides-Extra: docs
58
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
59
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
60
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
61
+ Provides-Extra: execution
62
+ Requires-Dist: pandas>=1.5; extra == 'execution'
63
+ Provides-Extra: ibis
64
+ Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
65
+ Provides-Extra: spark
66
+ Requires-Dist: pyspark>=3.4; extra == 'spark'
67
+ Description-Content-Type: text/markdown
68
+
69
+ # featkit
70
+
71
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
72
+
73
+ It implements a three-layer architecture:
74
+
75
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
76
+ - **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and ratio features (2C)
77
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
78
+
79
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
80
+
81
+ ## Key concepts
82
+
83
+ | Layer | What it does |
84
+ |---|---|
85
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
86
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
87
+ | Layer 3 — Temporal | Sliding-window and point-in-time operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
88
+
89
+ ## Installation
90
+
91
+ ```bash
92
+ pip install featkit
93
+ ```
94
+
95
+ ## Quickstart
96
+
97
+ ```python
98
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
99
+ from featkit.dataset import SimpleDataset
100
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
101
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
102
+ from featkit.generators.sql import SnowflakeSQLCodeGenerator
103
+
104
+ # Define schema
105
+ fields = [
106
+ IDField(name="ID_CLIENTE"),
107
+ TimeField(name="PERIODO",
108
+ source_granularity=TimeGranularity.MONTHLY,
109
+ target_granularity=TimeGranularity.MONTHLY),
110
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
111
+ allowed_values=["RETAIL", "CORP", "PYME"]),
112
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
113
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
114
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
115
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
116
+ ]
117
+
118
+ dataset = SimpleDataset(
119
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
120
+ fields=fields,
121
+ )
122
+
123
+ config = FeatureStoreConfig(
124
+ dataset=dataset,
125
+ output_schema="MY_DB.MY_SCHEMA",
126
+ output_table_prefix="FS",
127
+ time_windows=[3, 6, 9, 12],
128
+ )
129
+
130
+ pipeline = FeatureStorePipeline(config).build()
131
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
132
+
133
+ output.save("./output")
134
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
135
+ ```
136
+
137
+ ## Feature naming anatomy
138
+
139
+ Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
140
+
141
+ There are four families of features, each with its own naming pattern.
142
+
143
+ ---
144
+
145
+ ### Layer 2A — Pivot features
146
+
147
+ **Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
148
+
149
+ | Segment | Source | Example |
150
+ |---|---|---|
151
+ | `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
152
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
153
+ | `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
154
+
155
+ The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
156
+
157
+ | Measurement type | Semantic meaning | Valid `AGG` values |
158
+ |---|---|---|
159
+ | `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
160
+ | `CANTIDAD` | Count / quantity | `SUM` |
161
+ | `TICKET` | Average ticket size | `AVG` |
162
+ | `FLAG` | Binary indicator | `MAX` |
163
+ | `FECHA` | Date / timestamp | `MAX`, `MIN` |
164
+ | `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
165
+ | `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
166
+ | `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
167
+
168
+ Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the name implicitly aggregates over all values of that dimension.
169
+
170
+ ```
171
+ SUM__MTO # global — all sectors, all channels
172
+ SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
173
+ SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
174
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
175
+ SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
176
+ ```
177
+
178
+ ---
179
+
180
+ ### Layer 2B — Distributional features
181
+
182
+ **Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
183
+
184
+ | Segment | Source | Example |
185
+ |---|---|---|
186
+ | `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
187
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO` |
188
+ | `AGG` | `Layer2Aggregator` enum | `SUM` |
189
+ | `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
190
+
191
+ These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
192
+
193
+ | Metric | What it measures |
194
+ |---|---|
195
+ | `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
196
+ | `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
197
+ | `DOMINANT_PROPORTION` | Share of the most common category value |
198
+ | `MODE` | The most frequent category value (output type: categorical) |
199
+ | `COUNT` | Number of distinct observed values |
200
+
201
+ ```
202
+ CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
203
+ SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
204
+ CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
205
+ ```
206
+
207
+ ---
208
+
209
+ ### Layer 2C — Ratio features
210
+
211
+ **Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
212
+
213
+ where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
214
+
215
+ The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
216
+
217
+ ```
218
+ # Numerator: DIGITAL channel + RETAIL sector
219
+ # Denominator: RETAIL sector only (CANAL marginalized → share of DIGITAL within RETAIL)
220
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL
221
+
222
+ # Denominator: DIGITAL channel only (SECTOR marginalized → share of RETAIL within DIGITAL)
223
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL
224
+
225
+ # Denominator: global total (both marginalized → share of DIGITAL/RETAIL in total portfolio)
226
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO
227
+ ```
228
+
229
+ ---
230
+
231
+ ### Layer 3 — Temporal features
232
+
233
+ **Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
234
+
235
+ `L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
236
+
237
+ | Segment | Source | Notes |
238
+ |---|---|---|
239
+ | `OPERATOR` | `TemporalOperator` enum | See table below |
240
+ | `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
241
+ | `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
242
+
243
+ #### Temporal operators
244
+
245
+ | Operator | Type | Description |
246
+ |---|---|---|
247
+ | `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
248
+ | `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
249
+ | `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
250
+ | `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
251
+ | `MIN_U` | Windowed | Minimum value observed in the window |
252
+ | `MAX_U` | Windowed | Maximum value observed in the window |
253
+ | `CREC` | Windowed | Growth rate across the window |
254
+ | `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
255
+ | `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active in every single month of the window) |
256
+ | `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
257
+ | `RATIO` | Windowed (composed) | Ratio of two sub-windows |
258
+ | `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
259
+ | `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
260
+ | `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
261
+
262
+ #### Valid operators per Layer 2 output type
263
+
264
+ | Output type | Valid operators |
265
+ |---|---|
266
+ | `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
267
+ | `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
268
+ | `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
269
+ | `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
270
+
271
+ #### Examples
272
+
273
+ ```
274
+ # Average amount (DIGITAL + RETAIL) over the last 6 months
275
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
276
+
277
+ # Total transaction sum for RETAIL sector in the last 3 months (CANTIDAD → only SUM valid)
278
+ SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
279
+
280
+ # Most recent value of the CANAL entropy (by amount)
281
+ CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
282
+
283
+ # Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
284
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
285
+
286
+ # Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
287
+ CANAL__MTO__SUM__MODE__REC__BACKWARD
288
+ ```
289
+
290
+ ---
291
+
292
+ ### Quick-reference: full name structure
293
+
294
+ ```
295
+ ┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
296
+ │ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
297
+ └───────────────────────────────────────────────────────────────────┘
298
+
299
+ ┌─ Layer 2B distributional ─────────────────────────────────────────┐
300
+ │ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
301
+ └───────────────────────────────────────────────────────────────────┘
302
+
303
+ ┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
304
+ │ {Layer 2A name} __over__ {Layer 2A name} │
305
+ └───────────────────────────────────────────────────────────────────┘
306
+
307
+ ┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
308
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
309
+ └───────────────────────────────────────────────────────────────────┘
310
+
311
+ ┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
312
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
313
+ └───────────────────────────────────────────────────────────────────┘
314
+ ```
315
+
316
+ ## Architecture
317
+
318
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
319
+
320
+ ## License
321
+
322
+ MIT
featkit-0.4.2/README.md ADDED
@@ -0,0 +1,254 @@
1
+ # featkit
2
+
3
+ **featkit** is a Python framework for automated feature store generation from relational facts tables.
4
+
5
+ It implements a three-layer architecture:
6
+
7
+ - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
8
+ - **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and ratio features (2C)
9
+ - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
10
+
11
+ The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
12
+
13
+ ## Key concepts
14
+
15
+ | Layer | What it does |
16
+ |---|---|
17
+ | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
18
+ | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
19
+ | Layer 3 — Temporal | Sliding-window and point-in-time operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install featkit
25
+ ```
26
+
27
+ ## Quickstart
28
+
29
+ ```python
30
+ from featkit import FeatureStorePipeline, FeatureStoreConfig
31
+ from featkit.dataset import SimpleDataset
32
+ from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
33
+ from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
34
+ from featkit.generators.sql import SnowflakeSQLCodeGenerator
35
+
36
+ # Define schema
37
+ fields = [
38
+ IDField(name="ID_CLIENTE"),
39
+ TimeField(name="PERIODO",
40
+ source_granularity=TimeGranularity.MONTHLY,
41
+ target_granularity=TimeGranularity.MONTHLY),
42
+ CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
43
+ allowed_values=["RETAIL", "CORP", "PYME"]),
44
+ CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
45
+ allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
46
+ MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
47
+ MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
48
+ ]
49
+
50
+ dataset = SimpleDataset(
51
+ source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
52
+ fields=fields,
53
+ )
54
+
55
+ config = FeatureStoreConfig(
56
+ dataset=dataset,
57
+ output_schema="MY_DB.MY_SCHEMA",
58
+ output_table_prefix="FS",
59
+ time_windows=[3, 6, 9, 12],
60
+ )
61
+
62
+ pipeline = FeatureStorePipeline(config).build()
63
+ output = pipeline.run(SnowflakeSQLCodeGenerator())
64
+
65
+ output.save("./output")
66
+ # Writes: output/script.sql, output/dag.json, output/diagram.md
67
+ ```
68
+
69
+ ## Feature naming anatomy
70
+
71
+ Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
72
+
73
+ There are four families of features, each with its own naming pattern.
74
+
75
+ ---
76
+
77
+ ### Layer 2A — Pivot features
78
+
79
+ **Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
80
+
81
+ | Segment | Source | Example |
82
+ |---|---|---|
83
+ | `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
84
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
85
+ | `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
86
+
87
+ The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
88
+
89
+ | Measurement type | Semantic meaning | Valid `AGG` values |
90
+ |---|---|---|
91
+ | `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
92
+ | `CANTIDAD` | Count / quantity | `SUM` |
93
+ | `TICKET` | Average ticket size | `AVG` |
94
+ | `FLAG` | Binary indicator | `MAX` |
95
+ | `FECHA` | Date / timestamp | `MAX`, `MIN` |
96
+ | `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
97
+ | `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
98
+ | `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
99
+
100
+ Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the name implicitly aggregates over all values of that dimension.
101
+
102
+ ```
103
+ SUM__MTO # global — all sectors, all channels
104
+ SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
105
+ SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
106
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
107
+ SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
108
+ ```
109
+
110
+ ---
111
+
112
+ ### Layer 2B — Distributional features
113
+
114
+ **Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
115
+
116
+ | Segment | Source | Example |
117
+ |---|---|---|
118
+ | `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
119
+ | `MEASUREMENT` | `MeasurementField.name` | `MTO` |
120
+ | `AGG` | `Layer2Aggregator` enum | `SUM` |
121
+ | `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
122
+
123
+ These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
124
+
125
+ | Metric | What it measures |
126
+ |---|---|
127
+ | `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
128
+ | `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
129
+ | `DOMINANT_PROPORTION` | Share of the most common category value |
130
+ | `MODE` | The most frequent category value (output type: categorical) |
131
+ | `COUNT` | Number of distinct observed values |
132
+
133
+ ```
134
+ CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
135
+ SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
136
+ CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
137
+ ```
138
+
139
+ ---
140
+
141
+ ### Layer 2C — Ratio features
142
+
143
+ **Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
144
+
145
+ where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
146
+
147
+ The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
148
+
149
+ ```
150
+ # Numerator: DIGITAL channel + RETAIL sector
151
+ # Denominator: RETAIL sector only (CANAL marginalized → share of DIGITAL within RETAIL)
152
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL
153
+
154
+ # Denominator: DIGITAL channel only (SECTOR marginalized → share of RETAIL within DIGITAL)
155
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL
156
+
157
+ # Denominator: global total (both marginalized → share of DIGITAL/RETAIL in total portfolio)
158
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO
159
+ ```
160
+
161
+ ---
162
+
163
+ ### Layer 3 — Temporal features
164
+
165
+ **Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
166
+
167
+ `L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
168
+
169
+ | Segment | Source | Notes |
170
+ |---|---|---|
171
+ | `OPERATOR` | `TemporalOperator` enum | See table below |
172
+ | `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
173
+ | `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
174
+
175
+ #### Temporal operators
176
+
177
+ | Operator | Type | Description |
178
+ |---|---|---|
179
+ | `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
180
+ | `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
181
+ | `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
182
+ | `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
183
+ | `MIN_U` | Windowed | Minimum value observed in the window |
184
+ | `MAX_U` | Windowed | Maximum value observed in the window |
185
+ | `CREC` | Windowed | Growth rate across the window |
186
+ | `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
187
+ | `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active in every single month of the window) |
188
+ | `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
189
+ | `RATIO` | Windowed (composed) | Ratio of two sub-windows |
190
+ | `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
191
+ | `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
192
+ | `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
193
+
194
+ #### Valid operators per Layer 2 output type
195
+
196
+ | Output type | Valid operators |
197
+ |---|---|
198
+ | `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
199
+ | `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
200
+ | `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
201
+ | `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
202
+
203
+ #### Examples
204
+
205
+ ```
206
+ # Average amount (DIGITAL + RETAIL) over the last 6 months
207
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
208
+
209
+ # Total number of transactions for the RETAIL sector over the last 3 months (CANTIDAD → only SUM valid)
210
+ SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
211
+
212
+ # Most recent value of the CANAL entropy (by amount)
213
+ CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
214
+
215
+ # Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
216
+ SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
217
+
218
+ # Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
219
+ CANAL__MTO__SUM__MODE__REC__BACKWARD
220
+ ```
221
+
222
+ ---
223
+
224
+ ### Quick-reference: full name structure
225
+
226
+ ```
227
+ ┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
228
+ │ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
229
+ └───────────────────────────────────────────────────────────────────┘
230
+
231
+ ┌─ Layer 2B distributional ─────────────────────────────────────────┐
232
+ │ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
233
+ └───────────────────────────────────────────────────────────────────┘
234
+
235
+ ┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
236
+ │ {Layer 2A name} __over__ {Layer 2A name} │
237
+ └───────────────────────────────────────────────────────────────────┘
238
+
239
+ ┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
240
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
241
+ └───────────────────────────────────────────────────────────────────┘
242
+
243
+ ┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
244
+ │ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
245
+ └───────────────────────────────────────────────────────────────────┘
246
+ ```
247
+
248
+ ## Architecture
249
+
250
+ See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
251
+
252
+ ## License
253
+
254
+ MIT
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "featkit"
7
- version = "0.4.1"
7
+ version = "0.4.2"
8
8
  description = "featkit — automated feature store generation from relational facts tables"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -69,7 +69,7 @@ target-version = "py310"
69
69
  select = ["E", "F", "I", "UP", "B", "SIM"]
70
70
 
71
71
  [tool.mypy]
72
- python_version = "3.10"
72
+ python_version = "3.12"
73
73
  strict = true
74
74
  ignore_missing_imports = true
75
75
 
@@ -379,13 +379,12 @@ class PySparkCodeGenerator(AbstractCodeGenerator):
379
379
  f' - F.lit(1)).alias("{alias}")'
380
380
  )
381
381
  if op == TemporalOperator.FREQ:
382
- return (
383
- f'F.count(F.when({in_window} & {col_ref}.isNotNull(), F.lit(1))).alias("{alias}")'
384
- )
382
+ active = f"{in_window} & {col_ref}.isNotNull() & ({col_ref} > 0)"
383
+ return f'F.count(F.when({active}, F.lit(1))).alias("{alias}")'
385
384
  if op == TemporalOperator.XM:
386
- return (
387
- f'F.count(F.when({in_window} & {col_ref}.isNotNull(), F.lit(1))).alias("{alias}")'
388
- )
385
+ active = f"{in_window} & {col_ref}.isNotNull() & ({col_ref} > 0)"
386
+ count_expr = f"F.count(F.when({active}, F.lit(1)))"
387
+ return f'F.when({count_expr} == {w}, F.lit(1)).otherwise(F.lit(0)).alias("{alias}")'
389
388
  if op == TemporalOperator.REC:
390
389
  return f'(-F.max(F.when({col_ref}.isNotNull(), {mob}))).alias("{alias}")'
391
390
  if op == TemporalOperator.MEDIA_ABS:
@@ -468,7 +468,7 @@ class AbstractSQLCodeGenerator(AbstractCodeGenerator):
468
468
  lo, hi = 0, w - 1
469
469
  in_window = f"{mob_col} BETWEEN {lo} AND {hi}"
470
470
  case_col = f"CASE WHEN {in_window} THEN {col} END"
471
- case_notnull = f"CASE WHEN {in_window} AND {col} IS NOT NULL THEN 1 END"
471
+ case_notnull = f"CASE WHEN {in_window} AND {col} IS NOT NULL AND {col} > 0 THEN 1 END"
472
472
 
473
473
  if op == TemporalOperator.PROM_U:
474
474
  return f"AVG({case_col})"
@@ -493,7 +493,7 @@ class AbstractSQLCodeGenerator(AbstractCodeGenerator):
493
493
  if op == TemporalOperator.FREQ:
494
494
  return f"COUNT({case_notnull})"
495
495
  if op == TemporalOperator.XM:
496
- return f"COUNT({case_notnull})"
496
+ return f"CASE WHEN COUNT({case_notnull}) = {w} THEN 1 ELSE 0 END"
497
497
  if op == TemporalOperator.REC:
498
498
  return f"-MAX(CASE WHEN {col} IS NOT NULL THEN {mob_col} END)"
499
499
  if op == TemporalOperator.MEDIA_ABS:
@@ -419,3 +419,56 @@ class TestTableNaming:
419
419
  sql = _GEN.build_mob_table(pipeline).sql
420
420
  assert "myschema" in sql
421
421
  assert "x_mob_ref" in sql
422
+
423
+
424
+ # ---------------------------------------------------------------------------
425
+ # FREQ and XM operator semantics
426
+ # ---------------------------------------------------------------------------
427
+
428
+
429
def _pipeline_flag(window: int = 6) -> FeatureStorePipeline:
    """Build a pipeline over a facts table whose only measurement is a FLAG.

    The dataset has an ID column, a monthly time column and a single
    ``paid`` flag; it is used by the tests that inspect the FREQ and XM
    temporal expressions.

    Args:
        window: The single entry placed in ``time_windows``.

    Returns:
        The result of calling ``build()`` on the configured pipeline.
    """
    fields = [
        IDField("id"),
        TimeField("ts", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
        MeasurementField("paid", MeasurementType.FLAG),
    ]
    config = FeatureStoreConfig(
        dataset=SimpleDataset("db.facts", fields),
        output_schema="out",
        output_table_prefix="feat_",
        time_windows=[window],
    )
    pipeline = FeatureStorePipeline(config=config)
    return pipeline.build()
446
+
447
+
448
class TestFreqXmSemantics:
    """Checks on the Layer 3 SQL emitted for the FREQ and XM operators.

    FREQ must count only in-window periods whose value is non-null *and*
    strictly positive. XM must be a 1/0 indicator obtained by comparing that
    same count against the window size.
    """

    def _sql(self, window: int = 6) -> str:
        """Return the raw Layer 3 SQL generated for the FLAG pipeline."""
        out = _GEN.build_layer3(_pipeline_flag(window))
        assert isinstance(out, SQLOutput)
        return out.sql

    def _flat_sql(self, window: int = 6) -> str:
        """Return the Layer 3 SQL upper-cased with whitespace runs collapsed.

        Normalising lets the assertions match multi-token fragments without
        depending on keyword case or on how the generator lays out lines.
        """
        return " ".join(self._sql(window).upper().split())

    def test_freq_filters_positive_values(self) -> None:
        assert "> 0" in self._sql()

    def test_freq_does_not_count_zero_values(self) -> None:
        # The counted CASE must gate on > 0 in addition to IS NOT NULL.
        # Both fragments are required: an ``or`` here would let the test pass
        # with only one of the two conditions present.
        flat = self._flat_sql()
        assert "IS NOT NULL AND" in flat
        assert "> 0 THEN 1 END" in flat

    def test_xm_returns_one_or_zero(self) -> None:
        # Match the whole indicator tail; a bare "CASE WHEN" / "THEN 1" is
        # also produced by the FREQ expression and would prove nothing.
        assert "THEN 1 ELSE 0 END" in self._flat_sql(window=6)

    def test_xm_compares_count_to_window_size(self) -> None:
        # Use a non-default window as well, so a hard-coded constant in the
        # generated comparison cannot pass.
        for window in (3, 6):
            assert f"= {window} THEN 1 ELSE 0 END" in self._flat_sql(window)

    def test_layer3_with_flag_is_parseable(self) -> None:
        assert _is_parseable(self._sql())
featkit-0.4.1/PKG-INFO DELETED
@@ -1,143 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: featkit
3
- Version: 0.4.1
4
- Summary: featkit — automated feature store generation from relational facts tables
5
- Project-URL: Repository, https://github.com/Mirkiux/featkit
6
- Project-URL: Documentation, https://mirkiux.github.io/featkit
7
- Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
8
- Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
9
- Author: Mirko
10
- License: MIT License
11
-
12
- Copyright (c) 2026 Mirko
13
-
14
- Permission is hereby granted, free of charge, to any person obtaining a copy
15
- of this software and associated documentation files (the "Software"), to deal
16
- in the Software without restriction, including without limitation the rights
17
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
- copies of the Software, and to permit persons to whom the Software is
19
- furnished to do so, subject to the following conditions:
20
-
21
- The above copyright notice and this permission notice shall be included in all
22
- copies or substantial portions of the Software.
23
-
24
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
- SOFTWARE.
31
- License-File: LICENSE
32
- Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
33
- Classifier: Development Status :: 2 - Pre-Alpha
34
- Classifier: Intended Audience :: Developers
35
- Classifier: Intended Audience :: Science/Research
36
- Classifier: License :: OSI Approved :: MIT License
37
- Classifier: Programming Language :: Python :: 3
38
- Classifier: Programming Language :: Python :: 3.10
39
- Classifier: Programming Language :: Python :: 3.11
40
- Classifier: Programming Language :: Python :: 3.12
41
- Classifier: Programming Language :: Python :: 3.13
42
- Classifier: Topic :: Scientific/Engineering
43
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
44
- Requires-Python: >=3.10
45
- Requires-Dist: sqlglot>=23.0
46
- Provides-Extra: databricks
47
- Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
48
- Provides-Extra: dev
49
- Requires-Dist: build>=1.0; extra == 'dev'
50
- Requires-Dist: hatch>=1.9; extra == 'dev'
51
- Requires-Dist: mypy>=1.0; extra == 'dev'
52
- Requires-Dist: pandas>=1.5; extra == 'dev'
53
- Requires-Dist: pytest-cov>=4.0; extra == 'dev'
54
- Requires-Dist: pytest>=7.0; extra == 'dev'
55
- Requires-Dist: ruff>=0.4; extra == 'dev'
56
- Requires-Dist: twine>=5.0; extra == 'dev'
57
- Provides-Extra: docs
58
- Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
59
- Requires-Dist: mkdocs>=1.6; extra == 'docs'
60
- Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
61
- Provides-Extra: execution
62
- Requires-Dist: pandas>=1.5; extra == 'execution'
63
- Provides-Extra: ibis
64
- Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
65
- Provides-Extra: spark
66
- Requires-Dist: pyspark>=3.4; extra == 'spark'
67
- Description-Content-Type: text/markdown
68
-
69
- # featkit
70
-
71
- **featkit** is a Python framework for automated feature store generation from relational facts tables.
72
-
73
- It implements a three-layer architecture:
74
-
75
- - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
76
- - **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
77
- - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
78
-
79
- The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
80
-
81
- ## Key concepts
82
-
83
- | Layer | What it does |
84
- |---|---|
85
- | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
86
- | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
87
- | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
88
-
89
- ## Installation
90
-
91
- ```bash
92
- pip install featkit
93
- ```
94
-
95
- ## Quickstart
96
-
97
- ```python
98
- from featkit import FeatureStorePipeline, FeatureStoreConfig
99
- from featkit.dataset import SimpleDataset
100
- from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
101
- from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
102
- from featkit.generators.sql import SnowflakeSQLCodeGenerator
103
-
104
- # Define schema
105
- fields = [
106
- IDField(name="ID_CLIENTE"),
107
- TimeField(name="PERIODO",
108
- source_granularity=TimeGranularity.MONTHLY,
109
- target_granularity=TimeGranularity.MONTHLY),
110
- CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
111
- allowed_values=["RETAIL", "CORP", "PYME"]),
112
- CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
113
- allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
114
- MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
115
- MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
116
- ]
117
-
118
- dataset = SimpleDataset(
119
- source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
120
- fields=fields,
121
- )
122
-
123
- config = FeatureStoreConfig(
124
- dataset=dataset,
125
- output_schema="MY_DB.MY_SCHEMA",
126
- output_table_prefix="FS",
127
- time_windows=[3, 6, 9, 12],
128
- )
129
-
130
- pipeline = FeatureStorePipeline(config).build()
131
- output = pipeline.run(SnowflakeSQLCodeGenerator())
132
-
133
- output.save("./output")
134
- # Writes: output/script.sql, output/dag.json, output/diagram.md
135
- ```
136
-
137
- ## Architecture
138
-
139
- See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
140
-
141
- ## License
142
-
143
- MIT
featkit-0.4.1/README.md DELETED
@@ -1,75 +0,0 @@
1
- # featkit
2
-
3
- **featkit** is a Python framework for automated feature store generation from relational facts tables.
4
-
5
- It implements a three-layer architecture:
6
-
7
- - **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
8
- - **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
9
- - **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
10
-
11
- The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
12
-
13
- ## Key concepts
14
-
15
- | Layer | What it does |
16
- |---|---|
17
- | Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
18
- | Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
19
- | Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
20
-
21
- ## Installation
22
-
23
- ```bash
24
- pip install featkit
25
- ```
26
-
27
- ## Quickstart
28
-
29
- ```python
30
- from featkit import FeatureStorePipeline, FeatureStoreConfig
31
- from featkit.dataset import SimpleDataset
32
- from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
33
- from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
34
- from featkit.generators.sql import SnowflakeSQLCodeGenerator
35
-
36
- # Define schema
37
- fields = [
38
- IDField(name="ID_CLIENTE"),
39
- TimeField(name="PERIODO",
40
- source_granularity=TimeGranularity.MONTHLY,
41
- target_granularity=TimeGranularity.MONTHLY),
42
- CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
43
- allowed_values=["RETAIL", "CORP", "PYME"]),
44
- CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
45
- allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
46
- MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
47
- MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
48
- ]
49
-
50
- dataset = SimpleDataset(
51
- source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
52
- fields=fields,
53
- )
54
-
55
- config = FeatureStoreConfig(
56
- dataset=dataset,
57
- output_schema="MY_DB.MY_SCHEMA",
58
- output_table_prefix="FS",
59
- time_windows=[3, 6, 9, 12],
60
- )
61
-
62
- pipeline = FeatureStorePipeline(config).build()
63
- output = pipeline.run(SnowflakeSQLCodeGenerator())
64
-
65
- output.save("./output")
66
- # Writes: output/script.sql, output/dag.json, output/diagram.md
67
- ```
68
-
69
- ## Architecture
70
-
71
- See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
72
-
73
- ## License
74
-
75
- MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes