featkit 0.4.1__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {featkit-0.4.1 → featkit-0.4.3}/CHANGELOG.md +12 -0
- featkit-0.4.3/PKG-INFO +329 -0
- featkit-0.4.3/README.md +261 -0
- {featkit-0.4.1 → featkit-0.4.3}/pyproject.toml +2 -2
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/ratio_space.py +24 -6
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/config.py +15 -1
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/enums.py +16 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/databricks.py +5 -6
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/base.py +2 -2
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/pipeline.py +1 -1
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_sql_snowflake.py +53 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_ratio.py +119 -0
- featkit-0.4.1/PKG-INFO +0 -143
- featkit-0.4.1/README.md +0 -75
- {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/auto-tag.yml +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/ci.yml +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/docs.yml +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/.github/workflows/publish.yml +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/.gitignore +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/LICENSE +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/example_databricks_notebook.md +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/examples.md +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/general_plan.md +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/index.md +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/docs/quickstart.md +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/mkdocs.yml +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/distributional_space.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/pivot_space.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/builders/temporal_space.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/measurement/defaults.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/contracts/output/defaults.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/dataset/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/databricks_adapter.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/databricks_notebook_adapter.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/mock_adapter.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/spark_adapter.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/adapters/sqlalchemy_adapter.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/execution/domain_resolver.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/categorical_field.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/id_field.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/measurement_field.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/fields/time_field.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/output.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/pyspark/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/databricks.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/snowflake.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/generators/sql/spark_sql.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/distributional.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/pivoted.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer2/ratio.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/src/featkit/layer3/temporal_feature.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_builders.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_contracts.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_enums.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/test_adapters.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_execution/test_domain_resolver.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_fields.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/.gitkeep +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/__init__.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_base.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_pyspark.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_generators/test_sql_databricks.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_integration.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_layer2.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_layer3.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_output_contracts.py +0 -0
- {featkit-0.4.1 → featkit-0.4.3}/tests/test_pipeline.py +0 -0
|
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.4.3] - 2026-06-30
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `RatioMode` enum with two values: `ALL_PROJECTIONS` (default, existing behaviour) and `GLOBAL_TOTAL` (restricts Layer 2C denominators to the single all-∅ grand-total column, producing one ratio per numerator representing its share of the portfolio total).
|
|
14
|
+
- `FeatureStoreConfig.ratio_mode` parameter (default `RatioMode.ALL_PROJECTIONS`) to select the denominator strategy for `RatioSpaceBuilder`.
|
|
15
|
+
|
|
16
|
+
## [0.4.2] - 2026-06-30
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
- `FREQ` operator now counts only periods where the value is non-null **and strictly greater than 0** (previously counted any non-null value).
|
|
20
|
+
- `XM` operator now returns `1` only when **every** period in the time window has a non-null and strictly positive value, `0` otherwise (previously returned a raw count identical to FREQ). Both the SQL and PySpark generators are updated.
|
|
21
|
+
|
|
10
22
|
## [0.4.1] - 2026-06-09
|
|
11
23
|
|
|
12
24
|
### Fixed
|
featkit-0.4.3/PKG-INFO
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featkit
|
|
3
|
+
Version: 0.4.3
|
|
4
|
+
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
|
+
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
|
+
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
7
|
+
Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
|
|
9
|
+
Author: Mirko
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Mirko
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
|
|
33
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Requires-Dist: sqlglot>=23.0
|
|
46
|
+
Provides-Extra: databricks
|
|
47
|
+
Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pandas>=1.5; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
56
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
57
|
+
Provides-Extra: docs
|
|
58
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
60
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
61
|
+
Provides-Extra: execution
|
|
62
|
+
Requires-Dist: pandas>=1.5; extra == 'execution'
|
|
63
|
+
Provides-Extra: ibis
|
|
64
|
+
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
65
|
+
Provides-Extra: spark
|
|
66
|
+
Requires-Dist: pyspark>=3.4; extra == 'spark'
|
|
67
|
+
Description-Content-Type: text/markdown
|
|
68
|
+
|
|
69
|
+
# featkit
|
|
70
|
+
|
|
71
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
72
|
+
|
|
73
|
+
It implements a three-layer architecture:
|
|
74
|
+
|
|
75
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
76
|
+
- **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B), optionally extended with ratio features (2C)
|
|
77
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
78
|
+
|
|
79
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
80
|
+
|
|
81
|
+
## Key concepts
|
|
82
|
+
|
|
83
|
+
| Layer | What it does |
|
|
84
|
+
|---|---|
|
|
85
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
86
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
87
|
+
| Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install featkit
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Quickstart
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from featkit import FeatureStorePipeline, FeatureStoreConfig
|
|
99
|
+
from featkit.dataset import SimpleDataset
|
|
100
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
101
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
102
|
+
from featkit.generators.sql import SnowflakeSQLCodeGenerator
|
|
103
|
+
|
|
104
|
+
# Define schema
|
|
105
|
+
fields = [
|
|
106
|
+
IDField(name="ID_CLIENTE"),
|
|
107
|
+
TimeField(name="PERIODO",
|
|
108
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
109
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
110
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
111
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
112
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
113
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
114
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
115
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
dataset = SimpleDataset(
|
|
119
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
120
|
+
fields=fields,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
config = FeatureStoreConfig(
|
|
124
|
+
dataset=dataset,
|
|
125
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
126
|
+
output_table_prefix="FS",
|
|
127
|
+
time_windows=[3, 6, 9, 12],
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
131
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
132
|
+
|
|
133
|
+
output.save("./output")
|
|
134
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Feature naming anatomy
|
|
138
|
+
|
|
139
|
+
Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
|
|
140
|
+
|
|
141
|
+
There are four families of features, each with its own naming pattern.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
### Layer 2A — Pivot features
|
|
146
|
+
|
|
147
|
+
**Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
|
|
148
|
+
|
|
149
|
+
| Segment | Source | Example |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
|
|
152
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
|
|
153
|
+
| `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
|
|
154
|
+
|
|
155
|
+
The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
|
|
156
|
+
|
|
157
|
+
| Measurement type | Semantic meaning | Valid `AGG` values |
|
|
158
|
+
|---|---|---|
|
|
159
|
+
| `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
|
|
160
|
+
| `CANTIDAD` | Count / quantity | `SUM` |
|
|
161
|
+
| `TICKET` | Average ticket size | `AVG` |
|
|
162
|
+
| `FLAG` | Binary indicator | `MAX` |
|
|
163
|
+
| `FECHA` | Date / timestamp | `MAX`, `MIN` |
|
|
164
|
+
| `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
|
|
165
|
+
| `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
|
|
166
|
+
| `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
|
|
167
|
+
|
|
168
|
+
Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely; a feature whose name lacks a given field therefore aggregates over all values of that dimension.
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
SUM__MTO # global — all sectors, all channels
|
|
172
|
+
SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
|
|
173
|
+
SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
|
|
174
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
|
|
175
|
+
SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
### Layer 2B — Distributional features
|
|
181
|
+
|
|
182
|
+
**Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
|
|
183
|
+
|
|
184
|
+
| Segment | Source | Example |
|
|
185
|
+
|---|---|---|
|
|
186
|
+
| `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
|
|
187
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO` |
|
|
188
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM` |
|
|
189
|
+
| `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
|
|
190
|
+
|
|
191
|
+
These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
|
|
192
|
+
|
|
193
|
+
| Metric | What it measures |
|
|
194
|
+
|---|---|
|
|
195
|
+
| `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
|
|
196
|
+
| `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
|
|
197
|
+
| `DOMINANT_PROPORTION` | Share of the most common category value |
|
|
198
|
+
| `MODE` | The most frequent category value (output type: categorical) |
|
|
199
|
+
| `COUNT` | Number of distinct observed values |
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
|
|
203
|
+
SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
|
|
204
|
+
CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
### Layer 2C — Ratio features
|
|
210
|
+
|
|
211
|
+
**Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
|
|
212
|
+
|
|
213
|
+
where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
|
|
214
|
+
|
|
215
|
+
The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
|
|
216
|
+
|
|
217
|
+
Ratio features are enabled by setting `include_ratios=True` (which requires `include_marginals=True`). The `ratio_mode` parameter controls which denominators are paired with each numerator:
|
|
218
|
+
|
|
219
|
+
| `ratio_mode` | Denominators considered | Ratios produced per numerator |
|
|
220
|
+
|---|---|---|
|
|
221
|
+
| `RatioMode.ALL_PROJECTIONS` *(default)* | Every proper marginal projection (partial or fully marginalised) | One per valid denominator |
|
|
222
|
+
| `RatioMode.GLOBAL_TOTAL` | Only the fully-marginalised column (all fields ∅) | Exactly one — the share of the grand total |
|
|
223
|
+
|
|
224
|
+
```
|
|
225
|
+
# With RatioMode.ALL_PROJECTIONS (default):
|
|
226
|
+
# Numerator: DIGITAL channel + RETAIL sector → three denominators
|
|
227
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL # share within RETAIL
|
|
228
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL # share within DIGITAL
|
|
229
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total
|
|
230
|
+
|
|
231
|
+
# With RatioMode.GLOBAL_TOTAL:
|
|
232
|
+
# Same numerator → only the grand-total denominator
|
|
233
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total only
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
### Layer 3 — Temporal features
|
|
239
|
+
|
|
240
|
+
**Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
|
|
241
|
+
|
|
242
|
+
`L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
|
|
243
|
+
|
|
244
|
+
| Segment | Source | Notes |
|
|
245
|
+
|---|---|---|
|
|
246
|
+
| `OPERATOR` | `TemporalOperator` enum | See table below |
|
|
247
|
+
| `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
|
|
248
|
+
| `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
|
|
249
|
+
|
|
250
|
+
#### Temporal operators
|
|
251
|
+
|
|
252
|
+
| Operator | Type | Description |
|
|
253
|
+
|---|---|---|
|
|
254
|
+
| `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
|
|
255
|
+
| `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
|
|
256
|
+
| `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
|
|
257
|
+
| `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
|
|
258
|
+
| `MIN_U` | Windowed | Minimum value observed in the window |
|
|
259
|
+
| `MAX_U` | Windowed | Maximum value observed in the window |
|
|
260
|
+
| `CREC` | Windowed | Growth rate across the window |
|
|
261
|
+
| `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
|
|
262
|
+
| `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active on every single month in the window) |
|
|
263
|
+
| `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
|
|
264
|
+
| `RATIO` | Windowed (composed) | Ratio of two sub-windows |
|
|
265
|
+
| `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
|
|
266
|
+
| `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
|
|
267
|
+
| `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
|
|
268
|
+
|
|
269
|
+
#### Valid operators per Layer 2 output type
|
|
270
|
+
|
|
271
|
+
| Output type | Valid operators |
|
|
272
|
+
|---|---|
|
|
273
|
+
| `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
|
|
274
|
+
| `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
|
|
275
|
+
| `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
|
|
276
|
+
| `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
|
|
277
|
+
|
|
278
|
+
#### Examples
|
|
279
|
+
|
|
280
|
+
```
|
|
281
|
+
# Average monthly amount for CANAL=DIGITAL and SECTOR=RETAIL over the last 6 months
|
|
282
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
|
|
283
|
+
|
|
284
|
+
# Total transaction count for the RETAIL sector over the last 3 months (CANTIDAD → only SUM valid)
|
|
285
|
+
SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
|
|
286
|
+
|
|
287
|
+
# Most recent value of the CANAL entropy (by amount)
|
|
288
|
+
CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
|
|
289
|
+
|
|
290
|
+
# Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
|
|
291
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
|
|
292
|
+
|
|
293
|
+
# Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
|
|
294
|
+
CANAL__MTO__SUM__MODE__REC__BACKWARD
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
---
|
|
298
|
+
|
|
299
|
+
### Quick-reference: full name structure
|
|
300
|
+
|
|
301
|
+
```
|
|
302
|
+
┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
|
|
303
|
+
│ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
|
|
304
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
305
|
+
|
|
306
|
+
┌─ Layer 2B distributional ─────────────────────────────────────────┐
|
|
307
|
+
│ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
|
|
308
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
309
|
+
|
|
310
|
+
┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
|
|
311
|
+
│ {Layer 2A name} __over__ {Layer 2A name} │
|
|
312
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
313
|
+
|
|
314
|
+
┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
|
|
315
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
|
|
316
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
317
|
+
|
|
318
|
+
┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
|
|
319
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
|
|
320
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## Architecture
|
|
324
|
+
|
|
325
|
+
See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
|
|
326
|
+
|
|
327
|
+
## License
|
|
328
|
+
|
|
329
|
+
MIT
|
featkit-0.4.3/README.md
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# featkit
|
|
2
|
+
|
|
3
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
4
|
+
|
|
5
|
+
It implements a three-layer architecture:
|
|
6
|
+
|
|
7
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
8
|
+
- **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B), optionally extended with ratio features (2C)
|
|
9
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
10
|
+
|
|
11
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
12
|
+
|
|
13
|
+
## Key concepts
|
|
14
|
+
|
|
15
|
+
| Layer | What it does |
|
|
16
|
+
|---|---|
|
|
17
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
18
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
19
|
+
| Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install featkit
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from featkit import FeatureStorePipeline, FeatureStoreConfig
|
|
31
|
+
from featkit.dataset import SimpleDataset
|
|
32
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
33
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
34
|
+
from featkit.generators.sql import SnowflakeSQLCodeGenerator
|
|
35
|
+
|
|
36
|
+
# Define schema
|
|
37
|
+
fields = [
|
|
38
|
+
IDField(name="ID_CLIENTE"),
|
|
39
|
+
TimeField(name="PERIODO",
|
|
40
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
41
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
42
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
43
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
44
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
45
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
46
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
47
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
dataset = SimpleDataset(
|
|
51
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
52
|
+
fields=fields,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
config = FeatureStoreConfig(
|
|
56
|
+
dataset=dataset,
|
|
57
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
58
|
+
output_table_prefix="FS",
|
|
59
|
+
time_windows=[3, 6, 9, 12],
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
63
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
64
|
+
|
|
65
|
+
output.save("./output")
|
|
66
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Feature naming anatomy
|
|
70
|
+
|
|
71
|
+
Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
|
|
72
|
+
|
|
73
|
+
There are four families of features, each with its own naming pattern.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
### Layer 2A — Pivot features
|
|
78
|
+
|
|
79
|
+
**Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
|
|
80
|
+
|
|
81
|
+
| Segment | Source | Example |
|
|
82
|
+
|---|---|---|
|
|
83
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
|
|
84
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
|
|
85
|
+
| `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
|
|
86
|
+
|
|
87
|
+
The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
|
|
88
|
+
|
|
89
|
+
| Measurement type | Semantic meaning | Valid `AGG` values |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
|
|
92
|
+
| `CANTIDAD` | Count / quantity | `SUM` |
|
|
93
|
+
| `TICKET` | Average ticket size | `AVG` |
|
|
94
|
+
| `FLAG` | Binary indicator | `MAX` |
|
|
95
|
+
| `FECHA` | Date / timestamp | `MAX`, `MIN` |
|
|
96
|
+
| `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
|
|
97
|
+
| `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
|
|
98
|
+
| `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
|
|
99
|
+
|
|
100
|
+
Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the name implicitly aggregates over all values of that dimension.
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
SUM__MTO # global — all sectors, all channels
|
|
104
|
+
SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
|
|
105
|
+
SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
|
|
106
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
|
|
107
|
+
SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
### Layer 2B — Distributional features
|
|
113
|
+
|
|
114
|
+
**Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
|
|
115
|
+
|
|
116
|
+
| Segment | Source | Example |
|
|
117
|
+
|---|---|---|
|
|
118
|
+
| `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
|
|
119
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO` |
|
|
120
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM` |
|
|
121
|
+
| `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
|
|
122
|
+
|
|
123
|
+
These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
|
|
124
|
+
|
|
125
|
+
| Metric | What it measures |
|
|
126
|
+
|---|---|
|
|
127
|
+
| `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
|
|
128
|
+
| `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
|
|
129
|
+
| `DOMINANT_PROPORTION` | Share of the most common category value |
|
|
130
|
+
| `MODE` | The most frequent category value (output type: categorical) |
|
|
131
|
+
| `COUNT` | Number of distinct observed values |
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
|
|
135
|
+
SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
|
|
136
|
+
CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
### Layer 2C — Ratio features
|
|
142
|
+
|
|
143
|
+
**Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
|
|
144
|
+
|
|
145
|
+
where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
|
|
146
|
+
|
|
147
|
+
The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
|
|
148
|
+
|
|
149
|
+
Enabled by setting `include_ratios=True` (requires `include_marginals=True`). The `ratio_mode` parameter controls which denominators are paired with each numerator:
|
|
150
|
+
|
|
151
|
+
| `ratio_mode` | Denominators considered | Ratios produced per numerator |
|
|
152
|
+
|---|---|---|
|
|
153
|
+
| `RatioMode.ALL_PROJECTIONS` *(default)* | Every proper marginal projection (partial or fully marginalised) | One per valid denominator |
|
|
154
|
+
| `RatioMode.GLOBAL_TOTAL` | Only the fully-marginalised column (all fields ∅) | Exactly one — the share of the grand total |
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
# With RatioMode.ALL_PROJECTIONS (default):
|
|
158
|
+
# Numerator: DIGITAL channel + RETAIL sector → three denominators
|
|
159
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL # share within RETAIL
|
|
160
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL # share within DIGITAL
|
|
161
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total
|
|
162
|
+
|
|
163
|
+
# With RatioMode.GLOBAL_TOTAL:
|
|
164
|
+
# Same numerator → only the grand-total denominator
|
|
165
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO # share of total only
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
### Layer 3 — Temporal features
|
|
171
|
+
|
|
172
|
+
**Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
|
|
173
|
+
|
|
174
|
+
`L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
|
|
175
|
+
|
|
176
|
+
| Segment | Source | Notes |
|
|
177
|
+
|---|---|---|
|
|
178
|
+
| `OPERATOR` | `TemporalOperator` enum | See table below |
|
|
179
|
+
| `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
|
|
180
|
+
| `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
|
|
181
|
+
|
|
182
|
+
#### Temporal operators
|
|
183
|
+
|
|
184
|
+
| Operator | Type | Description |
|
|
185
|
+
|---|---|---|
|
|
186
|
+
| `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
|
|
187
|
+
| `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
|
|
188
|
+
| `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
|
|
189
|
+
| `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
|
|
190
|
+
| `MIN_U` | Windowed | Minimum value observed in the window |
|
|
191
|
+
| `MAX_U` | Windowed | Maximum value observed in the window |
|
|
192
|
+
| `CREC` | Windowed | Growth rate across the window |
|
|
193
|
+
| `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
|
|
194
|
+
| `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active in every single month of the window) |
|
|
195
|
+
| `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
|
|
196
|
+
| `RATIO` | Windowed (composed) | Ratio of two sub-windows |
|
|
197
|
+
| `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
|
|
198
|
+
| `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
|
|
199
|
+
| `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
|
|
200
|
+
|
|
201
|
+
#### Valid operators per Layer 2 output type
|
|
202
|
+
|
|
203
|
+
| Output type | Valid operators |
|
|
204
|
+
|---|---|
|
|
205
|
+
| `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
|
|
206
|
+
| `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
|
|
207
|
+
| `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
|
|
208
|
+
| `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
|
|
209
|
+
|
|
210
|
+
#### Examples
|
|
211
|
+
|
|
212
|
+
```
|
|
213
|
+
# Mean of the monthly summed amount for CANAL=DIGITAL and SECTOR=RETAIL over the last 6 months
|
|
214
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
|
|
215
|
+
|
|
216
|
+
# Total transaction sum for RETAIL sector in the last 3 months (CANTIDAD → only SUM valid)
|
|
217
|
+
SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
|
|
218
|
+
|
|
219
|
+
# Most recent value of the CANAL entropy (by amount)
|
|
220
|
+
CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
|
|
221
|
+
|
|
222
|
+
# Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
|
|
223
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
|
|
224
|
+
|
|
225
|
+
# Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
|
|
226
|
+
CANAL__MTO__SUM__MODE__REC__BACKWARD
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
### Quick-reference: full name structure
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
|
|
235
|
+
│ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
|
|
236
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
237
|
+
|
|
238
|
+
┌─ Layer 2B distributional ─────────────────────────────────────────┐
|
|
239
|
+
│ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
|
|
240
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
241
|
+
|
|
242
|
+
┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
|
|
243
|
+
│ {Layer 2A name} __over__ {Layer 2A name} │
|
|
244
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
245
|
+
|
|
246
|
+
┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
|
|
247
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
|
|
248
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
249
|
+
|
|
250
|
+
┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
|
|
251
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
|
|
252
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## Architecture
|
|
256
|
+
|
|
257
|
+
See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
|
|
258
|
+
|
|
259
|
+
## License
|
|
260
|
+
|
|
261
|
+
MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "featkit"
|
|
7
|
-
version = "0.4.1"
|
|
7
|
+
version = "0.4.3"
|
|
8
8
|
description = "featkit — automated feature store generation from relational facts tables"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -69,7 +69,7 @@ target-version = "py310"
|
|
|
69
69
|
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
70
70
|
|
|
71
71
|
[tool.mypy]
|
|
72
|
-
python_version = "3.
|
|
72
|
+
python_version = "3.12"
|
|
73
73
|
strict = true
|
|
74
74
|
ignore_missing_imports = true
|
|
75
75
|
|