featkit 0.3.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featkit-0.4.2/.github/workflows/auto-tag.yml +54 -0
- featkit-0.4.2/CHANGELOG.md +42 -0
- featkit-0.4.2/PKG-INFO +322 -0
- featkit-0.4.2/README.md +254 -0
- {featkit-0.3.0 → featkit-0.4.2}/pyproject.toml +3 -2
- featkit-0.4.2/src/featkit/builders/ratio_space.py +102 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/builders/temporal_space.py +3 -3
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/config.py +7 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/domain_resolver.py +19 -2
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/pyspark/databricks.py +5 -6
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/base.py +32 -20
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer2/base.py +36 -26
- featkit-0.4.2/src/featkit/layer2/ratio.py +101 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer3/temporal_feature.py +3 -3
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/pipeline.py +10 -2
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_contracts.py +1 -1
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_execution/test_domain_resolver.py +56 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/test_sql_snowflake.py +53 -0
- featkit-0.4.2/tests/test_ratio.py +389 -0
- featkit-0.3.0/CHANGELOG.md +0 -25
- featkit-0.3.0/PKG-INFO +0 -143
- featkit-0.3.0/README.md +0 -75
- {featkit-0.3.0 → featkit-0.4.2}/.github/workflows/ci.yml +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/.github/workflows/docs.yml +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/.github/workflows/publish.yml +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/.gitignore +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/LICENSE +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/example_databricks_notebook.md +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/examples.md +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/general_plan.md +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/index.md +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/docs/quickstart.md +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/mkdocs.yml +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/builders/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/builders/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/builders/distributional_space.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/builders/pivot_space.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/measurement/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/measurement/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/measurement/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/measurement/defaults.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/output/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/output/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/output/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/contracts/output/defaults.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/dataset/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/dataset/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/dataset/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/enums.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/databricks_adapter.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/databricks_notebook_adapter.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/mock_adapter.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/spark_adapter.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/execution/adapters/sqlalchemy_adapter.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/categorical_field.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/id_field.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/measurement_field.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/fields/time_field.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/output.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/pyspark/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/pyspark/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/databricks.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/snowflake.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/generators/sql/spark_sql.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer2/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer2/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer2/distributional.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer2/pivoted.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer3/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/src/featkit/layer3/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_builders.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_enums.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_execution/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_execution/test_adapters.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_fields.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/.gitkeep +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/__init__.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/test_base.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/test_pyspark.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_generators/test_sql_databricks.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_integration.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_layer2.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_layer3.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_output_contracts.py +0 -0
- {featkit-0.3.0 → featkit-0.4.2}/tests/test_pipeline.py +0 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
name: Auto-tag on version bump
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
paths:
|
|
8
|
+
- "pyproject.toml"
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
tag:
|
|
12
|
+
name: Create version tag
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Ensure RELEASE_TOKEN is configured
|
|
17
|
+
env:
|
|
18
|
+
RELEASE_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
|
19
|
+
run: |
|
|
20
|
+
if [ -z "$RELEASE_TOKEN" ]; then
|
|
21
|
+
echo "RELEASE_TOKEN secret is not set. Add it (PAT with contents:read/write) so tag pushes can trigger publish.yml." >&2
|
|
22
|
+
exit 1
|
|
23
|
+
fi
|
|
24
|
+
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
with:
|
|
27
|
+
fetch-depth: 0
|
|
28
|
+
# A PAT is required so the tag push triggers downstream workflows
|
|
29
|
+
# (pushes made with GITHUB_TOKEN are intentionally excluded from
|
|
30
|
+
# workflow triggers by GitHub to prevent infinite loops).
|
|
31
|
+
token: ${{ secrets.RELEASE_TOKEN }}
|
|
32
|
+
|
|
33
|
+
- name: Read version from pyproject.toml
|
|
34
|
+
id: version
|
|
35
|
+
run: |
|
|
36
|
+
VERSION=$(grep '^version = ' pyproject.toml | head -1 | sed 's/version = "\(.*\)"/\1/')
|
|
37
|
+
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
|
38
|
+
|
|
39
|
+
- name: Check if tag exists
|
|
40
|
+
id: tag_check
|
|
41
|
+
run: |
|
|
42
|
+
if git rev-parse "v${{ steps.version.outputs.version }}" >/dev/null 2>&1; then
|
|
43
|
+
echo "exists=true" >> $GITHUB_OUTPUT
|
|
44
|
+
else
|
|
45
|
+
echo "exists=false" >> $GITHUB_OUTPUT
|
|
46
|
+
fi
|
|
47
|
+
|
|
48
|
+
- name: Create and push tag
|
|
49
|
+
if: steps.tag_check.outputs.exists == 'false'
|
|
50
|
+
run: |
|
|
51
|
+
git config user.name "github-actions[bot]"
|
|
52
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
53
|
+
git tag "v${{ steps.version.outputs.version }}"
|
|
54
|
+
git push origin "v${{ steps.version.outputs.version }}"
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.4.2] - 2026-06-30
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- `FREQ` operator now counts only periods where the value is non-null **and strictly greater than 0** (previously counted any non-null value).
|
|
14
|
+
- `XM` operator now returns `1` only when **every** period in the time window has a non-null and strictly positive value, `0` otherwise (previously returned a raw count identical to FREQ). Both the SQL and PySpark generators are updated.
|
|
15
|
+
|
|
16
|
+
## [0.4.1] - 2026-06-09
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
- CI: auto-tag workflow now uses a PAT (`RELEASE_TOKEN`) to push tags so that `publish.yml` is triggered correctly (`fix(ci)`)
|
|
20
|
+
|
|
21
|
+
## [0.4.0] - 2026-06-09
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
- Ratio/percentage features (`RatioPivotedColumn`, `RatioSpaceBuilder`): for every pivot combination with at least one non-`None` categorical value, a `numerator / NULLIF(denominator, 0)` column is generated for each proper marginal projection of that combination. Controlled by `FeatureStoreConfig.include_ratios` (default `True`, requires `include_marginals=True`). (`feat(ratio)`)
|
|
25
|
+
- `verbose` parameter on `AdapterDomainResolver` and `AdapterCombinationResolver`: when `True`, the generated `SELECT DISTINCT` SQL is emitted at `DEBUG` level before execution. `FeatureStorePipeline` forwards `cfg.verbose` to the combination resolver automatically. (`feat(domain-resolver)`)
|
|
26
|
+
|
|
27
|
+
## [0.3.0] - 2026-06-08
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
- `AdapterCombinationResolver` — replaces per-field `SELECT DISTINCT` queries with a single multi-column query returning only observed combinations (`feat(builders)`)
|
|
31
|
+
- `verbose` logging option on `PivotSpaceBuilder`, `DistributionalSpaceBuilder`, and `TemporalSpaceBuilder`, configurable via `FeatureStoreConfig` (`feat(config)`)
|
|
32
|
+
|
|
33
|
+
### Fixed
|
|
34
|
+
- Marginal fields no longer contribute their name to pivot column names; e.g. `SUM__amount__channel__region_north` → `SUM__amount__region_north` (`fix(layer2)`)
|
|
35
|
+
|
|
36
|
+
## [0.2.0] - 2026-06-02
|
|
37
|
+
|
|
38
|
+
### Added
|
|
39
|
+
- Execution layer with adapter-based domain resolution (`feat(execution)`)
|
|
40
|
+
|
|
41
|
+
### Fixed
|
|
42
|
+
- Lazy-import `AdapterDomainResolver`; added `pandas` to dev dependencies
|
featkit-0.4.2/PKG-INFO
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featkit
|
|
3
|
+
Version: 0.4.2
|
|
4
|
+
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
|
+
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
|
+
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
7
|
+
Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
|
|
9
|
+
Author: Mirko
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Mirko
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
|
|
33
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Requires-Dist: sqlglot>=23.0
|
|
46
|
+
Provides-Extra: databricks
|
|
47
|
+
Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pandas>=1.5; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
56
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
57
|
+
Provides-Extra: docs
|
|
58
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
60
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
61
|
+
Provides-Extra: execution
|
|
62
|
+
Requires-Dist: pandas>=1.5; extra == 'execution'
|
|
63
|
+
Provides-Extra: ibis
|
|
64
|
+
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
65
|
+
Provides-Extra: spark
|
|
66
|
+
Requires-Dist: pyspark>=3.4; extra == 'spark'
|
|
67
|
+
Description-Content-Type: text/markdown
|
|
68
|
+
|
|
69
|
+
# featkit
|
|
70
|
+
|
|
71
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
72
|
+
|
|
73
|
+
It implements a three-layer architecture:
|
|
74
|
+
|
|
75
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
76
|
+
- **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and ratio features (2C)
|
|
77
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
78
|
+
|
|
79
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
80
|
+
|
|
81
|
+
## Key concepts
|
|
82
|
+
|
|
83
|
+
| Layer | What it does |
|
|
84
|
+
|---|---|
|
|
85
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
86
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
87
|
+
| Layer 3 — Temporal | Sliding-window and point-in-time operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install featkit
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Quickstart
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from featkit import FeatureStorePipeline, FeatureStoreConfig
|
|
99
|
+
from featkit.dataset import SimpleDataset
|
|
100
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
101
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
102
|
+
from featkit.generators.sql import SnowflakeSQLCodeGenerator
|
|
103
|
+
|
|
104
|
+
# Define schema
|
|
105
|
+
fields = [
|
|
106
|
+
IDField(name="ID_CLIENTE"),
|
|
107
|
+
TimeField(name="PERIODO",
|
|
108
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
109
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
110
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
111
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
112
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
113
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
114
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
115
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
dataset = SimpleDataset(
|
|
119
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
120
|
+
fields=fields,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
config = FeatureStoreConfig(
|
|
124
|
+
dataset=dataset,
|
|
125
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
126
|
+
output_table_prefix="FS",
|
|
127
|
+
time_windows=[3, 6, 9, 12],
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
131
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
132
|
+
|
|
133
|
+
output.save("./output")
|
|
134
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Feature naming anatomy
|
|
138
|
+
|
|
139
|
+
Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
|
|
140
|
+
|
|
141
|
+
There are four families of features, each with its own naming pattern.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
### Layer 2A — Pivot features
|
|
146
|
+
|
|
147
|
+
**Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
|
|
148
|
+
|
|
149
|
+
| Segment | Source | Example |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
|
|
152
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
|
|
153
|
+
| `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
|
|
154
|
+
|
|
155
|
+
The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
|
|
156
|
+
|
|
157
|
+
| Measurement type | Semantic meaning | Valid `AGG` values |
|
|
158
|
+
|---|---|---|
|
|
159
|
+
| `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
|
|
160
|
+
| `CANTIDAD` | Count / quantity | `SUM` |
|
|
161
|
+
| `TICKET` | Average ticket size | `AVG` |
|
|
162
|
+
| `FLAG` | Binary indicator | `MAX` |
|
|
163
|
+
| `FECHA` | Date / timestamp | `MAX`, `MIN` |
|
|
164
|
+
| `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
|
|
165
|
+
| `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
|
|
166
|
+
| `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
|
|
167
|
+
|
|
168
|
+
Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so a feature whose name lacks a dimension implicitly aggregates over all values of that dimension.
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
SUM__MTO # global — all sectors, all channels
|
|
172
|
+
SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
|
|
173
|
+
SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
|
|
174
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
|
|
175
|
+
SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
### Layer 2B — Distributional features
|
|
181
|
+
|
|
182
|
+
**Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
|
|
183
|
+
|
|
184
|
+
| Segment | Source | Example |
|
|
185
|
+
|---|---|---|
|
|
186
|
+
| `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
|
|
187
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO` |
|
|
188
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM` |
|
|
189
|
+
| `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
|
|
190
|
+
|
|
191
|
+
These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
|
|
192
|
+
|
|
193
|
+
| Metric | What it measures |
|
|
194
|
+
|---|---|
|
|
195
|
+
| `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
|
|
196
|
+
| `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
|
|
197
|
+
| `DOMINANT_PROPORTION` | Share of the most common category value |
|
|
198
|
+
| `MODE` | The most frequent category value (output type: categorical) |
|
|
199
|
+
| `COUNT` | Number of distinct observed values |
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
|
|
203
|
+
SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
|
|
204
|
+
CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
### Layer 2C — Ratio features
|
|
210
|
+
|
|
211
|
+
**Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
|
|
212
|
+
|
|
213
|
+
where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
|
|
214
|
+
|
|
215
|
+
The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
|
|
216
|
+
|
|
217
|
+
```
|
|
218
|
+
# Numerator: DIGITAL channel + RETAIL sector
|
|
219
|
+
# Denominator: RETAIL sector only (CANAL marginalized → share of DIGITAL within RETAIL)
|
|
220
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL
|
|
221
|
+
|
|
222
|
+
# Denominator: DIGITAL channel only (SECTOR marginalized → share of RETAIL within DIGITAL)
|
|
223
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL
|
|
224
|
+
|
|
225
|
+
# Denominator: global total (both marginalized → share of DIGITAL/RETAIL in total portfolio)
|
|
226
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
### Layer 3 — Temporal features
|
|
232
|
+
|
|
233
|
+
**Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
|
|
234
|
+
|
|
235
|
+
`L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
|
|
236
|
+
|
|
237
|
+
| Segment | Source | Notes |
|
|
238
|
+
|---|---|---|
|
|
239
|
+
| `OPERATOR` | `TemporalOperator` enum | See table below |
|
|
240
|
+
| `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
|
|
241
|
+
| `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
|
|
242
|
+
|
|
243
|
+
#### Temporal operators
|
|
244
|
+
|
|
245
|
+
| Operator | Type | Description |
|
|
246
|
+
|---|---|---|
|
|
247
|
+
| `PROM_U` | Windowed | Arithmetic mean of the per-period values over the window — each period contributes equally regardless of its volume |
|
|
248
|
+
| `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
|
|
249
|
+
| `SUM_U` | Windowed | Unweighted sum of the per-period values over the window |
|
|
250
|
+
| `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
|
|
251
|
+
| `MIN_U` | Windowed | Minimum value observed in the window |
|
|
252
|
+
| `MAX_U` | Windowed | Maximum value observed in the window |
|
|
253
|
+
| `CREC` | Windowed | Growth rate across the window |
|
|
254
|
+
| `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
|
|
255
|
+
| `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active on every single month in the window) |
|
|
256
|
+
| `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
|
|
257
|
+
| `RATIO` | Windowed (composed) | Ratio of two sub-windows |
|
|
258
|
+
| `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
|
|
259
|
+
| `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
|
|
260
|
+
| `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
|
|
261
|
+
|
|
262
|
+
#### Valid operators per Layer 2 output type
|
|
263
|
+
|
|
264
|
+
| Output type | Valid operators |
|
|
265
|
+
|---|---|
|
|
266
|
+
| `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
|
|
267
|
+
| `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
|
|
268
|
+
| `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
|
|
269
|
+
| `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
|
|
270
|
+
|
|
271
|
+
#### Examples
|
|
272
|
+
|
|
273
|
+
```
|
|
274
|
+
# Average of the per-period summed amount (DIGITAL channel, RETAIL sector) over the last 6 months
|
|
275
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
|
|
276
|
+
|
|
277
|
+
# Total transaction sum for RETAIL sector in the last 3 months (CANTIDAD → only SUM valid)
|
|
278
|
+
SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
|
|
279
|
+
|
|
280
|
+
# Most recent value of the CANAL entropy (by amount)
|
|
281
|
+
CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
|
|
282
|
+
|
|
283
|
+
# Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
|
|
284
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
|
|
285
|
+
|
|
286
|
+
# Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
|
|
287
|
+
CANAL__MTO__SUM__MODE__REC__BACKWARD
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
|
|
292
|
+
### Quick-reference: full name structure
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
|
|
296
|
+
│ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
|
|
297
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
298
|
+
|
|
299
|
+
┌─ Layer 2B distributional ─────────────────────────────────────────┐
|
|
300
|
+
│ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
|
|
301
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
302
|
+
|
|
303
|
+
┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
|
|
304
|
+
│ {Layer 2A name} __over__ {Layer 2A name} │
|
|
305
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
306
|
+
|
|
307
|
+
┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
|
|
308
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
|
|
309
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
310
|
+
|
|
311
|
+
┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
|
|
312
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
|
|
313
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
## Architecture
|
|
317
|
+
|
|
318
|
+
See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
|
|
319
|
+
|
|
320
|
+
## License
|
|
321
|
+
|
|
322
|
+
MIT
|
featkit-0.4.2/README.md
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# featkit
|
|
2
|
+
|
|
3
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
4
|
+
|
|
5
|
+
It implements a three-layer architecture:
|
|
6
|
+
|
|
7
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
8
|
+
- **Layer 2** — horizontal concept table built via pivot (2A), distributional aggregations (2B), and ratio features (2C)
|
|
9
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
10
|
+
|
|
11
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
12
|
+
|
|
13
|
+
## Key concepts
|
|
14
|
+
|
|
15
|
+
| Layer | What it does |
|
|
16
|
+
|---|---|
|
|
17
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
18
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
19
|
+
| Layer 3 — Temporal | Sliding-window and point-in-time operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install featkit
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from featkit import FeatureStorePipeline, FeatureStoreConfig
|
|
31
|
+
from featkit.dataset import SimpleDataset
|
|
32
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
33
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
34
|
+
from featkit.generators.sql import SnowflakeSQLCodeGenerator
|
|
35
|
+
|
|
36
|
+
# Define schema
|
|
37
|
+
fields = [
|
|
38
|
+
IDField(name="ID_CLIENTE"),
|
|
39
|
+
TimeField(name="PERIODO",
|
|
40
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
41
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
42
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
43
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
44
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
45
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
46
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
47
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
dataset = SimpleDataset(
|
|
51
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
52
|
+
fields=fields,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
config = FeatureStoreConfig(
|
|
56
|
+
dataset=dataset,
|
|
57
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
58
|
+
output_table_prefix="FS",
|
|
59
|
+
time_windows=[3, 6, 9, 12],
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
63
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
64
|
+
|
|
65
|
+
output.save("./output")
|
|
66
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Feature naming anatomy
|
|
70
|
+
|
|
71
|
+
Every feature produced by featkit has a deterministic, human-readable name built from fixed segments separated by `__` (double underscore). Understanding the segments lets you decode any feature name without looking at the code.
|
|
72
|
+
|
|
73
|
+
There are four families of features, each with its own naming pattern.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
### Layer 2A — Pivot features
|
|
78
|
+
|
|
79
|
+
**Pattern:** `{AGG}__{MEASUREMENT}[__{FIELD}_{VALUE}…]`
|
|
80
|
+
|
|
81
|
+
| Segment | Source | Example |
|
|
82
|
+
|---|---|---|
|
|
83
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM`, `COUNT`, `AVG`, `MIN`, `MAX` |
|
|
84
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO`, `TRX` |
|
|
85
|
+
| `FIELD_VALUE` | `CategoricalField.name` + `_` + value, one per non-marginal field, sorted alphabetically by field name | `CANAL_DIGITAL`, `SECTOR_RETAIL` |
|
|
86
|
+
|
|
87
|
+
The valid aggregators for each `MEASUREMENT` depend on its `MeasurementType`. Only contract-permitted aggregator–measurement combinations are generated.
|
|
88
|
+
|
|
89
|
+
| Measurement type | Semantic meaning | Valid `AGG` values |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| `MONTO` | Monetary amount | `SUM`, `MAX`, `MIN`, `AVG` |
|
|
92
|
+
| `CANTIDAD` | Count / quantity | `SUM` |
|
|
93
|
+
| `TICKET` | Average ticket size | `AVG` |
|
|
94
|
+
| `FLAG` | Binary indicator | `MAX` |
|
|
95
|
+
| `FECHA` | Date / timestamp | `MAX`, `MIN` |
|
|
96
|
+
| `BALANCE` | Point-in-time balance | `MAX`, `MIN`, `AVG` |
|
|
97
|
+
| `TIME_DIFF` | Duration / elapsed time | `SUM`, `AVG`, `MAX`, `MIN` |
|
|
98
|
+
| `ESTADISTICO` | Generic statistic | `SUM`, `AVG`, `MAX`, `MIN`, `COUNT` |
|
|
99
|
+
|
|
100
|
+
Categorical fields set to the **∅ marginal** (no filter on that dimension) are omitted from the name entirely, so the feature implicitly aggregates over all values of that dimension.
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
SUM__MTO # global — all sectors, all channels
|
|
104
|
+
SUM__MTO__CANAL_DIGITAL # CANAL=DIGITAL, marginal over SECTOR
|
|
105
|
+
SUM__MTO__SECTOR_RETAIL # SECTOR=RETAIL, marginal over CANAL
|
|
106
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL # CANAL=DIGITAL and SECTOR=RETAIL (alphabetical order)
|
|
107
|
+
SUM__TRX__CANAL_PRESENCIAL # sum of TRX (CANTIDAD → only SUM is valid) for PRESENCIAL channel
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
### Layer 2B — Distributional features
|
|
113
|
+
|
|
114
|
+
**Pattern:** `{CATEGORICAL}__{MEASUREMENT}__{AGG}__{METRIC}`
|
|
115
|
+
|
|
116
|
+
| Segment | Source | Example |
|
|
117
|
+
|---|---|---|
|
|
118
|
+
| `CATEGORICAL` | `CategoricalField.name` | `CANAL`, `SECTOR` |
|
|
119
|
+
| `MEASUREMENT` | `MeasurementField.name` | `MTO` |
|
|
120
|
+
| `AGG` | `Layer2Aggregator` enum | `SUM` |
|
|
121
|
+
| `METRIC` | `DistributionalMetric` enum | `ENTROPY`, `HHI`, `DOMINANT_PROPORTION`, `MODE`, `COUNT` |
|
|
122
|
+
|
|
123
|
+
These columns capture the shape of the value distribution of a categorical field, weighted by the aggregated measurement.
|
|
124
|
+
|
|
125
|
+
| Metric | What it measures |
|
|
126
|
+
|---|---|
|
|
127
|
+
| `ENTROPY` | Shannon entropy of the category distribution — higher means more uniform spread |
|
|
128
|
+
| `HHI` | Herfindahl-Hirschman Index — concentration; higher means more dominated by one value |
|
|
129
|
+
| `DOMINANT_PROPORTION` | Share of the most common category value |
|
|
130
|
+
| `MODE` | The most frequent category value (output type: categorical) |
|
|
131
|
+
| `COUNT` | Number of distinct observed values |
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
CANAL__MTO__SUM__ENTROPY # entropy of channel distribution by amount
|
|
135
|
+
SECTOR__TRX__SUM__HHI # HHI of sector distribution by transaction count (CANTIDAD → only SUM)
|
|
136
|
+
CANAL__MTO__SUM__MODE # dominant channel by amount (categorical output)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
### Layer 2C — Ratio features
|
|
142
|
+
|
|
143
|
+
**Pattern:** `{NUMERATOR}__over__{DENOMINATOR}`
|
|
144
|
+
|
|
145
|
+
where `NUMERATOR` and `DENOMINATOR` are full Layer 2A pivot feature names. The denominator is always a **proper marginal projection** of the numerator: it has at least one categorical dimension set to ∅ that is non-∅ in the numerator, and no contradicting values.
|
|
146
|
+
|
|
147
|
+
The underlying value is `numerator / NULLIF(denominator, 0)` computed per entity per period.
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
# Numerator: DIGITAL channel + RETAIL sector
|
|
151
|
+
# Denominator: RETAIL sector only (CANAL marginalized → share of DIGITAL within RETAIL)
|
|
152
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__SECTOR_RETAIL
|
|
153
|
+
|
|
154
|
+
# Denominator: DIGITAL channel only (SECTOR marginalized → share of RETAIL within DIGITAL)
|
|
155
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__CANAL_DIGITAL
|
|
156
|
+
|
|
157
|
+
# Denominator: global total (both marginalized → share of DIGITAL/RETAIL in total portfolio)
|
|
158
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
### Layer 3 — Temporal features
|
|
164
|
+
|
|
165
|
+
**Pattern:** `{L2_NAME}__{OPERATOR}__{DIRECTION}[__{WINDOW}]`
|
|
166
|
+
|
|
167
|
+
`L2_NAME` is the full name of any Layer 2A, 2B, or 2C feature. The temporal segments are appended at the end.
|
|
168
|
+
|
|
169
|
+
| Segment | Source | Notes |
|
|
170
|
+
|---|---|---|
|
|
171
|
+
| `OPERATOR` | `TemporalOperator` enum | See table below |
|
|
172
|
+
| `DIRECTION` | `TimeWindowDirection` enum | `BACKWARD` or `FORWARD` |
|
|
173
|
+
| `WINDOW` | `window_size` (integer, number of periods) | Omitted for point-in-time operators |
|
|
174
|
+
|
|
175
|
+
#### Temporal operators
|
|
176
|
+
|
|
177
|
+
| Operator | Type | Description |
|
|
178
|
+
|---|---|---|
|
|
179
|
+
| `PROM_U` | Windowed | Arithmetic mean of the monthly values over the window — each period contributes equally regardless of its volume |
|
|
180
|
+
| `PROM_P` | Windowed | Volume-proportional weighted mean — each period's contribution is weighted by its share of the total aggregated value across the window; weights are derived automatically from the data, no user configuration required |
|
|
181
|
+
| `SUM_U` | Windowed | Unweighted sum of the monthly values over the window |
|
|
182
|
+
| `SUM_P` | Windowed | Volume-weighted sum over the window (analogous weighting to `PROM_P`) |
|
|
183
|
+
| `MIN_U` | Windowed | Minimum value observed in the window |
|
|
184
|
+
| `MAX_U` | Windowed | Maximum value observed in the window |
|
|
185
|
+
| `CREC` | Windowed | Growth rate across the window |
|
|
186
|
+
| `FREQ` | Windowed | Count of periods in the window where the value was non-null **and strictly greater than 0** |
|
|
187
|
+
| `XM` | Windowed | `1` if **every** period in the window had a non-null and strictly positive value, `0` otherwise — an all-or-nothing activity indicator (e.g. `1` means the customer was active on every single month in the window) |
|
|
188
|
+
| `MEDIA_ABS` | Windowed (composed) | Mean absolute deviation over the window |
|
|
189
|
+
| `RATIO` | Windowed (composed) | Ratio of two sub-windows |
|
|
190
|
+
| `ULT_MES` | Point-in-time | Value at the most recent period (no window suffix) |
|
|
191
|
+
| `PREV_MES` | Point-in-time | Value at the immediately preceding period (no window suffix) |
|
|
192
|
+
| `REC` | Point-in-time | Recency — periods elapsed since last non-null / non-zero observation (no window suffix) |
|
|
193
|
+
|
|
194
|
+
#### Valid operators per Layer 2 output type
|
|
195
|
+
|
|
196
|
+
| Output type | Valid operators |
|
|
197
|
+
|---|---|
|
|
198
|
+
| `NUMERIC` | `PROM_U`, `PROM_P`, `SUM_U`, `SUM_P`, `MIN_U`, `MAX_U`, `CREC`, `FREQ`, `XM`, `ULT_MES`, `PREV_MES`, `MEDIA_ABS`, `RATIO` |
|
|
199
|
+
| `FLAG` | `ULT_MES`, `PREV_MES`, `FREQ`, `XM`, `REC` |
|
|
200
|
+
| `CATEGORICAL` | `ULT_MES`, `PREV_MES`, `REC` |
|
|
201
|
+
| `TEMPORAL` | `ULT_MES`, `PREV_MES`, `REC`, `MIN_U`, `MAX_U`, `CREC` |
|
|
202
|
+
|
|
203
|
+
#### Examples
|
|
204
|
+
|
|
205
|
+
```
|
|
206
|
+
# Mean of the per-period total amount (CANAL=DIGITAL and SECTOR=RETAIL) over the last 6 months
|
|
207
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__PROM_U__BACKWARD__6
|
|
208
|
+
|
|
209
|
+
# Total transaction count for the RETAIL sector over the last 3 months (CANTIDAD → only SUM valid)
|
|
210
|
+
SUM__TRX__SECTOR_RETAIL__SUM_U__BACKWARD__3
|
|
211
|
+
|
|
212
|
+
# Most recent value of the CANAL entropy (by amount)
|
|
213
|
+
CANAL__MTO__SUM__ENTROPY__ULT_MES__BACKWARD
|
|
214
|
+
|
|
215
|
+
# Share of DIGITAL/RETAIL in total portfolio, averaged over last 12 months
|
|
216
|
+
SUM__MTO__CANAL_DIGITAL__SECTOR_RETAIL__over__SUM__MTO__PROM_U__BACKWARD__12
|
|
217
|
+
|
|
218
|
+
# Recency of the dominant channel (MODE is categorical → only REC/ULT_MES/PREV_MES valid)
|
|
219
|
+
CANAL__MTO__SUM__MODE__REC__BACKWARD
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
### Quick-reference: full name structure
|
|
225
|
+
|
|
226
|
+
```
|
|
227
|
+
┌─ Layer 2A pivot ──────────────────────────────────────────────────┐
|
|
228
|
+
│ AGG __ MEASUREMENT [__ FIELD_VALUE …] │
|
|
229
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
230
|
+
|
|
231
|
+
┌─ Layer 2B distributional ─────────────────────────────────────────┐
|
|
232
|
+
│ CATEGORICAL __ MEASUREMENT __ AGG __ METRIC │
|
|
233
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
234
|
+
|
|
235
|
+
┌─ Layer 2C ratio ──────────────────────────────────────────────────┐
|
|
236
|
+
│ {Layer 2A name} __over__ {Layer 2A name} │
|
|
237
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
238
|
+
|
|
239
|
+
┌─ Layer 3 temporal (windowed) ─────────────────────────────────────┐
|
|
240
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION __ WINDOW │
|
|
241
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
242
|
+
|
|
243
|
+
┌─ Layer 3 temporal (point-in-time) ────────────────────────────────┐
|
|
244
|
+
│ {Layer 2A/2B/2C name} __ OPERATOR __ DIRECTION │
|
|
245
|
+
└───────────────────────────────────────────────────────────────────┘
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Architecture
|
|
249
|
+
|
|
250
|
+
See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
|
|
251
|
+
|
|
252
|
+
## License
|
|
253
|
+
|
|
254
|
+
MIT
|