featkit 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featkit-0.3.0/CHANGELOG.md +25 -0
- {featkit-0.1.0 → featkit-0.3.0}/PKG-INFO +4 -1
- featkit-0.3.0/docs/example_databricks_notebook.md +209 -0
- {featkit-0.1.0 → featkit-0.3.0}/pyproject.toml +3 -1
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/distributional_space.py +34 -0
- featkit-0.3.0/src/featkit/builders/pivot_space.py +219 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/temporal_space.py +30 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/config.py +14 -0
- featkit-0.3.0/src/featkit/execution/__init__.py +1 -0
- featkit-0.3.0/src/featkit/execution/adapters/__init__.py +18 -0
- featkit-0.3.0/src/featkit/execution/adapters/base.py +63 -0
- featkit-0.3.0/src/featkit/execution/adapters/databricks_adapter.py +61 -0
- featkit-0.3.0/src/featkit/execution/adapters/databricks_notebook_adapter.py +117 -0
- featkit-0.3.0/src/featkit/execution/adapters/mock_adapter.py +35 -0
- featkit-0.3.0/src/featkit/execution/adapters/spark_adapter.py +28 -0
- featkit-0.3.0/src/featkit/execution/adapters/sqlalchemy_adapter.py +44 -0
- featkit-0.3.0/src/featkit/execution/domain_resolver.py +177 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/base.py +55 -5
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/pivoted.py +2 -1
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/pipeline.py +13 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_builders.py +297 -0
- featkit-0.3.0/tests/test_execution/test_adapters.py +77 -0
- featkit-0.3.0/tests/test_execution/test_domain_resolver.py +356 -0
- featkit-0.3.0/tests/test_generators/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_layer2.py +19 -1
- featkit-0.1.0/CHANGELOG.md +0 -8
- featkit-0.1.0/src/featkit/builders/pivot_space.py +0 -102
- {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/ci.yml +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/docs.yml +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/publish.yml +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/.gitignore +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/LICENSE +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/README.md +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/docs/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/docs/examples.md +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/docs/general_plan.md +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/docs/index.md +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/docs/quickstart.md +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/mkdocs.yml +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/defaults.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/defaults.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/enums.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/categorical_field.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/id_field.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/measurement_field.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/time_field.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/output.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/snowflake.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/spark_sql.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/distributional.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/temporal_feature.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_contracts.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_enums.py +0 -0
- {featkit-0.1.0/tests/test_generators → featkit-0.3.0/tests/test_execution}/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_fields.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_base.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_pyspark.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_sql_databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_sql_snowflake.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_integration.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_layer3.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_output_contracts.py +0 -0
- {featkit-0.1.0 → featkit-0.3.0}/tests/test_pipeline.py +0 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.3.0] - 2026-06-08
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `AdapterCombinationResolver` — replaces per-field `SELECT DISTINCT` queries with a single multi-column query returning only observed combinations (`feat(builders)`)
|
|
14
|
+
- `verbose` logging option on `PivotSpaceBuilder`, `DistributionalSpaceBuilder`, and `TemporalSpaceBuilder`, configurable via `FeatureStoreConfig` (`feat(config)`)
|
|
15
|
+
|
|
16
|
+
### Fixed
|
|
17
|
+
- Marginal fields no longer contribute their name to pivot column names; e.g. `SUM__amount__channel__region_north` → `SUM__amount__region_north` (`fix(layer2)`)
|
|
18
|
+
|
|
19
|
+
## [0.2.0] - 2026-06-02
|
|
20
|
+
|
|
21
|
+
### Added
|
|
22
|
+
- Execution layer with adapter-based domain resolution (`feat(execution)`)
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
- `AdapterDomainResolver` is now imported lazily; `pandas` added to the dev dependencies
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: featkit
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
5
|
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
6
|
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
@@ -49,6 +49,7 @@ Provides-Extra: dev
|
|
|
49
49
|
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
50
|
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
51
|
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pandas>=1.5; extra == 'dev'
|
|
52
53
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
53
54
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
54
55
|
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
@@ -57,6 +58,8 @@ Provides-Extra: docs
|
|
|
57
58
|
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
58
59
|
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
59
60
|
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
61
|
+
Provides-Extra: execution
|
|
62
|
+
Requires-Dist: pandas>=1.5; extra == 'execution'
|
|
60
63
|
Provides-Extra: ibis
|
|
61
64
|
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
62
65
|
Provides-Extra: spark
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# Example — Observed-combinations pivot in a Databricks notebook
|
|
2
|
+
|
|
3
|
+
This example shows how featkit resolves pivot combinations at runtime by
|
|
4
|
+
querying the facts table directly from a Databricks notebook.
|
|
5
|
+
|
|
6
|
+
When an adapter is configured, `FeatureStorePipeline` constructs an
|
|
7
|
+
`AdapterCombinationResolver` and passes it to `PivotSpaceBuilder`. Instead of
|
|
8
|
+
generating the full Cartesian product of per-field domains, the builder issues a
|
|
9
|
+
**single `SELECT DISTINCT`** query for all pivot categoricals and builds only the
|
|
10
|
+
combinations that actually exist in the data. Marginals are then derived from
|
|
11
|
+
those observed combinations via subset-projection.
|
|
12
|
+
|
|
13
|
+
`DatabricksNotebookAdapter` discovers the pre-injected `spark` session
|
|
14
|
+
automatically — no constructor arguments are needed.
|
|
15
|
+
|
|
16
|
+
## Notebook cells
|
|
17
|
+
|
|
18
|
+
### Cell 1 — imports
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from featkit.config import FeatureStoreConfig
|
|
22
|
+
from featkit.dataset.base import SimpleDataset
|
|
23
|
+
from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
|
|
24
|
+
from featkit.execution.adapters import DatabricksNotebookAdapter
|
|
25
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
26
|
+
from featkit.fields.id_field import IDField
|
|
27
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
28
|
+
from featkit.fields.time_field import TimeField
|
|
29
|
+
from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
|
|
30
|
+
from featkit.pipeline import FeatureStorePipeline
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Cell 2 — define the dataset
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
ds = SimpleDataset(
|
|
37
|
+
"mydb.myschema.silver_transactions",
|
|
38
|
+
[
|
|
39
|
+
IDField("client_id"),
|
|
40
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
41
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
42
|
+
MeasurementField("txn_count", MeasurementType.CANTIDAD),
|
|
43
|
+
# allowed_values is applied as a WHERE ... IN (...) filter; omit it to skip the IN filter for this column
|
|
44
|
+
CategoricalField(
|
|
45
|
+
"segment",
|
|
46
|
+
CategoricalTreatment.PIVOT,
|
|
47
|
+
allowed_values=["retail", "sme", "corporate"],
|
|
48
|
+
),
|
|
49
|
+
CategoricalField(
|
|
50
|
+
"product_type",
|
|
51
|
+
CategoricalTreatment.PIVOT,
|
|
52
|
+
allowed_values=["loan", "deposit", "card"],
|
|
53
|
+
),
|
|
54
|
+
],
|
|
55
|
+
)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Cell 3 — configure with the notebook adapter
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
adapter = DatabricksNotebookAdapter()
|
|
62
|
+
|
|
63
|
+
cfg = FeatureStoreConfig(
|
|
64
|
+
dataset=ds,
|
|
65
|
+
output_schema="analytics",
|
|
66
|
+
output_table_prefix="feat_",
|
|
67
|
+
time_windows=[3, 6, 12],
|
|
68
|
+
include_marginals=True,
|
|
69
|
+
adapter=adapter, # triggers SELECT DISTINCT combination query at build()
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Cell 4 — build and generate
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
# build() issues ONE SELECT DISTINCT for all pivot categoricals:
|
|
77
|
+
#
|
|
78
|
+
# SELECT DISTINCT product_type, segment
|
|
79
|
+
# FROM mydb.myschema.silver_transactions
|
|
80
|
+
# WHERE product_type IS NOT NULL
|
|
81
|
+
# AND segment IS NOT NULL
|
|
82
|
+
# AND product_type IN ('loan', 'deposit', 'card')
|
|
83
|
+
# AND segment IN ('retail', 'sme', 'corporate')
|
|
84
|
+
# ORDER BY 1, 2
|
|
85
|
+
#
|
|
86
|
+
# Only the returned combinations (plus their marginal projections) become
|
|
87
|
+
# pivot columns — unobserved cross-combinations are never generated.
|
|
88
|
+
pipeline = FeatureStorePipeline(config=cfg).build()
|
|
89
|
+
|
|
90
|
+
print(f"Layer 2A columns : {len(pipeline.layer2a)}")
|
|
91
|
+
print(f"Layer 3 features: {len(pipeline.layer3)}")
|
|
92
|
+
|
|
93
|
+
result = DatabricksSQLCodeGenerator().generate(pipeline)
|
|
94
|
+
print(result.code.sql[:500])
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Cell 5 — save the artefacts to DBFS
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
result.save("/dbfs/mnt/output/features/")
|
|
101
|
+
# Writes:
|
|
102
|
+
# /dbfs/mnt/output/features/script.sql
|
|
103
|
+
# /dbfs/mnt/output/features/dag.json
|
|
104
|
+
# /dbfs/mnt/output/features/diagram.md
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## How it works
|
|
108
|
+
|
|
109
|
+
`FeatureStorePipeline.build()` constructs an `AdapterCombinationResolver` and
|
|
110
|
+
passes it to `PivotSpaceBuilder` as the `combination_resolver` callable. The
|
|
111
|
+
resolver executes a single multi-column `SELECT DISTINCT`:
|
|
112
|
+
|
|
113
|
+
```sql
|
|
114
|
+
SELECT DISTINCT product_type, segment
|
|
115
|
+
FROM mydb.myschema.silver_transactions
|
|
116
|
+
WHERE product_type IS NOT NULL
|
|
117
|
+
AND segment IS NOT NULL
|
|
118
|
+
AND product_type IN ('loan', 'deposit', 'card')
|
|
119
|
+
AND segment IN ('retail', 'sme', 'corporate')
|
|
120
|
+
ORDER BY 1, 2
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Suppose the query returns three rows:
|
|
124
|
+
|
|
125
|
+
| product_type | segment |
|
|
126
|
+
|-------------|-----------|
|
|
127
|
+
| loan | retail |
|
|
128
|
+
| loan | sme |
|
|
129
|
+
| deposit | corporate |
|
|
130
|
+
|
|
131
|
+
With `include_marginals=True`, the builder derives every subset-projection of
|
|
132
|
+
those rows:
|
|
133
|
+
|
|
134
|
+
| product_type | segment | interpretation |
|
|
135
|
+
|-------------|-----------|------------------------------------------|
|
|
136
|
+
| loan | retail | observed combination |
|
|
137
|
+
| loan | sme | observed combination |
|
|
138
|
+
| deposit | corporate | observed combination |
|
|
139
|
+
| loan | `∅` | all segments for loan |
|
|
140
|
+
| deposit | `∅` | all segments for deposit |
|
|
141
|
+
| `∅` | retail | all products for retail |
|
|
142
|
+
| `∅` | sme | all products for sme |
|
|
143
|
+
| `∅` | corporate | all products for corporate |
|
|
144
|
+
| `∅` | `∅` | unconditional aggregate (always present)|
|
|
145
|
+
|
|
146
|
+
Unobserved combinations (e.g. `deposit × retail`) are **never generated**,
|
|
147
|
+
keeping the feature space lean.
|
|
148
|
+
|
|
149
|
+
## Fields without `allowed_values`
|
|
150
|
+
|
|
151
|
+
If a field has no `allowed_values`, it is still included in the `SELECT DISTINCT`
|
|
152
|
+
but no `IN` filter is applied to its column — all distinct values present
|
|
153
|
+
in the table are returned for that dimension:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
ds = SimpleDataset(
|
|
157
|
+
"mydb.myschema.silver_transactions",
|
|
158
|
+
[
|
|
159
|
+
IDField("client_id"),
|
|
160
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
161
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
162
|
+
# Static domain — used as IN-filter in the combined query
|
|
163
|
+
CategoricalField(
|
|
164
|
+
"channel",
|
|
165
|
+
CategoricalTreatment.PIVOT,
|
|
166
|
+
allowed_values=["branch", "online", "mobile"],
|
|
167
|
+
),
|
|
168
|
+
# No allowed_values — column included without an IN-filter
|
|
169
|
+
CategoricalField("segment", CategoricalTreatment.PIVOT),
|
|
170
|
+
],
|
|
171
|
+
)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Using a different adapter
|
|
175
|
+
|
|
176
|
+
Swap `DatabricksNotebookAdapter` for any other adapter without changing the
|
|
177
|
+
rest of the code:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from featkit.execution.adapters import DatabricksAdapter
|
|
181
|
+
|
|
182
|
+
adapter = DatabricksAdapter(
|
|
183
|
+
host="<workspace>.azuredatabricks.net",
|
|
184
|
+
token="<pat>",
|
|
185
|
+
http_path="/sql/1.0/warehouses/<warehouse-id>",
|
|
186
|
+
catalog="mydb",
|
|
187
|
+
schema="myschema",
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
cfg = FeatureStoreConfig(..., adapter=adapter)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Using `AdapterCombinationResolver` directly
|
|
194
|
+
|
|
195
|
+
The resolver can also be wired manually to `PivotSpaceBuilder` without going
|
|
196
|
+
through the pipeline:
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from featkit.execution.domain_resolver import AdapterCombinationResolver
|
|
200
|
+
from featkit.builders.pivot_space import PivotSpaceBuilder
|
|
201
|
+
|
|
202
|
+
resolver = AdapterCombinationResolver(adapter, "mydb.myschema.silver_transactions")
|
|
203
|
+
|
|
204
|
+
columns = PivotSpaceBuilder(
|
|
205
|
+
dataset=ds,
|
|
206
|
+
include_marginals=True,
|
|
207
|
+
combination_resolver=resolver,
|
|
208
|
+
).build()
|
|
209
|
+
```
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "featkit"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "featkit — automated feature store generation from relational facts tables"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -35,6 +35,7 @@ dependencies = [
|
|
|
35
35
|
ibis = ["ibis-framework>=9.0"]
|
|
36
36
|
spark = ["pyspark>=3.4"]
|
|
37
37
|
databricks = ["databricks-sql-connector>=3.0"]
|
|
38
|
+
execution = ["pandas>=1.5"]
|
|
38
39
|
docs = [
|
|
39
40
|
"mkdocs>=1.6",
|
|
40
41
|
"mkdocs-material>=9.5",
|
|
@@ -48,6 +49,7 @@ dev = [
|
|
|
48
49
|
"hatch>=1.9",
|
|
49
50
|
"build>=1.0",
|
|
50
51
|
"twine>=5.0",
|
|
52
|
+
"pandas>=1.5",
|
|
51
53
|
]
|
|
52
54
|
|
|
53
55
|
[project.urls]
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import logging
|
|
5
6
|
from typing import cast
|
|
6
7
|
|
|
7
8
|
from featkit.contracts.measurement.defaults import get_default_contract
|
|
@@ -11,6 +12,8 @@ from featkit.fields.categorical_field import CategoricalField
|
|
|
11
12
|
from featkit.fields.measurement_field import MeasurementField
|
|
12
13
|
from featkit.layer2.distributional import DistributionalColumn
|
|
13
14
|
|
|
15
|
+
_log = logging.getLogger(__name__)
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
class DistributionalSpaceBuilder:
|
|
16
19
|
"""Generates the full set of DistributionalColumn objects for a dataset.
|
|
@@ -26,18 +29,27 @@ class DistributionalSpaceBuilder:
|
|
|
26
29
|
An empty list produces no columns. Every entry must be present in the
|
|
27
30
|
dataset (compared by name, type, and contract); a ``ValueError`` is
|
|
28
31
|
raised for unknown fields.
|
|
32
|
+
verbose: When ``True``, emits ``DEBUG``-level log messages at key
|
|
33
|
+
milestones: builder start/end, and for each generated column the
|
|
34
|
+
``(categorical, measurement, aggregator, metric)`` combination dict
|
|
35
|
+
and the resulting column name.
|
|
29
36
|
"""
|
|
30
37
|
|
|
31
38
|
def __init__(
|
|
32
39
|
self,
|
|
33
40
|
dataset: AbstractDataset,
|
|
34
41
|
value_measurements: list[MeasurementField] | None = None,
|
|
42
|
+
verbose: bool = False,
|
|
35
43
|
) -> None:
|
|
36
44
|
self.dataset = dataset
|
|
37
45
|
self.value_measurements = value_measurements
|
|
46
|
+
self.verbose = verbose
|
|
38
47
|
|
|
39
48
|
def build(self) -> list[DistributionalColumn]:
|
|
40
49
|
"""Build and return all DistributionalColumn objects."""
|
|
50
|
+
if self.verbose:
|
|
51
|
+
_log.debug("DistributionalSpaceBuilder.build() started")
|
|
52
|
+
|
|
41
53
|
all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
|
|
42
54
|
dist_cats = [
|
|
43
55
|
c
|
|
@@ -70,8 +82,30 @@ class DistributionalSpaceBuilder:
|
|
|
70
82
|
for agg in aggs:
|
|
71
83
|
for metric in cat.distributional_metrics:
|
|
72
84
|
col = DistributionalColumn(mf, agg, cat, metric)
|
|
85
|
+
if self.verbose:
|
|
86
|
+
_log.debug(
|
|
87
|
+
"combo: cat=%r, measurement=%r, aggregator=%s, metric=%s",
|
|
88
|
+
cat.name,
|
|
89
|
+
mf.name,
|
|
90
|
+
agg.value,
|
|
91
|
+
metric.value,
|
|
92
|
+
)
|
|
93
|
+
_log.debug(
|
|
94
|
+
"combination: %s",
|
|
95
|
+
{
|
|
96
|
+
"categorical": cat.name,
|
|
97
|
+
"measurement": mf.name,
|
|
98
|
+
"aggregator": agg.value,
|
|
99
|
+
"metric": metric.value,
|
|
100
|
+
},
|
|
101
|
+
)
|
|
102
|
+
_log.debug("column_name: %r", col.column_name)
|
|
73
103
|
if col.column_name not in seen:
|
|
74
104
|
seen.add(col.column_name)
|
|
75
105
|
results.append(col)
|
|
76
106
|
|
|
107
|
+
if self.verbose:
|
|
108
|
+
_log.debug(
|
|
109
|
+
"DistributionalSpaceBuilder.build() done — %d column(s) generated", len(results)
|
|
110
|
+
)
|
|
77
111
|
return results
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""PivotSpaceBuilder — generates all PivotedColumn objects from a dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from itertools import combinations as _icombinations
|
|
8
|
+
from itertools import product
|
|
9
|
+
from typing import cast
|
|
10
|
+
|
|
11
|
+
from featkit.contracts.measurement.defaults import get_default_contract
|
|
12
|
+
from featkit.dataset.base import AbstractDataset
|
|
13
|
+
from featkit.enums import CategoricalTreatment, Layer2Aggregator, MeasurementType
|
|
14
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
15
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
16
|
+
from featkit.layer2.pivoted import PivotedColumn
|
|
17
|
+
|
|
18
|
+
_log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _with_marginals(
|
|
22
|
+
observed: list[dict[CategoricalField, str]],
|
|
23
|
+
cats: list[CategoricalField],
|
|
24
|
+
) -> list[dict[CategoricalField, str | None]]:
|
|
25
|
+
"""Expand *observed* combinations with all ∅-substituted variants.
|
|
26
|
+
|
|
27
|
+
For each observed combination and each subset of fields, a new
|
|
28
|
+
combination is produced where those fields are replaced with ``None``
|
|
29
|
+
(the ∅ marginal sentinel). The all-None combination is always included
|
|
30
|
+
even when *observed* is empty, since it represents an unconditional
|
|
31
|
+
aggregate over all data.
|
|
32
|
+
|
|
33
|
+
Duplicates are suppressed so overlapping projections of different
|
|
34
|
+
observed combinations appear only once.
|
|
35
|
+
"""
|
|
36
|
+
seen: set[tuple[tuple[str, str | None], ...]] = set()
|
|
37
|
+
result: list[dict[CategoricalField, str | None]] = []
|
|
38
|
+
|
|
39
|
+
def _append(combo: dict[CategoricalField, str | None]) -> None:
|
|
40
|
+
key = tuple(sorted((f.name, combo[f]) for f in cats))
|
|
41
|
+
if key not in seen:
|
|
42
|
+
seen.add(key)
|
|
43
|
+
result.append(combo)
|
|
44
|
+
|
|
45
|
+
_append({f: None for f in cats})
|
|
46
|
+
|
|
47
|
+
for combo in observed:
|
|
48
|
+
for r in range(len(cats)): # r == len(cats) (all-None) already added above
|
|
49
|
+
for nulled in _icombinations(cats, r):
|
|
50
|
+
c: dict[CategoricalField, str | None] = dict(combo)
|
|
51
|
+
for f in nulled:
|
|
52
|
+
c[f] = None
|
|
53
|
+
_append(c)
|
|
54
|
+
|
|
55
|
+
return result
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class PivotSpaceBuilder:
|
|
59
|
+
"""Generates the full set of PivotedColumn objects for a dataset.
|
|
60
|
+
|
|
61
|
+
Two combination strategies are supported:
|
|
62
|
+
|
|
63
|
+
* **Observed combinations** (preferred when an adapter is available):
|
|
64
|
+
supply a ``combination_resolver`` callable. It receives the list of
|
|
65
|
+
pivot categorical fields and returns only the combinations that
|
|
66
|
+
actually exist in the source table. Marginals are then derived from
|
|
67
|
+
those observed combinations rather than from the full Cartesian
|
|
68
|
+
product.
|
|
69
|
+
|
|
70
|
+
* **Cartesian product** (default, no adapter required): per-field
|
|
71
|
+
domains are resolved from ``allowed_values`` or ``domain_resolver``
|
|
72
|
+
and the full product is generated.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
dataset: The source facts-table schema.
|
|
76
|
+
include_marginals: When True, ∅-substituted combinations are added
|
|
77
|
+
on top of the base combinations (observed or Cartesian).
|
|
78
|
+
aggregators_override: Per-measurement-type override list. Only
|
|
79
|
+
aggregators that are also contract-valid for the measurement
|
|
80
|
+
type are used.
|
|
81
|
+
combination_resolver: Callable that takes the list of pivot
|
|
82
|
+
``CategoricalField`` objects and returns the observed
|
|
83
|
+
combinations as a list of ``{field: value}`` dicts. When
|
|
84
|
+
provided, ``domain_resolver`` is not used.
|
|
85
|
+
domain_resolver: Callable invoked per-field to resolve the domain
|
|
86
|
+
of a categorical whose ``allowed_values`` is None. Used only
|
|
87
|
+
in the Cartesian product path (i.e. when
|
|
88
|
+
``combination_resolver`` is not provided). Raises
|
|
89
|
+
``ValueError`` at build time if a dynamic field is encountered
|
|
90
|
+
and this is not provided.
|
|
91
|
+
verbose: When ``True``, emits ``DEBUG``-level log messages at key
|
|
92
|
+
milestones: builder start/end, each ``domain_resolver``
|
|
93
|
+
invocation with its resolved values, each ``cat_combination``
|
|
94
|
+
dict, and every generated column name.
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
def __init__(
|
|
98
|
+
self,
|
|
99
|
+
dataset: AbstractDataset,
|
|
100
|
+
include_marginals: bool = True,
|
|
101
|
+
aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None,
|
|
102
|
+
combination_resolver: (
|
|
103
|
+
Callable[[list[CategoricalField]], list[dict[CategoricalField, str]]] | None
|
|
104
|
+
) = None,
|
|
105
|
+
domain_resolver: Callable[[CategoricalField], list[str]] | None = None,
|
|
106
|
+
verbose: bool = False,
|
|
107
|
+
) -> None:
|
|
108
|
+
self.dataset = dataset
|
|
109
|
+
self.include_marginals = include_marginals
|
|
110
|
+
self.aggregators_override = aggregators_override
|
|
111
|
+
self.combination_resolver = combination_resolver
|
|
112
|
+
self.domain_resolver = domain_resolver
|
|
113
|
+
self.verbose = verbose
|
|
114
|
+
|
|
115
|
+
def build(self) -> list[PivotedColumn]:
|
|
116
|
+
"""Build and return all PivotedColumn objects."""
|
|
117
|
+
if self.verbose:
|
|
118
|
+
_log.debug("PivotSpaceBuilder.build() started")
|
|
119
|
+
|
|
120
|
+
all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
|
|
121
|
+
pivot_cats = [
|
|
122
|
+
c
|
|
123
|
+
for c in all_cats
|
|
124
|
+
if c.treatment in {CategoricalTreatment.PIVOT, CategoricalTreatment.BOTH}
|
|
125
|
+
]
|
|
126
|
+
measurements = [cast(MeasurementField, f) for f in self.dataset.measurement_fields]
|
|
127
|
+
|
|
128
|
+
all_combos: list[dict[CategoricalField, str | None]]
|
|
129
|
+
|
|
130
|
+
if self.combination_resolver is not None and pivot_cats:
|
|
131
|
+
observed_raw = self.combination_resolver(pivot_cats)
|
|
132
|
+
pivot_key_set = set(pivot_cats)
|
|
133
|
+
pivot_map = {c: c for c in pivot_cats}
|
|
134
|
+
observed: list[dict[CategoricalField, str]] = []
|
|
135
|
+
for combo in observed_raw:
|
|
136
|
+
if set(combo.keys()) != pivot_key_set:
|
|
137
|
+
raise ValueError(
|
|
138
|
+
"combination_resolver must return dicts keyed by all "
|
|
139
|
+
"pivot categorical fields"
|
|
140
|
+
)
|
|
141
|
+
if any(v is None for v in combo.values()):
|
|
142
|
+
raise ValueError(
|
|
143
|
+
"combination_resolver returned None; "
|
|
144
|
+
"None is reserved as the ∅ marginal sentinel"
|
|
145
|
+
)
|
|
146
|
+
observed.append({pivot_map[f]: str(v) for f, v in combo.items()})
|
|
147
|
+
if self.include_marginals:
|
|
148
|
+
all_combos = _with_marginals(observed, pivot_cats)
|
|
149
|
+
else:
|
|
150
|
+
all_combos = [dict(c) for c in observed]
|
|
151
|
+
else:
|
|
152
|
+
cat_domains: dict[CategoricalField, list[str | None]] = {}
|
|
153
|
+
for cat in pivot_cats:
|
|
154
|
+
if cat.allowed_values is not None:
|
|
155
|
+
raw: list[str] = list(cat.allowed_values)
|
|
156
|
+
elif self.domain_resolver is not None:
|
|
157
|
+
if self.verbose:
|
|
158
|
+
_log.debug("domain_resolver: resolving domain for categorical %r", cat.name)
|
|
159
|
+
raw = list(self.domain_resolver(cat))
|
|
160
|
+
if self.verbose:
|
|
161
|
+
_log.debug(
|
|
162
|
+
"domain_resolver: resolved %d value(s) for %r: %s",
|
|
163
|
+
len(raw),
|
|
164
|
+
cat.name,
|
|
165
|
+
raw,
|
|
166
|
+
)
|
|
167
|
+
else:
|
|
168
|
+
raise ValueError(
|
|
169
|
+
f"CategoricalField {cat.name!r} has no allowed_values and no "
|
|
170
|
+
f"domain_resolver was provided"
|
|
171
|
+
)
|
|
172
|
+
if any(v is None for v in raw):
|
|
173
|
+
raise ValueError(
|
|
174
|
+
f"CategoricalField {cat.name!r}: resolved domain contains None; "
|
|
175
|
+
f"None is reserved as the ∅ marginal sentinel"
|
|
176
|
+
)
|
|
177
|
+
domain: list[str | None] = list(raw)
|
|
178
|
+
if self.include_marginals:
|
|
179
|
+
domain = domain + [None]
|
|
180
|
+
cat_domains[cat] = domain
|
|
181
|
+
|
|
182
|
+
cats = list(cat_domains.keys())
|
|
183
|
+
combos = product(*(cat_domains[c] for c in cats)) if cats else ((),)
|
|
184
|
+
all_combos = [
|
|
185
|
+
{cats[i]: combo[i] for i in range(len(cats))} if cats else {} for combo in combos
|
|
186
|
+
]
|
|
187
|
+
|
|
188
|
+
results: list[PivotedColumn] = []
|
|
189
|
+
seen: dict[str, PivotedColumn] = {}
|
|
190
|
+
|
|
191
|
+
for cat_combination in all_combos:
|
|
192
|
+
if self.verbose:
|
|
193
|
+
_log.debug(
|
|
194
|
+
"cat_combination: %s",
|
|
195
|
+
{c.name: v for c, v in cat_combination.items()},
|
|
196
|
+
)
|
|
197
|
+
for mf in measurements:
|
|
198
|
+
for agg in self._valid_aggregators(mf):
|
|
199
|
+
col = PivotedColumn(mf, agg, cat_combination)
|
|
200
|
+
if col.column_name in seen:
|
|
201
|
+
raise ValueError(
|
|
202
|
+
f"Duplicate pivot column name generated: {col.column_name!r}. "
|
|
203
|
+
f"Conflicting columns: {seen[col.column_name]!r} and {col!r}"
|
|
204
|
+
)
|
|
205
|
+
if self.verbose:
|
|
206
|
+
_log.debug("column_name: %r", col.column_name)
|
|
207
|
+
seen[col.column_name] = col
|
|
208
|
+
results.append(col)
|
|
209
|
+
|
|
210
|
+
if self.verbose:
|
|
211
|
+
_log.debug("PivotSpaceBuilder.build() done — %d column(s) generated", len(results))
|
|
212
|
+
return results
|
|
213
|
+
|
|
214
|
+
def _valid_aggregators(self, mf: MeasurementField) -> list[Layer2Aggregator]:
|
|
215
|
+
contract = mf.contract or get_default_contract(mf.measurement_type)
|
|
216
|
+
valid = contract.valid_layer2_aggregators
|
|
217
|
+
if self.aggregators_override and mf.measurement_type in self.aggregators_override:
|
|
218
|
+
return [a for a in self.aggregators_override[mf.measurement_type] if a in valid]
|
|
219
|
+
return sorted(valid, key=lambda a: a.value)
|
|
@@ -2,12 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import logging
|
|
5
6
|
from collections.abc import Sequence
|
|
6
7
|
|
|
7
8
|
from featkit.enums import Layer2OutputType, TemporalOperator, TimeWindowDirection
|
|
8
9
|
from featkit.layer2.base import AbstractLayer2Column
|
|
9
10
|
from featkit.layer3.temporal_feature import _POINT_IN_TIME_OPERATORS, TemporalFeature
|
|
10
11
|
|
|
12
|
+
_log = logging.getLogger(__name__)
|
|
13
|
+
|
|
11
14
|
#: Operators that require composed (MEDIA_ABS / RATIO) window sizes.
|
|
12
15
|
_COMPOSED_OPERATORS: frozenset[TemporalOperator] = frozenset(
|
|
13
16
|
{TemporalOperator.MEDIA_ABS, TemporalOperator.RATIO}
|
|
@@ -33,6 +36,10 @@ class TemporalSpaceBuilder:
|
|
|
33
36
|
direction: Sliding-window direction applied to every feature.
|
|
34
37
|
operators_override: Per-output-type override. Only operators that are
|
|
35
38
|
also contract-valid for the column's output type are used.
|
|
39
|
+
verbose: When ``True``, emits ``DEBUG``-level log messages at key
|
|
40
|
+
milestones: builder start/end, and for each generated feature the
|
|
41
|
+
``(layer2_column, operator, window)`` combination dict and the
|
|
42
|
+
resulting column name.
|
|
36
43
|
"""
|
|
37
44
|
|
|
38
45
|
def __init__(
|
|
@@ -42,15 +49,20 @@ class TemporalSpaceBuilder:
|
|
|
42
49
|
composed_windows: list[int] | None = None,
|
|
43
50
|
direction: TimeWindowDirection = TimeWindowDirection.BACKWARD,
|
|
44
51
|
operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = None,
|
|
52
|
+
verbose: bool = False,
|
|
45
53
|
) -> None:
|
|
46
54
|
self.layer2_columns = layer2_columns
|
|
47
55
|
self.time_windows = time_windows
|
|
48
56
|
self.composed_windows = composed_windows
|
|
49
57
|
self.direction = direction
|
|
50
58
|
self.operators_override = operators_override
|
|
59
|
+
self.verbose = verbose
|
|
51
60
|
|
|
52
61
|
def build(self) -> list[TemporalFeature]:
|
|
53
62
|
"""Build and return all TemporalFeature objects."""
|
|
63
|
+
if self.verbose:
|
|
64
|
+
_log.debug("TemporalSpaceBuilder.build() started")
|
|
65
|
+
|
|
54
66
|
results: list[TemporalFeature] = []
|
|
55
67
|
seen: set[str] = set()
|
|
56
68
|
|
|
@@ -79,8 +91,26 @@ class TemporalSpaceBuilder:
|
|
|
79
91
|
|
|
80
92
|
for ws in window_sizes:
|
|
81
93
|
feat = TemporalFeature(col, op, self.direction, window_size=ws)
|
|
94
|
+
if self.verbose:
|
|
95
|
+
_log.debug(
|
|
96
|
+
"combo: layer2_column=%r, operator=%s, window=%s",
|
|
97
|
+
col.column_name,
|
|
98
|
+
op.value,
|
|
99
|
+
ws,
|
|
100
|
+
)
|
|
101
|
+
_log.debug(
|
|
102
|
+
"combination: %s",
|
|
103
|
+
{
|
|
104
|
+
"layer2_column": col.column_name,
|
|
105
|
+
"operator": op.value,
|
|
106
|
+
"window": ws,
|
|
107
|
+
},
|
|
108
|
+
)
|
|
109
|
+
_log.debug("column_name: %r", feat.column_name)
|
|
82
110
|
if feat.column_name not in seen:
|
|
83
111
|
seen.add(feat.column_name)
|
|
84
112
|
results.append(feat)
|
|
85
113
|
|
|
114
|
+
if self.verbose:
|
|
115
|
+
_log.debug("TemporalSpaceBuilder.build() done — %d feature(s) generated", len(results))
|
|
86
116
|
return results
|
|
@@ -3,10 +3,14 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
8
|
from featkit.dataset.base import AbstractDataset
|
|
8
9
|
from featkit.enums import Layer2Aggregator, Layer2OutputType, MeasurementType, TemporalOperator
|
|
9
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from featkit.execution.adapters.base import DataSourceAdapter
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
@dataclass
|
|
12
16
|
class FeatureStoreConfig:
|
|
@@ -26,6 +30,14 @@ class FeatureStoreConfig:
|
|
|
26
30
|
aggregators. Only contract-valid aggregators are used.
|
|
27
31
|
operators_override: Per-output-type override for temporal operators.
|
|
28
32
|
Only contract-valid operators are used.
|
|
33
|
+
adapter: Optional execution adapter. When provided, categorical fields
|
|
34
|
+
with no ``allowed_values`` have their domain resolved at
|
|
35
|
+
``FeatureStorePipeline.build()`` time via a ``SELECT DISTINCT``
|
|
36
|
+
query against the facts table.
|
|
37
|
+
verbose: When ``True``, the space builders emit ``DEBUG``-level log
|
|
38
|
+
messages at key milestones: builder start/end, ``domain_resolver``
|
|
39
|
+
invocations (PivotSpaceBuilder only), and each generated column name
|
|
40
|
+
together with the combination that produced it.
|
|
29
41
|
"""
|
|
30
42
|
|
|
31
43
|
dataset: AbstractDataset
|
|
@@ -36,3 +48,5 @@ class FeatureStoreConfig:
|
|
|
36
48
|
include_marginals: bool = True
|
|
37
49
|
aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None
|
|
38
50
|
operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = field(default=None)
|
|
51
|
+
adapter: DataSourceAdapter | None = None
|
|
52
|
+
verbose: bool = False
|