featkit 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. featkit-0.3.0/CHANGELOG.md +25 -0
  2. {featkit-0.1.0 → featkit-0.3.0}/PKG-INFO +4 -1
  3. featkit-0.3.0/docs/example_databricks_notebook.md +209 -0
  4. {featkit-0.1.0 → featkit-0.3.0}/pyproject.toml +3 -1
  5. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/distributional_space.py +34 -0
  6. featkit-0.3.0/src/featkit/builders/pivot_space.py +219 -0
  7. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/temporal_space.py +30 -0
  8. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/config.py +14 -0
  9. featkit-0.3.0/src/featkit/execution/__init__.py +1 -0
  10. featkit-0.3.0/src/featkit/execution/adapters/__init__.py +18 -0
  11. featkit-0.3.0/src/featkit/execution/adapters/base.py +63 -0
  12. featkit-0.3.0/src/featkit/execution/adapters/databricks_adapter.py +61 -0
  13. featkit-0.3.0/src/featkit/execution/adapters/databricks_notebook_adapter.py +117 -0
  14. featkit-0.3.0/src/featkit/execution/adapters/mock_adapter.py +35 -0
  15. featkit-0.3.0/src/featkit/execution/adapters/spark_adapter.py +28 -0
  16. featkit-0.3.0/src/featkit/execution/adapters/sqlalchemy_adapter.py +44 -0
  17. featkit-0.3.0/src/featkit/execution/domain_resolver.py +177 -0
  18. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/base.py +55 -5
  19. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/pivoted.py +2 -1
  20. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/pipeline.py +13 -0
  21. {featkit-0.1.0 → featkit-0.3.0}/tests/test_builders.py +297 -0
  22. featkit-0.3.0/tests/test_execution/test_adapters.py +77 -0
  23. featkit-0.3.0/tests/test_execution/test_domain_resolver.py +356 -0
  24. featkit-0.3.0/tests/test_generators/__init__.py +0 -0
  25. {featkit-0.1.0 → featkit-0.3.0}/tests/test_layer2.py +19 -1
  26. featkit-0.1.0/CHANGELOG.md +0 -8
  27. featkit-0.1.0/src/featkit/builders/pivot_space.py +0 -102
  28. {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/ci.yml +0 -0
  29. {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/docs.yml +0 -0
  30. {featkit-0.1.0 → featkit-0.3.0}/.github/workflows/publish.yml +0 -0
  31. {featkit-0.1.0 → featkit-0.3.0}/.gitignore +0 -0
  32. {featkit-0.1.0 → featkit-0.3.0}/LICENSE +0 -0
  33. {featkit-0.1.0 → featkit-0.3.0}/README.md +0 -0
  34. {featkit-0.1.0 → featkit-0.3.0}/docs/.gitkeep +0 -0
  35. {featkit-0.1.0 → featkit-0.3.0}/docs/examples.md +0 -0
  36. {featkit-0.1.0 → featkit-0.3.0}/docs/general_plan.md +0 -0
  37. {featkit-0.1.0 → featkit-0.3.0}/docs/index.md +0 -0
  38. {featkit-0.1.0 → featkit-0.3.0}/docs/quickstart.md +0 -0
  39. {featkit-0.1.0 → featkit-0.3.0}/mkdocs.yml +0 -0
  40. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/__init__.py +0 -0
  41. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/.gitkeep +0 -0
  42. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/builders/__init__.py +0 -0
  43. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/__init__.py +0 -0
  44. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/.gitkeep +0 -0
  45. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/__init__.py +0 -0
  46. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/base.py +0 -0
  47. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/measurement/defaults.py +0 -0
  48. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/.gitkeep +0 -0
  49. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/__init__.py +0 -0
  50. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/base.py +0 -0
  51. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/contracts/output/defaults.py +0 -0
  52. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/.gitkeep +0 -0
  53. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/__init__.py +0 -0
  54. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/dataset/base.py +0 -0
  55. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/enums.py +0 -0
  56. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/.gitkeep +0 -0
  57. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/__init__.py +0 -0
  58. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/base.py +0 -0
  59. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/categorical_field.py +0 -0
  60. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/id_field.py +0 -0
  61. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/measurement_field.py +0 -0
  62. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/fields/time_field.py +0 -0
  63. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/__init__.py +0 -0
  64. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/base.py +0 -0
  65. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/output.py +0 -0
  66. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/.gitkeep +0 -0
  67. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/__init__.py +0 -0
  68. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/pyspark/databricks.py +0 -0
  69. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/.gitkeep +0 -0
  70. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/__init__.py +0 -0
  71. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/databricks.py +0 -0
  72. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/snowflake.py +0 -0
  73. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/generators/sql/spark_sql.py +0 -0
  74. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/.gitkeep +0 -0
  75. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/__init__.py +0 -0
  76. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/base.py +0 -0
  77. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer2/distributional.py +0 -0
  78. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/.gitkeep +0 -0
  79. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/__init__.py +0 -0
  80. {featkit-0.1.0 → featkit-0.3.0}/src/featkit/layer3/temporal_feature.py +0 -0
  81. {featkit-0.1.0 → featkit-0.3.0}/tests/__init__.py +0 -0
  82. {featkit-0.1.0 → featkit-0.3.0}/tests/test_contracts.py +0 -0
  83. {featkit-0.1.0 → featkit-0.3.0}/tests/test_enums.py +0 -0
  84. {featkit-0.1.0/tests/test_generators → featkit-0.3.0/tests/test_execution}/__init__.py +0 -0
  85. {featkit-0.1.0 → featkit-0.3.0}/tests/test_fields.py +0 -0
  86. {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/.gitkeep +0 -0
  87. {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_base.py +0 -0
  88. {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_pyspark.py +0 -0
  89. {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_sql_databricks.py +0 -0
  90. {featkit-0.1.0 → featkit-0.3.0}/tests/test_generators/test_sql_snowflake.py +0 -0
  91. {featkit-0.1.0 → featkit-0.3.0}/tests/test_integration.py +0 -0
  92. {featkit-0.1.0 → featkit-0.3.0}/tests/test_layer3.py +0 -0
  93. {featkit-0.1.0 → featkit-0.3.0}/tests/test_output_contracts.py +0 -0
  94. {featkit-0.1.0 → featkit-0.3.0}/tests/test_pipeline.py +0 -0
@@ -0,0 +1,25 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.3.0] - 2026-06-08
11
+
12
+ ### Added
13
+ - `AdapterCombinationResolver` — replaces per-field `SELECT DISTINCT` queries with a single multi-column query returning only observed combinations (`feat(builders)`)
14
+ - `verbose` logging option on `PivotSpaceBuilder`, `DistributionalSpaceBuilder`, and `TemporalSpaceBuilder`, configurable via `FeatureStoreConfig` (`feat(config)`)
15
+
16
+ ### Fixed
17
+ - Marginal fields no longer contribute their name to pivot column names; e.g. `SUM__amount__channel__region_north` → `SUM__amount__region_north` (`fix(layer2)`)
18
+
19
+ ## [0.2.0] - 2026-06-02
20
+
21
+ ### Added
22
+ - Execution layer with adapter-based domain resolution (`feat(execution)`)
23
+
24
+ ### Fixed
25
+ - Lazy-import `AdapterDomainResolver`; added `pandas` to dev dependencies
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: featkit
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: featkit — automated feature store generation from relational facts tables
5
5
  Project-URL: Repository, https://github.com/Mirkiux/featkit
6
6
  Project-URL: Documentation, https://mirkiux.github.io/featkit
@@ -49,6 +49,7 @@ Provides-Extra: dev
49
49
  Requires-Dist: build>=1.0; extra == 'dev'
50
50
  Requires-Dist: hatch>=1.9; extra == 'dev'
51
51
  Requires-Dist: mypy>=1.0; extra == 'dev'
52
+ Requires-Dist: pandas>=1.5; extra == 'dev'
52
53
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
53
54
  Requires-Dist: pytest>=7.0; extra == 'dev'
54
55
  Requires-Dist: ruff>=0.4; extra == 'dev'
@@ -57,6 +58,8 @@ Provides-Extra: docs
57
58
  Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
58
59
  Requires-Dist: mkdocs>=1.6; extra == 'docs'
59
60
  Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
61
+ Provides-Extra: execution
62
+ Requires-Dist: pandas>=1.5; extra == 'execution'
60
63
  Provides-Extra: ibis
61
64
  Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
62
65
  Provides-Extra: spark
@@ -0,0 +1,209 @@
1
+ # Example — Observed-combinations pivot in a Databricks notebook
2
+
3
+ This example shows how featkit resolves pivot combinations at runtime by
4
+ querying the facts table directly from a Databricks notebook.
5
+
6
+ When an adapter is configured, `FeatureStorePipeline` constructs an
7
+ `AdapterCombinationResolver` and passes it to `PivotSpaceBuilder`. Instead of
8
+ generating the full Cartesian product of per-field domains, the builder calls the
9
+ resolver, which issues a **single `SELECT DISTINCT`** query for all pivot categoricals, and builds only the
10
+ combinations that actually exist in the data. Marginals are then derived from
11
+ those observed combinations via subset-projection.
12
+
13
+ `DatabricksNotebookAdapter` discovers the pre-injected `spark` session
14
+ automatically — no constructor arguments are needed.
15
+
16
+ ## Notebook cells
17
+
18
+ ### Cell 1 — imports
19
+
20
+ ```python
21
+ from featkit.config import FeatureStoreConfig
22
+ from featkit.dataset.base import SimpleDataset
23
+ from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
24
+ from featkit.execution.adapters import DatabricksNotebookAdapter
25
+ from featkit.fields.categorical_field import CategoricalField
26
+ from featkit.fields.id_field import IDField
27
+ from featkit.fields.measurement_field import MeasurementField
28
+ from featkit.fields.time_field import TimeField
29
+ from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
30
+ from featkit.pipeline import FeatureStorePipeline
31
+ ```
32
+
33
+ ### Cell 2 — define the dataset
34
+
35
+ ```python
36
+ ds = SimpleDataset(
37
+ "mydb.myschema.silver_transactions",
38
+ [
39
+ IDField("client_id"),
40
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
41
+ MeasurementField("amount", MeasurementType.MONTO),
42
+ MeasurementField("txn_count", MeasurementType.CANTIDAD),
43
+ # allowed_values used as WHERE IN-filter; omit to query with no filter
44
+ CategoricalField(
45
+ "segment",
46
+ CategoricalTreatment.PIVOT,
47
+ allowed_values=["retail", "sme", "corporate"],
48
+ ),
49
+ CategoricalField(
50
+ "product_type",
51
+ CategoricalTreatment.PIVOT,
52
+ allowed_values=["loan", "deposit", "card"],
53
+ ),
54
+ ],
55
+ )
56
+ ```
57
+
58
+ ### Cell 3 — configure with the notebook adapter
59
+
60
+ ```python
61
+ adapter = DatabricksNotebookAdapter()
62
+
63
+ cfg = FeatureStoreConfig(
64
+ dataset=ds,
65
+ output_schema="analytics",
66
+ output_table_prefix="feat_",
67
+ time_windows=[3, 6, 12],
68
+ include_marginals=True,
69
+ adapter=adapter, # triggers SELECT DISTINCT combination query at build()
70
+ )
71
+ ```
72
+
73
+ ### Cell 4 — build and generate
74
+
75
+ ```python
76
+ # build() issues ONE SELECT DISTINCT for all pivot categoricals:
77
+ #
78
+ # SELECT DISTINCT product_type, segment
79
+ # FROM mydb.myschema.silver_transactions
80
+ # WHERE product_type IS NOT NULL
81
+ # AND segment IS NOT NULL
82
+ # AND product_type IN ('loan', 'deposit', 'card')
83
+ # AND segment IN ('retail', 'sme', 'corporate')
84
+ # ORDER BY 1, 2
85
+ #
86
+ # Only the returned combinations (plus their marginal projections) become
87
+ # pivot columns — unobserved cross-combinations are never generated.
88
+ pipeline = FeatureStorePipeline(config=cfg).build()
89
+
90
+ print(f"Layer 2A columns : {len(pipeline.layer2a)}")
91
+ print(f"Layer 3 features: {len(pipeline.layer3)}")
92
+
93
+ result = DatabricksSQLCodeGenerator().generate(pipeline)
94
+ print(result.code.sql[:500])
95
+ ```
96
+
97
+ ### Cell 5 — save the artefacts to DBFS
98
+
99
+ ```python
100
+ result.save("/dbfs/mnt/output/features/")
101
+ # Writes:
102
+ # /dbfs/mnt/output/features/script.sql
103
+ # /dbfs/mnt/output/features/dag.json
104
+ # /dbfs/mnt/output/features/diagram.md
105
+ ```
106
+
107
+ ## How it works
108
+
109
+ `FeatureStorePipeline.build()` constructs an `AdapterCombinationResolver` and
110
+ passes it to `PivotSpaceBuilder` as the `combination_resolver` callable. The
111
+ resolver executes a single multi-column `SELECT DISTINCT`:
112
+
113
+ ```sql
114
+ SELECT DISTINCT product_type, segment
115
+ FROM mydb.myschema.silver_transactions
116
+ WHERE product_type IS NOT NULL
117
+ AND segment IS NOT NULL
118
+ AND product_type IN ('loan', 'deposit', 'card')
119
+ AND segment IN ('retail', 'sme', 'corporate')
120
+ ORDER BY 1, 2
121
+ ```
122
+
123
+ Suppose the query returns three rows:
124
+
125
+ | product_type | segment |
126
+ |-------------|-----------|
127
+ | loan | retail |
128
+ | loan | sme |
129
+ | deposit | corporate |
130
+
131
+ With `include_marginals=True`, the builder derives every subset-projection of
132
+ those rows:
133
+
134
+ | product_type | segment | interpretation |
135
+ |-------------|-----------|------------------------------------------|
136
+ | loan | retail | observed combination |
137
+ | loan | sme | observed combination |
138
+ | deposit | corporate | observed combination |
139
+ | loan | `∅` | all segments for loan |
140
+ | deposit | `∅` | all segments for deposit |
141
+ | `∅` | retail | all products for retail |
142
+ | `∅` | sme | all products for sme |
143
+ | `∅` | corporate | all products for corporate |
144
+ | `∅` | `∅` | unconditional aggregate (always present) |
145
+
146
+ Unobserved combinations (e.g. `deposit × retail`) are **never generated**,
147
+ keeping the feature space lean.
148
+
149
+ ## Fields without `allowed_values`
150
+
151
+ If a field has no `allowed_values`, it is still included in the `SELECT DISTINCT`
152
+ but no `IN`-filter is added for its column in the WHERE clause — all distinct values present
153
+ in the table are returned for that dimension:
154
+
155
+ ```python
156
+ ds = SimpleDataset(
157
+ "mydb.myschema.silver_transactions",
158
+ [
159
+ IDField("client_id"),
160
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
161
+ MeasurementField("amount", MeasurementType.MONTO),
162
+ # Static domain — used as IN-filter in the combined query
163
+ CategoricalField(
164
+ "channel",
165
+ CategoricalTreatment.PIVOT,
166
+ allowed_values=["branch", "online", "mobile"],
167
+ ),
168
+ # No allowed_values — column included without an IN-filter
169
+ CategoricalField("segment", CategoricalTreatment.PIVOT),
170
+ ],
171
+ )
172
+ ```
173
+
174
+ ## Using a different adapter
175
+
176
+ Swap `DatabricksNotebookAdapter` for any other adapter without changing the
177
+ rest of the code:
178
+
179
+ ```python
180
+ from featkit.execution.adapters import DatabricksAdapter
181
+
182
+ adapter = DatabricksAdapter(
183
+ host="<workspace>.azuredatabricks.net",
184
+ token="<pat>",
185
+ http_path="/sql/1.0/warehouses/<warehouse-id>",
186
+ catalog="mydb",
187
+ schema="myschema",
188
+ )
189
+
190
+ cfg = FeatureStoreConfig(..., adapter=adapter)
191
+ ```
192
+
193
+ ## Using `AdapterCombinationResolver` directly
194
+
195
+ The resolver can also be wired manually to `PivotSpaceBuilder` without going
196
+ through the pipeline:
197
+
198
+ ```python
199
+ from featkit.execution.domain_resolver import AdapterCombinationResolver
200
+ from featkit.builders.pivot_space import PivotSpaceBuilder
201
+
202
+ resolver = AdapterCombinationResolver(adapter, "mydb.myschema.silver_transactions")
203
+
204
+ columns = PivotSpaceBuilder(
205
+ dataset=ds,
206
+ include_marginals=True,
207
+ combination_resolver=resolver,
208
+ ).build()
209
+ ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "featkit"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "featkit — automated feature store generation from relational facts tables"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -35,6 +35,7 @@ dependencies = [
35
35
  ibis = ["ibis-framework>=9.0"]
36
36
  spark = ["pyspark>=3.4"]
37
37
  databricks = ["databricks-sql-connector>=3.0"]
38
+ execution = ["pandas>=1.5"]
38
39
  docs = [
39
40
  "mkdocs>=1.6",
40
41
  "mkdocs-material>=9.5",
@@ -48,6 +49,7 @@ dev = [
48
49
  "hatch>=1.9",
49
50
  "build>=1.0",
50
51
  "twine>=5.0",
52
+ "pandas>=1.5",
51
53
  ]
52
54
 
53
55
  [project.urls]
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import logging
5
6
  from typing import cast
6
7
 
7
8
  from featkit.contracts.measurement.defaults import get_default_contract
@@ -11,6 +12,8 @@ from featkit.fields.categorical_field import CategoricalField
11
12
  from featkit.fields.measurement_field import MeasurementField
12
13
  from featkit.layer2.distributional import DistributionalColumn
13
14
 
15
+ _log = logging.getLogger(__name__)
16
+
14
17
 
15
18
  class DistributionalSpaceBuilder:
16
19
  """Generates the full set of DistributionalColumn objects for a dataset.
@@ -26,18 +29,27 @@ class DistributionalSpaceBuilder:
26
29
  An empty list produces no columns. Every entry must be present in the
27
30
  dataset (compared by name, type, and contract); a ``ValueError`` is
28
31
  raised for unknown fields.
32
+ verbose: When ``True``, emits ``DEBUG``-level log messages at key
33
+ milestones: builder start/end, and for each generated column the
34
+ ``(categorical, measurement, aggregator, metric)`` combination dict
35
+ and the resulting column name.
29
36
  """
30
37
 
31
38
  def __init__(
32
39
  self,
33
40
  dataset: AbstractDataset,
34
41
  value_measurements: list[MeasurementField] | None = None,
42
+ verbose: bool = False,
35
43
  ) -> None:
36
44
  self.dataset = dataset
37
45
  self.value_measurements = value_measurements
46
+ self.verbose = verbose
38
47
 
39
48
  def build(self) -> list[DistributionalColumn]:
40
49
  """Build and return all DistributionalColumn objects."""
50
+ if self.verbose:
51
+ _log.debug("DistributionalSpaceBuilder.build() started")
52
+
41
53
  all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
42
54
  dist_cats = [
43
55
  c
@@ -70,8 +82,30 @@ class DistributionalSpaceBuilder:
70
82
  for agg in aggs:
71
83
  for metric in cat.distributional_metrics:
72
84
  col = DistributionalColumn(mf, agg, cat, metric)
85
+ if self.verbose:
86
+ _log.debug(
87
+ "combo: cat=%r, measurement=%r, aggregator=%s, metric=%s",
88
+ cat.name,
89
+ mf.name,
90
+ agg.value,
91
+ metric.value,
92
+ )
93
+ _log.debug(
94
+ "combination: %s",
95
+ {
96
+ "categorical": cat.name,
97
+ "measurement": mf.name,
98
+ "aggregator": agg.value,
99
+ "metric": metric.value,
100
+ },
101
+ )
102
+ _log.debug("column_name: %r", col.column_name)
73
103
  if col.column_name not in seen:
74
104
  seen.add(col.column_name)
75
105
  results.append(col)
76
106
 
107
+ if self.verbose:
108
+ _log.debug(
109
+ "DistributionalSpaceBuilder.build() done — %d column(s) generated", len(results)
110
+ )
77
111
  return results
@@ -0,0 +1,219 @@
1
+ """PivotSpaceBuilder — generates all PivotedColumn objects from a dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections.abc import Callable
7
+ from itertools import combinations as _icombinations
8
+ from itertools import product
9
+ from typing import cast
10
+
11
+ from featkit.contracts.measurement.defaults import get_default_contract
12
+ from featkit.dataset.base import AbstractDataset
13
+ from featkit.enums import CategoricalTreatment, Layer2Aggregator, MeasurementType
14
+ from featkit.fields.categorical_field import CategoricalField
15
+ from featkit.fields.measurement_field import MeasurementField
16
+ from featkit.layer2.pivoted import PivotedColumn
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
+ def _with_marginals(
22
+ observed: list[dict[CategoricalField, str]],
23
+ cats: list[CategoricalField],
24
+ ) -> list[dict[CategoricalField, str | None]]:
25
+ """Expand *observed* combinations with all ∅-substituted variants.
26
+
27
+ For each observed combination and each subset of fields, a new
28
+ combination is produced where those fields are replaced with ``None``
29
+ (the ∅ marginal sentinel). The all-None combination is always included
30
+ even when *observed* is empty, since it represents an unconditional
31
+ aggregate over all data.
32
+
33
+ Duplicates are suppressed so overlapping projections of different
34
+ observed combinations appear only once.
35
+ """
36
+ seen: set[tuple[tuple[str, str | None], ...]] = set()
37
+ result: list[dict[CategoricalField, str | None]] = []
38
+
39
+ def _append(combo: dict[CategoricalField, str | None]) -> None:
40
+ key = tuple(sorted((f.name, combo[f]) for f in cats))
41
+ if key not in seen:
42
+ seen.add(key)
43
+ result.append(combo)
44
+
45
+ _append({f: None for f in cats})
46
+
47
+ for combo in observed:
48
+ for r in range(len(cats)): # r == len(cats) (all-None) already added above
49
+ for nulled in _icombinations(cats, r):
50
+ c: dict[CategoricalField, str | None] = dict(combo)
51
+ for f in nulled:
52
+ c[f] = None
53
+ _append(c)
54
+
55
+ return result
56
+
57
+
58
+ class PivotSpaceBuilder:
59
+ """Generates the full set of PivotedColumn objects for a dataset.
60
+
61
+ Two combination strategies are supported:
62
+
63
+ * **Observed combinations** (preferred when an adapter is available):
64
+ supply a ``combination_resolver`` callable. It receives the list of
65
+ pivot categorical fields and returns only the combinations that
66
+ actually exist in the source table. Marginals are then derived from
67
+ those observed combinations rather than from the full Cartesian
68
+ product.
69
+
70
+ * **Cartesian product** (default, no adapter required): per-field
71
+ domains are resolved from ``allowed_values`` or ``domain_resolver``
72
+ and the full product is generated.
73
+
74
+ Args:
75
+ dataset: The source facts-table schema.
76
+ include_marginals: When True, ∅-substituted combinations are added
77
+ on top of the base combinations (observed or Cartesian).
78
+ aggregators_override: Per-measurement-type override list. Only
79
+ aggregators that are also contract-valid for the measurement
80
+ type are used.
81
+ combination_resolver: Callable that takes the list of pivot
82
+ ``CategoricalField`` objects and returns the observed
83
+ combinations as a list of ``{field: value}`` dicts. When
84
+ provided, ``domain_resolver`` is not used.
85
+ domain_resolver: Callable invoked per-field to resolve the domain
86
+ of a categorical whose ``allowed_values`` is None. Used only
87
+ in the Cartesian product path (i.e. when
88
+ ``combination_resolver`` is not provided). Raises
89
+ ``ValueError`` at build time if a dynamic field is encountered
90
+ and this is not provided.
91
+ verbose: When ``True``, emits ``DEBUG``-level log messages at key
92
+ milestones: builder start/end, each ``domain_resolver``
93
+ invocation with its resolved values, each ``cat_combination``
94
+ dict, and every generated column name.
95
+ """
96
+
97
+ def __init__(
98
+ self,
99
+ dataset: AbstractDataset,
100
+ include_marginals: bool = True,
101
+ aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None,
102
+ combination_resolver: (
103
+ Callable[[list[CategoricalField]], list[dict[CategoricalField, str]]] | None
104
+ ) = None,
105
+ domain_resolver: Callable[[CategoricalField], list[str]] | None = None,
106
+ verbose: bool = False,
107
+ ) -> None:
108
+ self.dataset = dataset
109
+ self.include_marginals = include_marginals
110
+ self.aggregators_override = aggregators_override
111
+ self.combination_resolver = combination_resolver
112
+ self.domain_resolver = domain_resolver
113
+ self.verbose = verbose
114
+
115
+ def build(self) -> list[PivotedColumn]:
116
+ """Build and return all PivotedColumn objects."""
117
+ if self.verbose:
118
+ _log.debug("PivotSpaceBuilder.build() started")
119
+
120
+ all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
121
+ pivot_cats = [
122
+ c
123
+ for c in all_cats
124
+ if c.treatment in {CategoricalTreatment.PIVOT, CategoricalTreatment.BOTH}
125
+ ]
126
+ measurements = [cast(MeasurementField, f) for f in self.dataset.measurement_fields]
127
+
128
+ all_combos: list[dict[CategoricalField, str | None]]
129
+
130
+ if self.combination_resolver is not None and pivot_cats:
131
+ observed_raw = self.combination_resolver(pivot_cats)
132
+ pivot_key_set = set(pivot_cats)
133
+ pivot_map = {c: c for c in pivot_cats}
134
+ observed: list[dict[CategoricalField, str]] = []
135
+ for combo in observed_raw:
136
+ if set(combo.keys()) != pivot_key_set:
137
+ raise ValueError(
138
+ "combination_resolver must return dicts keyed by all "
139
+ "pivot categorical fields"
140
+ )
141
+ if any(v is None for v in combo.values()):
142
+ raise ValueError(
143
+ "combination_resolver returned None; "
144
+ "None is reserved as the ∅ marginal sentinel"
145
+ )
146
+ observed.append({pivot_map[f]: str(v) for f, v in combo.items()})
147
+ if self.include_marginals:
148
+ all_combos = _with_marginals(observed, pivot_cats)
149
+ else:
150
+ all_combos = [dict(c) for c in observed]
151
+ else:
152
+ cat_domains: dict[CategoricalField, list[str | None]] = {}
153
+ for cat in pivot_cats:
154
+ if cat.allowed_values is not None:
155
+ raw: list[str] = list(cat.allowed_values)
156
+ elif self.domain_resolver is not None:
157
+ if self.verbose:
158
+ _log.debug("domain_resolver: resolving domain for categorical %r", cat.name)
159
+ raw = list(self.domain_resolver(cat))
160
+ if self.verbose:
161
+ _log.debug(
162
+ "domain_resolver: resolved %d value(s) for %r: %s",
163
+ len(raw),
164
+ cat.name,
165
+ raw,
166
+ )
167
+ else:
168
+ raise ValueError(
169
+ f"CategoricalField {cat.name!r} has no allowed_values and no "
170
+ f"domain_resolver was provided"
171
+ )
172
+ if any(v is None for v in raw):
173
+ raise ValueError(
174
+ f"CategoricalField {cat.name!r}: resolved domain contains None; "
175
+ f"None is reserved as the ∅ marginal sentinel"
176
+ )
177
+ domain: list[str | None] = list(raw)
178
+ if self.include_marginals:
179
+ domain = domain + [None]
180
+ cat_domains[cat] = domain
181
+
182
+ cats = list(cat_domains.keys())
183
+ combos = product(*(cat_domains[c] for c in cats)) if cats else ((),)
184
+ all_combos = [
185
+ {cats[i]: combo[i] for i in range(len(cats))} if cats else {} for combo in combos
186
+ ]
187
+
188
+ results: list[PivotedColumn] = []
189
+ seen: dict[str, PivotedColumn] = {}
190
+
191
+ for cat_combination in all_combos:
192
+ if self.verbose:
193
+ _log.debug(
194
+ "cat_combination: %s",
195
+ {c.name: v for c, v in cat_combination.items()},
196
+ )
197
+ for mf in measurements:
198
+ for agg in self._valid_aggregators(mf):
199
+ col = PivotedColumn(mf, agg, cat_combination)
200
+ if col.column_name in seen:
201
+ raise ValueError(
202
+ f"Duplicate pivot column name generated: {col.column_name!r}. "
203
+ f"Conflicting columns: {seen[col.column_name]!r} and {col!r}"
204
+ )
205
+ if self.verbose:
206
+ _log.debug("column_name: %r", col.column_name)
207
+ seen[col.column_name] = col
208
+ results.append(col)
209
+
210
+ if self.verbose:
211
+ _log.debug("PivotSpaceBuilder.build() done — %d column(s) generated", len(results))
212
+ return results
213
+
214
+ def _valid_aggregators(self, mf: MeasurementField) -> list[Layer2Aggregator]:
215
+ contract = mf.contract or get_default_contract(mf.measurement_type)
216
+ valid = contract.valid_layer2_aggregators
217
+ if self.aggregators_override and mf.measurement_type in self.aggregators_override:
218
+ return [a for a in self.aggregators_override[mf.measurement_type] if a in valid]
219
+ return sorted(valid, key=lambda a: a.value)
@@ -2,12 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import logging
5
6
  from collections.abc import Sequence
6
7
 
7
8
  from featkit.enums import Layer2OutputType, TemporalOperator, TimeWindowDirection
8
9
  from featkit.layer2.base import AbstractLayer2Column
9
10
  from featkit.layer3.temporal_feature import _POINT_IN_TIME_OPERATORS, TemporalFeature
10
11
 
12
+ _log = logging.getLogger(__name__)
13
+
11
14
  #: Operators that require composed (MEDIA_ABS / RATIO) window sizes.
12
15
  _COMPOSED_OPERATORS: frozenset[TemporalOperator] = frozenset(
13
16
  {TemporalOperator.MEDIA_ABS, TemporalOperator.RATIO}
@@ -33,6 +36,10 @@ class TemporalSpaceBuilder:
33
36
  direction: Sliding-window direction applied to every feature.
34
37
  operators_override: Per-output-type override. Only operators that are
35
38
  also contract-valid for the column's output type are used.
39
+ verbose: When ``True``, emits ``DEBUG``-level log messages at key
40
+ milestones: builder start/end, and for each generated feature the
41
+ ``(layer2_column, operator, window)`` combination dict and the
42
+ resulting column name.
36
43
  """
37
44
 
38
45
  def __init__(
@@ -42,15 +49,20 @@ class TemporalSpaceBuilder:
42
49
  composed_windows: list[int] | None = None,
43
50
  direction: TimeWindowDirection = TimeWindowDirection.BACKWARD,
44
51
  operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = None,
52
+ verbose: bool = False,
45
53
  ) -> None:
46
54
  self.layer2_columns = layer2_columns
47
55
  self.time_windows = time_windows
48
56
  self.composed_windows = composed_windows
49
57
  self.direction = direction
50
58
  self.operators_override = operators_override
59
+ self.verbose = verbose
51
60
 
52
61
  def build(self) -> list[TemporalFeature]:
53
62
  """Build and return all TemporalFeature objects."""
63
+ if self.verbose:
64
+ _log.debug("TemporalSpaceBuilder.build() started")
65
+
54
66
  results: list[TemporalFeature] = []
55
67
  seen: set[str] = set()
56
68
 
@@ -79,8 +91,26 @@ class TemporalSpaceBuilder:
79
91
 
80
92
  for ws in window_sizes:
81
93
  feat = TemporalFeature(col, op, self.direction, window_size=ws)
94
+ if self.verbose:
95
+ _log.debug(
96
+ "combo: layer2_column=%r, operator=%s, window=%s",
97
+ col.column_name,
98
+ op.value,
99
+ ws,
100
+ )
101
+ _log.debug(
102
+ "combination: %s",
103
+ {
104
+ "layer2_column": col.column_name,
105
+ "operator": op.value,
106
+ "window": ws,
107
+ },
108
+ )
109
+ _log.debug("column_name: %r", feat.column_name)
82
110
  if feat.column_name not in seen:
83
111
  seen.add(feat.column_name)
84
112
  results.append(feat)
85
113
 
114
+ if self.verbose:
115
+ _log.debug("TemporalSpaceBuilder.build() done — %d feature(s) generated", len(results))
86
116
  return results
@@ -3,10 +3,14 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING
6
7
 
7
8
  from featkit.dataset.base import AbstractDataset
8
9
  from featkit.enums import Layer2Aggregator, Layer2OutputType, MeasurementType, TemporalOperator
9
10
 
11
+ if TYPE_CHECKING:
12
+ from featkit.execution.adapters.base import DataSourceAdapter
13
+
10
14
 
11
15
  @dataclass
12
16
  class FeatureStoreConfig:
@@ -26,6 +30,14 @@ class FeatureStoreConfig:
26
30
  aggregators. Only contract-valid aggregators are used.
27
31
  operators_override: Per-output-type override for temporal operators.
28
32
  Only contract-valid operators are used.
33
+ adapter: Optional execution adapter. When provided, the observed
34
+ combinations of the pivot categorical fields are resolved at
35
+ ``FeatureStorePipeline.build()`` time via a single multi-column
36
+ ``SELECT DISTINCT`` query against the facts table.
37
+ verbose: When ``True``, the space builders emit ``DEBUG``-level log
38
+ messages at key milestones: builder start/end, ``domain_resolver``
39
+ invocations (PivotSpaceBuilder only), and each generated column name
40
+ together with the combination that produced it.
29
41
  """
30
42
 
31
43
  dataset: AbstractDataset
@@ -36,3 +48,5 @@ class FeatureStoreConfig:
36
48
  include_marginals: bool = True
37
49
  aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None
38
50
  operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = field(default=None)
51
+ adapter: DataSourceAdapter | None = None
52
+ verbose: bool = False