featkit 0.2.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. featkit-0.4.1/.github/workflows/auto-tag.yml +54 -0
  2. featkit-0.4.1/CHANGELOG.md +36 -0
  3. {featkit-0.2.0 → featkit-0.4.1}/PKG-INFO +1 -1
  4. featkit-0.4.1/docs/example_databricks_notebook.md +209 -0
  5. {featkit-0.2.0 → featkit-0.4.1}/pyproject.toml +2 -1
  6. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/builders/distributional_space.py +34 -0
  7. featkit-0.4.1/src/featkit/builders/pivot_space.py +219 -0
  8. featkit-0.4.1/src/featkit/builders/ratio_space.py +102 -0
  9. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/builders/temporal_space.py +33 -3
  10. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/config.py +12 -0
  11. featkit-0.4.1/src/featkit/execution/domain_resolver.py +194 -0
  12. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/base.py +85 -23
  13. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer2/base.py +36 -26
  14. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer2/pivoted.py +2 -1
  15. featkit-0.4.1/src/featkit/layer2/ratio.py +101 -0
  16. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer3/temporal_feature.py +3 -3
  17. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/pipeline.py +18 -5
  18. {featkit-0.2.0 → featkit-0.4.1}/tests/test_builders.py +297 -0
  19. {featkit-0.2.0 → featkit-0.4.1}/tests/test_contracts.py +1 -1
  20. featkit-0.4.1/tests/test_execution/test_domain_resolver.py +412 -0
  21. {featkit-0.2.0 → featkit-0.4.1}/tests/test_layer2.py +19 -1
  22. featkit-0.4.1/tests/test_ratio.py +389 -0
  23. featkit-0.2.0/CHANGELOG.md +0 -8
  24. featkit-0.2.0/docs/example_databricks_notebook.md +0 -141
  25. featkit-0.2.0/src/featkit/builders/pivot_space.py +0 -102
  26. featkit-0.2.0/src/featkit/execution/domain_resolver.py +0 -98
  27. featkit-0.2.0/tests/test_execution/test_domain_resolver.py +0 -233
  28. {featkit-0.2.0 → featkit-0.4.1}/.github/workflows/ci.yml +0 -0
  29. {featkit-0.2.0 → featkit-0.4.1}/.github/workflows/docs.yml +0 -0
  30. {featkit-0.2.0 → featkit-0.4.1}/.github/workflows/publish.yml +0 -0
  31. {featkit-0.2.0 → featkit-0.4.1}/.gitignore +0 -0
  32. {featkit-0.2.0 → featkit-0.4.1}/LICENSE +0 -0
  33. {featkit-0.2.0 → featkit-0.4.1}/README.md +0 -0
  34. {featkit-0.2.0 → featkit-0.4.1}/docs/.gitkeep +0 -0
  35. {featkit-0.2.0 → featkit-0.4.1}/docs/examples.md +0 -0
  36. {featkit-0.2.0 → featkit-0.4.1}/docs/general_plan.md +0 -0
  37. {featkit-0.2.0 → featkit-0.4.1}/docs/index.md +0 -0
  38. {featkit-0.2.0 → featkit-0.4.1}/docs/quickstart.md +0 -0
  39. {featkit-0.2.0 → featkit-0.4.1}/mkdocs.yml +0 -0
  40. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/__init__.py +0 -0
  41. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/builders/.gitkeep +0 -0
  42. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/builders/__init__.py +0 -0
  43. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/__init__.py +0 -0
  44. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/measurement/.gitkeep +0 -0
  45. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/measurement/__init__.py +0 -0
  46. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/measurement/base.py +0 -0
  47. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/measurement/defaults.py +0 -0
  48. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/output/.gitkeep +0 -0
  49. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/output/__init__.py +0 -0
  50. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/output/base.py +0 -0
  51. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/contracts/output/defaults.py +0 -0
  52. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/dataset/.gitkeep +0 -0
  53. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/dataset/__init__.py +0 -0
  54. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/dataset/base.py +0 -0
  55. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/enums.py +0 -0
  56. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/__init__.py +0 -0
  57. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/__init__.py +0 -0
  58. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/base.py +0 -0
  59. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/databricks_adapter.py +0 -0
  60. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/databricks_notebook_adapter.py +0 -0
  61. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/mock_adapter.py +0 -0
  62. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/spark_adapter.py +0 -0
  63. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/execution/adapters/sqlalchemy_adapter.py +0 -0
  64. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/.gitkeep +0 -0
  65. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/__init__.py +0 -0
  66. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/base.py +0 -0
  67. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/categorical_field.py +0 -0
  68. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/id_field.py +0 -0
  69. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/measurement_field.py +0 -0
  70. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/fields/time_field.py +0 -0
  71. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/__init__.py +0 -0
  72. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/base.py +0 -0
  73. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/output.py +0 -0
  74. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/pyspark/.gitkeep +0 -0
  75. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/pyspark/__init__.py +0 -0
  76. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/pyspark/databricks.py +0 -0
  77. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/.gitkeep +0 -0
  78. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/__init__.py +0 -0
  79. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/databricks.py +0 -0
  80. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/snowflake.py +0 -0
  81. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/generators/sql/spark_sql.py +0 -0
  82. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer2/.gitkeep +0 -0
  83. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer2/__init__.py +0 -0
  84. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer2/distributional.py +0 -0
  85. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer3/.gitkeep +0 -0
  86. {featkit-0.2.0 → featkit-0.4.1}/src/featkit/layer3/__init__.py +0 -0
  87. {featkit-0.2.0 → featkit-0.4.1}/tests/__init__.py +0 -0
  88. {featkit-0.2.0 → featkit-0.4.1}/tests/test_enums.py +0 -0
  89. {featkit-0.2.0 → featkit-0.4.1}/tests/test_execution/__init__.py +0 -0
  90. {featkit-0.2.0 → featkit-0.4.1}/tests/test_execution/test_adapters.py +0 -0
  91. {featkit-0.2.0 → featkit-0.4.1}/tests/test_fields.py +0 -0
  92. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/.gitkeep +0 -0
  93. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/__init__.py +0 -0
  94. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/test_base.py +0 -0
  95. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/test_pyspark.py +0 -0
  96. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/test_sql_databricks.py +0 -0
  97. {featkit-0.2.0 → featkit-0.4.1}/tests/test_generators/test_sql_snowflake.py +0 -0
  98. {featkit-0.2.0 → featkit-0.4.1}/tests/test_integration.py +0 -0
  99. {featkit-0.2.0 → featkit-0.4.1}/tests/test_layer3.py +0 -0
  100. {featkit-0.2.0 → featkit-0.4.1}/tests/test_output_contracts.py +0 -0
  101. {featkit-0.2.0 → featkit-0.4.1}/tests/test_pipeline.py +0 -0
@@ -0,0 +1,54 @@
1
# Creates and pushes a `v<version>` git tag whenever pyproject.toml changes on
# main and the version it declares has not been tagged yet.
name: Auto-tag on version bump

on:
  push:
    branches:
      - main
    paths:
      - "pyproject.toml"

jobs:
  tag:
    name: Create version tag
    runs-on: ubuntu-latest

    steps:
      # Fail fast with an actionable message instead of a cryptic checkout error.
      - name: Ensure RELEASE_TOKEN is configured
        env:
          RELEASE_TOKEN: ${{ secrets.RELEASE_TOKEN }}
        run: |
          if [ -z "$RELEASE_TOKEN" ]; then
            echo "RELEASE_TOKEN secret is not set. Add it (PAT with contents:read/write) so tag pushes can trigger publish.yml." >&2
            exit 1
          fi

      - uses: actions/checkout@v4
        with:
          # Full history so existing tags are available for the existence check.
          fetch-depth: 0
          # A PAT is required so the tag push triggers downstream workflows
          # (pushes made with GITHUB_TOKEN are intentionally excluded from
          # workflow triggers by GitHub to prevent infinite loops).
          token: ${{ secrets.RELEASE_TOKEN }}

      - name: Read version from pyproject.toml
        id: version
        run: |
          VERSION=$(grep '^version = ' pyproject.toml | head -1 | sed 's/version = "\(.*\)"/\1/')
          # Guard against a grep miss: an empty version would otherwise create the tag "v".
          if [ -z "$VERSION" ]; then
            echo "Could not read a version from pyproject.toml." >&2
            exit 1
          fi
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"

      - name: Check if tag exists
        id: tag_check
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the file-derived value is never parsed as shell code.
          TAG: v${{ steps.version.outputs.version }}
        run: |
          # Restrict the lookup to refs/tags/ so a branch or other revision
          # with the same name is not mistaken for an existing tag.
          if git rev-parse --verify --quiet "refs/tags/$TAG" >/dev/null; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
          else
            echo "exists=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Create and push tag
        if: steps.tag_check.outputs.exists == 'false'
        env:
          TAG: v${{ steps.version.outputs.version }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git tag "$TAG"
          git push origin "refs/tags/$TAG"
@@ -0,0 +1,36 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.4.1] - 2026-06-09
11
+
12
+ ### Fixed
13
+ - CI: auto-tag workflow now uses a PAT (`RELEASE_TOKEN`) to push tags so that `publish.yml` is triggered correctly (`fix(ci)`)
14
+
15
+ ## [0.4.0] - 2026-06-09
16
+
17
+ ### Added
18
+ - Ratio/percentage features (`RatioPivotedColumn`, `RatioSpaceBuilder`): for every pivot combination with at least one non-`None` categorical value, a `numerator / NULLIF(denominator, 0)` column is generated for each proper marginal projection of that combination. Controlled by `FeatureStoreConfig.include_ratios` (default `True`, requires `include_marginals=True`). (`feat(ratio)`)
19
+ - `verbose` parameter on `AdapterDomainResolver` and `AdapterCombinationResolver`: when `True`, the generated `SELECT DISTINCT` SQL is emitted at `DEBUG` level before execution. `FeatureStorePipeline` forwards `cfg.verbose` to the combination resolver automatically. (`feat(domain-resolver)`)
20
+
21
+ ## [0.3.0] - 2026-06-08
22
+
23
+ ### Added
24
+ - `AdapterCombinationResolver` — replaces per-field `SELECT DISTINCT` queries with a single multi-column query returning only observed combinations (`feat(builders)`)
25
+ - `verbose` logging option on `PivotSpaceBuilder`, `DistributionalSpaceBuilder`, and `TemporalSpaceBuilder`, configurable via `FeatureStoreConfig` (`feat(config)`)
26
+
27
+ ### Fixed
28
+ - Marginal fields no longer contribute their name to pivot column names; e.g. `SUM__amount__channel__region_north` → `SUM__amount__region_north` (`fix(layer2)`)
29
+
30
+ ## [0.2.0] - 2026-06-02
31
+
32
+ ### Added
33
+ - Execution layer with adapter-based domain resolution (`feat(execution)`)
34
+
35
+ ### Fixed
36
+ - Lazy-import `AdapterDomainResolver`; added `pandas` to dev dependencies
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: featkit
3
- Version: 0.2.0
3
+ Version: 0.4.1
4
4
  Summary: featkit — automated feature store generation from relational facts tables
5
5
  Project-URL: Repository, https://github.com/Mirkiux/featkit
6
6
  Project-URL: Documentation, https://mirkiux.github.io/featkit
@@ -0,0 +1,209 @@
1
+ # Example — Observed-combinations pivot in a Databricks notebook
2
+
3
+ This example shows how featkit resolves pivot combinations at runtime by
4
+ querying the facts table directly from a Databricks notebook.
5
+
6
+ When an adapter is configured, `FeatureStorePipeline` constructs an
7
+ `AdapterCombinationResolver` and passes it to `PivotSpaceBuilder`. Instead of
8
+ generating the full Cartesian product of per-field domains, the builder issues a
9
+ **single `SELECT DISTINCT`** query for all pivot categoricals and builds only the
10
+ combinations that actually exist in the data. Marginals are then derived from
11
+ those observed combinations via subset-projection.
12
+
13
+ `DatabricksNotebookAdapter` discovers the pre-injected `spark` session
14
+ automatically — no constructor arguments are needed.
15
+
16
+ ## Notebook cells
17
+
18
+ ### Cell 1 — imports
19
+
20
+ ```python
21
+ from featkit.config import FeatureStoreConfig
22
+ from featkit.dataset.base import SimpleDataset
23
+ from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
24
+ from featkit.execution.adapters import DatabricksNotebookAdapter
25
+ from featkit.fields.categorical_field import CategoricalField
26
+ from featkit.fields.id_field import IDField
27
+ from featkit.fields.measurement_field import MeasurementField
28
+ from featkit.fields.time_field import TimeField
29
+ from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
30
+ from featkit.pipeline import FeatureStorePipeline
31
+ ```
32
+
33
+ ### Cell 2 — define the dataset
34
+
35
+ ```python
36
+ ds = SimpleDataset(
37
+ "mydb.myschema.silver_transactions",
38
+ [
39
+ IDField("client_id"),
40
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
41
+ MeasurementField("amount", MeasurementType.MONTO),
42
+ MeasurementField("txn_count", MeasurementType.CANTIDAD),
43
+ # allowed_values used as WHERE IN-filter; omit to query with no filter
44
+ CategoricalField(
45
+ "segment",
46
+ CategoricalTreatment.PIVOT,
47
+ allowed_values=["retail", "sme", "corporate"],
48
+ ),
49
+ CategoricalField(
50
+ "product_type",
51
+ CategoricalTreatment.PIVOT,
52
+ allowed_values=["loan", "deposit", "card"],
53
+ ),
54
+ ],
55
+ )
56
+ ```
57
+
58
+ ### Cell 3 — configure with the notebook adapter
59
+
60
+ ```python
61
+ adapter = DatabricksNotebookAdapter()
62
+
63
+ cfg = FeatureStoreConfig(
64
+ dataset=ds,
65
+ output_schema="analytics",
66
+ output_table_prefix="feat_",
67
+ time_windows=[3, 6, 12],
68
+ include_marginals=True,
69
+ adapter=adapter, # triggers SELECT DISTINCT combination query at build()
70
+ )
71
+ ```
72
+
73
+ ### Cell 4 — build and generate
74
+
75
+ ```python
76
+ # build() issues ONE SELECT DISTINCT for all pivot categoricals:
77
+ #
78
+ # SELECT DISTINCT product_type, segment
79
+ # FROM mydb.myschema.silver_transactions
80
+ # WHERE product_type IS NOT NULL
81
+ # AND segment IS NOT NULL
82
+ # AND product_type IN ('loan', 'deposit', 'card')
83
+ # AND segment IN ('retail', 'sme', 'corporate')
84
+ # ORDER BY 1, 2
85
+ #
86
+ # Only the returned combinations (plus their marginal projections) become
87
+ # pivot columns — unobserved cross-combinations are never generated.
88
+ pipeline = FeatureStorePipeline(config=cfg).build()
89
+
90
+ print(f"Layer 2A columns : {len(pipeline.layer2a)}")
91
+ print(f"Layer 3 features: {len(pipeline.layer3)}")
92
+
93
+ result = DatabricksSQLCodeGenerator().generate(pipeline)
94
+ print(result.code.sql[:500])
95
+ ```
96
+
97
+ ### Cell 5 — save the artefacts to DBFS
98
+
99
+ ```python
100
+ result.save("/dbfs/mnt/output/features/")
101
+ # Writes:
102
+ # /dbfs/mnt/output/features/script.sql
103
+ # /dbfs/mnt/output/features/dag.json
104
+ # /dbfs/mnt/output/features/diagram.md
105
+ ```
106
+
107
+ ## How it works
108
+
109
+ `FeatureStorePipeline.build()` constructs an `AdapterCombinationResolver` and
110
+ passes it to `PivotSpaceBuilder` as the `combination_resolver` callable. The
111
+ resolver executes a single multi-column `SELECT DISTINCT`:
112
+
113
+ ```sql
114
+ SELECT DISTINCT product_type, segment
115
+ FROM mydb.myschema.silver_transactions
116
+ WHERE product_type IS NOT NULL
117
+ AND segment IS NOT NULL
118
+ AND product_type IN ('loan', 'deposit', 'card')
119
+ AND segment IN ('retail', 'sme', 'corporate')
120
+ ORDER BY 1, 2
121
+ ```
122
+
123
+ Suppose the query returns three rows:
124
+
125
+ | product_type | segment |
126
+ |-------------|-----------|
127
+ | loan | retail |
128
+ | loan | sme |
129
+ | deposit | corporate |
130
+
131
+ With `include_marginals=True`, the builder derives every subset-projection of
132
+ those rows:
133
+
134
+ | product_type | segment | interpretation |
135
+ |-------------|-----------|------------------------------------------|
136
+ | loan | retail | observed combination |
137
+ | loan | sme | observed combination |
138
+ | deposit | corporate | observed combination |
139
+ | loan | `∅` | all segments for loan |
140
+ | deposit | `∅` | all segments for deposit |
141
+ | `∅` | retail | all products for retail |
142
+ | `∅` | sme | all products for sme |
143
+ | `∅` | corporate | all products for corporate |
144
+ | `∅` | `∅` | unconditional aggregate (always present)|
145
+
146
+ Unobserved combinations (e.g. `deposit × retail`) are **never generated**,
147
+ keeping the feature space lean.
148
+
149
+ ## Fields without `allowed_values`
150
+
151
+ If a field has no `allowed_values`, it is still included in the `SELECT DISTINCT`
152
+ but its column is not filtered in the WHERE clause — all distinct values present
153
+ in the table are returned for that dimension:
154
+
155
+ ```python
156
+ ds = SimpleDataset(
157
+ "mydb.myschema.silver_transactions",
158
+ [
159
+ IDField("client_id"),
160
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
161
+ MeasurementField("amount", MeasurementType.MONTO),
162
+ # Static domain — used as IN-filter in the combined query
163
+ CategoricalField(
164
+ "channel",
165
+ CategoricalTreatment.PIVOT,
166
+ allowed_values=["branch", "online", "mobile"],
167
+ ),
168
+ # No allowed_values — column included without an IN-filter
169
+ CategoricalField("segment", CategoricalTreatment.PIVOT),
170
+ ],
171
+ )
172
+ ```
173
+
174
+ ## Using a different adapter
175
+
176
+ Swap `DatabricksNotebookAdapter` for any other adapter without changing the
177
+ rest of the code:
178
+
179
+ ```python
180
+ from featkit.execution.adapters import DatabricksAdapter
181
+
182
+ adapter = DatabricksAdapter(
183
+ host="<workspace>.azuredatabricks.net",
184
+ token="<pat>",
185
+ http_path="/sql/1.0/warehouses/<warehouse-id>",
186
+ catalog="mydb",
187
+ schema="myschema",
188
+ )
189
+
190
+ cfg = FeatureStoreConfig(..., adapter=adapter)
191
+ ```
192
+
193
+ ## Using `AdapterCombinationResolver` directly
194
+
195
+ The resolver can also be wired manually to `PivotSpaceBuilder` without going
196
+ through the pipeline:
197
+
198
+ ```python
199
+ from featkit.execution.domain_resolver import AdapterCombinationResolver
200
+ from featkit.builders.pivot_space import PivotSpaceBuilder
201
+
202
+ resolver = AdapterCombinationResolver(adapter, "mydb.myschema.silver_transactions")
203
+
204
+ columns = PivotSpaceBuilder(
205
+ dataset=ds,
206
+ include_marginals=True,
207
+ combination_resolver=resolver,
208
+ ).build()
209
+ ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "featkit"
7
- version = "0.2.0"
7
+ version = "0.4.1"
8
8
  description = "featkit — automated feature store generation from relational facts tables"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -78,6 +78,7 @@ module = ["tests.*"]
78
78
  disallow_untyped_defs = false
79
79
  disallow_untyped_calls = false
80
80
  disallow_any_generics = false
81
+ disallow_incomplete_defs = false
81
82
 
82
83
  [tool.pytest.ini_options]
83
84
  testpaths = ["tests"]
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import logging
5
6
  from typing import cast
6
7
 
7
8
  from featkit.contracts.measurement.defaults import get_default_contract
@@ -11,6 +12,8 @@ from featkit.fields.categorical_field import CategoricalField
11
12
  from featkit.fields.measurement_field import MeasurementField
12
13
  from featkit.layer2.distributional import DistributionalColumn
13
14
 
15
+ _log = logging.getLogger(__name__)
16
+
14
17
 
15
18
  class DistributionalSpaceBuilder:
16
19
  """Generates the full set of DistributionalColumn objects for a dataset.
@@ -26,18 +29,27 @@ class DistributionalSpaceBuilder:
26
29
  An empty list produces no columns. Every entry must be present in the
27
30
  dataset (compared by name, type, and contract); a ``ValueError`` is
28
31
  raised for unknown fields.
32
+ verbose: When ``True``, emits ``DEBUG``-level log messages at key
33
+ milestones: builder start/end, and for each generated column the
34
+ ``(categorical, measurement, aggregator, metric)`` combination dict
35
+ and the resulting column name.
29
36
  """
30
37
 
31
38
  def __init__(
32
39
  self,
33
40
  dataset: AbstractDataset,
34
41
  value_measurements: list[MeasurementField] | None = None,
42
+ verbose: bool = False,
35
43
  ) -> None:
36
44
  self.dataset = dataset
37
45
  self.value_measurements = value_measurements
46
+ self.verbose = verbose
38
47
 
39
48
  def build(self) -> list[DistributionalColumn]:
40
49
  """Build and return all DistributionalColumn objects."""
50
+ if self.verbose:
51
+ _log.debug("DistributionalSpaceBuilder.build() started")
52
+
41
53
  all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
42
54
  dist_cats = [
43
55
  c
@@ -70,8 +82,30 @@ class DistributionalSpaceBuilder:
70
82
  for agg in aggs:
71
83
  for metric in cat.distributional_metrics:
72
84
  col = DistributionalColumn(mf, agg, cat, metric)
85
+ if self.verbose:
86
+ _log.debug(
87
+ "combo: cat=%r, measurement=%r, aggregator=%s, metric=%s",
88
+ cat.name,
89
+ mf.name,
90
+ agg.value,
91
+ metric.value,
92
+ )
93
+ _log.debug(
94
+ "combination: %s",
95
+ {
96
+ "categorical": cat.name,
97
+ "measurement": mf.name,
98
+ "aggregator": agg.value,
99
+ "metric": metric.value,
100
+ },
101
+ )
102
+ _log.debug("column_name: %r", col.column_name)
73
103
  if col.column_name not in seen:
74
104
  seen.add(col.column_name)
75
105
  results.append(col)
76
106
 
107
+ if self.verbose:
108
+ _log.debug(
109
+ "DistributionalSpaceBuilder.build() done — %d column(s) generated", len(results)
110
+ )
77
111
  return results
@@ -0,0 +1,219 @@
1
+ """PivotSpaceBuilder — generates all PivotedColumn objects from a dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections.abc import Callable
7
+ from itertools import combinations as _icombinations
8
+ from itertools import product
9
+ from typing import cast
10
+
11
+ from featkit.contracts.measurement.defaults import get_default_contract
12
+ from featkit.dataset.base import AbstractDataset
13
+ from featkit.enums import CategoricalTreatment, Layer2Aggregator, MeasurementType
14
+ from featkit.fields.categorical_field import CategoricalField
15
+ from featkit.fields.measurement_field import MeasurementField
16
+ from featkit.layer2.pivoted import PivotedColumn
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
def _with_marginals(
    observed: list[dict[CategoricalField, str]],
    cats: list[CategoricalField],
) -> list[dict[CategoricalField, str | None]]:
    """Return *observed* plus every marginal (∅-substituted) projection of it.

    The fully marginal combination (every field mapped to ``None``) always
    comes first, even when *observed* is empty. After it, each observed
    combination contributes itself followed by every variant in which a
    proper subset of *cats* has been replaced by ``None``.

    Combinations are identified by their ``(field name, value)`` pairs; a
    combination that was already emitted is skipped, so projections shared
    by several observed rows appear once, in first-seen order.
    """
    emitted_keys: set[tuple[tuple[str, str | None], ...]] = set()
    expanded: list[dict[CategoricalField, str | None]] = []

    def _register(candidate: dict[CategoricalField, str | None]) -> None:
        signature = tuple(sorted((field.name, candidate[field]) for field in cats))
        if signature in emitted_keys:
            return
        emitted_keys.add(signature)
        expanded.append(candidate)

    # Unconditional aggregate: every pivot field is marginalised.
    _register(dict.fromkeys(cats))

    for base in observed:
        # Blank out 0 .. len(cats) - 1 fields; blanking all of them would
        # reproduce the unconditional aggregate registered above.
        for blank_count in range(len(cats)):
            for blanked in _icombinations(cats, blank_count):
                variant: dict[CategoricalField, str | None] = {
                    **base,
                    **dict.fromkeys(blanked),
                }
                _register(variant)

    return expanded
56
+
57
+
58
class PivotSpaceBuilder:
    """Generates the full set of PivotedColumn objects for a dataset.

    Two combination strategies are supported:

    * **Observed combinations** (preferred when an adapter is available):
      supply a ``combination_resolver`` callable. It receives the list of
      pivot categorical fields and returns only the combinations that
      actually exist in the source table. Marginals are then derived from
      those observed combinations rather than from the full Cartesian
      product. Repeated combinations in the resolver output are collapsed
      to their first occurrence, with or without marginals.

    * **Cartesian product** (default, no adapter required): per-field
      domains are resolved from ``allowed_values`` or ``domain_resolver``
      and the full product is generated.

    Args:
        dataset: The source facts-table schema.
        include_marginals: When True, ∅-substituted combinations are added
            on top of the base combinations (observed or Cartesian).
        aggregators_override: Per-measurement-type override list. Only
            aggregators that are also contract-valid for the measurement
            type are used.
        combination_resolver: Callable that takes the list of pivot
            ``CategoricalField`` objects and returns the observed
            combinations as a list of ``{field: value}`` dicts. When
            provided (and the dataset has pivot categoricals),
            ``domain_resolver`` is not used.
        domain_resolver: Callable invoked per-field to resolve the domain
            of a categorical whose ``allowed_values`` is None. Used only
            in the Cartesian product path (i.e. when
            ``combination_resolver`` is not provided). Raises
            ``ValueError`` at build time if a dynamic field is encountered
            and this is not provided.
        verbose: When ``True``, emits ``DEBUG``-level log messages at key
            milestones: builder start/end, each ``domain_resolver``
            invocation with its resolved values, each ``cat_combination``
            dict, and every generated column name.
    """

    def __init__(
        self,
        dataset: AbstractDataset,
        include_marginals: bool = True,
        aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None,
        combination_resolver: (
            Callable[[list[CategoricalField]], list[dict[CategoricalField, str]]] | None
        ) = None,
        domain_resolver: Callable[[CategoricalField], list[str]] | None = None,
        verbose: bool = False,
    ) -> None:
        self.dataset = dataset
        self.include_marginals = include_marginals
        self.aggregators_override = aggregators_override
        self.combination_resolver = combination_resolver
        self.domain_resolver = domain_resolver
        self.verbose = verbose

    def build(self) -> list[PivotedColumn]:
        """Build and return all PivotedColumn objects.

        One column is produced per (categorical combination, measurement,
        valid aggregator) triple, in that nesting order.

        Raises:
            ValueError: If the resolver output is malformed (wrong keys or
                ``None`` values), if a pivot field has neither
                ``allowed_values`` nor a ``domain_resolver``, if a resolved
                domain contains ``None``, or if two distinct columns end up
                with the same column name.
        """
        if self.verbose:
            _log.debug("PivotSpaceBuilder.build() started")

        all_cats = [cast(CategoricalField, f) for f in self.dataset.categorical_fields]
        pivot_cats = [
            c
            for c in all_cats
            if c.treatment in {CategoricalTreatment.PIVOT, CategoricalTreatment.BOTH}
        ]
        measurements = [cast(MeasurementField, f) for f in self.dataset.measurement_fields]

        resolver = self.combination_resolver
        all_combos: list[dict[CategoricalField, str | None]]
        # Without pivot categoricals there is nothing to ask the resolver for;
        # the Cartesian path then yields the single empty combination.
        if resolver is not None and pivot_cats:
            all_combos = self._observed_combinations(resolver, pivot_cats)
        else:
            all_combos = self._cartesian_combinations(pivot_cats)

        results: list[PivotedColumn] = []
        seen: dict[str, PivotedColumn] = {}

        for cat_combination in all_combos:
            if self.verbose:
                _log.debug(
                    "cat_combination: %s",
                    {c.name: v for c, v in cat_combination.items()},
                )
            for mf in measurements:
                for agg in self._valid_aggregators(mf):
                    col = PivotedColumn(mf, agg, cat_combination)
                    if col.column_name in seen:
                        raise ValueError(
                            f"Duplicate pivot column name generated: {col.column_name!r}. "
                            f"Conflicting columns: {seen[col.column_name]!r} and {col!r}"
                        )
                    if self.verbose:
                        _log.debug("column_name: %r", col.column_name)
                    seen[col.column_name] = col
                    results.append(col)

        if self.verbose:
            _log.debug("PivotSpaceBuilder.build() done — %d column(s) generated", len(results))
        return results

    def _observed_combinations(
        self,
        resolver: Callable[[list[CategoricalField]], list[dict[CategoricalField, str]]],
        pivot_cats: list[CategoricalField],
    ) -> list[dict[CategoricalField, str | None]]:
        """Validate, normalise and de-duplicate the combinations from *resolver*."""
        pivot_key_set = set(pivot_cats)
        # Re-key every combination onto the dataset's own field instances so
        # equal-but-distinct field objects returned by the resolver are unified.
        pivot_map = {c: c for c in pivot_cats}

        observed: list[dict[CategoricalField, str]] = []
        seen_values: set[tuple[str, ...]] = set()
        for combo in resolver(pivot_cats):
            if set(combo.keys()) != pivot_key_set:
                raise ValueError(
                    "combination_resolver must return dicts keyed by all "
                    "pivot categorical fields"
                )
            if any(v is None for v in combo.values()):
                raise ValueError(
                    "combination_resolver returned None; "
                    "None is reserved as the ∅ marginal sentinel"
                )
            normalised = {pivot_map[f]: str(v) for f, v in combo.items()}
            # Drop repeats here so both the marginal and non-marginal paths see
            # each combination once; otherwise a repeated row would surface as
            # a misleading "duplicate column name" error when marginals are off.
            values = tuple(normalised[c] for c in pivot_cats)
            if values in seen_values:
                continue
            seen_values.add(values)
            observed.append(normalised)

        if self.include_marginals:
            return _with_marginals(observed, pivot_cats)
        return [dict(c) for c in observed]

    def _cartesian_combinations(
        self,
        pivot_cats: list[CategoricalField],
    ) -> list[dict[CategoricalField, str | None]]:
        """Return the Cartesian product of the per-field domains of *pivot_cats*."""
        cat_domains: dict[CategoricalField, list[str | None]] = {}
        for cat in pivot_cats:
            if cat.allowed_values is not None:
                raw: list[str] = list(cat.allowed_values)
            elif self.domain_resolver is not None:
                if self.verbose:
                    _log.debug("domain_resolver: resolving domain for categorical %r", cat.name)
                raw = list(self.domain_resolver(cat))
                if self.verbose:
                    _log.debug(
                        "domain_resolver: resolved %d value(s) for %r: %s",
                        len(raw),
                        cat.name,
                        raw,
                    )
            else:
                raise ValueError(
                    f"CategoricalField {cat.name!r} has no allowed_values and no "
                    f"domain_resolver was provided"
                )
            if any(v is None for v in raw):
                raise ValueError(
                    f"CategoricalField {cat.name!r}: resolved domain contains None; "
                    f"None is reserved as the ∅ marginal sentinel"
                )
            domain: list[str | None] = list(raw)
            if self.include_marginals:
                # None is the ∅ sentinel: "aggregate over every value of this field".
                domain = domain + [None]
            cat_domains[cat] = domain

        cats = list(cat_domains)
        # product() with no iterables yields exactly one empty tuple, so a
        # dataset without pivot categoricals gets the single combination {}.
        return [dict(zip(cats, combo)) for combo in product(*(cat_domains[c] for c in cats))]

    def _valid_aggregators(self, mf: MeasurementField) -> list[Layer2Aggregator]:
        """Return the Layer-2 aggregators to generate for *mf*.

        When ``aggregators_override`` has an entry for the measurement type,
        that list is used in its given order, filtered down to the
        aggregators the measurement contract allows. Otherwise all
        contract-valid aggregators are returned, sorted by their value.
        """
        contract = mf.contract or get_default_contract(mf.measurement_type)
        valid = contract.valid_layer2_aggregators
        if self.aggregators_override and mf.measurement_type in self.aggregators_override:
            return [a for a in self.aggregators_override[mf.measurement_type] if a in valid]
        return sorted(valid, key=lambda a: a.value)