featkit 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {featkit-0.1.0 → featkit-0.2.0}/PKG-INFO +4 -1
- featkit-0.2.0/docs/example_databricks_notebook.md +141 -0
- {featkit-0.1.0 → featkit-0.2.0}/pyproject.toml +3 -1
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/config.py +9 -0
- featkit-0.2.0/src/featkit/execution/__init__.py +1 -0
- featkit-0.2.0/src/featkit/execution/adapters/__init__.py +18 -0
- featkit-0.2.0/src/featkit/execution/adapters/base.py +63 -0
- featkit-0.2.0/src/featkit/execution/adapters/databricks_adapter.py +61 -0
- featkit-0.2.0/src/featkit/execution/adapters/databricks_notebook_adapter.py +117 -0
- featkit-0.2.0/src/featkit/execution/adapters/mock_adapter.py +35 -0
- featkit-0.2.0/src/featkit/execution/adapters/spark_adapter.py +28 -0
- featkit-0.2.0/src/featkit/execution/adapters/sqlalchemy_adapter.py +44 -0
- featkit-0.2.0/src/featkit/execution/domain_resolver.py +98 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/pipeline.py +8 -0
- featkit-0.2.0/tests/test_execution/test_adapters.py +77 -0
- featkit-0.2.0/tests/test_execution/test_domain_resolver.py +233 -0
- featkit-0.2.0/tests/test_generators/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/ci.yml +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/docs.yml +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/publish.yml +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/.gitignore +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/CHANGELOG.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/LICENSE +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/README.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/docs/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/docs/examples.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/docs/general_plan.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/docs/index.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/docs/quickstart.md +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/mkdocs.yml +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/distributional_space.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/pivot_space.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/temporal_space.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/defaults.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/defaults.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/enums.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/categorical_field.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/id_field.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/measurement_field.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/time_field.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/output.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/snowflake.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/spark_sql.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/distributional.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/pivoted.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/temporal_feature.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_builders.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_contracts.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_enums.py +0 -0
- {featkit-0.1.0/tests/test_generators → featkit-0.2.0/tests/test_execution}/__init__.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_fields.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/.gitkeep +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_base.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_pyspark.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_sql_databricks.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_sql_snowflake.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_integration.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_layer2.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_layer3.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_output_contracts.py +0 -0
- {featkit-0.1.0 → featkit-0.2.0}/tests/test_pipeline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: featkit
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
5
|
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
6
|
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
@@ -49,6 +49,7 @@ Provides-Extra: dev
|
|
|
49
49
|
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
50
|
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
51
|
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pandas>=1.5; extra == 'dev'
|
|
52
53
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
53
54
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
54
55
|
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
@@ -57,6 +58,8 @@ Provides-Extra: docs
|
|
|
57
58
|
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
58
59
|
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
59
60
|
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
61
|
+
Provides-Extra: execution
|
|
62
|
+
Requires-Dist: pandas>=1.5; extra == 'execution'
|
|
60
63
|
Provides-Extra: ibis
|
|
61
64
|
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
62
65
|
Provides-Extra: spark
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Example — Dynamic domain resolution in a Databricks notebook
|
|
2
|
+
|
|
3
|
+
This example shows how to let featkit resolve the `allowed_values` domain of a
|
|
4
|
+
`CategoricalField` at runtime by querying the facts table directly from a
|
|
5
|
+
Databricks notebook.
|
|
6
|
+
|
|
7
|
+
`DatabricksNotebookAdapter` discovers the pre-injected `spark` session
|
|
8
|
+
automatically — no constructor arguments are needed.
|
|
9
|
+
|
|
10
|
+
## Notebook cells
|
|
11
|
+
|
|
12
|
+
### Cell 1 — imports
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from featkit.config import FeatureStoreConfig
|
|
16
|
+
from featkit.dataset.base import SimpleDataset
|
|
17
|
+
from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
|
|
18
|
+
from featkit.execution.adapters import DatabricksNotebookAdapter
|
|
19
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
20
|
+
from featkit.fields.id_field import IDField
|
|
21
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
22
|
+
from featkit.fields.time_field import TimeField
|
|
23
|
+
from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
|
|
24
|
+
from featkit.pipeline import FeatureStorePipeline
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Cell 2 — define the dataset (no `allowed_values` on the categorical)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
ds = SimpleDataset(
|
|
31
|
+
"mydb.myschema.silver_transactions",
|
|
32
|
+
[
|
|
33
|
+
IDField("client_id"),
|
|
34
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
35
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
36
|
+
MeasurementField("txn_count", MeasurementType.CANTIDAD),
|
|
37
|
+
# No allowed_values — the adapter will resolve the domain at build() time
|
|
38
|
+
CategoricalField("segment", CategoricalTreatment.PIVOT),
|
|
39
|
+
CategoricalField("product_type", CategoricalTreatment.PIVOT),
|
|
40
|
+
],
|
|
41
|
+
)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Cell 3 — configure with the notebook adapter
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
adapter = DatabricksNotebookAdapter()
|
|
48
|
+
|
|
49
|
+
cfg = FeatureStoreConfig(
|
|
50
|
+
dataset=ds,
|
|
51
|
+
output_schema="analytics",
|
|
52
|
+
output_table_prefix="feat_",
|
|
53
|
+
time_windows=[3, 6, 12],
|
|
54
|
+
include_marginals=True,
|
|
55
|
+
adapter=adapter, # triggers SELECT DISTINCT resolution at build()
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Cell 4 — build and generate
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
# build() fires one SELECT DISTINCT per unresolved CategoricalField
|
|
63
|
+
pipeline = FeatureStorePipeline(config=cfg).build()
|
|
64
|
+
|
|
65
|
+
print(f"Layer 2A columns : {len(pipeline.layer2a)}")
|
|
66
|
+
print(f"Layer 3 features: {len(pipeline.layer3)}")
|
|
67
|
+
|
|
68
|
+
result = DatabricksSQLCodeGenerator().generate(pipeline)
|
|
69
|
+
print(result.code.sql[:500])
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Cell 5 — save the artefacts to DBFS
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
result.save("/dbfs/mnt/output/features/")
|
|
76
|
+
# Writes:
|
|
77
|
+
# /dbfs/mnt/output/features/script.sql
|
|
78
|
+
# /dbfs/mnt/output/features/dag.json
|
|
79
|
+
# /dbfs/mnt/output/features/diagram.md
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## How it works
|
|
83
|
+
|
|
84
|
+
When `FeatureStorePipeline.build()` is called with an `adapter` set on the
|
|
85
|
+
config, it constructs an `AdapterDomainResolver` and passes it to
|
|
86
|
+
`PivotSpaceBuilder` as the `domain_resolver` callable. For each
|
|
87
|
+
`CategoricalField` that has no `allowed_values`, the builder calls the resolver,
|
|
88
|
+
which executes:
|
|
89
|
+
|
|
90
|
+
```sql
|
|
91
|
+
SELECT DISTINCT segment
|
|
92
|
+
FROM mydb.myschema.silver_transactions
|
|
93
|
+
WHERE segment IS NOT NULL
|
|
94
|
+
ORDER BY 1
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The returned values become the column domain exactly as if they had been listed
|
|
98
|
+
in `allowed_values` at configuration time.
|
|
99
|
+
|
|
100
|
+
## Mixing static and dynamic domains
|
|
101
|
+
|
|
102
|
+
Static and dynamic fields can coexist in the same dataset. Fields that have
|
|
103
|
+
`allowed_values` set are used as-is; only fields without it trigger a query:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
ds = SimpleDataset(
|
|
107
|
+
"mydb.myschema.silver_transactions",
|
|
108
|
+
[
|
|
109
|
+
IDField("client_id"),
|
|
110
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
111
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
112
|
+
# Static domain — no query fired
|
|
113
|
+
CategoricalField(
|
|
114
|
+
"channel",
|
|
115
|
+
CategoricalTreatment.PIVOT,
|
|
116
|
+
allowed_values=["branch", "online", "mobile"],
|
|
117
|
+
),
|
|
118
|
+
# Dynamic domain — one SELECT DISTINCT executed at build()
|
|
119
|
+
CategoricalField("segment", CategoricalTreatment.PIVOT),
|
|
120
|
+
],
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Using a different adapter
|
|
125
|
+
|
|
126
|
+
Swap `DatabricksNotebookAdapter` for any other adapter without changing the
|
|
127
|
+
rest of the code:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from featkit.execution.adapters import DatabricksAdapter
|
|
131
|
+
|
|
132
|
+
adapter = DatabricksAdapter(
|
|
133
|
+
host="<workspace>.azuredatabricks.net",
|
|
134
|
+
token="<pat>",
|
|
135
|
+
http_path="/sql/1.0/warehouses/<warehouse-id>",
|
|
136
|
+
catalog="mydb",
|
|
137
|
+
schema="myschema",
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
cfg = FeatureStoreConfig(..., adapter=adapter)
|
|
141
|
+
```
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "featkit"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "featkit — automated feature store generation from relational facts tables"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -35,6 +35,7 @@ dependencies = [
|
|
|
35
35
|
ibis = ["ibis-framework>=9.0"]
|
|
36
36
|
spark = ["pyspark>=3.4"]
|
|
37
37
|
databricks = ["databricks-sql-connector>=3.0"]
|
|
38
|
+
execution = ["pandas>=1.5"]
|
|
38
39
|
docs = [
|
|
39
40
|
"mkdocs>=1.6",
|
|
40
41
|
"mkdocs-material>=9.5",
|
|
@@ -48,6 +49,7 @@ dev = [
|
|
|
48
49
|
"hatch>=1.9",
|
|
49
50
|
"build>=1.0",
|
|
50
51
|
"twine>=5.0",
|
|
52
|
+
"pandas>=1.5",
|
|
51
53
|
]
|
|
52
54
|
|
|
53
55
|
[project.urls]
|
|
@@ -3,10 +3,14 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
8
|
from featkit.dataset.base import AbstractDataset
|
|
8
9
|
from featkit.enums import Layer2Aggregator, Layer2OutputType, MeasurementType, TemporalOperator
|
|
9
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from featkit.execution.adapters.base import DataSourceAdapter
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
@dataclass
|
|
12
16
|
class FeatureStoreConfig:
|
|
@@ -26,6 +30,10 @@ class FeatureStoreConfig:
|
|
|
26
30
|
aggregators. Only contract-valid aggregators are used.
|
|
27
31
|
operators_override: Per-output-type override for temporal operators.
|
|
28
32
|
Only contract-valid operators are used.
|
|
33
|
+
adapter: Optional execution adapter. When provided, categorical fields
|
|
34
|
+
with no ``allowed_values`` have their domain resolved at
|
|
35
|
+
``FeatureStorePipeline.build()`` time via a ``SELECT DISTINCT``
|
|
36
|
+
query against the facts table.
|
|
29
37
|
"""
|
|
30
38
|
|
|
31
39
|
dataset: AbstractDataset
|
|
@@ -36,3 +44,4 @@ class FeatureStoreConfig:
|
|
|
36
44
|
include_marginals: bool = True
|
|
37
45
|
aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None
|
|
38
46
|
operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = field(default=None)
|
|
47
|
+
adapter: DataSourceAdapter | None = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""featkit.execution — adapters and domain resolution for live query execution."""
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""featkit.execution.adapters — data source adapters for domain resolution."""
|
|
2
|
+
|
|
3
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
4
|
+
from featkit.execution.adapters.databricks_adapter import DatabricksAdapter
|
|
5
|
+
from featkit.execution.adapters.databricks_notebook_adapter import DatabricksNotebookAdapter
|
|
6
|
+
from featkit.execution.adapters.mock_adapter import MockAdapter
|
|
7
|
+
from featkit.execution.adapters.spark_adapter import SparkAdapter
|
|
8
|
+
from featkit.execution.adapters.sqlalchemy_adapter import SQLAlchemyAdapter
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"DataSourceAdapter",
|
|
12
|
+
"DatabricksAdapter",
|
|
13
|
+
"DatabricksNotebookAdapter",
|
|
14
|
+
"EngineType",
|
|
15
|
+
"MockAdapter",
|
|
16
|
+
"SparkAdapter",
|
|
17
|
+
"SQLAlchemyAdapter",
|
|
18
|
+
]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Base adapter ABC and EngineType enum for featkit execution layer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from enum import Enum
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
_log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EngineType(Enum):
|
|
15
|
+
"""Identifies the execution engine behind a :class:`DataSourceAdapter`."""
|
|
16
|
+
|
|
17
|
+
SQLALCHEMY = "sqlalchemy"
|
|
18
|
+
DATABRICKS = "databricks"
|
|
19
|
+
SPARK = "spark"
|
|
20
|
+
MOCK = "mock"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DataSourceAdapter(ABC):
|
|
24
|
+
"""Abstract base for all data source adapters.
|
|
25
|
+
|
|
26
|
+
Subclasses implement :meth:`engine_execute` with the engine-specific query
|
|
27
|
+
logic. The public :meth:`execute` method wraps ``engine_execute`` with
|
|
28
|
+
error handling: if the query fails the offending SQL is logged at ``ERROR``
|
|
29
|
+
level before re-raising, making debugging significantly easier in
|
|
30
|
+
production pipelines.
|
|
31
|
+
|
|
32
|
+
This follows the *Template Method* pattern — the base class owns the
|
|
33
|
+
algorithm skeleton; subclasses supply only the engine-specific step.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def execute(self, sql: str) -> pd.DataFrame:
|
|
37
|
+
"""Execute *sql* and return a DataFrame.
|
|
38
|
+
|
|
39
|
+
Delegates to :meth:`engine_execute`. On failure, logs the SQL that
|
|
40
|
+
caused the error and re-raises the original exception unchanged.
|
|
41
|
+
"""
|
|
42
|
+
try:
|
|
43
|
+
return self.engine_execute(sql)
|
|
44
|
+
except Exception:
|
|
45
|
+
_log.exception(
|
|
46
|
+
"SQL execution failed on %s.\nFailed query:\n%s",
|
|
47
|
+
self.__class__.__name__,
|
|
48
|
+
sql,
|
|
49
|
+
)
|
|
50
|
+
raise
|
|
51
|
+
|
|
52
|
+
@abstractmethod
|
|
53
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
54
|
+
"""Execute *sql* against the underlying engine and return a DataFrame.
|
|
55
|
+
|
|
56
|
+
Implement this method in each adapter subclass. Do not call it
|
|
57
|
+
directly — use :meth:`execute` instead so that error handling and any
|
|
58
|
+
future cross-cutting behaviour (retries, metrics) are applied.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
@abstractmethod
|
|
62
|
+
def engine_type(self) -> EngineType:
|
|
63
|
+
"""Return the :class:`EngineType` for this adapter."""
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Adapter for Databricks SQL warehouses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DatabricksAdapter(DataSourceAdapter):
|
|
11
|
+
"""Adapter for Databricks SQL warehouses.
|
|
12
|
+
|
|
13
|
+
Requires the optional ``databricks-sql-connector`` package::
|
|
14
|
+
|
|
15
|
+
pip install databricks-sql-connector
|
|
16
|
+
|
|
17
|
+
The connector is imported lazily inside :meth:`engine_execute` so that the
|
|
18
|
+
class can be referenced in code that runs on environments without the
|
|
19
|
+
package installed — the ``ImportError`` is only raised when a query is
|
|
20
|
+
actually attempted.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
host: str,
|
|
26
|
+
token: str,
|
|
27
|
+
http_path: str,
|
|
28
|
+
catalog: str,
|
|
29
|
+
schema: str,
|
|
30
|
+
) -> None:
|
|
31
|
+
self._host = host
|
|
32
|
+
self._token = token
|
|
33
|
+
self._http_path = http_path
|
|
34
|
+
self._catalog = catalog
|
|
35
|
+
self._schema = schema
|
|
36
|
+
|
|
37
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
38
|
+
try:
|
|
39
|
+
from databricks import sql as dbsql
|
|
40
|
+
except ImportError as exc:
|
|
41
|
+
raise ImportError(
|
|
42
|
+
"databricks-sql-connector is required for DatabricksAdapter. "
|
|
43
|
+
"Install with: pip install databricks-sql-connector"
|
|
44
|
+
) from exc
|
|
45
|
+
|
|
46
|
+
with (
|
|
47
|
+
dbsql.connect(
|
|
48
|
+
server_hostname=self._host,
|
|
49
|
+
http_path=self._http_path,
|
|
50
|
+
access_token=self._token,
|
|
51
|
+
catalog=self._catalog,
|
|
52
|
+
schema=self._schema,
|
|
53
|
+
) as conn,
|
|
54
|
+
conn.cursor() as cursor,
|
|
55
|
+
):
|
|
56
|
+
cursor.execute(sql)
|
|
57
|
+
result: pd.DataFrame = cursor.fetchall_arrow().to_pandas()
|
|
58
|
+
return result
|
|
59
|
+
|
|
60
|
+
def engine_type(self) -> EngineType:
|
|
61
|
+
return EngineType.DATABRICKS
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Adapter for Databricks notebook environments.
|
|
2
|
+
|
|
3
|
+
In Databricks notebooks the Spark session is pre-instantiated and injected
|
|
4
|
+
into the notebook's global namespace as ``spark``. This adapter discovers
|
|
5
|
+
that session automatically — no constructor arguments required.
|
|
6
|
+
|
|
7
|
+
Usage inside a Databricks notebook::
|
|
8
|
+
|
|
9
|
+
from featkit.execution.adapters import DatabricksNotebookAdapter
|
|
10
|
+
|
|
11
|
+
adapter = DatabricksNotebookAdapter()
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _resolve_spark() -> Any:
|
|
25
|
+
"""Locate the ``spark`` session injected by the Databricks notebook runtime.
|
|
26
|
+
|
|
27
|
+
Databricks injects ``spark`` into the ``__main__`` module namespace before
|
|
28
|
+
the first notebook cell executes. This function retrieves it without
|
|
29
|
+
requiring the caller to hold a reference or import PySpark explicitly.
|
|
30
|
+
|
|
31
|
+
Raises
|
|
32
|
+
------
|
|
33
|
+
RuntimeError
|
|
34
|
+
When no ``spark`` object can be found. This most commonly means the
|
|
35
|
+
adapter is being used outside a Databricks notebook environment.
|
|
36
|
+
"""
|
|
37
|
+
main = sys.modules.get("__main__", None)
|
|
38
|
+
spark = getattr(main, "spark", None) if main is not None else None
|
|
39
|
+
|
|
40
|
+
if spark is None:
|
|
41
|
+
raise RuntimeError(
|
|
42
|
+
"Could not locate 'spark' in the notebook runtime namespace. "
|
|
43
|
+
"DatabricksNotebookAdapter is intended for use inside Databricks "
|
|
44
|
+
"notebooks where 'spark' is pre-injected by the runtime. "
|
|
45
|
+
"Outside that environment, use SparkAdapter(spark_session) instead."
|
|
46
|
+
)
|
|
47
|
+
return spark
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DatabricksNotebookAdapter(DataSourceAdapter):
|
|
51
|
+
"""Adapter for Databricks notebook environments.
|
|
52
|
+
|
|
53
|
+
Wraps the ``spark`` session that the Databricks runtime pre-injects into
|
|
54
|
+
every notebook's global namespace. No constructor arguments are needed —
|
|
55
|
+
the session is resolved lazily on the first :meth:`execute` call.
|
|
56
|
+
|
|
57
|
+
Examples
|
|
58
|
+
--------
|
|
59
|
+
Inside a Databricks notebook::
|
|
60
|
+
|
|
61
|
+
from featkit.execution.adapters import DatabricksNotebookAdapter
|
|
62
|
+
from featkit.config import FeatureStoreConfig
|
|
63
|
+
from featkit.dataset.base import SimpleDataset
|
|
64
|
+
from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
|
|
65
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
66
|
+
from featkit.fields.id_field import IDField
|
|
67
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
68
|
+
from featkit.fields.time_field import TimeField
|
|
69
|
+
from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
|
|
70
|
+
from featkit.pipeline import FeatureStorePipeline
|
|
71
|
+
|
|
72
|
+
adapter = DatabricksNotebookAdapter()
|
|
73
|
+
|
|
74
|
+
ds = SimpleDataset(
|
|
75
|
+
"mydb.silver_transactions",
|
|
76
|
+
[
|
|
77
|
+
IDField("client_id"),
|
|
78
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
79
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
80
|
+
CategoricalField("segment", CategoricalTreatment.PIVOT), # no allowed_values
|
|
81
|
+
],
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
cfg = FeatureStoreConfig(
|
|
85
|
+
dataset=ds,
|
|
86
|
+
output_schema="analytics",
|
|
87
|
+
output_table_prefix="feat_",
|
|
88
|
+
time_windows=[3, 6, 12],
|
|
89
|
+
adapter=adapter,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
pipeline = FeatureStorePipeline(config=cfg).build()
|
|
93
|
+
result = DatabricksSQLCodeGenerator().generate(pipeline)
|
|
94
|
+
result.save("/dbfs/mnt/output/features/")
|
|
95
|
+
|
|
96
|
+
Outside a notebook (e.g. in a standalone script or test), use
|
|
97
|
+
:class:`SparkAdapter` instead and pass the session explicitly::
|
|
98
|
+
|
|
99
|
+
adapter = SparkAdapter(spark_session)
|
|
100
|
+
"""
|
|
101
|
+
|
|
102
|
+
def __init__(self) -> None:
|
|
103
|
+
self._spark: Any = None
|
|
104
|
+
|
|
105
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
106
|
+
"""Execute *sql* via the notebook's pre-injected ``spark`` session.
|
|
107
|
+
|
|
108
|
+
The session is resolved once and cached for the lifetime of this
|
|
109
|
+
adapter instance.
|
|
110
|
+
"""
|
|
111
|
+
if self._spark is None:
|
|
112
|
+
self._spark = _resolve_spark()
|
|
113
|
+
result: pd.DataFrame = self._spark.sql(sql).toPandas()
|
|
114
|
+
return result
|
|
115
|
+
|
|
116
|
+
def engine_type(self) -> EngineType:
|
|
117
|
+
return EngineType.SPARK
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""In-memory adapter for tests and examples."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MockAdapter(DataSourceAdapter):
|
|
11
|
+
"""In-memory adapter for tests and examples.
|
|
12
|
+
|
|
13
|
+
Accepts a dict mapping SQL strings to pre-built DataFrames.
|
|
14
|
+
Raises ``KeyError`` when an unregistered SQL string is executed.
|
|
15
|
+
Has no external *engine* dependencies (no database connector required);
|
|
16
|
+
``pandas`` is still needed as it is the shared return type of all adapters.
|
|
17
|
+
|
|
18
|
+
Each call to :meth:`execute` increments an internal counter so tests can
|
|
19
|
+
verify how many times a query was executed.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def __init__(self, results: dict[str, pd.DataFrame]) -> None:
|
|
23
|
+
self._results = results
|
|
24
|
+
self._call_counts: dict[str, int] = {}
|
|
25
|
+
|
|
26
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
27
|
+
self._call_counts[sql] = self._call_counts.get(sql, 0) + 1
|
|
28
|
+
return self._results[sql]
|
|
29
|
+
|
|
30
|
+
def call_count(self, sql: str) -> int:
|
|
31
|
+
"""Return the number of times *sql* has been executed."""
|
|
32
|
+
return self._call_counts.get(sql, 0)
|
|
33
|
+
|
|
34
|
+
def engine_type(self) -> EngineType:
|
|
35
|
+
return EngineType.MOCK
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Adapter that wraps a live PySpark SparkSession."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SparkAdapter(DataSourceAdapter):
|
|
13
|
+
"""Adapter that wraps a live PySpark ``SparkSession``.
|
|
14
|
+
|
|
15
|
+
The ``spark_session`` parameter is typed as ``Any`` so that pyspark does
|
|
16
|
+
not need to be installed in environments that use other adapters. Pass a
|
|
17
|
+
real ``SparkSession`` at runtime; pass a duck-typed fake in tests.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, spark_session: Any) -> None:
|
|
21
|
+
self._spark = spark_session
|
|
22
|
+
|
|
23
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
24
|
+
result: pd.DataFrame = self._spark.sql(sql).toPandas()
|
|
25
|
+
return result
|
|
26
|
+
|
|
27
|
+
def engine_type(self) -> EngineType:
|
|
28
|
+
return EngineType.SPARK
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Adapter backed by a SQLAlchemy-compatible connection string."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from featkit.execution.adapters.base import DataSourceAdapter, EngineType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SQLAlchemyAdapter(DataSourceAdapter):
|
|
13
|
+
"""Adapter backed by a SQLAlchemy-compatible connection string.
|
|
14
|
+
|
|
15
|
+
Supports any engine that SQLAlchemy supports: PostgreSQL, Oracle,
|
|
16
|
+
Snowflake, MySQL, SQLite, and any JDBC-compatible backend via the
|
|
17
|
+
appropriate dialect package.
|
|
18
|
+
|
|
19
|
+
The SQLAlchemy engine is created lazily on the first :meth:`execute` call.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def __init__(self, connection_string: str) -> None:
|
|
23
|
+
self._connection_string = connection_string
|
|
24
|
+
self._engine: Any = None
|
|
25
|
+
|
|
26
|
+
def _get_engine(self) -> Any:
|
|
27
|
+
if self._engine is None:
|
|
28
|
+
try:
|
|
29
|
+
from sqlalchemy import create_engine
|
|
30
|
+
except ImportError as exc:
|
|
31
|
+
raise ImportError(
|
|
32
|
+
"sqlalchemy is required for SQLAlchemyAdapter. "
|
|
33
|
+
"Install with: pip install sqlalchemy"
|
|
34
|
+
) from exc
|
|
35
|
+
self._engine = create_engine(self._connection_string)
|
|
36
|
+
return self._engine
|
|
37
|
+
|
|
38
|
+
def engine_execute(self, sql: str) -> pd.DataFrame:
|
|
39
|
+
engine = self._get_engine()
|
|
40
|
+
result: pd.DataFrame = pd.read_sql_query(sql, engine)
|
|
41
|
+
return result
|
|
42
|
+
|
|
43
|
+
def engine_type(self) -> EngineType:
|
|
44
|
+
return EngineType.SQLALCHEMY
|