featkit 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. {featkit-0.1.0 → featkit-0.2.0}/PKG-INFO +4 -1
  2. featkit-0.2.0/docs/example_databricks_notebook.md +141 -0
  3. {featkit-0.1.0 → featkit-0.2.0}/pyproject.toml +3 -1
  4. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/config.py +9 -0
  5. featkit-0.2.0/src/featkit/execution/__init__.py +1 -0
  6. featkit-0.2.0/src/featkit/execution/adapters/__init__.py +18 -0
  7. featkit-0.2.0/src/featkit/execution/adapters/base.py +63 -0
  8. featkit-0.2.0/src/featkit/execution/adapters/databricks_adapter.py +61 -0
  9. featkit-0.2.0/src/featkit/execution/adapters/databricks_notebook_adapter.py +117 -0
  10. featkit-0.2.0/src/featkit/execution/adapters/mock_adapter.py +35 -0
  11. featkit-0.2.0/src/featkit/execution/adapters/spark_adapter.py +28 -0
  12. featkit-0.2.0/src/featkit/execution/adapters/sqlalchemy_adapter.py +44 -0
  13. featkit-0.2.0/src/featkit/execution/domain_resolver.py +98 -0
  14. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/pipeline.py +8 -0
  15. featkit-0.2.0/tests/test_execution/test_adapters.py +77 -0
  16. featkit-0.2.0/tests/test_execution/test_domain_resolver.py +233 -0
  17. featkit-0.2.0/tests/test_generators/__init__.py +0 -0
  18. {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/ci.yml +0 -0
  19. {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/docs.yml +0 -0
  20. {featkit-0.1.0 → featkit-0.2.0}/.github/workflows/publish.yml +0 -0
  21. {featkit-0.1.0 → featkit-0.2.0}/.gitignore +0 -0
  22. {featkit-0.1.0 → featkit-0.2.0}/CHANGELOG.md +0 -0
  23. {featkit-0.1.0 → featkit-0.2.0}/LICENSE +0 -0
  24. {featkit-0.1.0 → featkit-0.2.0}/README.md +0 -0
  25. {featkit-0.1.0 → featkit-0.2.0}/docs/.gitkeep +0 -0
  26. {featkit-0.1.0 → featkit-0.2.0}/docs/examples.md +0 -0
  27. {featkit-0.1.0 → featkit-0.2.0}/docs/general_plan.md +0 -0
  28. {featkit-0.1.0 → featkit-0.2.0}/docs/index.md +0 -0
  29. {featkit-0.1.0 → featkit-0.2.0}/docs/quickstart.md +0 -0
  30. {featkit-0.1.0 → featkit-0.2.0}/mkdocs.yml +0 -0
  31. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/__init__.py +0 -0
  32. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/.gitkeep +0 -0
  33. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/__init__.py +0 -0
  34. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/distributional_space.py +0 -0
  35. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/pivot_space.py +0 -0
  36. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/builders/temporal_space.py +0 -0
  37. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/__init__.py +0 -0
  38. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/.gitkeep +0 -0
  39. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/__init__.py +0 -0
  40. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/base.py +0 -0
  41. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/measurement/defaults.py +0 -0
  42. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/.gitkeep +0 -0
  43. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/__init__.py +0 -0
  44. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/base.py +0 -0
  45. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/contracts/output/defaults.py +0 -0
  46. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/.gitkeep +0 -0
  47. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/__init__.py +0 -0
  48. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/dataset/base.py +0 -0
  49. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/enums.py +0 -0
  50. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/.gitkeep +0 -0
  51. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/__init__.py +0 -0
  52. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/base.py +0 -0
  53. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/categorical_field.py +0 -0
  54. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/id_field.py +0 -0
  55. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/measurement_field.py +0 -0
  56. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/fields/time_field.py +0 -0
  57. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/__init__.py +0 -0
  58. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/base.py +0 -0
  59. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/output.py +0 -0
  60. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/.gitkeep +0 -0
  61. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/__init__.py +0 -0
  62. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/pyspark/databricks.py +0 -0
  63. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/.gitkeep +0 -0
  64. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/__init__.py +0 -0
  65. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/base.py +0 -0
  66. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/databricks.py +0 -0
  67. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/snowflake.py +0 -0
  68. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/generators/sql/spark_sql.py +0 -0
  69. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/.gitkeep +0 -0
  70. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/__init__.py +0 -0
  71. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/base.py +0 -0
  72. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/distributional.py +0 -0
  73. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer2/pivoted.py +0 -0
  74. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/.gitkeep +0 -0
  75. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/__init__.py +0 -0
  76. {featkit-0.1.0 → featkit-0.2.0}/src/featkit/layer3/temporal_feature.py +0 -0
  77. {featkit-0.1.0 → featkit-0.2.0}/tests/__init__.py +0 -0
  78. {featkit-0.1.0 → featkit-0.2.0}/tests/test_builders.py +0 -0
  79. {featkit-0.1.0 → featkit-0.2.0}/tests/test_contracts.py +0 -0
  80. {featkit-0.1.0 → featkit-0.2.0}/tests/test_enums.py +0 -0
  81. {featkit-0.1.0/tests/test_generators → featkit-0.2.0/tests/test_execution}/__init__.py +0 -0
  82. {featkit-0.1.0 → featkit-0.2.0}/tests/test_fields.py +0 -0
  83. {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/.gitkeep +0 -0
  84. {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_base.py +0 -0
  85. {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_pyspark.py +0 -0
  86. {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_sql_databricks.py +0 -0
  87. {featkit-0.1.0 → featkit-0.2.0}/tests/test_generators/test_sql_snowflake.py +0 -0
  88. {featkit-0.1.0 → featkit-0.2.0}/tests/test_integration.py +0 -0
  89. {featkit-0.1.0 → featkit-0.2.0}/tests/test_layer2.py +0 -0
  90. {featkit-0.1.0 → featkit-0.2.0}/tests/test_layer3.py +0 -0
  91. {featkit-0.1.0 → featkit-0.2.0}/tests/test_output_contracts.py +0 -0
  92. {featkit-0.1.0 → featkit-0.2.0}/tests/test_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: featkit
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: featkit — automated feature store generation from relational facts tables
5
5
  Project-URL: Repository, https://github.com/Mirkiux/featkit
6
6
  Project-URL: Documentation, https://mirkiux.github.io/featkit
@@ -49,6 +49,7 @@ Provides-Extra: dev
49
49
  Requires-Dist: build>=1.0; extra == 'dev'
50
50
  Requires-Dist: hatch>=1.9; extra == 'dev'
51
51
  Requires-Dist: mypy>=1.0; extra == 'dev'
52
+ Requires-Dist: pandas>=1.5; extra == 'dev'
52
53
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
53
54
  Requires-Dist: pytest>=7.0; extra == 'dev'
54
55
  Requires-Dist: ruff>=0.4; extra == 'dev'
@@ -57,6 +58,8 @@ Provides-Extra: docs
57
58
  Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
58
59
  Requires-Dist: mkdocs>=1.6; extra == 'docs'
59
60
  Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
61
+ Provides-Extra: execution
62
+ Requires-Dist: pandas>=1.5; extra == 'execution'
60
63
  Provides-Extra: ibis
61
64
  Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
62
65
  Provides-Extra: spark
@@ -0,0 +1,141 @@
1
+ # Example — Dynamic domain resolution in a Databricks notebook
2
+
3
+ This example shows how to let featkit resolve the `allowed_values` domain of a
4
+ `CategoricalField` at runtime by querying the facts table directly from a
5
+ Databricks notebook.
6
+
7
+ `DatabricksNotebookAdapter` discovers the pre-injected `spark` session
8
+ automatically — no constructor arguments are needed.
9
+
10
+ ## Notebook cells
11
+
12
+ ### Cell 1 — imports
13
+
14
+ ```python
15
+ from featkit.config import FeatureStoreConfig
16
+ from featkit.dataset.base import SimpleDataset
17
+ from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
18
+ from featkit.execution.adapters import DatabricksNotebookAdapter
19
+ from featkit.fields.categorical_field import CategoricalField
20
+ from featkit.fields.id_field import IDField
21
+ from featkit.fields.measurement_field import MeasurementField
22
+ from featkit.fields.time_field import TimeField
23
+ from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
24
+ from featkit.pipeline import FeatureStorePipeline
25
+ ```
26
+
27
+ ### Cell 2 — define the dataset (no `allowed_values` on the categorical)
28
+
29
+ ```python
30
+ ds = SimpleDataset(
31
+ "mydb.myschema.silver_transactions",
32
+ [
33
+ IDField("client_id"),
34
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
35
+ MeasurementField("amount", MeasurementType.MONTO),
36
+ MeasurementField("txn_count", MeasurementType.CANTIDAD),
37
+ # No allowed_values — the adapter will resolve the domain at build() time
38
+ CategoricalField("segment", CategoricalTreatment.PIVOT),
39
+ CategoricalField("product_type", CategoricalTreatment.PIVOT),
40
+ ],
41
+ )
42
+ ```
43
+
44
+ ### Cell 3 — configure with the notebook adapter
45
+
46
+ ```python
47
+ adapter = DatabricksNotebookAdapter()
48
+
49
+ cfg = FeatureStoreConfig(
50
+ dataset=ds,
51
+ output_schema="analytics",
52
+ output_table_prefix="feat_",
53
+ time_windows=[3, 6, 12],
54
+ include_marginals=True,
55
+ adapter=adapter, # triggers SELECT DISTINCT resolution at build()
56
+ )
57
+ ```
58
+
59
+ ### Cell 4 — build and generate
60
+
61
+ ```python
62
+ # build() fires one SELECT DISTINCT per unresolved CategoricalField
63
+ pipeline = FeatureStorePipeline(config=cfg).build()
64
+
65
+ print(f"Layer 2A columns : {len(pipeline.layer2a)}")
66
+ print(f"Layer 3 features: {len(pipeline.layer3)}")
67
+
68
+ result = DatabricksSQLCodeGenerator().generate(pipeline)
69
+ print(result.code.sql[:500])
70
+ ```
71
+
72
+ ### Cell 5 — save the artefacts to DBFS
73
+
74
+ ```python
75
+ result.save("/dbfs/mnt/output/features/")
76
+ # Writes:
77
+ # /dbfs/mnt/output/features/script.sql
78
+ # /dbfs/mnt/output/features/dag.json
79
+ # /dbfs/mnt/output/features/diagram.md
80
+ ```
81
+
82
+ ## How it works
83
+
84
+ When `FeatureStorePipeline.build()` is called with an `adapter` set on the
85
+ config, it constructs an `AdapterDomainResolver` and passes it to
86
+ `PivotSpaceBuilder` as the `domain_resolver` callable. For each
87
+ `CategoricalField` that has no `allowed_values`, the builder calls the resolver,
88
+ which executes:
89
+
90
+ ```sql
91
+ SELECT DISTINCT segment
92
+ FROM mydb.myschema.silver_transactions
93
+ WHERE segment IS NOT NULL
94
+ ORDER BY 1
95
+ ```
96
+
97
+ The returned values become the column domain exactly as if they had been listed
98
+ in `allowed_values` at configuration time.
99
+
100
+ ## Mixing static and dynamic domains
101
+
102
+ Static and dynamic fields can coexist in the same dataset. Fields that have
103
+ `allowed_values` set are used as-is; only fields without it trigger a query:
104
+
105
+ ```python
106
+ ds = SimpleDataset(
107
+ "mydb.myschema.silver_transactions",
108
+ [
109
+ IDField("client_id"),
110
+ TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
111
+ MeasurementField("amount", MeasurementType.MONTO),
112
+ # Static domain — no query fired
113
+ CategoricalField(
114
+ "channel",
115
+ CategoricalTreatment.PIVOT,
116
+ allowed_values=["branch", "online", "mobile"],
117
+ ),
118
+ # Dynamic domain — one SELECT DISTINCT executed at build()
119
+ CategoricalField("segment", CategoricalTreatment.PIVOT),
120
+ ],
121
+ )
122
+ ```
123
+
124
+ ## Using a different adapter
125
+
126
+ Swap `DatabricksNotebookAdapter` for any other adapter without changing the
127
+ rest of the code:
128
+
129
+ ```python
130
+ from featkit.execution.adapters import DatabricksAdapter
131
+
132
+ adapter = DatabricksAdapter(
133
+ host="<workspace>.azuredatabricks.net",
134
+ token="<pat>",
135
+ http_path="/sql/1.0/warehouses/<warehouse-id>",
136
+ catalog="mydb",
137
+ schema="myschema",
138
+ )
139
+
140
+ cfg = FeatureStoreConfig(..., adapter=adapter)
141
+ ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "featkit"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "featkit — automated feature store generation from relational facts tables"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -35,6 +35,7 @@ dependencies = [
35
35
  ibis = ["ibis-framework>=9.0"]
36
36
  spark = ["pyspark>=3.4"]
37
37
  databricks = ["databricks-sql-connector>=3.0"]
38
+ execution = ["pandas>=1.5"]
38
39
  docs = [
39
40
  "mkdocs>=1.6",
40
41
  "mkdocs-material>=9.5",
@@ -48,6 +49,7 @@ dev = [
48
49
  "hatch>=1.9",
49
50
  "build>=1.0",
50
51
  "twine>=5.0",
52
+ "pandas>=1.5",
51
53
  ]
52
54
 
53
55
  [project.urls]
@@ -3,10 +3,14 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING
6
7
 
7
8
  from featkit.dataset.base import AbstractDataset
8
9
  from featkit.enums import Layer2Aggregator, Layer2OutputType, MeasurementType, TemporalOperator
9
10
 
11
+ if TYPE_CHECKING:
12
+ from featkit.execution.adapters.base import DataSourceAdapter
13
+
10
14
 
11
15
  @dataclass
12
16
  class FeatureStoreConfig:
@@ -26,6 +30,10 @@ class FeatureStoreConfig:
26
30
  aggregators. Only contract-valid aggregators are used.
27
31
  operators_override: Per-output-type override for temporal operators.
28
32
  Only contract-valid operators are used.
33
+ adapter: Optional execution adapter. When provided, categorical fields
34
+ with no ``allowed_values`` have their domain resolved at
35
+ ``FeatureStorePipeline.build()`` time via a ``SELECT DISTINCT``
36
+ query against the facts table.
29
37
  """
30
38
 
31
39
  dataset: AbstractDataset
@@ -36,3 +44,4 @@ class FeatureStoreConfig:
36
44
  include_marginals: bool = True
37
45
  aggregators_override: dict[MeasurementType, list[Layer2Aggregator]] | None = None
38
46
  operators_override: dict[Layer2OutputType, list[TemporalOperator]] | None = field(default=None)
47
+ adapter: DataSourceAdapter | None = None
@@ -0,0 +1 @@
1
+ """featkit.execution — adapters and domain resolution for live query execution."""
@@ -0,0 +1,18 @@
1
+ """featkit.execution.adapters — data source adapters for domain resolution."""
2
+
3
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
4
+ from featkit.execution.adapters.databricks_adapter import DatabricksAdapter
5
+ from featkit.execution.adapters.databricks_notebook_adapter import DatabricksNotebookAdapter
6
+ from featkit.execution.adapters.mock_adapter import MockAdapter
7
+ from featkit.execution.adapters.spark_adapter import SparkAdapter
8
+ from featkit.execution.adapters.sqlalchemy_adapter import SQLAlchemyAdapter
9
+
10
+ __all__ = [
11
+ "DataSourceAdapter",
12
+ "DatabricksAdapter",
13
+ "DatabricksNotebookAdapter",
14
+ "EngineType",
15
+ "MockAdapter",
16
+ "SparkAdapter",
17
+ "SQLAlchemyAdapter",
18
+ ]
@@ -0,0 +1,63 @@
1
+ """Base adapter ABC and EngineType enum for featkit execution layer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from enum import Enum
8
+
9
+ import pandas as pd
10
+
11
+ _log = logging.getLogger(__name__)
12
+
13
+
14
class EngineType(Enum):
    """Tag naming the execution backend behind a :class:`DataSourceAdapter`.

    Every member's value is the lowercase backend name, so a member can be
    recovered from its string form, e.g. ``EngineType("spark")``.
    """

    # Reported by the adapter that runs queries through a SQLAlchemy engine.
    SQLALCHEMY = "sqlalchemy"
    # Reported by the adapter that talks to a Databricks SQL warehouse.
    DATABRICKS = "databricks"
    # Reported by adapters that run queries through a Spark session.
    SPARK = "spark"
    # Reported by the in-memory adapter used in tests and examples.
    MOCK = "mock"
21
+
22
+
23
class DataSourceAdapter(ABC):
    """Common interface implemented by every data source adapter.

    Concrete adapters provide two things: :meth:`engine_execute`, which runs
    a query on the underlying engine, and :meth:`engine_type`, which reports
    which engine that is.

    Callers go through :meth:`execute`. It forwards to ``engine_execute`` and,
    when that call raises, logs the failing SQL together with the traceback
    at ``ERROR`` level before letting the same exception propagate. The base
    class therefore owns the surrounding behaviour while subclasses supply
    only the engine-specific step (the *Template Method* pattern).
    """

    def execute(self, sql: str) -> pd.DataFrame:
        """Run *sql* through :meth:`engine_execute` and return its DataFrame.

        Any exception raised by ``engine_execute`` is logged along with the
        adapter's class name and the offending query, then re-raised
        unchanged.
        """
        try:
            frame = self.engine_execute(sql)
        except Exception:
            adapter_name = self.__class__.__name__
            # ``exception`` records the active traceback next to the message.
            _log.exception(
                "SQL execution failed on %s.\nFailed query:\n%s",
                adapter_name,
                sql,
            )
            raise
        return frame

    @abstractmethod
    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Run *sql* on the concrete engine and return the rows as a DataFrame.

        Subclasses must override this. Client code should call
        :meth:`execute` rather than this method, so that the failure logging
        done there is applied.
        """

    @abstractmethod
    def engine_type(self) -> EngineType:
        """Report which :class:`EngineType` this adapter is backed by."""
@@ -0,0 +1,61 @@
1
+ """Adapter for Databricks SQL warehouses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
8
+
9
+
10
class DatabricksAdapter(DataSourceAdapter):
    """Run queries against a Databricks SQL warehouse.

    Depends on the optional ``databricks-sql-connector`` package::

        pip install databricks-sql-connector

    The connector import happens inside :meth:`engine_execute`, not at module
    import time. The class can therefore be imported and instantiated without
    the package; the ``ImportError`` only surfaces once a query is attempted.
    """

    def __init__(
        self,
        host: str,
        token: str,
        http_path: str,
        catalog: str,
        schema: str,
    ) -> None:
        # Only remember the connection settings here; a connection is opened
        # separately for each query in ``engine_execute``.
        self._host, self._token, self._http_path = host, token, http_path
        self._catalog, self._schema = catalog, schema

    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Open a connection, run *sql*, and return the rows as a DataFrame.

        The connection and its cursor are both used as context managers, so
        they are released when the query finishes or fails.

        Raises
        ------
        ImportError
            When ``databricks-sql-connector`` is not installed.
        """
        try:
            from databricks import sql as dbsql
        except ImportError as exc:
            raise ImportError(
                "databricks-sql-connector is required for DatabricksAdapter. "
                "Install with: pip install databricks-sql-connector"
            ) from exc

        connect_kwargs = {
            "server_hostname": self._host,
            "http_path": self._http_path,
            "access_token": self._token,
            "catalog": self._catalog,
            "schema": self._schema,
        }
        with dbsql.connect(**connect_kwargs) as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                # NOTE(review): fetchall_arrow presumably needs Arrow support
                # in the installed connector -- confirm against its docs.
                frame: pd.DataFrame = cursor.fetchall_arrow().to_pandas()
                return frame

    def engine_type(self) -> EngineType:
        """Identify this adapter as Databricks-backed."""
        return EngineType.DATABRICKS
@@ -0,0 +1,117 @@
1
+ """Adapter for Databricks notebook environments.
2
+
3
+ In Databricks notebooks the Spark session is pre-instantiated and injected
4
+ into the notebook's global namespace as ``spark``. This adapter discovers
5
+ that session automatically — no constructor arguments required.
6
+
7
+ Usage inside a Databricks notebook::
8
+
9
+ from featkit.execution.adapters import DatabricksNotebookAdapter
10
+
11
+ adapter = DatabricksNotebookAdapter()
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import sys
17
+ from typing import Any
18
+
19
+ import pandas as pd
20
+
21
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
22
+
23
+
24
def _resolve_spark() -> Any:
    """Return the ``spark`` object found on the ``__main__`` module.

    The lookup reads an attribute named ``spark`` from
    ``sys.modules["__main__"]``, which is where the Databricks notebook
    runtime is expected to have placed the session. The caller needs neither
    a reference to the session nor a PySpark import.

    Raises
    ------
    RuntimeError
        When ``__main__`` is missing from ``sys.modules``, has no ``spark``
        attribute, or that attribute is ``None``. This usually means the
        adapter is being used outside a Databricks notebook.
    """
    main_module = sys.modules.get("__main__")
    session = None if main_module is None else getattr(main_module, "spark", None)

    if session is not None:
        return session

    raise RuntimeError(
        "Could not locate 'spark' in the notebook runtime namespace. "
        "DatabricksNotebookAdapter is intended for use inside Databricks "
        "notebooks where 'spark' is pre-injected by the runtime. "
        "Outside that environment, use SparkAdapter(spark_session) instead."
    )
48
+
49
+
50
class DatabricksNotebookAdapter(DataSourceAdapter):
    """Adapter that uses the ``spark`` session of a Databricks notebook.

    The constructor takes no arguments. The session is looked up with
    ``_resolve_spark`` the first time a query runs and is then reused by this
    adapter instance.

    Examples
    --------
    Inside a Databricks notebook::

        from featkit.execution.adapters import DatabricksNotebookAdapter
        from featkit.config import FeatureStoreConfig
        from featkit.dataset.base import SimpleDataset
        from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
        from featkit.fields.categorical_field import CategoricalField
        from featkit.fields.id_field import IDField
        from featkit.fields.measurement_field import MeasurementField
        from featkit.fields.time_field import TimeField
        from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
        from featkit.pipeline import FeatureStorePipeline

        adapter = DatabricksNotebookAdapter()

        ds = SimpleDataset(
            "mydb.silver_transactions",
            [
                IDField("client_id"),
                TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
                MeasurementField("amount", MeasurementType.MONTO),
                CategoricalField("segment", CategoricalTreatment.PIVOT),  # no allowed_values
            ],
        )

        cfg = FeatureStoreConfig(
            dataset=ds,
            output_schema="analytics",
            output_table_prefix="feat_",
            time_windows=[3, 6, 12],
            adapter=adapter,
        )

        pipeline = FeatureStorePipeline(config=cfg).build()
        result = DatabricksSQLCodeGenerator().generate(pipeline)
        result.save("/dbfs/mnt/output/features/")

    Outside a notebook (for example in a standalone script or a test), use
    :class:`SparkAdapter` and hand it the session explicitly::

        adapter = SparkAdapter(spark_session)
    """

    def __init__(self) -> None:
        # Filled in on the first query; ``None`` means "not resolved yet".
        self._spark: Any = None

    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Run *sql* on the notebook's ``spark`` session and return pandas data.

        The session is resolved on the first call and cached on the instance,
        so later calls skip the lookup. If resolution fails, nothing is cached
        and the error from ``_resolve_spark`` propagates.
        """
        session = self._spark
        if session is None:
            session = self._spark = _resolve_spark()
        frame: pd.DataFrame = session.sql(sql).toPandas()
        return frame

    def engine_type(self) -> EngineType:
        """Identify this adapter as Spark-backed."""
        return EngineType.SPARK
@@ -0,0 +1,35 @@
1
+ """In-memory adapter for tests and examples."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pandas as pd
6
+
7
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
8
+
9
+
10
class MockAdapter(DataSourceAdapter):
    """Adapter that answers queries from a dictionary held in memory.

    The constructor receives a mapping from exact SQL strings to DataFrames.
    Running a registered string returns the DataFrame stored for it; running
    any other string raises ``KeyError``. No database connector is involved,
    although ``pandas`` is still required because every adapter returns a
    DataFrame.

    A separate counter is kept for each SQL string, so tests can check how
    often a given query was run via :meth:`call_count`.
    """

    def __init__(self, results: dict[str, pd.DataFrame]) -> None:
        self._results = results
        # Executions seen so far, keyed by the exact SQL text.
        self._call_counts: dict[str, int] = {}

    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Record one execution of *sql* and return its registered DataFrame.

        The counter is bumped before the lookup, so an unregistered query is
        still counted even though it raises ``KeyError``.
        """
        counts = self._call_counts
        counts[sql] = counts.get(sql, 0) + 1
        return self._results[sql]

    def call_count(self, sql: str) -> int:
        """Return how many times *sql* has been executed (``0`` if never)."""
        try:
            return self._call_counts[sql]
        except KeyError:
            return 0

    def engine_type(self) -> EngineType:
        """Identify this adapter as the mock engine."""
        return EngineType.MOCK
@@ -0,0 +1,28 @@
1
+ """Adapter that wraps a live PySpark SparkSession."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
10
+
11
+
12
class SparkAdapter(DataSourceAdapter):
    """Adapter around a PySpark ``SparkSession`` supplied by the caller.

    ``spark_session`` is annotated as ``Any`` so this module never has to
    import pyspark. The adapter only needs an object whose ``sql(...)`` result
    offers ``toPandas()``: a real ``SparkSession`` in production, or a
    duck-typed stand-in in tests.
    """

    def __init__(self, spark_session: Any) -> None:
        self._spark = spark_session

    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Run *sql* on the wrapped session and convert the result to pandas."""
        spark_frame = self._spark.sql(sql)
        frame: pd.DataFrame = spark_frame.toPandas()
        return frame

    def engine_type(self) -> EngineType:
        """Identify this adapter as Spark-backed."""
        return EngineType.SPARK
@@ -0,0 +1,44 @@
1
+ """Adapter backed by a SQLAlchemy-compatible connection string."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+
9
+ from featkit.execution.adapters.base import DataSourceAdapter, EngineType
10
+
11
+
12
class SQLAlchemyAdapter(DataSourceAdapter):
    """Adapter that runs queries through a SQLAlchemy engine.

    The connection string is handed unchanged to ``sqlalchemy.create_engine``,
    so any backend with an installed SQLAlchemy dialect can be targeted
    (PostgreSQL, Oracle, Snowflake, MySQL, SQLite, ...).

    Neither the ``sqlalchemy`` import nor the engine creation happens in the
    constructor. Both are deferred to the first query, and the engine is then
    reused for later queries.
    """

    def __init__(self, connection_string: str) -> None:
        self._connection_string = connection_string
        # Created on first use by ``_get_engine``.
        self._engine: Any = None

    def _get_engine(self) -> Any:
        """Return the cached engine, creating it on the first call.

        Raises
        ------
        ImportError
            When ``sqlalchemy`` is not installed.
        """
        if self._engine is not None:
            return self._engine

        try:
            from sqlalchemy import create_engine
        except ImportError as exc:
            raise ImportError(
                "sqlalchemy is required for SQLAlchemyAdapter. "
                "Install with: pip install sqlalchemy"
            ) from exc

        self._engine = create_engine(self._connection_string)
        return self._engine

    def engine_execute(self, sql: str) -> pd.DataFrame:
        """Run *sql* with ``pandas.read_sql_query`` on the adapter's engine."""
        frame: pd.DataFrame = pd.read_sql_query(sql, self._get_engine())
        return frame

    def engine_type(self) -> EngineType:
        """Identify this adapter as SQLAlchemy-backed."""
        return EngineType.SQLALCHEMY