odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/plugins.py
ADDED
@@ -0,0 +1,80 @@
"""Plugin system for Odibi."""

import logging
import sys
from typing import Any, Dict, Optional

if sys.version_info < (3, 10):
    from importlib_metadata import entry_points
else:
    from importlib.metadata import entry_points

# Type for connection factory function
# (name: str, config: Dict[str, Any]) -> BaseConnection
# We use Any for return type to avoid circular import with BaseConnection
ConnectionFactory = Any

logger = logging.getLogger(__name__)

_CONNECTION_FACTORIES: Dict[str, ConnectionFactory] = {}


def register_connection_factory(type_name: str, factory: ConnectionFactory):
    """Register a connection factory.

    Args:
        type_name: The 'type' string used in config (e.g., 'postgres')
        factory: Function that takes (name, config) and returns a Connection instance
    """
    _CONNECTION_FACTORIES[type_name] = factory
    logger.debug(f"Registered connection factory: {type_name}")


def get_connection_factory(type_name: str) -> Optional[ConnectionFactory]:
    """Get a registered connection factory.

    Args:
        type_name: The connection type

    Returns:
        Factory function or None
    """
    return _CONNECTION_FACTORIES.get(type_name)


def load_plugins():
    """Load plugins from entry points.

    Scans 'odibi.connections' entry points.
    The entry point value should be a callable (factory).
    The entry point name is used as the connection type.
    """
    try:
        # Handle different entry_points API versions
        # Python 3.9: entry_points() returns SelectableGroups, use .select() or get via group attr
        # Python 3.10+: entry_points(group=...) works directly
        if sys.version_info >= (3, 10):
            eps = entry_points(group="odibi.connections")
        elif sys.version_info >= (3, 9):
            # Python 3.9: use select() method if available, else try group attribute
            all_eps = entry_points()
            if hasattr(all_eps, "select"):
                eps = all_eps.select(group="odibi.connections")
            elif hasattr(all_eps, "get"):
                eps = all_eps.get("odibi.connections", [])
            else:
                eps = getattr(all_eps, "odibi.connections", [])
        else:
            # Python 3.8 and earlier
            eps = entry_points().get("odibi.connections", [])

        for ep in eps:
            try:
                factory = ep.load()
                register_connection_factory(ep.name, factory)
                logger.info(f"Loaded plugin: {ep.name}")
            except Exception as e:
                logger.error(f"Failed to load plugin {ep.name}: {e}", exc_info=True)

    except Exception as e:
        logger.error(f"Plugin discovery failed: {e}", exc_info=True)
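
For orientation, a minimal sketch of the two registration paths this module exposes; the `make_postgres` factory below is hypothetical and not part of the package:

    from odibi.plugins import get_connection_factory, load_plugins, register_connection_factory

    def make_postgres(name, config):
        """Hypothetical factory: takes (name, config), returns a connection object."""
        ...

    register_connection_factory("postgres", make_postgres)  # manual, in-process registration
    assert get_connection_factory("postgres") is make_postgres

    # Third-party packages instead publish the factory under the
    # 'odibi.connections' entry-point group; load_plugins() discovers it.
    load_plugins()
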
odibi/project.py
ADDED
@@ -0,0 +1,581 @@
"""
Project Module
==============

Unified Project API that integrates pipelines and semantic layer.

The Project class provides a seamless interface for:
- Loading project configuration (connections, pipelines, semantic layer)
- Executing semantic queries with auto-resolved table paths
- No manual table registration required

Example:
    project = Project.load("odibi.yaml")
    result = project.query("revenue BY region")
"""

import os
from pathlib import Path
from typing import Any, Dict, List, Optional

from odibi.config import ConnectionConfig, ProjectConfig, load_config_from_file
from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.semantics.metrics import SemanticLayerConfig, parse_semantic_config
from odibi.semantics.query import QueryResult, SemanticQuery
from odibi.utils.logging_context import get_logging_context


class SourceResolver:
    """
    Resolves semantic layer source references to actual paths.

    Supported source formats:

    1. **$pipeline.node** (recommended): References a pipeline node's write target.
       Example: `$build_warehouse.fact_orders` reads from wherever that node writes.

    2. **connection.path**: Explicit connection + path. Supports nested paths!
       The split happens on the FIRST dot only, so subdirectories work:
       - `gold.fact_orders` → `/mnt/data/gold/fact_orders`
       - `gold.oee/plant_a/metrics` → `/mnt/data/gold/oee/plant_a/metrics`
       - `gold.domain/v2/fact_sales` → `/mnt/data/gold/domain/v2/fact_sales`

    3. **table_name**: Uses the default connection (gold > silver > bronze > first).
       Example: `fact_orders` with a single connection named "warehouse".

    For Unity Catalog connections (catalog + schema_name):
    `gold.fact_orders` → `catalog.schema.fact_orders`
    """

    def __init__(
        self,
        connections: Dict[str, ConnectionConfig],
        base_path: str = "",
        pipelines: Optional[List[Any]] = None,
    ):
        """
        Initialize the source resolver.

        Args:
            connections: Dictionary of connection configurations
            base_path: Base path for relative paths (directory of odibi.yaml)
            pipelines: List of pipeline configs for $pipeline.node resolution
        """
        self.connections = connections
        self.base_path = base_path
        self.pipelines = pipelines or []
        self._node_index = self._build_node_index()

    def _build_node_index(self) -> Dict[str, Dict[str, Any]]:
        """Build an index of pipeline.node -> node config for fast lookup."""
        index = {}
        for pipeline in self.pipelines:
            pipeline_name = (
                pipeline.pipeline if hasattr(pipeline, "pipeline") else pipeline.get("pipeline", "")
            )
            nodes = pipeline.nodes if hasattr(pipeline, "nodes") else pipeline.get("nodes", [])
            for node in nodes:
                node_name = node.name if hasattr(node, "name") else node.get("name", "")
                key = f"{pipeline_name}.{node_name}"
                index[key] = node
        return index

    def resolve(self, source: str) -> tuple[str, str]:
        """
        Resolve a source reference to connection name and full path.

        Args:
            source: Source reference. Supported formats:
                - "$pipeline.node" (e.g., "$build_warehouse.fact_orders")
                - "connection.table" (e.g., "gold.fact_orders")
                - "table_name" (e.g., "fact_orders") - uses default connection

        Returns:
            Tuple of (connection_name, full_path)

        Raises:
            ValueError: If connection or node not found
        """
        # Handle $pipeline.node reference
        if source.startswith("$"):
            return self._resolve_node_reference(source)

        # Handle connection.table or bare table name
        if "." in source:
            connection_name, table_name = source.split(".", 1)
        else:
            connection_name = self._find_default_connection()
            table_name = source

        if connection_name not in self.connections:
            available = list(self.connections.keys())
            raise ValueError(
                f"Connection '{connection_name}' not found in source '{source}'. "
                f"Available connections: {available}"
            )

        connection = self.connections[connection_name]
        full_path = self._build_path(connection, table_name)

        return connection_name, full_path

    def _resolve_node_reference(self, source: str) -> tuple[str, str]:
        """
        Resolve a $pipeline.node reference to connection and path.

        Args:
            source: Node reference (e.g., "$build_warehouse.fact_orders")

        Returns:
            Tuple of (connection_name, full_path)

        Raises:
            ValueError: If node not found or node has no write config
        """
        # Remove $ prefix and parse
        ref = source[1:]  # Remove $

        if ref not in self._node_index:
            available = list(self._node_index.keys())
            raise ValueError(f"Node reference '{source}' not found. Available nodes: {available}")

        node = self._node_index[ref]

        # Get write config from node
        write_config = node.write if hasattr(node, "write") else node.get("write")
        if not write_config:
            raise ValueError(f"Node '{source}' has no 'write' config. Cannot resolve source path.")

        # Extract connection and path/table from write config
        if hasattr(write_config, "connection"):
            connection_name = write_config.connection
            table_name = write_config.table or write_config.path
        else:
            connection_name = write_config.get("connection")
            table_name = write_config.get("table") or write_config.get("path")

        if not connection_name:
            raise ValueError(
                f"Node '{source}' write config has no 'connection'. Cannot resolve source path."
            )

        if not table_name:
            raise ValueError(
                f"Node '{source}' write config has no 'table' or 'path'. "
                "Cannot resolve source path."
            )

        if connection_name not in self.connections:
            available = list(self.connections.keys())
            raise ValueError(
                f"Connection '{connection_name}' from node '{source}' not found. "
                f"Available connections: {available}"
            )

        connection = self.connections[connection_name]
        full_path = self._build_path(connection, table_name)

        return connection_name, full_path

    def _find_default_connection(self) -> str:
        """Find the default connection to use when not specified."""
        if len(self.connections) == 1:
            return list(self.connections.keys())[0]

        priority = ["gold", "silver", "bronze", "warehouse", "default"]
        for name in priority:
            if name in self.connections:
                return name

        return list(self.connections.keys())[0]

    def _build_path(self, connection: ConnectionConfig, table_name: str) -> str:
        """Build the full path for a table given a connection."""
        conn_dict = connection.model_dump() if hasattr(connection, "model_dump") else connection

        if "base_path" in conn_dict:
            base = conn_dict["base_path"]
        elif "path" in conn_dict:
            base = conn_dict["path"]
        elif "catalog" in conn_dict and "schema" in conn_dict:
            return f"{conn_dict['catalog']}.{conn_dict['schema']}.{table_name}"
        else:
            base = ""

        if self.base_path and not os.path.isabs(base):
            base = os.path.join(self.base_path, base)

        return os.path.join(base, table_name) if base else table_name

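
# Editor's illustration (not part of odibi 2.5.0): SourceResolver also accepts
# plain dicts, since the hasattr()/.get() fallbacks above handle both Pydantic
# models and raw config dicts. Expected results assume POSIX path joining.
_example_resolver = SourceResolver(
    connections={"gold": {"type": "local", "base_path": "/mnt/data/gold"}},
    pipelines=[{
        "pipeline": "build_warehouse",
        "nodes": [{"name": "fact_orders",
                   "write": {"connection": "gold", "path": "fact_orders"}}],
    }],
)
assert _example_resolver.resolve("gold.fact_orders") == ("gold", "/mnt/data/gold/fact_orders")
assert _example_resolver.resolve("gold.oee/plant_a/metrics") == ("gold", "/mnt/data/gold/oee/plant_a/metrics")
assert _example_resolver.resolve("$build_warehouse.fact_orders") == ("gold", "/mnt/data/gold/fact_orders")
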
class Project:
    """
    Unified Project API for Odibi.

    Integrates project configuration, connections, and semantic layer
    into a single interface for seamless querying.

    Example:
        # Load project and query
        project = Project.load("odibi.yaml")
        result = project.query("revenue BY region")

        # Access the DataFrame
        print(result.df)

        # Multiple metrics and dimensions
        result = project.query("revenue, order_count BY region, month")

        # With filters
        result = project.query("revenue BY category WHERE region = 'North'")
    """

    def __init__(
        self,
        config: ProjectConfig,
        semantic_config: Optional[SemanticLayerConfig] = None,
        base_path: str = "",
        lazy: bool = True,
    ):
        """
        Initialize the Project.

        Args:
            config: ProjectConfig instance
            semantic_config: Optional SemanticLayerConfig (loaded from config.semantic if not provided)
            base_path: Base path for resolving relative paths
            lazy: If True, load tables on-demand; if False, load all upfront
        """
        self.config = config
        self.base_path = base_path
        self.lazy = lazy
        self._context: Optional[EngineContext] = None
        self._loaded_tables: Dict[str, Any] = {}

        self._resolver = SourceResolver(config.connections, base_path, config.pipelines)

        if semantic_config:
            self._semantic_config = semantic_config
        else:
            self._semantic_config = self._load_semantic_config()

        self._query_engine: Optional[SemanticQuery] = None
        if self._semantic_config:
            self._query_engine = SemanticQuery(self._semantic_config)

    @classmethod
    def load(
        cls,
        config_path: str,
        semantic_path: Optional[str] = None,
        lazy: bool = True,
    ) -> "Project":
        """
        Load a Project from configuration file(s).

        Args:
            config_path: Path to odibi.yaml
            semantic_path: Optional path to semantic config (overrides config.semantic)
            lazy: If True, load tables on-demand

        Returns:
            Project instance
        """
        ctx = get_logging_context()
        ctx.info("Loading project", config=config_path)

        config = load_config_from_file(config_path)
        base_path = str(Path(config_path).parent.absolute())

        semantic_config = None
        if semantic_path:
            semantic_config = cls._load_semantic_from_file(semantic_path, base_path)
        elif config.semantic:
            if "config" in config.semantic:
                semantic_file = config.semantic["config"]
                if not os.path.isabs(semantic_file):
                    semantic_file = os.path.join(base_path, semantic_file)
                semantic_config = cls._load_semantic_from_file(semantic_file, base_path)
            else:
                semantic_config = parse_semantic_config(config.semantic)

        return cls(
            config=config,
            semantic_config=semantic_config,
            base_path=base_path,
            lazy=lazy,
        )

    @staticmethod
    def _load_semantic_from_file(path: str, base_path: str) -> SemanticLayerConfig:
        """Load semantic config from a YAML file."""
        import yaml

        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return parse_semantic_config(data)

    def _load_semantic_config(self) -> Optional[SemanticLayerConfig]:
        """Load semantic config from project config."""
        if not self.config.semantic:
            return None

        if "config" in self.config.semantic:
            semantic_file = self.config.semantic["config"]
            if not os.path.isabs(semantic_file):
                semantic_file = os.path.join(self.base_path, semantic_file)
            return self._load_semantic_from_file(semantic_file, self.base_path)
        else:
            return parse_semantic_config(self.config.semantic)

    def _get_context(self) -> EngineContext:
        """Get or create the engine context."""
        if self._context is None:
            from odibi.context import PandasContext, PolarsContext

            engine_type = EngineType(self.config.engine.value)

            if engine_type == EngineType.PANDAS:
                base_context = PandasContext()
            elif engine_type == EngineType.POLARS:
                base_context = PolarsContext()
            else:
                from odibi.context import SparkContext
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()
                base_context = SparkContext(spark)

            self._context = EngineContext(context=base_context, df=None, engine_type=engine_type)
        return self._context

    def _load_table(self, source: str) -> Any:
        """
        Load a table from its source reference.

        Args:
            source: Source reference (e.g., "gold.fact_orders")

        Returns:
            DataFrame (Pandas, Spark, or Polars depending on engine)
        """
        if source in self._loaded_tables:
            return self._loaded_tables[source]

        ctx = get_logging_context()
        connection_name, full_path = self._resolver.resolve(source)
        connection = self.config.connections[connection_name]

        ctx.debug("Loading table", source=source, path=full_path)

        df = self._read_from_connection(connection, full_path)
        self._loaded_tables[source] = df

        return df

    def _read_from_connection(self, connection: ConnectionConfig, path: str) -> Any:
        """
        Read data from a connection.

        Supports Delta, local files, and catalog references.
        """
        conn_dict = connection.model_dump() if hasattr(connection, "model_dump") else connection
        conn_type = conn_dict.get("type", "local")

        engine_type = EngineType(self.config.engine.value)

        if engine_type == EngineType.SPARK:
            return self._read_spark(conn_dict, path, conn_type)
        elif engine_type == EngineType.POLARS:
            return self._read_polars(conn_dict, path, conn_type)
        else:
            return self._read_pandas(conn_dict, path, conn_type)

    def _read_spark(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Spark."""
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()

        if conn_type == "delta":
            if "catalog" in conn_dict:
                return spark.table(path)
            else:
                return spark.read.format("delta").load(path)
        else:
            if os.path.exists(path):
                if path.endswith(".parquet") or os.path.isdir(path):
                    return spark.read.parquet(path)
                elif path.endswith(".csv"):
                    return spark.read.csv(path, header=True, inferSchema=True)
            return spark.table(path)

    def _read_pandas(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Pandas."""
        import pandas as pd

        if conn_type == "delta":
            try:
                from deltalake import DeltaTable

                dt = DeltaTable(path)
                return dt.to_pandas()
            except ImportError:
                raise ImportError(
                    "deltalake package required for Delta tables with Pandas. "
                    "Install with: pip install deltalake"
                )
        else:
            if path.endswith(".parquet") or os.path.isdir(path):
                return pd.read_parquet(path)
            elif path.endswith(".csv"):
                return pd.read_csv(path)
            else:
                return pd.read_parquet(path)

    def _read_polars(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Polars."""
        import polars as pl

        if conn_type == "delta":
            return pl.read_delta(path)
        else:
            if path.endswith(".parquet") or os.path.isdir(path):
                return pl.read_parquet(path)
            elif path.endswith(".csv"):
                return pl.read_csv(path)
            else:
                return pl.read_parquet(path)

    def _get_sources_for_query(self, query_string: str) -> List[str]:
        """
        Get all source tables needed for a query.

        Args:
            query_string: Semantic query string

        Returns:
            List of source references
        """
        if not self._query_engine:
            return []

        parsed = self._query_engine.parse(query_string)
        sources = set()

        for metric_name in parsed.metrics:
            metric = self._semantic_config.get_metric(metric_name)
            if metric and metric.source:
                sources.add(metric.source)

        for dim_name in parsed.dimensions:
            dim = self._semantic_config.get_dimension(dim_name)
            if dim and dim.source:
                sources.add(dim.source)

        return list(sources)

    def _ensure_tables_loaded(self, sources: List[str]) -> None:
        """
        Ensure all required tables are loaded into context.

        Args:
            sources: List of source references to load
        """
        context = self._get_context()

        for source in sources:
            table_name = source.split(".")[-1] if "." in source else source

            if table_name not in self._loaded_tables:
                df = self._load_table(source)
                context.context.register(table_name, df)

    def query(self, query_string: str) -> QueryResult:
        """
        Execute a semantic query.

        Args:
            query_string: Semantic query (e.g., "revenue BY region")

        Returns:
            QueryResult with DataFrame and metadata

        Raises:
            ValueError: If semantic layer not configured or invalid query
        """
        if not self._query_engine:
            raise ValueError(
                "Semantic layer not configured. Add 'semantic' section to odibi.yaml "
                "or provide a semantic config file."
            )

        ctx = get_logging_context()
        ctx.info("Executing project query", query=query_string)

        sources = self._get_sources_for_query(query_string)
        self._ensure_tables_loaded(sources)

        context = self._get_context()
        return self._query_engine.execute(query_string, context)

    def register(self, name: str, df: Any) -> None:
        """
        Manually register a DataFrame in the context.

        Useful for testing or when data comes from non-standard sources.

        Args:
            name: Table name to register
            df: DataFrame to register
        """
        context = self._get_context()
        context.context.register(name, df)
        self._loaded_tables[name] = df

    @property
    def semantic_config(self) -> Optional[SemanticLayerConfig]:
        """Get the semantic layer configuration."""
        return self._semantic_config

    @property
    def connections(self) -> Dict[str, ConnectionConfig]:
        """Get the connection configurations."""
        return self.config.connections

    @property
    def metrics(self) -> List[str]:
        """Get list of available metric names."""
        if not self._semantic_config:
            return []
        return [m.name for m in self._semantic_config.metrics]

    @property
    def dimensions(self) -> List[str]:
        """Get list of available dimension names."""
        if not self._semantic_config:
            return []
        return [d.name for d in self._semantic_config.dimensions]

    def describe(self) -> Dict[str, Any]:
        """
        Get a description of the project and its semantic layer.

        Returns:
            Dictionary with project info, metrics, dimensions
        """
        return {
            "project": self.config.project,
            "engine": self.config.engine.value,
            "connections": list(self.config.connections.keys()),
            "metrics": [
                {"name": m.name, "description": m.description, "source": m.source}
                for m in (self._semantic_config.metrics if self._semantic_config else [])
            ],
            "dimensions": [
                {"name": d.name, "source": d.source, "column": d.get_column()}
                for d in (self._semantic_config.dimensions if self._semantic_config else [])
            ],
        }
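
Taken together, the two files give a load-and-query workflow. A minimal end-to-end sketch (assuming an odibi.yaml whose semantic section defines a `revenue` metric and a `region` dimension, as in the docstrings above):

    from odibi.project import Project

    project = Project.load("odibi.yaml")
    print(project.metrics)       # e.g. ['revenue', 'order_count']
    print(project.dimensions)    # e.g. ['region', 'month']

    result = project.query("revenue BY region WHERE region = 'North'")
    print(result.df)             # engine-native DataFrame (pandas, Polars, or Spark)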