odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/plugins.py ADDED
@@ -0,0 +1,80 @@
1
+ """Plugin system for Odibi."""
2
+
3
+ import logging
4
+ import sys
5
+ from typing import Any, Dict, Optional
6
+
7
+ if sys.version_info < (3, 10):
8
+ from importlib_metadata import entry_points
9
+ else:
10
+ from importlib.metadata import entry_points
11
+
12
# Type for connection factory function
# (name: str, config: Dict[str, Any]) -> BaseConnection
# We use Any for the factory type to avoid a circular import with BaseConnection.
ConnectionFactory = Any

logger = logging.getLogger(__name__)

# Global registry mapping config 'type' strings to factory callables.
_CONNECTION_FACTORIES: Dict[str, ConnectionFactory] = {}


def register_connection_factory(type_name: str, factory: ConnectionFactory) -> None:
    """Register a connection factory.

    Args:
        type_name: The 'type' string used in config (e.g., 'postgres')
        factory: Function that takes (name, config) and returns a Connection instance
    """
    _CONNECTION_FACTORIES[type_name] = factory
    # Lazy %-formatting: the message is only built when DEBUG logging is enabled.
    logger.debug("Registered connection factory: %s", type_name)


def get_connection_factory(type_name: str) -> Optional[ConnectionFactory]:
    """Get a registered connection factory.

    Args:
        type_name: The connection type

    Returns:
        Factory function, or None if nothing is registered for this type
    """
    return _CONNECTION_FACTORIES.get(type_name)
43
+
44
+
45
def load_plugins():
    """Load plugins from entry points.

    Scans the 'odibi.connections' entry-point group. Each entry point's
    name is used as the connection type, and its loaded value (a callable
    factory) is registered. Failures are logged, never raised.
    """
    try:
        # entry_points() grew a filtering API over time:
        #   * 3.10+       -- entry_points(group=...) returns the group directly
        #   * 3.9         -- SelectableGroups: prefer .select(), else .get()
        #   * 3.8 & older -- plain mapping of group name -> entry points
        if sys.version_info >= (3, 10):
            eps = entry_points(group="odibi.connections")
        elif sys.version_info >= (3, 9):
            discovered = entry_points()
            if hasattr(discovered, "select"):
                eps = discovered.select(group="odibi.connections")
            elif hasattr(discovered, "get"):
                eps = discovered.get("odibi.connections", [])
            else:
                eps = getattr(discovered, "odibi.connections", [])
        else:
            # Python 3.8 and earlier
            eps = entry_points().get("odibi.connections", [])

        for ep in eps:
            try:
                register_connection_factory(ep.name, ep.load())
                logger.info(f"Loaded plugin: {ep.name}")
            except Exception as e:
                logger.error(f"Failed to load plugin {ep.name}: {e}", exc_info=True)

    except Exception as e:
        logger.error(f"Plugin discovery failed: {e}", exc_info=True)
odibi/project.py ADDED
@@ -0,0 +1,581 @@
1
+ """
2
+ Project Module
3
+ ==============
4
+
5
+ Unified Project API that integrates pipelines and semantic layer.
6
+
7
+ The Project class provides a seamless interface for:
8
+ - Loading project configuration (connections, pipelines, semantic layer)
9
+ - Executing semantic queries with auto-resolved table paths
10
+ - No manual table registration required
11
+
12
+ Example:
13
+ project = Project.load("odibi.yaml")
14
+ result = project.query("revenue BY region")
15
+ """
16
+
17
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from odibi.config import ConnectionConfig, ProjectConfig, load_config_from_file
from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.semantics.metrics import SemanticLayerConfig, parse_semantic_config
from odibi.semantics.query import QueryResult, SemanticQuery
from odibi.utils.logging_context import get_logging_context
27
+
28
+
29
class SourceResolver:
    """
    Resolves semantic layer source references to actual paths.

    Supported source formats:

    1. **$pipeline.node** (recommended): References a pipeline node's write target.
       Example: `$build_warehouse.fact_orders` reads from wherever that node writes.

    2. **connection.path**: Explicit connection + path. Supports nested paths!
       The split happens on the FIRST dot only, so subdirectories work:
       - `gold.fact_orders` → `/mnt/data/gold/fact_orders`
       - `gold.oee/plant_a/metrics` → `/mnt/data/gold/oee/plant_a/metrics`
       - `gold.domain/v2/fact_sales` → `/mnt/data/gold/domain/v2/fact_sales`

    3. **table_name**: Uses the default connection (gold > silver > bronze > first).
       Example: `fact_orders` with a single connection named "warehouse".

    For Unity Catalog connections (catalog + schema_name):
        `gold.fact_orders` → `catalog.schema.fact_orders`
    """

    def __init__(
        self,
        connections: Dict[str, "ConnectionConfig"],
        base_path: str = "",
        pipelines: Optional[List[Any]] = None,
    ):
        """
        Initialize the source resolver.

        Args:
            connections: Dictionary of connection configurations
            base_path: Base path for relative paths (directory of odibi.yaml)
            pipelines: List of pipeline configs for $pipeline.node resolution
        """
        self.connections = connections
        self.base_path = base_path
        self.pipelines = pipelines or []
        self._node_index = self._build_node_index()

    @staticmethod
    def _get_field(obj: Any, field: str, default: Any = None) -> Any:
        """Read *field* from an attribute-style config object or a plain dict.

        Pipeline/node configs may arrive as pydantic models or raw dicts;
        this accessor handles both and returns *default* when the field is
        missing (instead of raising AttributeError on partial objects).
        """
        if hasattr(obj, field):
            return getattr(obj, field)
        if isinstance(obj, dict):
            return obj.get(field, default)
        return default

    def _build_node_index(self) -> Dict[str, Dict[str, Any]]:
        """Build an index of pipeline.node -> node config for fast lookup."""
        index: Dict[str, Dict[str, Any]] = {}
        for pipeline in self.pipelines:
            pipeline_name = self._get_field(pipeline, "pipeline", "")
            for node in self._get_field(pipeline, "nodes", []) or []:
                node_name = self._get_field(node, "name", "")
                index[f"{pipeline_name}.{node_name}"] = node
        return index

    def resolve(self, source: str) -> Tuple[str, str]:
        """
        Resolve a source reference to connection name and full path.

        Args:
            source: Source reference. Supported formats:
                - "$pipeline.node" (e.g., "$build_warehouse.fact_orders")
                - "connection.table" (e.g., "gold.fact_orders")
                - "table_name" (e.g., "fact_orders") - uses default connection

        Returns:
            Tuple of (connection_name, full_path)

        Raises:
            ValueError: If connection or node not found
        """
        # Handle $pipeline.node reference
        if source.startswith("$"):
            return self._resolve_node_reference(source)

        # Split on the FIRST dot only so table paths may contain subdirectories.
        if "." in source:
            connection_name, table_name = source.split(".", 1)
        else:
            connection_name = self._find_default_connection()
            table_name = source

        if connection_name not in self.connections:
            available = list(self.connections.keys())
            raise ValueError(
                f"Connection '{connection_name}' not found in source '{source}'. "
                f"Available connections: {available}"
            )

        connection = self.connections[connection_name]
        return connection_name, self._build_path(connection, table_name)

    def _resolve_node_reference(self, source: str) -> Tuple[str, str]:
        """
        Resolve a $pipeline.node reference to connection and path.

        Args:
            source: Node reference (e.g., "$build_warehouse.fact_orders")

        Returns:
            Tuple of (connection_name, full_path)

        Raises:
            ValueError: If node not found or node has no write config
        """
        ref = source[1:]  # strip leading $

        if ref not in self._node_index:
            available = list(self._node_index.keys())
            raise ValueError(f"Node reference '{source}' not found. Available nodes: {available}")

        node = self._node_index[ref]

        write_config = self._get_field(node, "write")
        if not write_config:
            raise ValueError(f"Node '{source}' has no 'write' config. Cannot resolve source path.")

        # Defaulted lookups: a write config missing 'table' AND 'path' now
        # reaches the explicit ValueError below instead of raising
        # AttributeError on attribute-style configs.
        connection_name = self._get_field(write_config, "connection")
        table_name = self._get_field(write_config, "table") or self._get_field(write_config, "path")

        if not connection_name:
            raise ValueError(
                f"Node '{source}' write config has no 'connection'. Cannot resolve source path."
            )

        if not table_name:
            raise ValueError(
                f"Node '{source}' write config has no 'table' or 'path'. "
                "Cannot resolve source path."
            )

        if connection_name not in self.connections:
            available = list(self.connections.keys())
            raise ValueError(
                f"Connection '{connection_name}' from node '{source}' not found. "
                f"Available connections: {available}"
            )

        connection = self.connections[connection_name]
        return connection_name, self._build_path(connection, table_name)

    def _find_default_connection(self) -> str:
        """Find the default connection to use when none is specified."""
        if len(self.connections) == 1:
            return next(iter(self.connections))

        # Medallion-style priority, then fall back to insertion order.
        for name in ("gold", "silver", "bronze", "warehouse", "default"):
            if name in self.connections:
                return name

        return next(iter(self.connections))

    def _build_path(self, connection: "ConnectionConfig", table_name: str) -> str:
        """Build the full path for a table given a connection."""
        # Accept either a pydantic model (model_dump) or a raw dict.
        conn_dict = connection.model_dump() if hasattr(connection, "model_dump") else connection

        if "base_path" in conn_dict:
            base = conn_dict["base_path"]
        elif "path" in conn_dict:
            base = conn_dict["path"]
        elif "catalog" in conn_dict and "schema" in conn_dict:
            # Unity Catalog style: fully-qualified three-part name.
            # NOTE(review): the class docstring says 'schema_name' but the
            # dumped key checked here is 'schema' — confirm the config field.
            return f"{conn_dict['catalog']}.{conn_dict['schema']}.{table_name}"
        else:
            base = ""

        # Relative connection paths resolve against the odibi.yaml directory.
        if self.base_path and not os.path.isabs(base):
            base = os.path.join(self.base_path, base)

        return os.path.join(base, table_name) if base else table_name
210
+
211
+
212
class Project:
    """
    Unified Project API for Odibi.

    Integrates project configuration, connections, and semantic layer
    into a single interface for seamless querying.

    Example:
        # Load project and query
        project = Project.load("odibi.yaml")
        result = project.query("revenue BY region")

        # Access the DataFrame
        print(result.df)

        # Multiple metrics and dimensions
        result = project.query("revenue, order_count BY region, month")

        # With filters
        result = project.query("revenue BY category WHERE region = 'North'")
    """

    def __init__(
        self,
        config: ProjectConfig,
        semantic_config: Optional[SemanticLayerConfig] = None,
        base_path: str = "",
        lazy: bool = True,
    ):
        """
        Initialize the Project.

        Args:
            config: ProjectConfig instance
            semantic_config: Optional SemanticLayerConfig (loaded from config.semantic if not provided)
            base_path: Base path for resolving relative paths
            lazy: If True, load tables on-demand; if False, load all upfront
        """
        self.config = config
        self.base_path = base_path
        # NOTE(review): 'lazy' is stored but never read anywhere in this
        # module — loading is always on-demand. Confirm whether eager
        # (lazy=False) loading was meant to be implemented.
        self.lazy = lazy
        self._context: Optional[EngineContext] = None
        # Cache of loaded DataFrames, keyed by the source reference string
        # (e.g. "gold.fact_orders") as passed to _load_table().
        self._loaded_tables: Dict[str, Any] = {}

        self._resolver = SourceResolver(config.connections, base_path, config.pipelines)

        if semantic_config:
            self._semantic_config = semantic_config
        else:
            self._semantic_config = self._load_semantic_config()

        # The query engine exists only when a semantic layer is configured;
        # query() raises a helpful error otherwise.
        self._query_engine: Optional[SemanticQuery] = None
        if self._semantic_config:
            self._query_engine = SemanticQuery(self._semantic_config)

    @classmethod
    def load(
        cls,
        config_path: str,
        semantic_path: Optional[str] = None,
        lazy: bool = True,
    ) -> "Project":
        """
        Load a Project from configuration file(s).

        Args:
            config_path: Path to odibi.yaml
            semantic_path: Optional path to semantic config (overrides config.semantic)
            lazy: If True, load tables on-demand

        Returns:
            Project instance
        """
        ctx = get_logging_context()
        ctx.info("Loading project", config=config_path)

        config = load_config_from_file(config_path)
        # All relative paths (semantic config file, connection paths)
        # resolve against the directory containing odibi.yaml.
        base_path = str(Path(config_path).parent.absolute())

        semantic_config = None
        if semantic_path:
            # Explicit override wins over anything in config.semantic.
            semantic_config = cls._load_semantic_from_file(semantic_path, base_path)
        elif config.semantic:
            if "config" in config.semantic:
                # 'semantic: {config: path.yaml}' — external semantic file.
                semantic_file = config.semantic["config"]
                if not os.path.isabs(semantic_file):
                    semantic_file = os.path.join(base_path, semantic_file)
                semantic_config = cls._load_semantic_from_file(semantic_file, base_path)
            else:
                # Inline semantic definition embedded in odibi.yaml.
                semantic_config = parse_semantic_config(config.semantic)

        return cls(
            config=config,
            semantic_config=semantic_config,
            base_path=base_path,
            lazy=lazy,
        )

    @staticmethod
    def _load_semantic_from_file(path: str, base_path: str) -> SemanticLayerConfig:
        """Load semantic config from a YAML file.

        NOTE(review): base_path is accepted but unused here — callers
        pre-resolve 'path' to absolute before calling. Confirm it can be
        dropped or should be applied to relative paths.
        """
        import yaml

        with open(path, "r") as f:
            data = yaml.safe_load(f)

        return parse_semantic_config(data)

    def _load_semantic_config(self) -> Optional[SemanticLayerConfig]:
        """Load semantic config from project config.

        Mirrors the logic in load(): an external file when
        config.semantic has a 'config' key, otherwise inline parsing.
        Returns None when no semantic section exists.
        """
        if not self.config.semantic:
            return None

        if "config" in self.config.semantic:
            semantic_file = self.config.semantic["config"]
            if not os.path.isabs(semantic_file):
                semantic_file = os.path.join(self.base_path, semantic_file)
            return self._load_semantic_from_file(semantic_file, self.base_path)
        else:
            return parse_semantic_config(self.config.semantic)

    def _get_context(self) -> EngineContext:
        """Get or create the engine context (lazily, cached on the instance).

        Selects the concrete context from config.engine; anything other
        than pandas/polars falls through to Spark, which requires pyspark
        to be importable and reuses/creates the active SparkSession.
        """
        if self._context is None:
            from odibi.context import PandasContext, PolarsContext

            engine_type = EngineType(self.config.engine.value)

            if engine_type == EngineType.PANDAS:
                base_context = PandasContext()
            elif engine_type == EngineType.POLARS:
                base_context = PolarsContext()
            else:
                from odibi.context import SparkContext
                from pyspark.sql import SparkSession

                spark = SparkSession.builder.getOrCreate()
                base_context = SparkContext(spark)

            self._context = EngineContext(context=base_context, df=None, engine_type=engine_type)
        return self._context

    def _load_table(self, source: str) -> Any:
        """
        Load a table from its source reference.

        Results are cached in self._loaded_tables keyed by the full
        source reference, so repeated queries don't re-read the data.

        Args:
            source: Source reference (e.g., "gold.fact_orders")

        Returns:
            DataFrame (Pandas, Spark, or Polars depending on engine)
        """
        if source in self._loaded_tables:
            return self._loaded_tables[source]

        ctx = get_logging_context()
        connection_name, full_path = self._resolver.resolve(source)
        connection = self.config.connections[connection_name]

        ctx.debug("Loading table", source=source, path=full_path)

        df = self._read_from_connection(connection, full_path)
        self._loaded_tables[source] = df

        return df

    def _read_from_connection(self, connection: ConnectionConfig, path: str) -> Any:
        """
        Read data from a connection.

        Supports Delta, local files, and catalog references. Dispatches
        to the engine-specific reader based on config.engine.
        """
        conn_dict = connection.model_dump() if hasattr(connection, "model_dump") else connection
        conn_type = conn_dict.get("type", "local")

        engine_type = EngineType(self.config.engine.value)

        if engine_type == EngineType.SPARK:
            return self._read_spark(conn_dict, path, conn_type)
        elif engine_type == EngineType.POLARS:
            return self._read_polars(conn_dict, path, conn_type)
        else:
            # Pandas is the default/fallback engine.
            return self._read_pandas(conn_dict, path, conn_type)

    def _read_spark(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Spark.

        Delta + catalog connections read via the metastore (spark.table);
        Delta without a catalog reads the path directly. For other types,
        existing local paths are probed for parquet/CSV, and anything else
        is assumed to be a metastore table name.
        """
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()

        if conn_type == "delta":
            if "catalog" in conn_dict:
                return spark.table(path)
            else:
                return spark.read.format("delta").load(path)
        else:
            if os.path.exists(path):
                # Directories are treated as parquet datasets.
                if path.endswith(".parquet") or os.path.isdir(path):
                    return spark.read.parquet(path)
                elif path.endswith(".csv"):
                    return spark.read.csv(path, header=True, inferSchema=True)
            # Fallback: interpret the path as a table name in the metastore.
            return spark.table(path)

    def _read_pandas(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Pandas.

        Delta tables require the optional 'deltalake' package. Non-delta
        sources are read as parquet unless the path ends in .csv;
        unrecognized extensions also fall back to parquet.
        """
        import pandas as pd

        if conn_type == "delta":
            try:
                from deltalake import DeltaTable

                dt = DeltaTable(path)
                return dt.to_pandas()
            except ImportError:
                raise ImportError(
                    "deltalake package required for Delta tables with Pandas. "
                    "Install with: pip install deltalake"
                )
        else:
            if path.endswith(".parquet") or os.path.isdir(path):
                return pd.read_parquet(path)
            elif path.endswith(".csv"):
                return pd.read_csv(path)
            else:
                # Extensionless paths default to parquet.
                return pd.read_parquet(path)

    def _read_polars(self, conn_dict: Dict, path: str, conn_type: str) -> Any:
        """Read data using Polars.

        Same dispatch as the pandas reader: delta via pl.read_delta,
        otherwise parquet unless the path ends in .csv.
        """
        import polars as pl

        if conn_type == "delta":
            return pl.read_delta(path)
        else:
            if path.endswith(".parquet") or os.path.isdir(path):
                return pl.read_parquet(path)
            elif path.endswith(".csv"):
                return pl.read_csv(path)
            else:
                return pl.read_parquet(path)

    def _get_sources_for_query(self, query_string: str) -> List[str]:
        """
        Get all source tables needed for a query.

        Parses the query and collects the (deduplicated) source references
        of every metric and dimension it mentions.

        Args:
            query_string: Semantic query string

        Returns:
            List of source references
        """
        if not self._query_engine:
            return []

        parsed = self._query_engine.parse(query_string)
        sources = set()

        for metric_name in parsed.metrics:
            metric = self._semantic_config.get_metric(metric_name)
            if metric and metric.source:
                sources.add(metric.source)

        for dim_name in parsed.dimensions:
            dim = self._semantic_config.get_dimension(dim_name)
            if dim and dim.source:
                sources.add(dim.source)

        return list(sources)

    def _ensure_tables_loaded(self, sources: List[str]) -> None:
        """
        Ensure all required tables are loaded into context.

        Tables are registered in the engine context under their bare
        table name (last dot-segment of the source reference).

        NOTE(review): the guard checks the bare table_name against
        _loaded_tables, which _load_table keys by the FULL source ref
        (e.g. "gold.fact_orders") — so for dotted sources this branch is
        always taken and the table is re-registered on every query.
        _load_table's own cache prevents re-reads, but confirm the
        intended cache key.

        Args:
            sources: List of source references to load
        """
        context = self._get_context()

        for source in sources:
            table_name = source.split(".")[-1] if "." in source else source

            if table_name not in self._loaded_tables:
                df = self._load_table(source)
                context.context.register(table_name, df)

    def query(self, query_string: str) -> QueryResult:
        """
        Execute a semantic query.

        Resolves and loads every table the query needs, then delegates
        execution to the semantic query engine.

        Args:
            query_string: Semantic query (e.g., "revenue BY region")

        Returns:
            QueryResult with DataFrame and metadata

        Raises:
            ValueError: If semantic layer not configured or invalid query
        """
        if not self._query_engine:
            raise ValueError(
                "Semantic layer not configured. Add 'semantic' section to odibi.yaml "
                "or provide a semantic config file."
            )

        ctx = get_logging_context()
        ctx.info("Executing project query", query=query_string)

        sources = self._get_sources_for_query(query_string)
        self._ensure_tables_loaded(sources)

        context = self._get_context()
        return self._query_engine.execute(query_string, context)

    def register(self, name: str, df: Any) -> None:
        """
        Manually register a DataFrame in the context.

        Useful for testing or when data comes from non-standard sources.
        Also records the frame in the load cache so subsequent loads of
        'name' reuse it.

        Args:
            name: Table name to register
            df: DataFrame to register
        """
        context = self._get_context()
        context.context.register(name, df)
        self._loaded_tables[name] = df

    @property
    def semantic_config(self) -> Optional[SemanticLayerConfig]:
        """Get the semantic layer configuration (None if not configured)."""
        return self._semantic_config

    @property
    def connections(self) -> Dict[str, ConnectionConfig]:
        """Get the connection configurations."""
        return self.config.connections

    @property
    def metrics(self) -> List[str]:
        """Get list of available metric names (empty without a semantic layer)."""
        if not self._semantic_config:
            return []
        return [m.name for m in self._semantic_config.metrics]

    @property
    def dimensions(self) -> List[str]:
        """Get list of available dimension names (empty without a semantic layer)."""
        if not self._semantic_config:
            return []
        return [d.name for d in self._semantic_config.dimensions]

    def describe(self) -> Dict[str, Any]:
        """
        Get a description of the project and its semantic layer.

        Returns:
            Dictionary with project info, metrics, dimensions
        """
        return {
            "project": self.config.project,
            "engine": self.config.engine.value,
            "connections": list(self.config.connections.keys()),
            "metrics": [
                {"name": m.name, "description": m.description, "source": m.source}
                for m in (self._semantic_config.metrics if self._semantic_config else [])
            ],
            "dimensions": [
                {"name": d.name, "source": d.source, "column": d.get_column()}
                for d in (self._semantic_config.dimensions if self._semantic_config else [])
            ],
        }