odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/cli/__init__.py
ADDED
odibi/cli/__main__.py
ADDED
odibi/cli/catalog.py
ADDED
@@ -0,0 +1,553 @@
+"""Catalog CLI command for querying the System Catalog."""
+
+import json
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+from odibi.pipeline import PipelineManager
+from odibi.utils.extensions import load_extensions
+from odibi.utils.logging import logger
+
+
+def add_catalog_parser(subparsers):
+    """Add catalog subcommand parser."""
+    catalog_parser = subparsers.add_parser(
+        "catalog",
+        help="Query System Catalog metadata",
+        description="Query and explore the System Catalog (runs, pipelines, nodes, state, etc.)",
+    )
+
+    catalog_subparsers = catalog_parser.add_subparsers(
+        dest="catalog_command", help="Catalog commands"
+    )
+
+    # odibi catalog runs
+    runs_parser = catalog_subparsers.add_parser("runs", help="List execution runs from meta_runs")
+    runs_parser.add_argument("config", help="Path to YAML config file")
+    runs_parser.add_argument("--pipeline", "-p", help="Filter by pipeline name")
+    runs_parser.add_argument("--node", "-n", help="Filter by node name")
+    runs_parser.add_argument(
+        "--status", "-s", choices=["SUCCESS", "FAILED", "RUNNING"], help="Filter by status"
+    )
+    runs_parser.add_argument(
+        "--days", "-d", type=int, default=7, help="Show runs from last N days (default: 7)"
+    )
+    runs_parser.add_argument(
+        "--limit", "-l", type=int, default=20, help="Maximum number of runs to show (default: 20)"
+    )
+    runs_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog pipelines
+    pipelines_parser = catalog_subparsers.add_parser(
+        "pipelines", help="List registered pipelines from meta_pipelines"
+    )
+    pipelines_parser.add_argument("config", help="Path to YAML config file")
+    pipelines_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog nodes
+    nodes_parser = catalog_subparsers.add_parser(
+        "nodes", help="List registered nodes from meta_nodes"
+    )
+    nodes_parser.add_argument("config", help="Path to YAML config file")
+    nodes_parser.add_argument("--pipeline", "-p", help="Filter by pipeline name")
+    nodes_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog state
+    state_parser = catalog_subparsers.add_parser("state", help="List HWM state from meta_state")
+    state_parser.add_argument("config", help="Path to YAML config file")
+    state_parser.add_argument("--pipeline", "-p", help="Filter by pipeline name")
+    state_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog tables
+    tables_parser = catalog_subparsers.add_parser(
+        "tables", help="List registered assets from meta_tables"
+    )
+    tables_parser.add_argument("config", help="Path to YAML config file")
+    tables_parser.add_argument("--project", help="Filter by project name")
+    tables_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog metrics
+    metrics_parser = catalog_subparsers.add_parser(
+        "metrics", help="List metrics definitions from meta_metrics"
+    )
+    metrics_parser.add_argument("config", help="Path to YAML config file")
+    metrics_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog patterns
+    patterns_parser = catalog_subparsers.add_parser(
+        "patterns", help="List pattern compliance from meta_patterns"
+    )
+    patterns_parser.add_argument("config", help="Path to YAML config file")
+    patterns_parser.add_argument(
+        "--format",
+        "-f",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+    # odibi catalog stats
+    stats_parser = catalog_subparsers.add_parser("stats", help="Show execution statistics")
+    stats_parser.add_argument("config", help="Path to YAML config file")
+    stats_parser.add_argument("--pipeline", "-p", help="Filter by pipeline name")
+    stats_parser.add_argument(
+        "--days", "-d", type=int, default=7, help="Statistics over last N days (default: 7)"
+    )
+
+    return catalog_parser
+
+
+def catalog_command(args):
+    """Execute catalog command."""
+    if not hasattr(args, "catalog_command") or args.catalog_command is None:
+        print("Usage: odibi catalog <command>")
+        print("\nAvailable commands:")
+        print("  runs       List execution runs")
+        print("  pipelines  List registered pipelines")
+        print("  nodes      List registered nodes")
+        print("  state      List HWM state checkpoints")
+        print("  tables     List registered assets")
+        print("  metrics    List metrics definitions")
+        print("  patterns   List pattern compliance")
+        print("  stats      Show execution statistics")
+        return 1
+
+    command_map = {
+        "runs": _runs_command,
+        "pipelines": _pipelines_command,
+        "nodes": _nodes_command,
+        "state": _state_command,
+        "tables": _tables_command,
+        "metrics": _metrics_command,
+        "patterns": _patterns_command,
+        "stats": _stats_command,
+    }
+
+    handler = command_map.get(args.catalog_command)
+    if handler:
+        return handler(args)
+    else:
+        print(f"Unknown catalog command: {args.catalog_command}")
+        return 1
+
+
+def _get_catalog_manager(args):
+    """Load config and return catalog manager."""
+    try:
+        config_path = Path(args.config).resolve()
+
+        load_extensions(config_path.parent)
+        if config_path.parent.parent != config_path.parent:
+            load_extensions(config_path.parent.parent)
+        if config_path.parent != Path.cwd():
+            load_extensions(Path.cwd())
+
+        manager = PipelineManager.from_yaml(args.config)
+
+        if not manager.catalog_manager:
+            logger.error("System Catalog not configured. Add 'system' section to config.")
+            return None
+
+        return manager.catalog_manager
+
+    except Exception as e:
+        logger.error(f"Failed to load configuration: {e}")
+        return None
+
+
+def _format_table(headers: list, rows: list, max_width: int = 40) -> str:
+    """Format data as ASCII table."""
+    if not rows:
+        return "No data found."
+
+    def truncate(val, width):
+        s = str(val) if val is not None else ""
+        if len(s) > width:
+            return s[: width - 3] + "..."
+        return s
+
+    col_widths = []
+    for i, header in enumerate(headers):
+        max_col = len(header)
+        for row in rows:
+            if i < len(row):
+                max_col = max(max_col, min(len(str(row[i] or "")), max_width))
+        col_widths.append(min(max_col, max_width))
+
+    header_line = " | ".join(h.ljust(col_widths[i]) for i, h in enumerate(headers))
+    separator = "-+-".join("-" * w for w in col_widths)
+
+    lines = [header_line, separator]
+    for row in rows:
+        row_line = " | ".join(
+            truncate(row[i] if i < len(row) else "", col_widths[i]).ljust(col_widths[i])
+            for i in range(len(headers))
+        )
+        lines.append(row_line)
+
+    return "\n".join(lines)
+
+
+def _format_output(headers: list, rows: list, output_format: str) -> str:
+    """Format output as table or JSON."""
+    if output_format == "json":
+        data = [dict(zip(headers, row)) for row in rows]
+        return json.dumps(data, indent=2, default=str)
+    else:
+        return _format_table(headers, rows)
+
+
+def _runs_command(args) -> int:
+    """List execution runs."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_runs"])
+
+        if df.empty:
+            print("No runs found in catalog.")
+            return 0
+
+        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
+
+        if "timestamp" in df.columns:
+            import pandas as pd
+
+            if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
+                df["timestamp"] = pd.to_datetime(df["timestamp"])
+
+            if df["timestamp"].dt.tz is None:
+                df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
+
+            df = df[df["timestamp"] >= cutoff]
+
+        if args.pipeline and "pipeline_name" in df.columns:
+            df = df[df["pipeline_name"] == args.pipeline]
+
+        if args.node and "node_name" in df.columns:
+            df = df[df["node_name"] == args.node]
+
+        if args.status and "status" in df.columns:
+            df = df[df["status"] == args.status]
+
+        if "timestamp" in df.columns:
+            df = df.sort_values("timestamp", ascending=False)
+
+        df = df.head(args.limit)
+
+        headers = [
+            "run_id",
+            "pipeline_name",
+            "node_name",
+            "status",
+            "rows",
+            "duration_ms",
+            "timestamp",
+        ]
+        available_cols = [c for c in headers if c in df.columns]
+
+        if "rows_processed" in df.columns:
+            headers[headers.index("rows")] = "rows_processed"
+            available_cols = [c for c in headers if c in df.columns]
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\nShowing {len(rows)} runs from the last {args.days} days.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query runs: {e}")
+        return 1
+
+
+def _pipelines_command(args) -> int:
+    """List registered pipelines."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_pipelines"])
+
+        if df.empty:
+            print("No pipelines registered in catalog. Run 'odibi deploy' first.")
+            return 0
+
+        headers = ["pipeline_name", "layer", "description", "version_hash", "updated_at"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        if "updated_at" in df.columns:
+            df = df.sort_values("updated_at", ascending=False)
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} pipeline(s) registered.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query pipelines: {e}")
+        return 1
+
+
+def _nodes_command(args) -> int:
+    """List registered nodes."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_nodes"])
+
+        if df.empty:
+            print("No nodes registered in catalog. Run 'odibi deploy' first.")
+            return 0
+
+        if args.pipeline and "pipeline_name" in df.columns:
+            df = df[df["pipeline_name"] == args.pipeline]
+
+        headers = ["pipeline_name", "node_name", "type", "version_hash", "updated_at"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        if "updated_at" in df.columns:
+            df = df.sort_values("updated_at", ascending=False)
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} node(s) registered.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query nodes: {e}")
+        return 1
+
+
+def _state_command(args) -> int:
+    """List HWM state checkpoints."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_state"])
+
+        if df.empty:
+            print("No state checkpoints found in catalog.")
+            return 0
+
+        if args.pipeline and "key" in df.columns:
+            df = df[df["key"].str.contains(args.pipeline, na=False)]
+
+        headers = ["key", "value", "updated_at"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} state checkpoint(s) found.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query state: {e}")
+        return 1
+
+
+def _tables_command(args) -> int:
+    """List registered assets."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_tables"])
+
+        if df.empty:
+            print("No assets registered in catalog.")
+            return 0
+
+        if args.project and "project_name" in df.columns:
+            df = df[df["project_name"] == args.project]
+
+        headers = ["project_name", "table_name", "path", "format", "pattern_type", "updated_at"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} asset(s) registered.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query tables: {e}")
+        return 1
+
+
+def _metrics_command(args) -> int:
+    """List metrics definitions."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_metrics"])
+
+        if df.empty:
+            print("No metrics defined in catalog.")
+            return 0
+
+        headers = ["metric_name", "source_table", "definition_sql", "dimensions"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} metric(s) defined.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query metrics: {e}")
+        return 1
+
+
+def _patterns_command(args) -> int:
+    """List pattern compliance."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        df = catalog._read_local_table(catalog.tables["meta_patterns"])
+
+        if df.empty:
+            print("No pattern data in catalog.")
+            return 0
+
+        headers = ["table_name", "pattern_type", "compliance_score", "configuration"]
+        available_cols = [c for c in headers if c in df.columns]
+
+        rows = df[available_cols].values.tolist() if available_cols else []
+
+        print(_format_output(headers, rows, args.format))
+        print(f"\n{len(rows)} pattern record(s) found.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to query patterns: {e}")
+        return 1
+
+
+def _stats_command(args) -> int:
+    """Show execution statistics."""
+    catalog = _get_catalog_manager(args)
+    if not catalog:
+        return 1
+
+    try:
+        import pandas as pd
+
+        df = catalog._read_local_table(catalog.tables["meta_runs"])
+
+        if df.empty:
+            print("No runs found in catalog for statistics.")
+            return 0
+
+        cutoff = datetime.now(timezone.utc) - timedelta(days=args.days)
+
+        if "timestamp" in df.columns:
+            if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
+                df["timestamp"] = pd.to_datetime(df["timestamp"])
+
+            if df["timestamp"].dt.tz is None:
+                df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
+
+            df = df[df["timestamp"] >= cutoff]
+
+        if args.pipeline and "pipeline_name" in df.columns:
+            df = df[df["pipeline_name"] == args.pipeline]
+
+        if df.empty:
+            print(f"No runs in the last {args.days} days.")
+            return 0
+
+        print(f"=== Execution Statistics (Last {args.days} Days) ===\n")
+
+        total_runs = len(df)
+        success_runs = len(df[df["status"] == "SUCCESS"]) if "status" in df.columns else 0
+        failed_runs = len(df[df["status"] == "FAILED"]) if "status" in df.columns else 0
+        success_rate = (success_runs / total_runs * 100) if total_runs > 0 else 0
+
+        print(f"Total Runs: {total_runs}")
+        print(f"Successful: {success_runs}")
+        print(f"Failed: {failed_runs}")
+        print(f"Success Rate: {success_rate:.1f}%")
+
+        if "rows_processed" in df.columns:
+            total_rows = df["rows_processed"].sum()
+            avg_rows = df["rows_processed"].mean()
+            print(f"\nTotal Rows: {int(total_rows):,}")
+            print(f"Avg Rows/Run: {int(avg_rows):,}")
+
+        if "duration_ms" in df.columns:
+            avg_duration_ms = df["duration_ms"].mean()
+            total_duration_ms = df["duration_ms"].sum()
+            print(f"\nAvg Duration: {avg_duration_ms / 1000:.2f}s")
+            print(f"Total Runtime: {total_duration_ms / 1000:.2f}s")
+
+        if "pipeline_name" in df.columns:
+            print("\n--- Runs by Pipeline ---")
+            pipeline_counts = df["pipeline_name"].value_counts()
+            for pipeline, count in pipeline_counts.items():
+                print(f"  {pipeline}: {count}")
+
+        if "node_name" in df.columns and "status" in df.columns:
+            failed_nodes = df[df["status"] == "FAILED"]["node_name"].value_counts()
+            if not failed_nodes.empty:
+                print("\n--- Most Failed Nodes ---")
+                for node, count in failed_nodes.head(5).items():
+                    print(f"  {node}: {count} failures")
+
+        return 0
+
+    except Exception as e:
+        logger.error(f"Failed to compute statistics: {e}")
+        return 1
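
Note: the module above only defines the "catalog" subparser and its dispatch table; the top-level wiring lives in odibi/cli/main.py, which this diff does not expand. As a rough orientation, a minimal sketch of how these two entry points could plug into an argparse CLI is shown below (the main function here is a hypothetical stand-in, not the package's actual main.py):

    import argparse
    import sys

    from odibi.cli.catalog import add_catalog_parser, catalog_command

    def main() -> int:
        parser = argparse.ArgumentParser(prog="odibi")
        subparsers = parser.add_subparsers(dest="command")
        add_catalog_parser(subparsers)  # registers "catalog" plus its subcommands

        args = parser.parse_args()
        if args.command == "catalog":
            return catalog_command(args)  # shell-style exit code: 0 on success
        parser.print_help()
        return 1

    if __name__ == "__main__":
        sys.exit(main())

Each handler in the module returns 0 or 1, so the sketch forwards that value as the process exit code, matching how catalog_command itself reports unknown subcommands.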
odibi/cli/deploy.py
ADDED
@@ -0,0 +1,69 @@
+"""Deploy command implementation."""
+
+from pathlib import Path
+
+from odibi.pipeline import PipelineManager
+from odibi.utils.extensions import load_extensions
+from odibi.utils.logging import logger
+
+
+def deploy_command(args):
+    """Deploy pipeline definitions to System Catalog."""
+    try:
+        config_path = Path(args.config).resolve()
+
+        logger.info(f"Deploying configuration from {config_path}...")
+
+        # Load extensions (similar to run command)
+        load_extensions(config_path.parent)
+        if config_path.parent.parent != config_path.parent:
+            load_extensions(config_path.parent.parent)
+        if config_path.parent != Path.cwd():
+            load_extensions(Path.cwd())
+
+        # Load Pipeline Manager (this validates config and initializes connections/catalog).
+        # Environment overrides are usually unnecessary for deployment, but a user may want
+        # to deploy a prod config; they should pass the correct config file or env vars.
+        # We respect the --env arg if it is added to the deploy command.
+        env = getattr(args, "env", None)
+        manager = PipelineManager.from_yaml(args.config, env=env)
+
+        if not manager.catalog_manager:
+            logger.error("System Catalog not configured. Cannot deploy.")
+            return 1
+
+        catalog = manager.catalog_manager
+
+        # Bootstrap (ensure tables exist).
+        # PipelineManager.__init__ already calls bootstrap(), but calling it again is safe/idempotent.
+        catalog.bootstrap()
+
+        logger.info(f"Syncing to System Catalog at: {catalog.base_path}")
+
+        project_config = manager.project_config
+        pipelines = project_config.pipelines
+
+        total_pipelines = len(pipelines)
+        total_nodes = sum(len(p.nodes) for p in pipelines)
+
+        logger.info(f"Found {total_pipelines} pipelines, {total_nodes} nodes.")
+
+        for i, pipeline in enumerate(pipelines, 1):
+            logger.info(f"[{i}/{total_pipelines}] Syncing pipeline: {pipeline.pipeline}")
+
+            # Register Pipeline
+            catalog.register_pipeline(pipeline, project_config)
+
+            # Register Nodes
+            for node in pipeline.nodes:
+                # logger.debug(f"  - Syncing node: {node.name}")
+                catalog.register_node(pipeline.pipeline, node)
+
+        logger.info("Deployment complete! System Catalog is up to date.")
+        return 0
+
+    except Exception as e:
+        logger.error(f"Deployment failed: {e}")
+        # import traceback
+        # traceback.print_exc()
+        return 1
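
Note: deploy_command reads only two attributes from args (config and, via getattr, an optional env), so it can also be exercised without the CLI. A minimal sketch, assuming a config file of your own (the name pipelines.yaml below is a placeholder, not a path shipped with the package):

    import argparse

    from odibi.cli.deploy import deploy_command

    # Build the namespace the CLI parser would normally produce.
    args = argparse.Namespace(config="pipelines.yaml", env=None)
    exit_code = deploy_command(args)  # 0 on success, 1 on failure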