iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbt_core_mcp/__init__.py +18 -0
- dbt_core_mcp/__main__.py +436 -0
- dbt_core_mcp/context.py +459 -0
- dbt_core_mcp/cte_generator.py +601 -0
- dbt_core_mcp/dbt/__init__.py +1 -0
- dbt_core_mcp/dbt/bridge_runner.py +1361 -0
- dbt_core_mcp/dbt/manifest.py +781 -0
- dbt_core_mcp/dbt/runner.py +67 -0
- dbt_core_mcp/dependencies.py +50 -0
- dbt_core_mcp/server.py +381 -0
- dbt_core_mcp/tools/__init__.py +77 -0
- dbt_core_mcp/tools/analyze_impact.py +78 -0
- dbt_core_mcp/tools/build_models.py +190 -0
- dbt_core_mcp/tools/demo/__init__.py +1 -0
- dbt_core_mcp/tools/demo/hello.html +267 -0
- dbt_core_mcp/tools/demo/ui_demo.py +41 -0
- dbt_core_mcp/tools/get_column_lineage.py +1988 -0
- dbt_core_mcp/tools/get_lineage.py +89 -0
- dbt_core_mcp/tools/get_project_info.py +96 -0
- dbt_core_mcp/tools/get_resource_info.py +134 -0
- dbt_core_mcp/tools/install_deps.py +102 -0
- dbt_core_mcp/tools/list_resources.py +84 -0
- dbt_core_mcp/tools/load_seeds.py +179 -0
- dbt_core_mcp/tools/query_database.py +459 -0
- dbt_core_mcp/tools/run_models.py +234 -0
- dbt_core_mcp/tools/snapshot_models.py +120 -0
- dbt_core_mcp/tools/test_models.py +238 -0
- dbt_core_mcp/utils/__init__.py +1 -0
- dbt_core_mcp/utils/env_detector.py +186 -0
- dbt_core_mcp/utils/process_check.py +130 -0
- dbt_core_mcp/utils/tool_utils.py +411 -0
- dbt_core_mcp/utils/warehouse_adapter.py +82 -0
- dbt_core_mcp/utils/warehouse_databricks.py +297 -0
- iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0.dist-info/METADATA +784 -0
- iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0.dist-info/RECORD +38 -0
- iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0.dist-info/WHEEL +4 -0
- iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_niclasolofsson_dbt_core_mcp-1.7.0.dist-info/licenses/LICENSE +21 -0
+++ dbt_core_mcp/tools/query_database.py
@@ -0,0 +1,459 @@
"""Execute SQL queries against the dbt project's database.

This module implements the query_database tool for dbt Core MCP.
"""

import csv
import io
import json
import logging
import re
import tempfile
from pathlib import Path
from typing import Any

from fastmcp.dependencies import Depends  # type: ignore[reportAttributeAccessIssue]
from fastmcp.server.context import Context

from ..context import DbtCoreServerContext
from ..cte_generator import generate_cte_model
from ..dependencies import get_state
from . import dbtTool

logger = logging.getLogger(__name__)


def extract_cte_sql(
    project_dir: Path,
    cte_name: str,
    model_name: str,
    additional_sql: str = "",
    model_paths: list[str] | None = None,
) -> str:
    """Extract CTE SQL from a model file and optionally replace the final SELECT.

    This function extracts a specific CTE from a dbt model file, resolves all its
    upstream dependencies, and optionally replaces the final SELECT with a full query.

    Args:
        project_dir: Path to the dbt project directory
        cte_name: Name of the CTE to extract
        model_name: Name of the model file containing the CTE (without .sql extension)
        additional_sql: Optional full SELECT/WITH query to replace the final SELECT
            (use __cte__ or {{ cte }}; replaced with the CTE name at execution)
        model_paths: List of model directory paths (defaults to ["models"])

    Returns:
        Complete SQL ready to execute (either the extracted CTE or replaced with full SQL)

    Raises:
        ValueError: If the model file is not found, multiple files are found, or CTE extraction fails

    Examples:
        # Extract a CTE and filter it with a full SELECT
        sql = extract_cte_sql(
            project_dir,
            "customer_agg",
            "customers",
            "SELECT * FROM __cte__ WHERE order_count > 5 LIMIT 10"
        )

        # Extract a CTE and aggregate it with a full SELECT
        sql = extract_cte_sql(
            project_dir,
            "customer_agg",
            "customers",
            "SELECT customer_id, COUNT(*) AS cnt FROM __cte__ GROUP BY customer_id"
        )
    """
    # Use default model_paths if not provided
    if model_paths is None:
        model_paths = ["models"]

    # Find the model file - search all configured model paths
    model_files = []
    for model_path in model_paths:
        models_dir = project_dir / model_path
        if models_dir.exists():
            model_files.extend(list(models_dir.rglob(f"{model_name}.sql")))

    if not model_files:
        paths_searched = ", ".join(model_paths)
        raise ValueError(f"Model file '{model_name}.sql' not found in any model paths: {paths_searched}")

    if len(model_files) > 1:
        raise ValueError(f"Multiple model files found for '{model_name}': {[str(f) for f in model_files]}")

    model_file = model_files[0]
    logger.info(f"Extracting CTE '{cte_name}' from model '{model_name}' at {model_file}")

    # Create a temporary file for the extracted CTE model
    # Use the system temp directory to avoid dbt picking it up as a model
    with tempfile.NamedTemporaryFile(mode="w", suffix=".sql", delete=False) as tmp_file:
        tmp_path = Path(tmp_file.name)

    try:
        # Generate the CTE model using the generator logic
        # generate_cte_model expects an empty test_given list when we're just extracting
        success = generate_cte_model(
            base_model_path=model_file,
            cte_name=cte_name,
            test_given=[],  # No CTE mocking when querying
            output_path=tmp_path,
        )

        if not success:
            example = f'Example: query_database(cte_name="{cte_name}", model_name="{model_name}", sql="SELECT * FROM __cte__ WHERE <filters>")'
            raise ValueError(f"Failed to extract CTE '{cte_name}' from model '{model_name}'. Ensure the CTE name matches the model definition and include __cte__ in your SQL. {example}")

        # Read the generated CTE SQL
        cte_sql = tmp_path.read_text()

        # Remove the sqlfluff disable comment if present
        cte_sql = re.sub(r"^-- sqlfluff:disable\s*\n", "", cte_sql, flags=re.MULTILINE)

        # If the user provided additional SQL, require a full SELECT/WITH query
        # The CTE extraction already includes "select * from {cte_name}" at the end
        additional_sql_stripped = additional_sql.strip() if additional_sql else ""
        if additional_sql_stripped:
            if not re.match(r"^(select|with)\b", additional_sql_stripped, flags=re.IGNORECASE):
                # Construct the correct query from the partial SQL provided
                suggested_query = f"SELECT * FROM __cte__ {additional_sql_stripped}"
                raise ValueError(f"additional_sql must be a full SELECT/WITH query when querying a CTE. Did you mean: {suggested_query}")

            # Ensure the query references the CTE via __cte__ or {{ cte }}
            if not re.search(r"__cte__|{{\s*cte\s*}}", additional_sql_stripped, flags=re.IGNORECASE):
                raise ValueError("additional_sql must reference the CTE using __cte__ or {{ cte }}. Example: SELECT * FROM __cte__ WHERE <condition>")

            pattern = rf"select \* from {re.escape(cte_name)}$"
            query_sql = re.sub(r"\{\{\s*cte\s*\}\}", cte_name, additional_sql_stripped, flags=re.IGNORECASE)
            query_sql = query_sql.replace("__cte__", cte_name)
            final_sql = re.sub(pattern, query_sql, cte_sql, flags=re.IGNORECASE | re.MULTILINE)
        else:
            # Use the CTE SQL as-is (already ends with select * from cte_name)
            final_sql = cte_sql

        logger.debug(f"Final SQL to execute:\n{final_sql[:500]}...")
        return final_sql

    finally:
        # Clean up the temporary file
        # Use a small delay to allow processes to release the file
        import time

        time.sleep(0.1)
        try:
            if tmp_path.exists():
                tmp_path.unlink()
        except PermissionError:
            # File is still in use (Windows); leave it for OS temp cleanup
            logger.debug(f"Temporary CTE file {tmp_path} still in use, will be cleaned up by OS")
        except Exception as e:
            logger.warning(f"Failed to delete temporary CTE file {tmp_path}: {e}")
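
The final-SELECT replacement is easiest to see on a toy input. Below is a minimal standalone sketch of the substitution step, assuming (per the inline comment above) that generate_cte_model emits a temp model ending in "select * from <cte_name>"; the model content here is hypothetical:

import re

cte_name = "customer_agg"
cte_sql = (
    "with customer_agg as (\n"
    "    select customer_id, count(*) as order_count\n"
    "    from stg_orders\n"
    "    group by customer_id\n"
    ")\n"
    "select * from customer_agg"
)
# Swap the placeholder for the real CTE name, then replace the trailing SELECT
query_sql = "SELECT * FROM __cte__ WHERE order_count > 5 LIMIT 10".replace("__cte__", cte_name)
pattern = rf"select \* from {re.escape(cte_name)}$"
print(re.sub(pattern, query_sql, cte_sql, flags=re.IGNORECASE | re.MULTILINE))
# The CTE body is preserved; only the last line changes to:
#   SELECT * FROM customer_agg WHERE order_count > 5 LIMIT 10
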
async def _implementation(
    ctx: Context | None,
    sql: str,
    output_file: str | None,
    output_format: str,
    cte_name: str | None,
    model_name: str | None,
    state: DbtCoreServerContext,
) -> dict[str, Any]:
    """Implementation function for the query_database tool.

    Separated for testing purposes - tests call this directly with explicit state.
    The @dbtTool()-decorated query_database() function calls this with injected dependencies.
    """
    # Ensure dbt components are initialized
    await state.ensure_initialized(ctx, force_parse=False)

    async def progress_callback(current: int, total: int, message: str) -> None:
        if ctx:
            await ctx.report_progress(progress=current, total=total, message=message)

    # Handle a CTE query if cte_name is provided
    if cte_name:
        if not model_name:
            raise ValueError("model_name is required when querying a CTE (cte_name is specified)")

        if not state.project_dir:
            raise ValueError("Project directory not initialized")

        # Get configured model paths
        model_paths = state.get_project_paths()["model-paths"]

        # Extract CTE SQL using the dedicated function
        sql = extract_cte_sql(
            project_dir=state.project_dir,
            cte_name=cte_name,
            model_name=model_name,
            additional_sql=sql,
            model_paths=model_paths,
        )

    # Execute the query using dbt show with --no-populate-cache for optimal performance
    runner = await state.get_runner()
    result = await runner.invoke_query(sql, progress_callback=progress_callback if ctx else None)  # type: ignore

    if not result.success:
        error_msg = str(result.exception) if result.exception else "Unknown error"
        # Include dbt output in the error message for context
        full_error = error_msg

        # Try to extract the error from stdout or stderr
        if result.stdout and "Error" in result.stdout:
            full_error = result.stdout
        elif hasattr(result, "stderr") and result.stderr:
            full_error = result.stderr

        logger.error(f"Query execution failed. Error: {error_msg}, stdout: {result.stdout if hasattr(result, 'stdout') else 'N/A'}")
        raise RuntimeError(f"Query execution failed: {full_error}")

    # Parse JSON output from dbt show (extract the "show" payload)
    output = result.stdout if hasattr(result, "stdout") else ""

    try:
        # dbt show --output json returns: {"show": [...rows...]}
        # Find the JSON object (look for the {"show": pattern)
        json_match = re.search(r'\{\s*"show"\s*:\s*\[', output)
        if not json_match:
            return {
                "status": "failed",
                "error": "No JSON output found in dbt show response",
            }

        # Use JSONDecoder to parse just the first complete JSON object
        # This handles extra data after the JSON (like log lines)
        decoder = json.JSONDecoder()
        data, _ = decoder.raw_decode(output, json_match.start())

        if "show" in data:
            rows = data["show"]
            row_count = len(rows)

            # Get the elapsed time from the result if available, then
            # handle the different output formats
            elapsed_time = result.elapsed_time if hasattr(result, "elapsed_time") and result.elapsed_time is not None else None

            if output_format in ("csv", "tsv"):
                # Convert to CSV/TSV format
                delimiter = "\t" if output_format == "tsv" else ","
                csv_buffer = io.StringIO()

                if rows:
                    writer = csv.DictWriter(csv_buffer, fieldnames=rows[0].keys(), delimiter=delimiter)
                    writer.writeheader()
                    writer.writerows(rows)
                    csv_string = csv_buffer.getvalue()
                else:
                    csv_string = ""

                if output_file:
                    # Save to file
                    output_path = Path(output_file)
                    output_path.parent.mkdir(parents=True, exist_ok=True)

                    with open(output_path, "w", encoding="utf-8", newline="") as f:
                        f.write(csv_string)

                    # Get the file size
                    file_size_bytes = output_path.stat().st_size
                    file_size_kb = file_size_bytes / 1024

                    response: dict[str, Any] = {
                        "status": "success",
                        "row_count": row_count,
                        "format": output_format,
                        "saved_to": str(output_path),
                        "file_size_kb": round(file_size_kb, 2),
                    }
                    if elapsed_time is not None:
                        response["elapsed_time"] = round(elapsed_time, 2)
                    return response
                else:
                    # Return CSV/TSV inline
                    response: dict[str, Any] = {
                        "status": "success",
                        "row_count": row_count,
                        "format": output_format,
                        output_format: csv_string,
                    }
                    if elapsed_time is not None:
                        response["elapsed_time"] = round(elapsed_time, 2)
                    return response
            else:
                # JSON format (default)
                if output_file:
                    # Ensure the directory exists
                    output_path = Path(output_file)
                    output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Write rows to file
                    with open(output_path, "w", encoding="utf-8") as f:
                        json.dump(rows, f, indent=2)

                    # Get the file size
                    file_size_bytes = output_path.stat().st_size
                    file_size_kb = file_size_bytes / 1024

                    # Return metadata with a preview
                    response: dict[str, Any] = {
                        "status": "success",
                        "row_count": row_count,
                        "saved_to": str(output_path),
                        "file_size_kb": round(file_size_kb, 2),
                        "columns": list(rows[0].keys()) if rows else [],
                        "preview": rows[:3],  # First 3 rows as preview
                    }
                    if elapsed_time is not None:
                        response["elapsed_time"] = round(elapsed_time, 2)
                    return response
                else:
                    # Return all rows inline
                    response: dict[str, Any] = {
                        "status": "success",
                        "row_count": row_count,
                        "rows": rows,
                    }
                    if elapsed_time is not None:
                        response["elapsed_time"] = round(elapsed_time, 2)
                    return response
        else:
            return {
                "status": "failed",
                "error": "Unexpected JSON format from dbt show",
                "data": data,
            }

    except json.JSONDecodeError as e:
        return {
            "status": "error",
            "message": f"Failed to parse query results: {e}",
            "raw_output": output[:500],
        }
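
The raw_decode call above matters because dbt's stdout can carry log lines before and after the JSON payload, which would make a plain json.loads fail. A minimal standalone sketch of the same parsing approach on fabricated output:

import json
import re

output = '12:00:00  Running dbt show...\n{"show": [{"id": 1}, {"id": 2}]}\n12:00:01  Done.'
match = re.search(r'\{\s*"show"\s*:\s*\[', output)
data, end = json.JSONDecoder().raw_decode(output, match.start())
assert data == {"show": [{"id": 1}, {"id": 2}]}  # text after index `end` is simply ignored
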
@dbtTool()
async def query_database(
    ctx: Context,
    sql: str,
    output_file: str | None = None,
    output_format: str = "json",
    cte_name: str | None = None,
    model_name: str | None = None,
    state: DbtCoreServerContext = Depends(get_state),
) -> dict[str, Any]:
    """Execute a SQL query against the dbt project's database.

    This tool compiles and runs SQL with Jinja templating support, allowing you to use
    {{ ref('model') }} and {{ source('src', 'table') }} in your queries.

    **SQL Templating**:
    - Use {{ ref('model_name') }} to reference dbt models
    - Use {{ source('source_name', 'table_name') }} to reference source tables
    - dbt compiles these to actual table names before execution

    **CTE Querying (LLM quick reference)**:
    - Always pass `cte_name` + `model_name` as parameters (not in SQL)
    - Always write a normal `SELECT ... FROM __cte__ ...`
    - `__cte__` or `{{ cte }}` is replaced with the CTE name
    - What happens under the hood:
      - Extracts the target CTE plus upstream CTEs from the model
      - Runs your query against that extracted CTE
    - Templating:
      - dbt resolves all `{{ ref() }}` / `{{ source() }}` automatically; no manual table names
    - Invalid syntax to avoid:
      - `{{ ref('model', cte='name') }}` does **not** exist; always use `cte_name` + `model_name`

    **Output Management**:
    - For large result sets (>100 rows), use output_file to save results
    - If output_file is omitted, all data is returned inline (may consume a large context)
    - output_file is automatically created along with its parent directories

    **Output Formats**:
    - json (default): Returns data as a JSON array of objects
    - csv: Returns comma-separated values with a header row
    - tsv: Returns tab-separated values with a header row
    - CSV/TSV formats use proper quoting (only when necessary) and are Excel-compatible

    Args:
        sql: SQL query with Jinja templating: {{ ref('model') }}, {{ source('src', 'table') }}
            For exploratory queries, include LIMIT. For aggregations/counts, omit it.
            When using cte_name/model_name, provide a full `SELECT`/`WITH` query that
            selects from `__cte__` (or `{{ cte }}`), which is replaced with the CTE name.
        output_file: Optional file path to save results. Recommended for large result sets (>100 rows).
            If provided, only metadata is returned (no preview for CSV/TSV).
            If omitted, all data is returned inline (may consume a large context).
        output_format: Output format - "json" (default), "csv", or "tsv"
        cte_name: Optional CTE name to query from a model (requires model_name)
        model_name: Optional model name containing the CTE (required when cte_name is specified)
        state: Shared state object injected by FastMCP

    Returns:
        JSON inline: {"status": "success", "row_count": N, "rows": [...], "elapsed_time": X.XX}
        JSON file: {"status": "success", "row_count": N, "saved_to": "path", "preview": [...], "elapsed_time": X.XX}
        CSV/TSV inline: {"status": "success", "row_count": N, "format": "csv", "csv": "...", "elapsed_time": X.XX}
        CSV/TSV file: {"status": "success", "row_count": N, "format": "csv", "saved_to": "path", "elapsed_time": X.XX}

    Note: elapsed_time is in seconds and represents the total query execution time, including compilation.

    Raises:
        RuntimeError: If query execution fails
        ValueError: If invalid CTE/model parameters are provided

    Examples:
        # Simple query with ref()
        query_database(sql="SELECT * FROM {{ ref('customers') }} LIMIT 10")

        # Query with source()
        query_database(sql="SELECT * FROM {{ source('jaffle_shop', 'orders') }} LIMIT 5")

        # Aggregation (no LIMIT needed)
        query_database(sql="SELECT COUNT(*) as total FROM {{ ref('customers') }}")

        # Query a specific CTE from a model
        query_database(
            cte_name="customer_agg",
            model_name="customers",
            sql="SELECT * FROM __cte__ LIMIT 10"
        )

        # Query a CTE with filtering
        query_database(
            cte_name="customer_agg",
            model_name="customers",
            sql="SELECT * FROM __cte__ WHERE order_count > 5 LIMIT 20"
        )

        # Query a CTE with aggregation (full SELECT)
        query_database(
            cte_name="customer_agg",
            model_name="customers",
            sql="SELECT customer_id, COUNT(*) AS cnt FROM __cte__ GROUP BY customer_id"
        )

        # WRONG - Do NOT use ref() with a cte parameter (it does not exist):
        # query_database(sql="SELECT * FROM {{ ref('model', cte='cte_name') }}")
        #
        # CORRECT - Use the cte_name and model_name parameters instead:
        # query_database(cte_name="cte_name", model_name="model", sql="SELECT * FROM __cte__ LIMIT 10")

        # Save large results to a file
        query_database(
            sql="SELECT * FROM {{ ref('orders') }}",
            output_file="temp_auto/orders_export.json"
        )

        # Export as CSV
        query_database(
            sql="SELECT * FROM {{ ref('customers') }}",
            output_file="temp_auto/customers.csv",
            output_format="csv"
        )
    """
    return await _implementation(ctx, sql, output_file, output_format, cte_name, model_name, state)
+++ dbt_core_mcp/tools/run_models.py
@@ -0,0 +1,234 @@
"""Run dbt models (compile SQL and execute against database).

This module implements the run_models tool for dbt Core MCP.
"""

import logging
from typing import Any

from fastmcp.dependencies import Depends  # type: ignore[reportAttributeAccessIssue]
from fastmcp.server.context import Context

from ..context import DbtCoreServerContext
from ..dependencies import get_state
from . import dbtTool

logger = logging.getLogger(__name__)


async def _implementation(
    ctx: Context | None,
    select: str | None,
    exclude: str | None,
    select_state_modified: bool,
    select_state_modified_plus_downstream: bool,
    full_refresh: bool,
    fail_fast: bool,
    check_schema_changes: bool,
    cache_selected_only: bool,
    state: DbtCoreServerContext,
) -> dict[str, Any]:
    """Implementation function for the run_models tool.

    Separated for testing purposes - tests call this directly with explicit state.
    The @dbtTool()-decorated run_models() function calls this with injected dependencies.
    """
    # Ensure dbt components are initialized
    await state.ensure_initialized(ctx, force_parse=False)

    # Build the selector (state-based if requested, otherwise manual)
    selector = await state.prepare_state_based_selection(select_state_modified, select_state_modified_plus_downstream, select)

    if select_state_modified and not selector:
        raise RuntimeError("No previous state found - cannot determine modifications. Run 'dbt run' or 'dbt build' first to create a baseline state.")

    # Construct dbt CLI args for run
    args = ["run"]

    if cache_selected_only and (select or selector or select_state_modified):
        args.append("--cache-selected-only")

    if selector:
        target_path = state.get_project_paths().get("target-path", "target")
        args.extend(["-s", selector, "--state", f"{target_path}/state_last_run"])
    elif select:
        args.extend(["-s", select])

    if exclude:
        args.extend(["--exclude", exclude])

    if full_refresh:
        args.append("--full-refresh")

    if fail_fast:
        args.append("--fail-fast")

    pre_run_columns: dict[str, list[str]] = {}
    expected_total: int | None = None

    # Pre-run model listing: always needed for accurate progress totals,
    # and doubles as the schema snapshot when check_schema_changes is set
    if check_schema_changes or True:
        list_args = ["list", "--resource-type", "model", "--output", "name"]

        if select_state_modified:
            selector = "state:modified+" if select_state_modified_plus_downstream else "state:modified"
            target_path = state.get_project_paths().get("target-path", "target")
            list_args.extend(["-s", selector, "--state", f"{target_path}/state_last_run"])
        elif select:
            list_args.extend(["-s", select])

        if exclude:
            list_args.extend(["--exclude", exclude])

        logger.info(f"Getting model list: {list_args}")
        runner = await state.get_runner()
        list_result = await runner.invoke(list_args)  # type: ignore

        if list_result.success and list_result.stdout:
            model_count = 0
            for line in list_result.stdout.strip().split("\n"):
                line = line.strip()
                # Skip blanks, JSON, timestamped log lines, and dbt banner output
                if not line or line.startswith("{") or ":" in line[:10] or "Running with dbt=" in line or "Registered adapter:" in line:
                    continue
                model_count += 1

                if check_schema_changes:
                    model_name = line
                    logger.info(f"Querying pre-run columns for {model_name}")
                    cols = await state.get_table_columns_from_db(model_name)
                    if cols:
                        pre_run_columns[model_name] = cols
                    else:
                        pre_run_columns[model_name] = []

            if model_count > 0:
                expected_total = model_count
                logger.info(f"Expected total models to run: {expected_total}")

    logger.info(f"Running dbt models with args: {args}")
    logger.info(f"Expected total for progress: {expected_total}")

    # Stream progress back to the MCP client (if provided)
    async def progress_callback(current: int, total: int, message: str) -> None:
        if ctx:
            await ctx.report_progress(progress=current, total=total, message=message)

    # Clear stale run_results so we parse only fresh output
    state.clear_stale_run_results()

    runner = await state.get_runner()
    result = await runner.invoke(args, progress_callback=progress_callback if ctx else None, expected_total=expected_total)  # type: ignore

    run_results = state.validate_and_parse_results(result, "run")

    schema_changes: dict[str, dict[str, list[str]]] = {}
    if check_schema_changes and pre_run_columns:
        logger.info("Detecting schema changes by comparing pre/post-run database columns")

        for model_name, old_columns in pre_run_columns.items():
            new_columns = await state.get_table_columns_from_db(model_name)

            if not new_columns:
                continue

            added = [c for c in new_columns if c not in old_columns]
            removed = [c for c in old_columns if c not in new_columns] if old_columns else []

            if added or removed:
                schema_changes[model_name] = {}
                if added:
                    schema_changes[model_name]["added"] = added
                if removed:
                    schema_changes[model_name]["removed"] = removed

    if result.success:
        await state.save_execution_state()

    # Summarize final progress back to the caller
    results_list = run_results.get("results", [])
    await state.report_final_progress(ctx, results_list, "Run", "models")

    if not results_list:
        raise RuntimeError(f"No models matched selector: {select or selector or 'all'}")

    response: dict[str, Any] = {
        "status": "success",
        "command": " ".join(args),
        "results": results_list,
        "elapsed_time": run_results.get("elapsed_time"),
    }

    if schema_changes:
        response["schema_changes"] = schema_changes
        response["recommendation"] = "Schema changes detected. Consider re-running with select_state_modified=True and select_state_modified_plus_downstream=True to propagate changes downstream."

    return response
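
For concreteness, the keyword arguments map to assembled dbt CLI invocations roughly as follows (assuming the default "target" path, and that prepare_state_based_selection returns the dbt selector string, as the list-building branch above suggests):

run_models(select="tag:mart", full_refresh=True)
    -> dbt run --cache-selected-only -s tag:mart --full-refresh

run_models(select_state_modified=True, select_state_modified_plus_downstream=True)
    -> dbt run --cache-selected-only -s state:modified+ --state target/state_last_run
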
@dbtTool()
async def run_models(
    ctx: Context,
    select: str | None = None,
    exclude: str | None = None,
    select_state_modified: bool = False,
    select_state_modified_plus_downstream: bool = False,
    full_refresh: bool = False,
    fail_fast: bool = False,
    check_schema_changes: bool = False,
    cache_selected_only: bool = True,
    state: DbtCoreServerContext = Depends(get_state),
) -> dict[str, Any]:
    """Run dbt models (compile SQL and execute against database).

    **What are models**: SQL files (.sql) containing SELECT statements that define data transformations.
    Models are compiled and executed to create/update tables and views in your database.

    **Important**: This tool runs models only (SQL files). For CSV seed files, use load_seeds().
    For running everything together (seeds + models + tests), use build_models().

    State-based selection modes (uses dbt's state:modified selector):
    - select_state_modified: Run only models modified since the last successful run (state:modified)
    - select_state_modified_plus_downstream: Run modified + downstream dependencies (state:modified+)
      Note: Requires select_state_modified=True

    Manual selection (alternative to state-based):
    - select: dbt selector syntax (e.g., "customers", "tag:mart", "stg_*")
    - exclude: Exclude specific models

    Args:
        select: Manual selector (e.g., "customers", "tag:mart", "path:marts/*")
        exclude: Exclude selector (e.g., "tag:deprecated")
        select_state_modified: Use the state:modified selector (changed models only)
        select_state_modified_plus_downstream: Extend to state:modified+ (changed + downstream)
        full_refresh: Force a full refresh of incremental models
        fail_fast: Stop execution on the first failure
        check_schema_changes: Detect schema changes and recommend downstream runs
        cache_selected_only: Only cache schemas for selected models (default True for performance)
        state: Shared state object injected by FastMCP

    Returns:
        Execution results with status, models run, timing info, and optional schema_changes

    See also:
        - load_seeds(): Load CSV files (must run before models that reference them)
        - build_models(): Run models + tests together in DAG order
        - test_models(): Run tests after models complete

    Examples:
        # Run a specific model
        run_models(select="customers")

        # After loading seeds, run dependent models
        load_seeds()
        run_models(select="stg_orders")

        # Incremental: run only what changed
        run_models(select_state_modified=True)

        # Run changed models + everything downstream
        run_models(select_state_modified=True, select_state_modified_plus_downstream=True)

        # Full-refresh marts (rebuild from scratch)
        run_models(select="tag:mart", full_refresh=True)
    """
    return await _implementation(ctx, select, exclude, select_state_modified, select_state_modified_plus_downstream, full_refresh, fail_fast, check_schema_changes, cache_selected_only, state)
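
When check_schema_changes=True and a run alters a table's column set, the response gains a schema_changes block plus a recommendation; a fabricated example of the shape:

run_models(select="customers", check_schema_changes=True)
# -> {
#        "status": "success",
#        "command": "run --cache-selected-only -s customers",
#        "results": [...],
#        "elapsed_time": 12.3,
#        "schema_changes": {"customers": {"added": ["lifetime_value"]}},
#        "recommendation": "Schema changes detected. Consider re-running with ...",
#    }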