mcp-hydrolix 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_hydrolix/main.py +3 -0
- mcp_hydrolix/mcp_env.py +2 -4
- mcp_hydrolix/mcp_server.py +409 -43
- {mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/METADATA +2 -2
- {mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/RECORD +8 -8
- {mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/WHEEL +0 -0
- {mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/entry_points.txt +0 -0
- {mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/licenses/LICENSE +0 -0
mcp_hydrolix/main.py
CHANGED
mcp_hydrolix/mcp_env.py
CHANGED
@@ -64,10 +64,8 @@ class HydrolixConfig:
         self._default_credential: Optional[HydrolixCredential] = None

         # Set the default credential to the service account from the environment, if available
-        if
-            self._default_credential = ServiceAccountToken(
-                global_service_account, f"https://{self.host}/config"
-            )
+        if global_service_account := (os.environ.get("HYDROLIX_TOKEN") or "").strip():
+            self._default_credential = ServiceAccountToken(global_service_account, None)
         elif (global_username := os.environ.get("HYDROLIX_USER")) is not None and (
             global_password := os.environ.get("HYDROLIX_PASSWORD")
         ) is not None:
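A minimal sketch of what the rewritten guard buys (the asserts are illustrative; only the env-var name, the strip pattern, and the `ServiceAccountToken` call come from the diff): the token is read from HYDROLIX_TOKEN, blank or whitespace-only values no longer produce a credential, and the second constructor argument is now None rather than the derived `https://{host}/config` URL.

import os

# Whitespace-only token: the guard is falsy, so configuration falls through
# to the HYDROLIX_USER / HYDROLIX_PASSWORD branch.
os.environ["HYDROLIX_TOKEN"] = "   "
assert not (os.environ.get("HYDROLIX_TOKEN") or "").strip()

# Real token: the stripped value would be passed as ServiceAccountToken(token, None).
os.environ["HYDROLIX_TOKEN"] = " sa-token-123 "
token = (os.environ.get("HYDROLIX_TOKEN") or "").strip()
assert token == "sa-token-123"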
mcp_hydrolix/mcp_server.py
CHANGED
@@ -1,8 +1,7 @@
-import json
 import logging
+import re
 import signal
 from collections.abc import Sequence
-from dataclasses import asdict, is_dataclass
 from typing import Any, Final, Optional, List, cast, TypedDict

 import clickhouse_connect
@@ -32,6 +31,8 @@ from mcp_hydrolix.utils import with_serializer

 @dataclass
 class Column:
+    """Column with enriched metadata: column_category, base_function, merge_function."""
+
     database: str
     table: str
     name: str
@@ -39,17 +40,18 @@ class Column:
     default_kind: Optional[str]
     default_expression: Optional[str]
     comment: Optional[str]
+    column_category: Optional[str] = None  # 'aggregate', 'alias_aggregate', 'dimension'
+    base_function: Optional[str] = None
+    merge_function: Optional[str] = None


 @dataclass
 class Table:
+    """Table with summary table detection (is_summary_table=True if has aggregate columns)."""
+
     database: str
     name: str
     engine: str
-    create_table_query: str
-    dependencies_database: List[str]
-    dependencies_table: List[str]
-    engine_full: str
     sorting_key: str
     primary_key: str
     total_rows: Optional[int]
@@ -57,17 +59,25 @@ class Table:
     total_bytes_uncompressed: Optional[int]
     parts: Optional[int]
     active_parts: Optional[int]
-
-
-
+    columns: Optional[List[Column]] = Field(default_factory=list)
+    is_summary_table: bool = False
+    summary_table_info: Optional[str] = None


-@dataclass
 class HdxQueryResult(TypedDict):
     columns: List[str]
     rows: List[List[Any]]


+@dataclass
+class TableClassification:
+    """Result of table column classification."""
+
+    is_summary_table: bool
+    aggregate_columns: List[Column]
+    dimension_columns: List[Column]
+
+
 MCP_SERVER_NAME = "mcp-hydrolix"
 logger = logging.getLogger(MCP_SERVER_NAME)

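To make the new fields concrete, a minimal sketch (values invented) of an enriched aggregate column as the helper functions added later in this diff would populate it, assuming the unchanged `column_type: str` field sits between `name` and `default_kind`:

# Illustrative only; enrich_column_metadata() fills the last three fields.
col = Column(
    database="sample_db",
    table="trips_summary",
    name="count(vendor_id)",          # literal column name; backtick it in SQL
    column_type="AggregateFunction(count, String)",
    default_kind=None,
    default_expression=None,
    comment=None,
    column_category="aggregate",
    base_function="count",
    merge_function="countMerge",      # query as countMerge(`count(vendor_id)`)
)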
@@ -132,7 +142,7 @@ async def create_hydrolix_client(pool_mgr, request_credential: Optional[Hydrolix
     common.set_setting("invalid_setting_action", "send")
     common.set_setting("autogenerate_session_id", False)

-    pool_kwargs = {
+    pool_kwargs: dict[str, Any] = {
         "maxsize": HYDROLIX_CONFIG.query_pool_size,
         "num_pools": 1,
         "verify": HYDROLIX_CONFIG.verify,
@@ -188,7 +198,7 @@ async def execute_cmd(query: str):
             client_shared_pool, get_request_credential()
         ) as client:
             res = await client.command(query)
-            logger.info("Command
+            logger.info("Command executed successfully.")
             return res
     except Exception as err:
         logger.error(f"Error executing command: {err}")
@@ -217,22 +227,178 @@ def result_to_table(query_columns, result) -> List[Table]:
     return [Table(**dict(zip(query_columns, row))) for row in result]


-
-
+# system.tables query fields for fetching table metadata
+SYSTEM_TABLES_FIELDS = """database, name, engine, sorting_key, primary_key, total_rows, total_bytes,
+total_bytes_uncompressed, parts, active_parts"""
+
+
+# Summary Table Support - Helper Functions
+
+
+def extract_function_from_type(column_type: str) -> Optional[str]:
+    """
+    Extract aggregate function name from AggregateFunction type.
+    Examples:
+        "AggregateFunction(count, String)" -> "count"
+        "AggregateFunction(sumIf, Float64)" -> "sumIf"
+        "AggregateFunction(quantile(0.5), DateTime)" -> "quantile(0.5)"
+        "AggregateFunction(exponentialMovingAverage(0.5), UInt32)" -> "exponentialMovingAverage(0.5)"
+        "SimpleAggregateFunction(sum, Int64)" -> "sum"
+        "String" -> None
+    """
+    # Match everything from AggregateFunction( up to the comma that separates function from types
+    # This captures function names with parameters like quantile(0.5) or quantile(0.5, 0.9)
+    # Pattern: function_name or function_name(params) where params can contain commas
+    match = re.match(r"^(?:Simple)?AggregateFunction\(([^,()]+(?:\([^)]*\))?)", column_type)
+    if match:
+        return match.group(1).strip()
+    return None
+
+
+def get_merge_function(base_function: str) -> str:
+    """
+    Generate -Merge function name from base function.
+    For parameterized functions, parameters go AFTER "Merge":
+        count -> countMerge
+        countIf -> countIfMerge
+        quantile(0.5) -> quantileMerge(0.5)
+        exponentialMovingAverage(0.5) -> exponentialMovingAverageMerge(0.5)
+    """
+    # Check if function has parameters
+    match = re.match(r"^(\w+)(\(.+\))$", base_function)
+    if match:
+        # Parameterized: quantile(0.5) -> quantileMerge(0.5)
+        func_name = match.group(1)
+        params = match.group(2)
+        return f"{func_name}Merge{params}"
+    else:
+        # Non-parameterized: count -> countMerge
+        return f"{base_function}Merge"
+
+
+def classify_table_columns(columns: List[Column]) -> TableClassification:
+    """
+    Classify columns and determine if table is a summary table (has any aggregate columns).
+    Requires columns to be enriched first via enrich_column_metadata().
+    """
+    aggregate_columns = []
+    dimension_columns = []
+
+    for column in columns:
+        if column.column_category in ("aggregate", "alias_aggregate"):
+            aggregate_columns.append(column)
+        else:
+            dimension_columns.append(column)
+
+    return TableClassification(
+        is_summary_table=len(aggregate_columns) > 0,
+        aggregate_columns=aggregate_columns,
+        dimension_columns=dimension_columns,
+    )
+
+
+def enrich_column_metadata(column: Column) -> Column:
+    """
+    Classify column as aggregate, alias_aggregate, or dimension and populate metadata.
+    Sets column_category, base_function, and merge_function fields.
+
+    Detection strategy:
+    1. Check column_type for AggregateFunction/SimpleAggregateFunction (primary method)
+    2. Check if ALIAS wrapping a -Merge function (for user-friendly shortcuts)
+    3. Everything else is a dimension
+
+    Note: In real ClickHouse summary tables, aggregate columns ALWAYS have
+    AggregateFunction or SimpleAggregateFunction types.
+    """
+    type_func = extract_function_from_type(column.column_type)
+    if type_func:
+        column.column_category = "aggregate"
+        column.base_function = type_func
+        column.merge_function = get_merge_function(type_func)
+    elif (
+        column.default_kind == "ALIAS"
+        and column.default_expression
+        and "Merge(" in column.default_expression
+    ):
+        column.column_category = "alias_aggregate"
+        column.base_function = None
+        column.merge_function = None
+    # Everything else is a dimension
+    else:
+        column.column_category = "dimension"

+    return column

-
-
-
-
-
-
-
-
-
-
-
-
+
+async def _populate_table_metadata(database: str, table: Table) -> None:
+    """Fetch and populate table with column metadata from Hydrolix.
+
+    Args:
+        database: Database name
+        table: Table object to enrich with column metadata
+    """
+    # Use DESCRIBE TABLE instead of system.columns to get full AggregateFunction types
+    # system.columns returns simplified types (like "String") but DESCRIBE returns full types
+    # ("AggregateFunction(count, Nullable(String))")
+    # Use backticks for identifiers, not format_query_value which adds quotes for VALUES
+    column_data_query = f"DESCRIBE TABLE `{database}`.`{table.name}`"
+    column_data_query_result = await execute_query(column_data_query)
+
+    # DESCRIBE TABLE returns: name, type, default_type, default_expression, comment, ...
+    # Transform results to Column objects, mapping DESCRIBE TABLE fields to Column dataclass fields
+    column_names = column_data_query_result["columns"]
+    columns = [
+        Column(
+            database=database,
+            table=table.name,
+            name=row_dict.get("name", ""),
+            column_type=row_dict.get("type", ""),
+            default_kind=row_dict.get("default_type", ""),
+            default_expression=row_dict.get("default_expression", ""),
+            comment=row_dict.get("comment", ""),
+        )
+        for row_dict in (dict(zip(column_names, row)) for row in column_data_query_result["rows"])
+    ]
+
+    # Summary Table Support: Enrich column metadata
+    # For each column, detect if it's an aggregate, alias_aggregate, or dimension
+    # and populate column_category, base_function, and merge_function fields.
+    enriched_columns = [enrich_column_metadata(col) for col in columns]
+
+    # Classify table based on enriched column metadata
+    # A table is a summary table if it has ANY aggregate columns
+    classification = classify_table_columns(enriched_columns)
+    is_summary_table = classification.is_summary_table
+
+    # Add human-readable usage guidance for LLMs querying summary tables
+    summary_table_info = None
+    if is_summary_table:
+        num_agg = len(classification.aggregate_columns)
+        num_dim = len(classification.dimension_columns)
+        summary_table_info = (
+            f"This is a SUMMARY TABLE with {num_agg} aggregate column(s) and {num_dim} dimension column(s). "
+            "Aggregate columns (column_category='aggregate') MUST be wrapped in their corresponding -Merge functions. "
+            "ALIAS aggregate columns (column_category='alias_aggregate') are pre-wrapped aggregates - use directly without -Merge. "
+            "Dimension columns (column_category='dimension') can be SELECTed directly and MUST appear in GROUP BY when mixed with aggregates. "
+            "IMPORTANT: Dimension columns may have function-like names (e.g., 'toStartOfHour(col)') - these are LITERAL column names, use them exactly as-is with backticks. "
+            "WRONG: SELECT toStartOfHour(col). RIGHT: SELECT `toStartOfHour(col)`. Also use in GROUP BY: GROUP BY `toStartOfHour(col)`. "
+            "CRITICAL RULE: If your SELECT includes ANY dimension columns (column_category='dimension') "
+            "AND ANY aggregate columns (column_category='aggregate' or 'alias_aggregate'), "
+            "you MUST include 'GROUP BY <all dimension columns from SELECT>'. "
+            "WITHOUT GROUP BY, the query will FAIL with 'NOT_AN_AGGREGATE' error. "
+            "IMPORTANT: ALIAS aggregates (column_category='alias_aggregate') are NOT dimensions - do NOT include them in GROUP BY. "
+            "Example: SELECT reqHost, cnt_all FROM table GROUP BY reqHost (reqHost=dimension, cnt_all=alias_aggregate). "
+            "CRITICAL: You MUST use the EXACT merge_function value from each aggregate column's metadata. "
+            "DO NOT infer the merge function from the column name - always check the merge_function field. "
+            "For example, if column `avgIf(col, condition)` has merge_function='avgIfMerge', "
+            "you MUST use avgIfMerge(`avgIf(col, condition)`), NOT avgMerge(...)."
+        )
+
+    # Populate table object with metadata
+    table.columns = enriched_columns
+    table.is_summary_table = is_summary_table
+    table.summary_table_info = summary_table_info


 @mcp.tool()
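The three helpers above are pure functions, so their contract can be checked standalone. A short hedged sketch following the docstring examples (column values invented):

assert extract_function_from_type("AggregateFunction(quantile(0.5), DateTime)") == "quantile(0.5)"
assert extract_function_from_type("SimpleAggregateFunction(sum, Int64)") == "sum"
assert extract_function_from_type("String") is None

assert get_merge_function("countIf") == "countIfMerge"
assert get_merge_function("quantile(0.5)") == "quantileMerge(0.5)"

# Enrich and classify a tiny, invented column set.
cols = [
    Column(database="db", table="t", name="sum(bytes)",
           column_type="AggregateFunction(sum, UInt64)",
           default_kind=None, default_expression=None, comment=None),
    Column(database="db", table="t", name="reqHost", column_type="String",
           default_kind=None, default_expression=None, comment=None),
]
cols = [enrich_column_metadata(c) for c in cols]
tc = classify_table_columns(cols)
assert tc.is_summary_table
assert cols[0].merge_function == "sumMerge"
assert cols[1].column_category == "dimension"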
@@ -251,17 +417,83 @@ async def list_databases() -> List[str]:
     return databases


+@mcp.tool()
+async def get_table_info(database: str, table: str) -> Table:
+    """Get detailed metadata for a specific table including columns and summary table detection.
+
+    REQUIRED USAGE: Call this tool BEFORE querying ANY table to check if it's a summary
+    table and get column metadata. This is mandatory to avoid query errors.
+
+    This tool provides:
+    - is_summary_table: Boolean indicating if table has pre-aggregated data
+    - columns: List of columns with metadata:
+      - column_category: 'aggregate', 'alias_aggregate', or 'dimension'
+      - merge_function: Exact -Merge function name for aggregate columns (e.g., "countMerge")
+      - column_type: ClickHouse data type
+      - default_expression: For ALIAS columns, shows the underlying expression
+    - summary_table_info: Human-readable description for summary tables
+    - row_count, total_bytes: Table statistics
+
+    WORKFLOW for querying tables:
+    1. Call get_table_info('database', 'table_name')
+    2. Check is_summary_table field
+    3. If is_summary_table=True:
+       - Read column_category and merge_function for each column
+       - Use merge_function to wrap aggregate columns in queries
+       - Example: SELECT countMerge(`count(vendor_id)`) FROM table
+    4. If is_summary_table=False:
+       - Use standard SQL (SELECT count(*), sum(col), etc.)
+    5. Execute query with run_select_query
+
+    For summary tables, aggregate columns MUST be wrapped with their corresponding -Merge functions
+    from the merge_function field. Querying without checking this metadata first will cause errors.
+    """
+    # Fetch table metadata (row counts, sizes, etc.)
+    query = f"""
+        SELECT {SYSTEM_TABLES_FIELDS}
+        FROM system.tables
+        WHERE database = {format_query_value(database)} AND name = {format_query_value(table)}"""
+
+    result = await execute_query(query)
+
+    if not result["rows"]:
+        raise ToolError(f"Table {database}.{table} not found")
+
+    # Create Table object from first (and only) row
+    tables = result_to_table(result["columns"], result["rows"])
+    table_obj = tables[0]
+
+    # Populate table with column metadata
+    await _populate_table_metadata(database, table_obj)
+
+    return table_obj
+
+
 @mcp.tool()
 async def list_tables(
     database: str, like: Optional[str] = None, not_like: Optional[str] = None
 ) -> List[Table]:
-    """List
-
+    """List all tables in a database for exploration and discovery.
+
+    Use this tool to:
+    - Discover what tables exist in a database
+    - Filter tables by name pattern (like/not_like)
+    - Get overview of table metadata (engine, row counts, etc.)
+    - Identify which tables are summary tables (is_summary_table field)
+    - Get complete column metadata including merge_function for aggregates
+
+    Returns complete table information including columns and summary table detection
+    (same metadata as get_table_info but for all tables in the database).
+
+    NOTE: If you already know which specific table you want to query, use
+    get_table_info(database, table) instead - it's faster and returns metadata
+    for just that one table.
+
+    BEFORE querying any table from the results, check is_summary_table and column
+    metadata to build correct queries."""
     logger.info(f"Listing tables in database '{database}'")
     query = f"""
-        SELECT
-            dependencies_table, engine_full, sorting_key, primary_key, total_rows, total_bytes,
-            total_bytes_uncompressed, parts, active_parts, total_marks, comment
+        SELECT {SYSTEM_TABLES_FIELDS}
        FROM system.tables WHERE database = {format_query_value(database)}"""
     if like:
         query += f" AND name LIKE {format_query_value(like)}"
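A hedged sketch of the workflow these docstrings prescribe; `get_table_info` and `run_select_query` stand in for the corresponding MCP tool calls, and the table and column names are invented:

# Conceptual client-side flow, not directly importable code.
async def query_with_metadata_check(database: str, table: str) -> dict:
    info = await get_table_info(database, table)          # step 1: fetch metadata
    if info.is_summary_table:
        # step 2a: wrap an aggregate column in its exact merge_function
        agg = next(c for c in info.columns if c.column_category == "aggregate")
        query = f"SELECT {agg.merge_function}(`{agg.name}`) AS value FROM {database}.{table}"
    else:
        # step 2b: regular table, standard SQL is fine
        query = f"SELECT count(*) AS value FROM {database}.{table} LIMIT 1"
    return await run_select_query(query)                  # step 3: execute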
@@ -274,19 +506,9 @@ async def list_tables(
     # Deserialize result as Table dataclass instances
     tables = result_to_table(result["columns"], result["rows"])

+    # Populate each table with column metadata
     for table in tables:
-        column_data_query = f"""
-        SELECT database, table, name, type AS column_type, default_kind, default_expression, comment
-        FROM system.columns
-        WHERE database = {format_query_value(database)} AND table = {format_query_value(table.name)}"""
-        column_data_query_result = await execute_query(column_data_query)
-        table.columns = [
-            c
-            for c in result_to_column(
-                column_data_query_result["columns"],
-                column_data_query_result["rows"],
-            )
-        ]
+        await _populate_table_metadata(database, table)

     logger.info(f"Found {len(tables)} tables")
     return tables
@@ -298,6 +520,29 @@ async def run_select_query(query: str) -> dict[str, tuple | Sequence[str | Seque
     """Run a SELECT query in a Hydrolix time-series database using the Clickhouse SQL dialect.
     Queries run using this tool will timeout after 30 seconds.

+    MANDATORY PRE-QUERY CHECK - DO THIS FIRST BEFORE EVERY QUERY:
+
+    BEFORE running ANY query on a table, you MUST call get_table_info(database, table_name)
+    to check if it's a summary table and get column metadata.
+
+    WHY: Summary tables require special -Merge functions for aggregate columns. Querying
+    without checking metadata first will cause:
+    - "Nested aggregate function" errors (if you use sum/count/avg instead of -Merge)
+    - "Cannot read AggregateFunction" errors (if you SELECT aggregate columns directly)
+    - Wrong results (if you treat aggregate columns as regular values)
+
+    REQUIRED WORKFLOW (follow this order every time):
+
+    1. FIRST: Call get_table_info('database', 'table_name')
+       - Check is_summary_table field
+       - Read column metadata (column_category, merge_function for each column)
+
+    2. THEN: Build query based on metadata
+       - If is_summary_table=False: use standard SQL (count, sum, avg, etc.)
+       - If is_summary_table=True: follow summary table rules below
+
+    Do NOT skip step 1. Do NOT assume a table is regular/summary without checking.
+
     The primary key on tables queried this way is always a timestamp. Queries should include either
     a LIMIT clause or a filter based on the primary key as a performance guard to ensure they return
     in a reasonable amount of time. Queries should select specific fields and avoid the use of
@@ -312,6 +557,127 @@ async def run_select_query(query: str) -> dict[str, tuple | Sequence[str | Seque
     full-text search whenever possible. When searching for substrings, the syntax `column LIKE
     '%suffix'` or `column LIKE 'prefix%'` should be used.

+    SUMMARY TABLE RULES (only apply if is_summary_table=True from get_table_info):
+
+    Summary tables contain pre-computed aggregations stored in aggregate function state columns.
+    These tables are identified by having columns with aggregate function names like count(...),
+    sum(...), avg(...), countIf(...), sumIf(...), etc.
+
+    CRITICAL RULES for querying summary tables:
+
+    1. Raw aggregate columns (column_category='aggregate') CANNOT be SELECTed directly
+       - They store binary AggregateFunction states, not readable values
+       - Direct SELECT will cause deserialization errors
+       - MUST be wrapped in their -Merge function from get_table_info:
+         - count(vendor_id) → countMerge(`count(vendor_id)`)
+         - sum(bytes_out) → sumMerge(`sum(bytes_out)`)
+         - avg(latitude) → avgMerge(`avg(latitude)`)
+         - countIf(condition) → countIfMerge(`countIf(condition)`)
+       - ALWAYS check column.merge_function in get_table_info to get the exact function name
+       - Use backticks around column names with special characters
+
+    2. Do NOT use standard aggregate functions (sum/count/avg) on summary table columns
+       - WRONG: SELECT sum(count_column) FROM summary_table
+         (causes "nested aggregate function" error)
+       - RIGHT: SELECT countMerge(`count_column`) FROM summary_table
+         (uses the merge function from column metadata)
+
+    3. ALIAS aggregate columns (column_category='alias_aggregate') use directly:
+       - These are pre-defined shortcuts that already wrap -Merge functions
+       - Example: cnt_all (which is defined as ALIAS countMerge(`count()`))
+       - SELECT cnt_all directly, NO additional wrapping needed
+       - These make queries simpler and more readable
+
+    4. Dimension columns (column_category='dimension') - use as-is with backticks:
+       - Reference them exactly as listed in column metadata
+       - Many have function-like names (e.g., `toStartOfMinute(primary_datetime)`)
+       - These are LITERAL column names, not expressions to compute
+       - WRONG: SELECT toStartOfMinute(primary_datetime) (tries to call function on non-existent base column)
+       - RIGHT: SELECT `toStartOfMinute(primary_datetime)` (selects the actual dimension column)
+       - Always use backticks for columns with special characters
+       - Can be used in SELECT, WHERE, GROUP BY, ORDER BY
+       - For time dimensions in WHERE clauses:
+         * Use simple date format: '2022-06-01' (preferred)
+         * Use full timestamp: '2022-06-01 00:00:00' (with seconds)
+         * Do NOT use partial time: '2022-06-01 00:00' (causes parse errors)
+         * Use >= and < for ranges: WHERE col >= '2022-06-01' AND col < '2022-06-02'
+
+    5. CRITICAL: When mixing dimensions and aggregates in SELECT, you MUST use GROUP BY:
+       - SELECT only aggregates → no GROUP BY needed (aggregates entire table)
+         Example: SELECT count_vendor_id FROM table
+       - SELECT dimensions + aggregates → MUST GROUP BY all dimension columns
+         Example: SELECT pickup_dt, count_vendor_id FROM table GROUP BY pickup_dt
+       - Forgetting GROUP BY causes error: "Column X is not under aggregate function and not in GROUP BY"
+
+    6. NEVER use SELECT * on summary tables (will cause deserialization errors)
+
+    7. Aggregate columns can ONLY appear in SELECT:
+       - Raw aggregates: wrapped with -Merge (see column.merge_function)
+       - Alias aggregates: used directly
+       - NEVER in GROUP BY (use dimension columns only)
+
+    Summary table query patterns (after calling get_table_info first):
+
+    Pattern 1: Aggregate entire table
+        -- First: get_table_info('database', 'summary_table')
+        -- Read column.merge_function for count(column_name) = "countMerge"
+        SELECT countMerge(`count(column_name)`) as total FROM database.summary_table
+
+    Pattern 2: Aggregate with grouping by dimension
+        -- First: get_table_info('database', 'summary_table')
+        -- Read merge_function for each aggregate column
+        SELECT time_bucket_column,
+               countMerge(`count(column_name)`) as total,
+               avgMerge(`avg(other_column)`) as avg_value
+        FROM database.summary_table
+        GROUP BY time_bucket_column
+
+    Pattern 2b: Grouping with time range filter
+        -- First: get_table_info('database', 'summary_table')
+        SELECT `toStartOfMinute(datetime_field)` as time_bucket,
+               countMerge(`count(column)`) as total
+        FROM database.summary_table
+        WHERE `toStartOfMinute(datetime_field)` >= '2022-06-01'
+          AND `toStartOfMinute(datetime_field)` < '2022-06-02'
+        GROUP BY `toStartOfMinute(datetime_field)`
+        ORDER BY time_bucket DESC
+
+    Pattern 3: Multiple aggregates (no dimensions, no GROUP BY)
+        -- First: get_table_info('database', 'summary_table')
+        SELECT countMerge(`count(column_name)`) as count_result,
+               sumMerge(`sum(other_column)`) as sum_result
+        FROM database.summary_table
+
+    Pattern 4: Using ALIAS aggregate columns (no dimensions, no GROUP BY)
+        -- First: get_table_info('database', 'summary_table')
+        -- Check which columns have column_category='alias_aggregate'
+        SELECT cnt_all, sum_bytes, avg_value FROM database.summary_table
+        -- No -Merge needed, these are pre-defined aliases
+
+    Pattern 5: ALIAS aggregates with dimensions (requires GROUP BY)
+        -- First: get_table_info('database', 'summary_table')
+        SELECT time_dimension,
+               cnt_all,
+               avg_value
+        FROM database.summary_table
+        GROUP BY time_dimension
+        -- MUST include GROUP BY when mixing dimensions and aggregates
+
+    Pattern 6: Using dimensions with function-like names (common pattern)
+        -- First: get_table_info('database', 'summary_table')
+        -- See dimension column named: toStartOfMinute(primary_datetime)
+        -- WRONG: SELECT toStartOfMinute(primary_datetime) ... (tries to call function)
+        -- RIGHT: Use the literal column name with backticks
+        SELECT `toStartOfMinute(primary_datetime)` as time_bucket,
+               countMerge(`count()`) as cnt,
+               maxMerge(`max(value)`) as max_val
+        FROM database.summary_table
+        GROUP BY `toStartOfMinute(primary_datetime)`
+        ORDER BY time_bucket DESC
+        LIMIT 10
+
+    Regular table examples (non-summary):
+
     Example query. Purpose: get logs from the `application.logs` table. Primary key: `timestamp`.
     Performance guard: 10 minute recency filter.

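Tying rules 1 and 5 together, a minimal runnable sketch (column names invented) of composing a valid summary-table query from enriched column metadata rather than writing the SQL by hand:

# Invented metadata; mirrors the shape produced by enrich_column_metadata().
dims = ["toStartOfHour(reqTimeSec)"]
aggs = {"count()": "countMerge", "sum(bytesOut)": "sumMerge"}

select_dims = [f"`{d}`" for d in dims]
select_aggs = [f"{merge}(`{name}`) AS v{i}" for i, (name, merge) in enumerate(aggs.items())]
query = (
    f"SELECT {', '.join(select_dims + select_aggs)} "
    f"FROM sample_db.summary_table "
    f"GROUP BY {', '.join(select_dims)}"  # required: dimensions mixed with aggregates
)
print(query)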
{mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcp-hydrolix
-Version: 0.1.7
+Version: 0.2.1
 Summary: An MCP server for Hydrolix.
 Project-URL: Home, https://github.com/hydrolix/mcp-hydrolix
 License-Expression: Apache-2.0
@@ -297,7 +297,7 @@ Example `mcpServers` configuration connecting to a remote HTTP server with per-r
 {
   "mcpServers": {
     "mcp-hydrolix-remote": {
-      "url": "
+      "url": "https://my-hydrolix-mcp.example.com/mcp?token=<service-account-token>"
     }
   }
 }
{mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
 mcp_hydrolix/__init__.py,sha256=DnAQkvoFf_QhrDNFLOmn-nHlldPUgtdN33k3xJWthgc,225
-mcp_hydrolix/main.py,sha256=
-mcp_hydrolix/mcp_env.py,sha256=
-mcp_hydrolix/mcp_server.py,sha256=
+mcp_hydrolix/main.py,sha256=W8A6EyGOcqXgkzlcl5dMId_-OenY5zXPYTOvEMl9nvE,2935
+mcp_hydrolix/mcp_env.py,sha256=AZnShwPIgopDVBKqa15z0A7PQ6aqe_rPyuS21BH5Rc8,11480
+mcp_hydrolix/mcp_server.py,sha256=rn5cNVgEd6pJOVUcPvG300kvDUQSUb32aAoyOo2eKL0,30228
 mcp_hydrolix/utils.py,sha256=G7t4lajZIsQOl_oOHUQyEqytsPJpN71WcLkv1cbxsJk,2391
 mcp_hydrolix/auth/__init__.py,sha256=Ui9pLq3Z5tH8X56T_SqACRLEU9zl1gmcONWif-GV1Ko,656
 mcp_hydrolix/auth/credentials.py,sha256=IK8w6TjNxS1K0LCKBt3xXOOI-0ogWCVAkiJuOzEJuJY,1915
@@ -10,8 +10,8 @@ mcp_hydrolix/log/__init__.py,sha256=1K-ycdGrawELMLSBeiqE8bV3-SFJYOE0dD_U3PAP2QM,
 mcp_hydrolix/log/log.py,sha256=6KX0oSz-BbCWUoPxbJED4sZBmbgCHa3KDrc5nYtdks4,1838
 mcp_hydrolix/log/log.yaml,sha256=uQEW_LYSur_C4h0wR_vaYOVKE0an9tXozFMpjeZS5V8,1052
 mcp_hydrolix/log/utils.py,sha256=gOnlo25-sGZydGJmr6T94Pb805RZ9LcZlLCRaVEuUv4,2099
-mcp_hydrolix-0.1.7.dist-info/METADATA,sha256=
-mcp_hydrolix-0.1.7.dist-info/WHEEL,sha256=
-mcp_hydrolix-0.1.7.dist-info/entry_points.txt,sha256=
-mcp_hydrolix-0.1.7.dist-info/licenses/LICENSE,sha256=
-mcp_hydrolix-0.1.7.dist-info/RECORD,,
+mcp_hydrolix-0.2.1.dist-info/METADATA,sha256=6Zz5VlOIZIQZ7o9Jiy1ix0JChNz2dRtTueFSd61FKHw,11125
+mcp_hydrolix-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mcp_hydrolix-0.2.1.dist-info/entry_points.txt,sha256=vHa7F2rOCVu8lpsqR8BYbE1w8ugJSOYwX95w802Y5qE,56
+mcp_hydrolix-0.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mcp_hydrolix-0.2.1.dist-info/RECORD,,
{mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/WHEEL
File without changes

{mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/entry_points.txt
File without changes

{mcp_hydrolix-0.1.7.dist-info → mcp_hydrolix-0.2.1.dist-info}/licenses/LICENSE
File without changes