prismiq 0.1.0__py3-none-any.whl

prismiq/query.py ADDED
@@ -0,0 +1,1233 @@
+"""Query builder for converting QueryDefinition to parameterized SQL.
+
+This module provides the QueryBuilder class that generates safe,
+parameterized SQL queries from QueryDefinition objects.
+"""
+
+from __future__ import annotations
+
+from datetime import date, datetime
+from difflib import get_close_matches
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+
+from prismiq.calculated_fields import ExpressionParser
+from prismiq.types import (
+    AggregationType,
+    ColumnSelection,
+    DatabaseSchema,
+    FilterDefinition,
+    FilterOperator,
+    JoinType,
+    QueryDefinition,
+)
+
+# ============================================================================
+# Validation Models
+# ============================================================================
+
+
+class ValidationError(BaseModel):
+    """Detailed validation error."""
+
+    model_config = ConfigDict(strict=True)
+
+    code: str
+    """Machine-readable error code."""
+
+    message: str
+    """User-friendly error message."""
+
+    field: str | None = None
+    """Path to the problematic field (e.g., 'tables[0].name')."""
+
+    suggestion: str | None = None
+    """Suggested fix."""
+
+
+class ValidationResult(BaseModel):
+    """Complete validation result."""
+
+    model_config = ConfigDict(strict=True)
+
+    valid: bool
+    """Whether the query is valid."""
+
+    errors: list[ValidationError]
+    """List of validation errors (empty if valid)."""
+
+
+# Error codes
+ERROR_TABLE_NOT_FOUND = "TABLE_NOT_FOUND"
+ERROR_COLUMN_NOT_FOUND = "COLUMN_NOT_FOUND"
+ERROR_INVALID_JOIN = "INVALID_JOIN"
+ERROR_TYPE_MISMATCH = "TYPE_MISMATCH"
+ERROR_INVALID_AGGREGATION = "INVALID_AGGREGATION"
+ERROR_EMPTY_QUERY = "EMPTY_QUERY"
+ERROR_CIRCULAR_JOIN = "CIRCULAR_JOIN"
+ERROR_AMBIGUOUS_COLUMN = "AMBIGUOUS_COLUMN"
+ERROR_INVALID_TIME_SERIES = "INVALID_TIME_SERIES"
+
+class QueryBuilder:
+    """Builds parameterized SQL queries from QueryDefinition objects.
+
+    Uses the database schema to validate table and column references,
+    and generates SQL with proper identifier quoting for safety.
+
+    Example:
+        >>> builder = QueryBuilder(schema)
+        >>> sql, params = builder.build(query_definition)
+        >>> # sql: 'SELECT "users"."email" FROM "users" WHERE "users"."id" = $1'
+        >>> # params: [42]
+
+    With schema qualification (only the FROM clause is schema-qualified;
+    column references use the table name or alias):
+        >>> builder = QueryBuilder(schema, schema_name="org_123")
+        >>> sql, params = builder.build(query_definition)
+        >>> # sql: 'SELECT "users"."email" FROM "org_123"."users" ...'
+    """
+
+    def __init__(
+        self,
+        schema: DatabaseSchema,
+        schema_name: str | None = None,
+    ) -> None:
+        """Initialize the query builder.
+
+        Args:
+            schema: Database schema for validation.
+            schema_name: PostgreSQL schema name for schema-qualified table references.
+                If None, tables are referenced without a schema prefix (uses search_path).
+        """
+        self._schema = schema
+        self._schema_name = schema_name
+
+    def validate(self, query: QueryDefinition) -> list[str]:
+        """Validate a query definition against the schema.
+
+        Args:
+            query: Query definition to validate.
+
+        Returns:
+            List of validation error messages (empty if valid).
+
+        Note:
+            This method returns simple string errors for backward compatibility.
+            Use validate_detailed() for richer error information.
+        """
+        result = self.validate_detailed(query)
+        return [err.message for err in result.errors]
+
+    def validate_detailed(self, query: QueryDefinition) -> ValidationResult:
+        """Validate a query definition with detailed error information.
+
+        Args:
+            query: Query definition to validate.
+
+        Returns:
+            ValidationResult with detailed errors including suggestions.
+        """
+        errors: list[ValidationError] = []
+
+        # Build table_id -> table_name mapping
+        table_map: dict[str, str] = {}
+        for qt in query.tables:
+            table_map[qt.id] = qt.name
+
+        # Get all available table names for suggestions
+        available_tables = self._schema.table_names()
+
+        # Validate tables exist in schema
+        for i, qt in enumerate(query.tables):
+            if not self._schema.has_table(qt.name):
+                suggestion = self._suggest_similar(qt.name, available_tables)
+                errors.append(
+                    ValidationError(
+                        code=ERROR_TABLE_NOT_FOUND,
+                        message=f"Table '{qt.name}' not found in schema",
+                        field=f"tables[{i}].name",
+                        suggestion=suggestion,
+                    )
+                )
+
+        # Build set of calculated field names for reference checking
+        calculated_field_names = {cf.name for cf in query.calculated_fields}
+
+        # Validate columns exist in tables
+        for i, col in enumerate(query.columns):
+            table_name = table_map.get(col.table_id)
+            if table_name:
+                table = self._schema.get_table(table_name)
+                if table:
+                    # Allow "*" for COUNT(*) - this is a valid SQL pattern
+                    if col.column == "*" and col.aggregation == AggregationType.COUNT:
+                        continue  # Skip further validation for COUNT(*)
+
+                    # Allow references to calculated fields - they're defined in calculated_fields
+                    if col.column in calculated_field_names:
+                        continue  # Skip further validation for calculated field references
+
+                    if not table.has_column(col.column):
+                        available_columns = [c.name for c in table.columns]
+                        suggestion = self._suggest_similar(col.column, available_columns)
+                        errors.append(
+                            ValidationError(
+                                code=ERROR_COLUMN_NOT_FOUND,
+                                message=f"Column '{col.column}' not found in table '{table_name}'",
+                                field=f"columns[{i}].column",
+                                suggestion=suggestion,
+                            )
+                        )
+                    else:
+                        # Validate aggregation is valid for column type
+                        if col.aggregation != AggregationType.NONE:
+                            column_schema = table.get_column(col.column)
+                            if column_schema:
+                                agg_error = self._validate_aggregation(
+                                    col.aggregation, column_schema.data_type, col.column
+                                )
+                                if agg_error:
+                                    errors.append(
+                                        ValidationError(
+                                            code=ERROR_INVALID_AGGREGATION,
+                                            message=agg_error,
+                                            field=f"columns[{i}].aggregation",
+                                            suggestion=self._suggest_aggregation(
+                                                column_schema.data_type
+                                            ),
+                                        )
+                                    )
+
+        # Validate join columns
+        for i, join in enumerate(query.joins):
+            # From column
+            from_table_name = table_map.get(join.from_table_id)
+            if from_table_name:
+                from_table = self._schema.get_table(from_table_name)
+                if from_table and not from_table.has_column(join.from_column):
+                    available_columns = [c.name for c in from_table.columns]
+                    suggestion = self._suggest_similar(join.from_column, available_columns)
+                    errors.append(
+                        ValidationError(
+                            code=ERROR_INVALID_JOIN,
+                            message=f"Join column '{join.from_column}' not found in table '{from_table_name}'",
+                            field=f"joins[{i}].from_column",
+                            suggestion=suggestion,
+                        )
+                    )
+
+            # To column
+            to_table_name = table_map.get(join.to_table_id)
+            if to_table_name:
+                to_table = self._schema.get_table(to_table_name)
+                if to_table and not to_table.has_column(join.to_column):
+                    available_columns = [c.name for c in to_table.columns]
+                    suggestion = self._suggest_similar(join.to_column, available_columns)
+                    errors.append(
+                        ValidationError(
+                            code=ERROR_INVALID_JOIN,
+                            message=f"Join column '{join.to_column}' not found in table '{to_table_name}'",
+                            field=f"joins[{i}].to_column",
+                            suggestion=suggestion,
+                        )
+                    )
+
+        # Validate filter columns
+        for i, f in enumerate(query.filters):
+            table_name = table_map.get(f.table_id)
+            if table_name:
+                table = self._schema.get_table(table_name)
+                if table:
+                    # Allow references to calculated fields - they're defined in calculated_fields
+                    if f.column in calculated_field_names:
+                        continue  # Skip further validation for calculated field references
+
+                    if not table.has_column(f.column):
+                        available_columns = [c.name for c in table.columns]
+                        suggestion = self._suggest_similar(f.column, available_columns)
+                        errors.append(
+                            ValidationError(
+                                code=ERROR_COLUMN_NOT_FOUND,
+                                message=f"Filter column '{f.column}' not found in table '{table_name}'",
+                                field=f"filters[{i}].column",
+                                suggestion=suggestion,
+                            )
+                        )
+                    else:
+                        # Validate filter value type matches column type
+                        column_schema = table.get_column(f.column)
+                        if column_schema and f.value is not None:
+                            type_error = self._validate_filter_type(
+                                f.operator, f.value, column_schema.data_type, f.column
+                            )
+                            if type_error:
+                                errors.append(
+                                    ValidationError(
+                                        code=ERROR_TYPE_MISMATCH,
+                                        message=type_error,
+                                        field=f"filters[{i}].value",
+                                        suggestion=None,
+                                    )
+                                )
+
+        # Validate order by columns
+        for i, o in enumerate(query.order_by):
+            # Allow references to calculated fields
+            if o.column in calculated_field_names:
+                continue
+
+            table_name = table_map.get(o.table_id)
+            if table_name:
+                table = self._schema.get_table(table_name)
+                if table and not table.has_column(o.column):
+                    available_columns = [c.name for c in table.columns]
+                    suggestion = self._suggest_similar(o.column, available_columns)
+                    errors.append(
+                        ValidationError(
+                            code=ERROR_COLUMN_NOT_FOUND,
+                            message=f"Order by column '{o.column}' not found in table '{table_name}'",
+                            field=f"order_by[{i}].column",
+                            suggestion=suggestion,
+                        )
+                    )
+
+        # Validate time series configuration
+        if query.time_series:
+            ts_errors = self._validate_time_series(query, table_map)
+            errors.extend(ts_errors)
+
+        # Check for circular joins
+        circular_error = self._check_circular_joins(query)
+        if circular_error:
+            errors.append(circular_error)
+
+        return ValidationResult(valid=len(errors) == 0, errors=errors)
+
+    def _validate_time_series(
+        self, query: QueryDefinition, table_map: dict[str, str]
+    ) -> list[ValidationError]:
+        """Validate time series configuration."""
+        errors: list[ValidationError] = []
+
+        if not query.time_series:
+            return errors
+
+        ts = query.time_series
+        table_name = table_map.get(ts.table_id)
+
+        if not table_name:
+            errors.append(
+                ValidationError(
+                    code=ERROR_INVALID_TIME_SERIES,
+                    message=f"Time series table_id '{ts.table_id}' not found",
+                    field="time_series.table_id",
+                    suggestion=None,
+                )
+            )
+            return errors
+
+        table = self._schema.get_table(table_name)
+        if not table:
+            return errors
+
+        # Validate date column exists
+        if not table.has_column(ts.date_column):
+            available_columns = [c.name for c in table.columns]
+            suggestion = self._suggest_similar(ts.date_column, available_columns)
+            errors.append(
+                ValidationError(
+                    code=ERROR_INVALID_TIME_SERIES,
+                    message=f"Date column '{ts.date_column}' not found in table '{table_name}'",
+                    field="time_series.date_column",
+                    suggestion=suggestion,
+                )
+            )
+        else:
+            # Validate column is a date/timestamp type
+            column_schema = table.get_column(ts.date_column)
+            if column_schema:
+                date_types = {
+                    "date",
+                    "timestamp",
+                    "timestamp without time zone",
+                    "timestamp with time zone",
+                    "timestamptz",
+                }
+                is_date_type = any(dt in column_schema.data_type.lower() for dt in date_types)
+                if not is_date_type:
+                    errors.append(
+                        ValidationError(
+                            code=ERROR_INVALID_TIME_SERIES,
+                            message=f"Column '{ts.date_column}' is not a date/timestamp type (found: {column_schema.data_type})",
+                            field="time_series.date_column",
+                            suggestion="Use a column with date, timestamp, or timestamptz type",
+                        )
+                    )
+
+        return errors
+
+    def _suggest_similar(
+        self, name: str, candidates: list[str], max_suggestions: int = 3
+    ) -> str | None:
+        """Find similar names for suggestions."""
+        matches = get_close_matches(
+            name.lower(), [c.lower() for c in candidates], n=max_suggestions, cutoff=0.6
+        )
+        if matches:
+            # Map back to original case
+            original_matches = []
+            for match in matches:
+                for candidate in candidates:
+                    if candidate.lower() == match:
+                        original_matches.append(candidate)
+                        break
+            if len(original_matches) == 1:
+                return f"Did you mean '{original_matches[0]}'?"
+            elif len(original_matches) > 1:
+                return f"Did you mean one of: {', '.join(repr(m) for m in original_matches)}?"
+        return None
+
+    def _validate_aggregation(
+        self, agg: AggregationType, data_type: str, column_name: str
+    ) -> str | None:
+        """Validate that an aggregation is valid for a data type."""
+        # Numeric aggregations
+        numeric_aggs = {AggregationType.SUM, AggregationType.AVG}
+        numeric_types = {
+            "integer",
+            "bigint",
+            "smallint",
+            "numeric",
+            "decimal",
+            "real",
+            "double precision",
+        }
+
+        if agg in numeric_aggs:
+            # Check if type is numeric-ish
+            data_type_lower = data_type.lower()
+            is_numeric = any(nt in data_type_lower for nt in numeric_types)
+            if not is_numeric:
+                return f"Aggregation '{agg.value}' is not valid for column '{column_name}' of type '{data_type}'"
+
+        return None
+
+    def _suggest_aggregation(self, data_type: str) -> str | None:
+        """Suggest valid aggregations for a data type."""
+        data_type_lower = data_type.lower()
+        numeric_types = {
+            "integer",
+            "bigint",
+            "smallint",
+            "numeric",
+            "decimal",
+            "real",
+            "double precision",
+        }
+
+        is_numeric = any(nt in data_type_lower for nt in numeric_types)
+        if is_numeric:
+            return "Valid aggregations for this column: sum, avg, min, max, count"
+        else:
+            return "Valid aggregations for this column: min, max, count"
+
+    def _validate_filter_type(
+        self, operator: FilterOperator, value: Any, data_type: str, column_name: str
+    ) -> str | None:
+        """Validate that a filter value is compatible with the column type."""
+        data_type_lower = data_type.lower()
+
+        # Check for list operators - combined condition
+        if operator in (FilterOperator.IN, FilterOperator.NOT_IN) and not isinstance(value, list):
+            return f"Operator '{operator.value}' requires a list value for column '{column_name}'"
+
+        # Check for between operator - combined condition
+        if operator == FilterOperator.BETWEEN and (
+            not isinstance(value, list | tuple) or len(value) != 2
+        ):
+            return f"Operator 'between' requires a list/tuple of exactly 2 values for column '{column_name}'"
+
+        # Basic numeric type checking
+        numeric_types = {
+            "integer",
+            "bigint",
+            "smallint",
+            "numeric",
+            "decimal",
+            "real",
+            "double precision",
+        }
+        is_numeric_column = any(nt in data_type_lower for nt in numeric_types)
+
+        if is_numeric_column and operator not in (
+            FilterOperator.IS_NULL,
+            FilterOperator.IS_NOT_NULL,
+        ):
+            # For IN/NOT_IN/IN_OR_NULL, check list items (None allowed for IN_OR_NULL)
+            if operator in (
+                FilterOperator.IN,
+                FilterOperator.NOT_IN,
+                FilterOperator.IN_OR_NULL,
+            ) and isinstance(value, list):
+                for v in value:
+                    if v is not None and not isinstance(v, int | float):
+                        return f"Column '{column_name}' is numeric but received non-numeric value in list"
+            elif operator == FilterOperator.BETWEEN and isinstance(value, list | tuple):
+                for v in value:
+                    if not isinstance(v, int | float):
+                        return f"Column '{column_name}' is numeric but received non-numeric value in range"
+            elif not isinstance(value, int | float | list | tuple):
+                return f"Column '{column_name}' is numeric but received non-numeric value"
+
+        return None
+
+    def _check_circular_joins(self, query: QueryDefinition) -> ValidationError | None:
+        """Check for circular join references."""
+        if not query.joins:
+            return None
+
+        # No full graph-cycle detection is done here; for simplicity, we just
+        # check whether any join connects a table to itself
+        for i, join in enumerate(query.joins):
+            if join.from_table_id == join.to_table_id:
+                return ValidationError(
+                    code=ERROR_CIRCULAR_JOIN,
+                    message="Join references the same table on both sides",
+                    field=f"joins[{i}]",
+                    suggestion="A join should connect two different tables",
+                )
+
+        return None
+
+    def sanitize_filters(self, query: QueryDefinition) -> QueryDefinition:
+        """Remove filters that reference non-existent columns.
+
+        Filters referencing columns that don't exist in their target table are
+        silently removed instead of causing validation errors.
+
+        Args:
+            query: Query definition with potentially invalid filters.
+
+        Returns:
+            A new QueryDefinition with invalid filters removed.
+        """
+        if not query.filters:
+            return query
+
+        # Build table_id -> table_name mapping
+        table_map: dict[str, str] = {}
+        for qt in query.tables:
+            table_map[qt.id] = qt.name
+
+        # Build set of calculated field names (these are always valid)
+        calculated_field_names = {cf.name for cf in (query.calculated_fields or [])}
+
+        # Filter out invalid filters
+        valid_filters = []
+        for f in query.filters:
+            table_name = table_map.get(f.table_id)
+            if not table_name:
+                # Unknown table_id - skip this filter
+                continue
+
+            table = self._schema.get_table(table_name)
+            if not table:
+                # Unknown table - skip this filter
+                continue
+
+            # Allow references to calculated fields
+            if f.column in calculated_field_names:
+                valid_filters.append(f)
+                continue
+
+            # Check if column exists in table
+            if table.has_column(f.column):
+                valid_filters.append(f)
+            # else: column doesn't exist - skip this filter silently
+
+        # Return new query with sanitized filters
+        if len(valid_filters) == len(query.filters):
+            return query  # No changes needed
+
+        return query.model_copy(update={"filters": valid_filters})
+
+    def build(self, query: QueryDefinition) -> tuple[str, list[Any]]:
+        """Build a parameterized SQL query.
+
+        Args:
+            query: Query definition to build.
+
+        Returns:
+            Tuple of (sql_string, parameters) where parameters use $1, $2 placeholders.
+        """
+        params: list[Any] = []
+
+        # Build table_id -> table_name mapping for schema lookup
+        table_map: dict[str, str] = {}
+        for qt in query.tables:
+            table_map[qt.id] = qt.name
+
+        # Build table_id -> table reference mapping
+        table_refs = self._build_table_refs(query)
+
+        # Build calculated field SQL map (shared across SELECT, WHERE, ORDER BY)
+        calc_sql_map = self._build_calc_sql_map(query)
+
+        # SELECT clause - with time series support
+        select_clause = self._build_select(query, table_refs, calc_sql_map)
+
+        # FROM clause
+        from_clause = self._build_from(query, table_refs)
+
+        # WHERE clause
+        where_clause, params = self._build_where(
+            query.filters, table_refs, table_map, calc_sql_map, params
+        )
+
+        # GROUP BY clause - with time series support
+        group_by_clause = self._build_group_by(query, table_refs, calc_sql_map)
+
+        # ORDER BY clause - with time series support
+        order_by_clause = self._build_order_by(query, table_refs, calc_sql_map)
+
+        # LIMIT and OFFSET
+        limit_clause = ""
+        if query.limit is not None:
+            params.append(query.limit)
+            limit_clause = f" LIMIT ${len(params)}"
+
+        offset_clause = ""
+        if query.offset is not None:
+            params.append(query.offset)
+            offset_clause = f" OFFSET ${len(params)}"
+
+        # Combine all clauses
+        sql = f"SELECT {select_clause} FROM {from_clause}"
+        if where_clause:
+            sql += f" WHERE {where_clause}"
+        if group_by_clause:
+            sql += f" GROUP BY {group_by_clause}"
+        if order_by_clause:
+            sql += f" ORDER BY {order_by_clause}"
+        sql += limit_clause + offset_clause
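+        # Final shape, with optional clauses included only when present:
+        # SELECT ... FROM ... [WHERE ...] [GROUP BY ...] [ORDER BY ...] [LIMIT $n] [OFFSET $m]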
+
+        return sql, params
+
+    def _build_calc_sql_map(self, query: QueryDefinition) -> dict[str, str]:
+        """Build mapping from calculated field names to their SQL expressions.
+
+        Uses pre-computed sql_expression if available (recommended for inter-field
+        dependency resolution). Otherwise parses the expression on demand.
+
+        Args:
+            query: Query definition containing calculated_fields.
+
+        Returns:
+            Dict mapping calculated field name to SQL expression.
+        """
+        calc_sql_map: dict[str, str] = {}
+
+        # Get base table reference for qualifying unqualified column references.
+        # Prefer alias over name since FROM clause uses alias when present.
+        # This prevents "ambiguous column" errors in multi-table queries.
+        if query.tables:
+            base_table = query.tables[0]
+            base_table_ref = base_table.alias or base_table.name
+        else:
+            base_table_ref = None
+
+        for cf in query.calculated_fields:
+            # Use pre-computed SQL if available (handles inter-field dependencies).
+            # IMPORTANT: sql_expression must be pre-validated and use parameterized
+            # values. It should have all column references fully qualified with the
+            # correct table alias/name to match the FROM clause.
+            if cf.sql_expression:
+                if not cf.sql_expression.strip():
+                    raise ValueError(f"Calculated field '{cf.name}' has empty sql_expression")
+                calc_sql_map[cf.name] = cf.sql_expression
+            elif cf.expression:
+                # Fall back to parsing on demand. This is a secondary code path
+                # that won't resolve inter-field references correctly. Prefer
+                # providing sql_expression from resolve_calculated_fields().
+                try:
+                    parser = ExpressionParser()
+                    ast = parser.parse(cf.expression)
+                    calc_sql_map[cf.name] = ast.to_sql({}, default_table_ref=base_table_ref)
+                except ValueError as e:
+                    # Fail closed: raise a clear error instead of injecting raw text
+                    raise ValueError(
+                        f"Failed to parse calculated field '{cf.name}': {e}. "
+                        f"Expression: {cf.expression!r}"
+                    ) from e
+
+        return calc_sql_map
+
+    def _build_table_refs(self, query: QueryDefinition) -> dict[str, str]:
+        """Build mapping from table_id to quoted table reference."""
+        refs: dict[str, str] = {}
+        for qt in query.tables:
+            if qt.alias:
+                refs[qt.id] = self._quote_identifier(qt.alias)
+            else:
+                refs[qt.id] = self._quote_identifier(qt.name)
+        return refs
+
+    def _build_select(
+        self,
+        query: QueryDefinition,
+        table_refs: dict[str, str],
+        calc_sql_map: dict[str, str],
+    ) -> str:
+        """Build the SELECT clause, including time series bucket if
+        configured."""
+        parts: list[str] = []
+
+        # Add time series bucket column first if configured
+        if query.time_series:
+            ts = query.time_series
+            table_ref = table_refs[ts.table_id]
+            date_col = f"{table_ref}.{self._quote_identifier(ts.date_column)}"
+            date_trunc = f"date_trunc('{ts.interval}', {date_col})"
+
+            # Add alias if specified
+            alias = ts.alias or f"{ts.date_column}_bucket"
+            date_trunc = f"{date_trunc} AS {self._quote_identifier(alias)}"
+
+            parts.append(date_trunc)
+
+        # Add regular columns
+        for col in query.columns:
+            table_ref = table_refs[col.table_id]
+
+            # Handle COUNT(*) specially - don't quote the asterisk
+            if col.column == "*" and col.aggregation == AggregationType.COUNT:
+                col_ref = "COUNT(*)"
+            # Handle column with inline sql_expression (e.g., calculated field)
+            elif col.sql_expression:
+                col_ref = f"({col.sql_expression})"
+
+                # Apply aggregation if specified
+                if col.aggregation != AggregationType.NONE:
+                    col_ref = self._apply_aggregation(col_ref, col.aggregation)
+            # Handle calculated field references - expand to SQL expression
+            elif col.column in calc_sql_map:
+                # Use the converted SQL expression
+                col_ref = f"({calc_sql_map[col.column]})"
+
+                # Apply aggregation if specified
+                if col.aggregation != AggregationType.NONE:
+                    col_ref = self._apply_aggregation(col_ref, col.aggregation)
+            else:
+                col_ref = f"{table_ref}.{self._quote_identifier(col.column)}"
+
+                # Apply date_trunc if specified (for date columns)
+                if col.date_trunc:
+                    col_ref = f"date_trunc('{col.date_trunc}', {col_ref})"
+
+                # Apply aggregation if specified
+                if col.aggregation != AggregationType.NONE:
+                    col_ref = self._apply_aggregation(col_ref, col.aggregation)
+
+            # Apply alias if specified
+            if col.alias:
+                col_ref = f"{col_ref} AS {self._quote_identifier(col.alias)}"
+
+            parts.append(col_ref)
+
+        return ", ".join(parts)
+
+    def _apply_aggregation(self, col_ref: str, agg: AggregationType) -> str:
+        """Apply aggregation function to column reference."""
+        # COUNT(DISTINCT x) uses its own syntax, so it is handled before the
+        # plain function-name lookup below.
+        if agg == AggregationType.COUNT_DISTINCT:
+            return f"COUNT(DISTINCT {col_ref})"
+
+        agg_map = {
+            AggregationType.SUM: "SUM",
+            AggregationType.AVG: "AVG",
+            AggregationType.COUNT: "COUNT",
+            AggregationType.MIN: "MIN",
+            AggregationType.MAX: "MAX",
+        }
+
+        func = agg_map.get(agg, "")
+        if func:
+            return f"{func}({col_ref})"
+
+        return col_ref
+
+    def _build_from(self, query: QueryDefinition, table_refs: dict[str, str]) -> str:
+        """Build the FROM clause including JOINs.
+
+        Uses schema-qualified table names if schema_name is set.
+        """
+        if not query.tables:
+            return ""
+
+        # Track which tables are already in the FROM clause
+        tables_in_from: set[str] = set()
+
+        # First table
+        first_table = query.tables[0]
+        sql = self._quote_table(first_table.name)
+        if first_table.alias:
+            sql += f" AS {self._quote_identifier(first_table.alias)}"
+        tables_in_from.add(first_table.id)
+
+        # Add JOINs
+        for join in query.joins:
+            # Find the table being joined (to_table)
+            to_table = query.get_table_by_id(join.to_table_id)
+            if to_table is None:
+                continue
+
+            join_type = self._join_type_sql(join.join_type)
+            from_ref = table_refs[join.from_table_id]
+            to_ref = table_refs[join.to_table_id]
+
+            table_sql = self._quote_table(to_table.name)
+            if to_table.alias:
+                table_sql += f" AS {self._quote_identifier(to_table.alias)}"
+
+            sql += (
+                f" {join_type} JOIN {table_sql} ON "
+                f"{from_ref}.{self._quote_identifier(join.from_column)} = "
+                f"{to_ref}.{self._quote_identifier(join.to_column)}"
+            )
+            tables_in_from.add(join.to_table_id)
+
+        # Add any remaining tables that aren't joined (creates an implicit cross join).
+        # This handles cases where columns are selected from multiple tables without explicit joins.
+        for qt in query.tables[1:]:
+            if qt.id not in tables_in_from:
+                table_sql = self._quote_table(qt.name)
+                if qt.alias:
+                    table_sql += f" AS {self._quote_identifier(qt.alias)}"
+                sql += f", {table_sql}"
+                tables_in_from.add(qt.id)
+
+        return sql
+
+    def _join_type_sql(self, join_type: JoinType) -> str:
+        """Convert JoinType enum to SQL keyword."""
+        return {
+            JoinType.INNER: "INNER",
+            JoinType.LEFT: "LEFT",
+            JoinType.RIGHT: "RIGHT",
+            JoinType.FULL: "FULL",
+        }.get(join_type, "INNER")
+
+    def _build_where(
+        self,
+        filters: list[FilterDefinition],
+        table_refs: dict[str, str],
+        table_map: dict[str, str],
+        calc_sql_map: dict[str, str],
+        params: list[Any],
+    ) -> tuple[str, list[Any]]:
+        """Build the WHERE clause."""
+        if not filters:
+            return "", params
+
+        conditions: list[str] = []
+        for f in filters:
+            # Handle filter with inline sql_expression (e.g., calculated field)
+            if f.sql_expression:
+                col_ref = f"({f.sql_expression})"
+                # No type coercion for calculated fields (type not known from schema)
+                data_type = None
+            # Handle calculated field references - expand to SQL expression
+            elif f.column in calc_sql_map:
+                col_ref = f"({calc_sql_map[f.column]})"
+                # No type coercion for calculated fields (type not known from schema)
+                data_type = None
+            else:
+                table_ref = table_refs[f.table_id]
+                col_ref = f"{table_ref}.{self._quote_identifier(f.column)}"
+
+                # Get column data type for value coercion
+                table_name = table_map.get(f.table_id)
+                data_type = None
+                if table_name:
+                    table = self._schema.get_table(table_name)
+                    if table:
+                        column = table.get_column(f.column)
+                        if column:
+                            data_type = column.data_type
+
+            condition, params = self._build_condition(col_ref, f, data_type, params)
+            conditions.append(condition)
+
+        return " AND ".join(conditions), params
+
+    def _build_condition(
+        self,
+        col_ref: str,
+        f: FilterDefinition,
+        data_type: str | None,
+        params: list[Any],
+    ) -> tuple[str, list[Any]]:
+        """Build a single filter condition."""
+        op = f.operator
+
+        # Coerce the filter value to the appropriate Python type
+        coerced_value = self._coerce_value(f.value, data_type)
+
+        if op == FilterOperator.EQ:
+            if coerced_value is None:
+                return f"{col_ref} IS NULL", params
+            params.append(coerced_value)
+            return f"{col_ref} = ${len(params)}", params
+
+        if op == FilterOperator.NEQ:
+            if coerced_value is None:
+                return f"{col_ref} IS NOT NULL", params
+            params.append(coerced_value)
+            return f"{col_ref} <> ${len(params)}", params
+
+        if op == FilterOperator.GT:
+            params.append(coerced_value)
+            return f"{col_ref} > ${len(params)}", params
+
+        if op == FilterOperator.GTE:
+            params.append(coerced_value)
+            return f"{col_ref} >= ${len(params)}", params
+
+        if op == FilterOperator.LT:
+            params.append(coerced_value)
+            return f"{col_ref} < ${len(params)}", params
+
+        if op == FilterOperator.LTE:
+            params.append(coerced_value)
+            return f"{col_ref} <= ${len(params)}", params
+
+        if op == FilterOperator.IN:
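+            # An empty IN list can never match and "IN ()" is not valid SQL,
+            # so it short-circuits to the constant FALSE below.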
+            if isinstance(coerced_value, list):
+                if not coerced_value:
+                    return "FALSE", params
+                placeholders: list[str] = []
+                for v in coerced_value:
+                    params.append(v)
+                    placeholders.append(f"${len(params)}")
+                return f"{col_ref} IN ({', '.join(placeholders)})", params
+            params.append(coerced_value)
+            return f"{col_ref} IN (${len(params)})", params
+
+        if op == FilterOperator.NOT_IN:
+            if isinstance(coerced_value, list):
+                if not coerced_value:
+                    return "TRUE", params
+                placeholders = []
+                for v in coerced_value:
+                    params.append(v)
+                    placeholders.append(f"${len(params)}")
+                return f"{col_ref} NOT IN ({', '.join(placeholders)})", params
+            params.append(coerced_value)
+            return f"{col_ref} NOT IN (${len(params)})", params
+
+        if op == FilterOperator.IN_OR_NULL:
+            # Handle mixed selection of concrete values AND NULL
+            # Generates: (col IN (...) OR col IS NULL)
+            if isinstance(coerced_value, list):
+                # Filter out None values - they're handled by the IS NULL clause
+                concrete_values = [v for v in coerced_value if v is not None]
+                if not concrete_values:
+                    # No concrete values (empty list or list of only None values)
+                    return f"{col_ref} IS NULL", params
+                placeholders = []
+                for v in concrete_values:
+                    params.append(v)
+                    placeholders.append(f"${len(params)}")
+                return (
+                    f"({col_ref} IN ({', '.join(placeholders)}) OR {col_ref} IS NULL)",
+                    params,
+                )
+            # Single non-list value
+            if coerced_value is None:
+                # Single None value - just IS NULL
+                return f"{col_ref} IS NULL", params
+            params.append(coerced_value)
+            return f"({col_ref} IN (${len(params)}) OR {col_ref} IS NULL)", params
+
+        if op == FilterOperator.LIKE:
+            params.append(coerced_value)
+            return f"{col_ref} LIKE ${len(params)}", params
+
+        if op == FilterOperator.ILIKE:
+            params.append(coerced_value)
+            return f"{col_ref} ILIKE ${len(params)}", params
+
+        if op == FilterOperator.NOT_LIKE:
+            params.append(coerced_value)
+            return f"{col_ref} NOT LIKE ${len(params)}", params
+
+        if op == FilterOperator.NOT_ILIKE:
+            params.append(coerced_value)
+            return f"{col_ref} NOT ILIKE ${len(params)}", params
+
+        if op == FilterOperator.BETWEEN:
+            if isinstance(coerced_value, list | tuple) and len(coerced_value) == 2:
+                params.append(coerced_value[0])
+                p1 = len(params)
+                params.append(coerced_value[1])
+                p2 = len(params)
+                return f"{col_ref} BETWEEN ${p1} AND ${p2}", params
+            # Invalid BETWEEN value - raise error instead of silent fallback
+            value_desc = (
+                f"{len(coerced_value)} values"
+                if isinstance(coerced_value, list | tuple)
+                else type(coerced_value).__name__
+            )
+            raise ValueError(
+                f"BETWEEN filter on column '{f.column}' requires exactly 2 values, got {value_desc}"
+            )
+
+        if op == FilterOperator.IS_NULL:
+            return f"{col_ref} IS NULL", params
+
+        if op == FilterOperator.IS_NOT_NULL:
+            return f"{col_ref} IS NOT NULL", params
+
+        if op == FilterOperator.IN_SUBQUERY:
+            # For subquery filters (used in RLS filtering).
+            # SECURITY: The SQL in value["sql"] is interpolated directly without
+            # parameterization. Callers MUST ensure the SQL is safely generated
+            # (e.g., from trusted internal code, not user input). This is by design
+            # since subqueries cannot be parameterized.
+            if not isinstance(f.value, dict):
+                raise ValueError(
+                    f"IN_SUBQUERY filter on column '{f.column}' requires "
+                    f"value={{'sql': '...'}}, got {type(f.value).__name__}"
+                )
+            if "sql" not in f.value:
+                raise ValueError(
+                    f"IN_SUBQUERY filter on column '{f.column}' requires "
+                    f"value={{'sql': '...'}}, missing 'sql' key"
+                )
+            subquery_sql = f.value["sql"].strip()
+            if not subquery_sql:
+                raise ValueError(f"IN_SUBQUERY filter on column '{f.column}' has empty SQL")
+            return f"{col_ref} IN ({subquery_sql})", params
+
+        # Unknown operator - raise error instead of silent fallback
+        raise ValueError(f"Unknown filter operator: {op}")
+
+    def _build_group_by(
+        self,
+        query: QueryDefinition,
+        table_refs: dict[str, str],
+        calc_sql_map: dict[str, str],
+    ) -> str:
+        """Build the GROUP BY clause, including time series bucket if
+        configured."""
+        group_by_parts: list[str] = []
+
+        # Add time series bucket to GROUP BY if present
+        if query.time_series:
+            ts = query.time_series
+            table_ref = table_refs[ts.table_id]
+            date_col = f"{table_ref}.{self._quote_identifier(ts.date_column)}"
+            group_by_parts.append(f"date_trunc('{ts.interval}', {date_col})")
+
+        # Build set of calculated fields that have internal aggregation
+        calc_fields_with_agg = {
+            cf.name for cf in query.calculated_fields if cf.has_internal_aggregation
+        }
+
+        # Build lookup from (table_id, column) to column selection for date_trunc/sql_expression
+        column_lookup: dict[tuple[str, str], ColumnSelection] = {
+            (col.table_id, col.column): col for col in query.columns
+        }
+
+        # Add regular GROUP BY columns
+        group_by_cols = query.derive_group_by()
+        for g in group_by_cols:
+            # Skip calculated fields that have internal aggregation
+            # These fields contain SUM, COUNT, etc. and should NOT be in GROUP BY
+            if g.column in calc_fields_with_agg:
+                continue
+
+            # Look up the column selection to check for date_trunc/sql_expression
+            col_sel = column_lookup.get((g.table_id, g.column))
+
+            # Handle column with inline sql_expression (e.g., calculated field)
+            if col_sel and col_sel.sql_expression:
+                group_by_parts.append(f"({col_sel.sql_expression})")
+            # Handle calculated field references - expand to SQL expression
+            elif g.column in calc_sql_map:
+                group_by_parts.append(f"({calc_sql_map[g.column]})")
+            else:
+                table_ref = table_refs[g.table_id]
+                col_ref = f"{table_ref}.{self._quote_identifier(g.column)}"
+
+                # Apply date_trunc if specified (must match SELECT clause)
+                if col_sel and col_sel.date_trunc:
+                    col_ref = f"date_trunc('{col_sel.date_trunc}', {col_ref})"
+
+                group_by_parts.append(col_ref)
+
+        # When a time series bucket is configured, group_by_parts already
+        # contains the bucket expression, so a GROUP BY is always emitted;
+        # otherwise an empty list means there is nothing to group by.
+        if not group_by_parts:
+            return ""
+
+        return ", ".join(group_by_parts)
+
+    def _build_order_by(
+        self,
+        query: QueryDefinition,
+        table_refs: dict[str, str],
+        calc_sql_map: dict[str, str],
+    ) -> str:
+        """Build the ORDER BY clause, adding time series bucket if
+        configured."""
+        parts: list[str] = []
+
+        # Build lookup from (table_id, column) to column selection for date_trunc
+        # Only include non-aggregated columns since those are the ones with date_trunc
+        column_lookup: dict[tuple[str, str], ColumnSelection] = {
+            (col.table_id, col.column): col
+            for col in query.columns
+            if col.aggregation == AggregationType.NONE
+        }
+
+        # If time series is present and no explicit order by, order by date bucket
+        if query.time_series and not query.order_by:
+            ts = query.time_series
+            table_ref = table_refs[ts.table_id]
+            date_col = f"{table_ref}.{self._quote_identifier(ts.date_column)}"
+            parts.append(f"date_trunc('{ts.interval}', {date_col}) ASC")
+        else:
+            # Use explicit order by
+            for o in query.order_by:
+                # Handle calculated field references - expand to SQL expression
+                if o.column in calc_sql_map:
+                    col_ref = f"({calc_sql_map[o.column]})"
+                else:
+                    table_ref = table_refs[o.table_id]
+                    col_ref = f"{table_ref}.{self._quote_identifier(o.column)}"
+
+                # Apply date_trunc if the column has it (must match SELECT/GROUP BY)
+                col_sel = column_lookup.get((o.table_id, o.column))
+                if col_sel and col_sel.date_trunc:
+                    col_ref = f"date_trunc('{col_sel.date_trunc}', {col_ref})"
+
+                parts.append(f"{col_ref} {o.direction.value}")
+
+        return ", ".join(parts)
+
+    def _coerce_value(self, value: Any, data_type: str | None) -> Any:
+        """Coerce a filter value to the appropriate Python type for asyncpg.
+
+        asyncpg requires Python date/datetime objects for date/timestamp columns,
+        not strings. This method converts string values to appropriate Python types
+        based on the column's data type.
+
+        Args:
+            value: The filter value (may be a string, list, or other type).
+            data_type: The PostgreSQL data type of the column (e.g., 'date', 'timestamp').
+
+        Returns:
+            The value coerced to the appropriate Python type.
+        """
+        if value is None or data_type is None:
+            return value
+
+        data_type_lower = data_type.lower()
+
+        # Check if this is a date/timestamp column
+        date_types = {"date"}
+        timestamp_types = {
+            "timestamp",
+            "timestamp without time zone",
+            "timestamp with time zone",
+            "timestamptz",
+        }
+
+        is_date = (
+            any(dt in data_type_lower for dt in date_types) and "timestamp" not in data_type_lower
+        )
+        is_timestamp = any(dt in data_type_lower for dt in timestamp_types)
+
+        if not is_date and not is_timestamp:
+            return value
+
+        # Handle list values (for IN, NOT_IN, BETWEEN)
+        if isinstance(value, list):
+            return [self._coerce_single_date_value(v, is_date) for v in value]
+
+        if isinstance(value, tuple):
+            return tuple(self._coerce_single_date_value(v, is_date) for v in value)
+
+        return self._coerce_single_date_value(value, is_date)
+
+    def _coerce_single_date_value(self, value: Any, is_date: bool) -> Any:
+        """Coerce a single value to date or datetime.
+
+        Args:
+            value: The value to coerce.
+            is_date: True for date columns, False for timestamp columns.
+
+        Returns:
+            Python date or datetime object, or the original value if not a string/date type.
+
+        Raises:
+            ValueError: If a string value cannot be parsed as a valid date/datetime.
+        """
+        # Already the correct type
+        if isinstance(value, datetime):
+            return value.date() if is_date else value
+        if isinstance(value, date):
+            return value if is_date else datetime.combine(value, datetime.min.time())
+
+        # Try to parse string values
+        if isinstance(value, str):
+            expected_type = "date" if is_date else "datetime"
+            try:
+                # Try ISO format with time first (e.g., "2026-01-01T00:00:00")
+                if "T" in value or " " in value:
+                    # Handle both 'T' separator and space separator
+                    dt = datetime.fromisoformat(value.replace(" ", "T"))
+                    return dt.date() if is_date else dt
+                # Date-only format (e.g., "2026-01-01")
+                dt = datetime.strptime(value, "%Y-%m-%d")
+                return dt.date() if is_date else dt
+            except ValueError as e:
+                raise ValueError(
+                    f"Invalid {expected_type} value: {value!r}. "
+                    f"Expected ISO format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)."
+                ) from e
+
+        return value
+
+    def _quote_identifier(self, identifier: str) -> str:
+        """Quote a SQL identifier to prevent injection.
+
+        Args:
+            identifier: Column or table name.
+
+        Returns:
+            Quoted identifier (e.g., "column_name").
+        """
+        # Escape any existing double quotes
+        escaped = identifier.replace('"', '""')
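+        # e.g. 'total "usd"' -> '"total ""usd"""'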
+        return f'"{escaped}"'
+
+    def _quote_table(self, table_name: str) -> str:
+        """Quote a table name with optional schema qualification.
+
+        Args:
+            table_name: Name of the table.
+
+        Returns:
+            Schema-qualified table name if schema_name is set,
+            otherwise just the quoted table name.
+            E.g., "org_123"."users" or just "users"
+        """
+        quoted_table = self._quote_identifier(table_name)
+        if self._schema_name:
+            return f"{self._quote_identifier(self._schema_name)}.{quoted_table}"
+        return quoted_table