PostBOUND 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. postbound/__init__.py +211 -0
  2. postbound/_base.py +6 -0
  3. postbound/_bench.py +1012 -0
  4. postbound/_core.py +1153 -0
  5. postbound/_hints.py +1373 -0
  6. postbound/_jointree.py +1079 -0
  7. postbound/_pipelines.py +1121 -0
  8. postbound/_qep.py +1986 -0
  9. postbound/_stages.py +876 -0
  10. postbound/_validation.py +734 -0
  11. postbound/db/__init__.py +72 -0
  12. postbound/db/_db.py +2348 -0
  13. postbound/db/_duckdb.py +785 -0
  14. postbound/db/mysql.py +1195 -0
  15. postbound/db/postgres.py +4216 -0
  16. postbound/experiments/__init__.py +12 -0
  17. postbound/experiments/analysis.py +674 -0
  18. postbound/experiments/benchmarking.py +54 -0
  19. postbound/experiments/ceb.py +877 -0
  20. postbound/experiments/interactive.py +105 -0
  21. postbound/experiments/querygen.py +334 -0
  22. postbound/experiments/workloads.py +980 -0
  23. postbound/optimizer/__init__.py +92 -0
  24. postbound/optimizer/__init__.pyi +73 -0
  25. postbound/optimizer/_cardinalities.py +369 -0
  26. postbound/optimizer/_joingraph.py +1150 -0
  27. postbound/optimizer/dynprog.py +1825 -0
  28. postbound/optimizer/enumeration.py +432 -0
  29. postbound/optimizer/native.py +539 -0
  30. postbound/optimizer/noopt.py +54 -0
  31. postbound/optimizer/presets.py +147 -0
  32. postbound/optimizer/randomized.py +650 -0
  33. postbound/optimizer/tonic.py +1479 -0
  34. postbound/optimizer/ues.py +1607 -0
  35. postbound/qal/__init__.py +343 -0
  36. postbound/qal/_qal.py +9678 -0
  37. postbound/qal/formatter.py +1089 -0
  38. postbound/qal/parser.py +2344 -0
  39. postbound/qal/relalg.py +4257 -0
  40. postbound/qal/transform.py +2184 -0
  41. postbound/shortcuts.py +70 -0
  42. postbound/util/__init__.py +46 -0
  43. postbound/util/_errors.py +33 -0
  44. postbound/util/collections.py +490 -0
  45. postbound/util/dataframe.py +71 -0
  46. postbound/util/dicts.py +330 -0
  47. postbound/util/jsonize.py +68 -0
  48. postbound/util/logging.py +106 -0
  49. postbound/util/misc.py +168 -0
  50. postbound/util/networkx.py +401 -0
  51. postbound/util/numbers.py +438 -0
  52. postbound/util/proc.py +107 -0
  53. postbound/util/stats.py +37 -0
  54. postbound/util/system.py +48 -0
  55. postbound/util/typing.py +35 -0
  56. postbound/vis/__init__.py +5 -0
  57. postbound/vis/fdl.py +69 -0
  58. postbound/vis/graphs.py +48 -0
  59. postbound/vis/optimizer.py +538 -0
  60. postbound/vis/plots.py +84 -0
  61. postbound/vis/tonic.py +70 -0
  62. postbound/vis/trees.py +105 -0
  63. postbound-0.19.0.dist-info/METADATA +355 -0
  64. postbound-0.19.0.dist-info/RECORD +67 -0
  65. postbound-0.19.0.dist-info/WHEEL +5 -0
  66. postbound-0.19.0.dist-info/licenses/LICENSE.txt +202 -0
  67. postbound-0.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2344 @@
1
+ """The parser constructs `SqlQuery` objects from query strings.
2
+
3
+ Other than the parsing itself, the process will also execute a basic column binding process. For example, consider
4
+ a query like *SELECT \\* FROM R WHERE R.a = 42*. In this case, the binding only affects the column reference *R.a*
5
+ and sets the table of that column to *R*. This binding based on column and table names is always performed.
6
+
7
+ If the table cannot be inferred based on the column name (e.g. for a query like *SELECT * FROM R, S WHERE a = 42*), a
8
+ second binding phase can be executed. This binding needs a working database connection and queries the database schema
9
+ to detect the correct tables for each column. Whether the second phase should also be executed by default can be
10
+ configured system-wide by setting the `auto_bind_columns` variable.
11
+
12
+ Notes
13
+ -----
14
+ Please beware that SQL parsing is a very challenging undertaking and there might be bugs in some lesser-used features.
15
+ If you encounter any issues, please report them on the GitHub issue tracker.
16
+ We test the parser based on some popular benchmarks, namely JOB and Stats to ensure that result sets from the raw SQL queries
17
+ match result sets from the parsed queries. However, we cannot guarantee that the parser will work for all SQL queries.
18
+
19
+ The parsing itself is based on the pglast project that implements a SQL -> JSON/dict conversion, based on the actual Postgres
20
+ query parser. Our parser implementation takes such a JSON representation as input and generates the more verbose structures of
21
+ the qal. There exists a Jupyter notebook called *PglastParsingTests* in the *tests* directory that shows the output emitted by
22
+ pglast for different SQL query features.
23
+
24
+ References
25
+ ----------
26
+
27
+ .. pglast project: https://github.com/lelit/pglast Thanks a lot for maintaining this fantastic tool and the great support!
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import collections
33
+ import json
34
+ import warnings
35
+ from collections.abc import Iterable
36
+ from typing import Literal, Optional, overload
37
+
38
+ import pglast
39
+
40
+ from .. import util
41
+ from .._core import ColumnReference, DBCatalog, TableReference
42
+ from ._qal import (
43
+ AbstractPredicate,
44
+ ArrayAccessExpression,
45
+ BaseClause,
46
+ BaseProjection,
47
+ BetweenPredicate,
48
+ BinaryPredicate,
49
+ CaseExpression,
50
+ CastExpression,
51
+ ColumnExpression,
52
+ CommonTableExpression,
53
+ CompoundOperator,
54
+ CompoundPredicate,
55
+ DirectTableSource,
56
+ Explain,
57
+ ExplicitFromClause,
58
+ From,
59
+ FunctionExpression,
60
+ FunctionTableSource,
61
+ GroupBy,
62
+ Having,
63
+ Hint,
64
+ ImplicitFromClause,
65
+ InPredicate,
66
+ JoinTableSource,
67
+ JoinType,
68
+ Limit,
69
+ LogicalOperator,
70
+ MathExpression,
71
+ MathOperator,
72
+ OrderBy,
73
+ OrderByExpression,
74
+ Select,
75
+ SelectStatement,
76
+ SetOperator,
77
+ SetQuery,
78
+ SqlExpression,
79
+ SqlOperator,
80
+ SqlQuery,
81
+ StarExpression,
82
+ StaticValueExpression,
83
+ SubqueryExpression,
84
+ SubqueryTableSource,
85
+ TableSource,
86
+ UnaryPredicate,
87
+ ValuesList,
88
+ ValuesTableSource,
89
+ ValuesWithQuery,
90
+ Where,
91
+ WindowExpression,
92
+ WithQuery,
93
+ build_query,
94
+ )
95
+
96
# Module-wide switch for the second (catalog-based) binding phase described in the
# module docstring. Presumably read by the parse entry points in this module -- it is
# only declared here, so callers should treat it as a process-global default.
auto_bind_columns: bool = True
"""Indicates whether the parser should use the database catalog to obtain column bindings."""
98
+
99
+
100
class SchemaCache:
    """A simple cache that stores the columns that belong to tables in our database schema.

    The cache only queries the actual catalog of the database system, if the requested table has not been cached, yet.

    Parameters
    ----------
    schema : Optional[DBCatalog]
        The catalog to cache. If not provided, the cache cannot resolve column bindings.
    """

    def __init__(self, schema: Optional[DBCatalog] = None) -> None:
        self._schema = schema
        # Plain dict instead of the former defaultdict(set): values are
        # (ordered column list, column set) tuples that are only ever inserted
        # explicitly by _inflate_cache. A set-producing default factory would
        # silently create entries of the wrong shape on accidental indexed access.
        self._lookup_cache: dict[TableReference, tuple[list[str], set[str]]] = {}

    def initialize_with(self, schema: Optional[DBCatalog]) -> None:
        """Sets the catalog if necessary, dropping all cached lookups when the schema changes."""
        if self._schema is not None and self._schema != schema:
            warnings.warn("Parsing query for new schema. Dropping old schema cache.")
            self._schema = schema
            self._lookup_cache.clear()
        elif self._schema is not None:
            # same schema as before, do nothing
            return
        self._schema = schema

    def lookup_column(
        self, colname: str, candidate_tables: Iterable[TableReference]
    ) -> Optional[TableReference]:
        """Resolves the table that defines a specific column.

        If no catalog is available, this method will always return *None*.

        Parameters
        ----------
        colname : str
            The (unqualified) name of the column to resolve.
        candidate_tables : Iterable[TableReference]
            The tables that could potentially define the column. Virtual tables are skipped
            since the catalog cannot know them.

        Returns
        -------
        Optional[TableReference]
            The table that defines the column. If there are multiple tables that could define the column, an arbitrary one
            is returned. If none of the candidates is the correct table, *None* is returned.
        """
        if not self._schema:
            return None

        for candidate in candidate_tables:
            if candidate.virtual:
                # virtual tables (CTEs, subqueries) are not part of the physical catalog
                continue

            _, table_columns = self._inflate_cache(candidate)
            if colname in table_columns:
                return candidate

        return None

    def columns_of(self, table: TableReference) -> list[str]:
        """Provides the columns that belong to a specific table.

        If no catalog is available, this method will always return an empty list.
        """
        if not self._schema:
            return []

        cols, _ = self._inflate_cache(table)
        return cols

    def _inflate_cache(self, table: TableReference) -> tuple[list[str], set[str]]:
        """Provides the columns that belong to a specific table, consulting the online catalog if necessary.

        This method assumes that there is indeed an online schema available. Calling this method without a schema will
        result in an arbitrary runtime error.

        Returns
        -------
        tuple[list[str], set[str]]
            The columns of the table in their defined order, as well as the same columns as a set.
        """
        cached_res = self._lookup_cache.get(table)
        if cached_res:
            return cached_res
        cols: list[str] = [col.name for col in self._schema.columns(table)]
        cols_set = set(cols)
        self._lookup_cache[table] = cols, cols_set
        return cols, cols_set
183
+
184
+
185
+ class QueryNamespace:
186
+ """The query namespace acts as the central service to resolve column bindings in a query.
187
+
188
+ It maintains a visibility map of all tables at a given point in the query and keeps track of the columns that form the
189
+ result relation at the same points in time. This information is used to bind column references to the correct tables,
190
+ including temporary virtual tables that alias existing physical columns.
191
+
192
+ The namespace protocol works as follows:
193
+
194
+ - While parsing a query, the table sources (CTEs and FROM entries) should be handled first. Each source should be
195
+ registered in the namespace using the `register_table` method.
196
+ - When a subquery or CTE is encoutered, the `open_nested` method has to be called to open a new local namespace and
197
+ track the virtual table correctly.
198
+ - Once all tables are registered, the parser can handle the *SELECT* clause. Afterwards, `determine_output_shape` has to
199
+ called to compute all columns that are part of the result relation of the current namespace. This method takes care
200
+ of resolving *SELECT \\** operations as necessary and requires that all input sources have already been registered and
201
+ completely parsed, such that their output shapes are known.
202
+ - While parsing the different clauses of the query, `lookup_column` and `resolve_table` can be used to determine the
203
+ correct table references based on the sources that are currently available in the namespace.
204
+
205
+ Each namespace can be connected to a parent namespace, which in turn can provide additional CTEs, physical tables or
206
+ subqueries (if the current namespace is for a LATERAL subquery). This allows the current namespace to check whether some
207
+ column is actually provided by an outer scope if the namespace does not provide the column itself.
208
+ """
209
+
210
+ _schema_cache: SchemaCache = SchemaCache()
211
+ """The schema cache that is used to resolve column bindings. This cache is shared through the entire program lifetime.
212
+
213
+ Changing the actual database schema while PostBOUND is running will result in undefined behavior.
214
+ """
215
+
216
+ @staticmethod
217
+ def empty(schema: Optional["DatabaseSchema"] = None) -> QueryNamespace: # type: ignore # noqa: F821
218
+ QueryNamespace._schema_cache.initialize_with(schema)
219
+ return QueryNamespace()
220
+
221
+ def __init__(self, *, parent: Optional[QueryNamespace] = None) -> None:
222
+ self._parent = parent
223
+
224
+ self._subquery_children: dict[str, QueryNamespace] = {}
225
+ """Nested namespaces that are provided as part of subqueries. Entries map alias -> query."""
226
+
227
+ self._setop_children: list[QueryNamespace, QueryNamespace] = []
228
+ """Namespace of the queries that form a set operation in the current namespace."""
229
+
230
+ self._current_ctx: list[TableReference] = []
231
+ """The tables that are currently in scope, no matter their origin (CTEs or FROM clause).
232
+
233
+ For the purpose of this dictionary, it does not matter where a table comes from (physical table, CTE, subquery, ...).
234
+ The only thing that matters is that the table is part of the FROM clause. This is especially important to build the
235
+ correct output shape of the namespace/relation if the SELECT clause contains * expressions.
236
+
237
+ Therefore, there might be tables that are contained in the `_cte_sources`, but not here (if the CTE is only used to
238
+ build other CTEs, but not part of the FROM clause itself).
239
+
240
+ Notice that tables that are defined in an enclosing scope (e.g. outer query in a sequence of nested CTEs) are not
241
+ contained in this context if they are not also part of this namespace's *FROM* clause. Instead, they are resolved
242
+ through the API on the parent namespace.
243
+
244
+ The ordering is important to resolve output columns correctly to the first match as Postgres does.
245
+
246
+ An optimized container to check whether a table is part of the current context is available via `_table_sources`
247
+ """
248
+
249
+ self._cte_sources: dict[str, QueryNamespace] = {}
250
+ """Namespaces that are induced by CTEs. Entries map alias -> CTE."""
251
+
252
+ self._table_sources: dict[str, TableReference] = {}
253
+ """The tables that are part of the FROM clause of the query. Entries map alias -> table.
254
+
255
+ Tables can be contained in this dictionary multiple times: once for each relevant identifier. If a table has both an
256
+ alias as well as a full name, both keys will be present.
257
+ """
258
+
259
+ self._output_shape: list[str] = []
260
+ """The column names that are part of the result set produced by the queries in this namespace.
261
+
262
+ These are really just the column names, not full references. This is because it makes the access mechanism more
263
+ transparent (just use the name, duh) and prevents accidental issues when a column from an inner query is re-used in an
264
+ outer query and is bound to both virtual tables. Comparing the references would indicate that these are different
265
+ columns (which arguably they are, just not for our purposes).
266
+ """
267
+
268
+ self._column_cache: dict[str, TableReference] = {}
269
+ """A cache to resolve common columns in the current context more quickly."""
270
+
271
+ def determine_output_shape(
272
+ self, select_clause: Optional[Select | Iterable[ColumnReference | str]]
273
+ ) -> None:
274
+ """Determines the columns that form the result relation of this namespace.
275
+
276
+ The result is only stored internally to allow parent namespaces to resolve column references correctly.
277
+
278
+ This method should only be called after all table sources from the current namespace are already registered in order to
279
+ ensure that star expressions can be resolved correctly.
280
+ """
281
+ self._output_shape = []
282
+ if self._setop_children:
283
+ # We use Postgre's rules here: the output relation of a set operation contains exactly those columns that are
284
+ # contained in the LHS relation
285
+ self._output_shape = list(self._setop_children[0]._output_shape)
286
+ return
287
+
288
+ for projection in select_clause:
289
+ if isinstance(projection, (str, ColumnReference)):
290
+ self._output_shape.append(
291
+ projection.name
292
+ if isinstance(projection, ColumnReference)
293
+ else projection
294
+ )
295
+ continue
296
+
297
+ # must be BaseProjection
298
+ if projection.target_name:
299
+ self._output_shape.append(projection.target_name)
300
+ continue
301
+
302
+ match projection.expression:
303
+ case ColumnExpression(col):
304
+ self._output_shape.append(col.name)
305
+
306
+ case StarExpression(from_table):
307
+ ctx = {from_table} if from_table else self._current_ctx
308
+ for table in ctx:
309
+ if not table.virtual:
310
+ self._output_shape.extend(
311
+ self._schema_cache.columns_of(table)
312
+ )
313
+ continue
314
+
315
+ if table.alias:
316
+ defining_nsp = self._lookup_namespace(table.alias)
317
+ if not defining_nsp and table.full_name:
318
+ # if we try to look up an aliased CTE, we need to use the full name instead
319
+ defining_nsp = self._lookup_namespace(table.full_name)
320
+ if not defining_nsp:
321
+ raise ParserError(f"No namespace found for table '{table}'")
322
+ self._output_shape.extend(defining_nsp._output_shape)
323
+
324
+ case _:
325
+ # do nothing, this is an expression that cannot be referenced later on!
326
+ pass
327
+
328
+ def register_table(self, table: TableReference) -> None:
329
+ """Adds a "physical" table to the current namespace.
330
+
331
+ In truth, the table does not need to be physical, it can also be a CTE that was defined in an outer namespace and
332
+ is scanned here. "Physical" in this context means that the current namespace does not define the table itself.
333
+ """
334
+ self._invalidate_column_cache()
335
+ self._current_ctx.append(table)
336
+ if table.alias:
337
+ self._table_sources[table.alias] = table
338
+ if table.full_name:
339
+ self._table_sources[table.full_name] = table
340
+
341
+ def provides_column(self, name: str) -> bool:
342
+ """Checks, whether the current namespace has a specific column in its output relation."""
343
+ return name in self._output_shape
344
+
345
+ def lookup_column(self, key: str) -> Optional[TableReference]:
346
+ """Searches for the table that provies a specific column.
347
+
348
+ This table can be either virtual, i.e. a subquery or CTE (possibly from an outer namespace), or an actual physical
349
+ table from the current database.
350
+
351
+ If no table is found , *None* is returned.
352
+ """
353
+ cached_table = self._column_cache.get(key)
354
+ if cached_table:
355
+ return cached_table
356
+
357
+ matching_table: Optional[TableReference] = None
358
+ for table in self._current_ctx:
359
+ # later tables overwrite unqualified columns of earlier tables
360
+ physical_table = (
361
+ self._schema_cache.lookup_column(key, [table])
362
+ if not table.virtual
363
+ else None
364
+ )
365
+ if physical_table:
366
+ matching_table = table
367
+ break
368
+
369
+ subquery_nsp = self._subquery_children.get(table.identifier())
370
+ if subquery_nsp and subquery_nsp.provides_column(key):
371
+ matching_table = table
372
+ break
373
+
374
+ cte_nsp = self._cte_sources.get(table.identifier())
375
+ if cte_nsp and cte_nsp.provides_column(key):
376
+ matching_table = table
377
+ break
378
+
379
+ parent_nsp = self._lookup_namespace(table.identifier())
380
+ if parent_nsp and parent_nsp.provides_column(key):
381
+ matching_table = table
382
+ break
383
+
384
+ if not matching_table:
385
+ return None
386
+
387
+ self._column_cache[key] = matching_table
388
+ return matching_table
389
+
390
+ def resolve_table(self, key: str) -> Optional[TableReference]:
391
+ """Searches for the table that is referenced by a specific key.
392
+
393
+ The table can be either provided by this namespace (as a physical table in the *FROM* clause, or defined through a
394
+ subquery/CTE), or by an outer namspace.
395
+ """
396
+ sourced_table = self._table_sources.get(key)
397
+ if sourced_table:
398
+ return sourced_table
399
+
400
+ if key in self._cte_sources:
401
+ return TableReference.create_virtual(key)
402
+
403
+ return self._parent.resolve_table(key) if self._parent else None
404
+
405
+ def open_nested(
406
+ self,
407
+ *,
408
+ alias: str = "",
409
+ source: Literal["cte", "subquery", "setop", "values", "temporary"],
410
+ ) -> QueryNamespace:
411
+ """Creates a new local namespace for a nested query.
412
+
413
+ Depending on the type of nested query, the namespace will be registered in different ways and used for different
414
+ purposes (see parameters below).
415
+
416
+ Parameters
417
+ ----------
418
+ alias : str, optional
419
+ The name of the namespace. This is only relevant for CTEs and subqueries in the FROM clause.
420
+ source : Literal["cte", "subquery", "setop", "values", "temporary"]
421
+ The type of nested query. This value is used to determine the use of the subquery namespace as follows:
422
+ - "cte": The namespace is a CTE that is part of the query.
423
+ - "subquery": The namespace is a subquery in the FROM clause.
424
+ - "setop": The namespace is part of a set operation. No alias is required, but the namespace might be used to
425
+ determine the output shape of the current namespace
426
+ - "values": The namespace is a temporary table that is part of a VALUES clause.
427
+ - "temporary": The namespace is a temporary table that is part of a subquery which is not used in the FROM clause,
428
+ e.g. as a filter condition.
429
+ """
430
+ if source != "temporary":
431
+ self._invalidate_column_cache()
432
+
433
+ child = QueryNamespace(parent=self)
434
+
435
+ match source:
436
+ case "cte":
437
+ self._cte_sources[alias] = child
438
+ case "subquery" | "values":
439
+ table = TableReference.create_virtual(alias)
440
+ self._subquery_children[alias] = child
441
+ self._current_ctx.append(table)
442
+ self._table_sources[alias] = table
443
+ case "setop":
444
+ self._setop_children.append(child)
445
+ case _:
446
+ # ignore other sources
447
+ pass
448
+
449
+ return child
450
+
451
+ def _lookup_namespace(self, table_key: str) -> Optional[QueryNamespace]:
452
+ """Searches for the (parent) namespace that provides a specific table."""
453
+ cte_nsp = self._cte_sources.get(table_key)
454
+ if cte_nsp:
455
+ return cte_nsp
456
+
457
+ subquery_nsp = self._subquery_children.get(table_key)
458
+ if subquery_nsp:
459
+ return subquery_nsp
460
+
461
+ return self._parent._lookup_namespace(table_key) if self._parent else None
462
+
463
+ def _invalidate_column_cache(self) -> None:
464
+ """Clears all currently cached columns in case there is fear of a change in the column bindings."""
465
+ self._column_cache.clear()
466
+
467
+
468
+ def _pglast_is_actual_colref(pglast_data: dict) -> bool:
469
+ """Checks, whether a apparent column reference is actually a column reference and not a star expression in disguise.
470
+
471
+ pglast represents both column references such as *R.a* or *a* as well as star expressions like *R.\\** as ``ColumnRef``
472
+ dictionaries, hence we need to make sure we are actually parsing the right thing. This method takes care of distinguishing
473
+ the two cases.
474
+
475
+ Parameters
476
+ ----------
477
+ pglast_data : dict
478
+ JSON encoding of the potential column
479
+
480
+ Returns
481
+ -------
482
+ bool
483
+ *True* if this is an actual column reference, *False* if this is a star expression.
484
+ """
485
+ fields: list[dict] = pglast_data["fields"]
486
+ if len(fields) == 1:
487
+ return "A_Star" not in fields[0]
488
+ if len(fields) == 2:
489
+ would_be_col: str = fields[1]
490
+ return "A_Star" not in would_be_col
491
+
492
+ would_be_col: str = fields[0]["String"]["sval"]
493
+ return not would_be_col.endswith("*")
494
+
495
+
496
def _pglast_create_bound_colref(
    tab: str, col: str, *, namespace: QueryNamespace
) -> ColumnReference:
    """Creates a new reference to a column with known binding info.

    Parameters
    ----------
    tab : str
        The table to which to bind
    col : str
        The column to bind
    namespace : QueryNamespace
        The tables and columns that are available in the current query.

    Returns
    -------
    ColumnReference
        The new column reference

    Raises
    ------
    ParserError
        If the table key cannot be resolved against the namespace.
    """
    resolved_table = namespace.resolve_table(tab)
    if not resolved_table:
        raise ParserError("Table not found: " + tab)
    return ColumnReference(col, resolved_table)
520
+
521
+
522
def _pglast_parse_colref(
    pglast_data: dict, *, namespace: QueryNamespace
) -> ColumnReference:
    """Handler method to parse column references in the query.

    The column will be bound to its table if possible. This binding process uses the following rules:

    - if the column has already been resolved as part of an earlier parsing step in the same namespace, this column is re-used
    - if the column is specified in qualified syntax (i.e. *table.column*), the table is directly inferred
    - if the column is not qualified, but a schema is available, it is used together with the candidates from the
      current namespace to lookup the owning table
    - otherwise, the column is left unbound :(

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the column
    namespace : QueryNamespace
        The tables and columns that are available in the current query.

    Returns
    -------
    ColumnReference
        The parsed column reference.
    """
    fields: list[dict] = pglast_data["fields"]
    if len(fields) > 2:
        raise ParserError("Unknown column reference format: " + str(pglast_data))

    if len(fields) == 2:
        # qualified reference "table.column" -- the table name resolves the binding directly
        tab_spec, col_spec = fields
        tabname = tab_spec["String"]["sval"]
        colname = col_spec["String"]["sval"]
        return _pglast_create_bound_colref(tabname, colname, namespace=namespace)

    # single-part reference: ask the namespace for the owning table (may stay unbound)
    colname: str = fields[0]["String"]["sval"]
    return ColumnReference(colname, namespace.lookup_column(colname))
561
+
562
+
563
+ def _pglast_parse_star(
564
+ pglast_data: dict, *, namespace: QueryNamespace
565
+ ) -> StarExpression:
566
+ """Handler method to parse star expressions that are potentially bounded to a specific table, e.g. *R.\\**.
567
+
568
+ Parameters
569
+ ----------
570
+ pglast_data : dict
571
+ JSON enconding of the star expression
572
+ namespace : QueryNamespace
573
+ The tables and columns that are available in the current query.
574
+
575
+ Returns
576
+ -------
577
+ StarExpression
578
+ The parsed star expression.
579
+ """
580
+ fields = pglast_data["fields"]
581
+ if len(fields) == 1 and "A_Star" in fields[0]:
582
+ return StarExpression()
583
+
584
+ if len(fields) == 2:
585
+ tab = fields[0]["String"]["sval"]
586
+ return StarExpression(from_table=namespace.resolve_table(tab))
587
+
588
+ raise ParserError("Unknown star reference format: " + str(pglast_data))
589
+
590
+
591
+ def _pglast_parse_const(pglast_data: dict) -> StaticValueExpression:
592
+ """Handler method to parse constant values in the query.
593
+
594
+ Parameters
595
+ ----------
596
+ pglast_data : dict
597
+ JSON enconding of the value. This data is extracted from the pglast data structure.
598
+
599
+ Returns
600
+ -------
601
+ StaticValueExpression
602
+ The parsed constant value.
603
+ """
604
+ pglast_data.pop("location", None)
605
+ valtype = util.dicts.key(pglast_data)
606
+ match valtype:
607
+ case "isnull":
608
+ return StaticValueExpression.null()
609
+ case "ival":
610
+ val = pglast_data["ival"]["ival"] if "ival" in pglast_data["ival"] else 0
611
+ return StaticValueExpression(val)
612
+ case "fval":
613
+ val = pglast_data["fval"]["fval"]
614
+ return StaticValueExpression(float(val))
615
+ case "sval":
616
+ return StaticValueExpression(pglast_data["sval"]["sval"])
617
+ case "boolval":
618
+ val = pglast_data["boolval"].get("boolval", False)
619
+ return StaticValueExpression(val)
620
+ case _:
621
+ raise ParserError("Unknown constant type: " + str(pglast_data))
622
+
623
+
624
_PglastOperatorMap: dict[str, SqlOperator] = {
    # comparison operators (pglast reports these by their SQL spelling)
    "=": LogicalOperator.Equal,
    "<": LogicalOperator.Less,
    "<=": LogicalOperator.LessEqual,
    ">": LogicalOperator.Greater,
    ">=": LogicalOperator.GreaterEqual,
    "<>": LogicalOperator.NotEqual,
    "!=": LogicalOperator.NotEqual,  # both SQL spellings normalize to the same operator
    # boolean connectives (pglast encodes these as *_EXPR node kinds)
    "AND_EXPR": CompoundOperator.And,
    "OR_EXPR": CompoundOperator.Or,
    "NOT_EXPR": CompoundOperator.Not,
    # arithmetic and string operators
    "+": MathOperator.Add,
    "-": MathOperator.Subtract,
    "*": MathOperator.Multiply,
    "/": MathOperator.Divide,
    "%": MathOperator.Modulo,
    "||": MathOperator.Concatenate,
}
"""Map from the internal representation of Postgres operators to our standardized QAL operators."""
643
+
644
+
645
def _pglast_parse_operator(pglast_data: list[dict]) -> SqlOperator:
    """Handler method to parse operators into our query representation.

    Parameters
    ----------
    pglast_data : list[dict]
        JSON encoding of the operator. This data is extracted from the pglast data structure.

    Returns
    -------
    SqlOperator
        The parsed operator.

    Raises
    ------
    ParserError
        If the encoding does not have the expected single-element shape, or the operator
        is not contained in `_PglastOperatorMap` yet.
    """
    if len(pglast_data) != 1:
        raise ParserError("Unknown operator format: " + str(pglast_data))

    op_spec = pglast_data[0]
    if "String" not in op_spec or "sval" not in op_spec["String"]:
        raise ParserError("Unknown operator format: " + str(pglast_data))

    op_text = op_spec["String"]["sval"]
    if op_text not in _PglastOperatorMap:
        raise ParserError("Operator not yet in target map: " + op_text)
    return _PglastOperatorMap[op_text]
667
+
668
+
669
_PglastTypeMap: dict[str, str] = {
    # character and serial types
    "bpchar": "char",
    "serial8": "bigserial",
    # integer types by internal byte-width name
    "int4": "integer",
    "int2": "smallint",
    "int8": "bigint",
    # floating-point types by internal byte-width name
    "float4": "real",
    "float8": "double precision",
    "boolean": "bool",
}
"""Map from the internal representation of Postgres types to the SQL standard types."""
680
+
681
+
682
def _pglast_parse_type(pglast_data: dict) -> str:
    """Handler method to parse type information from explicit type casts

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the type information.

    Returns
    -------
    str
        The actual type

    Raises
    ------
    ParserError
        If the encoding does not contain a "names" entry, or that entry has more than two parts.
    """
    if "names" not in pglast_data:
        raise ParserError("Unknown type format: " + str(pglast_data))
    type_names = pglast_data["names"]
    if len(type_names) > 2:
        raise ParserError("Unknown type format: " + str(pglast_data))

    # the final entry carries the type itself; an optional leading entry is the schema qualifier
    internal_name = type_names[-1]["String"]["sval"]

    # for user-defined types we use get with the same type as argument
    return _PglastTypeMap.get(internal_name, internal_name)
704
+
705
+
706
def _pglast_parse_case(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> CaseExpression:
    """Handler method to parse *CASE* expressions in a query.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the *CASE* expression data. This data is extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    CaseExpression
        The parsed *CASE* expression.
    """
    branches: list[tuple[AbstractPredicate, SqlExpression]] = []
    for raw_branch in pglast_data["args"]:
        when_spec = raw_branch["CaseWhen"]
        condition = _pglast_parse_predicate(
            when_spec["expr"], namespace=namespace, query_txt=query_txt
        )
        outcome = _pglast_parse_expression(
            when_spec["result"], namespace=namespace, query_txt=query_txt
        )
        branches.append((condition, outcome))

    # the ELSE branch is optional; its absence maps to else_expr=None
    if "defresult" in pglast_data:
        fallback = _pglast_parse_expression(
            pglast_data["defresult"], namespace=namespace, query_txt=query_txt
        )
    else:
        fallback = None

    return CaseExpression(branches, else_expr=fallback)
744
+
745
+
746
def _pglast_parse_expression(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> SqlExpression:
    """Handler method to parse arbitrary expressions in the query.

    For some more complex expressions, this method will delegate to tailored parsing methods.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the expression data. This data is extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    SqlExpression
        The parsed expression.

    Raises
    ------
    ParserError
        If the expression node type is not recognized.
    """
    # "location" is a pglast byte-offset attribute; it must be dropped so that the node's
    # single remaining key identifies the expression type.
    pglast_data.pop("location", None)
    expression_key = util.dicts.key(pglast_data)

    # When parsing the actual expression, we need to be aware that many expressions can actually be predicates, just not
    # within the WHERE or HAVING clause. For example, "SELECT a IS NOT NULL FROM foo" is a perfectly valid query.
    # Therefore, we handle a lot of expression cases by passing the input data back to our predicate parser and let it do the
    # heavy lifting.

    match expression_key:
        # plain column reference, e.g. "t.a"
        case "ColumnRef" if _pglast_is_actual_colref(pglast_data["ColumnRef"]):
            column = _pglast_parse_colref(pglast_data["ColumnRef"], namespace=namespace)
            return ColumnExpression(column)

        # non-column ColumnRef nodes encode "*" / "t.*" references
        case "ColumnRef" if not _pglast_is_actual_colref(pglast_data["ColumnRef"]):
            return _pglast_parse_star(pglast_data["ColumnRef"], namespace=namespace)

        # literal constants (numbers, strings, booleans, NULL)
        case "A_Const":
            return _pglast_parse_const(pglast_data["A_Const"])

        # generic operator application: arithmetic or comparison
        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_OP":
            expression = pglast_data["A_Expr"]
            operation = _pglast_parse_operator(expression["name"])
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )

            # a missing lexpr indicates a unary operator, e.g. "-42"
            if "lexpr" not in expression and operation in MathOperator:
                return MathExpression(operation, right)
            elif "lexpr" not in expression:
                raise ParserError("Unknown operator format: " + str(expression))

            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )

            # comparison operators yield predicates, everything else becomes a math expression
            if operation in LogicalOperator:
                return BinaryPredicate(operation, left, right)

            return MathExpression(operation, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] in {
            "AEXPR_LIKE",
            "AEXPR_ILIKE",
            "AEXPR_BETWEEN",
            "AEXPR_IN",
        }:
            # we need to parse a predicate in disguise
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        # IS [NOT] NULL tests are predicates as well
        case "NullTest":
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        # AND / OR / NOT combinations are predicates as well
        case "BoolExpr":
            predicate = _pglast_parse_predicate(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return predicate

        case "FuncCall" if (
            "over" not in pglast_data["FuncCall"]
        ):  # normal functions, aggregates and UDFs
            expression: dict = pglast_data["FuncCall"]
            # qualified function names (e.g. schema.func) are joined with dots
            funcname = ".".join(
                elem["String"]["sval"] for elem in expression["funcname"]
            )
            distinct = expression.get("agg_distinct", False)
            # FILTER (WHERE ...) clause attached to an aggregate
            if expression.get("agg_filter", False):
                filter_expr = _pglast_parse_predicate(
                    expression["agg_filter"], namespace=namespace, query_txt=query_txt
                )
            else:
                filter_expr = None

            # COUNT(*)-style calls are marked via agg_star instead of an args list
            if expression.get("agg_star", False):
                return FunctionExpression(
                    funcname,
                    [StarExpression()],
                    distinct=distinct,
                    filter_where=filter_expr,
                )

            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression.get("args", [])
            ]
            return FunctionExpression(
                funcname, args, distinct=distinct, filter_where=filter_expr
            )

        case "FuncCall" if "over" in pglast_data["FuncCall"]:  # window functions
            expression: dict = pglast_data["FuncCall"]
            funcname = ".".join(
                elem["String"]["sval"] for elem in expression["funcname"]
            )

            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression.get("args", [])
            ]
            fn = FunctionExpression(funcname, args)

            window_spec: dict = expression["over"]

            # optional PARTITION BY part of the OVER clause
            if "partitionClause" in window_spec:
                partition = [
                    _pglast_parse_expression(
                        partition, namespace=namespace, query_txt=query_txt
                    )
                    for partition in window_spec["partitionClause"]
                ]
            else:
                partition = None

            # optional ORDER BY part of the OVER clause
            if "orderClause" in window_spec:
                order = _pglast_parse_orderby(
                    window_spec["orderClause"], namespace=namespace, query_txt=query_txt
                )
            else:
                order = None

            # NOTE(review): unlike the plain FuncCall case above, the filter is parsed as an
            # expression rather than a predicate here — confirm this asymmetry is intended.
            if "agg_filter" in expression:
                filter_expr = _pglast_parse_expression(
                    expression["agg_filter"], namespace=namespace, query_txt=query_txt
                )
            else:
                filter_expr = None

            return WindowExpression(
                fn, partitioning=partition, ordering=order, filter_condition=filter_expr
            )

        # COALESCE(...) is normalized into an ordinary function expression
        case "CoalesceExpr":
            expression = pglast_data["CoalesceExpr"]
            args = [
                _pglast_parse_expression(arg, namespace=namespace, query_txt=query_txt)
                for arg in expression["args"]
            ]
            return FunctionExpression("coalesce", args)

        # explicit type casts, e.g. "expr::numeric(10, 2)"
        case "TypeCast":
            expression: dict = pglast_data["TypeCast"]
            casted_expression = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )
            target_type = _pglast_parse_type(expression["typeName"])
            # typmods carry type parameters such as precision and scale
            type_params = [
                _pglast_parse_expression(
                    param, namespace=namespace, query_txt=query_txt
                )
                for param in expression["typeName"].get("typmods", [])
            ]

            return CastExpression(
                casted_expression, target_type, type_params=type_params
            )

        case "CaseExpr":
            return _pglast_parse_case(
                pglast_data["CaseExpr"], namespace=namespace, query_txt=query_txt
            )

        # scalar subquery used as a value, e.g. "SELECT (SELECT max(b) FROM t) ..."
        case "SubLink" if pglast_data["SubLink"]["subLinkType"] == "EXPR_SUBLINK":
            subquery = _pglast_parse_query(
                pglast_data["SubLink"]["subselect"]["SelectStmt"],
                query_txt=query_txt,
                # the subquery gets its own temporary namespace scope
                namespace=namespace.open_nested(source="temporary"),
            )
            return SubqueryExpression(subquery)

        # array subscripting / slicing, e.g. "arr[1]" or "arr[2:5]"
        case "A_Indirection":
            expression: dict = pglast_data["A_Indirection"]
            array_expression = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )

            # chained accesses (arr[1][2:3]) are folded left-to-right by re-wrapping
            # the accumulated expression for each index entry
            for index_expression in expression["indirection"]:
                index_expression: dict = index_expression["A_Indices"]

                if index_expression.get("is_slice", False):
                    # slices may leave either bound open, e.g. arr[:3] or arr[2:]
                    lower = (
                        _pglast_parse_expression(
                            index_expression["lidx"],
                            namespace=namespace,
                            query_txt=query_txt,
                        )
                        if "lidx" in index_expression
                        else None
                    )
                    upper = (
                        _pglast_parse_expression(
                            index_expression["uidx"],
                            namespace=namespace,
                            query_txt=query_txt,
                        )
                        if "uidx" in index_expression
                        else None
                    )
                    array_expression = ArrayAccessExpression(
                        array_expression, lower_idx=lower, upper_idx=upper
                    )
                    continue

                # point access: pglast stores the single index in "uidx"
                point_index = _pglast_parse_expression(
                    index_expression["uidx"], namespace=namespace, query_txt=query_txt
                )
                array_expression = ArrayAccessExpression(
                    array_expression, idx=point_index
                )

            return array_expression

        case _:
            raise ParserError("Unknown expression type: " + str(pglast_data))
987
+
988
+
989
def _pglast_parse_values_cte(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str = ""
) -> tuple[ValuesList, list[str]]:
    """Handler method to parse a CTE with a *VALUES* expression.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the CTE data. This data is extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str, optional
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks. Defaults to an empty string for backwards compatibility with existing callers.

    Returns
    -------
    tuple[ValuesList, list[str]]
        The parsed *VALUES* expression and the column names.
    """
    values: ValuesList = []
    for row in pglast_data["ctequery"]["SelectStmt"]["valuesLists"]:
        raw_items = row["List"]["items"]
        # Bugfix: _pglast_parse_expression requires the keyword-only query_txt argument.
        # The previous call omitted it, raising a TypeError for every VALUES CTE.
        parsed_items = [
            _pglast_parse_expression(item, namespace=namespace, query_txt=query_txt)
            for item in raw_items
        ]
        values.append(tuple(parsed_items))

    # explicit column names may be attached to the CTE alias, e.g. WITH t(a, b) AS (VALUES ...)
    colnames: list[str] = [
        raw_colname["String"]["sval"]
        for raw_colname in pglast_data.get("aliascolnames", [])
    ]

    if colnames:
        # expose the named columns to the surrounding query
        namespace.determine_output_shape(colnames)

    return values, colnames
1022
+
1023
+
1024
+ def _pglast_parse_ctes(
1025
+ json_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
1026
+ ) -> CommonTableExpression:
1027
+ """Handler method to parse the *WITH* clause of a query.
1028
+
1029
+ Parameters
1030
+ ----------
1031
+ json_data : dict
1032
+ JSON enconding of the CTEs, as extracted from the pglast data structure.
1033
+ parent_namespace: QueryNamespace
1034
+ The tables and columns that are available in the current query.
1035
+ query_txt: str
1036
+ The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
1037
+ consider, such as hint blocks.
1038
+
1039
+ Returns
1040
+ -------
1041
+ CommonTableExpression
1042
+ The parsed CTEs.
1043
+ """
1044
+ parsed_ctes: list[CommonTableExpression] = []
1045
+ for pglast_data in json_data["ctes"]:
1046
+ current_cte: dict = pglast_data["CommonTableExpr"]
1047
+ target_name = current_cte["ctename"]
1048
+ target_table = TableReference.create_virtual(target_name)
1049
+
1050
+ match current_cte.get("ctematerialized", "CTEMaterializeDefault"):
1051
+ case "CTEMaterializeDefault":
1052
+ force_materialization = None
1053
+ case "CTEMaterializeAlways":
1054
+ force_materialization = True
1055
+ case "CTEMaterializeNever":
1056
+ force_materialization = False
1057
+
1058
+ query_data = current_cte["ctequery"]["SelectStmt"]
1059
+ child_nsp = parent_namespace.open_nested(alias=target_name, source="cte")
1060
+ if "targetList" not in query_data and query_data["op"] == "SETOP_NONE":
1061
+ # CTE is a VALUES query
1062
+ values, columns = _pglast_parse_values_cte(current_cte, namespace=child_nsp)
1063
+ parsed_cte = ValuesWithQuery(
1064
+ values,
1065
+ target_name=target_table.identifier(),
1066
+ columns=columns,
1067
+ materialized=force_materialization,
1068
+ )
1069
+ else:
1070
+ cte_query = _pglast_parse_query(
1071
+ current_cte["ctequery"]["SelectStmt"],
1072
+ namespace=child_nsp,
1073
+ query_txt=query_txt,
1074
+ )
1075
+ parsed_cte = WithQuery(
1076
+ cte_query, target_table, materialized=force_materialization
1077
+ )
1078
+
1079
+ parsed_ctes.append(parsed_cte)
1080
+
1081
+ recursive = json_data.get("recursive", False)
1082
+ return CommonTableExpression(parsed_ctes, recursive=recursive)
1083
+
1084
+
1085
def _pglast_try_select_star(
    target: dict, *, distinct: list[SqlExpression] | bool
) -> Optional[Select]:
    """Attempts to generate a *SELECT(\\*)* representation for a *SELECT* clause.

    If the query is not actually a *SELECT(\\*)* query, this method will return *None*.

    Parameters
    ----------
    target : dict
        JSON encoding of the target entry in the *SELECT* clause. This data is extracted from the pglast data structure
    distinct : list[SqlExpression] | bool
        The parsed *DISTINCT* part of the *SELECT* clause.

    Returns
    -------
    Optional[Select]
        The parsed *SELECT(\\*)* clause, or *None* if this is not a *SELECT(\\*)* query.
    """
    if "ColumnRef" not in target:
        # only column references can encode a star target
        return None

    fields = target["ColumnRef"]["fields"]
    if len(fields) != 1:
        # multiple fields are used for qualified column references. This is definitely not a SELECT * query, so exit
        return None

    if "A_Star" not in fields[0]:
        # a single ordinary column, not a star
        return None

    return Select.star(distinct=distinct)
1112
+
1113
+
1114
def _pglast_parse_select(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> Select:
    """Handler method to parse the *SELECT* clause of a query.

    This is the only parsing handler that will always be called when parsing a query, since all queries must at least have a
    *SELECT* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. This is required to extract the different projections used in the *SELECT* clause,
        as well as potential required duplicate eliminations via *DISTINCT ON*
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser does not
        consider, such as hint blocks.

    Returns
    -------
    Select
        The parsed *SELECT* clause

    Raises
    ------
    ParserError
        If the *DISTINCT* encoding does not follow any known format.
    """
    raw_distinct = pglast_data.get("distinctClause", None)
    if raw_distinct is None:
        # value not present --> no DISTINCT
        distinct: list[SqlExpression] | bool = False
    elif raw_distinct == [{}]:
        # a list with a single empty dict is pglasts encoding of a plain DISTINCT
        distinct = True
    elif isinstance(raw_distinct, list):
        # anything else is a DISTINCT ON (...) with explicit expressions
        distinct = [
            _pglast_parse_expression(entry, namespace=namespace, query_txt=query_txt)
            for entry in raw_distinct
        ]
    else:
        raise ParserError(f"Unknown DISTINCT format: {raw_distinct}")

    # targetlist must always be present, so no need for .get()
    targetlist: list[dict] = pglast_data["targetList"]

    # first, try for SELECT * queries
    if len(targetlist) == 1:
        sole_target = targetlist[0]["ResTarget"]["val"]
        star_clause = _pglast_try_select_star(sole_target, distinct=distinct)
        if star_clause:
            namespace.determine_output_shape(star_clause)
            return star_clause
    # if this is not a SELECT * query, we can continue with the regular parsing

    projections: list[BaseProjection] = [
        BaseProjection(
            _pglast_parse_expression(
                entry["ResTarget"]["val"], namespace=namespace, query_txt=query_txt
            ),
            entry["ResTarget"].get("name", ""),
        )
        for entry in targetlist
    ]

    select_clause = Select(projections, distinct=distinct)
    namespace.determine_output_shape(select_clause)
    return select_clause
1178
+
1179
+
1180
def _pglast_parse_rangevar(rangevar: dict) -> TableReference:
    """Handler method to extract the `TableReference` from a *RangeVar* entry in the *FROM* clause.

    Parameters
    ----------
    rangevar : dict
        JSON encoding of the range variable, as extracted from the pglast data structure.

    Returns
    -------
    TableReference
        The parsed table reference.
    """
    if "alias" in rangevar:
        table_alias = rangevar["alias"]["aliasname"]
    else:
        table_alias = None

    # schemaname is optional; an empty string denotes "no explicit schema"
    return TableReference(
        rangevar["relname"], table_alias, schema=rangevar.get("schemaname", "")
    )
1197
+
1198
+
1199
+ def _pglast_is_values_list(pglast_data: dict) -> bool:
1200
+ """Checks, whether a pglast subquery representation refers to an actual subquery or a *VALUES* list.
1201
+
1202
+ Parameters
1203
+ ----------
1204
+ pglast_data : dict
1205
+ JSON encoding of the subquery data
1206
+
1207
+ Returns
1208
+ -------
1209
+ bool
1210
+ *True* if the subquery encodes a *VALUES* list, *False* otherwise.
1211
+ """
1212
+ query = pglast_data["subquery"]["SelectStmt"]
1213
+ return "valuesLists" in query
1214
+
1215
+
1216
+ def _pglast_parse_from_entry(
1217
+ pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
1218
+ ) -> TableSource:
1219
+ """Handler method to parse individual entries in the *FROM* clause.
1220
+
1221
+ Parameters
1222
+ ----------
1223
+ pglast_data : dict
1224
+ JSON enconding of the current entry in the *FROM* clause. This data is extracted from the pglast data structure.
1225
+ namespace: QueryNamespace
1226
+ The tables and columns that are available in the current query.
1227
+ query_txt: str
1228
+ The raw query string that was passed to the parser. This is used to extract information that the PG parser does not
1229
+ provide, such as hint blocks.
1230
+
1231
+ Returns
1232
+ -------
1233
+ TableSource
1234
+ The parsed table source.
1235
+ """
1236
+ pglast_data.pop("location", None)
1237
+ entry_type = util.dicts.key(pglast_data)
1238
+
1239
+ match entry_type:
1240
+ case "RangeVar":
1241
+ table = _pglast_parse_rangevar(pglast_data["RangeVar"])
1242
+
1243
+ # If we specified a virtual table in a CTE, we will reference it later in some FROM clause. In this case,
1244
+ # we should not create a new table reference, but rather use the existing one.
1245
+ # But, if we alias the virtual table, we still need a new reference
1246
+ similar_table = namespace.resolve_table(table.full_name)
1247
+ if similar_table and similar_table.virtual and not table.alias:
1248
+ # a simple reference to the CTE
1249
+ namespace.register_table(similar_table)
1250
+ return DirectTableSource(similar_table)
1251
+ if similar_table and similar_table.virtual and table.alias:
1252
+ # an aliased reference to the CTE
1253
+ table = table.make_virtual()
1254
+ # TODO: should we also update the mapping of the full_name here?
1255
+
1256
+ namespace.register_table(table)
1257
+ return DirectTableSource(table)
1258
+
1259
+ case "JoinExpr":
1260
+ join_expr: dict = pglast_data["JoinExpr"]
1261
+ match join_expr["jointype"]:
1262
+ case "JOIN_INNER" if "quals" in join_expr:
1263
+ join_type = JoinType.InnerJoin
1264
+ case "JOIN_INNER" if "quals" not in join_expr:
1265
+ join_type = JoinType.CrossJoin
1266
+ case "JOIN_LEFT":
1267
+ join_type = JoinType.LeftJoin
1268
+ case "JOIN_RIGHT":
1269
+ join_type = JoinType.RightJoin
1270
+ case "JOIN_OUTER":
1271
+ join_type = JoinType.OuterJoin
1272
+ case "JOIN_FULL":
1273
+ join_type = JoinType.OuterJoin
1274
+ case _:
1275
+ raise ParserError("Unknown join type: " + join_expr["jointype"])
1276
+
1277
+ left = _pglast_parse_from_entry(
1278
+ join_expr["larg"], namespace=namespace, query_txt=query_txt
1279
+ )
1280
+ right = _pglast_parse_from_entry(
1281
+ join_expr["rarg"], namespace=namespace, query_txt=query_txt
1282
+ )
1283
+ if join_type == JoinType.CrossJoin:
1284
+ return JoinTableSource(left, right, join_type=JoinType.CrossJoin)
1285
+
1286
+ join_condition = _pglast_parse_predicate(
1287
+ join_expr["quals"], namespace=namespace, query_txt=query_txt
1288
+ )
1289
+
1290
+ # we do not need to store new tables in available_tables here, since this is already handled by the recursion.
1291
+ return JoinTableSource(
1292
+ left, right, join_condition=join_condition, join_type=join_type
1293
+ )
1294
+
1295
+ case "RangeSubselect" if _pglast_is_values_list(pglast_data["RangeSubselect"]):
1296
+ values_list = _pglast_parse_values(
1297
+ pglast_data["RangeSubselect"],
1298
+ parent_namespace=namespace,
1299
+ query_txt=query_txt,
1300
+ )
1301
+ return values_list
1302
+
1303
+ case "RangeSubselect":
1304
+ raw_subquery: dict = pglast_data["RangeSubselect"]
1305
+ is_lateral = raw_subquery.get("lateral", False)
1306
+
1307
+ if "alias" in raw_subquery:
1308
+ alias: str = raw_subquery["alias"]["aliasname"]
1309
+ else:
1310
+ alias = ""
1311
+
1312
+ child_nsp = namespace.open_nested(alias=alias, source="subquery")
1313
+ subquery = _pglast_parse_query(
1314
+ raw_subquery["subquery"]["SelectStmt"],
1315
+ namespace=child_nsp,
1316
+ query_txt=query_txt,
1317
+ )
1318
+
1319
+ subquery_source = SubqueryTableSource(
1320
+ subquery, target_name=alias, lateral=is_lateral
1321
+ )
1322
+ return subquery_source
1323
+
1324
+ case "RangeFunction":
1325
+ raw_function: dict = pglast_data["RangeFunction"]
1326
+
1327
+ if "alias" in raw_function:
1328
+ alias: str = raw_function["alias"]["aliasname"]
1329
+ else:
1330
+ alias = ""
1331
+
1332
+ function_expr: dict = raw_function["functions"][0]["List"]["items"][0]
1333
+ parsed_function = _pglast_parse_expression(
1334
+ function_expr, namespace=namespace, query_txt=query_txt
1335
+ )
1336
+ return FunctionTableSource(parsed_function, alias=alias)
1337
+
1338
+ case _:
1339
+ raise ParserError("Unknow FROM clause entry: " + str(pglast_data))
1340
+
1341
+
1342
def _pglast_parse_values(
    pglast_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
) -> ValuesTableSource:
    """Handler method to parse explicit *VALUES* lists in the *FROM* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the actual *VALUES* list. This data is extracted from the pglast data structure and should be akin
        to a subquery.
    parent_namespace : QueryNamespace
        The tables and columns that are available in the current query. This is only used to register the columns of the
        VALUES list
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    ValuesTableSource
        The parsed *VALUES* list.
    """
    raw_alias: dict = pglast_data.get("alias", {})
    alias = raw_alias.get("aliasname", "")
    # the VALUES list lives in its own namespace scope, keyed by its alias
    child_nsp = parent_namespace.open_nested(alias=alias, source="values")

    # each entry in valuesLists is one row; each row becomes a tuple of parsed expressions
    rows: ValuesList = [
        tuple(
            _pglast_parse_expression(item, namespace=child_nsp, query_txt=query_txt)
            for item in raw_row["List"]["items"]
        )
        for raw_row in pglast_data["subquery"]["SelectStmt"]["valuesLists"]
    ]

    # without an alias or explicit column names we cannot expose any columns
    if not alias:
        return ValuesTableSource(rows, alias=alias, columns=[])
    if "colnames" not in raw_alias:
        return ValuesTableSource(rows, alias=alias, columns=[])

    colnames = [entry["String"]["sval"] for entry in raw_alias["colnames"]]
    table_source = ValuesTableSource(rows, alias=alias, columns=colnames)
    child_nsp.determine_output_shape(table_source.cols)
    return table_source
1391
+
1392
+
1393
def _pglast_parse_from(
    from_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> From:
    """Handler method to parse the *FROM* clause of a query.

    Parameters
    ----------
    from_clause : list[dict]
        The JSON representation of the *FROM* clause, as extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    From
        The parsed *FROM* clause. Depending on the contents, this is an `ImplicitFromClause` (plain tables only), an
        `ExplicitFromClause` (explicit JOINs only), or a generic `From` for mixed contents.
    """
    has_plain_table = False
    has_explicit_join = False
    # mixed = plain tables combined with explicit JOINs, subqueries or VALUES
    has_mixed_content = False

    table_sources: list[TableSource] = []
    for raw_entry in from_clause:
        parsed_source = _pglast_parse_from_entry(
            raw_entry, namespace=namespace, query_txt=query_txt
        )
        table_sources.append(parsed_source)

        if isinstance(parsed_source, DirectTableSource):
            has_plain_table = True
            has_mixed_content = has_mixed_content or has_explicit_join
        elif isinstance(parsed_source, JoinTableSource):
            has_explicit_join = True
            has_mixed_content = has_mixed_content or has_plain_table
        elif isinstance(parsed_source, (SubqueryTableSource, ValuesTableSource)):
            has_mixed_content = True

    if not has_explicit_join and not has_mixed_content:
        return ImplicitFromClause(table_sources)
    if has_explicit_join and not has_mixed_content:
        return ExplicitFromClause(table_sources)
    return From(table_sources)
1444
+
1445
+
1446
def _pglast_parse_predicate(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> AbstractPredicate:
    """Handler method to parse arbitrary predicates in the *WHERE* or *HAVING* clause.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the predicate data. This data is extracted from the pglast data structure.
    namespace : QueryNamespace
        The tables and columns that are available in the current query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.


    Returns
    -------
    AbstractPredicate
        The parsed predicate.

    Raises
    ------
    ParserError
        If a BETWEEN interval or IN operator does not follow the expected format.
    NotImplementedError
        If an unsupported sublink (subquery) type is encountered.
    """
    # drop pglasts byte-offset attribute so that the single remaining key is the node type
    pglast_data.pop("location", None)
    expr_key = util.dicts.key(pglast_data)
    match expr_key:
        # generic binary comparison, e.g. "a < 42"
        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_OP":
            expression = pglast_data["A_Expr"]
            operator = _pglast_parse_operator(expression["name"])
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_LIKE":
            expression = pglast_data["A_Expr"]
            # "~~" is Postgres' internal operator name for LIKE, "!~~" for NOT LIKE
            operator = (
                LogicalOperator.Like
                if expression["name"][0]["String"]["sval"] == "~~"
                else LogicalOperator.NotLike
            )
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_ILIKE":
            expression = pglast_data["A_Expr"]
            # "~~*" is Postgres' internal operator name for ILIKE, "!~~*" for NOT ILIKE
            operator = (
                LogicalOperator.ILike
                if expression["name"][0]["String"]["sval"] == "~~*"
                else LogicalOperator.NotILike
            )
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            right = _pglast_parse_expression(
                expression["rexpr"], namespace=namespace, query_txt=query_txt
            )
            return BinaryPredicate(operator, left, right)

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_BETWEEN":
            expression = pglast_data["A_Expr"]
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            # the BETWEEN bounds arrive as a two-element list (lower, upper)
            raw_interval = expression["rexpr"]["List"]["items"]
            if len(raw_interval) != 2:
                raise ParserError("Invalid BETWEEN interval: " + str(raw_interval))
            lower = _pglast_parse_expression(
                raw_interval[0], namespace=namespace, query_txt=query_txt
            )
            upper = _pglast_parse_expression(
                raw_interval[1], namespace=namespace, query_txt=query_txt
            )
            return BetweenPredicate(left, (lower, upper))

        case "A_Expr" if pglast_data["A_Expr"]["kind"] == "AEXPR_IN":
            expression = pglast_data["A_Expr"]
            left = _pglast_parse_expression(
                expression["lexpr"], namespace=namespace, query_txt=query_txt
            )
            raw_values = expression["rexpr"]["List"]["items"]
            values = [
                _pglast_parse_expression(
                    value, namespace=namespace, query_txt=query_txt
                )
                for value in raw_values
            ]
            predicate = InPredicate(left, values)
            # "=" encodes IN, "<>" encodes NOT IN (wrapped in a negation)
            operator = expression["name"][0]["String"]["sval"]
            if operator == "=":
                return predicate
            elif operator == "<>":
                return CompoundPredicate.create_not(predicate)
            else:
                raise ParserError("Invalid IN operator: " + operator)

        # AND / OR / NOT combinations of child predicates
        case "BoolExpr":
            expression = pglast_data["BoolExpr"]
            operator = _PglastOperatorMap[expression["boolop"]]
            children = [
                _pglast_parse_predicate(child, namespace=namespace, query_txt=query_txt)
                for child in expression["args"]
            ]
            return CompoundPredicate(operator, children)

        # IS [NOT] NULL tests, normalized to "expr IS [NOT] NULL" binary predicates
        case "NullTest":
            expression = pglast_data["NullTest"]
            testexpr = _pglast_parse_expression(
                expression["arg"], namespace=namespace, query_txt=query_txt
            )
            operation = (
                LogicalOperator.Is
                if expression["nulltesttype"] == "IS_NULL"
                else LogicalOperator.IsNot
            )
            return BinaryPredicate(operation, testexpr, StaticValueExpression.null())

        # boolean-valued function call used as a filter
        case "FuncCall":
            expression = _pglast_parse_expression(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return UnaryPredicate(expression)

        # subquery predicates: EXISTS, IN (SELECT ...), op ANY/ALL (SELECT ...)
        case "SubLink":
            expression = pglast_data["SubLink"]
            sublink_type = expression["subLinkType"]

            subquery = _pglast_parse_query(
                expression["subselect"]["SelectStmt"],
                # the subquery gets its own temporary namespace scope
                namespace=namespace.open_nested(source="temporary"),
                query_txt=query_txt,
            )
            if sublink_type == "EXISTS_SUBLINK":
                return UnaryPredicate.exists(subquery)

            testexpr = _pglast_parse_expression(
                expression["testexpr"], namespace=namespace, query_txt=query_txt
            )

            # ANY without an explicit operator name is the plain IN (SELECT ...) form
            if sublink_type == "ANY_SUBLINK" and "operName" not in expression:
                return InPredicate.subquery(testexpr, subquery)

            # NOTE(review): operName is looked up directly in _PglastOperatorMap here, while
            # other cases extract expression["name"][0]["String"]["sval"] first — confirm the
            # map is keyed accordingly for the ANY/ALL forms.
            if sublink_type == "ANY_SUBLINK":
                operator = _PglastOperatorMap[expression["operName"]]
                subquery_expression = FunctionExpression.any_func(subquery)
                return BinaryPredicate(operator, testexpr, subquery_expression)
            elif sublink_type == "ALL_SUBLINK":
                operator = _PglastOperatorMap[expression["operName"]]
                subquery_expression = FunctionExpression.all_func(subquery)
                return BinaryPredicate(operator, testexpr, subquery_expression)
            else:
                raise NotImplementedError("Subquery handling is not yet implemented")

        case _:
            # fallback: treat the node as a plain (boolean-valued) expression
            expression = _pglast_parse_expression(
                pglast_data, namespace=namespace, query_txt=query_txt
            )
            return UnaryPredicate(expression)
1610
+
1611
+
1612
def _pglast_parse_where(
    where_clause: dict, *, namespace: QueryNamespace, query_txt: str
) -> Where:
    """Handler method to parse the *WHERE* clause of a query.

    Parameters
    ----------
    where_clause : dict
        The JSON representation of the *WHERE* clause, as extracted from the pglast data structure.
    namespace: QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks.

    Returns
    -------
    Where
        The parsed *WHERE* clause.
    """
    # the WHERE clause is just a thin wrapper around its (single) root predicate
    return Where(
        _pglast_parse_predicate(where_clause, namespace=namespace, query_txt=query_txt)
    )
1636
+
1637
+
1638
def _pglast_parse_groupby(
    groupby_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> GroupBy:
    """Parse the *GROUP BY* clause of a query into its QAL representation.

    Parameters
    ----------
    groupby_clause : list[dict]
        JSON representation of the *GROUP BY* clause as emitted by pglast.
    namespace: QueryNamespace
        The tables and columns that expressions in the query may reference.
    query_txt : str
        The raw query text. Required because some information (e.g. hint blocks) is dropped by the PG parser and must be
        recovered from the original text.

    Returns
    -------
    GroupBy
        The parsed *GROUP BY* clause.

    Raises
    ------
    NotImplementedError
        If the clause uses grouping sets, which are not supported yet.
    """
    group_expressions: list[SqlExpression] = []

    for entry in groupby_clause:
        # Grouping sets (GROUPING SETS / CUBE / ROLLUP) have no QAL counterpart yet.
        if "GroupingSet" in entry:
            raise NotImplementedError("Grouping sets are not yet supported")
        group_expressions.append(
            _pglast_parse_expression(entry, namespace=namespace, query_txt=query_txt)
        )

    return GroupBy(group_expressions)
1669
+
1670
+
1671
def _pglast_parse_having(
    having_clause: dict, *, namespace: QueryNamespace, query_txt: str
) -> Having:
    """Parse the *HAVING* clause of a query into its QAL representation.

    Parameters
    ----------
    having_clause : dict
        JSON representation of the *HAVING* clause as emitted by pglast.
    namespace: QueryNamespace
        The tables and columns that expressions in the query may reference.
    query_txt : str
        The raw query text. Required because some information (e.g. hint blocks) is dropped by the PG parser and must be
        recovered from the original text.

    Returns
    -------
    Having
        The parsed *HAVING* clause.
    """
    # Structurally, HAVING is just a predicate like WHERE -- only evaluated after grouping.
    return Having(
        _pglast_parse_predicate(having_clause, namespace=namespace, query_txt=query_txt)
    )
1695
+
1696
+
1697
def _pglast_parse_orderby(
    order_clause: list[dict], *, namespace: QueryNamespace, query_txt: str
) -> OrderBy:
    """Parse the *ORDER BY* clause of a query into its QAL representation.

    Parameters
    ----------
    order_clause : list[dict]
        JSON representation of the *ORDER BY* clause as emitted by pglast.
    namespace : QueryNamespace
        The tables and columns that expressions in the query may reference.
    query_txt : str
        The raw query text. Required because some information (e.g. hint blocks) is dropped by the PG parser and must be
        recovered from the original text.

    Returns
    -------
    OrderBy
        The parsed *ORDER BY* clause.

    Raises
    ------
    ParserError
        If an unknown sort direction or nulls placement is encountered.
    """
    # Map the pglast enum values to the ternary flags used by OrderByExpression
    # (True / False / None, where None means "use the engine default").
    direction_flags = {
        "SORTBY_ASC": True,
        "SORTBY_DESC": False,
        "SORTBY_DEFAULT": None,
    }
    nulls_flags = {
        "SORTBY_NULLS_FIRST": True,
        "SORTBY_NULLS_LAST": False,
        "SORTBY_NULLS_DEFAULT": None,
    }

    orderings: list[OrderByExpression] = []
    for raw_item in order_clause:
        sort_spec = raw_item["SortBy"]
        sort_key = _pglast_parse_expression(
            sort_spec["node"], namespace=namespace, query_txt=query_txt
        )

        direction = sort_spec["sortby_dir"]
        if direction not in direction_flags:
            raise ParserError("Unknown sort direction: " + direction)

        nulls_placement = sort_spec["sortby_nulls"]
        if nulls_placement not in nulls_flags:
            raise ParserError(
                "Unknown nulls placement: " + nulls_placement
            )

        orderings.append(
            OrderByExpression(
                sort_key,
                ascending=direction_flags[direction],
                nulls_first=nulls_flags[nulls_placement],
            )
        )

    return OrderBy(orderings)
1753
+
1754
+
1755
def _pglast_parse_limit(
    pglast_data: dict, *, namespace: QueryNamespace, query_txt: str
) -> Optional[Limit]:
    """Handler method to parse LIMIT and OFFSET clauses.

    This method assumes that the given query actually contains *LIMIT* or *OFFSET* clauses and will fail otherwise.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
        of the ``SelectStmt`` is necessary.
    namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query. Currently unused by this handler, but
        kept for signature uniformity with the other clause parsers.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information from the query that the PG
        parser ignores, such as hint blocks. Here it is also used to recover the *FETCH* direction, which pglast
        normalizes away.

    Returns
    -------
    Limit
        The limit clause. Can be *None* if no meaningful limit nor a meaningful offset is specified.
    """
    raw_limit: Optional[dict] = pglast_data.get("limitCount", None)
    raw_offset: Optional[dict] = pglast_data.get("limitOffset", None)
    if raw_limit is None and raw_offset is None:
        # Neither LIMIT nor OFFSET present -- nothing to build.
        return None

    if raw_limit is not None:
        # for LIMIT ALL there is no second ival, but instead an "isnull" member that is set to true
        # NOTE(review): assumes the pglast shape {"A_Const": {"ival": {"ival": <n>}}} for LIMIT <n> -- TODO confirm
        # against the pglast version in use, especially the LIMIT ALL encoding.
        raw_limit = raw_limit["A_Const"]["ival"]
        nrows: int | None = raw_limit["ival"] if "ival" in raw_limit else None
    else:
        nrows = None
    if raw_offset is not None:
        # Missing inner "ival" is treated as OFFSET 0 (pglast omits zero-valued ints).
        offset: Optional[int] = raw_offset["A_Const"]["ival"].get("ival", 0)
    else:
        offset = None

    # pglast does not record whether the query used LIMIT or the SQL-standard FETCH FIRST/NEXT/PRIOR/LAST syntax,
    # so we fall back to a textual scan of the raw query to recover the fetch direction.
    normalized_query = query_txt.lower()
    contains_standard_limit = (
        "limit" in normalized_query or "fetch first" in normalized_query
    )
    if raw_limit is not None and not contains_standard_limit:
        if "fetch next" in normalized_query:
            fetch_direction = "next"
        elif "fetch prior" in normalized_query:
            fetch_direction = "prior"
        elif "fetch last" in normalized_query:
            fetch_direction = "last"
        else:
            # A row count was parsed but no recognizable LIMIT/FETCH keyword appears in the text.
            raise ParserError("Could not determine LIMIT option")
    else:
        fetch_direction = "first"

    return Limit(limit=nrows, offset=offset, fetch_direction=fetch_direction)
1811
+
1812
+
1813
def _pglast_parse_setop(
    pglast_data: dict, *, parent_namespace: QueryNamespace, query_txt: str
) -> SetQuery:
    """Parse a set operation (UNION / INTERSECT / EXCEPT) into a `SetQuery`.

    This method assumes that the given query is indeed a set operation and will fail otherwise.

    Parameters
    ----------
    pglast_data : dict
        JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
        of the ``SelectStmt`` is necessary.
    parent_namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query.
    query_txt : str
        The raw query text. Required because some information (e.g. hint blocks) is dropped by the PG parser and must be
        recovered from the original text.

    Returns
    -------
    SetQuery
        The parsed set query.

    Raises
    ------
    ParserError
        If the set operation kind is not recognized.
    """
    # Optional WITH clause is attached to the set query as a whole, not to either input.
    with_clause = (
        _pglast_parse_ctes(
            pglast_data["withClause"],
            parent_namespace=parent_namespace,
            query_txt=query_txt,
        )
        if "withClause" in pglast_data
        else None
    )

    # Both inputs are parsed in their own nested namespaces.
    left_query = _pglast_parse_query(
        pglast_data["larg"],
        namespace=parent_namespace.open_nested(source="setop"),
        query_txt=query_txt,
    )
    right_query = _pglast_parse_query(
        pglast_data["rarg"],
        namespace=parent_namespace.open_nested(source="setop"),
        query_txt=query_txt,
    )

    setop_kind = pglast_data["op"]
    if setop_kind == "SETOP_UNION":
        # pglast only sets "all" for UNION ALL; a plain UNION omits the key.
        operator = (
            SetOperator.UnionAll
            if pglast_data.get("all", False)
            else SetOperator.Union
        )
    elif setop_kind == "SETOP_INTERSECT":
        operator = SetOperator.Intersect
    elif setop_kind == "SETOP_EXCEPT":
        operator = SetOperator.Except
    else:
        raise ParserError("Unknown set operation: " + pglast_data["op"])

    # ORDER BY / LIMIT apply to the combined result, hence the parent namespace.
    order_clause = (
        _pglast_parse_orderby(
            pglast_data["sortClause"], namespace=parent_namespace, query_txt=query_txt
        )
        if "sortClause" in pglast_data
        else None
    )
    limit_clause = (
        _pglast_parse_limit(
            pglast_data, namespace=parent_namespace, query_txt=query_txt
        )
        if pglast_data["limitOption"] == "LIMIT_OPTION_COUNT"
        else None
    )

    parent_namespace.determine_output_shape(None)
    return SetQuery(
        left_query,
        right_query,
        set_operation=operator,
        cte_clause=with_clause,
        orderby_clause=order_clause,
        limit_clause=limit_clause,
    )
1893
+
1894
+
1895
+ def _pglast_parse_explain(pglast_data: dict) -> tuple[Optional[Explain], dict]:
1896
+ """Handler method to extract the *EXPLAIN* clause from a query.
1897
+
1898
+ Parameters
1899
+ ----------
1900
+ pglast_data : dict
1901
+ JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
1902
+ of the dictionary is necessary.
1903
+
1904
+ Returns
1905
+ -------
1906
+ tuple[Optional[Explain], dict]
1907
+ The parsed explain clause if one exists as well as the wrapped query. The query representation should be used for all
1908
+ further parsing steps.
1909
+ """
1910
+ if "ExplainStmt" not in pglast_data:
1911
+ return None, pglast_data
1912
+
1913
+ pglast_data = pglast_data["ExplainStmt"]
1914
+ explain_options: list[dict] = pglast_data.get("options", [])
1915
+
1916
+ use_analyze = False
1917
+ output_format = "TEXT"
1918
+ for option in explain_options:
1919
+ definition: dict = option["DefElem"]
1920
+ match definition["defname"]:
1921
+ case "analyze":
1922
+ use_analyze = True
1923
+ case "format":
1924
+ output_format = definition["arg"]["String"]["sval"]
1925
+ case _:
1926
+ raise ParserError("Unknown explain option: " + str(definition))
1927
+
1928
+ explain_clause = Explain(use_analyze, output_format)
1929
+ return explain_clause, pglast_data["query"]
1930
+
1931
+
1932
def _pglast_parse_query(
    stmt: dict, *, namespace: QueryNamespace, query_txt: str
) -> SelectStatement:
    """Main entry point into the parsing logic.

    This function takes a single SQL SELECT query and provides the corresponding `SqlQuery` object.
    While parsing the different expressions, columns are automatically bound to their tables if they use qualified names.
    Otherwise, they are inferred from the database schema if one is given. If no schema is provided, the column will be
    left unbound.

    Parameters
    ----------
    stmt : dict
        The JSON representation of the query. This should be the contents of the ``SelectStmt`` key in the JSON dictionary.
    namespace : QueryNamespace
        The tables and columns that can be referenced by expressions in the query. This is used to register the columns
        that are used in the query.
    query_txt : str
        The raw query text that was passed to the parser. This is used to extract information that the PG parser ignores,
        such as hint blocks.

    Returns
    -------
    SelectStatement
        The parsed query
    """
    # Set operations (UNION / INTERSECT / EXCEPT) are handled by a dedicated code path.
    if stmt["op"] != "SETOP_NONE":
        return _pglast_parse_setop(
            stmt, parent_namespace=namespace, query_txt=query_txt
        )

    # Clauses are collected in a fixed order: WITH, FROM, SELECT, WHERE, GROUP BY, HAVING, ORDER BY, LIMIT.
    # FROM is parsed before SELECT so that the namespace knows all tables when resolving SELECT targets.
    collected: list[BaseClause] = []

    if "withClause" in stmt:
        collected.append(
            _pglast_parse_ctes(
                stmt["withClause"], parent_namespace=namespace, query_txt=query_txt
            )
        )

    if "fromClause" in stmt:
        collected.append(
            _pglast_parse_from(
                stmt["fromClause"], namespace=namespace, query_txt=query_txt
            )
        )

    # Each query is guaranteed to have a SELECT clause, so we can just parse it straight away
    collected.append(
        _pglast_parse_select(stmt, namespace=namespace, query_txt=query_txt)
    )

    if "whereClause" in stmt:
        collected.append(
            _pglast_parse_where(
                stmt["whereClause"], namespace=namespace, query_txt=query_txt
            )
        )

    if "groupClause" in stmt:
        collected.append(
            _pglast_parse_groupby(
                stmt["groupClause"], namespace=namespace, query_txt=query_txt
            )
        )

    if "havingClause" in stmt:
        collected.append(
            _pglast_parse_having(
                stmt["havingClause"], namespace=namespace, query_txt=query_txt
            )
        )

    if "sortClause" in stmt:
        collected.append(
            _pglast_parse_orderby(
                stmt["sortClause"], namespace=namespace, query_txt=query_txt
            )
        )

    if stmt["limitOption"] == "LIMIT_OPTION_COUNT":
        collected.append(
            _pglast_parse_limit(stmt, namespace=namespace, query_txt=query_txt)
        )

    return build_query(collected)
2012
+
2013
+
2014
+ def _pglast_parse_set_commands(pglast_data: list[dict]) -> tuple[list[str], list[dict]]:
2015
+ """Handler method to parse all *SET* commands that precede the actual query.
2016
+
2017
+ Parameters
2018
+ ----------
2019
+ pglast_data : list[dict]
2020
+ JSON encoding of the entire query. The method takes care of accessing the appropriate keys by itself, no preparation
2021
+ of the dictionary is necessary.
2022
+
2023
+ Returns
2024
+ -------
2025
+ tuple[list[str], list[dict]]
2026
+ The parsed *SET* commands as a list of strings and the remaining query data. The query data is "forwarded" to the first
2027
+ encoding that does not represent a *SET* command.
2028
+ """
2029
+ prep_stmts: list[str] = []
2030
+
2031
+ for i, item in enumerate(pglast_data):
2032
+ stmt: dict = item["stmt"]
2033
+ if "VariableSetStmt" not in stmt:
2034
+ break
2035
+
2036
+ var_set_stmt: dict = stmt["VariableSetStmt"]
2037
+ if var_set_stmt["kind"] != "VAR_SET_VALUE":
2038
+ raise ParserError(f"Unknown variable set option: {var_set_stmt}")
2039
+ var_name = var_set_stmt["name"]
2040
+ var_value = var_set_stmt["args"][0]["A_Const"]["sval"]["sval"]
2041
+
2042
+ parsed_stmt = f"SET {var_name} TO '{var_value}';"
2043
+ prep_stmts.append(parsed_stmt)
2044
+
2045
+ return prep_stmts, pglast_data[i:]
2046
+
2047
+
2048
+ def _parse_hint_block(
2049
+ raw_query: str, *, set_cmds: list[str], _current_hint_text: list[str] = None
2050
+ ) -> Optional[Hint]:
2051
+ """Handler method to extract the hint block (i.e. preceding comments) from a query
2052
+
2053
+ Parameters
2054
+ ----------
2055
+ raw_query : str
2056
+ The query text that was passed to the parser. We require access to the raw query, because the PG parser ignores all
2057
+ comments and does not represent them in the AST in any way.
2058
+ set_cmds: list[str]
2059
+ *SET* commands that have already been parsed. These will be added to the hint block.
2060
+ _current_hint_text : list[str], optional
2061
+ Internal parameter to keep track of the current hint text. This is used because the parsing logic uses a recursive
2062
+ implementation.
2063
+
2064
+ Returns
2065
+ -------
2066
+ Optional[Hint]
2067
+ The hint block if any hints were found, or *None* otherwise.
2068
+ """
2069
+ _current_hint_text = _current_hint_text or []
2070
+
2071
+ raw_query = raw_query.lstrip()
2072
+ block_hint = raw_query.startswith("/*")
2073
+ line_hint = raw_query.startswith("--")
2074
+ if not block_hint and not line_hint:
2075
+ prep_stms = "\n".join(set_cmds)
2076
+ hints = "\n".join(_current_hint_text)
2077
+ return Hint(prep_stms, hints) if prep_stms or hints else None
2078
+
2079
+ if line_hint:
2080
+ line_end = raw_query.find("\n")
2081
+ if line_end == -1:
2082
+ # should never be raised b/c parsing should have failed already at this point
2083
+ raise ParserError(f"Unterminated line comment: {raw_query}")
2084
+
2085
+ line_comment = raw_query[:line_end].strip()
2086
+ _current_hint_text.append(line_comment)
2087
+ return _parse_hint_block(
2088
+ raw_query[line_end:],
2089
+ set_cmds=set_cmds,
2090
+ _current_hint_text=_current_hint_text,
2091
+ )
2092
+
2093
+ # must be block hint
2094
+ block_end = raw_query.find("*/")
2095
+ if block_end == -1:
2096
+ # should never be raised b/c parsing should have failed already at this point
2097
+ raise ParserError(f"Unterminated block comment: {raw_query}")
2098
+
2099
+ block_comment = raw_query[: block_end + 2].strip()
2100
+ _current_hint_text.append(block_comment)
2101
+ return _parse_hint_block(
2102
+ raw_query[block_end + 2 :],
2103
+ set_cmds=set_cmds,
2104
+ _current_hint_text=_current_hint_text,
2105
+ )
2106
+
2107
+
2108
def _apply_extra_clauses(
    parsed: SelectStatement, *, hint: Optional[Hint], explain_clause: Optional[Explain]
) -> SelectStatement:
    """Attach an optional hint block and explain clause to an already parsed statement."""
    all_clauses = list(parsed.clauses())
    # Keep the original attachment order: hint first, then explain.
    all_clauses.extend(clause for clause in (hint, explain_clause) if clause)
    return build_query(all_clauses)
2117
+
2118
+
2119
@overload
def parse_query(
    query: str,
    *,
    include_hints: bool = True,
    bind_columns: bool | None = None,
    db_schema: Optional[DBCatalog] = None,
) -> SqlQuery: ...


@overload
def parse_query(
    query: str,
    *,
    accept_set_query: bool,
    include_hints: bool = True,
    bind_columns: Optional[bool] = None,
    db_schema: Optional[DBCatalog] = None,
) -> SelectStatement: ...


def parse_query(
    query: str,
    *,
    accept_set_query: bool = False,
    include_hints: bool = True,
    bind_columns: Optional[bool] = None,
    db_schema: Optional[DBCatalog] = None,
) -> SelectStatement:
    """Parses a query string into a proper `SqlQuery` object.

    During parsing, the appropriate type of SQL query (i.e. with implicit, explicit or mixed *FROM* clause) will be
    inferred automatically. Therefore, this method can potentially return a subclass of `SqlQuery`.

    Once the query has been transformed, a text-based binding process is executed. During this process, the referenced
    tables are normalized such that column references using the table alias are linked to the correct tables that are
    specified in the *FROM* clause (see the module-level documentation for an example). The parsing process can
    optionally also involve a binding process based on the schema of a live database. This is important for all
    remaining columns where the text-based parsing was not possible, e.g. because the column was specified without a
    table alias.

    Parameters
    ----------
    query : str
        The query to parse
    accept_set_query : bool, optional
        Whether set queries are a valid result of the parsing process. If this is *False* (the default), an error will be
        raised if the input query is a set query. This implies that the result of the parsing process is always a `SqlQuery`
        instance. Otherwise, the result can also be a `SetQuery` instance.
    include_hints : bool, optional
        Whether to include hints in the parsed query. If this is *True* (the default), any preceding comments in the query
        text will be parsed as a hint block. Otherwise, these comments are simply ignored.
    bind_columns : bool | None, optional
        Whether to use *live binding*. This does not control the text-based binding, which is always performed. If this
        parameter is *None* (the default), the global `auto_bind_columns` variable will be queried. Depending on its
        value, live binding will be performed or not.
    db_schema : Optional[DBCatalog], optional
        For live binding, this indicates the database to use. If this is *None* (the default), the database will be
        tried to extract from the `DatabasePool`

    Returns
    -------
    SelectStatement
        The parsed SQL query. This is a plain `SqlQuery` unless `accept_set_query` is set and the input is a set query.

    Raises
    ------
    ParserError
        If the query is empty, or if it is a set query while `accept_set_query` is *False*.
    ValueError
        If the input contains multiple statements, or is not a *SELECT* statement.
    """
    # NOTE: this documentation is a 1:1 copy of qal.parse_query. Both should be kept in sync.
    if not query:
        raise ParserError("Empty query")

    if db_schema is None and (
        bind_columns or (bind_columns is None and auto_bind_columns)
    ):
        from ..db import DatabasePool  # local import to prevent circular imports

        db_schema = (
            None
            if DatabasePool.get_instance().empty()
            else DatabasePool.get_instance().current_database().schema()
        )

    pglast_data = json.loads(pglast.parser.parse_sql_json(query))
    stmts = pglast_data["stmts"]

    set_cmds, stmts = _pglast_parse_set_commands(stmts)
    if len(stmts) != 1:
        raise ValueError("Parser can only support single-statement queries for now")
    raw_query: dict = stmts[0]["stmt"]

    if "ExplainStmt" in raw_query:
        explain_clause, raw_query = _pglast_parse_explain(raw_query)
    else:
        explain_clause = None

    if "SelectStmt" not in raw_query:
        raise ValueError("Cannot parse non-SELECT queries")
    stmt = raw_query["SelectStmt"]

    parsed_query = _pglast_parse_query(
        stmt, namespace=QueryNamespace.empty(db_schema), query_txt=query
    )
    # BUG FIX: the original checked `isinstance(query, SetQuery)` -- i.e. the raw input *string* --
    # which can never be a SetQuery, so set queries slipped through even with accept_set_query=False.
    if not accept_set_query and isinstance(parsed_query, SetQuery):
        raise ParserError("Input query is a set query")

    hint = _parse_hint_block(query, set_cmds=set_cmds) if include_hints else None
    parsed_query = _apply_extra_clauses(
        parsed_query, hint=hint, explain_clause=explain_clause
    )

    return parsed_query
2228
+
2229
+
2230
class ParserError(RuntimeError):
    """An error that is raised when parsing fails.

    The offending detail is passed as the exception message, exactly as with a plain
    ``RuntimeError`` -- no custom constructor is needed.
    """
2235
+
2236
+
2237
+ def load_table_json(json_data: dict | str) -> Optional[TableReference]:
2238
+ """Re-creates a table reference from its JSON encoding.
2239
+
2240
+ Parameters
2241
+ ----------
2242
+ json_data : dict | str
2243
+ Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)
2244
+
2245
+ Returns
2246
+ -------
2247
+ Optional[TableReference]
2248
+ The actual table. If the dictionary is empty or otherwise invalid, *None* is returned.
2249
+ """
2250
+ if not json_data:
2251
+ return None
2252
+ json_data = json_data if isinstance(json_data, dict) else json.loads(json_data)
2253
+ return TableReference(
2254
+ json_data.get("full_name", ""),
2255
+ json_data.get("alias", ""),
2256
+ virtual=json_data.get("virtual", False),
2257
+ schema=json_data.get("schemaname", None),
2258
+ )
2259
+
2260
+
2261
+ def load_column_json(json_data: dict | str) -> Optional[ColumnReference]:
2262
+ """Re-creates a column reference from its JSON encoding.
2263
+
2264
+ Parameters
2265
+ ----------
2266
+ json_data : dict | str
2267
+ Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)
2268
+
2269
+ Returns
2270
+ -------
2271
+ Optional[ColumnReference]
2272
+ The actual column. It the dictionary is empty or otherwise invalid, *None* is returned.
2273
+ """
2274
+ if not json_data:
2275
+ return None
2276
+ json_data = json_data if isinstance(json_data, dict) else json.loads(json_data)
2277
+ return ColumnReference(
2278
+ json_data.get("column"), load_table_json(json_data.get("table", None))
2279
+ )
2280
+
2281
+
2282
+ def load_expression_json(json_data: dict | str) -> Optional[SqlExpression]:
2283
+ """Re-creates an arbitrary SQL expression from its JSON encoding.
2284
+
2285
+ Parameters
2286
+ ----------
2287
+ json_data : dict | str
2288
+ Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)
2289
+
2290
+ Returns
2291
+ -------
2292
+ Optional[SqlExpression]
2293
+ The actual expression. If the dictionary is empty or *None*, *None* is returned. Notice that in case of
2294
+ malformed data, errors are raised.
2295
+ """
2296
+ if not json_data:
2297
+ return None
2298
+ json_data = json_data if isinstance(json_data, dict) else json.loads(json_data)
2299
+
2300
+ tables = [load_table_json(table_data) for table_data in json_data.get("tables", [])]
2301
+ expression_str = json_data["expression"]
2302
+ if not tables:
2303
+ emulated_query = f"SELECT {expression_str}"
2304
+ else:
2305
+ from_clause_str = ", ".join(str(tab) for tab in tables)
2306
+ emulated_query = f"SELECT {expression_str} FROM {from_clause_str}"
2307
+
2308
+ parsed_query = parse_query(emulated_query)
2309
+ return parsed_query.select_clause.targets[0].expression
2310
+
2311
+
2312
+ def load_predicate_json(json_data: dict | str) -> Optional[AbstractPredicate]:
2313
+ """Re-creates an arbitrary predicate from its JSON encoding.
2314
+
2315
+ Parameters
2316
+ ----------
2317
+ json_data : dict | str
2318
+ Either the JSON dictionary, or a string encoding of the dictionary (which will be parsed by *json.loads*)
2319
+
2320
+ Returns
2321
+ -------
2322
+ Optional[AbstractPredicate]
2323
+ The actual predicate. If the dictionary is empty or *None*, *None* is returned. Notice that in case of
2324
+ malformed data, errors are raised.
2325
+
2326
+ Raises
2327
+ ------
2328
+ KeyError
2329
+ If the encoding does not specify the tables that are referenced in the predicate
2330
+ KeyError
2331
+ If the encoding does not contain the actual predicate
2332
+ """
2333
+ if not json_data:
2334
+ return None
2335
+ json_data = json_data if isinstance(json_data, dict) else json.loads(json_data)
2336
+
2337
+ tables = [load_table_json(table_data) for table_data in json_data.get("tables", [])]
2338
+ if not tables:
2339
+ raise KeyError("Predicate needs at least one table!")
2340
+ from_clause_str = ", ".join(str(tab) for tab in tables)
2341
+ predicate_str = json_data["predicate"]
2342
+ emulated_query = f"SELECT * FROM {from_clause_str} WHERE {predicate_str}"
2343
+ parsed_query = parse_query(emulated_query)
2344
+ return parsed_query.where_clause.predicate