aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,496 @@
1
+ """LLM prompts and context-building utilities for synthetic intent
2
+ expansion.
3
+
4
+ Defines the system prompt and per-operator user prompts (A1-A10, B1-B5) used when calling the LLM during expansion. Provides helpers that format the current intent state and schema details into context strings, and the main ``llm_expand_operator`` entry point that invokes the LLM and parses the returned JSON expansion list.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from .config import SimulatorConfig
12
+ from .contracts_base import ColumnRole, SchemaGraph, TableRole
13
+ from .contracts_core import SimulatorIntent
14
+ from .core_utils import debug, llm_json
15
+
16
# System prompt sent with every expansion request; it is passed to the LLM
# verbatim (no placeholders are substituted into it).
_EXPANSION_SYSTEM = """You are an expert SQL analyst generating semantically meaningful query variations for training data.

RULES:
1. Output MUST be valid JSON array of objects
2. Each expansion must be semantically distinct from current state
3. Prefer columns with high cardinality for filters
4. Maintain referential integrity with foreign keys
5. Consider business meaning when selecting columns/operations
6. Never duplicate existing conditions
7. Return empty array [] if no valid expansions exist"""

# Per-operator user prompt templates, keyed by operator code (A1-A10, B1-B5).
# Each template is rendered with ``str.format`` and expects exactly two
# placeholders: ``{max_variants}`` and ``{context}``. Literal JSON braces in the
# OUTPUT FORMAT examples are therefore doubled (``{{`` / ``}}``).
_EXPANSION_PROMPTS = {
    "A1": """## Task: Add Filter Condition (A1)
Generate {max_variants} filter conditions for this query.

{context}

OUTPUT FORMAT (JSON array):
[{{"column": "table.column", "op": "=|!=|>|<|>=|<=|LIKE|IN|BETWEEN", "value_type": "string|number|date|list"}}]

GUIDELINES:
- Choose columns that would logically filter this data
- Prefer dimension columns (customer_id, product_category, etc.)
- Match operator to data type (LIKE for strings, comparison for numbers/dates)
- value_type guides what parameter will be used (not the actual value)
- Consider business scenarios (active customers, recent orders, etc.)""",
    "A2": """## Task: Add Expr-to-Expr Comparison (A2)
Generate {max_variants} expr comparison conditions (filter expr-vs-expr).

{context}

OUTPUT FORMAT (JSON array):
[{{"left_col": "table.column", "op": "=|!=|>|<|>=|<=", "right_col": "table.column", "reason": "business meaning"}}]

GUIDELINES:
- Expressions must be comparable types (both numeric, both dates, etc.)
- Look for business relationships (ship_date > order_date, actual vs budget)
- Can compare across tables if joined
- Explain the business logic of the comparison""",
    "A3": """## Task: Change Aggregation Function (A3)
Generate {max_variants} alternative aggregation approaches.

{context}

OUTPUT FORMAT (JSON array):
[{{"column": "table.column", "agg_func": "COUNT|SUM|AVG|MIN|MAX", "reason": "brief explanation"}}]

GUIDELINES:
- SUM/AVG only for numeric columns
- COUNT works for any column
- MIN/MAX for comparable types
- Consider what makes business sense (total sales, average order size, etc.)""",
    "A4": """## Task: Add GROUP BY Column (A4)
Generate {max_variants} grouping columns to add.

{context}

OUTPUT FORMAT (JSON array):
[{{"column": "table.column", "reason": "brief business justification"}}]

GUIDELINES:
- Choose columns with reasonable cardinality (not too high, not too low)
- Prefer dimension attributes (category, region, time period)
- Column must be from tables already in the query
- Consider hierarchical groupings (year→month, country→city)""",
    "A5": """## Task: Add ORDER BY Clause (A5)
Generate {max_variants} ordering options.

{context}

OUTPUT FORMAT (JSON array):
[{{"column": "table.column", "direction": "ASC|DESC", "agg_func": "COUNT|SUM|AVG|MIN|MAX|null"}}]

GUIDELINES:
- For grouped queries, can order by aggregated values
- DESC for "top N" scenarios (highest sales, most orders)
- ASC for chronological or alphabetical ordering
- agg_func only needed when ordering by an aggregation not in select""",
    "A6": """## Task: Add HAVING Value Filter (A6)
Generate {max_variants} HAVING conditions for grouped results.

{context}

OUTPUT FORMAT (JSON array):
[{{"agg_func": "COUNT|SUM|AVG|MIN|MAX", "column": "table.column", "op": "=|!=|>|<|>=|<=", "value_type": "number"}}]

GUIDELINES:
- HAVING filters on aggregated values against a threshold
- Common patterns: COUNT(*) > N, SUM(amount) > threshold
- Must have GROUP BY in query
- Consider business thresholds (high-value customers, active products)""",
    "A7": """## Task: Add HAVING Expression Comparison (A7)
Generate {max_variants} HAVING conditions comparing two aggregated expressions.

{context}

OUTPUT FORMAT (JSON array):
[{{"left_agg": "COUNT|SUM|AVG|MIN|MAX", "left_col": "table.column", "op": "=|!=|>|<|>=|<=", "right_agg": "COUNT|SUM|AVG|MIN|MAX", "right_col": "table.column", "reason": "business meaning"}}]

GUIDELINES:
- Compare two aggregated expressions (e.g. AVG(table.column) > MIN(table.column))
- Both sides must use valid aggregation functions
- Must have GROUP BY in query
- Consider business comparisons (average vs minimum, total vs count)""",
    "A8": """## Task: Remove Filter (A8)
Select {max_variants} filters to remove for broader results.

{context}

OUTPUT FORMAT (JSON array):
[{{"left_col": "table.column", "op": "operator", "reason": "why removing makes sense"}}]

GUIDELINES:
- Identify restrictive filters that could be relaxed
- Removing filter should still produce meaningful query
- Consider which filters are optional vs essential
- Explain business rationale for removal""",
    "A9": """## Task: Remove GROUP BY Column (A9)
Select {max_variants} grouping columns to remove.

{context}

OUTPUT FORMAT (JSON array):
[{{"column": "table.column", "reason": "why removing makes sense"}}]

GUIDELINES:
- Removing creates higher-level aggregation
- Keep at least one grouping column if query needs grouping
- Consider dimensional hierarchy (remove month to group by year only)
- Explain business rationale""",
    # A10 must be able to name any HAVING condition that A6/A7 can create, so
    # its agg_func vocabulary includes MIN and MAX as well.
    "A10": """## Task: Remove HAVING Condition (A10)
Select {max_variants} HAVING conditions to remove.

{context}

OUTPUT FORMAT (JSON array):
[{{"agg_func": "COUNT|SUM|AVG|MIN|MAX", "column": "table.column", "op": "operator", "reason": "why removing makes sense"}}]

GUIDELINES:
- Removing broadens result set
- Consider if threshold is too restrictive
- Explain business rationale for relaxation""",
    "B1": """## Task: Add Dimension Table Join (B1)
Generate {max_variants} dimension tables to join.

{context}

OUTPUT FORMAT (JSON array):
[{{"table": "table_name", "join_via": "existing_table", "reason": "what this enables"}}]

GUIDELINES:
- Choose dimension tables connected via FK to existing tables
- Consider what new attributes become available
- Explain analytical value of the join
- Prefer tables that add meaningful context""",
    "B2": """## Task: Add Fact Table Join (B2)
Generate {max_variants} fact tables to join.

{context}

OUTPUT FORMAT (JSON array):
[{{"table": "table_name", "join_via": "existing_table", "reason": "what metrics this enables"}}]

GUIDELINES:
- Choose fact tables connected to existing dimensions
- Consider what new measures become available
- Explain analytical value (combining sales with inventory, etc.)
- Be careful about many-to-many relationships""",
    "B3": """## Task: Swap Dimension Table (B3)
Generate {max_variants} dimension table swaps.

{context}

OUTPUT FORMAT (JSON array):
[{{"remove": "old_table", "add": "new_table", "reason": "why this swap makes sense"}}]

GUIDELINES:
- New table must connect to same fact tables
- Consider alternative grouping perspectives
- Explain what different insights the swap provides
- Both tables should be dimensions""",
    "B4": """## Task: Remove Table (B4)
Select {max_variants} tables to remove.

{context}

OUTPUT FORMAT (JSON array):
[{{"table": "table_name", "reason": "why removal simplifies without losing key data"}}]

GUIDELINES:
- Only remove dimension tables
- Remaining tables must stay connected
- Remove tables not essential to query purpose
- Consider if columns from table are used in SELECT/WHERE/GROUP BY""",
    "B5": """## Task: Add Bridge/Intermediate Table (B5)
Generate {max_variants} bridge tables to add.

{context}

OUTPUT FORMAT (JSON array):
[{{"table": "table_name", "connects": ["table1", "table2"], "reason": "what relationship this enables"}}]

GUIDELINES:
- Bridge tables connect two dimension tables
- Often represent many-to-many relationships
- Consider junction/association tables in schema
- Explain what analytical capability is enabled""",
}
224
+
225
+
226
def _format_intent_state(intent: SimulatorIntent) -> str:
    """Render the intent's current query shape as plain text for the LLM.

    Args:
        intent: The SimulatorIntent to describe.

    Returns:
        Newline-joined summary. The ``Tables`` and ``Grain`` lines are always present; ``Select``, ``Filters``, ``Group By``, ``Order By`` and ``Having`` lines are emitted only when the matching intent attribute is non-empty.
    """

    def _render_filter(flt) -> str:
        # Expr-vs-expr filters show both columns; value filters show the
        # parameter's value type in brackets instead of a concrete value.
        left = flt.left_expr.primary_column
        if flt.right_expr:
            return f"{left} {flt.op} {flt.right_expr.primary_column}"
        return f"{left} {flt.op} [{flt.value_type}]"

    summary = [
        f"Tables: {', '.join(intent.tables or [])}",
        f"Grain: {intent.grain}",
    ]

    if intent.select_cols:
        selected = ", ".join(col.expr.primary_term for col in intent.select_cols)
        summary.append(f"Select: {selected}")

    if intent.filters_param:
        filters = ", ".join(_render_filter(flt) for flt in intent.filters_param)
        summary.append(f"Filters: {filters}")

    if intent.group_by_cols:
        grouped = ", ".join(col.primary_column for col in intent.group_by_cols)
        summary.append(f"Group By: {grouped}")

    if intent.order_by_cols:
        ordering = ", ".join(f"{item.expr.primary_term} {item.direction}" for item in intent.order_by_cols)
        summary.append(f"Order By: {ordering}")

    if intent.having_param:
        having = ", ".join(
            f"{cond.left_expr.primary_term} {cond.op} [{cond.value_type}]" for cond in intent.having_param
        )
        summary.append(f"Having: {having}")

    return "\n".join(summary)
270
+
271
+
272
def _format_column_details(column_metadata: dict[str, dict[str, dict[str, Any]]], tables: list[str]) -> str:
    """Describe the columns of the requested tables for the LLM context.

    Args:
        column_metadata: Nested mapping of table -> column -> metadata dict. The keys read here are ``data_type``, ``role``, ``nullable`` and ``cardinality``; each is optional.
        tables: Table names to describe, in output order. Names absent from ``column_metadata`` are skipped.

    Returns:
        Newline-joined text with a ``<table>:`` header per table followed by one bullet per column giving data type, role, nullability and (when truthy) cardinality.
    """
    rendered = []
    for table_name in (name for name in tables if name in column_metadata):
        rendered.append(f"\n{table_name}:")
        for column, meta in column_metadata[table_name].items():
            # A missing "nullable" flag is treated as nullable.
            null_label = "nullable" if meta.get("nullable", True) else "required"
            cardinality = meta.get("cardinality", "")
            suffix = f", cardinality={cardinality}" if cardinality else ""
            rendered.append(
                f" - {column}: {meta.get('data_type', 'unknown')} ({meta.get('role', '')}, {null_label}{suffix})"
            )
    return "\n".join(rendered)
295
+
296
+
297
# Column ``data_type`` values treated as numeric when listing aggregatable
# columns for the A3 and A7 operators.
_NUMERIC_DATA_TYPES = (
    "integer",
    "decimal",
    "float",
    "numeric",
    "double",
    "bigint",
    "smallint",
    "real",
)


def _matching_column_lines(
    tables: list[str],
    column_metadata: dict[str, dict[str, dict[str, Any]]],
    key: str,
    allowed: tuple,
) -> list[str]:
    """Return one `` <table>: <col>, ...`` line per table having columns whose ``key`` metadata is in ``allowed``."""
    lines = []
    for table in tables:
        if table not in column_metadata:
            continue
        matches = [col for col, info in column_metadata[table].items() if info.get(key) in allowed]
        if matches:
            lines.append(f" {table}: {', '.join(matches)}")
    return lines


def _build_expansion_context(
    intent: SimulatorIntent,
    operator: str,
    schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
    fk_map: dict | None = None,
) -> str:
    """Build operator-specific context for LLM expansion prompts.

    Always emits the current intent state and the column details of the intent's tables, then appends a supplementary section for operators that have one: filter-suitable columns for A1, same-typed column groups for A2, numeric columns for A3/A7, and schema/FK topology for any B-series operator. Other operators get only the two common sections.

    Args:
        intent: The SimulatorIntent being expanded.
        operator: Operator code (e.g. ``"A1"``, ``"B3"``).
        schema: The schema graph, used for B-series table role context.
        column_metadata: Pre-built column metadata dict (table -> column -> metadata).
        fk_map: Pre-built FK map; optional, forwarded to the B-series context builder.

    Returns:
        Formatted context string ready for inclusion in the LLM prompt.
    """
    tables = intent.tables or []
    context_parts = [
        "### Current Query State",
        _format_intent_state(intent),
        "\n### Available Columns",
        _format_column_details(column_metadata, tables),
    ]

    if operator == "A1":
        context_parts.append("\n### Filter-Suitable Columns")
        filter_roles = (
            ColumnRole.CATEGORICAL.value,
            ColumnRole.TEMPORAL.value,
            ColumnRole.IDENTIFIER.value,
        )
        context_parts.extend(_matching_column_lines(tables, column_metadata, "role", filter_roles))

    elif operator == "A2":
        context_parts.append("\n### Comparable Column Pairs")
        # Group fully-qualified columns by data type; only types with at least
        # two columns can form a comparable pair.
        type_groups: dict[Any, list[str]] = {}
        for table in tables:
            if table not in column_metadata:
                continue
            for col, info in column_metadata[table].items():
                type_groups.setdefault(info.get("data_type", "unknown"), []).append(f"{table}.{col}")
        for dtype, cols in type_groups.items():
            if len(cols) >= 2:
                # Cap each group at 10 columns to keep the prompt compact.
                context_parts.append(f" {dtype}: {', '.join(cols[:10])}")

    elif operator == "A3":
        context_parts.append("\n### Aggregatable Columns (numeric)")
        context_parts.extend(_matching_column_lines(tables, column_metadata, "data_type", _NUMERIC_DATA_TYPES))

    elif operator == "A7":
        context_parts.append("\n### Aggregatable Columns for HAVING Comparison")
        context_parts.extend(_matching_column_lines(tables, column_metadata, "data_type", _NUMERIC_DATA_TYPES))

    elif operator.startswith("B"):
        context_parts.append(_build_b_series_context(schema, tables, fk_map))

    return "\n".join(context_parts)
407
+
408
+
409
def _build_b_series_context(schema: SchemaGraph, current_tables: list[str], fk_map: dict | None = None) -> str:
    """Build additional context for B-series join operators.

    Lists candidate tables (those not already in the intent) grouped by role, then describes the FK connections between current intent tables and other tables.

    Args:
        schema: The schema graph; ``schema.tables`` is iterated and each entry's ``role`` is compared against the ``TableRole`` values.
        current_tables: List of table names currently in the intent.
        fk_map: Pre-built FK map of table -> list of FK dicts (each read via its ``target_table`` key). The FK section is omitted when this is None or empty.

    Returns:
        Formatted string describing schema relationships for B-series LLM prompts.
    """
    current = set(current_tables)
    lines = ["\n### Schema Relationships", "\nAvailable Tables (by role):"]

    for role in (TableRole.FACT, TableRole.DIMENSION, TableRole.BRIDGE):
        candidates = [t for t in schema.tables if schema.tables[t].role == role.value and t not in current]
        if candidates:
            # Cap at 10 names per role to keep the prompt compact.
            lines.append(f" {role.value}: {', '.join(candidates[:10])}")

    if fk_map:
        lines.append("\nForeign Key Connections:")

        # Outgoing edges: current table -> tables it references. Missing target
        # names are dropped and repeated targets collapsed (order preserved) so
        # the line never renders as "orders -> , address, address".
        for table in current_tables:
            raw_targets = (fk.get("target_table") for fk in fk_map.get(table, []))
            targets = list(dict.fromkeys(t for t in raw_targets if t))
            if targets:
                lines.append(f" {table} -> {', '.join(targets)}")

        # Incoming edges: the first FK of each outside table that points at a
        # current table (one line per outside table).
        for other_table, fks in fk_map.items():
            if other_table in current:
                continue
            for fk in fks:
                target = fk.get("target_table")
                if target in current:
                    lines.append(f" {other_table} -> {target}")
                    break

    return "\n".join(lines)
447
+
448
+
449
def llm_expand_operator(
    intent: SimulatorIntent,
    operator: str,
    schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
    fk_map: dict | None = None,
) -> list[dict[str, Any]]:
    """Call the LLM and return a list of valid expansion suggestions for
    an operator.

    Builds the operator-specific prompt, calls ``llm_json``, and normalises the response to a plain list of dicts. A dict response is unwrapped via its ``"expansions"`` key (falling back to ``"results"``). Returns an empty list if the operator is unknown, the LLM returns an unexpected structure, or the call raises an exception.

    Args:
        intent: The SimulatorIntent being expanded.
        operator: Operator code matching a key in ``_EXPANSION_PROMPTS``.
        schema: The schema graph for context building.
        column_metadata: Pre-built column metadata dict.
        fk_map: Pre-built FK map; optional.

    Returns:
        List of expansion dicts as returned by the LLM (structure varies by operator). Non-dict items in the LLM's list are dropped.
    """
    prompt_template = _EXPANSION_PROMPTS.get(operator)
    if prompt_template is None:
        debug(f"[expansion_rules.llm_expand_operator] unknown operator: {operator}")
        return []

    context = _build_expansion_context(intent, operator, schema, column_metadata, fk_map)
    prompt = prompt_template.format(max_variants=SimulatorConfig.MAX_EXPANSION_VARIANTS, context=context)

    debug(f"[expansion_rules.llm_expand_operator] calling LLM for operator={operator}")

    # Deliberate best-effort boundary: any failure of the LLM call degrades to
    # "no expansions". Only the call itself is guarded, so a malformed response
    # is reported as such below rather than as a failed call.
    try:
        result = llm_json(_EXPANSION_SYSTEM, prompt)
    except Exception as e:
        debug(f"[expansion_rules.llm_expand_operator] LLM call failed: {e}")
        return []

    if isinstance(result, dict):
        candidates = result.get("expansions", result.get("results", []))
    else:
        candidates = result

    if not isinstance(candidates, list):
        debug(
            "[expansion_rules.llm_expand_operator] unexpected LLM response structure: "
            f"{type(candidates).__name__}"
        )
        return []

    # Honour the declared return type: keep only object-shaped suggestions.
    expansions = [item for item in candidates if isinstance(item, dict)]

    debug(f"[expansion_rules.llm_expand_operator] LLM returned {len(expansions)} expansion(s)")
    return expansions