PyPI - aetherdialect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

aetherdialect-0.1.0.dist-info/METADATA +197 -0
aetherdialect-0.1.0.dist-info/RECORD +34 -0
aetherdialect-0.1.0.dist-info/WHEEL +5 -0
aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
text2sql/__init__.py +7 -0
text2sql/config.py +1063 -0
text2sql/contracts_base.py +952 -0
text2sql/contracts_core.py +1890 -0
text2sql/core_utils.py +834 -0
text2sql/dialect.py +1134 -0
text2sql/expansion_ops.py +1218 -0
text2sql/expansion_rules.py +496 -0
text2sql/intent_expr.py +1759 -0
text2sql/intent_process.py +2133 -0
text2sql/intent_repair.py +1733 -0
text2sql/intent_resolve.py +1292 -0
text2sql/live_testing.py +1117 -0
text2sql/main_execution.py +799 -0
text2sql/pipeline.py +1662 -0
text2sql/qsim_ops.py +1286 -0
text2sql/qsim_sample.py +609 -0
text2sql/qsim_struct.py +569 -0
text2sql/schema.py +973 -0
text2sql/schema_profiling.py +2075 -0
text2sql/simulator.py +970 -0
text2sql/sql_gen.py +1537 -0
text2sql/templates.py +1037 -0
text2sql/text2sql.py +726 -0
text2sql/utils.py +973 -0
text2sql/validation_agg.py +1033 -0
text2sql/validation_execute.py +1092 -0
text2sql/validation_schema.py +1847 -0
text2sql/validation_semantic.py +2122 -0

text2sql/expansion_rules.py ADDED Viewed

@@ -0,0 +1,496 @@
+"""LLM prompts and context-building utilities for synthetic intent
+expansion.
+Defines the system prompt and per-operator user prompts (A1-A10, B1-B5) used when calling the LLM during expansion. Provides helpers that format the current intent state and schema details into context strings, and the main ``llm_expand_operator`` entry point that invokes the LLM and parses the returned JSON expansion list.
+"""
+from __future__ import annotations
+from typing import Any
+from .config import SimulatorConfig
+from .contracts_base import ColumnRole, SchemaGraph, TableRole
+from .contracts_core import SimulatorIntent
+from .core_utils import debug, llm_json
+# System prompt shared by every expansion request: ``llm_expand_operator`` passes it as the first argument to ``llm_json``, followed by the operator-specific user prompt.
+# Rule 7 asks the model for a bare ``[]`` when it has nothing to propose.
+_EXPANSION_SYSTEM = """You are an expert SQL analyst generating semantically meaningful query variations for training data.
+RULES:
+1. Output MUST be valid JSON array of objects
+2. Each expansion must be semantically distinct from current state
+3. Prefer columns with high cardinality for filters
+4. Maintain referential integrity with foreign keys
+5. Consider business meaning when selecting columns/operations
+6. Never duplicate existing conditions
+7. Return empty array [] if no valid expansions exist"""
+# User-prompt templates keyed by operator code; ``llm_expand_operator`` looks the operator up here and rejects unknown codes.
+# Each template is rendered with ``str.format(max_variants=..., context=...)``, so the only single-brace fields are ``{max_variants}`` and ``{context}``; the literal JSON braces in the OUTPUT FORMAT examples are doubled (``{{`` / ``}}``) to survive formatting.
+_EXPANSION_PROMPTS = {
+    # --- A-series: edits to clauses of the query (filters, aggregation, GROUP BY, ORDER BY, HAVING) ---
+    "A1": """## Task: Add Filter Condition (A1)
+Generate {max_variants} filter conditions for this query.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"column": "table.column", "op": "=|!=|>|<|>=|<=|LIKE|IN|BETWEEN", "value_type": "string|number|date|list"}}]
+GUIDELINES:
+- Choose columns that would logically filter this data
+- Prefer dimension columns (customer_id, product_category, etc.)
+- Match operator to data type (LIKE for strings, comparison for numbers/dates)
+- value_type guides what parameter will be used (not the actual value)
+- Consider business scenarios (active customers, recent orders, etc.)""",
+    "A2": """## Task: Add Expr-to-Expr Comparison (A2)
+Generate {max_variants} expr comparison conditions (filter expr-vs-expr).
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"left_col": "table.column", "op": "=|!=|>|<|>=|<=", "right_col": "table.column", "reason": "business meaning"}}]
+GUIDELINES:
+- Expressions must be comparable types (both numeric, both dates, etc.)
+- Look for business relationships (ship_date > order_date, actual vs budget)
+- Can compare across tables if joined
+- Explain the business logic of the comparison""",
+    "A3": """## Task: Change Aggregation Function (A3)
+Generate {max_variants} alternative aggregation approaches.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"column": "table.column", "agg_func": "COUNT|SUM|AVG|MIN|MAX", "reason": "brief explanation"}}]
+GUIDELINES:
+- SUM/AVG only for numeric columns
+- COUNT works for any column
+- MIN/MAX for comparable types
+- Consider what makes business sense (total sales, average order size, etc.)""",
+    "A4": """## Task: Add GROUP BY Column (A4)
+Generate {max_variants} grouping columns to add.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"column": "table.column", "reason": "brief business justification"}}]
+GUIDELINES:
+- Choose columns with reasonable cardinality (not too high, not too low)
+- Prefer dimension attributes (category, region, time period)
+- Column must be from tables already in the query
+- Consider hierarchical groupings (year→month, country→city)""",
+    "A5": """## Task: Add ORDER BY Clause (A5)
+Generate {max_variants} ordering options.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"column": "table.column", "direction": "ASC|DESC", "agg_func": "COUNT|SUM|AVG|MIN|MAX|null"}}]
+GUIDELINES:
+- For grouped queries, can order by aggregated values
+- DESC for "top N" scenarios (highest sales, most orders)
+- ASC for chronological or alphabetical ordering
+- agg_func only needed when ordering by an aggregation not in select""",
+    "A6": """## Task: Add HAVING Value Filter (A6)
+Generate {max_variants} HAVING conditions for grouped results.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"agg_func": "COUNT|SUM|AVG|MIN|MAX", "column": "table.column", "op": "=|!=|>|<|>=|<=", "value_type": "number"}}]
+GUIDELINES:
+- HAVING filters on aggregated values against a threshold
+- Common patterns: COUNT(*) > N, SUM(amount) > threshold
+- Must have GROUP BY in query
+- Consider business thresholds (high-value customers, active products)""",
+    "A7": """## Task: Add HAVING Expression Comparison (A7)
+Generate {max_variants} HAVING conditions comparing two aggregated expressions.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"left_agg": "COUNT|SUM|AVG|MIN|MAX", "left_col": "table.column", "op": "=|!=|>|<|>=|<=", "right_agg": "COUNT|SUM|AVG|MIN|MAX", "right_col": "table.column", "reason": "business meaning"}}]
+GUIDELINES:
+- Compare two aggregated expressions (e.g. AVG(table.column) > MIN(table.column))
+- Both sides must use valid aggregation functions
+- Must have GROUP BY in query
+- Consider business comparisons (average vs minimum, total vs count)""",
+    "A8": """## Task: Remove Filter (A8)
+Select {max_variants} filters to remove for broader results.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"left_col": "table.column", "op": "operator", "reason": "why removing makes sense"}}]
+GUIDELINES:
+- Identify restrictive filters that could be relaxed
+- Removing filter should still produce meaningful query
+- Consider which filters are optional vs essential
+- Explain business rationale for removal""",
+    "A9": """## Task: Remove GROUP BY Column (A9)
+Select {max_variants} grouping columns to remove.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"column": "table.column", "reason": "why removing makes sense"}}]
+GUIDELINES:
+- Removing creates higher-level aggregation
+- Keep at least one grouping column if query needs grouping
+- Consider dimensional hierarchy (remove month to group by year only)
+- Explain business rationale""",
+    # NOTE(review): the agg_func list below is COUNT|SUM|AVG only, whereas the A6/A7 templates that add HAVING conditions also allow MIN|MAX — confirm whether the narrower list is intentional.
+    "A10": """## Task: Remove HAVING Condition (A10)
+Select {max_variants} HAVING conditions to remove.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"agg_func": "COUNT|SUM|AVG", "column": "table.column", "op": "operator", "reason": "why removing makes sense"}}]
+GUIDELINES:
+- Removing broadens result set
+- Consider if threshold is too restrictive
+- Explain business rationale for relaxation""",
+    # --- B-series: edits to the set of tables in the query (add/swap/remove joins, bridge tables) ---
+    "B1": """## Task: Add Dimension Table Join (B1)
+Generate {max_variants} dimension tables to join.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"table": "table_name", "join_via": "existing_table", "reason": "what this enables"}}]
+GUIDELINES:
+- Choose dimension tables connected via FK to existing tables
+- Consider what new attributes become available
+- Explain analytical value of the join
+- Prefer tables that add meaningful context""",
+    "B2": """## Task: Add Fact Table Join (B2)
+Generate {max_variants} fact tables to join.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"table": "table_name", "join_via": "existing_table", "reason": "what metrics this enables"}}]
+GUIDELINES:
+- Choose fact tables connected to existing dimensions
+- Consider what new measures become available
+- Explain analytical value (combining sales with inventory, etc.)
+- Be careful about many-to-many relationships""",
+    "B3": """## Task: Swap Dimension Table (B3)
+Generate {max_variants} dimension table swaps.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"remove": "old_table", "add": "new_table", "reason": "why this swap makes sense"}}]
+GUIDELINES:
+- New table must connect to same fact tables
+- Consider alternative grouping perspectives
+- Explain what different insights the swap provides
+- Both tables should be dimensions""",
+    "B4": """## Task: Remove Table (B4)
+Select {max_variants} tables to remove.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"table": "table_name", "reason": "why removal simplifies without losing key data"}}]
+GUIDELINES:
+- Only remove dimension tables
+- Remaining tables must stay connected
+- Remove tables not essential to query purpose
+- Consider if columns from table are used in SELECT/WHERE/GROUP BY""",
+    "B5": """## Task: Add Bridge/Intermediate Table (B5)
+Generate {max_variants} bridge tables to add.
+{context}
+OUTPUT FORMAT (JSON array):
+[{{"table": "table_name", "connects": ["table1", "table2"], "reason": "what relationship this enables"}}]
+GUIDELINES:
+- Bridge tables connect two dimension tables
+- Often represent many-to-many relationships
+- Consider junction/association tables in schema
+- Explain what analytical capability is enabled""",
+}
+def _format_intent_state(intent: SimulatorIntent) -> str:
+    """Format current intent state for LLM context.
+    Args:
+        intent: The SimulatorIntent whose state should be formatted.
+    Returns:
+        Multi-line string summarising tables, grain, select columns, filters, group-by, order-by, and having clauses.
+    """
+    lines = []
+    lines.append(f"Tables: {', '.join(intent.tables or [])}")
+    lines.append(f"Grain: {intent.grain}")
+    if intent.select_cols:
+        cols_info = []
+        for sc in intent.select_cols:
+            cols_info.append(sc.expr.primary_term)
+        lines.append(f"Select: {', '.join(cols_info)}")
+    if intent.filters_param:
+        filter_strs = []
+        for f in intent.filters_param:
+            if f.right_expr:
+                filter_strs.append(f"{f.left_expr.primary_column} {f.op} {f.right_expr.primary_column}")
+            else:
+                filter_strs.append(f"{f.left_expr.primary_column} {f.op} [{f.value_type}]")
+        lines.append(f"Filters: {', '.join(filter_strs)}")
+    if intent.group_by_cols:
+        lines.append(f"Group By: {', '.join(g.primary_column for g in intent.group_by_cols)}")
+    if intent.order_by_cols:
+        order_strs = []
+        for o in intent.order_by_cols:
+            order_strs.append(f"{o.expr.primary_term} {o.direction}")
+        lines.append(f"Order By: {', '.join(order_strs)}")
+    if intent.having_param:
+        having_strs = []
+        for h in intent.having_param:
+            having_strs.append(f"{h.left_expr.primary_term} {h.op} [{h.value_type}]")
+        lines.append(f"Having: {', '.join(having_strs)}")
+    return "\n".join(lines)
+def _format_column_details(column_metadata: dict[str, dict[str, dict[str, Any]]], tables: list[str]) -> str:
+    """Format column details for LLM context.
+    Args:
+        column_metadata: Nested dict of table -> column -> metadata produced by ``_build_column_metadata_for_validation``.
+        tables: List of table names to include.
+    Returns:
+        Multi-line string listing each column with its data type, role, nullability, and cardinality for the requested tables.
+    """
+    lines = []
+    for table in tables:
+        if table not in column_metadata:
+            continue
+        lines.append(f"\n{table}:")
+        for col_name, col_info in column_metadata[table].items():
+            dtype = col_info.get("data_type", "unknown")
+            role = col_info.get("role", "")
+            nullable = "nullable" if col_info.get("nullable", True) else "required"
+            card = col_info.get("cardinality", "")
+            card_str = f", cardinality={card}" if card else ""
+            lines.append(f"  - {col_name}: {dtype} ({role}, {nullable}{card_str})")
+    return "\n".join(lines)
+def _build_expansion_context(
+    intent: SimulatorIntent,
+    operator: str,
+    schema: SchemaGraph,
+    column_metadata: dict[str, dict[str, dict[str, Any]]],
+    fk_map: dict = None,
+) -> str:
+    """Build operator-specific context for LLM expansion prompts.
+    Combines the current intent state and available column details with operator-specific supplementary context (filterable columns for A1, comparable pairs for A2, numeric columns for A3/A7, FK topology for B-series).
+    Args:
+        intent: The SimulatorIntent being expanded.
+        operator: Operator code (e.g. ``"A1"``, ``"B3"``).
+        schema: The schema graph, used for B-series table role context.
+        column_metadata: Pre-built column metadata dict.
+        fk_map: Pre-built FK map; optional.
+    Returns:
+        Formatted context string ready for inclusion in the LLM prompt.
+    """
+    context_parts = []
+    context_parts.append("### Current Query State")
+    context_parts.append(_format_intent_state(intent))
+    context_parts.append("\n### Available Columns")
+    context_parts.append(_format_column_details(column_metadata, intent.tables or []))
+    if operator.startswith("A"):
+        if operator == "A1":
+            context_parts.append("\n### Filter-Suitable Columns")
+            for table in intent.tables or []:
+                if table in column_metadata:
+                    filterable = [
+                        c
+                        for c, info in column_metadata[table].items()
+                        if info.get("role")
+                        in (
+                            ColumnRole.CATEGORICAL.value,
+                            ColumnRole.TEMPORAL.value,
+                            ColumnRole.IDENTIFIER.value,
+                        )
+                    ]
+                    if filterable:
+                        context_parts.append(f"  {table}: {', '.join(filterable)}")
+        elif operator == "A2":
+            context_parts.append("\n### Comparable Column Pairs")
+            type_groups = {}
+            for table in intent.tables or []:
+                if table in column_metadata:
+                    for col, info in column_metadata[table].items():
+                        dtype = info.get("data_type", "unknown")
+                        full_col = f"{table}.{col}"
+                        if dtype not in type_groups:
+                            type_groups[dtype] = []
+                        type_groups[dtype].append(full_col)
+            for dtype, cols in type_groups.items():
+                if len(cols) >= 2:
+                    context_parts.append(f"  {dtype}: {', '.join(cols[:10])}")
+        elif operator == "A3":
+            context_parts.append("\n### Aggregatable Columns (numeric)")
+            for table in intent.tables or []:
+                if table in column_metadata:
+                    numeric = [
+                        c
+                        for c, info in column_metadata[table].items()
+                        if info.get("data_type")
+                        in (
+                            "integer",
+                            "decimal",
+                            "float",
+                            "numeric",
+                            "double",
+                            "bigint",
+                            "smallint",
+                            "real",
+                        )
+                    ]
+                    if numeric:
+                        context_parts.append(f"  {table}: {', '.join(numeric)}")
+        elif operator == "A7":
+            context_parts.append("\n### Aggregatable Columns for HAVING Comparison")
+            for table in intent.tables or []:
+                if table in column_metadata:
+                    numeric = [
+                        c
+                        for c, info in column_metadata[table].items()
+                        if info.get("data_type")
+                        in (
+                            "integer",
+                            "decimal",
+                            "float",
+                            "numeric",
+                            "double",
+                            "bigint",
+                            "smallint",
+                            "real",
+                        )
+                    ]
+                    if numeric:
+                        context_parts.append(f"  {table}: {', '.join(numeric)}")
+    elif operator.startswith("B"):
+        context_parts.append(_build_b_series_context(schema, intent.tables or [], fk_map))
+    return "\n".join(context_parts)
+def _build_b_series_context(schema: SchemaGraph, current_tables: list[str], fk_map: dict = None) -> str:
+    """Build additional context for B-series join operators.
+    Lists available tables grouped by role and describes the FK connections between current intent tables and candidate tables.
+    Args:
+        schema: The schema graph.
+        current_tables: List of table names currently in the intent.
+        fk_map: Pre-built FK map; omitted FK section when None.
+    Returns:
+        Formatted string describing schema relationships for B-series LLM prompts.
+    """
+    lines = ["\n### Schema Relationships"]
+    lines.append("\nAvailable Tables (by role):")
+    for role in [TableRole.FACT, TableRole.DIMENSION, TableRole.BRIDGE]:
+        tables_with_role = [t for t in schema.tables if schema.tables[t].role == role.value and t not in current_tables]
+        if tables_with_role:
+            lines.append(f"  {role.value}: {', '.join(tables_with_role[:10])}")
+    if fk_map:
+        lines.append("\nForeign Key Connections:")
+        for table in current_tables:
+            outgoing = fk_map.get(table, [])
+            if outgoing:
+                targets = [fk.get("target_table", "") for fk in outgoing]
+                lines.append(f"  {table} -> {', '.join(targets)}")
+        for other_table, fks in fk_map.items():
+            if other_table in current_tables:
+                continue
+            for fk in fks:
+                if fk.get("target_table") in current_tables:
+                    lines.append(f"  {other_table} -> {fk.get('target_table')}")
+                    break
+    return "\n".join(lines)
+def llm_expand_operator(
+    intent: SimulatorIntent,
+    operator: str,
+    schema: SchemaGraph,
+    column_metadata: dict[str, dict[str, dict[str, Any]]],
+    fk_map: dict = None,
+) -> list[dict[str, Any]]:
+    """Call the LLM and return a list of valid expansion suggestions for
+    an operator.
+    Builds the operator-specific prompt, calls ``llm_json``, and normalises the response to a plain list. Returns an empty list if the operator is unknown, the LLM returns an unexpected structure, or the call raises an exception.
+    Args:
+        intent: The SimulatorIntent being expanded.
+        operator: Operator code matching a key in ``_EXPANSION_PROMPTS``.
+        schema: The schema graph for context building.
+        column_metadata: Pre-built column metadata dict.
+        fk_map: Pre-built FK map; optional.
+    Returns:
+        List of expansion dicts as returned by the LLM (structure varies by operator).
+    """
+    if operator not in _EXPANSION_PROMPTS:
+        debug(f"[expansion_rules.llm_expand_operator] unknown operator: {operator}")
+        return []
+    context = _build_expansion_context(intent, operator, schema, column_metadata, fk_map)
+    prompt_template = _EXPANSION_PROMPTS[operator]
+    prompt = prompt_template.format(max_variants=SimulatorConfig.MAX_EXPANSION_VARIANTS, context=context)
+    debug(f"[expansion_rules.llm_expand_operator] calling LLM for operator={operator}")
+    try:
+        result = llm_json(_EXPANSION_SYSTEM, prompt)
+        if isinstance(result, dict):
+            expansions = result.get("expansions", result.get("results", []))
+        elif isinstance(result, list):
+            expansions = result
+        else:
+            expansions = []
+        debug(f"[expansion_rules.llm_expand_operator] LLM returned {len(expansions)} expansion(s)")
+        return expansions
+    except Exception as e:
+        debug(f"[expansion_rules.llm_expand_operator] LLM call failed: {e}")
+        return []