PyPI - aetherdialect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

aetherdialect-0.1.0.dist-info/METADATA +197 -0
aetherdialect-0.1.0.dist-info/RECORD +34 -0
aetherdialect-0.1.0.dist-info/WHEEL +5 -0
aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
text2sql/__init__.py +7 -0
text2sql/config.py +1063 -0
text2sql/contracts_base.py +952 -0
text2sql/contracts_core.py +1890 -0
text2sql/core_utils.py +834 -0
text2sql/dialect.py +1134 -0
text2sql/expansion_ops.py +1218 -0
text2sql/expansion_rules.py +496 -0
text2sql/intent_expr.py +1759 -0
text2sql/intent_process.py +2133 -0
text2sql/intent_repair.py +1733 -0
text2sql/intent_resolve.py +1292 -0
text2sql/live_testing.py +1117 -0
text2sql/main_execution.py +799 -0
text2sql/pipeline.py +1662 -0
text2sql/qsim_ops.py +1286 -0
text2sql/qsim_sample.py +609 -0
text2sql/qsim_struct.py +569 -0
text2sql/schema.py +973 -0
text2sql/schema_profiling.py +2075 -0
text2sql/simulator.py +970 -0
text2sql/sql_gen.py +1537 -0
text2sql/templates.py +1037 -0
text2sql/text2sql.py +726 -0
text2sql/utils.py +973 -0
text2sql/validation_agg.py +1033 -0
text2sql/validation_execute.py +1092 -0
text2sql/validation_schema.py +1847 -0
text2sql/validation_semantic.py +2122 -0

text2sql/qsim_struct.py ADDED Viewed

@@ -0,0 +1,569 @@
+"""Structural utilities for question-generation simulator intent enumeration.
+Enumerates FK-connected table sets, generates and caches structural query skeletons, provides column capability helpers (filterable, groupable, aggregatable, comparable pairs), and builds schema context strings for LLM prompts.
+"""
+from __future__ import annotations
+import json
+import os
+from dataclasses import asdict, replace
+from itertools import combinations
+from typing import Any
+from .config import PolicyConfig, QSimConfig, SimulatorConfig
+from .contracts_base import ColumnRole, QSimSkeleton, SchemaGraph, SkeletonLimits
+from .contracts_core import QSimFilter
+from .core_utils import debug, intent_id
+# Module-level memo of generated skeletons, keyed by the frozenset of table names in a table set.
+_SKELETON_CACHE: dict[frozenset[str], list[QSimSkeleton]] = {}
+def build_fk_adjacency(schema: SchemaGraph) -> dict[str, set[str]]:
+    """Build an undirected FK adjacency map of tables in the schema.
+    Every table in ``schema.tables`` gets an entry, even when it takes part in no foreign key. A foreign key whose source or destination table is not a key of ``schema.tables`` is skipped, so the map only ever mentions schema tables and a dangling reference cannot raise ``KeyError``.
+    Args:
+        schema: Schema graph whose foreign-key definitions are traversed.
+    Returns:
+        Dict mapping each table name to the set of table names it shares a foreign-key relationship with (bidirectional edges).
+    """
+    adj: dict[str, set[str]] = {name: set() for name in schema.tables}
+    for table in schema.tables.values():
+        for fk in table.foreign_keys:
+            src, dst = fk.src_table, fk.dst_table
+            if src not in adj or dst not in adj:
+                # Dangling FK: one endpoint is not part of this schema, so there is no edge to record.
+                continue
+            adj[src].add(dst)
+            adj[dst].add(src)
+    return adj
+def is_connected(tables: list[str], adj: dict[str, set[str]]) -> bool:
+    """Check that the given tables form a single FK-connected component.
+    Walks the sub-graph induced by *tables*, starting from the first entry and following only edges whose other end is also in *tables*.
+    Args:
+        tables: List of table names to test for connectivity.
+        adj: Undirected FK adjacency map as returned by ``build_fk_adjacency``; tables missing from the map are treated as having no neighbours.
+    Returns:
+        ``True`` if every table in *tables* is reached from the first one; ``True`` unconditionally when *tables* contains zero or one entry.
+    """
+    if len(tables) < 2:
+        return True
+    wanted = set(tables)
+    seen: set[str] = set()
+    pending = [tables[0]]
+    while pending:
+        node = pending.pop()
+        if node in seen:
+            continue
+        seen.add(node)
+        # Only follow edges that stay inside the requested table set.
+        pending.extend(nbr for nbr in adj.get(node, set()) if nbr in wanted and nbr not in seen)
+    return seen == wanted
+def enumerate_table_sets(schema: SchemaGraph, max_tables: int | None = None) -> list[list[str]]:
+    """Enumerate all valid FK-connected table combinations up to *max_tables* in size.
+    Every single-table set is always included. Multi-table combinations are drawn from the alphabetically sorted table names and kept only when they form a connected sub-graph in the FK adjacency graph.
+    Args:
+        schema: Schema graph to derive tables and FK adjacency from.
+        max_tables: Maximum number of tables per combination; ``None`` (the default) means ``QSimConfig.MAX_TABLES_PER_INTENT``.
+    Returns:
+        List of table-name lists representing each valid table set: single tables first, then combinations by increasing size.
+    """
+    if max_tables is None:
+        max_tables = QSimConfig.MAX_TABLES_PER_INTENT
+    adj = build_fk_adjacency(schema)
+    table_names = sorted(schema.tables.keys())
+    valid_sets: list[list[str]] = [[name] for name in table_names]
+    # Sizes larger than the table count yield no combinations, so cap the range instead of looping over them.
+    largest_size = min(max_tables, len(table_names))
+    for size in range(2, largest_size + 1):
+        for combo in combinations(table_names, size):
+            combo_list = list(combo)
+            if is_connected(combo_list, adj):
+                valid_sets.append(combo_list)
+    debug(f"[qsim_struct.enumerate_table_sets] found {len(valid_sets)} valid table combinations")
+    return valid_sets
+def _is_excluded_filter_column(col_name: str) -> bool:
+    """Return whether a column name matches any excluded filter pattern.
+    Both the column name and each pattern are lower-cased before the substring test, so the match is case-insensitive regardless of how the patterns are written in the config.
+    Args:
+        col_name: Column name string to test.
+    Returns:
+        ``True`` if any pattern from ``QSimConfig.EXCLUDED_FILTER_PATTERNS`` is found as a substring of *col_name* (case-insensitive).
+    """
+    lowered = col_name.lower()
+    return any(pattern.lower() in lowered for pattern in QSimConfig.EXCLUDED_FILTER_PATTERNS)
+def get_filterable_columns(table: str, schema: SchemaGraph, column_roles: dict[str, str]) -> list[tuple[str, str]]:
+    """List the columns of *table* that may be used in filters.
+    A column qualifies when its metadata marks it filterable and its name does not match an excluded filter pattern. The role comes from *column_roles* when the ``table.column`` key is present, otherwise from the column metadata, otherwise ``"unknown"``.
+    Args:
+        table: Table name to inspect.
+        schema: Schema graph containing column metadata.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        List of ``(column_key, role)`` tuples in column order; empty when *table* is not in the schema.
+    """
+    table_ir = schema.tables.get(table)
+    if not table_ir:
+        return []
+    pairs = []
+    for name, meta in table_ir.columns.items():
+        if meta.is_filterable and not _is_excluded_filter_column(name):
+            key = f"{table}.{name}"
+            pairs.append((key, column_roles.get(key, meta.role or "unknown")))
+    return pairs
+def get_aggregatable_columns(table: str, schema: SchemaGraph, column_roles: dict[str, str]) -> list[str]:
+    """List the columns of *table* that can be aggregated with SUM/AVG/MIN/MAX.
+    A column qualifies only when its effective role equals ``ColumnRole.NUMERIC_MEASURE.value``. The effective role is taken from *column_roles* when the ``table.column`` key is present, otherwise from the column metadata, otherwise ``"unknown"``.
+    Args:
+        table: Table name to inspect.
+        schema: Schema graph containing column metadata.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        List of ``table.column`` key strings in column order; empty when *table* is not in the schema.
+    """
+    table_ir = schema.tables.get(table)
+    if not table_ir:
+        return []
+    measure_role = ColumnRole.NUMERIC_MEASURE.value
+    return [
+        f"{table}.{name}"
+        for name, meta in table_ir.columns.items()
+        if column_roles.get(f"{table}.{name}", meta.role or "unknown") == measure_role
+    ]
+def get_groupable_columns(table: str, schema: SchemaGraph, column_roles: dict[str, str]) -> list[str]:
+    """List the columns of *table* that can appear in a GROUP BY clause.
+    A column qualifies when its effective role is ``CATEGORICAL``, ``TEMPORAL``, or ``NUMERIC_CATEGORICAL``. The effective role is taken from *column_roles* when the ``table.column`` key is present, otherwise from the column metadata, otherwise ``"unknown"``.
+    Args:
+        table: Table name to inspect.
+        schema: Schema graph containing column metadata.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        List of ``table.column`` key strings in column order; empty when *table* is not in the schema.
+    """
+    table_ir = schema.tables.get(table)
+    if not table_ir:
+        return []
+    groupable_roles = (
+        ColumnRole.CATEGORICAL.value,
+        ColumnRole.TEMPORAL.value,
+        ColumnRole.NUMERIC_CATEGORICAL.value,
+    )
+    keys = []
+    for name, meta in table_ir.columns.items():
+        key = f"{table}.{name}"
+        if column_roles.get(key, meta.role or "unknown") in groupable_roles:
+            keys.append(key)
+    return keys
+def get_comparable_column_pairs(
+    table_set: list[str], schema: SchemaGraph, column_roles: dict[str, str]
+) -> list[tuple[str, str, str, str, str]]:
+    """Find cross-table column pairs that can be compared with each other.
+    Numeric columns (``NUMERIC_MEASURE`` or ``NUMERIC_CATEGORICAL``) pair up only with a column of the identical role in a different table; temporal columns pair up with any temporal column in a different table. Tables missing from the schema are ignored.
+    Args:
+        table_set: List of table names to consider.
+        schema: Schema graph containing column metadata.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        List of 5-tuples ``(table1, col1, table2, col2, role)``: all numeric pairs first, then all temporal pairs, each in discovery order.
+    """
+    numeric_roles = {
+        ColumnRole.NUMERIC_MEASURE.value,
+        ColumnRole.NUMERIC_CATEGORICAL.value,
+    }
+    temporal_role = ColumnRole.TEMPORAL.value
+    numeric_cols = []
+    temporal_cols = []
+    for table in table_set:
+        table_ir = schema.tables.get(table)
+        if not table_ir:
+            continue
+        for col_name, col_meta in table_ir.columns.items():
+            role = column_roles.get(f"{table}.{col_name}", col_meta.role or "unknown")
+            if role in numeric_roles:
+                numeric_cols.append((table, col_name, role))
+            elif role == temporal_role:
+                temporal_cols.append((table, col_name, role))
+    # combinations() yields (earlier, later) pairs in the same order as a nested "i < j" loop.
+    pairs = [
+        (t1, c1, t2, c2, r1)
+        for (t1, c1, r1), (t2, c2, r2) in combinations(numeric_cols, 2)
+        if t1 != t2 and r1 == r2
+    ]
+    pairs.extend(
+        (t1, c1, t2, c2, r1)
+        for (t1, c1, r1), (t2, c2, _r2) in combinations(temporal_cols, 2)
+        if t1 != t2
+    )
+    return pairs
+def compute_skeleton_limits(tables: list[str], schema: SchemaGraph, column_roles: dict[str, str]) -> SkeletonLimits:
+    """Derive the enumeration limits for a table set from its column capabilities.
+    ``max_filters`` is twice the number of distinct filterable columns (itself capped by ``QSimConfig.MAX_FILTER_COLUMNS``), capped by ``QSimConfig.MAX_FILTERS_PER_INTENT``. ``max_groupby`` is the groupable-column count capped by ``QSimConfig.MAX_GROUP_BY_COLUMNS``. ``max_having`` is one more than the aggregatable-column count, capped by ``SimulatorConfig.MAX_HAVING_CONDITIONS``.
+    Args:
+        tables: List of table names in the intent.
+        schema: Schema graph for column metadata.
+        column_roles: Map of table.column key to role string.
+    Returns:
+        ``SkeletonLimits`` with derived values.
+    """
+    filterable_keys = set()
+    groupable_count = 0
+    aggregatable_count = 0
+    for table in tables:
+        filterable_keys.update(key for key, _role in get_filterable_columns(table, schema, column_roles))
+        groupable_count += len(get_groupable_columns(table, schema, column_roles))
+        aggregatable_count += len(get_aggregatable_columns(table, schema, column_roles))
+    filter_column_cap = min(QSimConfig.MAX_FILTER_COLUMNS, len(filterable_keys))
+    return SkeletonLimits(
+        max_filters=min(QSimConfig.MAX_FILTERS_PER_INTENT, filter_column_cap * 2),
+        max_groupby=min(groupable_count, QSimConfig.MAX_GROUP_BY_COLUMNS),
+        max_having=min(SimulatorConfig.MAX_HAVING_CONDITIONS, 1 + aggregatable_count),
+    )
+def _structural_sort_key(entry: Any, primary_field: str) -> tuple[str, str]:
+    """Return a total-order sort key for one filter or HAVING entry.
+    Args:
+        entry: The entry to rank; normally a dict, but any value is accepted.
+        primary_field: Dict key whose stringified value is compared first (empty string for non-dict entries or a missing key).
+    Returns:
+        ``(primary, tie_break)`` where *tie_break* is a canonical JSON rendering of the whole entry, so entries that share the primary field still sort the same way whatever their input order.
+    """
+    primary = str(entry.get(primary_field, "")) if isinstance(entry, dict) else ""
+    try:
+        tie_break = json.dumps(entry, sort_keys=True, default=str)
+    except (TypeError, ValueError):
+        # Mixed-type dict keys or circular references cannot be rendered canonically; repr keeps the key usable.
+        tie_break = repr(entry)
+    return primary, tie_break
+def compute_intent_id(intent_dict: dict[str, Any]) -> str:
+    """Compute a hash-based intent ID from the structural fields of an intent dict.
+    Tables, select columns, group-by columns, filters, and HAVING conditions are all sorted before hashing to ensure canonical equality regardless of ordering. Filters are ordered by ``column`` and HAVING conditions by ``expression``; entries that tie on that field are further ordered by their canonical JSON form, so two filters on the same column no longer make the ID depend on input order. Inputs whose entries have distinct primary fields sort exactly as before.
+    Args:
+        intent_dict: Dict with keys ``"tables"``, ``"grain"``, ``"select_cols"``, ``"group_by_cols"``, ``"filters_param"``, and ``"having_param"``. Missing keys default to an empty list (``"row_level"`` for ``"grain"``).
+    Returns:
+        The value of ``intent_id`` applied to the canonicalised structural dict.
+    """
+    structural = {
+        "tables": sorted(intent_dict.get("tables", [])),
+        "grain": intent_dict.get("grain", "row_level"),
+        "select_cols": sorted(intent_dict.get("select_cols", [])),
+        "group_by_cols": sorted(intent_dict.get("group_by_cols", [])),
+        "filters_param": sorted(
+            intent_dict.get("filters_param", []),
+            key=lambda entry: _structural_sort_key(entry, "column"),
+        ),
+        "having_param": sorted(
+            intent_dict.get("having_param", []),
+            key=lambda entry: _structural_sort_key(entry, "expression"),
+        ),
+    }
+    return intent_id(structural)
+def generate_all_skeletons(tables: list[str], schema: SchemaGraph, column_roles: dict[str, str]) -> list[QSimSkeleton]:
+    """Generate all valid structural ``QSimSkeleton`` instances for a table set.
+    Results are memoised in the module-level ``_SKELETON_CACHE`` under ``frozenset(tables)``, so a repeated call for the same set of table names returns the cached list without consulting *schema* or *column_roles* again. The returned list is the cached object itself; treat it as read-only. Each skeleton stores its own copy of *tables*, so mutating the caller's list afterwards cannot alter cached skeletons.
+    Enumeration rules: aggregated skeletons use 1..max_groupby group-by columns and non-aggregated ones use none; HAVING is offered only for aggregated skeletons with at least one group-by column; DISTINCT is offered only for non-aggregated single-table skeletons; expression comparison is offered only when the table set has comparable column pairs and at least one filter.
+    Args:
+        tables: Ordered list of table names defining the skeleton's table set.
+        schema: Schema graph used to determine available filterable, groupable, and aggregatable columns.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        List of all ``QSimSkeleton`` combinations valid for the given table set and schema capabilities.
+    """
+    table_key = frozenset(tables)
+    cached = _SKELETON_CACHE.get(table_key)
+    if cached is not None:
+        debug(f"[qsim_struct.generate_all_skeletons] cache_hit: {len(cached)} skeletons")
+        return cached
+    limits = compute_skeleton_limits(tables, schema, column_roles)
+    max_filters = limits.max_filters
+    max_groupby = limits.max_groupby
+    max_having = limits.max_having
+    is_single_table = len(tables) == 1
+    has_comparable_pairs = bool(get_comparable_column_pairs(tables, schema, column_roles))
+    skeletons: list[QSimSkeleton] = []
+    for has_agg in (True, False):
+        # Options that depend only on the aggregation flag are computed once per flag value.
+        groupby_options = range(1, max_groupby + 1) if has_agg else (0,)
+        distinct_options = (True, False) if not has_agg and is_single_table else (False,)
+        for num_filters in range(max_filters + 1):
+            expr_cmp_options = (True, False) if has_comparable_pairs and num_filters > 0 else (False,)
+            for num_groupby in groupby_options:
+                having_options = (True, False) if has_agg and num_groupby > 0 else (False,)
+                for has_orderby in (True, False):
+                    for has_having in having_options:
+                        for has_distinct in distinct_options:
+                            for has_expr_cmp in expr_cmp_options:
+                                skeletons.append(
+                                    QSimSkeleton(
+                                        # Copy so cached skeletons never alias the caller's mutable list.
+                                        tables=list(tables),
+                                        has_aggregation=has_agg,
+                                        num_filters=num_filters,
+                                        num_groupby=num_groupby,
+                                        has_orderby=has_orderby,
+                                        has_having=has_having,
+                                        has_distinct=has_distinct,
+                                        has_expr_comparison=has_expr_cmp,
+                                    )
+                                )
+    _SKELETON_CACHE[table_key] = skeletons
+    debug(
+        f"[qsim_struct.generate_all_skeletons] created {len(skeletons)} skeletons for tables={tables}, max_filters={max_filters}, max_groupby={max_groupby}, max_having={max_having}"
+    )
+    return skeletons
+def load_or_create_skeletons(
+    schema: SchemaGraph, column_roles: dict[str, str]
+) -> dict[frozenset[str], list[QSimSkeleton]]:
+    """Load the skeleton cache from disk or generate and persist it.
+    The file at ``QSimConfig.SKELETONS_JSON_PATH`` is used when it exists, ``PolicyConfig.REGENERATE_SKELETON_CACHE`` is false, and its stored ``schema_hash`` equals ``schema.schema_hash``. In every other case (including a file that cannot be read or parsed) skeletons are regenerated for all enumerated table sets and written back to that path.
+    The module-level cache is replaced as a whole: a disk load is parsed into a scratch dict before being installed, and regeneration starts from an empty cache. This keeps a malformed file from leaving partial entries behind and keeps skeletons built for a different schema from being reused or saved under the current hash.
+    Args:
+        schema: Schema graph used for skeleton generation and hash comparison.
+        column_roles: Map of ``table.column`` key to role string.
+    Returns:
+        Module-level skeleton cache dict mapping frozen table sets to their ``QSimSkeleton`` lists.
+    Raises:
+        OSError: If the regenerated cache cannot be written to ``QSimConfig.SKELETONS_JSON_PATH``.
+    """
+    skeleton_path = QSimConfig.SKELETONS_JSON_PATH
+    if not PolicyConfig.REGENERATE_SKELETON_CACHE and os.path.exists(skeleton_path):
+        try:
+            with open(skeleton_path, encoding="utf-8") as f:
+                cache_data = json.load(f)
+            cached_hash = cache_data.get("schema_hash", "")
+            if cached_hash != schema.schema_hash:
+                debug(
+                    f"[qsim_struct.load_or_create_skeletons] schema_hash mismatch: {cached_hash} != {schema.schema_hash}, regenerating"
+                )
+            else:
+                # Parse everything first; only a fully valid file may touch the shared cache.
+                loaded: dict[frozenset[str], list[QSimSkeleton]] = {}
+                for table_key_str, skel_list in cache_data.get("skeletons", {}).items():
+                    # NOTE: keys are table names joined with "|", so a table name containing "|" would not round-trip.
+                    loaded[frozenset(table_key_str.split("|"))] = [
+                        QSimSkeleton(
+                            tables=s["tables"],
+                            has_aggregation=s["has_aggregation"],
+                            num_filters=s["num_filters"],
+                            num_groupby=s["num_groupby"],
+                            has_orderby=s["has_orderby"],
+                            has_having=s["has_having"],
+                            has_distinct=s.get("has_distinct", False),
+                            # Fall back to the "has_column_comparison" key when "has_expr_comparison" is absent.
+                            has_expr_comparison=s.get(
+                                "has_expr_comparison",
+                                s.get("has_column_comparison", False),
+                            ),
+                        )
+                        for s in skel_list
+                    ]
+                # Mutate in place (no rebinding) so existing references to the cache dict stay valid.
+                _SKELETON_CACHE.clear()
+                _SKELETON_CACHE.update(loaded)
+                debug(f"[qsim_struct.load_or_create_skeletons] loaded {len(_SKELETON_CACHE)} table sets from cache")
+                return _SKELETON_CACHE
+        except Exception as e:
+            # Deliberate best-effort: any unreadable or malformed cache file falls through to regeneration.
+            debug(f"[qsim_struct.load_or_create_skeletons] cache_load_failed: {e}")
+    debug("[qsim_struct.load_or_create_skeletons] generating new skeletons")
+    # Start empty so generate_all_skeletons cannot return entries that predate this schema.
+    _SKELETON_CACHE.clear()
+    table_sets = enumerate_table_sets(schema, QSimConfig.MAX_TABLES_PER_INTENT)
+    for table_set in table_sets:
+        generate_all_skeletons(table_set, schema, column_roles)
+    cache_data = {
+        "schema_hash": schema.schema_hash,
+        "num_table_sets": len(_SKELETON_CACHE),
+        "skeletons": {"|".join(sorted(k)): [asdict(s) for s in v] for k, v in _SKELETON_CACHE.items()},
+    }
+    debug(f"[qsim_struct.load_or_create_skeletons] saving {len(_SKELETON_CACHE)} table sets to cache")
+    with open(skeleton_path, "w", encoding="utf-8") as f:
+        json.dump(cache_data, f, indent=2)
+    return _SKELETON_CACHE
+def decompose_between_filter(f: QSimFilter) -> list[QSimFilter]:
+    """Split a ``between`` filter into a ``>=`` filter and a ``<=`` filter.
+    Each returned filter is a copy of *f* made with ``dataclasses.replace`` in which only ``op`` differs; every other field is carried over unchanged.
+    NOTE(review): because only ``op`` is replaced, both copies carry the same remaining fields as *f* — confirm that consumers select the lower/upper bound per operator.
+    Args:
+        f: The filter to decompose.
+    Returns:
+        A two-element list (``>=`` copy first, ``<=`` copy second) when ``f.op == "between"``; otherwise a single-element list with *f* unchanged.
+    """
+    if f.op == "between":
+        return [replace(f, op=bound_op) for bound_op in (">=", "<=")]
+    return [f]
+def build_schema_context(tables: list[str], schema: SchemaGraph) -> str:
+    """Render the given tables as a schema description for LLM prompts.
+    Each table becomes a ``TABLE name (description):`` heading followed by one indented line per column in the form ``column (data_type)`` plus optional ``[PK]``, ``[FK -> table.column]`` and ``[filterable]`` markers. A missing data type is shown as ``unknown``, a foreign key without a target as ``?``, and a missing table description as ``"<table> table"``. Tables not present in the schema are skipped.
+    Args:
+        tables: Ordered list of table names to include.
+        schema: Schema graph containing table and column metadata.
+    Returns:
+        A formatted multi-line string with one ``TABLE ...`` block per table, joined by blank lines; empty when no table is found.
+    """
+    blocks = []
+    for table in tables:
+        table_ir = schema.tables.get(table)
+        if not table_ir:
+            continue
+        column_lines = []
+        for col_name, col_meta in table_ir.columns.items():
+            pieces = [f"{col_name} ({col_meta.data_type or 'unknown'})"]
+            if col_meta.is_primary_key:
+                pieces.append(" [PK]")
+            if col_meta.is_foreign_key:
+                target = col_meta.fk_target
+                target_text = f"{target[0]}.{target[1]}" if target else "?"
+                pieces.append(f" [FK -> {target_text}]")
+            if col_meta.is_filterable:
+                pieces.append(" [filterable]")
+            column_lines.append("".join(pieces))
+        description = table_ir.description or f"{table} table"
+        heading = f"TABLE {table} ({description}):"
+        blocks.append(heading + "\n  " + "\n  ".join(column_lines))
+    return "\n\n".join(blocks)
+def validate_column_exists(col_ref: str, tables: list[str], schema: SchemaGraph) -> bool:
+    """Check that a ``table.column`` reference points at a real column of an allowed table.
+    The reference is split at its first dot: the part before it is the table name and everything after it is the column name.
+    Args:
+        col_ref: Fully-qualified column reference string (``"table.column"``).
+        tables: Allowed table names; the reference's table must be in this list.
+        schema: Schema graph for column existence checks.
+    Returns:
+        ``True`` if *col_ref* contains a dot, its table is in *tables* and in the schema, and the column exists on that table; ``False`` otherwise.
+    """
+    table, dot, col = col_ref.partition(".")
+    if not dot or table not in tables:
+        return False
+    table_ir = schema.tables.get(table)
+    if not table_ir:
+        return False
+    return col in table_ir.columns