relationalai 0.13.4__py3-none-any.whl → 0.13.5__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- relationalai/clients/exec_txn_poller.py +51 -20
- relationalai/clients/local.py +15 -7
- relationalai/clients/resources/snowflake/__init__.py +2 -2
- relationalai/clients/resources/snowflake/direct_access_resources.py +8 -4
- relationalai/clients/resources/snowflake/snowflake.py +16 -11
- relationalai/experimental/solvers.py +8 -0
- relationalai/semantics/lqp/executor.py +3 -3
- relationalai/semantics/lqp/model2lqp.py +34 -28
- relationalai/semantics/lqp/passes.py +6 -3
- relationalai/semantics/lqp/result_helpers.py +76 -12
- relationalai/semantics/lqp/rewrite/__init__.py +2 -0
- relationalai/semantics/lqp/rewrite/extract_common.py +3 -1
- relationalai/semantics/lqp/rewrite/extract_keys.py +85 -20
- relationalai/semantics/lqp/rewrite/flatten_script.py +301 -0
- relationalai/semantics/lqp/rewrite/functional_dependencies.py +12 -7
- relationalai/semantics/lqp/rewrite/quantify_vars.py +12 -3
- relationalai/semantics/lqp/rewrite/unify_definitions.py +9 -3
- relationalai/semantics/metamodel/dependency.py +9 -0
- relationalai/semantics/metamodel/executor.py +17 -10
- relationalai/semantics/metamodel/rewrite/__init__.py +2 -1
- relationalai/semantics/metamodel/rewrite/flatten.py +1 -2
- relationalai/semantics/metamodel/rewrite/format_outputs.py +131 -46
- relationalai/semantics/metamodel/rewrite/handle_aggregations_and_ranks.py +237 -0
- relationalai/semantics/metamodel/typer/typer.py +1 -1
- relationalai/semantics/reasoners/optimization/solvers_pb.py +101 -107
- relationalai/semantics/rel/compiler.py +7 -3
- relationalai/semantics/rel/executor.py +1 -1
- relationalai/tools/txn_progress.py +188 -0
- {relationalai-0.13.4.dist-info → relationalai-0.13.5.dist-info}/METADATA +1 -1
- {relationalai-0.13.4.dist-info → relationalai-0.13.5.dist-info}/RECORD +33 -30
- {relationalai-0.13.4.dist-info → relationalai-0.13.5.dist-info}/WHEEL +0 -0
- {relationalai-0.13.4.dist-info → relationalai-0.13.5.dist-info}/entry_points.txt +0 -0
- {relationalai-0.13.4.dist-info → relationalai-0.13.5.dist-info}/licenses/LICENSE +0 -0
relationalai/semantics/metamodel/executor.py

@@ -21,30 +21,37 @@ class Executor():
     def execute(self, model: Model, task:Task, format:Literal["pandas", "snowpark"]="pandas") -> Union[DataFrame, Any]:
         raise NotImplementedError(f"execute: {self}")
 
-    def _compute_cols(self, task: ir.Task, final_model: ir.Model|None) -> Tuple[list[str], list[str]]:
-        cols = []
-        extra_cols = []
+    def _compute_cols(self, task: ir.Task, final_model: ir.Model|None) -> Tuple[list[str], list[str], list[int]]:
+        cols = [] # all cols in output
+        extra_cols = [] # all key cols not in output
+        key_locs = [] # locations of output key cols in all output cols
+
         # we assume only queries have outputs
         original_outputs = collect_by_type(ir.Output, task) if task else None
         outputs = collect_by_type(ir.Output, final_model) if final_model else None
-
+
+        # there are some outputs, and some have keys
         if original_outputs and outputs and not all(not out.keys for out in outputs):
             assert len(original_outputs) == 1
             original_output = original_outputs[0]
             original_cols = []
             original_cols_val = []
-            for alias, val in original_output.aliases:
-                if not alias:
-                    continue
-                original_cols.append(alias)
-                original_cols_val.append(val)
 
             keys = outputs[0].keys
             assert keys
+
             for out in outputs:
                 assert out.keys is not None
                 assert set(out.keys) == set(keys), "outputs with different key sets in the same query"
 
+            for (idx, (alias, val)) in enumerate(original_output.aliases):
+                if not alias:
+                    continue
+                original_cols.append(alias)
+                original_cols_val.append(val)
+                if isinstance(val, ir.Var) and val in keys:
+                    key_locs.append(idx)
+
             extra_cols = []
             name_cache = NameCache(start_from_one=True)
             for key in keys:

@@ -54,7 +61,7 @@ class Executor():
         elif outputs:
             cols = [alias for alias, _ in outputs[-1].aliases if alias]
 
-        return cols, extra_cols
+        return cols, extra_cols, key_locs
 
     def _postprocess_df(self, config: Config, df: DataFrame, extra_cols: list[str]) -> DataFrame:
         if bool(config.get("compiler.debug_hidden_keys", False)):
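To make the new third return value concrete, here is a minimal standalone sketch of the key_locs bookkeeping added to _compute_cols above; plain strings stand in for ir.Var values, and the sample column names are invented:

# Hypothetical sketch: positions of key columns among the output columns.
keys = {"id", "year"}                                   # output key vars
aliases = [("id", "id"), ("total", "sum_v"), ("year", "year")]

cols: list[str] = []
key_locs: list[int] = []
for idx, (alias, val) in enumerate(aliases):
    if not alias:
        continue
    cols.append(alias)
    if val in keys:            # val is a key var that also appears in the output
        key_locs.append(idx)

print(cols)      # ['id', 'total', 'year']
print(key_locs)  # [0, 2] -- positions of the key columns among all output columns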
relationalai/semantics/metamodel/rewrite/__init__.py

@@ -3,5 +3,6 @@ from .dnf_union_splitter import DNFUnionSplitter
 from .extract_nested_logicals import ExtractNestedLogicals
 from .flatten import Flatten
 from .format_outputs import FormatOutputs
+from .handle_aggregations_and_ranks import HandleAggregationsAndRanks
 
-__all__ = ["DischargeConstraints", "DNFUnionSplitter", "ExtractNestedLogicals", "Flatten", "FormatOutputs"]
+__all__ = ["DischargeConstraints", "DNFUnionSplitter", "ExtractNestedLogicals", "Flatten", "FormatOutputs", "HandleAggregationsAndRanks"]
relationalai/semantics/metamodel/rewrite/format_outputs.py

@@ -8,21 +8,22 @@ from relationalai.semantics.metamodel.util import FrozenOrderedSet
 from relationalai.semantics.metamodel.typer.typer import is_primitive
 
 class FormatOutputs(Pass):
-    def __init__(self,
+    def __init__(self, use_rel: bool=False):
         super().__init__()
-        self.
+        self._use_rel = use_rel
 
     #--------------------------------------------------
     # Public API
     #--------------------------------------------------
     def rewrite(self, model: ir.Model, options:dict={}) -> ir.Model:
         wide_outputs = options.get("wide_outputs", False)
-        return self.OutputRewriter(wide_outputs).walk(model)
+        return self.OutputRewriter(wide_outputs, self._use_rel).walk(model)
 
     class OutputRewriter(visitor.Rewriter):
-        def __init__(self, wide_outputs: bool = False):
+        def __init__(self, wide_outputs: bool = False, use_rel: bool = False):
             super().__init__()
             self.wide_outputs = wide_outputs
+            self._use_rel = use_rel
 
         def handle_logical(self, node: ir.Logical, parent: ir.Node):
             # Rewrite children first
@@ -36,62 +37,146 @@ class FormatOutputs(Pass):
             if not groups["outputs"]:
                 return node
 
-
+            if self.wide_outputs:
+                return adjust_wide_outputs(node, groups["outputs"])
+
+            return adjust_gnf_outputs(self._use_rel, node, groups["outputs"])
 
 #--------------------------------------------------
 # GNF vs wide output support
 #--------------------------------------------------
-def adjust_outputs(task: ir.Logical, outputs: OrderedSet[ir.Task], wide_outputs: bool = False):
 
+# For wide outputs, only adjust the output task to include the keys.
+# output looks like: (key0, key1, val0, val1, ...)
+def adjust_wide_outputs(task: ir.Logical, outputs: OrderedSet[ir.Task]):
+    body = list(task.body)
+    for output in outputs:
+        assert(isinstance(output, ir.Output))
+        if output.keys:
+            body.remove(output)
+            body.append(rewrite_wide_output(output))
+    return ir.Logical(task.engine, task.hoisted, tuple(body), task.annotations)
+
+# For GNF outputs we need to generate a rule for each "column" in the output
+# and potentially one wide key column
+def adjust_gnf_outputs(use_rel: bool, task: ir.Logical, outputs: OrderedSet[ir.Task]):
     body = list(task.body)
+    for output in outputs:
+        assert(isinstance(output, ir.Output))
+        if output.keys:
+            # Remove the original output. This is replaced by per-column outputs below
+            body.remove(output)
+
+            is_export = helpers.is_export(output)
+
+            # Exports and Rel execution rely on all columns being in GNF format.
+            if is_export or use_rel:
+                _adjust_all_gnf_outputs(body, output, is_export)
+            else: # Otherwise, put all keys into one wide keys relation
+                _adjust_outputs_with_wide_keys(body, output)
+
+    return ir.Logical(task.engine, task.hoisted, tuple(body), task.annotations)
+
+# Generate an output for each "column"
+# output looks like: def output(:cols, :col000, key0, key1, value)
+def _adjust_all_gnf_outputs(body, output: ir.Output, is_export: bool):
+    assert output.keys
+
+    original_cols = OrderedSet()
+    for idx, alias in enumerate(output.aliases):
+        # Skip None values which are used as a placeholder for missing values
+        if alias[1] is None:
+            continue
+        original_cols.add(alias[1])
+        body.extend(_generate_output_column_gnf(output, idx, alias, is_export))
+
+    idx = len(output.aliases)
+    for key in output.keys:
+        if key not in original_cols:
+            body.extend(_generate_output_column_gnf(output, idx, (key.name, key), is_export))
+            idx += 1
+
+# Generate an output for each value "column" and one wide output for all the keys
+# * value output looks like: def output(:cols, :col000, key0, key1, value)
+# * key output looks like:
+#     def output(:keys, output_key_0, output_key_1, other_key_0, ...)
+#
+# Exceptions: keys for exports and compound keys are converted to GNF, same as the
+# value columns.
+def _adjust_outputs_with_wide_keys(body, output: ir.Output):
+    assert output.keys
+
+    original_cols = OrderedSet()
+    val_cols: list[Tuple[str, ir.Value] | None] = []
+    key_cols: OrderedSet[Tuple[str, ir.Value]] = OrderedSet()
+    key_cols.add(("keys", f.literal("keys", types.Symbol))) # name key col so we can identify it later
+    for alias in output.aliases:
+        # None values are used as a placeholder for missing values
+        # They are added to maintain the correct col count when enumerated below
+        if alias[1] is None:
+            val_cols.append(None)
+            continue
+
+        original_cols.add(alias[1])
+
+        if isinstance(alias[1], ir.Var) and alias[1] in output.keys: # note: skips compound keys
+            key_cols.add(alias)
+        else:
+            val_cols.append(alias)
+
+    # Add keys not in output to the end
+    for key in output.keys:
+        if key not in original_cols:
+            key_cols.add((key.name, key))
+
+    # Generate GNF val cols
+    for idx, alias in enumerate(val_cols):
+        if alias:
+            new_col = _generate_output_column(output, idx, alias, key_cols)
+            body.extend(new_col)
+
+    # Create a wide key column with all keys
+    if len(key_cols) > 1:
+        body.append(ir.Output(
+            output.engine,
+            key_cols.frozen(),
+            output.keys,
+            output.annotations
+        ))
+
+# Generate a relation representing a single col in GNF form
+def _generate_output_column(output: ir.Output, idx: int, alias: tuple[str, ir.Value], key_cols):
+    if not output.keys:
+        return [output]
 
-
-
-    for output in outputs:
-        assert(isinstance(output, ir.Output))
-        if output.keys:
-            body.remove(output)
-            body.append(rewrite_wide_output(output))
-    return ir.Logical(task.engine, task.hoisted, tuple(body), task.annotations)
+    aliases = [("cols", f.literal("cols", types.Symbol))]
+    aliases.append(("col", f.literal(f"col{idx:03}", types.Symbol)))
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            body.extend(_generate_output_column(output, idx, alias, is_export))
-
-    idx = len(output.aliases)
-    for key in output.keys:
-        if key not in original_cols:
-            body.extend(_generate_output_column(output, idx, (key.name, key), is_export))
-            idx += 1
-
-    return ir.Logical(task.engine, task.hoisted, tuple(body), task.annotations)
-
-# TODO: return non list?
-def _generate_output_column(output: ir.Output, idx: int, alias: tuple[str, ir.Value], is_export: bool):
+    # Append all keys at the start
+    keys = iter(key_cols)
+    assert next(keys) == ("keys", f.literal("keys", types.Symbol)) # skip col name
+    for key in keys:
+        aliases.append((f"key_{key[0]}_{idx}", key[1]))
+
+    aliases.append(alias) # append val
+
+    return [
+        ir.Output(
+            output.engine,
+            FrozenOrderedSet.from_iterable(aliases),
+            output.keys,
+            output.annotations
+        )
+    ]
+
+# Generate a relation representing a single col in GNF form for export
+def _generate_output_column_gnf(output: ir.Output, idx: int, alias: tuple[str, ir.Value], is_export: bool):
     if not output.keys:
         return [output]
 
     aliases = [("cols", f.literal("cols", types.Symbol))] if not is_export else []
     aliases.append(("col", f.literal(f"col{idx:03}", types.Symbol)))
 
-    # Append all keys at the start
     for k in output.keys:
         aliases.append((f"key_{k.name}_{idx}", k))
 
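A minimal sketch of the key/value split that _adjust_outputs_with_wide_keys performs above, using plain strings in place of ir.Value and invented sample aliases:

# Hypothetical sketch: split output aliases into key columns and value columns.
output_keys = ["k0", "k1"]
aliases = [("k0", "k0"), ("name", "v_name"), (None, None), ("age", "v_age")]

val_cols = []   # value columns, each emitted as its own GNF output
key_cols = []   # key columns, collected into a single wide :keys output
for alias in aliases:
    if alias[1] is None:          # placeholder for a missing value;
        val_cols.append(None)     # kept so enumerate() indices stay aligned
        continue
    if alias[1] in output_keys:   # a key var that also appears in the output
        key_cols.append(alias)
    else:
        val_cols.append(alias)

# keys that never appeared in the output go at the end
for key in output_keys:
    if key not in [a[1] for a in key_cols]:
        key_cols.append((key, key))

print(val_cols)  # [('name', 'v_name'), None, ('age', 'v_age')]
print(key_cols)  # [('k0', 'k0'), ('k1', 'k1')]

Keeping the None placeholders in val_cols preserves the column indices that drive the generated col{idx:03} names.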
relationalai/semantics/metamodel/rewrite/handle_aggregations_and_ranks.py

@@ -0,0 +1,237 @@
+from __future__ import annotations
+
+from relationalai.semantics.metamodel import ir, helpers
+from relationalai.semantics.metamodel.visitor import Rewriter
+from relationalai.semantics.metamodel.compiler import Pass, group_tasks
+from relationalai.semantics.metamodel.util import OrderedSet, ordered_set
+from relationalai.semantics.metamodel import dependency
+
+# This rewrite pass handles aggregations and ranks to ensure that their dependencies are
+# self-contained in the required format for emitting Rel/LQP. The expected format is that
+# aggregations and ranks are each contained in their own Logical (which hoists the output
+# vars), with all dependencies pulled into the same Logical.
+#
+# For example,
+#
+# Logical ^[v::Int128]
+#     ... <dependencies> ...
+#     sum([foo::Foo], [], [a::Int128, v::Int128])
+#
+# Firstly, the pass ensures that all dependencies
+# required by an aggregation/rank are contained locally in the same Logical as the
+# aggregation/rank. For example, in the following Logical:
+#
+# Logical ^[v::Int128]
+#     Logical ^[a=None, foo=None]
+#         Foo(foo::Foo)
+#         a(foo::Foo, a::Int128)
+#     sum([foo::Foo], [], [a::Int128, v::Int128])
+#     common(foo::Foo, a::Int128)
+#
+# The aggregation `sum` depends on the relation `common`. So, the lookup for `common` needs
+# to be pulled into the same Logical as the aggregation.
+#
+# Logical ^[v::Int128]
+#     Logical
+#         Logical ^[a=None, foo=None]
+#             Foo(foo::Foo)
+#             a(foo::Foo, a::Int128)
+#         common(foo::Foo, a::Int128)
+#     sum([foo::Foo], [], [a::Int128, v::Int128])
+#     common(foo::Foo, a::Int128)
+#
+# Secondly, the pass separates Logicals containing more than one aggregation/rank into
+# separate Logicals, each containing a single aggregation/rank.
+#
+# Thirdly, the pass renames variables introduced inside aggregation/rank bodies to ensure
+# they do not clash with variables outside.
+
+class HandleAggregationsAndRanks(Pass):
+    def __init__(self):
+        super().__init__()
+
+    #--------------------------------------------------
+    # Public API
+    #--------------------------------------------------
+    def rewrite(self, model: ir.Model, options:dict={}) -> ir.Model:
+        dep_info = dependency.analyze(model.root)
+
+        r = AggregationsRanksRewriter(dep_info)
+        result = r.walk(model)
+
+        rn = AggregationsRanksVarRenameRewriter()
+        result = rn.walk(result)
+
+        return result
+
+# The AggregationsRanksRewriter ensures that each aggregation and rank is contained
+# in its own Logical, with all dependencies pulled into the same Logical.
+class AggregationsRanksRewriter(Rewriter):
+    def __init__(self, dep_info):
+        super().__init__()
+        self.info = dep_info
+        self.rewritten: dict[int, ir.Node] = {}
+
+    def handle_logical(self, node: ir.Logical, parent: ir.Node):
+        groups = group_tasks(node.body, {
+            "aggregates_and_ranks": (ir.Aggregate, ir.Rank),
+        })
+
+        aggregates_and_ranks = groups["aggregates_and_ranks"]
+
+        # If there are no aggregates or ranks, then just recurse into the Logical body
+        if not aggregates_and_ranks:
+            return super().handle_logical(node, parent)
+
+        agg_rank_logicals = []
+        for agg_rank in aggregates_and_ranks:
+            # Gather all dependencies of the logical containing the aggregate/rank
+            agg_deps = self.info.task_dependencies(agg_rank)
+
+            # Reconstruct the body of the Logical containing the aggregate/rank, starting
+            # with the existing body
+            body = ordered_set()
+
+            # agg_body is the inner body containing the dependencies of the aggregate/rank
+            agg_body = ordered_set()
+            for t in node.body:
+                if isinstance(t, (ir.Output, ir.Update)):
+                    # Outputs and Updates need to be kept in the outer body, rather than
+                    # nested inside the aggregate/rank body.
+                    body.add(t)
+                elif t not in aggregates_and_ranks:
+                    agg_body.add(t)
+
+            # Add all other dependencies
+            for dep in agg_deps:
+                # HACK: there are bugs in the dependency analysis that can cause cycles.
+                # Avoid these cycles because otherwise they can cause infinite recursion.
+                if agg_rank in self.info.task_dependencies(dep) or node in self.info.task_dependencies(dep):
+                    continue
+                agg_body.add(dep)
+
+            body.add(ir.Logical(node.engine, tuple(), tuple(agg_body)))
+
+            # Add the actual aggregate/rank
+            body.add(agg_rank.clone())
+
+            # Construct the final Logical holding the aggregate/rank contents.
+            # Output variables need to be hoisted
+            if isinstance(agg_rank, ir.Aggregate):
+                output_vars = [v for v in helpers.vars(agg_rank.args) if not helpers.is_aggregate_input(v, agg_rank)]
+            else:
+                assert isinstance(agg_rank, ir.Rank)
+                output_vars = [a for a in agg_rank.args]
+                output_vars.append(agg_rank.result)
+
+            agg_logical = ir.Logical(
+                engine=node.engine,
+                hoisted=tuple(output_vars),
+                body=tuple(body)
+            )
+            agg_rank_logicals.append(agg_logical)
+
+        if len(agg_rank_logicals) == 1:
+            # If there's only one, no need to create a parent Logical
+            result = agg_rank_logicals[0]
+        else:
+            # Otherwise, create a parent Logical, ensuring all body vars are hoisted
+            hoisted = OrderedSet()
+            for agg_rank_logical in agg_rank_logicals:
+                hoisted.update(agg_rank_logical.hoisted)
+
+            result = ir.Logical(
+                engine=node.engine,
+                hoisted=tuple(hoisted),
+                body=tuple(agg_rank_logicals)
+            )
+
+        # Rewrite the children
+        result = super().handle_logical(result, parent)
+
+        # Make a deep copy so that each task has a unique id. This is important for later
+        # rewrite passes (namely QuantifyVars) which identify tasks by id.
+        result = DeepCopyRewriter().walk(result)
+        return result
+
+# The AggregationsRanksVarRenameRewriter renames variables inside aggregation/rank
+# bodies to ensure they do not clash with variables outside. It is careful to keep certain
+# variables unrenamed because they need to interact with the outside, namely the group-by
+# variables and output variables.
+class AggregationsRanksVarRenameRewriter(Rewriter):
+    class RenameRewriter(Rewriter):
+        def __init__(self, to_keep: set[ir.Var], suffix: str):
+            super().__init__()
+            self.to_keep = to_keep
+            self.suffix = suffix
+            self.renamed_vars: dict[ir.Var, ir.Var] = {}
+
+        def handle_default(self, node: ir.Default, parent):
+            if node.var in self.to_keep:
+                return node
+
+            return ir.Default(
+                var=self.handle_var(node.var, node),
+                value=node.value
+            )
+
+        def handle_var(self, node: ir.Var, parent):
+            if node in self.to_keep:
+                return node
+
+            if node in self.renamed_vars:
+                return self.renamed_vars[node]
+
+            # Rename var
+            result = ir.Var(
+                type=node.type,
+                name=f"{node.name}_{self.suffix}"
+            )
+            self.renamed_vars[node] = result
+            return result
+
+    def __init__(self):
+        super().__init__()
+        self.renamed_vars: dict[ir.Var, ir.Var] = {}
+
+    def handle_logical(self, node: ir.Logical, parent: ir.Node):
+        groups = group_tasks(node.body, {
+            "aggregates_and_ranks": (ir.Aggregate, ir.Rank),
+        })
+
+        aggregates_and_ranks = groups["aggregates_and_ranks"]
+        if not aggregates_and_ranks:
+            return super().handle_logical(node, parent)
+
+        # There should only be one, because the AggregationsRanksRewriter should have
+        # separated them out.
+        assert len(aggregates_and_ranks) == 1, "Multiple aggregate/ranks still found after rewriting dependencies"
+
+        # Rename at this level
+        agg_rank = aggregates_and_ranks[0]
+        if isinstance(agg_rank, ir.Aggregate):
+            vars_to_keep = set(agg_rank.group)
+            output_vars = [v for v in helpers.vars(agg_rank.args) if not helpers.is_aggregate_input(v, agg_rank)]
+            vars_to_keep.update(output_vars)
+
+            result = self.RenameRewriter(vars_to_keep, 'agg').walk(node)
+        else:
+            assert isinstance(agg_rank, ir.Rank)
+            vars_to_keep = set(agg_rank.group)
+            output_vars = helpers.vars(agg_rank.args) + [agg_rank.result]
+            vars_to_keep.update(output_vars)
+            vars_to_keep.update(agg_rank.projection)
+
+            result = self.RenameRewriter(vars_to_keep, 'rank').walk(node)
+
+        # Process children
+        result = super().handle_logical(result, parent)
+        return result
+
+class DeepCopyRewriter(Rewriter):
+    def walk(self, node, parent=None):
+        new_node = super().walk(node, parent)
+        if isinstance(new_node, ir.Task):
+            return new_node.clone()
+        else:
+            return new_node
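A minimal sketch of the suffix renaming that RenameRewriter applies above, over plain variable names instead of ir.Var nodes (the sample names are invented):

# Hypothetical sketch: rename vars inside an aggregation body, keeping
# group-by and output vars untouched so they still bind to the outside.
to_keep = {"foo", "v"}
suffix = "agg"
renamed: dict[str, str] = {}

def rename(name: str) -> str:
    if name in to_keep:
        return name
    if name not in renamed:       # rename each var once, then reuse the mapping
        renamed[name] = f"{name}_{suffix}"
    return renamed[name]

print([rename(n) for n in ["foo", "a", "v", "a"]])  # ['foo', 'a_agg', 'v', 'a_agg']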
relationalai/semantics/metamodel/typer/typer.py

@@ -1092,7 +1092,7 @@ class Replacer(visitor.Rewriter):
 
     def handle_var(self, node: ir.Var, parent: ir.Node):
         if node.id in self.net.resolved_types:
-            return
+            return node.reconstruct(name=node.name, type=self.net.resolved_types[node.id])
         return node
 
     def handle_literal(self, node: ir.Literal, parent: ir.Node):