pytrilogy 0.0.2.7__py3-none-any.whl → 0.0.2.9__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.

This version of pytrilogy might be problematic.

Files changed (34)
  1. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/METADATA +1 -1
  2. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/RECORD +34 -34
  3. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/WHEEL +1 -1
  4. trilogy/__init__.py +1 -1
  5. trilogy/constants.py +1 -0
  6. trilogy/core/enums.py +1 -0
  7. trilogy/core/models.py +154 -56
  8. trilogy/core/optimization.py +44 -5
  9. trilogy/core/optimizations/inline_datasource.py +14 -8
  10. trilogy/core/optimizations/predicate_pushdown.py +73 -44
  11. trilogy/core/processing/concept_strategies_v3.py +69 -28
  12. trilogy/core/processing/node_generators/common.py +42 -16
  13. trilogy/core/processing/node_generators/filter_node.py +89 -48
  14. trilogy/core/processing/node_generators/group_node.py +3 -1
  15. trilogy/core/processing/node_generators/rowset_node.py +13 -54
  16. trilogy/core/processing/node_generators/select_node.py +10 -13
  17. trilogy/core/processing/node_generators/unnest_node.py +5 -3
  18. trilogy/core/processing/node_generators/window_node.py +23 -2
  19. trilogy/core/processing/nodes/__init__.py +34 -6
  20. trilogy/core/processing/nodes/base_node.py +67 -13
  21. trilogy/core/processing/nodes/filter_node.py +3 -0
  22. trilogy/core/processing/nodes/group_node.py +3 -0
  23. trilogy/core/processing/nodes/merge_node.py +1 -11
  24. trilogy/core/processing/nodes/select_node_v2.py +1 -0
  25. trilogy/core/processing/utility.py +29 -10
  26. trilogy/core/query_processor.py +47 -20
  27. trilogy/dialect/base.py +47 -14
  28. trilogy/dialect/common.py +15 -3
  29. trilogy/dialect/presto.py +2 -1
  30. trilogy/parsing/parse_engine.py +20 -1
  31. trilogy/parsing/trilogy.lark +3 -1
  32. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/LICENSE.md +0 -0
  33. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/entry_points.txt +0 -0
  34. {pytrilogy-0.0.2.7.dist-info → pytrilogy-0.0.2.9.dist-info}/top_level.txt +0 -0
trilogy/core/optimization.py

@@ -17,19 +17,58 @@ from trilogy.core.optimizations import (
 MAX_OPTIMIZATION_LOOPS = 100
 
 
+# other optimizations may make a CTE a pure passthrough
+# remove those
+# def is_locally_irrelevant(cte: CTE) -> CTE | bool:
+#     if not len(cte.parent_ctes) == 1:
+#         return False
+#     parent = cte.parent_ctes[0]
+#     if not parent.output_columns == cte.output_columns:
+#         return False
+#     if cte.condition is not None:
+#         return False
+#     if cte.group_to_grain:
+#         return False
+#     if len(cte.joins)>1:
+#         return False
+#     return parent
+
+
 def filter_irrelevant_ctes(
     input: list[CTE],
     root_cte: CTE,
 ):
     relevant_ctes = set()
 
-    def recurse(cte: CTE):
+    def recurse(cte: CTE, inverse_map: dict[str, list[CTE]]):
+        # TODO: revisit this
+        # if parent := is_locally_irrelevant(cte):
+        #     logger.info(
+        #         f"[Optimization][Irrelevent CTE filtering] Removing redundant CTE {cte.name} and replacing with {parent.name}"
+        #     )
+        #     for child in inverse_map.get(cte.name, []):
+        #         child.parent_ctes = [
+        #             x for x in child.parent_ctes if x.name != cte.name
+        #         ] + [parent]
+        #         for x in child.source_map:
+        #             if cte.name in child.source_map[x]:
+        #                 child.source_map[x].remove(cte.name)
+        #                 child.source_map[x].append(parent.name)
+        #         for x2 in child.existence_source_map:
+        #             if cte.name in child.existence_source_map[x2]:
+        #                 child.existence_source_map[x2].remove(cte.name)
+        #                 child.existence_source_map[x2].append(parent.name)
+        # else:
         relevant_ctes.add(cte.name)
         for cte in cte.parent_ctes:
-            recurse(cte)
-
-    recurse(root_cte)
-    return [cte for cte in input if cte.name in relevant_ctes]
+            recurse(cte, inverse_map)
+
+    inverse_map = gen_inverse_map(input)
+    recurse(root_cte, inverse_map)
+    final = [cte for cte in input if cte.name in relevant_ctes]
+    if len(final) == len(input):
+        return input
+    return filter_irrelevant_ctes(final, root_cte)
 
 
 def gen_inverse_map(input: list[CTE]) -> dict[str, list[CTE]]:
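
The rewrite of filter_irrelevant_ctes above turns a single reachability pass into an iterate-to-fixpoint routine: it re-runs itself until the CTE list stops shrinking, and threads a child (inverse) map through so that rules like the commented-out is_locally_irrelevant could splice a passthrough CTE out of its children. A minimal standalone sketch of the fixpoint-pruning idea, using plain dicts in place of trilogy's CTE objects (all names below are illustrative, not the library's API):

    def prune_unreachable(nodes: dict[str, list[str]], root: str) -> dict[str, list[str]]:
        # Keep only nodes reachable from root via parent edges, repeating
        # the pass until the mapping stops shrinking (the fixpoint).
        while True:
            reachable: set[str] = set()

            def visit(name: str) -> None:
                if name in reachable:
                    return
                reachable.add(name)
                for parent in nodes.get(name, []):
                    visit(parent)

            visit(root)
            pruned = {k: v for k, v in nodes.items() if k in reachable}
            if len(pruned) == len(nodes):
                return pruned
            nodes = pruned

    # "staging" is not an ancestor of "final", so it is pruned.
    graph = {"final": ["base"], "base": [], "staging": ["base"]}
    assert set(prune_unreachable(graph, "final")) == {"final", "base"}
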
trilogy/core/optimizations/inline_datasource.py

@@ -5,6 +5,7 @@ from trilogy.core.models import (
 
 from trilogy.core.optimizations.base_optimization import OptimizationRule
 from collections import defaultdict
+from trilogy.constants import CONFIG
 
 
 class InlineDatasource(OptimizationRule):
@@ -18,28 +19,28 @@ class InlineDatasource(OptimizationRule):
         if not cte.parent_ctes:
             return False
 
-        self.log(
+        self.debug(
             f"Checking {cte.name} for consolidating inline tables with {len(cte.parent_ctes)} parents"
         )
         to_inline: list[CTE] = []
         force_group = False
         for parent_cte in cte.parent_ctes:
             if not parent_cte.is_root_datasource:
-                self.log(f"parent {parent_cte.name} is not root")
+                self.debug(f"parent {parent_cte.name} is not root")
                 continue
             if parent_cte.parent_ctes:
-                self.log(f"parent {parent_cte.name} has parents")
+                self.debug(f"parent {parent_cte.name} has parents")
                 continue
             if parent_cte.condition:
-                self.log(f"parent {parent_cte.name} has condition, cannot be inlined")
+                self.debug(f"parent {parent_cte.name} has condition, cannot be inlined")
                 continue
             raw_root = parent_cte.source.datasources[0]
             if not isinstance(raw_root, Datasource):
-                self.log(f"parent {parent_cte.name} is not datasource")
+                self.debug(f"Parent {parent_cte.name} is not datasource")
                 continue
             root: Datasource = raw_root
             if not root.can_be_inlined:
-                self.log(f"parent {parent_cte.name} datasource is not inlineable")
+                self.debug(f"Parent {parent_cte.name} datasource is not inlineable")
                 continue
             root_outputs = {x.address for x in root.output_concepts}
             inherited = {
@@ -52,7 +53,9 @@ class InlineDatasource(OptimizationRule):
                 )
                 continue
             if not root.grain.issubset(parent_cte.grain):
-                self.log(f"Not all {parent_cte.name} is at wrong grain to inline")
+                self.log(
+                    f"{parent_cte.name} is at wrong grain to inline ({root.grain} vs {parent_cte.grain})"
+                )
                 continue
             to_inline.append(parent_cte)
 
@@ -62,7 +65,10 @@ class InlineDatasource(OptimizationRule):
            self.candidates[cte.name].add(replaceable.name)
            self.count[replaceable.source.name] += 1
            return True
-        if self.count[replaceable.source.name] > 1:
+        if (
+            self.count[replaceable.source.name]
+            > CONFIG.optimizations.constant_inline_cutoff
+        ):
            self.log(
                f"Skipping inlining raw datasource {replaceable.source.name} ({replaceable.name}) due to multiple references"
            )
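
The last hunk above replaces the hard-coded limit (count > 1) with CONFIG.optimizations.constant_inline_cutoff, making configurable how many references a raw datasource may accumulate before inlining is skipped. A toy sketch of that reference-count gate (the helper below is illustrative, not trilogy's API):

    from collections import Counter

    def should_inline(source_name: str, counts: Counter, cutoff: int = 1) -> bool:
        # Inline while the datasource has few referencing CTEs; past the
        # cutoff, a single shared CTE beats repeating the raw scan inline.
        counts[source_name] += 1
        return cutoff >= counts[source_name]

    counts: Counter = Counter()
    assert should_inline("orders", counts) is True   # first reference: inline
    assert should_inline("orders", counts) is False  # second reference exceeds cutoff=1
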
trilogy/core/optimizations/predicate_pushdown.py

@@ -3,39 +3,13 @@ from trilogy.core.models import (
     Conditional,
     BooleanOperator,
     Datasource,
-    SubselectComparison,
+    ConceptArgs,
     Comparison,
     Parenthetical,
 )
 from trilogy.core.optimizations.base_optimization import OptimizationRule
 from trilogy.core.processing.utility import is_scalar_condition
-
-
-def decompose_condition(
-    conditional: Conditional,
-) -> list[SubselectComparison | Comparison | Conditional | Parenthetical]:
-    chunks: list[SubselectComparison | Comparison | Conditional | Parenthetical] = []
-    if conditional.operator == BooleanOperator.AND:
-        if not (
-            isinstance(
-                conditional.left,
-                (SubselectComparison, Comparison, Conditional, Parenthetical),
-            )
-            and isinstance(
-                conditional.right,
-                (SubselectComparison, Comparison, Conditional, Parenthetical),
-            )
-        ):
-            chunks.append(conditional)
-        else:
-            for val in [conditional.left, conditional.right]:
-                if isinstance(val, Conditional):
-                    chunks.extend(decompose_condition(val))
-                else:
-                    chunks.append(val)
-    else:
-        chunks.append(conditional)
-    return chunks
+from trilogy.utility import unique
 
 
 def is_child_of(a, comparison):
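
The decompose_condition helper removed above split a condition into independently pushable chunks, and only along AND: each AND-ed predicate must hold on its own, so each can be pushed to a parent separately, while an OR has to travel as a single unit. A compact restatement of that rule, with tuples standing in for trilogy's Conditional tree (illustrative types, not the library's):

    Cond = tuple | str  # ("AND" | "OR", left, right), or a leaf predicate string

    def decompose(cond: Cond) -> list[Cond]:
        # Recurse only through AND nodes; everything else is an atomic chunk.
        if isinstance(cond, tuple) and cond[0] == "AND":
            return decompose(cond[1]) + decompose(cond[2])
        return [cond]

    assert decompose(("AND", "a = 1", ("AND", "b = 2", "c = 3"))) == ["a = 1", "b = 2", "c = 3"]
    assert decompose(("OR", "a = 1", "b = 2")) == [("OR", "a = 1", "b = 2")]
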
@@ -57,35 +31,51 @@ class PredicatePushdown(OptimizationRule):
 
     def _check_parent(
         self,
+        cte: CTE,
         parent_cte: CTE,
-        candidate: Conditional,
+        candidate: Conditional | Comparison | Parenthetical | None,
         inverse_map: dict[str, list[CTE]],
     ):
-        conditions = {x.address for x in candidate.concept_arguments}
+        if not isinstance(candidate, ConceptArgs):
+            return False
+        row_conditions = {x.address for x in candidate.row_arguments}
+        existence_conditions = {
+            y.address for x in candidate.existence_arguments for y in x
+        }
+        all_inputs = {x.address for x in candidate.concept_arguments}
         if is_child_of(candidate, parent_cte.condition):
             return False
 
         materialized = {k for k, v in parent_cte.source_map.items() if v != []}
-        if not conditions or not materialized:
+        if not row_conditions or not materialized:
+            return False
+        output_addresses = {x.address for x in parent_cte.output_columns}
+        # if any of the existence conditions are created on the asset, we can't push up to it
+        if existence_conditions and existence_conditions.intersection(output_addresses):
             return False
         # if it's a root datasource, we can filter on _any_ of the output concepts
         if parent_cte.is_root_datasource:
             extra_check = {
                 x.address for x in parent_cte.source.datasources[0].output_concepts
             }
-            if conditions.issubset(extra_check):
-                for x in conditions:
+            if row_conditions.issubset(extra_check):
+                for x in row_conditions:
                     if x not in materialized:
                         materialized.add(x)
                         parent_cte.source_map[x] = [
                             parent_cte.source.datasources[0].name
                         ]
-        if conditions.issubset(materialized):
+        if row_conditions.issubset(materialized):
             children = inverse_map.get(parent_cte.name, [])
             if all([is_child_of(candidate, child.condition) for child in children]):
                 self.log(
                     f"All concepts are found on {parent_cte.name} with existing {parent_cte.condition} and all it's {len(children)} children include same filter; pushing up {candidate}"
                 )
+                if parent_cte.condition and not is_scalar_condition(
+                    parent_cte.condition
+                ):
+                    self.log("Parent condition is not scalar, not safe to push up")
+                    return False
                 if parent_cte.condition:
                     parent_cte.condition = Conditional(
                         left=parent_cte.condition,
@@ -94,9 +84,22 @@ class PredicatePushdown(OptimizationRule):
                     )
                 else:
                     parent_cte.condition = candidate
+                # promote up existence sources
+                if all_inputs.difference(row_conditions):
+                    for x in all_inputs.difference(row_conditions):
+                        if x not in parent_cte.source_map and x in cte.source_map:
+                            sources = [
+                                parent
+                                for parent in cte.parent_ctes
+                                if parent.name in cte.source_map[x]
+                            ]
+                            parent_cte.source_map[x] = cte.source_map[x]
+                            parent_cte.parent_ctes = unique(
+                                parent_cte.parent_ctes + sources, "name"
+                            )
                 return True
         self.debug(
-            f"conditions {conditions} not subset of parent {parent_cte.name} parent has {materialized} "
+            f"conditions {row_conditions} not subset of parent {parent_cte.name} parent has {materialized} "
         )
         return False
 
@@ -111,24 +114,47 @@ class PredicatePushdown(OptimizationRule):
         if not cte.condition:
             self.debug(f"No CTE condition for {cte.name}")
             return False
+
+        parent_filter_status = {
+            parent.name: is_child_of(cte.condition, parent.condition)
+            for parent in cte.parent_ctes
+        }
+        # flatten existnce argument tuples to a list
+
+        flattened_existence = [
+            x.address for y in cte.condition.existence_arguments for x in y
+        ]
+
+        existence_only = [
+            parent.name
+            for parent in cte.parent_ctes
+            if all([x.address in flattened_existence for x in parent.output_columns])
+            and len(flattened_existence) > 0
+        ]
         if all(
             [
-                is_child_of(cte.condition, parent_cte.condition)
-                for parent_cte in cte.parent_ctes
+                value
+                for key, value in parent_filter_status.items()
+                if key not in existence_only
             ]
         ) and not any([isinstance(x, Datasource) for x in cte.source.datasources]):
             self.log(
-                f"All parents of {cte.name} have same filter, removing filter from {cte.name}"
+                f"All parents of {cte.name} have same filter or are existence only inputs, removing filter from {cte.name}"
             )
             cte.condition = None
+            # remove any "parent" CTEs that provided only existence inputs
+            if existence_only:
+                original = [y.name for y in cte.parent_ctes]
+                cte.parent_ctes = [
+                    x for x in cte.parent_ctes if x.name not in existence_only
+                ]
+                self.log(
+                    f"new parents for {cte.name} are {[x.name for x in cte.parent_ctes]}, vs {original}"
+                )
             return True
         else:
-            mapping = {
-                parent.name: is_child_of(cte.condition, parent.condition)
-                for parent in cte.parent_ctes
-            }
             self.log(
-                f"Could not remove filter from {cte.name}, as not all parents have the same filter: {mapping}"
+                f"Could not remove filter from {cte.name}, as not all parents have the same filter: {parent_filter_status}"
             )
         if self.complete.get(cte.name):
             self.debug("Have done this CTE before")
@@ -156,7 +182,10 @@ class PredicatePushdown(OptimizationRule):
             )
             for parent_cte in cte.parent_ctes:
                 local_pushdown = self._check_parent(
-                    parent_cte=parent_cte, candidate=candidate, inverse_map=inverse_map
+                    cte=cte,
+                    parent_cte=parent_cte,
+                    candidate=candidate,
+                    inverse_map=inverse_map,
                 )
                 optimized = optimized or local_pushdown
                 if local_pushdown:
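
Taken together, the predicate_pushdown hunks split a candidate condition's inputs into row arguments (concepts the predicate filters on directly) and existence arguments (concepts referenced only inside a subselect, as in x IN (SELECT y ...)). Pushing into a parent now requires that the parent materializes every row argument and does not itself produce the existence concepts. A simplified restatement of that gate, with plain address sets standing in for CTE objects (names are illustrative):

    def safe_to_push(row_args: set[str], existence_args: set[str],
                     materialized: set[str], parent_outputs: set[str]) -> bool:
        if not row_args:
            return False  # nothing filters rows directly
        if existence_args.intersection(parent_outputs):
            return False  # the parent itself creates the subselect's concepts
        return row_args.issubset(materialized)  # every row input must be materialized

    # x IN (SELECT y ...): row argument {"x"}, existence argument {"y"}
    assert safe_to_push({"x"}, {"y"}, {"x", "z"}, {"x", "z"}) is True
    assert safe_to_push({"x"}, {"y"}, {"x"}, {"x", "y"}) is False
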
trilogy/core/processing/concept_strategies_v3.py

@@ -5,7 +5,7 @@ from trilogy.constants import logger
 from trilogy.core.enums import PurposeLineage, Granularity, FunctionType
 from trilogy.core.env_processor import generate_graph
 from trilogy.core.graph_models import ReferenceGraph
-from trilogy.core.models import Concept, Environment, Function, Grain
+from trilogy.core.models import Concept, Environment, Function, Grain, WhereClause
 from trilogy.core.processing.utility import (
     get_disconnected_components,
 )
@@ -183,10 +183,14 @@ def generate_candidates_restrictive(
         if x.address not in exhausted and x.granularity != Granularity.SINGLE_ROW
     ]
     combos: list[list[Concept]] = []
+    grain_check = Grain(components=[*local_candidates]).components_copy
     # for simple operations these, fetch as much as possible.
     if priority_concept.derivation in (PurposeLineage.BASIC, PurposeLineage.ROOT):
-        combos.append(local_candidates)
-        combos.append(Grain(components=[*local_candidates]).components_copy)
+        if set([x.address for x in grain_check]) != set(
+            [x.address for x in local_candidates]
+        ):
+            combos.append(local_candidates)
+        combos.append(grain_check)
     # append the empty set for sourcing concept by itself last
     combos.append([])
     return combos
@@ -201,6 +205,7 @@ def generate_node(
     source_concepts: Callable,
     accept_partial: bool = False,
     history: History | None = None,
+    conditions: WhereClause | None = None,
 ) -> StrategyNode | None:
     # first check in case there is a materialized_concept
     history = history or History()
@@ -214,6 +219,7 @@ def generate_node(
         accept_partial=accept_partial,
         accept_partial_optional=False,
         source_concepts=source_concepts,
+        conditions=conditions,
     )
 
     if candidate:
@@ -224,7 +230,14 @@ def generate_node(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating window node with optional {[x.address for x in local_optional]}"
         )
         return gen_window_node(
-            concept, local_optional, environment, g, depth + 1, source_concepts, history
+            concept,
+            local_optional,
+            environment,
+            g,
+            depth + 1,
+            source_concepts,
+            history,
+            conditions=conditions,
         )
 
     elif concept.derivation == PurposeLineage.FILTER:
@@ -232,14 +245,28 @@ def generate_node(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating filter node with optional {[x.address for x in local_optional]}"
         )
         return gen_filter_node(
-            concept, local_optional, environment, g, depth + 1, source_concepts, history
+            concept,
+            local_optional,
+            environment,
+            g,
+            depth + 1,
+            source_concepts=source_concepts,
+            history=history,
+            conditions=conditions,
         )
     elif concept.derivation == PurposeLineage.UNNEST:
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating unnest node with optional {[x.address for x in local_optional]}"
         )
         return gen_unnest_node(
-            concept, local_optional, environment, g, depth + 1, source_concepts, history
+            concept,
+            local_optional,
+            environment,
+            g,
+            depth + 1,
+            source_concepts,
+            history,
+            conditions=conditions,
         )
     elif concept.derivation == PurposeLineage.AGGREGATE:
         # don't push constants up before aggregation
@@ -255,7 +282,14 @@ def generate_node(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating aggregate node with {[x.address for x in agg_optional]}"
         )
         return gen_group_node(
-            concept, agg_optional, environment, g, depth + 1, source_concepts, history
+            concept,
+            agg_optional,
+            environment,
+            g,
+            depth + 1,
+            source_concepts,
+            history,
+            conditions=conditions,
         )
     elif concept.derivation == PurposeLineage.ROWSET:
         logger.info(
@@ -322,6 +356,7 @@ def generate_node(
             accept_partial=accept_partial,
             accept_partial_optional=True,
             source_concepts=source_concepts,
+            conditions=conditions,
         )
     else:
         raise ValueError(f"Unknown derivation {concept.derivation}")
@@ -447,10 +482,13 @@ def search_concepts(
     g: ReferenceGraph,
     accept_partial: bool = False,
     history: History | None = None,
+    conditions: WhereClause | None = None,
 ) -> StrategyNode | None:
 
     history = history or History()
-    hist = history.get_history(mandatory_list, accept_partial)
+    hist = history.get_history(
+        search=mandatory_list, accept_partial=accept_partial, conditions=conditions
+    )
     if hist is not False:
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Returning search node from history ({'exists' if hist is not None else 'does not exist'}) for {[c.address for c in mandatory_list]} with accept_partial {accept_partial}"
@@ -465,10 +503,14 @@ def search_concepts(
         g=g,
         accept_partial=accept_partial,
         history=history,
+        conditions=conditions,
     )
     # a node may be mutated after be cached; always store a copy
     history.search_to_history(
-        mandatory_list, accept_partial, result.copy() if result else None
+        mandatory_list,
+        accept_partial,
+        result.copy() if result else None,
+        conditions=conditions,
     )
     return result
 
@@ -480,6 +522,7 @@ def _search_concepts(
     g: ReferenceGraph,
     history: History,
     accept_partial: bool = False,
+    conditions: WhereClause | None = None,
 ) -> StrategyNode | None:
 
     mandatory_list = unique(mandatory_list, "address")
@@ -521,6 +564,7 @@ def _search_concepts(
                 source_concepts=search_concepts,
                 accept_partial=accept_partial,
                 history=history,
+                conditions=conditions,
             )
             if node:
                 stack.append(node)
@@ -559,22 +603,11 @@ def _search_concepts(
         f"{depth_to_prefix(depth)}{LOGGER_PREFIX} finished sourcing loop (complete: {complete}), have {found} from {[n for n in stack]} (missing {all_mandatory - found}), attempted {attempted}, virtual {virtual}"
     )
     if complete == ValidationResult.COMPLETE:
-        all_partial = [
-            c
-            for c in mandatory_list
-            if all(
-                [
-                    c.address in [x.address for x in p.partial_concepts]
-                    for p in stack
-                    if [c in p.output_concepts]
-                ]
-            )
-        ]
         non_virtual = [c for c in mandatory_list if c.address not in virtual]
         if len(stack) == 1:
             output = stack[0]
             logger.info(
-                f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Source stack has single node, returning that {type(output)}"
+                f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Source stack has single node, returning that {type(output)} with output {[x.address for x in output.output_concepts]}"
             )
             return output
 
@@ -585,23 +618,26 @@ def _search_concepts(
             g=g,
             parents=stack,
             depth=depth,
-            partial_concepts=all_partial,
         )
 
         # ensure we can resolve our final merge
         output.resolve()
         logger.info(
-            f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Graph is connected, returning merge node, partial {[c.address for c in all_partial]}"
+            f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Graph is connected, returning merge node, partial {[c.address for c in output.partial_concepts]}"
        )
         return output
 
     # check that we're not already in a discovery loop
-    if not history.check_started(mandatory_list, accept_partial):
+    if not history.check_started(
+        mandatory_list, accept_partial=accept_partial, conditions=conditions
+    ):
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Stack is not connected graph, flag for accepting partial addresses is {accept_partial}, checking for expanded concepts"
         )
         # gate against further recursion into this
-        history.log_start(mandatory_list, accept_partial)
+        history.log_start(
+            mandatory_list, accept_partial=accept_partial, conditions=conditions
+        )
         expanded = gen_merge_node(
             all_concepts=mandatory_list,
             environment=environment,
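
In the hunks around this point, every History entry point (get_history, search_to_history, check_started, log_start) gains a conditions parameter, so cached results and in-flight searches are now distinguished by the active filter as well as by the concept list. A minimal sketch of why the cache key must widen (the key function is illustrative, not the library's):

    def cache_key(addresses: list[str], accept_partial: bool, conditions: str | None) -> tuple:
        # The same concept list under a different filter is a different plan.
        return (tuple(sorted(addresses)), accept_partial, conditions)

    plain = cache_key(["orders.id"], False, None)
    filtered = cache_key(["orders.id"], False, "orders.open = true")
    assert plain != filtered  # a filtered search no longer reuses the unfiltered plan
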
@@ -641,6 +677,7 @@ def _search_concepts(
             g=g,
             accept_partial=True,
             history=history,
+            conditions=conditions,
         )
         if partial_search:
             logger.info(
@@ -657,18 +694,22 @@ def source_query_concepts(
     output_concepts: List[Concept],
     environment: Environment,
     g: Optional[ReferenceGraph] = None,
+    conditions: Optional[WhereClause] = None,
+    history: Optional[History] = None,
 ):
-    if not g:
-        g = generate_graph(environment)
     if not output_concepts:
         raise ValueError(f"No output concepts provided {output_concepts}")
-    history = History()
+    if not g:
+        g = generate_graph(environment)
+
+    history = history or History()
     root = search_concepts(
         mandatory_list=output_concepts,
         environment=environment,
         g=g,
         depth=0,
         history=history,
+        conditions=conditions,
    )
 
     if not root:
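
Finally, source_query_concepts now accepts an optional conditions (WhereClause) and a caller-supplied history, so a query's filter travels through node discovery instead of being applied after the fact, and planning state can be reused across calls. A hypothetical wiring sketch under those assumptions (the wrapper below is invented; only the keyword names come from the hunk above):

    def plan_with_shared_history(output_concepts, environment, conditions=None, history=None):
        # Thread the WHERE clause and a reusable History through planning.
        from trilogy.core.processing.concept_strategies_v3 import source_query_concepts

        return source_query_concepts(
            output_concepts=output_concepts,
            environment=environment,
            conditions=conditions,  # new keyword in this release
            history=history,        # new keyword: reuse planning state across queries
        )
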