pytrilogy 0.0.1.104__py3-none-any.whl → 0.0.1.106__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (32)
  1. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/METADATA +1 -1
  2. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/RECORD +32 -31
  3. trilogy/__init__.py +3 -2
  4. trilogy/constants.py +1 -0
  5. trilogy/core/models.py +226 -49
  6. trilogy/core/optimization.py +141 -0
  7. trilogy/core/processing/concept_strategies_v3.py +1 -0
  8. trilogy/core/processing/node_generators/common.py +19 -7
  9. trilogy/core/processing/node_generators/filter_node.py +37 -10
  10. trilogy/core/processing/node_generators/merge_node.py +11 -1
  11. trilogy/core/processing/nodes/base_node.py +4 -2
  12. trilogy/core/processing/nodes/group_node.py +5 -2
  13. trilogy/core/processing/nodes/merge_node.py +13 -8
  14. trilogy/core/query_processor.py +5 -2
  15. trilogy/dialect/base.py +85 -54
  16. trilogy/dialect/bigquery.py +6 -4
  17. trilogy/dialect/common.py +8 -6
  18. trilogy/dialect/config.py +69 -1
  19. trilogy/dialect/duckdb.py +5 -4
  20. trilogy/dialect/enums.py +40 -19
  21. trilogy/dialect/postgres.py +4 -2
  22. trilogy/dialect/presto.py +6 -4
  23. trilogy/dialect/snowflake.py +6 -4
  24. trilogy/dialect/sql_server.py +4 -1
  25. trilogy/executor.py +18 -5
  26. trilogy/parsing/common.py +30 -0
  27. trilogy/parsing/parse_engine.py +43 -83
  28. trilogy/parsing/render.py +0 -122
  29. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/LICENSE.md +0 -0
  30. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/WHEEL +0 -0
  31. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/entry_points.txt +0 -0
  32. {pytrilogy-0.0.1.104.dist-info → pytrilogy-0.0.1.106.dist-info}/top_level.txt +0 -0
trilogy/core/optimization.py
@@ -0,0 +1,141 @@
+from trilogy.core.models import (
+    CTE,
+    SelectStatement,
+    PersistStatement,
+    Datasource,
+    MultiSelectStatement,
+)
+from trilogy.core.enums import PurposeLineage
+from trilogy.constants import logger
+from abc import ABC
+
+
+class OptimizationRule(ABC):
+
+    def optimize(self, cte: CTE) -> bool:
+        raise NotImplementedError
+
+    def log(self, message: str):
+        logger.info(f"[Optimization][{self.__class__.__name__}] {message}")
+
+
+class InlineDatasource(OptimizationRule):
+
+    def optimize(self, cte: CTE) -> bool:
+        if not cte.parent_ctes:
+            return False
+
+        optimized = False
+        self.log(
+            f"Checking {cte.name} for consolidating inline tables with {len(cte.parent_ctes)} parents"
+        )
+        to_inline: list[CTE] = []
+        for parent_cte in cte.parent_ctes:
+            if not parent_cte.is_root_datasource:
+                self.log(f"parent {parent_cte.name} is not root")
+                continue
+            if parent_cte.parent_ctes:
+                self.log(f"parent {parent_cte.name} has parents")
+                continue
+            raw_root = parent_cte.source.datasources[0]
+            if not isinstance(raw_root, Datasource):
+                self.log(f"parent {parent_cte.name} is not datasource")
+                continue
+            root: Datasource = raw_root
+            if not root.can_be_inlined:
+                self.log(f"parent {parent_cte.name} datasource is not inlineable")
+                continue
+            root_outputs = {x.address for x in root.output_concepts}
+            cte_outputs = {x.address for x in parent_cte.output_columns}
+            if not cte_outputs.issubset(root_outputs):
+                self.log(f"Not all {parent_cte.name} outputs are found on datasource")
+                continue
+
+            to_inline.append(parent_cte)
+
+        for replaceable in to_inline:
+            self.log(f"Inlining parent {replaceable.name}")
+            cte.inline_parent_datasource(replaceable)
+
+        return optimized
+
+
+REGISTERED_RULES: list[OptimizationRule] = [InlineDatasource()]
+
+
+def filter_irrelevant_ctes(input: list[CTE], root_cte: CTE):
+    relevant_ctes = set()
+
+    def recurse(cte: CTE):
+        relevant_ctes.add(cte.name)
+        for cte in cte.parent_ctes:
+            recurse(cte)
+
+    recurse(root_cte)
+    return [cte for cte in input if cte.name in relevant_ctes]
+
+
+def is_direct_return_eligible(
+    cte: CTE, select: SelectStatement | PersistStatement | MultiSelectStatement
+) -> bool:
+    if isinstance(select, (PersistStatement, MultiSelectStatement)):
+        return False
+    derived_concepts = [
+        c for c in cte.source.output_concepts if c not in cte.source.input_concepts
+    ]
+    eligible = True
+    conditions = (
+        set(x.address for x in select.where_clause.concept_arguments)
+        if select.where_clause
+        else set()
+    )
+    if conditions and select.limit:
+        return False
+    for x in derived_concepts:
+        if x.derivation == PurposeLineage.WINDOW:
+            return False
+        if x.derivation == PurposeLineage.AGGREGATE:
+            if x.address in conditions:
+                return False
+    logger.info(
+        f"Upleveling output select to final CTE with derived_concepts {[x.address for x in derived_concepts]}"
+    )
+    return eligible
+
+
+def sort_select_output(cte: CTE, query: SelectStatement | MultiSelectStatement):
+    hidden_addresses = [c.address for c in query.hidden_components]
+    output_addresses = [
+        c.address for c in query.output_components if c.address not in hidden_addresses
+    ]
+
+    mapping = {x.address: x for x in cte.output_columns}
+
+    new_output = []
+    for x in output_addresses:
+        new_output.append(mapping[x])
+    cte.output_columns = new_output
+
+
+def optimize_ctes(
+    input: list[CTE], root_cte: CTE, select: SelectStatement | MultiSelectStatement
+):
+    complete = False
+
+    while not complete:
+        actions_taken = False
+        for rule in REGISTERED_RULES:
+            for cte in input:
+                actions_taken = rule.optimize(cte)
+        complete = not actions_taken
+
+    if is_direct_return_eligible(root_cte, select):
+        root_cte.order_by = select.order_by
+        root_cte.limit = select.limit
+        root_cte.condition = (
+            select.where_clause.conditional if select.where_clause else None
+        )
+        root_cte.requires_nesting = False
+        sort_select_output(cte, select)
+
+    return filter_irrelevant_ctes(input, root_cte)
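The rule registry makes this pass extensible: anything implementing the OptimizationRule contract above can be appended to REGISTERED_RULES and will be run in the same fixed-point loop by optimize_ctes. A minimal sketch of a hypothetical extra rule; the class name and its pruning logic are illustrative, not part of this release:

# Hypothetical sketch following the OptimizationRule contract above;
# DropEmptyParents is illustrative and not shipped in 0.0.1.106.
from trilogy.core.models import CTE
from trilogy.core.optimization import REGISTERED_RULES, OptimizationRule


class DropEmptyParents(OptimizationRule):

    def optimize(self, cte: CTE) -> bool:
        # prune parent CTEs that contribute no output columns
        empty = [p for p in cte.parent_ctes if not p.output_columns]
        for parent in empty:
            self.log(f"Dropping empty parent {parent.name} from {cte.name}")
            cte.parent_ctes.remove(parent)
        # reporting True asks optimize_ctes for another pass over the rules
        return bool(empty)


REGISTERED_RULES.append(DropEmptyParents())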
trilogy/core/processing/concept_strategies_v3.py
@@ -317,6 +317,7 @@ def generate_node(
         return gen_basic_node(
             concept, local_optional, environment, g, depth + 1, source_concepts, history
         )
+
     elif concept.derivation == PurposeLineage.ROOT:
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating select node with optional {[x.address for x in local_optional]}"
trilogy/core/processing/node_generators/common.py
@@ -45,21 +45,33 @@ def resolve_function_parent_concepts(concept: Concept) -> List[Concept]:
     return unique(concept.lineage.concept_arguments, "address")
 
 
-def resolve_filter_parent_concepts(concept: Concept) -> Tuple[Concept, List[Concept]]:
+def resolve_filter_parent_concepts(
+    concept: Concept,
+) -> Tuple[Concept, List[Concept], List[Concept]]:
     if not isinstance(concept.lineage, FilterItem):
-        raise ValueError
+        raise ValueError(
+            f"Concept {concept} lineage is not filter item, is {type(concept.lineage)}"
+        )
     direct_parent = concept.lineage.content
-    base = [direct_parent]
-    base += concept.lineage.where.concept_arguments
+    base_existence = []
+    base_rows = [direct_parent]
+    base_rows += concept.lineage.where.row_arguments
+    base_existence += concept.lineage.where.existence_arguments
     if direct_parent.grain:
-        base += direct_parent.grain.components_copy
+        base_rows += direct_parent.grain.components_copy
     if (
         isinstance(direct_parent, Concept)
         and direct_parent.purpose == Purpose.PROPERTY
         and direct_parent.keys
     ):
-        base += direct_parent.keys
-    return concept.lineage.content, unique(base, "address")
+        base_rows += direct_parent.keys
+    if concept.lineage.where.existence_arguments:
+        return (
+            concept.lineage.content,
+            unique(base_rows, "address"),
+            unique(base_existence, "address"),
+        )
+    return concept.lineage.content, unique(base_rows, "address"), []
 
 
 def gen_property_enrichment_node(
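The key change above is that filter parents are now split in two: row arguments, which must be brought to the grain of the filtered rows, and existence arguments, which only need to be satisfiable in a separate EXISTS/IN-style subselect. A self-contained toy analogue of that split, using hypothetical names rather than trilogy's types:

# Illustrative analogue only: partition a filter's referenced fields into
# those that must join at row grain vs. those only needed for an
# EXISTS-style subquery. ToyFilter is hypothetical, not trilogy's API.
from dataclasses import dataclass, field


@dataclass
class ToyFilter:
    content: str  # the concept being filtered, e.g. "order.id"
    row_arguments: list[str] = field(default_factory=list)
    existence_arguments: list[str] = field(default_factory=list)


def resolve_toy_parents(f: ToyFilter) -> tuple[str, list[str], list[str]]:
    # mirrors resolve_filter_parent_concepts: the filtered concept and its
    # row-level inputs travel together; existence inputs stay separate so
    # the planner can source them from an independent subquery
    return f.content, [f.content, *f.row_arguments], list(f.existence_arguments)


# "orders on dates where any refund occurred": refund.date never joins to
# the order rows, so it lands in the third element of the tuple
content, rows, existence = resolve_toy_parents(
    ToyFilter("order.id", ["order.date"], ["refund.date"])
)
assert rows == ["order.id", "order.date"] and existence == ["refund.date"]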
trilogy/core/processing/node_generators/filter_node.py
@@ -11,7 +11,7 @@ from trilogy.core.processing.node_generators.common import (
     resolve_filter_parent_concepts,
 )
 from trilogy.constants import logger
-from trilogy.core.processing.utility import padding
+from trilogy.core.processing.utility import padding, unique
 from trilogy.core.processing.node_generators.common import concept_to_relevant_joins
 
 LOGGER_PREFIX = "[GEN_FILTER_NODE]"
@@ -26,30 +26,57 @@ def gen_filter_node(
     source_concepts,
     history: History | None = None,
 ) -> MergeNode | FilterNode | None:
-    immediate_parent, parent_concepts = resolve_filter_parent_concepts(concept)
+    immediate_parent, parent_row_concepts, parent_existence_concepts = (
+        resolve_filter_parent_concepts(concept)
+    )
 
-    logger.info(f"{padding(depth)}{LOGGER_PREFIX} fetching filter node parents")
+    logger.info(
+        f"{padding(depth)}{LOGGER_PREFIX} fetching filter node row parents {[x.address for x in parent_row_concepts]}"
+    )
+    core_parents = []
     parent = source_concepts(
-        mandatory_list=parent_concepts,
+        mandatory_list=parent_row_concepts,
         environment=environment,
         g=g,
        depth=depth + 1,
        history=history,
     )
+
     if not parent:
         return None
+    core_parents.append(parent)
+    if parent_existence_concepts:
+        logger.info(
+            f"{padding(depth)}{LOGGER_PREFIX} fetching filter node existence parents {[x.address for x in parent_existence_concepts]}"
+        )
+        parent_existence = source_concepts(
+            mandatory_list=parent_existence_concepts,
+            environment=environment,
+            g=g,
+            depth=depth + 1,
+            history=history,
+        )
+        if not parent_existence:
+            return None
+        core_parents.append(parent_existence)
+
     filter_node = FilterNode(
-        input_concepts=[immediate_parent] + parent_concepts,
-        output_concepts=[concept, immediate_parent] + parent_concepts,
+        input_concepts=unique(
+            [immediate_parent] + parent_row_concepts + parent_existence_concepts,
+            "address",
+        ),
+        output_concepts=[concept, immediate_parent] + parent_row_concepts,
         environment=environment,
         g=g,
-        parents=[parent],
+        parents=core_parents,
     )
-    if not local_optional:
+    if not local_optional or all(
+        [x.address in [y.address for y in parent_row_concepts] for x in local_optional]
+    ):
         return filter_node
     enrich_node = source_concepts(  # this fetches the parent + join keys
         # to then connect to the rest of the query
-        mandatory_list=[immediate_parent] + parent_concepts + local_optional,
+        mandatory_list=[immediate_parent] + parent_row_concepts + local_optional,
         environment=environment,
         g=g,
         depth=depth + 1,
@@ -75,7 +102,7 @@ def gen_filter_node(
             left_node=enrich_node,
             right_node=filter_node,
             concepts=concept_to_relevant_joins(
-                [immediate_parent] + parent_concepts
+                [immediate_parent] + parent_row_concepts
             ),
             join_type=JoinType.LEFT_OUTER,
             filter_to_mutual=False,
trilogy/core/processing/node_generators/merge_node.py
@@ -87,8 +87,18 @@ def gen_merge_node(
 ) -> Optional[MergeNode]:
     join_candidates: List[PathInfo] = []
     # anchor on datasources
+    final_all_concepts = []
+    # implicit_upstream = {}
+    for x in all_concepts:
+        # if x.derivation in (PurposeLineage.AGGREGATE, PurposeLineage.BASIC):
+        #     final_all_concepts +=resolve_function_parent_concepts(x)
+        # elif x.derivation == PurposeLineage.FILTER:
+        #     final_all_concepts +=resolve_filter_parent_concepts(x)
+        # else:
+        #     final_all_concepts.append(x)
+        final_all_concepts.append(x)
     for datasource in environment.datasources.values():
-        path = identify_ds_join_paths(all_concepts, g, datasource, accept_partial)
+        path = identify_ds_join_paths(final_all_concepts, g, datasource, accept_partial)
         if path and path.reduced_concepts:
             join_candidates.append(path)
     join_candidates.sort(key=lambda x: sum([len(v) for v in x.paths.values()]))
trilogy/core/processing/nodes/base_node.py
@@ -45,7 +45,7 @@ def concept_list_to_grain(
 
 
 def resolve_concept_map(
-    inputs: List[QueryDatasource],
+    inputs: List[QueryDatasource | Datasource],
     targets: List[Concept],
     inherited_inputs: List[Concept],
     full_joins: List[Concept] | None = None,
@@ -156,7 +156,9 @@ class StrategyNode:
         return f"{self.__class__.__name__}<{contents}>"
 
     def _resolve(self) -> QueryDatasource:
-        parent_sources = [p.resolve() for p in self.parents]
+        parent_sources: List[QueryDatasource | Datasource] = [
+            p.resolve() for p in self.parents
+        ]
 
         # if conditional:
         #     for condition in conditions[1:]:
trilogy/core/processing/nodes/group_node.py
@@ -4,6 +4,7 @@ from trilogy.constants import logger
 from trilogy.core.models import (
     Grain,
     QueryDatasource,
+    Datasource,
     SourceType,
     Concept,
     Environment,
@@ -45,7 +46,9 @@ class GroupNode(StrategyNode):
         )
 
     def _resolve(self) -> QueryDatasource:
-        parent_sources: list[QueryDatasource] = [p.resolve() for p in self.parents]
+        parent_sources: List[QueryDatasource | Datasource] = [
+            p.resolve() for p in self.parents
+        ]
 
         grain = concept_list_to_grain(self.output_concepts, [])
         comp_grain = Grain()
@@ -66,7 +69,7 @@ class GroupNode(StrategyNode):
             len(parent_sources) == 1
             and LooseConceptList(concepts=parent_sources[0].output_concepts)
             == self.output_lcl
-        ):
+        ) and isinstance(parent_sources[0], QueryDatasource):
             logger.info(
                 f"{self.logging_prefix}{LOGGER_PREFIX} No group by required, returning parent node"
             )
trilogy/core/processing/nodes/merge_node.py
@@ -7,6 +7,7 @@ from trilogy.core.models import (
     Grain,
     JoinType,
     QueryDatasource,
+    Datasource,
     SourceType,
     Concept,
     UnnestJoin,
@@ -24,8 +25,8 @@ LOGGER_PREFIX = "[CONCEPT DETAIL - MERGE NODE]"
 
 
 def deduplicate_nodes(
-    merged: dict[str, QueryDatasource], logging_prefix: str
-) -> tuple[bool, dict[str, QueryDatasource], set[str]]:
+    merged: dict[str, QueryDatasource | Datasource], logging_prefix: str
+) -> tuple[bool, dict[str, QueryDatasource | Datasource], set[str]]:
     duplicates = False
     removed: set[str] = set()
     set_map: dict[str, set[str]] = {}
@@ -65,9 +66,9 @@ def deduplicate_nodes(
 
 def deduplicate_nodes_and_joins(
     joins: List[NodeJoin] | None,
-    merged: dict[str, QueryDatasource],
+    merged: dict[str, QueryDatasource | Datasource],
     logging_prefix: str,
-) -> Tuple[List[NodeJoin] | None, dict[str, QueryDatasource]]:
+) -> Tuple[List[NodeJoin] | None, dict[str, QueryDatasource | Datasource]]:
     # it's possible that we have more sources than we need
     duplicates = True
     while duplicates:
@@ -211,8 +212,10 @@ class MergeNode(StrategyNode):
         return joins
 
     def _resolve(self) -> QueryDatasource:
-        parent_sources = [p.resolve() for p in self.parents]
-        merged: dict[str, QueryDatasource] = {}
+        parent_sources: List[QueryDatasource | Datasource] = [
+            p.resolve() for p in self.parents
+        ]
+        merged: dict[str, QueryDatasource | Datasource] = {}
         final_joins = self.node_joins
         for source in parent_sources:
             if source.full_name in merged:
@@ -228,14 +231,15 @@ class MergeNode(StrategyNode):
             final_joins, merged, self.logging_prefix
         )
         # early exit if we can just return the parent
-        final_datasets: List[QueryDatasource] = list(merged.values())
+        final_datasets: List[QueryDatasource | Datasource] = list(merged.values())
 
         if len(merged.keys()) == 1:
-            final: QueryDatasource = list(merged.values())[0]
+            final: QueryDatasource | Datasource = list(merged.values())[0]
             if (
                 set([c.address for c in final.output_concepts])
                 == set([c.address for c in self.output_concepts])
                 and not self.conditions
+                and isinstance(final, QueryDatasource)
             ):
                 logger.info(
                     f"{self.logging_prefix}{LOGGER_PREFIX} Merge node has only one parent with the same"
@@ -255,6 +259,7 @@ class MergeNode(StrategyNode):
             if (
                 all([c.address in output_set for c in self.all_concepts])
                 and not self.conditions
+                and isinstance(dataset, QueryDatasource)
            ):
                 logger.info(
                     f"{self.logging_prefix}{LOGGER_PREFIX} Merge node not required as parent node {dataset.source_type}"
trilogy/core/query_processor.py
@@ -29,6 +29,7 @@ from trilogy.hooks.base_hook import BaseHook
 from trilogy.constants import logger
 from random import shuffle
 from trilogy.core.ergonomics import CTE_NAMES
+from trilogy.core.optimization import optimize_ctes
 from math import ceil
 
 LOGGER_PREFIX = "[QUERY BUILD]"
@@ -186,7 +187,7 @@ def datasource_to_ctes(
         source_map = {k: "" for k in query_datasource.source_map}
     else:
         source_map = {
-            k: "" if not v else source.full_name
+            k: "" if not v else source.identifier
             for k, v in query_datasource.source_map.items()
         }
     human_id = generate_cte_name(query_datasource.full_name, name_map)
@@ -315,7 +316,9 @@ def process_query(
         seen[cte.name] = seen[cte.name] + cte
     for cte in raw_ctes:
         cte.parent_ctes = [seen[x.name] for x in cte.parent_ctes]
-    final_ctes: List[CTE] = list(seen.values())
+    deduped_ctes: List[CTE] = list(seen.values())
+
+    final_ctes = optimize_ctes(deduped_ctes, root_cte, statement)
 
     return ProcessedQuery(
         order_by=statement.order_by,
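End to end, the new pass sits between CTE deduplication and ProcessedQuery assembly: registered rules run to a fixed point, the final select may be folded into the root CTE, and CTEs no longer reachable from the root are pruned. A self-contained analogue of that control flow, using toy types rather than trilogy's:

# Toy analogue of the optimize_ctes control flow: apply rules until none
# reports an action, then keep only CTEs reachable from the root.
# ToyCTE and inline_leaf_parents are hypothetical stand-ins.
from dataclasses import dataclass, field


@dataclass
class ToyCTE:
    name: str
    parents: list["ToyCTE"] = field(default_factory=list)


def inline_leaf_parents(cte: ToyCTE) -> bool:
    # stand-in for InlineDatasource: absorb parents that have no parents
    leaves = [p for p in cte.parents if not p.parents]
    for leaf in leaves:
        cte.parents.remove(leaf)
    return bool(leaves)


def optimize(ctes: list[ToyCTE], root: ToyCTE) -> list[ToyCTE]:
    changed = True
    while changed:
        # list comprehension (not a bare generator) so every CTE is visited
        changed = any([inline_leaf_parents(c) for c in ctes])
    reachable: set[str] = set()

    def visit(c: ToyCTE) -> None:
        reachable.add(c.name)
        for p in c.parents:
            visit(p)

    visit(root)
    return [c for c in ctes if c.name in reachable]


base, mid = ToyCTE("base"), ToyCTE("mid")
mid.parents, root = [base], ToyCTE("root", [mid])
print([c.name for c in optimize([base, mid, root], root)])  # ['root']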