pytrilogy 0.0.1.117__py3-none-any.whl → 0.0.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- {pytrilogy-0.0.1.117.dist-info → pytrilogy-0.0.2.1.dist-info}/METADATA +1 -1
- pytrilogy-0.0.2.1.dist-info/RECORD +82 -0
- {pytrilogy-0.0.1.117.dist-info → pytrilogy-0.0.2.1.dist-info}/WHEEL +1 -1
- trilogy/__init__.py +1 -1
- trilogy/constants.py +6 -0
- trilogy/core/enums.py +7 -2
- trilogy/core/env_processor.py +43 -19
- trilogy/core/functions.py +11 -0
- trilogy/core/models.py +737 -146
- trilogy/core/optimization.py +31 -28
- trilogy/core/optimizations/inline_constant.py +4 -1
- trilogy/core/optimizations/inline_datasource.py +25 -4
- trilogy/core/optimizations/predicate_pushdown.py +94 -54
- trilogy/core/processing/concept_strategies_v3.py +69 -39
- trilogy/core/processing/graph_utils.py +3 -3
- trilogy/core/processing/node_generators/__init__.py +0 -2
- trilogy/core/processing/node_generators/basic_node.py +30 -17
- trilogy/core/processing/node_generators/filter_node.py +3 -1
- trilogy/core/processing/node_generators/node_merge_node.py +345 -96
- trilogy/core/processing/node_generators/rowset_node.py +18 -16
- trilogy/core/processing/node_generators/select_node.py +44 -83
- trilogy/core/processing/nodes/__init__.py +2 -0
- trilogy/core/processing/nodes/base_node.py +22 -5
- trilogy/core/processing/nodes/filter_node.py +3 -0
- trilogy/core/processing/nodes/group_node.py +20 -2
- trilogy/core/processing/nodes/merge_node.py +32 -18
- trilogy/core/processing/nodes/select_node_v2.py +17 -3
- trilogy/core/processing/utility.py +100 -8
- trilogy/core/query_processor.py +77 -24
- trilogy/dialect/base.py +11 -46
- trilogy/dialect/bigquery.py +1 -1
- trilogy/dialect/common.py +11 -0
- trilogy/dialect/duckdb.py +1 -1
- trilogy/dialect/presto.py +1 -0
- trilogy/executor.py +29 -0
- trilogy/hooks/graph_hook.py +50 -5
- trilogy/hooks/query_debugger.py +1 -0
- trilogy/parsing/common.py +8 -5
- trilogy/parsing/parse_engine.py +48 -27
- trilogy/parsing/render.py +13 -6
- trilogy/parsing/trilogy.lark +12 -7
- pytrilogy-0.0.1.117.dist-info/RECORD +0 -83
- trilogy/core/processing/node_generators/concept_merge_node.py +0 -214
- {pytrilogy-0.0.1.117.dist-info → pytrilogy-0.0.2.1.dist-info}/LICENSE.md +0 -0
- {pytrilogy-0.0.1.117.dist-info → pytrilogy-0.0.2.1.dist-info}/entry_points.txt +0 -0
- {pytrilogy-0.0.1.117.dist-info → pytrilogy-0.0.2.1.dist-info}/top_level.txt +0 -0
trilogy/core/optimization.py
CHANGED
@@ -3,8 +3,6 @@ from trilogy.core.models import (
     SelectStatement,
     PersistStatement,
     MultiSelectStatement,
-    Conditional,
-    BooleanOperator,
 )
 from trilogy.core.enums import PurposeLineage
 from trilogy.constants import logger, CONFIG
@@ -60,8 +58,6 @@ def is_direct_return_eligible(
         if select.where_clause
         else set()
     )
-    if conditions and select.limit:
-        return False
     for x in derived_concepts:
         if x.derivation == PurposeLineage.WINDOW:
             return False
@@ -71,7 +67,7 @@ def is_direct_return_eligible(
         if x.address in conditions:
             return False
     logger.info(
-        f"Upleveling output select to final CTE with derived_concepts {[x.address for x in derived_concepts]}"
+        f"[Optimization][EarlyReturn] Upleveling output select to final CTE with derived_concepts {[x.address for x in derived_concepts]}"
     )
     return eligible
 
@@ -93,39 +89,46 @@ def sort_select_output(cte: CTE, query: SelectStatement | MultiSelectStatement):
 def optimize_ctes(
     input: list[CTE], root_cte: CTE, select: SelectStatement | MultiSelectStatement
 ) -> list[CTE]:
-
-    REGISTERED_RULES: list["OptimizationRule"] = []
+
     if CONFIG.optimizations.direct_return and is_direct_return_eligible(
         root_cte, select
     ):
         root_cte.order_by = select.order_by
         root_cte.limit = select.limit
-        if select.where_clause:
-            if root_cte.condition:
-                root_cte.condition = Conditional(
-                    left=root_cte.condition,
-                    operator=BooleanOperator.AND,
-                    right=select.where_clause.conditional,
-                )
-            else:
-                root_cte.condition = select.where_clause.conditional
-
+        # if select.where_clause:
+
+        #     if root_cte.condition:
+        #         root_cte.condition = Conditional(
+        #             left=root_cte.condition,
+        #             operator=BooleanOperator.AND,
+        #             right=select.where_clause.conditional,
+        #         )
+        #     else:
+        #         root_cte.condition = select.where_clause.conditional
         root_cte.requires_nesting = False
         sort_select_output(root_cte, select)
+
+    REGISTERED_RULES: list["OptimizationRule"] = []
+    if CONFIG.optimizations.constant_inlining:
+        REGISTERED_RULES.append(InlineConstant())
     if CONFIG.optimizations.datasource_inlining:
         REGISTERED_RULES.append(InlineDatasource())
     if CONFIG.optimizations.predicate_pushdown:
         REGISTERED_RULES.append(PredicatePushdown())
-    ...
+
+    for rule in REGISTERED_RULES:
+        loops = 0
+        complete = False
+        while not complete and (loops <= MAX_OPTIMIZATION_LOOPS):
+            actions_taken = False
+            # assume we go through all CTEs once
+            look_at = [root_cte, *input]
+            inverse_map = gen_inverse_map(look_at)
+            for cte in look_at:
+                opt = rule.optimize(cte, inverse_map)
+                actions_taken = actions_taken or opt
+            complete = not actions_taken
+            loops += 1
+        logger.info(f"finished checking for {type(rule).__name__} in {loops} loops")
 
     return filter_irrelevant_ctes(input, root_cte)
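The rewrite above moves rule registration after the early-return handling and wraps each rule in a fixed-point loop: a rule is re-applied across all CTEs until a full pass makes no changes or the loop cap is hit. A minimal sketch of that control flow (the `Rule` stub, `run_rules`, and `MAX_LOOPS` are illustrative stand-ins, not trilogy's actual classes):

```python
# Sketch of the per-rule fixed-point loop in the new optimize_ctes.
MAX_LOOPS = 100  # stand-in for MAX_OPTIMIZATION_LOOPS


class Rule:
    def optimize(self, cte, inverse_map) -> bool:
        """Return True if this pass changed the CTE."""
        return False


def run_rules(rules: list[Rule], root, others: list) -> None:
    for rule in rules:
        loops = 0
        complete = False
        while not complete and loops <= MAX_LOOPS:
            actions_taken = False
            look_at = [root, *others]
            inverse_map = {}  # child-CTE lookup, stubbed for the sketch
            for cte in look_at:
                # keep sweeping until a whole pass takes no action
                actions_taken = rule.optimize(cte, inverse_map) or actions_taken
            complete = not actions_taken
            loops += 1
```

Rebuilding the inverse map inside the loop matters: a rule such as datasource inlining can change parent/child relationships between passes.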
trilogy/core/optimizations/inline_constant.py
CHANGED

@@ -21,9 +21,12 @@ class InlineConstant(OptimizationRule):
         if to_inline:
             inlined = False
             for c in to_inline:
-                self.log(f"
+                self.log(f"Attempting to inline constant {c.address} on {cte.name}")
                 test = cte.inline_constant(c)
                 if test:
+                    self.log(f"Successfully inlined constant to {cte.name}")
                     inlined = True
+                else:
+                    self.log(f"Could not inline constant to {cte.name}")
             return inlined
         return False
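For context, constant inlining replaces references to a constant-valued concept with the literal itself, so the column no longer needs to be carried through parent CTEs. A generic sketch of the idea (the names here are illustrative, not trilogy's `inline_constant` API):

```python
# Toy version of constant inlining over a SELECT-style expression list.
constants = {"pi": "3.14159"}  # constant concept -> literal


def inline(select_exprs: list[str]) -> list[str]:
    # swap any reference to a known constant for its literal value
    return [constants.get(e, e) for e in select_exprs]


assert inline(["pi", "radius"]) == ["3.14159", "radius"]
```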
trilogy/core/optimizations/inline_datasource.py
CHANGED

@@ -4,15 +4,20 @@ from trilogy.core.models import (
 )
 
 from trilogy.core.optimizations.base_optimization import OptimizationRule
+from collections import defaultdict
 
 
 class InlineDatasource(OptimizationRule):
 
+    def __init__(self):
+        super().__init__()
+        self.candidates = defaultdict(lambda: set())
+        self.count = defaultdict(lambda: 0)
+
     def optimize(self, cte: CTE, inverse_map: dict[str, list[CTE]]) -> bool:
         if not cte.parent_ctes:
             return False
 
-        optimized = False
         self.log(
             f"Checking {cte.name} for consolidating inline tables with {len(cte.parent_ctes)} parents"
         )
@@ -25,6 +30,9 @@ class InlineDatasource(OptimizationRule):
             if parent_cte.parent_ctes:
                 self.log(f"parent {parent_cte.name} has parents")
                 continue
+            if parent_cte.condition:
+                self.log(f"parent {parent_cte.name} has condition, cannot be inlined")
+                continue
             raw_root = parent_cte.source.datasources[0]
             if not isinstance(raw_root, Datasource):
                 self.log(f"parent {parent_cte.name} is not datasource")
@@ -34,7 +42,8 @@ class InlineDatasource(OptimizationRule):
                 self.log(f"parent {parent_cte.name} datasource is not inlineable")
                 continue
             root_outputs = {x.address for x in root.output_concepts}
-            cte_outputs = {x.address for x in
+            cte_outputs = {x.address for x in cte.output_columns}
+            # cte_inherited_outputs = {x.address for x in parent_cte.output_columns if parent_cte.source_map.get(x.address)}
             grain_components = {x.address for x in root.grain.components}
             if not cte_outputs.issubset(root_outputs):
                 self.log(f"Not all {parent_cte.name} outputs are found on datasource")
@@ -44,11 +53,23 @@ class InlineDatasource(OptimizationRule):
                 force_group = True
             to_inline.append(parent_cte)
 
+        optimized = False
         for replaceable in to_inline:
-
+            if replaceable.name not in self.candidates[cte.name]:
+                self.candidates[cte.name].add(replaceable.name)
+                self.count[replaceable.source.name] += 1
+                return True
+            if self.count[replaceable.source.name] > 1:
+                self.log(
+                    f"Skipping inlining raw datasource {replaceable.source.name} ({replaceable.name}) due to multiple references"
+                )
+                continue
             result = cte.inline_parent_datasource(replaceable, force_group=force_group)
             if result:
-                self.log(
+                self.log(
+                    f"Inlined parent {replaceable.name} with {replaceable.source.name}"
+                )
+                optimized = True
             else:
                 self.log(f"Failed to inline {replaceable.name}")
         return optimized
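The new `candidates`/`count` state makes inlining a two-pass decision: the first time a CTE/parent pair is seen, the pass only records the reference (returning True so the optimization loop runs again); on later passes, a raw datasource referenced by more than one CTE is skipped, since inlining it into every consumer would duplicate the scan. A rough sketch of that guard with simplified stand-in types (not the real trilogy signatures):

```python
from collections import defaultdict

# parents already seen per CTE, and reference counts per raw datasource
candidates: dict[str, set[str]] = defaultdict(set)
count: dict[str, int] = defaultdict(int)


def check_inline(cte: str, parent: str, source: str) -> str:
    if parent not in candidates[cte]:
        # first pass: just register the reference and revisit next loop
        candidates[cte].add(parent)
        count[source] += 1
        return "defer"
    if count[source] > 1:
        return "skip"  # shared datasource: inlining would duplicate it
    return "inline"


assert check_inline("cte_a", "p1", "orders") == "defer"
assert check_inline("cte_b", "p1", "orders") == "defer"
assert check_inline("cte_a", "p1", "orders") == "skip"  # orders is shared
```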
trilogy/core/optimizations/predicate_pushdown.py
CHANGED

@@ -21,85 +21,125 @@ def decompose_condition(conditional: Conditional):
 
 
 def is_child_of(a, comparison):
+    base = comparison == a
+    if base:
+        return True
     if isinstance(comparison, Conditional):
         return (
             is_child_of(a, comparison.left) or is_child_of(a, comparison.right)
         ) and comparison.operator == BooleanOperator.AND
-    return
+    return base
 
 
 class PredicatePushdown(OptimizationRule):
 
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.complete: dict[str, bool] = {}
+
+    def _check_parent(
+        self,
+        parent_cte: CTE,
+        candidate: Conditional,
+        inverse_map: dict[str, list[CTE]],
+    ):
+        conditions = {x.address for x in candidate.concept_arguments}
+        if is_child_of(candidate, parent_cte.condition):
+            return False
+
+        materialized = {k for k, v in parent_cte.source_map.items() if v != []}
+        if not conditions or not materialized:
+            return False
+        # if it's a root datasource, we can filter on _any_ of the output concepts
+        if parent_cte.is_root_datasource:
+            extra_check = {
+                x.address for x in parent_cte.source.datasources[0].output_concepts
+            }
+            if conditions.issubset(extra_check):
+                for x in conditions:
+                    if x not in materialized:
+                        materialized.add(x)
+                        parent_cte.source_map[x] = [
+                            parent_cte.source.datasources[0].name
+                        ]
+        if conditions.issubset(materialized):
+            children = inverse_map.get(parent_cte.name, [])
+            if all([is_child_of(candidate, child.condition) for child in children]):
+                self.log(
+                    f"All concepts are found on {parent_cte.name} with existing {parent_cte.condition} and all it's {len(children)} children include same filter; pushing up {candidate}"
+                )
+                if parent_cte.condition:
+                    parent_cte.condition = Conditional(
+                        left=parent_cte.condition,
+                        operator=BooleanOperator.AND,
+                        right=candidate,
+                    )
+                else:
+                    parent_cte.condition = candidate
+                return True
+        self.debug(
+            f"conditions {conditions} not subset of parent {parent_cte.name} parent has {materialized} "
+        )
+        return False
+
     def optimize(self, cte: CTE, inverse_map: dict[str, list[CTE]]) -> bool:
+        optimized = False
 
         if not cte.parent_ctes:
             self.debug(f"No parent CTEs for {cte.name}")
 
             return False
 
-        optimized = False
         if not cte.condition:
             self.debug(f"No CTE condition for {cte.name}")
             return False
-        self.debug(
+        if all(
+            [
+                is_child_of(cte.condition, parent_cte.condition)
+                for parent_cte in cte.parent_ctes
+            ]
+        ) and not any([isinstance(x, Datasource) for x in cte.source.datasources]):
+            self.log(
+                f"All parents of {cte.name} have same filter, removing filter from {cte.name}"
+            )
+            cte.condition = None
+            return True
+        else:
+            mapping = {
+                parent.name: is_child_of(cte.condition, parent.condition)
+                for parent in cte.parent_ctes
+            }
+            self.log(
+                f"Could not remove filter from {cte.name}, as not all parents have the same filter: {mapping}"
+            )
+        if self.complete.get(cte.name):
+            self.debug("Have done this CTE before")
+            return False
+
+        self.debug(
             f"Checking {cte.name} for predicate pushdown with {len(cte.parent_ctes)} parents"
         )
         if isinstance(cte.condition, Conditional):
            candidates = cte.condition.decompose()
         else:
             candidates = [cte.condition]
-        self.
+        self.debug(
+            f"Have {len(candidates)} candidates to try to push down from parent {type(cte.condition)}"
+        )
+        optimized = False
         for candidate in candidates:
-
+            self.debug(f"Checking candidate {candidate}")
             for parent_cte in cte.parent_ctes:
-                ...
-                for x in conditions:
-                    if x not in materialized:
-                        materialized.add(x)
-                        parent_cte.source_map[x] = [
-                            parent_cte.source.datasources[0].name
-                        ]
-                if conditions.issubset(materialized):
-                    if all(
-                        [
-                            is_child_of(candidate, child.condition)
-                            for child in inverse_map.get(parent_cte.name, [])
-                        ]
-                    ):
-                        self.log(
-                            f"All concepts are found on {parent_cte.name} and all it's children include same filter; pushing up filter"
-                        )
-                        if parent_cte.condition:
-                            parent_cte.condition = Conditional(
-                                left=parent_cte.condition,
-                                operator=BooleanOperator.AND,
-                                right=candidate,
-                            )
-                        else:
-                            parent_cte.condition = candidate
-                        optimized = True
-                    else:
-                        self.log(
-                            f"conditions {conditions} not subset of parent {parent_cte.name} parent has {materialized} "
-                        )
-
-        if all(
-            [
-                is_child_of(cte.condition, parent_cte.condition)
-                for parent_cte in cte.parent_ctes
-            ]
-        ) and not any([isinstance(x, Datasource) for x in cte.source.datasources]):
-            self.log("All parents have same filter, removing filter")
-            cte.condition = None
-            optimized = True
+                local_pushdown = self._check_parent(
+                    parent_cte=parent_cte, candidate=candidate, inverse_map=inverse_map
+                )
+                optimized = optimized or local_pushdown
+                if local_pushdown:
+                    # taint a CTE again when something is pushed up to it.
+                    self.complete[parent_cte.name] = False
+                    self.debug(
+                        f"Pushed down {candidate} from {cte.name} to {parent_cte.name}"
+                    )
 
+        self.complete[cte.name] = True
         return optimized
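The reworked `is_child_of` is the safety check behind both directions of this rule: a predicate may be pushed through an `AND` (every surviving row must already satisfy each branch) but never through an `OR`, and a condition now counts as a child of itself, fixing the bare `return` that previously yielded `None` for non-`Conditional` comparisons. A self-contained sketch of the containment test over a toy conditional tree (`Cond` is illustrative, not trilogy's `Conditional`):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Cond:
    left: object
    operator: str  # "and" | "or"
    right: object


def is_child_of(a, comparison) -> bool:
    # a condition trivially contains itself
    if comparison == a:
        return True
    # only AND nodes guarantee every row already satisfies each branch
    if isinstance(comparison, Cond):
        return (
            is_child_of(a, comparison.left) or is_child_of(a, comparison.right)
        ) and comparison.operator == "and"
    return False


combined_and = Cond("x > 1", "and", "y = 2")
combined_or = Cond("x > 1", "or", "y = 2")
assert is_child_of("y = 2", combined_and)     # safe to push down
assert not is_child_of("y = 2", combined_or)  # OR branch: not implied
```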
trilogy/core/processing/concept_strategies_v3.py
CHANGED

@@ -27,7 +27,6 @@ from trilogy.core.processing.node_generators import (
     gen_group_to_node,
     gen_rowset_node,
     gen_multiselect_node,
-    gen_concept_merge_node,
 )
 
 from enum import Enum
@@ -68,7 +67,6 @@ def get_priority_concept(
     # sometimes we need to scan intermediate concepts to get merge keys, so fall back
     # to exhaustive search
     pass_two = [c for c in all_concepts if c.address not in attempted_addresses]
-
     for remaining_concept in (pass_one, pass_two):
         priority = (
             # find anything that needs no joins first, so we can exit early
@@ -79,9 +77,6 @@ def get_priority_concept(
                 and c.granularity == Granularity.SINGLE_ROW
             ]
             +
-            # anything that requires merging concept universes
-            [c for c in remaining_concept if c.derivation == PurposeLineage.MERGE]
-            +
             # then multiselects to remove them from scope
             [c for c in remaining_concept if c.derivation == PurposeLineage.MULTISELECT]
             +
@@ -174,7 +169,9 @@ def get_priority_concept(
 
 
 def generate_candidates_restrictive(
-    priority_concept: Concept,
+    priority_concept: Concept,
+    candidates: list[Concept],
+    exhausted: set[str],
 ) -> List[List[Concept]]:
     # if it's single row, joins are irrelevant. Fetch without keys.
     if priority_concept.granularity == Granularity.SINGLE_ROW:
@@ -216,6 +213,7 @@ def generate_node(
         fail_if_not_found=False,
         accept_partial=accept_partial,
         accept_partial_optional=False,
+        source_concepts=source_concepts,
     )
 
     if candidate:
@@ -273,14 +271,6 @@ def generate_node(
         return gen_multiselect_node(
             concept, local_optional, environment, g, depth + 1, source_concepts, history
         )
-    elif concept.derivation == PurposeLineage.MERGE:
-        logger.info(
-            f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating multiselect node with optional {[x.address for x in local_optional]}"
-        )
-        node = gen_concept_merge_node(
-            concept, local_optional, environment, g, depth + 1, source_concepts, history
-        )
-        return node
     elif concept.derivation == PurposeLineage.CONSTANT:
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} for {concept.address}, generating constant node"
@@ -331,17 +321,60 @@ def generate_node(
         fail_if_not_found=False,
         accept_partial=accept_partial,
         accept_partial_optional=True,
+        source_concepts=source_concepts,
     )
     else:
         raise ValueError(f"Unknown derivation {concept.derivation}")
 
 
+def validate_concept(
+    concept: Concept,
+    node: StrategyNode,
+    found_addresses: set[str],
+    non_partial_addresses: set[str],
+    partial_addresses: set[str],
+    virtual_addresses: set[str],
+    found_map: dict[str, set[Concept]],
+    accept_partial: bool,
+):
+    found_map[str(node)].add(concept)
+    if concept not in node.partial_concepts:
+
+        found_addresses.add(concept.address)
+        non_partial_addresses.add(concept.address)
+        # remove it from our partial tracking
+        if concept.address in partial_addresses:
+            partial_addresses.remove(concept.address)
+        if concept.address in virtual_addresses:
+            virtual_addresses.remove(concept.address)
+    if concept in node.partial_concepts:
+        if concept.address in non_partial_addresses:
+            return None
+        partial_addresses.add(concept.address)
+        if accept_partial:
+            found_addresses.add(concept.address)
+            found_map[str(node)].add(concept)
+    for _, v in concept.pseudonyms.items():
+        if v.address == concept.address:
+            return
+        validate_concept(
+            v,
+            node,
+            found_addresses,
+            non_partial_addresses,
+            partial_addresses,
+            virtual_addresses,
+            found_map,
+            accept_partial,
+        )
+
+
 def validate_stack(
     stack: List[StrategyNode],
     concepts: List[Concept],
     accept_partial: bool = False,
 ) -> tuple[ValidationResult, set[str], set[str], set[str], set[str]]:
-    found_map = defaultdict(set)
+    found_map: dict[str, set[Concept]] = defaultdict(set)
     found_addresses: set[str] = set()
     non_partial_addresses: set[str] = set()
     partial_addresses: set[str] = set()
@@ -349,27 +382,22 @@ def validate_stack(
     for node in stack:
         resolved = node.resolve()
         for concept in resolved.output_concepts:
-            ...
-            if concept.address in non_partial_addresses:
-                continue
-            partial_addresses.add(concept.address)
-            if accept_partial:
-                found_addresses.add(concept.address)
-                found_map[str(node)].add(concept)
+            validate_concept(
+                concept,
+                node,
+                found_addresses,
+                non_partial_addresses,
+                partial_addresses,
+                virtual_addresses,
+                found_map,
+                accept_partial,
+            )
         for concept in node.virtual_output_concepts:
             if concept.address in non_partial_addresses:
                 continue
             found_addresses.add(concept.address)
             virtual_addresses.add(concept.address)
+
     # zip in those we know we found
     if not all([c.address in found_addresses for c in concepts]):
         return (
@@ -379,7 +407,8 @@ def validate_stack(
             partial_addresses,
             virtual_addresses,
         )
-    graph_count, _ = get_disconnected_components(found_map)
+
+    graph_count, _ = get_disconnected_components(found_map)
     if graph_count in (0, 1):
         return (
             ValidationResult.COMPLETE,
@@ -415,7 +444,7 @@ def search_concepts(
     hist = history.get_history(mandatory_list, accept_partial)
     if hist is not False:
         logger.info(
-            f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Returning search node from history for {[c.address for c in mandatory_list]} with accept_partial {accept_partial}"
+            f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Returning search node from history ({'exists' if hist is not None else 'does not exist'}) for {[c.address for c in mandatory_list]} with accept_partial {accept_partial}"
        )
         assert not isinstance(hist, bool)
         return hist
@@ -445,6 +474,7 @@ def _search_concepts(
 ) -> StrategyNode | None:
 
     mandatory_list = unique(mandatory_list, "address")
+
     all_mandatory = set(c.address for c in mandatory_list)
     attempted: set[str] = set()
 
@@ -457,6 +487,7 @@ def _search_concepts(
         priority_concept = get_priority_concept(
             mandatory_list, attempted, found_concepts=found, depth=depth
         )
+
         logger.info(
             f"{depth_to_prefix(depth)}{LOGGER_PREFIX} priority concept is {str(priority_concept)}"
         )
@@ -467,16 +498,16 @@ def _search_concepts(
         candidate_lists = generate_candidates_restrictive(
             priority_concept, candidates, skip
         )
-        for
+        for clist in candidate_lists:
             logger.info(
-                f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Beginning sourcing loop for {str(priority_concept)}, accept_partial {accept_partial} optional {[str(v) for v in
+                f"{depth_to_prefix(depth)}{LOGGER_PREFIX} Beginning sourcing loop for {str(priority_concept)}, accept_partial {accept_partial}, optional {[str(v) for v in clist]}, exhausted {[str(c) for c in skip]}"
             )
             node = generate_node(
                 priority_concept,
-
+                clist,
                 environment,
                 g,
-                depth
+                depth,
                 source_concepts=search_concepts,
                 accept_partial=accept_partial,
                 history=history,
@@ -494,7 +525,6 @@ def _search_concepts(
                 PurposeLineage.ROWSET,
                 PurposeLineage.BASIC,
                 PurposeLineage.MULTISELECT,
-                PurposeLineage.MERGE,
             ]:
                 skip.add(priority_concept.address)
                 break
@@ -504,7 +534,7 @@ def _search_concepts(
     )
 
     logger.info(
-        f"{depth_to_prefix(depth)}{LOGGER_PREFIX} finished concept loop for {priority_concept} flag for accepting partial addresses is
+        f"{depth_to_prefix(depth)}{LOGGER_PREFIX} finished concept loop for {priority_concept} flag for accepting partial addresses is"
         f" {accept_partial} (complete: {complete}), have {found} from {[n for n in stack]} (missing {missing} partial {partial} virtual {virtual}), attempted {attempted}"
     )
     # early exit if we have a complete stack with one node
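The extracted `validate_concept` helper re-runs the same partial/virtual bookkeeping for every pseudonym of a concept, so a required concept counts as found under any of its aliases; the `v.address == concept.address` check stops direct self-references from recursing forever. A toy sketch of that alias-walking pattern (the `Node` type and fields are illustrative stand-ins):

```python
from dataclasses import dataclass, field


# Illustrative stand-in for a concept with pseudonyms (aliases).
@dataclass
class Node:
    address: str
    pseudonyms: dict[str, "Node"] = field(default_factory=dict)


def collect_addresses(concept: Node, found: set[str]) -> None:
    # record the concept itself...
    found.add(concept.address)
    # ...then recurse into each alias, skipping self-references
    for alias in concept.pseudonyms.values():
        if alias.address == concept.address:
            continue
        collect_addresses(alias, found)


canonical = Node("orders.id")
canonical.pseudonyms["alt"] = Node("order_items.order_id")
found: set[str] = set()
collect_addresses(canonical, found)
assert found == {"orders.id", "order_items.order_id"}
```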
trilogy/core/processing/graph_utils.py
CHANGED

@@ -29,10 +29,10 @@ def extract_required_subgraphs(
 def extract_mandatory_subgraphs(paths: Dict[str, List[str]], g) -> List[List[Concept]]:
     final: list[list[str]] = []
     assocs: defaultdict[str, list] = defaultdict(list)
-    for path in paths:
-        extract_required_subgraphs(assocs,
+    for path in paths.values():
+        extract_required_subgraphs(assocs, path)
 
-    for
+    for _, v in assocs.items():
         final.append(v)
     final_concepts = []
     for value in final:
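The `graph_utils` change fixes a classic dictionary-iteration bug: looping over a `Dict[str, List[str]]` directly yields the string keys, so `extract_required_subgraphs` had been receiving key names rather than the path lists. A two-line illustration:

```python
paths = {"a->b": ["a", "join", "b"]}

for path in paths:
    assert path == "a->b"  # old code: iterates keys (strings)

for path in paths.values():
    assert path == ["a", "join", "b"]  # fixed: iterates the actual paths
```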
trilogy/core/processing/node_generators/__init__.py
CHANGED

@@ -8,7 +8,6 @@ from .unnest_node import gen_unnest_node
 from .node_merge_node import gen_merge_node
 from .rowset_node import gen_rowset_node
 from .multiselect_node import gen_multiselect_node
-from .concept_merge_node import gen_concept_merge_node
 
 __all__ = [
     "gen_filter_node",
@@ -21,5 +20,4 @@ __all__ = [
     "gen_group_to_node",
     "gen_rowset_node",
     "gen_multiselect_node",
-    "gen_concept_merge_node",
 ]