pytrilogy 0.3.148__cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE.md +19 -0
- _preql_import_resolver/__init__.py +5 -0
- _preql_import_resolver/_preql_import_resolver.cpython-312-aarch64-linux-gnu.so +0 -0
- pytrilogy-0.3.148.dist-info/METADATA +555 -0
- pytrilogy-0.3.148.dist-info/RECORD +206 -0
- pytrilogy-0.3.148.dist-info/WHEEL +5 -0
- pytrilogy-0.3.148.dist-info/entry_points.txt +2 -0
- pytrilogy-0.3.148.dist-info/licenses/LICENSE.md +19 -0
- trilogy/__init__.py +27 -0
- trilogy/ai/README.md +10 -0
- trilogy/ai/__init__.py +19 -0
- trilogy/ai/constants.py +92 -0
- trilogy/ai/conversation.py +107 -0
- trilogy/ai/enums.py +7 -0
- trilogy/ai/execute.py +50 -0
- trilogy/ai/models.py +34 -0
- trilogy/ai/prompts.py +100 -0
- trilogy/ai/providers/__init__.py +0 -0
- trilogy/ai/providers/anthropic.py +106 -0
- trilogy/ai/providers/base.py +24 -0
- trilogy/ai/providers/google.py +146 -0
- trilogy/ai/providers/openai.py +89 -0
- trilogy/ai/providers/utils.py +68 -0
- trilogy/authoring/README.md +3 -0
- trilogy/authoring/__init__.py +148 -0
- trilogy/constants.py +119 -0
- trilogy/core/README.md +52 -0
- trilogy/core/__init__.py +0 -0
- trilogy/core/constants.py +6 -0
- trilogy/core/enums.py +454 -0
- trilogy/core/env_processor.py +239 -0
- trilogy/core/environment_helpers.py +320 -0
- trilogy/core/ergonomics.py +193 -0
- trilogy/core/exceptions.py +123 -0
- trilogy/core/functions.py +1240 -0
- trilogy/core/graph_models.py +142 -0
- trilogy/core/internal.py +85 -0
- trilogy/core/models/__init__.py +0 -0
- trilogy/core/models/author.py +2662 -0
- trilogy/core/models/build.py +2603 -0
- trilogy/core/models/build_environment.py +165 -0
- trilogy/core/models/core.py +506 -0
- trilogy/core/models/datasource.py +434 -0
- trilogy/core/models/environment.py +756 -0
- trilogy/core/models/execute.py +1213 -0
- trilogy/core/optimization.py +251 -0
- trilogy/core/optimizations/__init__.py +12 -0
- trilogy/core/optimizations/base_optimization.py +17 -0
- trilogy/core/optimizations/hide_unused_concept.py +47 -0
- trilogy/core/optimizations/inline_datasource.py +102 -0
- trilogy/core/optimizations/predicate_pushdown.py +245 -0
- trilogy/core/processing/README.md +94 -0
- trilogy/core/processing/READMEv2.md +121 -0
- trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
- trilogy/core/processing/__init__.py +0 -0
- trilogy/core/processing/concept_strategies_v3.py +508 -0
- trilogy/core/processing/constants.py +15 -0
- trilogy/core/processing/discovery_node_factory.py +451 -0
- trilogy/core/processing/discovery_utility.py +548 -0
- trilogy/core/processing/discovery_validation.py +167 -0
- trilogy/core/processing/graph_utils.py +43 -0
- trilogy/core/processing/node_generators/README.md +9 -0
- trilogy/core/processing/node_generators/__init__.py +31 -0
- trilogy/core/processing/node_generators/basic_node.py +160 -0
- trilogy/core/processing/node_generators/common.py +270 -0
- trilogy/core/processing/node_generators/constant_node.py +38 -0
- trilogy/core/processing/node_generators/filter_node.py +315 -0
- trilogy/core/processing/node_generators/group_node.py +213 -0
- trilogy/core/processing/node_generators/group_to_node.py +117 -0
- trilogy/core/processing/node_generators/multiselect_node.py +207 -0
- trilogy/core/processing/node_generators/node_merge_node.py +695 -0
- trilogy/core/processing/node_generators/recursive_node.py +88 -0
- trilogy/core/processing/node_generators/rowset_node.py +165 -0
- trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
- trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
- trilogy/core/processing/node_generators/select_merge_node.py +786 -0
- trilogy/core/processing/node_generators/select_node.py +95 -0
- trilogy/core/processing/node_generators/synonym_node.py +98 -0
- trilogy/core/processing/node_generators/union_node.py +91 -0
- trilogy/core/processing/node_generators/unnest_node.py +182 -0
- trilogy/core/processing/node_generators/window_node.py +201 -0
- trilogy/core/processing/nodes/README.md +28 -0
- trilogy/core/processing/nodes/__init__.py +179 -0
- trilogy/core/processing/nodes/base_node.py +522 -0
- trilogy/core/processing/nodes/filter_node.py +75 -0
- trilogy/core/processing/nodes/group_node.py +194 -0
- trilogy/core/processing/nodes/merge_node.py +420 -0
- trilogy/core/processing/nodes/recursive_node.py +46 -0
- trilogy/core/processing/nodes/select_node_v2.py +242 -0
- trilogy/core/processing/nodes/union_node.py +53 -0
- trilogy/core/processing/nodes/unnest_node.py +62 -0
- trilogy/core/processing/nodes/window_node.py +56 -0
- trilogy/core/processing/utility.py +823 -0
- trilogy/core/query_processor.py +604 -0
- trilogy/core/statements/README.md +35 -0
- trilogy/core/statements/__init__.py +0 -0
- trilogy/core/statements/author.py +536 -0
- trilogy/core/statements/build.py +0 -0
- trilogy/core/statements/common.py +20 -0
- trilogy/core/statements/execute.py +155 -0
- trilogy/core/table_processor.py +66 -0
- trilogy/core/utility.py +8 -0
- trilogy/core/validation/README.md +46 -0
- trilogy/core/validation/__init__.py +0 -0
- trilogy/core/validation/common.py +161 -0
- trilogy/core/validation/concept.py +146 -0
- trilogy/core/validation/datasource.py +227 -0
- trilogy/core/validation/environment.py +73 -0
- trilogy/core/validation/fix.py +256 -0
- trilogy/dialect/__init__.py +32 -0
- trilogy/dialect/base.py +1431 -0
- trilogy/dialect/bigquery.py +314 -0
- trilogy/dialect/common.py +147 -0
- trilogy/dialect/config.py +159 -0
- trilogy/dialect/dataframe.py +50 -0
- trilogy/dialect/duckdb.py +376 -0
- trilogy/dialect/enums.py +149 -0
- trilogy/dialect/metadata.py +173 -0
- trilogy/dialect/mock.py +190 -0
- trilogy/dialect/postgres.py +117 -0
- trilogy/dialect/presto.py +110 -0
- trilogy/dialect/results.py +89 -0
- trilogy/dialect/snowflake.py +129 -0
- trilogy/dialect/sql_server.py +137 -0
- trilogy/engine.py +48 -0
- trilogy/execution/__init__.py +17 -0
- trilogy/execution/config.py +119 -0
- trilogy/execution/state/__init__.py +0 -0
- trilogy/execution/state/file_state_store.py +0 -0
- trilogy/execution/state/sqllite_state_store.py +0 -0
- trilogy/execution/state/state_store.py +301 -0
- trilogy/executor.py +656 -0
- trilogy/hooks/__init__.py +4 -0
- trilogy/hooks/base_hook.py +40 -0
- trilogy/hooks/graph_hook.py +135 -0
- trilogy/hooks/query_debugger.py +166 -0
- trilogy/metadata/__init__.py +0 -0
- trilogy/parser.py +10 -0
- trilogy/parsing/README.md +21 -0
- trilogy/parsing/__init__.py +0 -0
- trilogy/parsing/common.py +1069 -0
- trilogy/parsing/config.py +5 -0
- trilogy/parsing/exceptions.py +8 -0
- trilogy/parsing/helpers.py +1 -0
- trilogy/parsing/parse_engine.py +2863 -0
- trilogy/parsing/render.py +773 -0
- trilogy/parsing/trilogy.lark +544 -0
- trilogy/py.typed +0 -0
- trilogy/render.py +45 -0
- trilogy/scripts/README.md +9 -0
- trilogy/scripts/__init__.py +0 -0
- trilogy/scripts/agent.py +41 -0
- trilogy/scripts/agent_info.py +306 -0
- trilogy/scripts/common.py +430 -0
- trilogy/scripts/dependency/Cargo.lock +617 -0
- trilogy/scripts/dependency/Cargo.toml +39 -0
- trilogy/scripts/dependency/README.md +131 -0
- trilogy/scripts/dependency/build.sh +25 -0
- trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
- trilogy/scripts/dependency/src/lib.rs +16 -0
- trilogy/scripts/dependency/src/main.rs +770 -0
- trilogy/scripts/dependency/src/parser.rs +435 -0
- trilogy/scripts/dependency/src/preql.pest +208 -0
- trilogy/scripts/dependency/src/python_bindings.rs +311 -0
- trilogy/scripts/dependency/src/resolver.rs +716 -0
- trilogy/scripts/dependency/tests/base.preql +3 -0
- trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
- trilogy/scripts/dependency/tests/customer.preql +6 -0
- trilogy/scripts/dependency/tests/main.preql +9 -0
- trilogy/scripts/dependency/tests/orders.preql +7 -0
- trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
- trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
- trilogy/scripts/dependency.py +323 -0
- trilogy/scripts/display.py +555 -0
- trilogy/scripts/environment.py +59 -0
- trilogy/scripts/fmt.py +32 -0
- trilogy/scripts/ingest.py +472 -0
- trilogy/scripts/ingest_helpers/__init__.py +1 -0
- trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
- trilogy/scripts/ingest_helpers/formatting.py +93 -0
- trilogy/scripts/ingest_helpers/typing.py +161 -0
- trilogy/scripts/init.py +105 -0
- trilogy/scripts/parallel_execution.py +748 -0
- trilogy/scripts/plan.py +189 -0
- trilogy/scripts/refresh.py +106 -0
- trilogy/scripts/run.py +79 -0
- trilogy/scripts/serve.py +202 -0
- trilogy/scripts/serve_helpers/__init__.py +41 -0
- trilogy/scripts/serve_helpers/file_discovery.py +142 -0
- trilogy/scripts/serve_helpers/index_generation.py +206 -0
- trilogy/scripts/serve_helpers/models.py +38 -0
- trilogy/scripts/single_execution.py +131 -0
- trilogy/scripts/testing.py +129 -0
- trilogy/scripts/trilogy.py +75 -0
- trilogy/std/__init__.py +0 -0
- trilogy/std/color.preql +3 -0
- trilogy/std/date.preql +13 -0
- trilogy/std/display.preql +18 -0
- trilogy/std/geography.preql +22 -0
- trilogy/std/metric.preql +15 -0
- trilogy/std/money.preql +67 -0
- trilogy/std/net.preql +14 -0
- trilogy/std/ranking.preql +7 -0
- trilogy/std/report.preql +5 -0
- trilogy/std/semantic.preql +6 -0
- trilogy/utility.py +34 -0
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from math import ceil
|
|
3
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from trilogy.constants import CONFIG, logger
|
|
6
|
+
from trilogy.core.constants import CONSTANT_DATASET
|
|
7
|
+
from trilogy.core.enums import BooleanOperator, DatasourceState, SourceType
|
|
8
|
+
from trilogy.core.env_processor import generate_graph
|
|
9
|
+
from trilogy.core.ergonomics import generate_cte_names
|
|
10
|
+
from trilogy.core.models.author import MultiSelectLineage, SelectLineage
|
|
11
|
+
from trilogy.core.models.build import (
|
|
12
|
+
BuildConcept,
|
|
13
|
+
BuildConditional,
|
|
14
|
+
BuildDatasource,
|
|
15
|
+
BuildFunction,
|
|
16
|
+
BuildMultiSelectLineage,
|
|
17
|
+
BuildParamaterizedConceptReference,
|
|
18
|
+
BuildSelectLineage,
|
|
19
|
+
Factory,
|
|
20
|
+
)
|
|
21
|
+
from trilogy.core.models.core import DataType
|
|
22
|
+
from trilogy.core.models.datasource import Address, Datasource
|
|
23
|
+
from trilogy.core.models.environment import Environment
|
|
24
|
+
from trilogy.core.models.execute import (
|
|
25
|
+
CTE,
|
|
26
|
+
BaseJoin,
|
|
27
|
+
CTEConceptPair,
|
|
28
|
+
InstantiatedUnnestJoin,
|
|
29
|
+
Join,
|
|
30
|
+
QueryDatasource,
|
|
31
|
+
RecursiveCTE,
|
|
32
|
+
UnionCTE,
|
|
33
|
+
UnnestJoin,
|
|
34
|
+
)
|
|
35
|
+
from trilogy.core.optimization import optimize_ctes
|
|
36
|
+
from trilogy.core.processing.concept_strategies_v3 import source_query_concepts
|
|
37
|
+
from trilogy.core.processing.nodes import History, SelectNode, StrategyNode
|
|
38
|
+
from trilogy.core.statements.author import (
|
|
39
|
+
ConceptDeclarationStatement,
|
|
40
|
+
CopyStatement,
|
|
41
|
+
MultiSelectStatement,
|
|
42
|
+
PersistStatement,
|
|
43
|
+
SelectStatement,
|
|
44
|
+
)
|
|
45
|
+
from trilogy.core.statements.execute import (
|
|
46
|
+
MaterializedDataset,
|
|
47
|
+
ProcessedCopyStatement,
|
|
48
|
+
ProcessedQuery,
|
|
49
|
+
ProcessedQueryPersist,
|
|
50
|
+
)
|
|
51
|
+
from trilogy.hooks.base_hook import BaseHook
|
|
52
|
+
from trilogy.utility import unique
|
|
53
|
+
|
|
54
|
+
LOGGER_PREFIX = "[QUERY BUILD]"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def base_join_to_join(
    base_join: BaseJoin | UnnestJoin, ctes: List[CTE | UnionCTE]
) -> Join | InstantiatedUnnestJoin:
    """This function converts joins at the datasource level
    to joins at the CTE level.

    UnnestJoins become InstantiatedUnnestJoins directly; BaseJoins are
    re-pointed from datasources to the CTEs (from `ctes`) that wrap
    those datasources.
    """
    if isinstance(base_join, UnnestJoin):
        # the unnest target is the first argument of the unnest function call
        object_to_unnest = base_join.parent.arguments[0]
        if not isinstance(
            object_to_unnest,
            (BuildConcept | BuildParamaterizedConceptReference | BuildFunction),
        ):
            raise ValueError(f"Unnest join must be a concept; got {object_to_unnest}")
        return InstantiatedUnnestJoin(
            object_to_unnest=object_to_unnest,
            alias=base_join.alias,
        )

    def get_datasource_cte(
        datasource: BuildDatasource | QueryDatasource,
    ) -> CTE | UnionCTE:
        # Resolve the CTE that materializes `datasource`.
        # First pass: match on the CTE source's own identifier.
        # Second pass: fall back to the identifier of the CTE source's
        # first child datasource.
        # `eligible` accumulates what was seen, for the error message only.
        eligible = set()
        for cte in ctes:
            if cte.source.identifier == datasource.identifier:
                return cte
            eligible.add(cte.source.identifier)
        for cte in ctes:
            if cte.source.datasources[0].identifier == datasource.identifier:
                return cte
            eligible.add(cte.source.datasources[0].identifier)
        raise ValueError(
            f"Could not find CTE for datasource {datasource.identifier}; have {eligible}"
        )

    if base_join.left_datasource is not None:
        left_cte = get_datasource_cte(base_join.left_datasource)
    else:
        # multiple left ctes
        left_cte = None
    right_cte = get_datasource_cte(base_join.right_datasource)
    if base_join.concept_pairs:
        # explicit left/right concept pairings; each pair carries the CTE
        # that owns its existing datasource
        final_pairs = [
            CTEConceptPair(
                left=pair.left,
                right=pair.right,
                existing_datasource=pair.existing_datasource,
                modifiers=pair.modifiers,
                cte=get_datasource_cte(pair.existing_datasource),
            )
            for pair in base_join.concept_pairs
        ]
    elif base_join.concepts and base_join.left_datasource:
        # same-concept equi-join: pair each concept with itself on both sides
        final_pairs = [
            CTEConceptPair(
                left=concept,
                right=concept,
                existing_datasource=base_join.left_datasource,
                modifiers=[],
                cte=get_datasource_cte(
                    base_join.left_datasource,
                ),
            )
            for concept in base_join.concepts
        ]
    else:
        # no join keys (e.g. cross join)
        final_pairs = []
    return Join(
        left_cte=left_cte,
        right_cte=right_cte,
        jointype=base_join.join_type,
        joinkey_pairs=final_pairs,
        modifiers=base_join.modifiers,
    )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def generate_source_map(
    query_datasource: QueryDatasource, all_new_ctes: List[CTE | UnionCTE]
) -> Tuple[Dict[str, list[str]], Dict[str, list[str]]]:
    """Translate the query datasource's concept->source mapping into
    concept->CTE-identifier maps.

    Returns a tuple of (source_map, existence_source_map):
    - source_map: concept address -> list of CTE/datasource identifiers
      that can supply it for row resolution (deduplicated).
    - existence_source_map: same shape, but for existence-only lookups,
      which cannot be referenced in row resolution.
    """
    source_map: Dict[str, list[str]] = defaultdict(list)
    # now populate anything derived in this level
    for qdk, qdv in query_datasource.source_map.items():
        # concepts sourced from an unnest have no upstream identifier;
        # pin them to an empty list so they are treated as locally derived
        unnest = [x for x in qdv if isinstance(x, UnnestJoin)]
        for _ in unnest:
            source_map[qdk] = []
        if (
            qdk not in source_map
            and len(qdv) == 1
            and isinstance(list(qdv)[0], UnnestJoin)
        ):
            source_map[qdk] = []
        # raw datasources map directly by their safe identifier
        basic = [x for x in qdv if isinstance(x, BuildDatasource)]
        for base in basic:
            source_map[qdk].append(base.safe_identifier)

        # QueryDatasource parents must be matched to a CTE in all_new_ctes
        ctes = [x for x in qdv if isinstance(x, QueryDatasource)]
        if ctes:
            names = set([x.safe_identifier for x in ctes])
            matches = [
                cte for cte in all_new_ctes if cte.source.safe_identifier in names
            ]

            if not matches and names:
                raise SyntaxError(
                    f"Missing parent CTEs for source map; expecting {names}, have {[cte.source.safe_identifier for cte in all_new_ctes]}"
                )
            # first pass: prefer CTEs that output the concept completely
            # (i.e. not listed among the CTE's partial concepts)
            for cte in matches:
                output_address = [
                    x.address
                    for x in cte.output_columns
                    if x.address not in [z.address for z in cte.partial_concepts]
                ]
                if qdk in output_address:
                    source_map[qdk].append(cte.safe_identifier)
            # now do a pass that accepts partials
            for cte in matches:
                if qdk not in source_map:
                    source_map[qdk] = [cte.safe_identifier]
        if qdk not in source_map:
            if not qdv:
                # no declared sources at all: locally derived
                source_map[qdk] = []
            elif CONFIG.validate_missing:
                raise ValueError(
                    f"Missing {qdk} in {source_map}, source map {query_datasource.source_map} "
                )

    # existence lookups use a separate map
    # as they cannot be referenced in row resolution
    existence_source_map: Dict[str, list[str]] = defaultdict(list)
    for ek, ev in query_datasource.existence_source_map.items():
        ids = set([x.safe_identifier for x in ev])
        ematches = [
            cte.name for cte in all_new_ctes if cte.source.safe_identifier in ids
        ]
        existence_source_map[ek] = ematches
    # dedupe identifier lists; order within each list is not preserved
    return {
        k: [] if not v else list(set(v)) for k, v in source_map.items()
    }, existence_source_map
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def datasource_to_query_datasource(datasource: BuildDatasource) -> QueryDatasource:
    """Lift a raw BuildDatasource into a trivial QueryDatasource.

    Every concept the datasource exposes is mapped back to the datasource
    itself; columns flagged incomplete become partial concepts.
    """
    # each concept address is sourced solely from this datasource
    source_map: Dict[str, Set[Union[BuildDatasource, QueryDatasource, UnnestJoin]]] = {
        concept.address: {datasource} for concept in datasource.concepts
    }
    deduped_concepts = unique(list(datasource.concepts), "address")
    partials = [
        column.concept for column in datasource.columns if not column.is_complete
    ]
    return QueryDatasource(
        output_concepts=deduped_concepts,
        input_concepts=deduped_concepts,
        source_map=source_map,
        grain=datasource.grain,
        datasources=[datasource],
        joins=[],
        partial_concepts=partials,
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def generate_cte_name(full_name: str, name_map: dict[str, str]) -> str:
    """Return a rendered name for the CTE identified by `full_name`.

    When CONFIG.human_identifiers is set, assigns (and memoizes in
    `name_map`) the next free human-friendly name from the shared pool;
    once the pool is exhausted a numeric suffix disambiguates further
    rounds. Otherwise the canonical identifier is sanitized in place.

    Mutates `name_map` by recording the assignment.
    """
    cte_names = generate_cte_names()
    if CONFIG.human_identifiers:
        if full_name in name_map:
            # already assigned: reuse for stable rendering
            return name_map[full_name]
        suffix = ""
        idx = len(name_map)
        if idx >= len(cte_names):
            # pool exhausted: start a numbered cycle through the pool.
            # (Previously this local was named `int`, shadowing the builtin.)
            cycle = ceil(idx / len(cte_names))
            suffix = f"_{cycle}"
        # pick the first pool name whose suffixed form is still unused
        valid = [x for x in cte_names if x + suffix not in name_map.values()]
        lookup = valid[0]
        new_name = f"{lookup}{suffix}"
        name_map[full_name] = new_name
        return new_name
    else:
        # non-human mode: strip characters that are invalid in SQL identifiers
        return full_name.replace("<", "").replace(">", "").replace(",", "_")
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def resolve_cte_base_name_and_alias_v2(
    name: str,
    source: QueryDatasource,
    source_map: Dict[str, list[str]],
    raw_joins: List[Join | InstantiatedUnnestJoin],
) -> Tuple[Address | str | None, str | None]:
    """Pick the FROM-clause base (name, alias) for a CTE being built.

    Resolution order:
    1. No datasources -> (None, None).
    2. A single concrete (non-constant) BuildDatasource -> its address/alias.
    3. Joins present -> the first join participant that never appears on
       the right side of a join (i.e. a valid left-most base).
    4. Otherwise -> the source identifier referenced most often across
       input/output concepts in `source_map`.
    """
    if not source.datasources:
        return None, None
    if (
        isinstance(source.datasources[0], BuildDatasource)
        and not source.datasources[0].name == CONSTANT_DATASET
    ):
        ds = source.datasources[0]
        return ds.address, ds.safe_identifier

    joins: List[Join] = [join for join in raw_joins if isinstance(join, Join)]
    if joins and len(joins) > 0:
        # candidates: anything used as a left side or as a join-pair CTE
        candidates = [x.left_cte.name for x in joins if x.left_cte]
        for join in joins:
            if join.joinkey_pairs:
                candidates += [x.cte.name for x in join.joinkey_pairs if x.cte]
        # anything on the right of a join cannot be the base
        disallowed = [x.right_cte.name for x in joins]
        try:
            cte = [y for y in candidates if y not in disallowed][0]
            return cte, cte
        except IndexError:
            raise SyntaxError(
                f"Invalid join configuration {candidates} {disallowed} for {name}",
            )
    # no joins: score each source by how many input/output concepts it supplies
    counts: dict[str, int] = defaultdict(lambda: 0)
    output_addresses = [x.address for x in source.output_concepts]
    input_address = [x.address for x in source.input_concepts]
    for k, v in source_map.items():
        for vx in v:
            if k in output_addresses:
                counts[vx] = counts[vx] + 1

            if k in input_address:
                counts[vx] = counts[vx] + 1

            # NOTE: not a no-op — reading counts[vx] on a defaultdict
            # inserts vx with count 0, so every referenced source is a
            # candidate for max() below even if it scored nothing.
            counts[vx] = counts[vx]
    if counts:
        return max(counts, key=counts.get), max(counts, key=counts.get)  # type: ignore
    return None, None
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def datasource_to_cte(
    query_datasource: QueryDatasource, name_map: dict[str, str]
) -> CTE | UnionCTE:
    """Recursively convert a QueryDatasource tree into a CTE tree.

    UNION sources become a UnionCTE wrapping one internal CTE per child;
    RECURSIVE sources become a RecursiveCTE; everything else becomes a
    plain CTE. `name_map` memoizes human-friendly CTE names across the
    whole conversion.
    """
    parents: list[CTE | UnionCTE] = []
    if query_datasource.source_type == SourceType.UNION:
        direct_parents: list[CTE | UnionCTE] = []
        for child in query_datasource.datasources:
            assert isinstance(child, QueryDatasource)
            child_cte = datasource_to_cte(child, name_map=name_map)
            direct_parents.append(child_cte)
            # hoist grandparents so the union CTE sees the full ancestry
            parents += child_cte.parent_ctes
        human_id = generate_cte_name(query_datasource.identifier, name_map)
        final = UnionCTE(
            name=human_id,
            source=query_datasource,
            parent_ctes=parents,
            internal_ctes=direct_parents,
            output_columns=[
                c.with_grain(query_datasource.grain)
                for c in query_datasource.output_concepts
            ],
            grain=direct_parents[0].grain,
            order_by=query_datasource.ordering,
        )
        return final

    if len(query_datasource.datasources) > 1 or any(
        [isinstance(x, QueryDatasource) for x in query_datasource.datasources]
    ):
        # multi-source (or nested) case: recurse into each child first,
        # then derive the source map from the freshly built CTEs
        all_new_ctes: List[CTE | UnionCTE] = []
        for datasource in query_datasource.datasources:
            if isinstance(datasource, QueryDatasource):
                sub_datasource = datasource
            else:
                sub_datasource = datasource_to_query_datasource(datasource)

            sub_cte = datasource_to_cte(sub_datasource, name_map)
            parents.append(sub_cte)
            all_new_ctes.append(sub_cte)
        source_map, existence_map = generate_source_map(query_datasource, all_new_ctes)

    else:
        # source is the first datasource of the query datasource
        if query_datasource.datasources:

            source = query_datasource.datasources[0]
            # this is required to ensure that constant datasets
            # render properly on initial access; since they have
            # no actual source
            if source.name == CONSTANT_DATASET:
                source_map = {k: [] for k in query_datasource.source_map}
                existence_map = source_map
            else:
                # single concrete source: every populated key points at it
                source_map = {
                    k: [] if not v else [source.safe_identifier]
                    for k, v in query_datasource.source_map.items()
                }
                existence_map = source_map
        else:
            source_map = {k: [] for k in query_datasource.source_map}
            existence_map = source_map

    human_id = generate_cte_name(query_datasource.identifier, name_map)

    # translate datasource-level joins into CTE-level joins
    final_joins = [
        base_join_to_join(join, [x for x in parents if isinstance(x, (CTE, UnionCTE))])
        for join in query_datasource.joins
    ]

    base_name, base_alias = resolve_cte_base_name_and_alias_v2(
        human_id, query_datasource, source_map, final_joins
    )
    cte_class = CTE

    if query_datasource.source_type == SourceType.RECURSIVE:
        cte_class = RecursiveCTE
        # extra_kwargs['left_recursive_concept'] = query_datasource.left
    cte = cte_class(
        name=human_id,
        source=query_datasource,
        # output columns are what are selected/grouped by
        output_columns=[
            c.with_grain(query_datasource.grain)
            for c in query_datasource.output_concepts
        ],
        source_map=source_map,
        existence_source_map=existence_map,
        # related columns include all referenced columns, such as filtering
        joins=final_joins,
        grain=query_datasource.grain,
        group_to_grain=query_datasource.group_required,
        # we restrict parent_ctes to one level
        # as this set is used as the base for rendering the query
        parent_ctes=parents,
        condition=query_datasource.condition,
        partial_concepts=query_datasource.partial_concepts,
        nullable_concepts=query_datasource.nullable_concepts,
        join_derived_concepts=query_datasource.join_derived_concepts,
        hidden_concepts=query_datasource.hidden_concepts,
        base_name_override=base_name,
        base_alias_override=base_alias,
        order_by=query_datasource.ordering,
    )
    # sanity checks: grain must survive construction, and every output
    # concept (or one of its pseudonyms) must be resolvable
    if cte.grain != query_datasource.grain:
        raise ValueError("Grain was corrupted in CTE generation")
    for x in cte.output_columns:
        if (
            x.address not in cte.source_map
            and not any(y in cte.source_map for y in x.pseudonyms)
            and CONFIG.validate_missing
        ):
            raise ValueError(
                f"Missing {x.address} in {cte.source_map}, source map {cte.source.source_map.keys()} "
            )

    return cte
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def get_query_node(
    environment: Environment,
    statement: SelectLineage | MultiSelectLineage,
    history: History | None = None,
) -> StrategyNode:
    """Build the strategy-node tree that can source a select lineage.

    Materializes the statement against the environment, runs concept
    discovery over the resulting graph, and wraps the result with a
    having-clause SelectNode when needed.

    Raises ValueError when the statement has no outputs or discovery
    cannot source them.
    """
    if not statement.output_components:
        raise ValueError(f"Statement has no output components {statement}")
    history = history or History(base_environment=environment)
    logger.info(
        f"{LOGGER_PREFIX} building query node for {statement.output_components} grain {statement.grain}"
    )
    # shared cache so the factory and materialization reuse built concepts
    build_cache: dict[str, BuildConcept] = {}
    base_factory = Factory(
        environment=environment,
        build_cache=build_cache,
    )
    build_statement: BuildSelectLineage | BuildMultiSelectLineage = base_factory.build(
        statement
    )

    build_environment = environment.materialize_for_select(
        build_statement.local_concepts,
        build_cache=build_cache,
        # factory=base_factory
    )

    graph = generate_graph(build_environment)

    logger.info(
        f"{LOGGER_PREFIX} getting source datasource for outputs {build_statement.output_components} grain {build_statement.grain}"
    )

    search_concepts: list[BuildConcept] = build_statement.output_components

    # core discovery: find a strategy tree producing the outputs under
    # the where-clause conditions
    ods: StrategyNode = source_query_concepts(
        output_concepts=search_concepts,
        environment=build_environment,
        g=graph,
        conditions=build_statement.where_clause,
        history=history,
    )
    if not ods:
        raise ValueError(
            f"Could not find source query concepts for {[x.address for x in search_concepts]}"
        )
    ds: StrategyNode = ods
    if build_statement.having_clause:
        # fold the having clause into any existing node conditions
        final = build_statement.having_clause.conditional
        if ds.conditions:
            final = BuildConditional(
                left=ds.conditions,
                right=build_statement.having_clause.conditional,
                operator=BooleanOperator.AND,
            )
        # wrap in a SelectNode so the having filter applies post-aggregation
        ds = SelectNode(
            output_concepts=build_statement.output_components,
            input_concepts=ds.usable_outputs,
            parents=[ds],
            environment=ds.environment,
            partial_concepts=ds.partial_concepts,
            conditions=final,
        )
    ds.hidden_concepts = build_statement.hidden_components
    ds.ordering = build_statement.order_by
    # TODO: avoid this
    ds.rebuild_cache()
    return ds
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def get_query_datasources(
    environment: Environment,
    statement: SelectStatement | MultiSelectStatement,
    hooks: Optional[List[BaseHook]] = None,
) -> QueryDatasource:
    """Resolve a (multi-)select statement into its root QueryDatasource.

    Builds the strategy-node tree for the statement's lineage, resolves
    it, and notifies any hooks with the root strategy node.
    """
    root_node = get_query_node(environment, statement.as_lineage(environment))

    resolved = root_node.resolve()

    for hook in hooks or []:
        hook.process_root_strategy_node(root_node)

    return resolved
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def flatten_ctes(input: CTE | UnionCTE) -> list[CTE | UnionCTE]:
    """Return `input` plus all transitive parent CTEs, pre-order.

    May contain duplicates when a CTE is reachable along multiple paths;
    callers are responsible for deduplication.
    """
    ordered: list[CTE | UnionCTE] = []
    pending = [input]
    while pending:
        node = pending.pop()
        ordered.append(node)
        # push parents reversed so the leftmost parent is visited first,
        # matching the original recursive pre-order traversal
        pending.extend(reversed(node.parent_ctes))
    return ordered
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def process_auto(
    environment: Environment,
    statement: PersistStatement | SelectStatement,
    hooks: List[BaseHook] | None = None,
):
    """Dispatch a parsed statement to the matching processor.

    Persist statements go to process_persist, selects to process_query;
    concept declarations produce nothing executable. Anything else is an
    error.
    """
    if isinstance(statement, PersistStatement):
        return process_persist(environment, statement, hooks)
    if isinstance(statement, SelectStatement):
        return process_query(environment, statement, hooks)
    if isinstance(statement, ConceptDeclarationStatement):
        # declarations only affect the environment; nothing to run
        return None
    raise ValueError(f"Do not know how to process {type(statement)}")
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def process_persist(
    environment: Environment,
    statement: PersistStatement,
    hooks: List[BaseHook] | None = None,
) -> ProcessedQueryPersist:
    """Process a persist statement into an executable persist query.

    Temporarily marks the target datasource unpublished while the inner
    select is processed (so the select cannot source from the table it
    is about to write), then restores the original status.

    Returns a ProcessedQueryPersist carrying the processed select plus
    the materialization target and partition metadata.
    """
    # prefer the environment's registered datasource; fall back to the
    # statement's own definition
    ds: Datasource = environment.datasources.get(
        statement.datasource.identifier, statement.datasource
    )
    original_status = ds.status
    # set to unpublished to avoid circular refs.
    # NOTE: the original `except: raise` clause was removed — a bare
    # re-raise adds nothing; try/finally alone guarantees restoration.
    try:
        ds.status = DatasourceState.UNPUBLISHED
        select = process_query(
            environment=environment, statement=statement.select, hooks=hooks
        )
    finally:
        ds.status = original_status

    # build our object to return
    arg_dict = dict(select.__dict__)
    partition_by: list[str] = []
    partition_types: list[DataType] = []
    for addr in statement.partition_by:
        for c in statement.datasource.columns:
            # only concrete columns can be partitioned on
            if c.concept.address == addr and c.is_concrete:
                partition_by.append(c.alias)  # type: ignore
                partition_types.append(c.concept.output_datatype)
                break
    return ProcessedQueryPersist(
        **arg_dict,
        output_to=MaterializedDataset(address=statement.address),
        persist_mode=statement.persist_mode,
        partition_by=partition_by,
        datasource=statement.datasource,
        partition_types=partition_types,
    )
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def process_copy(
    environment: Environment,
    statement: CopyStatement,
    hooks: List[BaseHook] | None = None,
) -> ProcessedCopyStatement:
    """Process a copy statement: run its inner select, then attach the
    copy target and target type to the result."""
    processed_select = process_query(
        environment=environment, statement=statement.select, hooks=hooks
    )

    # clone every field of the processed select onto the copy result
    select_fields = dict(processed_select.__dict__)
    return ProcessedCopyStatement(
        **select_fields,
        target=statement.target,
        target_type=statement.target_type,
    )
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def process_query(
    environment: Environment,
    statement: SelectStatement | MultiSelectStatement,
    hooks: List[BaseHook] | None = None,
) -> ProcessedQuery:
    """Process a (multi-)select into a fully resolved ProcessedQuery.

    Pipeline: resolve the root QueryDatasource, convert it to a CTE tree,
    deduplicate/merge CTEs by name, apply limit/hidden columns to the
    root, then run CTE optimizations.
    """
    hooks = hooks or []

    root_datasource = get_query_datasources(
        environment=environment, statement=statement, hooks=hooks
    )
    for hook in hooks:
        hook.process_root_datasource(root_datasource)
    # this should always return 1 - TODO, refactor
    root_cte = datasource_to_cte(root_datasource, environment.cte_name_map)

    for hook in hooks:
        hook.process_root_cte(root_cte)
    # reversed so parents precede children in the flattened list
    raw_ctes: List[CTE | UnionCTE] = list(reversed(flatten_ctes(root_cte)))
    seen = dict()
    # we can have duplicate CTEs at this point
    # so merge them together
    for cte in raw_ctes:
        if cte.name not in seen:
            seen[cte.name] = cte
        else:
            # merge them up
            seen[cte.name] = seen[cte.name] + cte
    # repoint every CTE's parents at the merged canonical instances
    for cte in raw_ctes:
        cte.parent_ctes = [seen[x.name] for x in cte.parent_ctes]
    deduped_ctes: List[CTE | UnionCTE] = list(seen.values())

    root_cte.limit = statement.limit
    root_cte.hidden_concepts = statement.hidden_components

    final_ctes = optimize_ctes(deduped_ctes, root_cte, statement)

    return ProcessedQuery(
        order_by=root_cte.order_by,
        limit=statement.limit,
        output_columns=statement.output_components,
        ctes=final_ctes,
        base=root_cte,
        hidden_columns=set([x for x in statement.hidden_components]),
        local_concepts=statement.local_concepts,
        locally_derived=statement.locally_derived,
    )
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Statement Design
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
## Assert statements
|
|
5
|
+
Used for DQ checks.
|
|
6
|
+
|
|
7
|
+
Unique in that we need to constrain to specific datasources/validation.
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Concept check
|
|
12
|
+
|
|
13
|
+
# value comparison
|
|
14
|
+
|
|
15
|
+
two scalar values
|
|
16
|
+
|
|
17
|
+
assert expr from_datasource? = expr from_datasource?
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
assert not max(len(name.split(' '))) = 1;
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
assert sum(revenue) from datasource1 = sum(revenue) from datasource2;
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## parallel block
|
|
27
|
+
|
|
28
|
+
begin parallel;
|
|
29
|
+
|
|
30
|
+
assert a == 1;
|
|
31
|
+
assert b == 2;
|
|
32
|
+
|
|
33
|
+
end parallel;
|
|
34
|
+
|
|
35
|
+
|
|
File without changes
|