pytrilogy 0.3.142__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. LICENSE.md +19 -0
  2. _preql_import_resolver/__init__.py +5 -0
  3. _preql_import_resolver/_preql_import_resolver.cp312-win_amd64.pyd +0 -0
  4. pytrilogy-0.3.142.dist-info/METADATA +555 -0
  5. pytrilogy-0.3.142.dist-info/RECORD +200 -0
  6. pytrilogy-0.3.142.dist-info/WHEEL +4 -0
  7. pytrilogy-0.3.142.dist-info/entry_points.txt +2 -0
  8. pytrilogy-0.3.142.dist-info/licenses/LICENSE.md +19 -0
  9. trilogy/__init__.py +16 -0
  10. trilogy/ai/README.md +10 -0
  11. trilogy/ai/__init__.py +19 -0
  12. trilogy/ai/constants.py +92 -0
  13. trilogy/ai/conversation.py +107 -0
  14. trilogy/ai/enums.py +7 -0
  15. trilogy/ai/execute.py +50 -0
  16. trilogy/ai/models.py +34 -0
  17. trilogy/ai/prompts.py +100 -0
  18. trilogy/ai/providers/__init__.py +0 -0
  19. trilogy/ai/providers/anthropic.py +106 -0
  20. trilogy/ai/providers/base.py +24 -0
  21. trilogy/ai/providers/google.py +146 -0
  22. trilogy/ai/providers/openai.py +89 -0
  23. trilogy/ai/providers/utils.py +68 -0
  24. trilogy/authoring/README.md +3 -0
  25. trilogy/authoring/__init__.py +148 -0
  26. trilogy/constants.py +113 -0
  27. trilogy/core/README.md +52 -0
  28. trilogy/core/__init__.py +0 -0
  29. trilogy/core/constants.py +6 -0
  30. trilogy/core/enums.py +443 -0
  31. trilogy/core/env_processor.py +120 -0
  32. trilogy/core/environment_helpers.py +320 -0
  33. trilogy/core/ergonomics.py +193 -0
  34. trilogy/core/exceptions.py +123 -0
  35. trilogy/core/functions.py +1227 -0
  36. trilogy/core/graph_models.py +139 -0
  37. trilogy/core/internal.py +85 -0
  38. trilogy/core/models/__init__.py +0 -0
  39. trilogy/core/models/author.py +2669 -0
  40. trilogy/core/models/build.py +2521 -0
  41. trilogy/core/models/build_environment.py +180 -0
  42. trilogy/core/models/core.py +501 -0
  43. trilogy/core/models/datasource.py +322 -0
  44. trilogy/core/models/environment.py +751 -0
  45. trilogy/core/models/execute.py +1177 -0
  46. trilogy/core/optimization.py +251 -0
  47. trilogy/core/optimizations/__init__.py +12 -0
  48. trilogy/core/optimizations/base_optimization.py +17 -0
  49. trilogy/core/optimizations/hide_unused_concept.py +47 -0
  50. trilogy/core/optimizations/inline_datasource.py +102 -0
  51. trilogy/core/optimizations/predicate_pushdown.py +245 -0
  52. trilogy/core/processing/README.md +94 -0
  53. trilogy/core/processing/READMEv2.md +121 -0
  54. trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
  55. trilogy/core/processing/__init__.py +0 -0
  56. trilogy/core/processing/concept_strategies_v3.py +508 -0
  57. trilogy/core/processing/constants.py +15 -0
  58. trilogy/core/processing/discovery_node_factory.py +451 -0
  59. trilogy/core/processing/discovery_utility.py +548 -0
  60. trilogy/core/processing/discovery_validation.py +167 -0
  61. trilogy/core/processing/graph_utils.py +43 -0
  62. trilogy/core/processing/node_generators/README.md +9 -0
  63. trilogy/core/processing/node_generators/__init__.py +31 -0
  64. trilogy/core/processing/node_generators/basic_node.py +160 -0
  65. trilogy/core/processing/node_generators/common.py +268 -0
  66. trilogy/core/processing/node_generators/constant_node.py +38 -0
  67. trilogy/core/processing/node_generators/filter_node.py +315 -0
  68. trilogy/core/processing/node_generators/group_node.py +213 -0
  69. trilogy/core/processing/node_generators/group_to_node.py +117 -0
  70. trilogy/core/processing/node_generators/multiselect_node.py +205 -0
  71. trilogy/core/processing/node_generators/node_merge_node.py +653 -0
  72. trilogy/core/processing/node_generators/recursive_node.py +88 -0
  73. trilogy/core/processing/node_generators/rowset_node.py +165 -0
  74. trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
  75. trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
  76. trilogy/core/processing/node_generators/select_merge_node.py +748 -0
  77. trilogy/core/processing/node_generators/select_node.py +95 -0
  78. trilogy/core/processing/node_generators/synonym_node.py +98 -0
  79. trilogy/core/processing/node_generators/union_node.py +91 -0
  80. trilogy/core/processing/node_generators/unnest_node.py +182 -0
  81. trilogy/core/processing/node_generators/window_node.py +201 -0
  82. trilogy/core/processing/nodes/README.md +28 -0
  83. trilogy/core/processing/nodes/__init__.py +179 -0
  84. trilogy/core/processing/nodes/base_node.py +519 -0
  85. trilogy/core/processing/nodes/filter_node.py +75 -0
  86. trilogy/core/processing/nodes/group_node.py +194 -0
  87. trilogy/core/processing/nodes/merge_node.py +420 -0
  88. trilogy/core/processing/nodes/recursive_node.py +46 -0
  89. trilogy/core/processing/nodes/select_node_v2.py +242 -0
  90. trilogy/core/processing/nodes/union_node.py +53 -0
  91. trilogy/core/processing/nodes/unnest_node.py +62 -0
  92. trilogy/core/processing/nodes/window_node.py +56 -0
  93. trilogy/core/processing/utility.py +823 -0
  94. trilogy/core/query_processor.py +596 -0
  95. trilogy/core/statements/README.md +35 -0
  96. trilogy/core/statements/__init__.py +0 -0
  97. trilogy/core/statements/author.py +536 -0
  98. trilogy/core/statements/build.py +0 -0
  99. trilogy/core/statements/common.py +20 -0
  100. trilogy/core/statements/execute.py +155 -0
  101. trilogy/core/table_processor.py +66 -0
  102. trilogy/core/utility.py +8 -0
  103. trilogy/core/validation/README.md +46 -0
  104. trilogy/core/validation/__init__.py +0 -0
  105. trilogy/core/validation/common.py +161 -0
  106. trilogy/core/validation/concept.py +146 -0
  107. trilogy/core/validation/datasource.py +227 -0
  108. trilogy/core/validation/environment.py +73 -0
  109. trilogy/core/validation/fix.py +256 -0
  110. trilogy/dialect/__init__.py +32 -0
  111. trilogy/dialect/base.py +1392 -0
  112. trilogy/dialect/bigquery.py +308 -0
  113. trilogy/dialect/common.py +147 -0
  114. trilogy/dialect/config.py +144 -0
  115. trilogy/dialect/dataframe.py +50 -0
  116. trilogy/dialect/duckdb.py +231 -0
  117. trilogy/dialect/enums.py +147 -0
  118. trilogy/dialect/metadata.py +173 -0
  119. trilogy/dialect/mock.py +190 -0
  120. trilogy/dialect/postgres.py +117 -0
  121. trilogy/dialect/presto.py +110 -0
  122. trilogy/dialect/results.py +89 -0
  123. trilogy/dialect/snowflake.py +129 -0
  124. trilogy/dialect/sql_server.py +137 -0
  125. trilogy/engine.py +48 -0
  126. trilogy/execution/config.py +75 -0
  127. trilogy/executor.py +568 -0
  128. trilogy/hooks/__init__.py +4 -0
  129. trilogy/hooks/base_hook.py +40 -0
  130. trilogy/hooks/graph_hook.py +139 -0
  131. trilogy/hooks/query_debugger.py +166 -0
  132. trilogy/metadata/__init__.py +0 -0
  133. trilogy/parser.py +10 -0
  134. trilogy/parsing/README.md +21 -0
  135. trilogy/parsing/__init__.py +0 -0
  136. trilogy/parsing/common.py +1069 -0
  137. trilogy/parsing/config.py +5 -0
  138. trilogy/parsing/exceptions.py +8 -0
  139. trilogy/parsing/helpers.py +1 -0
  140. trilogy/parsing/parse_engine.py +2813 -0
  141. trilogy/parsing/render.py +769 -0
  142. trilogy/parsing/trilogy.lark +540 -0
  143. trilogy/py.typed +0 -0
  144. trilogy/render.py +42 -0
  145. trilogy/scripts/README.md +9 -0
  146. trilogy/scripts/__init__.py +0 -0
  147. trilogy/scripts/agent.py +41 -0
  148. trilogy/scripts/agent_info.py +303 -0
  149. trilogy/scripts/common.py +355 -0
  150. trilogy/scripts/dependency/Cargo.lock +617 -0
  151. trilogy/scripts/dependency/Cargo.toml +39 -0
  152. trilogy/scripts/dependency/README.md +131 -0
  153. trilogy/scripts/dependency/build.sh +25 -0
  154. trilogy/scripts/dependency/src/directory_resolver.rs +177 -0
  155. trilogy/scripts/dependency/src/lib.rs +16 -0
  156. trilogy/scripts/dependency/src/main.rs +770 -0
  157. trilogy/scripts/dependency/src/parser.rs +435 -0
  158. trilogy/scripts/dependency/src/preql.pest +208 -0
  159. trilogy/scripts/dependency/src/python_bindings.rs +303 -0
  160. trilogy/scripts/dependency/src/resolver.rs +716 -0
  161. trilogy/scripts/dependency/tests/base.preql +3 -0
  162. trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
  163. trilogy/scripts/dependency/tests/customer.preql +6 -0
  164. trilogy/scripts/dependency/tests/main.preql +9 -0
  165. trilogy/scripts/dependency/tests/orders.preql +7 -0
  166. trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
  167. trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
  168. trilogy/scripts/dependency.py +323 -0
  169. trilogy/scripts/display.py +512 -0
  170. trilogy/scripts/environment.py +46 -0
  171. trilogy/scripts/fmt.py +32 -0
  172. trilogy/scripts/ingest.py +471 -0
  173. trilogy/scripts/ingest_helpers/__init__.py +1 -0
  174. trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
  175. trilogy/scripts/ingest_helpers/formatting.py +93 -0
  176. trilogy/scripts/ingest_helpers/typing.py +161 -0
  177. trilogy/scripts/init.py +105 -0
  178. trilogy/scripts/parallel_execution.py +713 -0
  179. trilogy/scripts/plan.py +189 -0
  180. trilogy/scripts/run.py +63 -0
  181. trilogy/scripts/serve.py +140 -0
  182. trilogy/scripts/serve_helpers/__init__.py +41 -0
  183. trilogy/scripts/serve_helpers/file_discovery.py +142 -0
  184. trilogy/scripts/serve_helpers/index_generation.py +206 -0
  185. trilogy/scripts/serve_helpers/models.py +38 -0
  186. trilogy/scripts/single_execution.py +131 -0
  187. trilogy/scripts/testing.py +119 -0
  188. trilogy/scripts/trilogy.py +68 -0
  189. trilogy/std/__init__.py +0 -0
  190. trilogy/std/color.preql +3 -0
  191. trilogy/std/date.preql +13 -0
  192. trilogy/std/display.preql +18 -0
  193. trilogy/std/geography.preql +22 -0
  194. trilogy/std/metric.preql +15 -0
  195. trilogy/std/money.preql +67 -0
  196. trilogy/std/net.preql +14 -0
  197. trilogy/std/ranking.preql +7 -0
  198. trilogy/std/report.preql +5 -0
  199. trilogy/std/semantic.preql +6 -0
  200. trilogy/utility.py +34 -0
trilogy/core/query_processor.py
@@ -0,0 +1,596 @@
+ from collections import defaultdict
+ from math import ceil
+ from typing import Dict, List, Optional, Set, Tuple, Union
+
+ from trilogy.constants import CONFIG, logger
+ from trilogy.core.constants import CONSTANT_DATASET
+ from trilogy.core.enums import BooleanOperator, DatasourceState, SourceType
+ from trilogy.core.env_processor import generate_graph
+ from trilogy.core.ergonomics import generate_cte_names
+ from trilogy.core.models.author import MultiSelectLineage, SelectLineage
+ from trilogy.core.models.build import (
+     BuildConcept,
+     BuildConditional,
+     BuildDatasource,
+     BuildFunction,
+     BuildMultiSelectLineage,
+     BuildParamaterizedConceptReference,
+     BuildSelectLineage,
+     Factory,
+ )
+ from trilogy.core.models.core import DataType
+ from trilogy.core.models.datasource import Datasource
+ from trilogy.core.models.environment import Environment
+ from trilogy.core.models.execute import (
+     CTE,
+     BaseJoin,
+     CTEConceptPair,
+     InstantiatedUnnestJoin,
+     Join,
+     QueryDatasource,
+     RecursiveCTE,
+     UnionCTE,
+     UnnestJoin,
+ )
+ from trilogy.core.optimization import optimize_ctes
+ from trilogy.core.processing.concept_strategies_v3 import source_query_concepts
+ from trilogy.core.processing.nodes import History, SelectNode, StrategyNode
+ from trilogy.core.statements.author import (
+     ConceptDeclarationStatement,
+     CopyStatement,
+     MultiSelectStatement,
+     PersistStatement,
+     SelectStatement,
+ )
+ from trilogy.core.statements.execute import (
+     MaterializedDataset,
+     ProcessedCopyStatement,
+     ProcessedQuery,
+     ProcessedQueryPersist,
+ )
+ from trilogy.hooks.base_hook import BaseHook
+ from trilogy.utility import unique
+
+ LOGGER_PREFIX = "[QUERY BUILD]"
+
+
+ def base_join_to_join(
+     base_join: BaseJoin | UnnestJoin, ctes: List[CTE | UnionCTE]
+ ) -> Join | InstantiatedUnnestJoin:
+     """This function converts joins at the datasource level
+     to joins at the CTE level"""
+     if isinstance(base_join, UnnestJoin):
+         object_to_unnest = base_join.parent.arguments[0]
+         if not isinstance(
+             object_to_unnest,
+             (BuildConcept | BuildParamaterizedConceptReference | BuildFunction),
+         ):
+             raise ValueError(f"Unnest join must be a concept; got {object_to_unnest}")
+         return InstantiatedUnnestJoin(
+             object_to_unnest=object_to_unnest,
+             alias=base_join.alias,
+         )
+
+     def get_datasource_cte(
+         datasource: BuildDatasource | QueryDatasource,
+     ) -> CTE | UnionCTE:
+         eligible = set()
+         for cte in ctes:
+             if cte.source.identifier == datasource.identifier:
+                 return cte
+             eligible.add(cte.source.identifier)
+         for cte in ctes:
+             if cte.source.datasources[0].identifier == datasource.identifier:
+                 return cte
+             eligible.add(cte.source.datasources[0].identifier)
+         raise ValueError(
+             f"Could not find CTE for datasource {datasource.identifier}; have {eligible}"
+         )
+
+     if base_join.left_datasource is not None:
+         left_cte = get_datasource_cte(base_join.left_datasource)
+     else:
+         # multiple left ctes
+         left_cte = None
+     right_cte = get_datasource_cte(base_join.right_datasource)
+     if base_join.concept_pairs:
+         final_pairs = [
+             CTEConceptPair(
+                 left=pair.left,
+                 right=pair.right,
+                 existing_datasource=pair.existing_datasource,
+                 modifiers=pair.modifiers,
+                 cte=get_datasource_cte(pair.existing_datasource),
+             )
+             for pair in base_join.concept_pairs
+         ]
+     elif base_join.concepts and base_join.left_datasource:
+         final_pairs = [
+             CTEConceptPair(
+                 left=concept,
+                 right=concept,
+                 existing_datasource=base_join.left_datasource,
+                 modifiers=[],
+                 cte=get_datasource_cte(
+                     base_join.left_datasource,
+                 ),
+             )
+             for concept in base_join.concepts
+         ]
+     else:
+         final_pairs = []
+     return Join(
+         left_cte=left_cte,
+         right_cte=right_cte,
+         jointype=base_join.join_type,
+         joinkey_pairs=final_pairs,
+         modifiers=base_join.modifiers,
+     )
+
+
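+ # Build the per-concept source map for a CTE: for each concept address in the
+ # query datasource, record which parent CTEs (or raw datasources) can provide
+ # it, with existence-only lookups tracked in a separate map.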
+ def generate_source_map(
+     query_datasource: QueryDatasource, all_new_ctes: List[CTE | UnionCTE]
+ ) -> Tuple[Dict[str, list[str]], Dict[str, list[str]]]:
+     source_map: Dict[str, list[str]] = defaultdict(list)
+     # now populate anything derived in this level
+     for qdk, qdv in query_datasource.source_map.items():
+         unnest = [x for x in qdv if isinstance(x, UnnestJoin)]
+         for _ in unnest:
+             source_map[qdk] = []
+         if (
+             qdk not in source_map
+             and len(qdv) == 1
+             and isinstance(list(qdv)[0], UnnestJoin)
+         ):
+             source_map[qdk] = []
+         basic = [x for x in qdv if isinstance(x, BuildDatasource)]
+         for base in basic:
+             source_map[qdk].append(base.safe_identifier)
+
+         ctes = [x for x in qdv if isinstance(x, QueryDatasource)]
+         if ctes:
+             names = set([x.safe_identifier for x in ctes])
+             matches = [
+                 cte for cte in all_new_ctes if cte.source.safe_identifier in names
+             ]
+
+             if not matches and names:
+                 raise SyntaxError(
+                     f"Missing parent CTEs for source map; expecting {names}, have {[cte.source.safe_identifier for cte in all_new_ctes]}"
+                 )
+             for cte in matches:
+                 output_address = [
+                     x.address
+                     for x in cte.output_columns
+                     if x.address not in [z.address for z in cte.partial_concepts]
+                 ]
+                 if qdk in output_address:
+                     source_map[qdk].append(cte.safe_identifier)
+             # now do a pass that accepts partials
+             for cte in matches:
+                 if qdk not in source_map:
+                     source_map[qdk] = [cte.safe_identifier]
+         if qdk not in source_map:
+             if not qdv:
+                 source_map[qdk] = []
+             elif CONFIG.validate_missing:
+                 raise ValueError(
+                     f"Missing {qdk} in {source_map}, source map {query_datasource.source_map} "
+                 )
+
+     # existence lookups use a separate map
+     # as they cannot be referenced in row resolution
+     existence_source_map: Dict[str, list[str]] = defaultdict(list)
+     for ek, ev in query_datasource.existence_source_map.items():
+         ids = set([x.safe_identifier for x in ev])
+         ematches = [
+             cte.name for cte in all_new_ctes if cte.source.safe_identifier in ids
+         ]
+         existence_source_map[ek] = ematches
+     return {
+         k: [] if not v else list(set(v)) for k, v in source_map.items()
+     }, existence_source_map
+
+
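+ # Wrap a raw datasource in a passthrough QueryDatasource whose inputs and
+ # outputs are simply the datasource's own (deduplicated) concepts.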
+ def datasource_to_query_datasource(datasource: BuildDatasource) -> QueryDatasource:
+     sub_select: Dict[str, Set[Union[BuildDatasource, QueryDatasource, UnnestJoin]]] = {
+         **{c.address: {datasource} for c in datasource.concepts},
+     }
+     concepts = [c for c in datasource.concepts]
+     concepts = unique(concepts, "address")
+     return QueryDatasource(
+         output_concepts=concepts,
+         input_concepts=concepts,
+         source_map=sub_select,
+         grain=datasource.grain,
+         datasources=[datasource],
+         joins=[],
+         partial_concepts=[x.concept for x in datasource.columns if not x.is_complete],
+     )
+
+
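+ # Map a generated datasource identifier to a stable, human-readable CTE name
+ # when CONFIG.human_identifiers is set, cycling through the friendly name pool
+ # with a numeric suffix once it is exhausted.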
+ def generate_cte_name(full_name: str, name_map: dict[str, str]) -> str:
+     cte_names = generate_cte_names()
+     if CONFIG.human_identifiers:
+         if full_name in name_map:
+             return name_map[full_name]
+         suffix = ""
+         idx = len(name_map)
+         if idx >= len(cte_names):
+             # append a numeric suffix once the name pool is exhausted
+             overflow = ceil(idx / len(cte_names))
+             suffix = f"_{overflow}"
+         valid = [x for x in cte_names if x + suffix not in name_map.values()]
+         lookup = valid[0]
+         new_name = f"{lookup}{suffix}"
+         name_map[full_name] = new_name
+         return new_name
+     else:
+         return full_name.replace("<", "").replace(">", "").replace(",", "_")
+
+
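+ # Pick which parent CTE (or raw table) should anchor the FROM clause of a CTE:
+ # prefer a direct datasource, then a join candidate that never appears on the
+ # right side of a join, then the parent referenced most often in the source map.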
+ def resolve_cte_base_name_and_alias_v2(
+     name: str,
+     source: QueryDatasource,
+     source_map: Dict[str, list[str]],
+     raw_joins: List[Join | InstantiatedUnnestJoin],
+ ) -> Tuple[str | None, str | None]:
+     if not source.datasources:
+         return None, None
+     if (
+         isinstance(source.datasources[0], BuildDatasource)
+         and not source.datasources[0].name == CONSTANT_DATASET
+     ):
+         ds = source.datasources[0]
+         return ds.safe_location, ds.safe_identifier
+
+     joins: List[Join] = [join for join in raw_joins if isinstance(join, Join)]
+     if joins and len(joins) > 0:
+         candidates = [x.left_cte.name for x in joins if x.left_cte]
+         for join in joins:
+             if join.joinkey_pairs:
+                 candidates += [x.cte.name for x in join.joinkey_pairs if x.cte]
+         disallowed = [x.right_cte.name for x in joins]
+         try:
+             cte = [y for y in candidates if y not in disallowed][0]
+             return cte, cte
+         except IndexError:
+             raise SyntaxError(
+                 f"Invalid join configuration {candidates} {disallowed} for {name}",
+             )
+     counts: dict[str, int] = defaultdict(lambda: 0)
+     output_addresses = [x.address for x in source.output_concepts]
+     input_address = [x.address for x in source.input_concepts]
+     for k, v in source_map.items():
+         for vx in v:
+             if k in output_addresses:
+                 counts[vx] = counts[vx] + 1
+
+             if k in input_address:
+                 counts[vx] = counts[vx] + 1
+
+             counts[vx] = counts[vx]
+     if counts:
+         return max(counts, key=counts.get), max(counts, key=counts.get)  # type: ignore
+     return None, None
+
+
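+ # Recursively convert a resolved QueryDatasource tree into CTE objects,
+ # dispatching to UnionCTE/RecursiveCTE variants by source type and wiring up
+ # source maps, joins, and base-table aliases along the way.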
+ def datasource_to_cte(
+     query_datasource: QueryDatasource, name_map: dict[str, str]
+ ) -> CTE | UnionCTE:
+     parents: list[CTE | UnionCTE] = []
+     if query_datasource.source_type == SourceType.UNION:
+         direct_parents: list[CTE | UnionCTE] = []
+         for child in query_datasource.datasources:
+             assert isinstance(child, QueryDatasource)
+             child_cte = datasource_to_cte(child, name_map=name_map)
+             direct_parents.append(child_cte)
+             parents += child_cte.parent_ctes
+         human_id = generate_cte_name(query_datasource.identifier, name_map)
+         final = UnionCTE(
+             name=human_id,
+             source=query_datasource,
+             parent_ctes=parents,
+             internal_ctes=direct_parents,
+             output_columns=[
+                 c.with_grain(query_datasource.grain)
+                 for c in query_datasource.output_concepts
+             ],
+             grain=direct_parents[0].grain,
+             order_by=query_datasource.ordering,
+         )
+         return final
+
+     if len(query_datasource.datasources) > 1 or any(
+         [isinstance(x, QueryDatasource) for x in query_datasource.datasources]
+     ):
+         all_new_ctes: List[CTE | UnionCTE] = []
+         for datasource in query_datasource.datasources:
+             if isinstance(datasource, QueryDatasource):
+                 sub_datasource = datasource
+             else:
+                 sub_datasource = datasource_to_query_datasource(datasource)
+
+             sub_cte = datasource_to_cte(sub_datasource, name_map)
+             parents.append(sub_cte)
+             all_new_ctes.append(sub_cte)
+         source_map, existence_map = generate_source_map(query_datasource, all_new_ctes)
+
+     else:
+         # source is the first datasource of the query datasource
+         if query_datasource.datasources:
+             source = query_datasource.datasources[0]
+             # this is required to ensure that constant datasets
+             # render properly on initial access; since they have
+             # no actual source
+             if source.name == CONSTANT_DATASET:
+                 source_map = {k: [] for k in query_datasource.source_map}
+                 existence_map = source_map
+             else:
+                 source_map = {
+                     k: [] if not v else [source.safe_identifier]
+                     for k, v in query_datasource.source_map.items()
+                 }
+                 existence_map = source_map
+         else:
+             source_map = {k: [] for k in query_datasource.source_map}
+             existence_map = source_map
+
+     human_id = generate_cte_name(query_datasource.identifier, name_map)
+
+     final_joins = [
+         base_join_to_join(join, [x for x in parents if isinstance(x, (CTE, UnionCTE))])
+         for join in query_datasource.joins
+     ]
+
+     base_name, base_alias = resolve_cte_base_name_and_alias_v2(
+         human_id, query_datasource, source_map, final_joins
+     )
+     cte_class = CTE
+
+     if query_datasource.source_type == SourceType.RECURSIVE:
+         cte_class = RecursiveCTE
+         # extra_kwargs['left_recursive_concept'] = query_datasource.left
+     cte = cte_class(
+         name=human_id,
+         source=query_datasource,
+         # output columns are what are selected/grouped by
+         output_columns=[
+             c.with_grain(query_datasource.grain)
+             for c in query_datasource.output_concepts
+         ],
+         source_map=source_map,
+         existence_source_map=existence_map,
+         # related columns include all referenced columns, such as filtering
+         joins=final_joins,
+         grain=query_datasource.grain,
+         group_to_grain=query_datasource.group_required,
+         # we restrict parent_ctes to one level
+         # as this set is used as the base for rendering the query
+         parent_ctes=parents,
+         condition=query_datasource.condition,
+         partial_concepts=query_datasource.partial_concepts,
+         nullable_concepts=query_datasource.nullable_concepts,
+         join_derived_concepts=query_datasource.join_derived_concepts,
+         hidden_concepts=query_datasource.hidden_concepts,
+         base_name_override=base_name,
+         base_alias_override=base_alias,
+         order_by=query_datasource.ordering,
+     )
+     if cte.grain != query_datasource.grain:
+         raise ValueError("Grain was corrupted in CTE generation")
+     for x in cte.output_columns:
+         if (
+             x.address not in cte.source_map
+             and not any(y in cte.source_map for y in x.pseudonyms)
+             and CONFIG.validate_missing
+         ):
+             raise ValueError(
+                 f"Missing {x.address} in {cte.source_map}, source map {cte.source.source_map.keys()} "
+             )
+
+     return cte
+
+
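+ # Build the StrategyNode tree for a select: materialize the environment for
+ # the statement, discover source concepts over the concept graph, and wrap
+ # the result in a final SelectNode when a HAVING clause applies.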
+ def get_query_node(
+     environment: Environment,
+     statement: SelectLineage | MultiSelectLineage,
+     history: History | None = None,
+ ) -> StrategyNode:
+     if not statement.output_components:
+         raise ValueError(f"Statement has no output components {statement}")
+     history = history or History(base_environment=environment)
+     logger.info(
+         f"{LOGGER_PREFIX} building query node for {statement.output_components} grain {statement.grain}"
+     )
+     build_statement: BuildSelectLineage | BuildMultiSelectLineage = Factory(
+         environment=environment,
+     ).build(statement)
+
+     build_environment = environment.materialize_for_select(
+         build_statement.local_concepts
+     )
+     graph = generate_graph(build_environment)
+
+     logger.info(
+         f"{LOGGER_PREFIX} getting source datasource for outputs {build_statement.output_components} grain {build_statement.grain}"
+     )
+
+     search_concepts: list[BuildConcept] = build_statement.output_components
+
+     ods: StrategyNode = source_query_concepts(
+         output_concepts=search_concepts,
+         environment=build_environment,
+         g=graph,
+         conditions=build_statement.where_clause,
+         history=history,
+     )
+     if not ods:
+         raise ValueError(
+             f"Could not find source query concepts for {[x.address for x in search_concepts]}"
+         )
+     ds: StrategyNode = ods
+     if build_statement.having_clause:
+         final = build_statement.having_clause.conditional
+         if ds.conditions:
+             final = BuildConditional(
+                 left=ds.conditions,
+                 right=build_statement.having_clause.conditional,
+                 operator=BooleanOperator.AND,
+             )
+         ds = SelectNode(
+             output_concepts=build_statement.output_components,
+             input_concepts=ds.usable_outputs,
+             parents=[ds],
+             environment=ds.environment,
+             partial_concepts=ds.partial_concepts,
+             conditions=final,
+         )
+     ds.hidden_concepts = build_statement.hidden_components
+     ds.ordering = build_statement.order_by
+     # TODO: avoid this
+     ds.rebuild_cache()
+     return ds
+
+
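+ # Resolve the strategy node for a statement into its root QueryDatasource,
+ # giving hooks a chance to inspect the root strategy node.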
+ def get_query_datasources(
+     environment: Environment,
+     statement: SelectStatement | MultiSelectStatement,
+     hooks: Optional[List[BaseHook]] = None,
+ ) -> QueryDatasource:
+     ds = get_query_node(environment, statement.as_lineage(environment))
+
+     final_qds = ds.resolve()
+
+     if hooks:
+         for hook in hooks:
+             hook.process_root_strategy_node(ds)
+
+     return final_qds
+
+
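+ # Flatten a CTE and all of its ancestors into a single list (depth-first,
+ # duplicates included; callers dedupe by name afterwards).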
+ def flatten_ctes(input: CTE | UnionCTE) -> list[CTE | UnionCTE]:
+     output = [input]
+     for cte in input.parent_ctes:
+         output += flatten_ctes(cte)
+     return output
+
+
+ def process_auto(
+     environment: Environment,
+     statement: PersistStatement | SelectStatement,
+     hooks: List[BaseHook] | None = None,
+ ):
+     if isinstance(statement, PersistStatement):
+         return process_persist(environment, statement, hooks)
+     elif isinstance(statement, SelectStatement):
+         return process_query(environment, statement, hooks)
+     elif isinstance(statement, ConceptDeclarationStatement):
+         return None
+     raise ValueError(f"Do not know how to process {type(statement)}")
+
+
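+ # Process a PERSIST statement: plan the inner select with the target
+ # datasource temporarily unpublished (to avoid self-referential sourcing),
+ # then attach the materialization target and partitioning metadata.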
+ def process_persist(
+     environment: Environment,
+     statement: PersistStatement,
+     hooks: List[BaseHook] | None = None,
+ ) -> ProcessedQueryPersist:
+     ds: Datasource = environment.datasources.get(
+         statement.datasource.identifier, statement.datasource
+     )
+     original_status = ds.status
+     # set to unpublished to avoid circular refs
+     try:
+         ds.status = DatasourceState.UNPUBLISHED
+         select = process_query(
+             environment=environment, statement=statement.select, hooks=hooks
+         )
+     finally:
+         ds.status = original_status
+
+     # build our object to return
+     arg_dict = {k: v for k, v in select.__dict__.items()}
+     partition_by: list[str] = []
+     partition_types: list[DataType] = []
+     for addr in statement.partition_by:
+         for c in statement.datasource.columns:
+             if c.concept.address == addr and c.is_concrete:
+                 partition_by.append(c.alias)  # type: ignore
+                 partition_types.append(c.concept.output_datatype)
+                 break
+     return ProcessedQueryPersist(
+         **arg_dict,
+         output_to=MaterializedDataset(address=statement.address),
+         persist_mode=statement.persist_mode,
+         partition_by=partition_by,
+         datasource=statement.datasource,
+         partition_types=partition_types,
+     )
+
+
+ def process_copy(
+     environment: Environment,
+     statement: CopyStatement,
+     hooks: List[BaseHook] | None = None,
+ ) -> ProcessedCopyStatement:
+     select = process_query(
+         environment=environment, statement=statement.select, hooks=hooks
+     )
+
+     # build our object to return
+     arg_dict = {k: v for k, v in select.__dict__.items()}
+     return ProcessedCopyStatement(
+         **arg_dict,
+         target=statement.target,
+         target_type=statement.target_type,
+     )
+
+
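+ # Top-level query planning: resolve the root datasource, convert it to CTEs,
+ # merge duplicate CTEs by name, then optimize before emitting ProcessedQuery.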
+ def process_query(
+     environment: Environment,
+     statement: SelectStatement | MultiSelectStatement,
+     hooks: List[BaseHook] | None = None,
+ ) -> ProcessedQuery:
+     hooks = hooks or []
+
+     root_datasource = get_query_datasources(
+         environment=environment, statement=statement, hooks=hooks
+     )
+     for hook in hooks:
+         hook.process_root_datasource(root_datasource)
+     # this should always return 1 - TODO, refactor
+     root_cte = datasource_to_cte(root_datasource, environment.cte_name_map)
+
+     for hook in hooks:
+         hook.process_root_cte(root_cte)
+     raw_ctes: List[CTE | UnionCTE] = list(reversed(flatten_ctes(root_cte)))
+     seen = dict()
+     # we can have duplicate CTEs at this point
+     # so merge them together
+     for cte in raw_ctes:
+         if cte.name not in seen:
+             seen[cte.name] = cte
+         else:
+             # merge them up
+             seen[cte.name] = seen[cte.name] + cte
+     for cte in raw_ctes:
+         cte.parent_ctes = [seen[x.name] for x in cte.parent_ctes]
+     deduped_ctes: List[CTE | UnionCTE] = list(seen.values())
+
+     root_cte.limit = statement.limit
+     root_cte.hidden_concepts = statement.hidden_components
+
+     final_ctes = optimize_ctes(deduped_ctes, root_cte, statement)
+
+     return ProcessedQuery(
+         order_by=root_cte.order_by,
+         limit=statement.limit,
+         output_columns=statement.output_components,
+         ctes=final_ctes,
+         base=root_cte,
+         hidden_columns=set([x for x in statement.hidden_components]),
+         local_concepts=statement.local_concepts,
+         locally_derived=statement.locally_derived,
+     )
trilogy/core/statements/README.md
@@ -0,0 +1,35 @@
+ # Statement Design
+
+ ## Assert statements
+
+ Used for data-quality (DQ) checks.
+
+ Unique in that we need to constrain validation to specific datasources.
+
+ ## Concept check
+
+ ### Value comparison
+
+ Compares two scalar values:
+
+ assert expr from_datasource? = expr from_datasource?
+
+ assert not max(len(name.split(' '))) = 1;
+
+ assert sum(revenue) from datasource1 = sum(revenue) from datasource2;
+
+ ## Parallel block
+
+ begin parallel;
+
+ assert a == 1;
+ assert b == 2;
+
+ end parallel;