pytrilogy 0.3.142__cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Files changed (200)
  1. LICENSE.md +19 -0
  2. _preql_import_resolver/__init__.py +5 -0
  3. _preql_import_resolver/_preql_import_resolver.cpython-313-x86_64-linux-gnu.so +0 -0
  4. pytrilogy-0.3.142.dist-info/METADATA +555 -0
  5. pytrilogy-0.3.142.dist-info/RECORD +200 -0
  6. pytrilogy-0.3.142.dist-info/WHEEL +5 -0
  7. pytrilogy-0.3.142.dist-info/entry_points.txt +2 -0
  8. pytrilogy-0.3.142.dist-info/licenses/LICENSE.md +19 -0
  9. trilogy/__init__.py +16 -0
  10. trilogy/ai/README.md +10 -0
  11. trilogy/ai/__init__.py +19 -0
  12. trilogy/ai/constants.py +92 -0
  13. trilogy/ai/conversation.py +107 -0
  14. trilogy/ai/enums.py +7 -0
  15. trilogy/ai/execute.py +50 -0
  16. trilogy/ai/models.py +34 -0
  17. trilogy/ai/prompts.py +100 -0
  18. trilogy/ai/providers/__init__.py +0 -0
  19. trilogy/ai/providers/anthropic.py +106 -0
  20. trilogy/ai/providers/base.py +24 -0
  21. trilogy/ai/providers/google.py +146 -0
  22. trilogy/ai/providers/openai.py +89 -0
  23. trilogy/ai/providers/utils.py +68 -0
  24. trilogy/authoring/README.md +3 -0
  25. trilogy/authoring/__init__.py +148 -0
  26. trilogy/constants.py +113 -0
  27. trilogy/core/README.md +52 -0
  28. trilogy/core/__init__.py +0 -0
  29. trilogy/core/constants.py +6 -0
  30. trilogy/core/enums.py +443 -0
  31. trilogy/core/env_processor.py +120 -0
  32. trilogy/core/environment_helpers.py +320 -0
  33. trilogy/core/ergonomics.py +193 -0
  34. trilogy/core/exceptions.py +123 -0
  35. trilogy/core/functions.py +1227 -0
  36. trilogy/core/graph_models.py +139 -0
  37. trilogy/core/internal.py +85 -0
  38. trilogy/core/models/__init__.py +0 -0
  39. trilogy/core/models/author.py +2669 -0
  40. trilogy/core/models/build.py +2521 -0
  41. trilogy/core/models/build_environment.py +180 -0
  42. trilogy/core/models/core.py +501 -0
  43. trilogy/core/models/datasource.py +322 -0
  44. trilogy/core/models/environment.py +751 -0
  45. trilogy/core/models/execute.py +1177 -0
  46. trilogy/core/optimization.py +251 -0
  47. trilogy/core/optimizations/__init__.py +12 -0
  48. trilogy/core/optimizations/base_optimization.py +17 -0
  49. trilogy/core/optimizations/hide_unused_concept.py +47 -0
  50. trilogy/core/optimizations/inline_datasource.py +102 -0
  51. trilogy/core/optimizations/predicate_pushdown.py +245 -0
  52. trilogy/core/processing/README.md +94 -0
  53. trilogy/core/processing/READMEv2.md +121 -0
  54. trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
  55. trilogy/core/processing/__init__.py +0 -0
  56. trilogy/core/processing/concept_strategies_v3.py +508 -0
  57. trilogy/core/processing/constants.py +15 -0
  58. trilogy/core/processing/discovery_node_factory.py +451 -0
  59. trilogy/core/processing/discovery_utility.py +548 -0
  60. trilogy/core/processing/discovery_validation.py +167 -0
  61. trilogy/core/processing/graph_utils.py +43 -0
  62. trilogy/core/processing/node_generators/README.md +9 -0
  63. trilogy/core/processing/node_generators/__init__.py +31 -0
  64. trilogy/core/processing/node_generators/basic_node.py +160 -0
  65. trilogy/core/processing/node_generators/common.py +268 -0
  66. trilogy/core/processing/node_generators/constant_node.py +38 -0
  67. trilogy/core/processing/node_generators/filter_node.py +315 -0
  68. trilogy/core/processing/node_generators/group_node.py +213 -0
  69. trilogy/core/processing/node_generators/group_to_node.py +117 -0
  70. trilogy/core/processing/node_generators/multiselect_node.py +205 -0
  71. trilogy/core/processing/node_generators/node_merge_node.py +653 -0
  72. trilogy/core/processing/node_generators/recursive_node.py +88 -0
  73. trilogy/core/processing/node_generators/rowset_node.py +165 -0
  74. trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
  75. trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
  76. trilogy/core/processing/node_generators/select_merge_node.py +748 -0
  77. trilogy/core/processing/node_generators/select_node.py +95 -0
  78. trilogy/core/processing/node_generators/synonym_node.py +98 -0
  79. trilogy/core/processing/node_generators/union_node.py +91 -0
  80. trilogy/core/processing/node_generators/unnest_node.py +182 -0
  81. trilogy/core/processing/node_generators/window_node.py +201 -0
  82. trilogy/core/processing/nodes/README.md +28 -0
  83. trilogy/core/processing/nodes/__init__.py +179 -0
  84. trilogy/core/processing/nodes/base_node.py +519 -0
  85. trilogy/core/processing/nodes/filter_node.py +75 -0
  86. trilogy/core/processing/nodes/group_node.py +194 -0
  87. trilogy/core/processing/nodes/merge_node.py +420 -0
  88. trilogy/core/processing/nodes/recursive_node.py +46 -0
  89. trilogy/core/processing/nodes/select_node_v2.py +242 -0
  90. trilogy/core/processing/nodes/union_node.py +53 -0
  91. trilogy/core/processing/nodes/unnest_node.py +62 -0
  92. trilogy/core/processing/nodes/window_node.py +56 -0
  93. trilogy/core/processing/utility.py +823 -0
  94. trilogy/core/query_processor.py +596 -0
  95. trilogy/core/statements/README.md +35 -0
  96. trilogy/core/statements/__init__.py +0 -0
  97. trilogy/core/statements/author.py +536 -0
  98. trilogy/core/statements/build.py +0 -0
  99. trilogy/core/statements/common.py +20 -0
  100. trilogy/core/statements/execute.py +155 -0
  101. trilogy/core/table_processor.py +66 -0
  102. trilogy/core/utility.py +8 -0
  103. trilogy/core/validation/README.md +46 -0
  104. trilogy/core/validation/__init__.py +0 -0
  105. trilogy/core/validation/common.py +161 -0
  106. trilogy/core/validation/concept.py +146 -0
  107. trilogy/core/validation/datasource.py +227 -0
  108. trilogy/core/validation/environment.py +73 -0
  109. trilogy/core/validation/fix.py +256 -0
  110. trilogy/dialect/__init__.py +32 -0
  111. trilogy/dialect/base.py +1392 -0
  112. trilogy/dialect/bigquery.py +308 -0
  113. trilogy/dialect/common.py +147 -0
  114. trilogy/dialect/config.py +144 -0
  115. trilogy/dialect/dataframe.py +50 -0
  116. trilogy/dialect/duckdb.py +231 -0
  117. trilogy/dialect/enums.py +147 -0
  118. trilogy/dialect/metadata.py +173 -0
  119. trilogy/dialect/mock.py +190 -0
  120. trilogy/dialect/postgres.py +117 -0
  121. trilogy/dialect/presto.py +110 -0
  122. trilogy/dialect/results.py +89 -0
  123. trilogy/dialect/snowflake.py +129 -0
  124. trilogy/dialect/sql_server.py +137 -0
  125. trilogy/engine.py +48 -0
  126. trilogy/execution/config.py +75 -0
  127. trilogy/executor.py +568 -0
  128. trilogy/hooks/__init__.py +4 -0
  129. trilogy/hooks/base_hook.py +40 -0
  130. trilogy/hooks/graph_hook.py +139 -0
  131. trilogy/hooks/query_debugger.py +166 -0
  132. trilogy/metadata/__init__.py +0 -0
  133. trilogy/parser.py +10 -0
  134. trilogy/parsing/README.md +21 -0
  135. trilogy/parsing/__init__.py +0 -0
  136. trilogy/parsing/common.py +1069 -0
  137. trilogy/parsing/config.py +5 -0
  138. trilogy/parsing/exceptions.py +8 -0
  139. trilogy/parsing/helpers.py +1 -0
  140. trilogy/parsing/parse_engine.py +2813 -0
  141. trilogy/parsing/render.py +769 -0
  142. trilogy/parsing/trilogy.lark +540 -0
  143. trilogy/py.typed +0 -0
  144. trilogy/render.py +42 -0
  145. trilogy/scripts/README.md +9 -0
  146. trilogy/scripts/__init__.py +0 -0
  147. trilogy/scripts/agent.py +41 -0
  148. trilogy/scripts/agent_info.py +303 -0
  149. trilogy/scripts/common.py +355 -0
  150. trilogy/scripts/dependency/Cargo.lock +617 -0
  151. trilogy/scripts/dependency/Cargo.toml +39 -0
  152. trilogy/scripts/dependency/README.md +131 -0
  153. trilogy/scripts/dependency/build.sh +25 -0
  154. trilogy/scripts/dependency/src/directory_resolver.rs +177 -0
  155. trilogy/scripts/dependency/src/lib.rs +16 -0
  156. trilogy/scripts/dependency/src/main.rs +770 -0
  157. trilogy/scripts/dependency/src/parser.rs +435 -0
  158. trilogy/scripts/dependency/src/preql.pest +208 -0
  159. trilogy/scripts/dependency/src/python_bindings.rs +303 -0
  160. trilogy/scripts/dependency/src/resolver.rs +716 -0
  161. trilogy/scripts/dependency/tests/base.preql +3 -0
  162. trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
  163. trilogy/scripts/dependency/tests/customer.preql +6 -0
  164. trilogy/scripts/dependency/tests/main.preql +9 -0
  165. trilogy/scripts/dependency/tests/orders.preql +7 -0
  166. trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
  167. trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
  168. trilogy/scripts/dependency.py +323 -0
  169. trilogy/scripts/display.py +512 -0
  170. trilogy/scripts/environment.py +46 -0
  171. trilogy/scripts/fmt.py +32 -0
  172. trilogy/scripts/ingest.py +471 -0
  173. trilogy/scripts/ingest_helpers/__init__.py +1 -0
  174. trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
  175. trilogy/scripts/ingest_helpers/formatting.py +93 -0
  176. trilogy/scripts/ingest_helpers/typing.py +161 -0
  177. trilogy/scripts/init.py +105 -0
  178. trilogy/scripts/parallel_execution.py +713 -0
  179. trilogy/scripts/plan.py +189 -0
  180. trilogy/scripts/run.py +63 -0
  181. trilogy/scripts/serve.py +140 -0
  182. trilogy/scripts/serve_helpers/__init__.py +41 -0
  183. trilogy/scripts/serve_helpers/file_discovery.py +142 -0
  184. trilogy/scripts/serve_helpers/index_generation.py +206 -0
  185. trilogy/scripts/serve_helpers/models.py +38 -0
  186. trilogy/scripts/single_execution.py +131 -0
  187. trilogy/scripts/testing.py +119 -0
  188. trilogy/scripts/trilogy.py +68 -0
  189. trilogy/std/__init__.py +0 -0
  190. trilogy/std/color.preql +3 -0
  191. trilogy/std/date.preql +13 -0
  192. trilogy/std/display.preql +18 -0
  193. trilogy/std/geography.preql +22 -0
  194. trilogy/std/metric.preql +15 -0
  195. trilogy/std/money.preql +67 -0
  196. trilogy/std/net.preql +14 -0
  197. trilogy/std/ranking.preql +7 -0
  198. trilogy/std/report.preql +5 -0
  199. trilogy/std/semantic.preql +6 -0
  200. trilogy/utility.py +34 -0
@@ -0,0 +1,748 @@
from functools import reduce
from typing import TYPE_CHECKING, List, Optional

import networkx as nx

from trilogy.constants import logger
from trilogy.core.enums import Derivation
from trilogy.core.graph_models import (
    ReferenceGraph,
    concept_to_node,
    get_graph_exact_match,
    prune_sources_for_aggregates,
    prune_sources_for_conditions,
)
from trilogy.core.models.build import (
    BuildConcept,
    BuildDatasource,
    BuildGrain,
    BuildWhereClause,
    CanonicalBuildConceptList,
)
from trilogy.core.models.build_environment import BuildEnvironment
from trilogy.core.processing.node_generators.select_helpers.datasource_injection import (
    get_union_sources,
)
from trilogy.core.processing.nodes import (
    ConstantNode,
    GroupNode,
    MergeNode,
    SelectNode,
    StrategyNode,
)
from trilogy.core.processing.utility import padding

if TYPE_CHECKING:
    from trilogy.core.processing.nodes.union_node import UnionNode

LOGGER_PREFIX = "[GEN_ROOT_MERGE_NODE]"

def extract_address(node: str):
    return node.split("~")[1].split("@")[0]

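# Sketch, not part of the diff: graph node names appear to follow a
# "<prefix>~<address>@<grain>" convention, so extract_address recovers the
# concept address from the middle segment. The node string below is a
# hypothetical example inferred from the split logic above.
assert extract_address("c~local.order_id@local.order_id") == "local.order_id"
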
def get_graph_partial_nodes(
    g: ReferenceGraph, conditions: BuildWhereClause | None
) -> dict[str, list[str]]:
    partial: dict[str, list[str]] = {}
    for node, ds in g.datasources.items():
        if not isinstance(ds, list):
            if ds.non_partial_for and conditions == ds.non_partial_for:
                partial[node] = []
                continue
            partial[node] = [concept_to_node(c) for c in ds.partial_concepts]
        # assume union sources have no partial concepts
        else:
            partial[node] = []
    return partial

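# Sketch, not part of the diff: the returned shape, with hypothetical node
# names. Each datasource keeps a list of the concept nodes it covers only
# partially; union sources and exact condition matches map to an empty list.
example_partial_map = {
    "ds~orders": ["c~local.status@local.order_id"],
    "ds~orders-archive": [],
}
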
def get_graph_grains(g: ReferenceGraph) -> dict[str, set[str]]:
    grain_length: dict[str, set[str]] = {}
    for node, lookup in g.datasources.items():
        base: set[str] = set()
        if not isinstance(lookup, list):
            flookup = [lookup]
        else:
            flookup = lookup
        assert isinstance(flookup, list)
        grain_length[node] = reduce(
            lambda x, y: x.union(y.grain.components), flookup, base  # type: ignore
        )
    return grain_length

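# Sketch, not part of the diff: the reduce above unions grain components across
# every member of a (possibly union) source; plain sets stand in for BuildGrain.
merged = reduce(
    lambda x, y: x.union(y), [{"order_id"}, {"order_id", "product_id"}], set()
)
assert merged == {"order_id", "product_id"}
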
def subgraph_is_complete(
    nodes: list[str], targets: set[str], mapping: dict[str, str], g: nx.DiGraph
) -> bool:
    # Check that all targets are present in the mapped nodes
    mapped = {mapping.get(n, n) for n in nodes}
    if not targets.issubset(mapped):
        missing = targets - mapped
        logger.debug(
            f"Subgraph {nodes} is not complete, missing targets {missing} - mapped {mapped}"
        )
        return False

    # Check that every target has at least one concept node with a datasource edge
    has_ds_edge = {target: False for target in targets}

    for node in nodes:
        if node.startswith("c~"):
            mapped_node = mapping.get(node, node)
            if mapped_node in targets and not has_ds_edge[mapped_node]:
                # Only check neighbors if we haven't found a ds edge for this mapped node yet
                if any(
                    neighbor.startswith("ds~") for neighbor in nx.neighbors(g, node)
                ):
                    has_ds_edge[mapped_node] = True

    return all(has_ds_edge.values())

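# Sketch, not part of the diff: a two-node toy graph where the single target
# concept is bound to a datasource, so the subgraph is complete. All names are
# hypothetical.
toy = nx.DiGraph()
toy.add_edge("ds~orders", "c~local.order_id")
toy.add_edge("c~local.order_id", "ds~orders")
assert subgraph_is_complete(
    ["c~local.order_id", "ds~orders"],
    {"local.order_id"},
    {"c~local.order_id": "local.order_id"},
    toy,
)
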
def create_pruned_concept_graph(
    g: ReferenceGraph,
    all_concepts: List[BuildConcept],
    datasources: list[BuildDatasource],
    accept_partial: bool = False,
    conditions: BuildWhereClause | None = None,
    depth: int = 0,
) -> Optional[nx.DiGraph]:
    orig_g = g

    g = g.copy()
    union_options = get_union_sources(datasources, all_concepts)

    for ds_list in union_options:
        node_address = "ds~" + "-".join([x.name for x in ds_list])
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} injecting potentially relevant union datasource {node_address}"
        )
        common: set[BuildConcept] = set.intersection(
            *[set(x.output_concepts) for x in ds_list]
        )
        g.add_datasource_node(node_address, ds_list)
        for c in common:
            cnode = concept_to_node(c)
            g.add_edge(node_address, cnode)
            g.add_edge(cnode, node_address)
    prune_sources_for_conditions(g, accept_partial, conditions)
    prune_sources_for_aggregates(g, all_concepts, logger)
    target_addresses = set([c.canonical_address for c in all_concepts])
    concepts: dict[str, BuildConcept] = orig_g.concepts
    datasource_map: dict[str, BuildDatasource] = orig_g.datasources
    relevant_concepts_pre = {
        n: x.canonical_address
        for n in g.nodes()
        # filter out synonyms
        if (x := concepts.get(n, None)) and x.canonical_address in target_addresses
    }

    relevant_concepts: list[str] = list(relevant_concepts_pre.keys())
    relevant_datasets: list[str] = []
    if not accept_partial:
        partial = get_graph_partial_nodes(g, conditions)
        to_remove = []
        for edge in g.edges:
            if (
                edge[0] in datasource_map
                and (pnodes := partial.get(edge[0], []))
                and edge[1] in pnodes
            ):
                to_remove.append(edge)
            if (
                edge[1] in datasource_map
                and (pnodes := partial.get(edge[1], []))
                and edge[0] in pnodes
            ):
                to_remove.append(edge)
        for edge in to_remove:
            g.remove_edge(*edge)

    for n in g.datasources:
        if any([[n, x] in g.edges for x in relevant_concepts]):
            relevant_datasets.append(n)
            continue
    logger.debug(f"Relevant datasets after pruning: {relevant_datasets}")
    # for injecting extra join concepts that are shared between datasets,
    # use the original graph, pre-partial-pruning
    for n in orig_g.concepts:
        # re-add ignoring grain;
        # we want to join inclusive of all concepts
        if n not in relevant_concepts:
            n_neighbors = nx.all_neighbors(orig_g, n)
            # check if the irrelevant concept is a join between
            # two relevant datasets
            neighbors = set()
            for neighbor in n_neighbors:
                if neighbor in relevant_datasets:
                    neighbors.add(neighbor)
            if len(neighbors) > 1:
                relevant_concepts.append(n)
                continue
    g.remove_nodes_from(
        [
            n
            for n in g.nodes()
            if n not in relevant_datasets and n not in relevant_concepts
        ]
    )
    # from trilogy.hooks.graph_hook import GraphHook
    # GraphHook().query_graph_built(g)
    subgraphs = list(nx.connected_components(g.to_undirected()))
    subgraphs = [
        s
        for s in subgraphs
        if subgraph_is_complete(s, target_addresses, relevant_concepts_pre, g)
    ]
    if not subgraphs:
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - no subgraphs after node prune"
        )
        return None

    if len(subgraphs) != 1:
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - subgraphs are split - have {len(subgraphs)} from {subgraphs}"
        )
        return None
    # add back any relevant edges that might have been partially filtered
    relevant = set(relevant_concepts + relevant_datasets)
    for edge in orig_g.edges():
        if edge[0] in relevant and edge[1] in relevant:
            g.add_edge(edge[0], edge[1], fast=True)
    # if we have no ds nodes at all for non-constants, we can't resolve this
    if not any([n.startswith("ds~") for n in g.nodes]):
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - No datasource nodes found"
        )
        return None
    return g

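# Sketch, not part of the diff: injected union sources only expose concepts
# common to every member, mirroring the set.intersection call above.
common = set.intersection({"order_id", "status"}, {"order_id", "region"})
assert common == {"order_id"}
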
# def deduplicate_nodes(subgraph: nx.DiGraph, nodes: list[str], partial_map: dict[str, list[str]], depth: int) -> list[str]:
#     """
#     Remove duplicate datasource nodes that are connected to the same concepts
#     and have the same partial state, keeping the one with the most unique concepts.

#     Args:
#         subgraph: NetworkX DiGraph containing the nodes and edges
#         nodes: List of node names to deduplicate
#         partial_map: Map of datasource to partial nodes

#     Returns:
#         List of deduplicated node names
#     """
#     # Filter for datasource nodes only
#     ds_nodes = [node for node in nodes if node.startswith("ds~")]
#     non_ds_nodes = [node for node in nodes if not node.startswith("ds~")]

#     if len(ds_nodes) <= 1:
#         return nodes  # No deduplication needed

#     # Build a map of each datasource to its connected concepts and partial state
#     ds_info = {}

#     for ds_node in ds_nodes:
#         # Get connected concept nodes (nodes starting with "c~")
#         connected_concepts = set()
#         for neighbor in subgraph.neighbors(ds_node):
#             if neighbor.startswith("c~"):
#                 connected_concepts.add(neighbor)

#         # Get partial state for this datasource
#         partial_state = tuple(sorted(partial_map.get(ds_node, [])))

#         ds_info[ds_node] = {
#             'concepts': connected_concepts,
#             'partial_state': partial_state
#         }

#     # Find datasources to remove (those that are subsets of others)
#     nodes_to_remove = set()
#     logger.info('LOOK HERE')
#     logger.info(ds_info)
#     for ds_a, info_a in ds_info.items():
#         for ds_b, info_b in ds_info.items():
#             if ds_a != ds_b and ds_a not in nodes_to_remove:
#                 # Check if ds_a is a subset of ds_b (same partial state and concepts are subset)
#                 if (info_a['partial_state'] == info_b['partial_state'] and
#                     info_a['concepts'].issubset(info_b['concepts']) and
#                     len(info_a['concepts']) < len(info_b['concepts'])):
#                     # ds_a connects to fewer concepts than ds_b, so remove ds_a
#                     nodes_to_remove.add(ds_a)
#                 elif (info_a['partial_state'] == info_b['partial_state'] and
#                       info_a['concepts'] == info_b['concepts']):
#                     # Exact same concepts and partial state - keep one arbitrarily
#                     # (keep the lexicographically smaller one for consistency)
#                     if ds_a > ds_b:
#                         nodes_to_remove.add(ds_a)

#     # Keep datasource nodes that weren't marked for removal
#     logger.info(f"{padding(depth)}{LOGGER_PREFIX} Removing duplicate datasource nodes: {nodes_to_remove}")
#     deduplicated_ds_nodes = [ds for ds in ds_nodes if ds not in nodes_to_remove]

#     # Return deduplicated datasource nodes plus all non-datasource nodes
#     return deduplicated_ds_nodes + non_ds_nodes

def resolve_subgraphs(
    g: ReferenceGraph,
    relevant: list[BuildConcept],
    accept_partial: bool,
    conditions: BuildWhereClause | None,
    depth: int = 0,
) -> dict[str, list[str]]:
    """When we have multiple distinct subgraphs within our matched
    nodes that can satisfy a query, resolve which one of those we should
    ultimately use.
    This should generally return one subgraph for each
    unique set of sub-concepts that can be referenced,
    discarding duplicates.
    Duplicate subgraphs will be resolved based on which
    ones are most 'optimal' to use, a heuristic
    that can evolve in the future but is currently based on datasource
    cardinality."""
    datasources = [n for n in g.nodes if n.startswith("ds~")]
    canonical_relevant = set([c.canonical_address for c in relevant])
    canonical_map = {c.canonical_address: c.address for c in relevant}
    subgraphs: dict[str, list[str]] = {
        ds: list(set(list(nx.all_neighbors(g, ds)))) for ds in datasources
    }
    partial_map = get_graph_partial_nodes(g, conditions)
    exact_map = get_graph_exact_match(g, accept_partial, conditions)
    grain_length = get_graph_grains(g)
    concepts: dict[str, BuildConcept] = g.concepts
    non_partial_map = {
        ds: [
            concepts[c].canonical_address
            for c in subgraphs[ds]
            if c not in partial_map[ds]
        ]
        for ds in datasources
    }
    concept_map = {
        ds: [concepts[c].canonical_address for c in subgraphs[ds]] for ds in datasources
    }
    pruned_subgraphs = {}

    def score_node(node: str):
        logger.debug(f"{padding(depth)}{LOGGER_PREFIX} scoring node {node}")
        grain = grain_length[node]
        # first - go for lowest grain,
        # but if the object we want is in the grain, treat that as "free"
        # ex - pick a source with grain(product_id) over grain(order_id)
        # when going for product_id
        score = (
            len(list(grain)) - sum([1 for x in concept_map[node] if x in grain]),
            # then check if it's an exact condition match
            0 if node in exact_map else 0.5,
            # last, number of concepts
            len(subgraphs[node]),
            node,
        )
        logger.debug(f"{padding(depth)}{LOGGER_PREFIX} node {node} has score {score}")
        return score

    for key, nodes in subgraphs.items():
        value = non_partial_map[key]
        all_concepts = concept_map[key]
        is_subset = False
        matches = set()
        # Compare the current list with the other lists
        for other_key, other_all_concepts in concept_map.items():
            other_value = non_partial_map[other_key]
            # needs to be a subset of non-partial and a subset of all
            if (
                key != other_key
                and set(value).issubset(set(other_value))
                and set(all_concepts).issubset(set(other_all_concepts))
            ):
                if len(value) < len(other_value):
                    is_subset = True
                    logger.debug(
                        f"{padding(depth)}{LOGGER_PREFIX} Dropping subgraph {key} with {value} as it is a subset of {other_key} with {other_value}"
                    )
                elif len(value) == len(other_value) and len(all_concepts) == len(
                    other_all_concepts
                ):
                    matches.add(other_key)
                    matches.add(key)
        if matches and not is_subset:
            min_node = min(matches, key=score_node)
            logger.debug(
                f"{padding(depth)}{LOGGER_PREFIX} minimum source score is {min_node}"
            )
            is_subset = key != min_node
        if not is_subset:
            pruned_subgraphs[key] = nodes

    final_nodes: set[str] = set([n for v in pruned_subgraphs.values() for n in v])
    relevant_concepts_pre = {
        n: x.canonical_address
        for n in g.nodes()
        # filter out synonyms
        if (x := concepts.get(n, None)) and x.canonical_address in canonical_relevant
    }
    logger.debug(
        f"{padding(depth)}{LOGGER_PREFIX} Final nodes before relevance pruning: {final_nodes}"
    )
    for node in final_nodes:
        keep = True
        if node.startswith("c~") and node not in relevant_concepts_pre:
            keep = (
                sum(
                    [
                        1 if node in sub_nodes else 0
                        for _, sub_nodes in pruned_subgraphs.items()
                    ]
                )
                > 1
            )
        if not keep:
            logger.debug(
                f"{padding(depth)}{LOGGER_PREFIX} Pruning node {node} as irrelevant after subgraph resolution"
            )
            pruned_subgraphs = {
                canonical_map.get(k, k): [n for n in v if n != node]
                for k, v in pruned_subgraphs.items()
            }

    return pruned_subgraphs

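# Sketch, not part of the diff: score_node returns tuples, which Python compares
# lexicographically, so effective grain size dominates, then exact condition
# match, then concept count, with the node name as a stable tiebreaker.
scores = [(1, 0.5, 3, "ds~orders"), (1, 0.0, 3, "ds~orders_filtered")]
assert min(scores) == (1, 0.0, 3, "ds~orders_filtered")
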
def create_datasource_node(
    datasource: BuildDatasource,
    all_concepts: List[BuildConcept],
    accept_partial: bool,
    environment: BuildEnvironment,
    depth: int,
    conditions: BuildWhereClause | None = None,
) -> tuple[StrategyNode, bool]:
    target_grain = BuildGrain.from_concepts(all_concepts, environment=environment)
    # datasource grain may have changed since reference graph creation
    datasource_grain = BuildGrain.from_concepts(
        datasource.grain.components, environment=environment
    )
    # datasource_grain = datasource.grain
    force_group = False
    if not datasource_grain.issubset(target_grain):
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node must be wrapped in group, {datasource_grain} not subset of target grain {target_grain} from {all_concepts}"
        )
        force_group = True
    else:
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node grain {datasource_grain} is subset of target grain {target_grain}, no group required"
        )
    if not datasource_grain.components:
        force_group = True
    partial_concepts = [
        c.concept
        for c in datasource.columns
        if not c.is_complete and c.concept.address in all_concepts
    ]

    partial_lcl = CanonicalBuildConceptList(concepts=partial_concepts)
    nullable_concepts = [
        c.concept
        for c in datasource.columns
        if c.is_nullable and c.concept.address in all_concepts
    ]

    nullable_lcl = CanonicalBuildConceptList(concepts=nullable_concepts)
    partial_is_full = conditions and (conditions == datasource.non_partial_for)

    datasource_conditions = datasource.where.conditional if datasource.where else None
    all_inputs = [c.concept for c in datasource.columns]
    canonical_all = CanonicalBuildConceptList(concepts=all_inputs)

    # if we're binding via a canonical address association,
    # we need to add it here.
    for x in all_concepts:
        if x not in all_inputs and x in canonical_all:
            all_inputs.append(x)

    rval = SelectNode(
        input_concepts=all_inputs,
        output_concepts=sorted(all_concepts, key=lambda x: x.address),
        environment=environment,
        parents=[],
        depth=depth,
        partial_concepts=(
            [] if partial_is_full else [c for c in all_concepts if c in partial_lcl]
        ),
        nullable_concepts=[c for c in all_concepts if c in nullable_lcl],
        accept_partial=accept_partial,
        datasource=datasource,
        grain=datasource.grain,
        conditions=datasource_conditions,
        preexisting_conditions=(
            conditions.conditional if partial_is_full and conditions else None
        ),
    )
    return (
        rval,
        force_group,
    )

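# Sketch, not part of the diff: a source whose grain is not a subset of the
# target grain must be aggregated before use; plain sets stand in for BuildGrain.
source_grain, target_grain = {"order_id"}, {"customer_id"}
force_group = not source_grain.issubset(target_grain)  # True -> wrap in GroupNode
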
def create_union_datasource(
    datasource: list[BuildDatasource],
    all_concepts: List[BuildConcept],
    accept_partial: bool,
    environment: BuildEnvironment,
    depth: int,
    conditions: BuildWhereClause | None = None,
) -> tuple["UnionNode", bool]:
    from trilogy.core.processing.nodes.union_node import UnionNode

    logger.info(
        f"{padding(depth)}{LOGGER_PREFIX} generating union node parents with condition {conditions}"
    )
    force_group = False
    parents = []
    for x in datasource:
        subnode, fg = create_datasource_node(
            x,
            all_concepts,
            accept_partial,
            environment,
            depth + 1,
            conditions=conditions,
        )
        parents.append(subnode)
        force_group = force_group or fg
    logger.info(f"{padding(depth)}{LOGGER_PREFIX} returning union node")
    return (
        UnionNode(
            output_concepts=all_concepts,
            input_concepts=all_concepts,
            environment=environment,
            parents=parents,
            depth=depth,
            partial_concepts=[],
        ),
        force_group,
    )

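# Sketch, not part of the diff: the fold of `force_group or fg` above means one
# member needing a group forces a group over the whole union, i.e. any().
member_flags = [False, True, False]
assert reduce(lambda acc, fg: acc or fg, member_flags, False) == any(member_flags)
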
def create_select_node(
    ds_name: str,
    subgraph: list[str],
    accept_partial: bool,
    g: ReferenceGraph,
    environment: BuildEnvironment,
    depth: int,
    conditions: BuildWhereClause | None = None,
) -> StrategyNode:
    all_concepts = [
        environment.canonical_concepts[extract_address(c)]
        for c in subgraph
        if c.startswith("c~")
    ]

    if all([c.derivation == Derivation.CONSTANT for c in all_concepts]):
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} All concepts {[x.address for x in all_concepts]} are constants, returning constant node"
        )
        return ConstantNode(
            output_concepts=all_concepts,
            input_concepts=[],
            environment=environment,
            parents=[],
            depth=depth,
            # no partials for constants
            partial_concepts=[],
            force_group=False,
            preexisting_conditions=conditions.conditional if conditions else None,
        )

    datasource: BuildDatasource | list[BuildDatasource] = g.datasources[ds_name]

    if isinstance(datasource, BuildDatasource):
        bcandidate, force_group = create_datasource_node(
            datasource,
            all_concepts,
            accept_partial,
            environment,
            depth,
            conditions=conditions,
        )

    elif isinstance(datasource, list):
        bcandidate, force_group = create_union_datasource(
            datasource,
            all_concepts,
            accept_partial,
            environment,
            depth,
            conditions=conditions,
        )
    else:
        raise ValueError(f"Unknown datasource type {datasource}")

    # we need to nest the group node one level further
    if force_group is True:
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} source requires group before consumption."
        )
        candidate: StrategyNode = GroupNode(
            output_concepts=all_concepts,
            input_concepts=all_concepts,
            environment=environment,
            parents=[bcandidate],
            depth=depth + 1,
            partial_concepts=bcandidate.partial_concepts,
            nullable_concepts=bcandidate.nullable_concepts,
            preexisting_conditions=bcandidate.preexisting_conditions,
            force_group=force_group,
        )
    else:
        candidate = bcandidate

    return candidate

def gen_select_merge_node(
    all_concepts: List[BuildConcept],
    g: ReferenceGraph,
    environment: BuildEnvironment,
    depth: int,
    accept_partial: bool = False,
    conditions: BuildWhereClause | None = None,
) -> Optional[StrategyNode]:
    non_constant = [c for c in all_concepts if c.derivation != Derivation.CONSTANT]
    constants = [c for c in all_concepts if c.derivation == Derivation.CONSTANT]
    if not non_constant and constants:
        logger.info(
            f"{padding(depth)}{LOGGER_PREFIX} only constant inputs to discovery ({constants}), returning constant node directly"
        )
        for x in constants:
            logger.info(
                f"{padding(depth)}{LOGGER_PREFIX} {x} {x.lineage} {x.derivation}"
            )
        if conditions:
            if not all(
                [x.derivation == Derivation.CONSTANT for x in conditions.row_arguments]
            ):
                logger.info(
                    f"{padding(depth)}{LOGGER_PREFIX} conditions being passed in to constant node {conditions}, but not all concepts are constants, cannot generate select node."
                )
                return None
            else:
                constants += conditions.row_arguments

        return ConstantNode(
            output_concepts=constants,
            input_concepts=[],
            environment=environment,
            parents=[],
            depth=depth,
            partial_concepts=[],
            force_group=False,
            conditions=conditions.conditional if conditions else None,
        )
    attempts = [
        False,
    ]
    if accept_partial:
        attempts.append(True)
    logger.info(
        f"{padding(depth)}{LOGGER_PREFIX} searching for root source graph for concepts {[c.address for c in all_concepts]} and conditions {conditions}"
    )
    pruned_concept_graph = None
    for attempt in attempts:
        pruned_concept_graph = create_pruned_concept_graph(
            g,
            non_constant,
            accept_partial=attempt,
            conditions=conditions,
            datasources=list(environment.datasources.values()),
            depth=depth,
        )
        if pruned_concept_graph:
            logger.info(
                f"{padding(depth)}{LOGGER_PREFIX} found covering graph w/ partial flag {attempt} {list(pruned_concept_graph.nodes)}"
            )
            break

    if not pruned_concept_graph:
        logger.info(f"{padding(depth)}{LOGGER_PREFIX} no covering graph found.")
        return None

    sub_nodes = resolve_subgraphs(
        pruned_concept_graph,
        relevant=non_constant,
        accept_partial=accept_partial,
        conditions=conditions,
        depth=depth,
    )

    logger.info(f"{padding(depth)}{LOGGER_PREFIX} fetching subgraphs {sub_nodes}")

    parents = [
        create_select_node(
            k,
            subgraph,
            g=pruned_concept_graph,
            accept_partial=accept_partial,
            environment=environment,
            depth=depth,
            conditions=conditions,
        )
        for k, subgraph in sub_nodes.items()
    ]
    if not parents:
        return None

    if constants:
        parents.append(
            ConstantNode(
                output_concepts=constants,
                input_concepts=[],
                environment=environment,
                parents=[],
                depth=depth,
                partial_concepts=[],
                force_group=False,
                preexisting_conditions=conditions.conditional if conditions else None,
            )
        )

    if len(parents) == 1:
        return parents[0]
    logger.info(
        f"{padding(depth)}{LOGGER_PREFIX} Multiple parent DS nodes resolved - {[type(x) for x in parents]}, wrapping in merge"
    )

    preexisting_conditions = None
    if conditions and all(
        [
            x.preexisting_conditions
            and x.preexisting_conditions == conditions.conditional
            for x in parents
        ]
    ):
        preexisting_conditions = conditions.conditional

    base = MergeNode(
        output_concepts=all_concepts,
        input_concepts=non_constant,
        environment=environment,
        depth=depth,
        parents=parents,
        preexisting_conditions=preexisting_conditions,
    )

    return base
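
# Hedged usage sketch, not part of the diff: variable names below are
# placeholders for objects built elsewhere in trilogy.core.
# node = gen_select_merge_node(
#     all_concepts=concepts,        # List[BuildConcept] the query must cover
#     g=reference_graph,            # ReferenceGraph linking concepts and datasources
#     environment=build_env,        # BuildEnvironment
#     depth=0,
#     accept_partial=False,
#     conditions=None,
# )
# Returns None when no single connected covering subgraph exists.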