pytrilogy 0.3.148__cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as published in their public registries.
Files changed (206)
  1. LICENSE.md +19 -0
  2. _preql_import_resolver/__init__.py +5 -0
  3. _preql_import_resolver/_preql_import_resolver.cpython-312-aarch64-linux-gnu.so +0 -0
  4. pytrilogy-0.3.148.dist-info/METADATA +555 -0
  5. pytrilogy-0.3.148.dist-info/RECORD +206 -0
  6. pytrilogy-0.3.148.dist-info/WHEEL +5 -0
  7. pytrilogy-0.3.148.dist-info/entry_points.txt +2 -0
  8. pytrilogy-0.3.148.dist-info/licenses/LICENSE.md +19 -0
  9. trilogy/__init__.py +27 -0
  10. trilogy/ai/README.md +10 -0
  11. trilogy/ai/__init__.py +19 -0
  12. trilogy/ai/constants.py +92 -0
  13. trilogy/ai/conversation.py +107 -0
  14. trilogy/ai/enums.py +7 -0
  15. trilogy/ai/execute.py +50 -0
  16. trilogy/ai/models.py +34 -0
  17. trilogy/ai/prompts.py +100 -0
  18. trilogy/ai/providers/__init__.py +0 -0
  19. trilogy/ai/providers/anthropic.py +106 -0
  20. trilogy/ai/providers/base.py +24 -0
  21. trilogy/ai/providers/google.py +146 -0
  22. trilogy/ai/providers/openai.py +89 -0
  23. trilogy/ai/providers/utils.py +68 -0
  24. trilogy/authoring/README.md +3 -0
  25. trilogy/authoring/__init__.py +148 -0
  26. trilogy/constants.py +119 -0
  27. trilogy/core/README.md +52 -0
  28. trilogy/core/__init__.py +0 -0
  29. trilogy/core/constants.py +6 -0
  30. trilogy/core/enums.py +454 -0
  31. trilogy/core/env_processor.py +239 -0
  32. trilogy/core/environment_helpers.py +320 -0
  33. trilogy/core/ergonomics.py +193 -0
  34. trilogy/core/exceptions.py +123 -0
  35. trilogy/core/functions.py +1240 -0
  36. trilogy/core/graph_models.py +142 -0
  37. trilogy/core/internal.py +85 -0
  38. trilogy/core/models/__init__.py +0 -0
  39. trilogy/core/models/author.py +2662 -0
  40. trilogy/core/models/build.py +2603 -0
  41. trilogy/core/models/build_environment.py +165 -0
  42. trilogy/core/models/core.py +506 -0
  43. trilogy/core/models/datasource.py +434 -0
  44. trilogy/core/models/environment.py +756 -0
  45. trilogy/core/models/execute.py +1213 -0
  46. trilogy/core/optimization.py +251 -0
  47. trilogy/core/optimizations/__init__.py +12 -0
  48. trilogy/core/optimizations/base_optimization.py +17 -0
  49. trilogy/core/optimizations/hide_unused_concept.py +47 -0
  50. trilogy/core/optimizations/inline_datasource.py +102 -0
  51. trilogy/core/optimizations/predicate_pushdown.py +245 -0
  52. trilogy/core/processing/README.md +94 -0
  53. trilogy/core/processing/READMEv2.md +121 -0
  54. trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
  55. trilogy/core/processing/__init__.py +0 -0
  56. trilogy/core/processing/concept_strategies_v3.py +508 -0
  57. trilogy/core/processing/constants.py +15 -0
  58. trilogy/core/processing/discovery_node_factory.py +451 -0
  59. trilogy/core/processing/discovery_utility.py +548 -0
  60. trilogy/core/processing/discovery_validation.py +167 -0
  61. trilogy/core/processing/graph_utils.py +43 -0
  62. trilogy/core/processing/node_generators/README.md +9 -0
  63. trilogy/core/processing/node_generators/__init__.py +31 -0
  64. trilogy/core/processing/node_generators/basic_node.py +160 -0
  65. trilogy/core/processing/node_generators/common.py +270 -0
  66. trilogy/core/processing/node_generators/constant_node.py +38 -0
  67. trilogy/core/processing/node_generators/filter_node.py +315 -0
  68. trilogy/core/processing/node_generators/group_node.py +213 -0
  69. trilogy/core/processing/node_generators/group_to_node.py +117 -0
  70. trilogy/core/processing/node_generators/multiselect_node.py +207 -0
  71. trilogy/core/processing/node_generators/node_merge_node.py +695 -0
  72. trilogy/core/processing/node_generators/recursive_node.py +88 -0
  73. trilogy/core/processing/node_generators/rowset_node.py +165 -0
  74. trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
  75. trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
  76. trilogy/core/processing/node_generators/select_merge_node.py +786 -0
  77. trilogy/core/processing/node_generators/select_node.py +95 -0
  78. trilogy/core/processing/node_generators/synonym_node.py +98 -0
  79. trilogy/core/processing/node_generators/union_node.py +91 -0
  80. trilogy/core/processing/node_generators/unnest_node.py +182 -0
  81. trilogy/core/processing/node_generators/window_node.py +201 -0
  82. trilogy/core/processing/nodes/README.md +28 -0
  83. trilogy/core/processing/nodes/__init__.py +179 -0
  84. trilogy/core/processing/nodes/base_node.py +522 -0
  85. trilogy/core/processing/nodes/filter_node.py +75 -0
  86. trilogy/core/processing/nodes/group_node.py +194 -0
  87. trilogy/core/processing/nodes/merge_node.py +420 -0
  88. trilogy/core/processing/nodes/recursive_node.py +46 -0
  89. trilogy/core/processing/nodes/select_node_v2.py +242 -0
  90. trilogy/core/processing/nodes/union_node.py +53 -0
  91. trilogy/core/processing/nodes/unnest_node.py +62 -0
  92. trilogy/core/processing/nodes/window_node.py +56 -0
  93. trilogy/core/processing/utility.py +823 -0
  94. trilogy/core/query_processor.py +604 -0
  95. trilogy/core/statements/README.md +35 -0
  96. trilogy/core/statements/__init__.py +0 -0
  97. trilogy/core/statements/author.py +536 -0
  98. trilogy/core/statements/build.py +0 -0
  99. trilogy/core/statements/common.py +20 -0
  100. trilogy/core/statements/execute.py +155 -0
  101. trilogy/core/table_processor.py +66 -0
  102. trilogy/core/utility.py +8 -0
  103. trilogy/core/validation/README.md +46 -0
  104. trilogy/core/validation/__init__.py +0 -0
  105. trilogy/core/validation/common.py +161 -0
  106. trilogy/core/validation/concept.py +146 -0
  107. trilogy/core/validation/datasource.py +227 -0
  108. trilogy/core/validation/environment.py +73 -0
  109. trilogy/core/validation/fix.py +256 -0
  110. trilogy/dialect/__init__.py +32 -0
  111. trilogy/dialect/base.py +1431 -0
  112. trilogy/dialect/bigquery.py +314 -0
  113. trilogy/dialect/common.py +147 -0
  114. trilogy/dialect/config.py +159 -0
  115. trilogy/dialect/dataframe.py +50 -0
  116. trilogy/dialect/duckdb.py +376 -0
  117. trilogy/dialect/enums.py +149 -0
  118. trilogy/dialect/metadata.py +173 -0
  119. trilogy/dialect/mock.py +190 -0
  120. trilogy/dialect/postgres.py +117 -0
  121. trilogy/dialect/presto.py +110 -0
  122. trilogy/dialect/results.py +89 -0
  123. trilogy/dialect/snowflake.py +129 -0
  124. trilogy/dialect/sql_server.py +137 -0
  125. trilogy/engine.py +48 -0
  126. trilogy/execution/__init__.py +17 -0
  127. trilogy/execution/config.py +119 -0
  128. trilogy/execution/state/__init__.py +0 -0
  129. trilogy/execution/state/file_state_store.py +0 -0
  130. trilogy/execution/state/sqllite_state_store.py +0 -0
  131. trilogy/execution/state/state_store.py +301 -0
  132. trilogy/executor.py +656 -0
  133. trilogy/hooks/__init__.py +4 -0
  134. trilogy/hooks/base_hook.py +40 -0
  135. trilogy/hooks/graph_hook.py +135 -0
  136. trilogy/hooks/query_debugger.py +166 -0
  137. trilogy/metadata/__init__.py +0 -0
  138. trilogy/parser.py +10 -0
  139. trilogy/parsing/README.md +21 -0
  140. trilogy/parsing/__init__.py +0 -0
  141. trilogy/parsing/common.py +1069 -0
  142. trilogy/parsing/config.py +5 -0
  143. trilogy/parsing/exceptions.py +8 -0
  144. trilogy/parsing/helpers.py +1 -0
  145. trilogy/parsing/parse_engine.py +2863 -0
  146. trilogy/parsing/render.py +773 -0
  147. trilogy/parsing/trilogy.lark +544 -0
  148. trilogy/py.typed +0 -0
  149. trilogy/render.py +45 -0
  150. trilogy/scripts/README.md +9 -0
  151. trilogy/scripts/__init__.py +0 -0
  152. trilogy/scripts/agent.py +41 -0
  153. trilogy/scripts/agent_info.py +306 -0
  154. trilogy/scripts/common.py +430 -0
  155. trilogy/scripts/dependency/Cargo.lock +617 -0
  156. trilogy/scripts/dependency/Cargo.toml +39 -0
  157. trilogy/scripts/dependency/README.md +131 -0
  158. trilogy/scripts/dependency/build.sh +25 -0
  159. trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
  160. trilogy/scripts/dependency/src/lib.rs +16 -0
  161. trilogy/scripts/dependency/src/main.rs +770 -0
  162. trilogy/scripts/dependency/src/parser.rs +435 -0
  163. trilogy/scripts/dependency/src/preql.pest +208 -0
  164. trilogy/scripts/dependency/src/python_bindings.rs +311 -0
  165. trilogy/scripts/dependency/src/resolver.rs +716 -0
  166. trilogy/scripts/dependency/tests/base.preql +3 -0
  167. trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
  168. trilogy/scripts/dependency/tests/customer.preql +6 -0
  169. trilogy/scripts/dependency/tests/main.preql +9 -0
  170. trilogy/scripts/dependency/tests/orders.preql +7 -0
  171. trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
  172. trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
  173. trilogy/scripts/dependency.py +323 -0
  174. trilogy/scripts/display.py +555 -0
  175. trilogy/scripts/environment.py +59 -0
  176. trilogy/scripts/fmt.py +32 -0
  177. trilogy/scripts/ingest.py +472 -0
  178. trilogy/scripts/ingest_helpers/__init__.py +1 -0
  179. trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
  180. trilogy/scripts/ingest_helpers/formatting.py +93 -0
  181. trilogy/scripts/ingest_helpers/typing.py +161 -0
  182. trilogy/scripts/init.py +105 -0
  183. trilogy/scripts/parallel_execution.py +748 -0
  184. trilogy/scripts/plan.py +189 -0
  185. trilogy/scripts/refresh.py +106 -0
  186. trilogy/scripts/run.py +79 -0
  187. trilogy/scripts/serve.py +202 -0
  188. trilogy/scripts/serve_helpers/__init__.py +41 -0
  189. trilogy/scripts/serve_helpers/file_discovery.py +142 -0
  190. trilogy/scripts/serve_helpers/index_generation.py +206 -0
  191. trilogy/scripts/serve_helpers/models.py +38 -0
  192. trilogy/scripts/single_execution.py +131 -0
  193. trilogy/scripts/testing.py +129 -0
  194. trilogy/scripts/trilogy.py +75 -0
  195. trilogy/std/__init__.py +0 -0
  196. trilogy/std/color.preql +3 -0
  197. trilogy/std/date.preql +13 -0
  198. trilogy/std/display.preql +18 -0
  199. trilogy/std/geography.preql +22 -0
  200. trilogy/std/metric.preql +15 -0
  201. trilogy/std/money.preql +67 -0
  202. trilogy/std/net.preql +14 -0
  203. trilogy/std/ranking.preql +7 -0
  204. trilogy/std/report.preql +5 -0
  205. trilogy/std/semantic.preql +6 -0
  206. trilogy/utility.py +34 -0
@@ -0,0 +1,786 @@
+ from functools import reduce
+ from typing import TYPE_CHECKING, List, Optional
+
+ import networkx as nx
+
+ from trilogy.constants import logger
+ from trilogy.core.enums import Derivation
+ from trilogy.core.graph_models import (
+     ReferenceGraph,
+     concept_to_node,
+     get_graph_exact_match,
+     prune_sources_for_aggregates,
+     prune_sources_for_conditions,
+ )
+ from trilogy.core.models.build import (
+     BuildConcept,
+     BuildDatasource,
+     BuildGrain,
+     BuildUnionDatasource,
+     BuildWhereClause,
+     CanonicalBuildConceptList,
+ )
+ from trilogy.core.models.build_environment import BuildEnvironment
+ from trilogy.core.processing.node_generators.select_helpers.datasource_injection import (
+     get_union_sources,
+ )
+ from trilogy.core.processing.nodes import (
+     ConstantNode,
+     GroupNode,
+     MergeNode,
+     SelectNode,
+     StrategyNode,
+ )
+ from trilogy.core.processing.utility import padding
+
+ if TYPE_CHECKING:
+     from trilogy.core.processing.nodes.union_node import UnionNode
+
+ LOGGER_PREFIX = "[GEN_ROOT_MERGE_NODE]"
+
+
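+ # Graph node names follow a "c~<address>@<suffix>" convention for concepts and
+ # "ds~<name>" for datasources (see concept_to_node and the startswith checks
+ # below); extract_address recovers the address portion, e.g.
+ # extract_address("c~local.order_id@<grain>") -> "local.order_id"
+ # (illustrative address).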
+ def extract_address(node: str):
+     return node.split("~")[1].split("@")[0]
+
+
+ def get_graph_partial_nodes(
+     g: ReferenceGraph, conditions: BuildWhereClause | None
+ ) -> dict[str, list[str]]:
+     partial: dict[str, list[str]] = {}
+     for node, ds in g.datasources.items():
+         if not isinstance(ds, list):
+             if ds.non_partial_for and conditions == ds.non_partial_for:
+                 partial[node] = []
+                 continue
+             partial[node] = [concept_to_node(c) for c in ds.partial_concepts]
+         # assume union sources have no partial
+         else:
+             partial[node] = []
+     return partial
+
+
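+ # Maps each datasource node to the union of its grain components; e.g. a
+ # source at grain (order_id, product_id) yields {"order_id", "product_id"}
+ # (hypothetical addresses). List-valued entries contribute the grains of all
+ # of their members.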
+ def get_graph_grains(g: ReferenceGraph) -> dict[str, set[str]]:
+     grains: dict[str, set[str]] = {}
+     for node, lookup in g.datasources.items():
+         base: set[str] = set()
+         flookup = lookup if isinstance(lookup, list) else [lookup]
+         grains[node] = reduce(
+             lambda x, y: x.union(y.grain.components), flookup, base
+         )
+     return grains
+
+
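+ # A candidate subgraph can satisfy the query only if every target address is
+ # covered (directly or via the synonym mapping) and each target has at least
+ # one edge to a datasource node.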
+ def subgraph_is_complete(
+     nodes: list[str], targets: set[str], mapping: dict[str, str], g: nx.DiGraph
+ ) -> bool:
+     # Check that all targets are present in the mapped nodes
+     mapped = {mapping.get(n, n) for n in nodes}
+     if not targets.issubset(mapped):
+         missing = targets - mapped
+         logger.debug(
+             f"Subgraph {nodes} is not complete, missing targets {missing} - mapped {mapped}"
+         )
+         return False
+
+     # Check that every target concept has at least one datasource edge
+     has_ds_edge = {target: False for target in targets}
+
+     for node in nodes:
+         if node.startswith("c~"):
+             mapped_node = mapping.get(node, node)
+             if mapped_node in targets and not has_ds_edge[mapped_node]:
+                 # Only check neighbors if we haven't found a ds edge for this mapped node yet
+                 if any(
+                     neighbor.startswith("ds~") for neighbor in nx.neighbors(g, node)
+                 ):
+                     has_ds_edge[mapped_node] = True
+
+     return all(has_ds_edge.values())
+
+
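+ # Pruning pipeline, in order: inject candidate union datasources, prune
+ # sources invalidated by conditions or aggregates, drop partial-concept edges
+ # (unless partials are accepted), retain concepts that join two or more
+ # relevant datasources, then require a single connected subgraph covering all
+ # targets; returns None when no covering graph exists.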
+ def create_pruned_concept_graph(
+     g: ReferenceGraph,
+     all_concepts: List[BuildConcept],
+     datasources: list[BuildDatasource],
+     accept_partial: bool = False,
+     conditions: BuildWhereClause | None = None,
+     depth: int = 0,
+ ) -> Optional[nx.DiGraph]:
+     orig_g = g
+     g = g.copy()
+     union_options = get_union_sources(datasources, all_concepts)
+
+     for ds_list in union_options:
+         node_address = "ds~" + "-".join([x.name for x in ds_list])
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} injecting potentially relevant union datasource {node_address}"
+         )
+         common: set[BuildConcept] = set.intersection(
+             *[set(x.output_concepts) for x in ds_list]
+         )
+         g.datasources[node_address] = BuildUnionDatasource(children=ds_list)
+         for c in common:
+             cnode = concept_to_node(c)
+             g.add_edge(node_address, cnode)
+             g.add_edge(cnode, node_address)
+     prune_sources_for_conditions(g, accept_partial, conditions)
+     prune_sources_for_aggregates(g, all_concepts, logger)
+     target_addresses = {c.canonical_address for c in all_concepts}
+     concepts: dict[str, BuildConcept] = orig_g.concepts
+     datasource_map: dict[str, BuildDatasource | BuildUnionDatasource] = (
+         orig_g.datasources
+     )
+     relevant_concepts_pre = {
+         n: x.canonical_address
+         for n in g.nodes()
+         # filter out synonyms
+         if (x := concepts.get(n, None)) and x.canonical_address in target_addresses
+     }
+
+     relevant_concepts: list[str] = list(relevant_concepts_pre.keys())
+     relevant_datasets: list[str] = []
+     if not accept_partial:
+         partial = get_graph_partial_nodes(g, conditions)
+         to_remove = []
+         for edge in g.edges:
+             if (
+                 edge[0] in datasource_map
+                 and (pnodes := partial.get(edge[0], []))
+                 and edge[1] in pnodes
+             ):
+                 to_remove.append(edge)
+             if (
+                 edge[1] in datasource_map
+                 and (pnodes := partial.get(edge[1], []))
+                 and edge[0] in pnodes
+             ):
+                 to_remove.append(edge)
+         for edge in to_remove:
+             g.remove_edge(*edge)
+
+     g_edges = set(g.edges)
+     for n in g.datasources:
+         if any((n, x) in g_edges for x in relevant_concepts):
+             relevant_datasets.append(n)
+             continue
+     logger.debug(f"Relevant datasets after pruning: {relevant_datasets}")
+     # for injecting extra join concepts that are shared between datasets,
+     # use the original graph, pre-partial pruning
+     for n in orig_g.concepts:
+         # re-add ignoring grain; we want to join inclusive of all concepts
+         if n not in relevant_concepts:
+             n_neighbors = nx.all_neighbors(orig_g, n)
+             # check if the irrelevant concept is a join between
+             # two relevant datasets
+             neighbors = set()
+             for neighbor in n_neighbors:
+                 if neighbor in relevant_datasets:
+                     neighbors.add(neighbor)
+             if len(neighbors) > 1:
+                 relevant_concepts.append(n)
+                 continue
+     g.remove_nodes_from(
+         [
+             n
+             for n in g.nodes()
+             if n not in relevant_datasets and n not in relevant_concepts
+         ]
+     )
+     subgraphs = list(nx.connected_components(g.to_undirected()))
+     subgraphs = [
+         s
+         for s in subgraphs
+         if subgraph_is_complete(s, target_addresses, relevant_concepts_pre, g)
+     ]
+     if not subgraphs:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - no subgraphs after node prune"
+         )
+         return None
+
+     if len(subgraphs) != 1:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - subgraphs are split - have {len(subgraphs)} from {subgraphs}"
+         )
+         return None
+     # add back any relevant edges that might have been partially filtered
+     relevant = set(relevant_concepts + relevant_datasets)
+     for edge in orig_g.edges():
+         if edge[0] in relevant and edge[1] in relevant:
+             g.add_edge(edge[0], edge[1])
+     # if we have no ds nodes at all, non-constant concepts cannot be resolved
+     if not any(n.startswith("ds~") for n in g.nodes):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - no datasource nodes found"
+         )
+         return None
+     return g
+
+
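+ # Example (hypothetical addresses): if "order.customer_id" and its pseudonym
+ # "customer.id" both appear in a subgraph and only "customer.id" is in the
+ # relevant list, "order.customer_id" is dropped.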
+ def filter_pseudonym_duplicates(
+     concepts: list[BuildConcept], relevant: list[BuildConcept]
+ ) -> list[BuildConcept]:
+     """Filter out concepts whose pseudonyms are also in the list, keeping the one in relevant."""
+     relevant_addrs = {c.address for c in relevant}
+     concept_addrs = {c.address for c in concepts}
+     to_remove: set[str] = set()
+     for c in concepts:
+         for p_addr in c.pseudonyms:
+             if p_addr in concept_addrs:
+                 c_in_relevant = c.address in relevant_addrs
+                 p_in_relevant = p_addr in relevant_addrs
+                 if p_in_relevant and not c_in_relevant:
+                     to_remove.add(c.address)
+                     break
+                 elif c_in_relevant and not p_in_relevant:
+                     to_remove.add(p_addr)
+     return [c for c in concepts if c.address not in to_remove]
+
+
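+ # Tie-breaking sketch: when two sources cover the same concepts, score_node
+ # (below) prefers the source whose grain is already covered by the requested
+ # concepts (no extra fan-out), then an exact condition match, then the source
+ # binding the fewest concepts, with the node name as a stable final
+ # tie-breaker.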
+ def resolve_subgraphs(
+     g: ReferenceGraph,
+     relevant: list[BuildConcept],
+     accept_partial: bool,
+     conditions: BuildWhereClause | None,
+     depth: int = 0,
+ ) -> dict[str, list[str]]:
+     """When we have multiple distinct subgraphs within our matched
+     nodes that can satisfy a query, resolve which one of those we should
+     ultimately use.
+     This should generally return one subgraph for each
+     unique set of sub-concepts that can be referenced,
+     discarding duplicates.
+     Duplicate subgraphs are resolved based on which
+     ones are most 'optimal' to use, a heuristic
+     that can evolve in the future but is currently based on datasource
+     cardinality."""
+     datasources = [n for n in g.nodes if n.startswith("ds~")]
+     canonical_relevant = {c.canonical_address for c in relevant}
+     canonical_map = {c.canonical_address: c.address for c in relevant}
+     concepts: dict[str, BuildConcept] = g.concepts
+     subgraphs: dict[str, list[str]] = {
+         ds: list(set(nx.all_neighbors(g, ds))) for ds in datasources
+     }
+     # filter pseudonym duplicates from each subgraph, keeping the concept in relevant
+     for ds in subgraphs:
+         ds_concepts = [concepts[n] for n in subgraphs[ds] if n in concepts]
+         filtered = filter_pseudonym_duplicates(ds_concepts, relevant)
+         filtered_nodes = {concept_to_node(c) for c in filtered}
+         subgraphs[ds] = [
+             n for n in subgraphs[ds] if n not in concepts or n in filtered_nodes
+         ]
+     partial_map = get_graph_partial_nodes(g, conditions)
+     exact_map = get_graph_exact_match(g, accept_partial, conditions)
+     grain_map = get_graph_grains(g)
+     non_partial_map = {
+         ds: [
+             concepts[c].canonical_address
+             for c in subgraphs[ds]
+             if c not in partial_map[ds]
+         ]
+         for ds in datasources
+     }
+     concept_map = {
+         ds: [concepts[c].canonical_address for c in subgraphs[ds]] for ds in datasources
+     }
+     pruned_subgraphs = {}
+
+     def score_node(candidate: str):
+         logger.debug(f"{padding(depth)}{LOGGER_PREFIX} scoring node {candidate}")
+         grain = grain_map[candidate]
+         # first - go for the lowest grain,
+         # but if the object we want is in the grain, treat that as "free";
+         # ex - pick a source with grain(product_id) over grain(order_id)
+         # when going for product_id
+         score = (
+             len(grain) - sum(1 for x in concept_map[candidate] if x in grain),
+             # then check if it's an exact condition match
+             0 if candidate in exact_map else 0.5,
+             # last, number of concepts
+             len(subgraphs[candidate]),
+             candidate,
+         )
+         logger.debug(
+             f"{padding(depth)}{LOGGER_PREFIX} node {candidate} has score {score}"
+         )
+         return score
+
+     for key, nodes in subgraphs.items():
+         value = non_partial_map[key]
+         all_concepts = concept_map[key]
+         is_subset = False
+         matches = set()
+         # Compare the current list with the other lists
+         for other_key, other_all_concepts in concept_map.items():
+             other_value = non_partial_map[other_key]
+             # needs to be a subset of non-partial and a subset of all
+             if (
+                 key != other_key
+                 and set(value).issubset(set(other_value))
+                 and set(all_concepts).issubset(set(other_all_concepts))
+             ):
+                 if len(value) < len(other_value):
+                     is_subset = True
+                     logger.debug(
+                         f"{padding(depth)}{LOGGER_PREFIX} Dropping subgraph {key} with {value} as it is a subset of {other_key} with {other_value}"
+                     )
+                 elif len(value) == len(other_value) and len(all_concepts) == len(
+                     other_all_concepts
+                 ):
+                     matches.add(other_key)
+                     matches.add(key)
+         if matches and not is_subset:
+             min_node = min(matches, key=score_node)
+             logger.debug(
+                 f"{padding(depth)}{LOGGER_PREFIX} minimum source score is {min_node}"
+             )
+             is_subset = key != min_node
+         if not is_subset:
+             pruned_subgraphs[key] = nodes
+
+     final_nodes: set[str] = {n for v in pruned_subgraphs.values() for n in v}
+     relevant_concepts_pre = {
+         n: x.canonical_address
+         for n in g.nodes()
+         # filter out synonyms
+         if (x := concepts.get(n, None)) and x.canonical_address in canonical_relevant
+     }
+     logger.debug(
+         f"{padding(depth)}{LOGGER_PREFIX} Final nodes before relevance pruning: {final_nodes}"
+     )
+     for node in final_nodes:
+         keep = True
+         if node.startswith("c~") and node not in relevant_concepts_pre:
+             # keep an irrelevant concept only if it joins more than one subgraph
+             keep = (
+                 sum(
+                     1 if node in sub_nodes else 0
+                     for sub_nodes in pruned_subgraphs.values()
+                 )
+                 > 1
+             )
+         if not keep:
+             logger.debug(
+                 f"{padding(depth)}{LOGGER_PREFIX} Pruning node {node} as irrelevant after subgraph resolution"
+             )
+             pruned_subgraphs = {
+                 canonical_map.get(k, k): [n for n in v if n != node]
+                 for k, v in pruned_subgraphs.items()
+             }
+
+     return pruned_subgraphs
+
+
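+ # Returns a select node plus a flag telling the caller to wrap it in a group:
+ # grouping is forced when the datasource grain is not a subset of the target
+ # grain (e.g. reading an order-grain table at customer grain; illustrative)
+ # or when the source declares no grain at all.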
+ def create_datasource_node(
+     datasource: BuildDatasource,
+     all_concepts: List[BuildConcept],
+     accept_partial: bool,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> tuple[StrategyNode, bool]:
+     target_grain = BuildGrain.from_concepts(all_concepts, environment=environment)
+     # the datasource grain may have changed since reference graph creation
+     datasource_grain = BuildGrain.from_concepts(
+         datasource.grain.components, environment=environment
+     )
+     force_group = False
+     if not datasource_grain.issubset(target_grain):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node must be wrapped in group, {datasource_grain} not subset of target grain {target_grain} from {all_concepts}"
+         )
+         force_group = True
+     else:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node grain {datasource_grain} is subset of target grain {target_grain}, no group required"
+         )
+     if not datasource_grain.components:
+         force_group = True
+     partial_concepts = [
+         c.concept
+         for c in datasource.columns
+         if not c.is_complete and c.concept.address in all_concepts
+     ]
+     partial_lcl = CanonicalBuildConceptList(concepts=partial_concepts)
+     nullable_concepts = [
+         c.concept
+         for c in datasource.columns
+         if c.is_nullable and c.concept.address in all_concepts
+     ]
+     nullable_lcl = CanonicalBuildConceptList(concepts=nullable_concepts)
+     partial_is_full = conditions and (conditions == datasource.non_partial_for)
+
+     datasource_conditions = datasource.where.conditional if datasource.where else None
+     all_inputs = [c.concept for c in datasource.columns]
+     canonical_all = CanonicalBuildConceptList(concepts=all_inputs)
+
+     # if we're binding via a canonical address association, add it here
+     for x in all_concepts:
+         if x not in all_inputs and x in canonical_all:
+             all_inputs.append(x)
+
+     rval = SelectNode(
+         input_concepts=all_inputs,
+         output_concepts=sorted(all_concepts, key=lambda x: x.address),
+         environment=environment,
+         parents=[],
+         depth=depth,
+         partial_concepts=(
+             [] if partial_is_full else [c for c in all_concepts if c in partial_lcl]
+         ),
+         nullable_concepts=[c for c in all_concepts if c in nullable_lcl],
+         accept_partial=accept_partial,
+         datasource=datasource,
+         grain=datasource.grain,
+         conditions=datasource_conditions,
+         preexisting_conditions=(
+             conditions.conditional if partial_is_full and conditions else None
+         ),
+     )
+     return rval, force_group
+
+
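+ # Union sources are resolved recursively: one select node is built per child
+ # datasource and the results are wrapped in a UnionNode; the union must be
+ # grouped if any child requires grouping.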
+ def create_union_datasource(
+     datasource: BuildUnionDatasource,
+     all_concepts: List[BuildConcept],
+     accept_partial: bool,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> tuple["UnionNode", bool]:
+     from trilogy.core.processing.nodes.union_node import UnionNode
+
+     datasources = datasource.children
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} generating union node parents with condition {conditions}"
+     )
+     force_group = False
+     parents = []
+     for x in datasources:
+         subnode, fg = create_datasource_node(
+             x,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth + 1,
+             conditions=conditions,
+         )
+         parents.append(subnode)
+         force_group = force_group or fg
+     logger.info(f"{padding(depth)}{LOGGER_PREFIX} returning union node")
+     return (
+         UnionNode(
+             output_concepts=all_concepts,
+             input_concepts=all_concepts,
+             environment=environment,
+             parents=parents,
+             depth=depth,
+             partial_concepts=[],
+         ),
+         force_group,
+     )
+
+
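+ # Dispatch for a resolved subgraph: all-constant concept sets become a
+ # ConstantNode; otherwise a plain or union datasource node is built and, when
+ # force_group is set, nested under a GroupNode to enforce the target grain.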
+ def create_select_node(
+     ds_name: str,
+     subgraph: list[str],
+     accept_partial: bool,
+     g: ReferenceGraph,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> StrategyNode:
+     all_concepts = [
+         environment.canonical_concepts[extract_address(c)]
+         for c in subgraph
+         if c.startswith("c~")
+     ]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} all concepts: {all_concepts} from {subgraph}"
+     )
+
+     if all(c.derivation == Derivation.CONSTANT for c in all_concepts):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} All concepts {[x.address for x in all_concepts]} are constants, returning constant node"
+         )
+         return ConstantNode(
+             output_concepts=all_concepts,
+             input_concepts=[],
+             environment=environment,
+             parents=[],
+             depth=depth,
+             # no partials for constants
+             partial_concepts=[],
+             force_group=False,
+             preexisting_conditions=conditions.conditional if conditions else None,
+         )
+
+     datasource: BuildDatasource | BuildUnionDatasource = g.datasources[ds_name]
+
+     if isinstance(datasource, BuildDatasource):
+         bcandidate, force_group = create_datasource_node(
+             datasource,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth,
+             conditions=conditions,
+         )
+     elif isinstance(datasource, BuildUnionDatasource):
+         bcandidate, force_group = create_union_datasource(
+             datasource,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth,
+             conditions=conditions,
+         )
+     else:
+         raise ValueError(f"Unknown datasource type {datasource}")
+
+     # we need to nest the group node one level further
+     if force_group:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} source requires group before consumption."
+         )
+         candidate: StrategyNode = GroupNode(
+             output_concepts=all_concepts,
+             input_concepts=all_concepts,
+             environment=environment,
+             parents=[bcandidate],
+             depth=depth + 1,
+             partial_concepts=bcandidate.partial_concepts,
+             nullable_concepts=bcandidate.nullable_concepts,
+             preexisting_conditions=bcandidate.preexisting_conditions,
+             force_group=force_group,
+         )
+     else:
+         candidate = bcandidate
+
+     return candidate
+
+
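+ # Entry point. Flow sketch: split constants from non-constants, search for a
+ # covering pruned graph (retrying with partials allowed when permitted),
+ # resolve it into per-datasource subgraphs, build a select node per subgraph,
+ # and return either the single parent or a MergeNode over all parents.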
+ def gen_select_merge_node(
+     all_concepts: List[BuildConcept],
+     g: ReferenceGraph,
+     environment: BuildEnvironment,
+     depth: int,
+     accept_partial: bool = False,
+     conditions: BuildWhereClause | None = None,
+ ) -> Optional[StrategyNode]:
+     non_constant = [c for c in all_concepts if c.derivation != Derivation.CONSTANT]
+     constants = [c for c in all_concepts if c.derivation == Derivation.CONSTANT]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} generating select merge node for {all_concepts}"
+     )
+     if not non_constant and constants:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} only constant inputs to discovery ({constants}), returning constant node directly"
+         )
+         for x in constants:
+             logger.info(
+                 f"{padding(depth)}{LOGGER_PREFIX} {x} {x.lineage} {x.derivation}"
+             )
+         if conditions:
+             if not all(
+                 x.derivation == Derivation.CONSTANT for x in conditions.row_arguments
+             ):
+                 logger.info(
+                     f"{padding(depth)}{LOGGER_PREFIX} conditions being passed in to constant node {conditions}, but not all concepts are constants, cannot generate select node."
+                 )
+                 return None
+             else:
+                 constants += conditions.row_arguments
+
+         return ConstantNode(
+             output_concepts=constants,
+             input_concepts=[],
+             environment=environment,
+             parents=[],
+             depth=depth,
+             partial_concepts=[],
+             force_group=False,
+             conditions=conditions.conditional if conditions else None,
+         )
+     attempts = [False]
+     if accept_partial:
+         attempts.append(True)
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} searching for root source graph for concepts {[c.address for c in all_concepts]} and conditions {conditions}"
+     )
+     pruned_concept_graph = None
+     for attempt in attempts:
+         pruned_concept_graph = create_pruned_concept_graph(
+             g,
+             non_constant,
+             accept_partial=attempt,
+             conditions=conditions,
+             datasources=list(environment.datasources.values()),
+             depth=depth,
+         )
+         if pruned_concept_graph:
+             logger.info(
+                 f"{padding(depth)}{LOGGER_PREFIX} found covering graph w/ partial flag {attempt} {list(pruned_concept_graph.nodes)}"
+             )
+             break
+
+     if not pruned_concept_graph:
+         logger.info(f"{padding(depth)}{LOGGER_PREFIX} no covering graph found.")
+         return None
+
+     sub_nodes = resolve_subgraphs(
+         pruned_concept_graph,
+         relevant=non_constant,
+         accept_partial=accept_partial,
+         conditions=conditions,
+         depth=depth,
+     )
+
+     logger.info(f"{padding(depth)}{LOGGER_PREFIX} fetching subgraphs {sub_nodes}")
+
+     parents = [
+         create_select_node(
+             k,
+             subgraph,
+             g=pruned_concept_graph,
+             accept_partial=accept_partial,
+             environment=environment,
+             depth=depth,
+             conditions=conditions,
+         )
+         for k, subgraph in sub_nodes.items()
+     ]
+     if not parents:
+         return None
+
+     if constants:
+         parents.append(
+             ConstantNode(
+                 output_concepts=constants,
+                 input_concepts=[],
+                 environment=environment,
+                 parents=[],
+                 depth=depth,
+                 partial_concepts=[],
+                 force_group=False,
+                 preexisting_conditions=conditions.conditional if conditions else None,
+             )
+         )
+
+     if len(parents) == 1:
+         return parents[0]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} Multiple parent DS nodes resolved - {[type(x) for x in parents]}, wrapping in merge"
+     )
+
+     preexisting_conditions = None
+     if conditions and all(
+         x.preexisting_conditions
+         and x.preexisting_conditions == conditions.conditional
+         for x in parents
+     ):
+         preexisting_conditions = conditions.conditional
+
+     return MergeNode(
+         output_concepts=all_concepts,
+         input_concepts=non_constant,
+         environment=environment,
+         depth=depth,
+         parents=parents,
+         preexisting_conditions=preexisting_conditions,
+     )