pytrilogy-0.3.149-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. LICENSE.md +19 -0
  2. _preql_import_resolver/__init__.py +5 -0
  3. _preql_import_resolver/_preql_import_resolver.cp313-win_amd64.pyd +0 -0
  4. pytrilogy-0.3.149.dist-info/METADATA +555 -0
  5. pytrilogy-0.3.149.dist-info/RECORD +207 -0
  6. pytrilogy-0.3.149.dist-info/WHEEL +4 -0
  7. pytrilogy-0.3.149.dist-info/entry_points.txt +2 -0
  8. pytrilogy-0.3.149.dist-info/licenses/LICENSE.md +19 -0
  9. trilogy/__init__.py +27 -0
  10. trilogy/ai/README.md +10 -0
  11. trilogy/ai/__init__.py +19 -0
  12. trilogy/ai/constants.py +92 -0
  13. trilogy/ai/conversation.py +107 -0
  14. trilogy/ai/enums.py +7 -0
  15. trilogy/ai/execute.py +50 -0
  16. trilogy/ai/models.py +34 -0
  17. trilogy/ai/prompts.py +100 -0
  18. trilogy/ai/providers/__init__.py +0 -0
  19. trilogy/ai/providers/anthropic.py +106 -0
  20. trilogy/ai/providers/base.py +24 -0
  21. trilogy/ai/providers/google.py +146 -0
  22. trilogy/ai/providers/openai.py +89 -0
  23. trilogy/ai/providers/utils.py +68 -0
  24. trilogy/authoring/README.md +3 -0
  25. trilogy/authoring/__init__.py +148 -0
  26. trilogy/constants.py +119 -0
  27. trilogy/core/README.md +52 -0
  28. trilogy/core/__init__.py +0 -0
  29. trilogy/core/constants.py +6 -0
  30. trilogy/core/enums.py +454 -0
  31. trilogy/core/env_processor.py +239 -0
  32. trilogy/core/environment_helpers.py +320 -0
  33. trilogy/core/ergonomics.py +193 -0
  34. trilogy/core/exceptions.py +123 -0
  35. trilogy/core/functions.py +1240 -0
  36. trilogy/core/graph_models.py +142 -0
  37. trilogy/core/internal.py +85 -0
  38. trilogy/core/models/__init__.py +0 -0
  39. trilogy/core/models/author.py +2670 -0
  40. trilogy/core/models/build.py +2603 -0
  41. trilogy/core/models/build_environment.py +165 -0
  42. trilogy/core/models/core.py +506 -0
  43. trilogy/core/models/datasource.py +436 -0
  44. trilogy/core/models/environment.py +756 -0
  45. trilogy/core/models/execute.py +1213 -0
  46. trilogy/core/optimization.py +251 -0
  47. trilogy/core/optimizations/__init__.py +12 -0
  48. trilogy/core/optimizations/base_optimization.py +17 -0
  49. trilogy/core/optimizations/hide_unused_concept.py +47 -0
  50. trilogy/core/optimizations/inline_datasource.py +102 -0
  51. trilogy/core/optimizations/predicate_pushdown.py +245 -0
  52. trilogy/core/processing/README.md +94 -0
  53. trilogy/core/processing/READMEv2.md +121 -0
  54. trilogy/core/processing/VIRTUAL_UNNEST.md +30 -0
  55. trilogy/core/processing/__init__.py +0 -0
  56. trilogy/core/processing/concept_strategies_v3.py +508 -0
  57. trilogy/core/processing/constants.py +15 -0
  58. trilogy/core/processing/discovery_node_factory.py +451 -0
  59. trilogy/core/processing/discovery_utility.py +548 -0
  60. trilogy/core/processing/discovery_validation.py +167 -0
  61. trilogy/core/processing/graph_utils.py +43 -0
  62. trilogy/core/processing/node_generators/README.md +9 -0
  63. trilogy/core/processing/node_generators/__init__.py +31 -0
  64. trilogy/core/processing/node_generators/basic_node.py +160 -0
  65. trilogy/core/processing/node_generators/common.py +270 -0
  66. trilogy/core/processing/node_generators/constant_node.py +38 -0
  67. trilogy/core/processing/node_generators/filter_node.py +315 -0
  68. trilogy/core/processing/node_generators/group_node.py +213 -0
  69. trilogy/core/processing/node_generators/group_to_node.py +117 -0
  70. trilogy/core/processing/node_generators/multiselect_node.py +207 -0
  71. trilogy/core/processing/node_generators/node_merge_node.py +695 -0
  72. trilogy/core/processing/node_generators/recursive_node.py +88 -0
  73. trilogy/core/processing/node_generators/rowset_node.py +165 -0
  74. trilogy/core/processing/node_generators/select_helpers/__init__.py +0 -0
  75. trilogy/core/processing/node_generators/select_helpers/datasource_injection.py +261 -0
  76. trilogy/core/processing/node_generators/select_merge_node.py +846 -0
  77. trilogy/core/processing/node_generators/select_node.py +95 -0
  78. trilogy/core/processing/node_generators/synonym_node.py +98 -0
  79. trilogy/core/processing/node_generators/union_node.py +91 -0
  80. trilogy/core/processing/node_generators/unnest_node.py +182 -0
  81. trilogy/core/processing/node_generators/window_node.py +201 -0
  82. trilogy/core/processing/nodes/README.md +28 -0
  83. trilogy/core/processing/nodes/__init__.py +179 -0
  84. trilogy/core/processing/nodes/base_node.py +522 -0
  85. trilogy/core/processing/nodes/filter_node.py +75 -0
  86. trilogy/core/processing/nodes/group_node.py +194 -0
  87. trilogy/core/processing/nodes/merge_node.py +420 -0
  88. trilogy/core/processing/nodes/recursive_node.py +46 -0
  89. trilogy/core/processing/nodes/select_node_v2.py +242 -0
  90. trilogy/core/processing/nodes/union_node.py +53 -0
  91. trilogy/core/processing/nodes/unnest_node.py +62 -0
  92. trilogy/core/processing/nodes/window_node.py +56 -0
  93. trilogy/core/processing/utility.py +823 -0
  94. trilogy/core/query_processor.py +604 -0
  95. trilogy/core/statements/README.md +35 -0
  96. trilogy/core/statements/__init__.py +0 -0
  97. trilogy/core/statements/author.py +536 -0
  98. trilogy/core/statements/build.py +0 -0
  99. trilogy/core/statements/common.py +20 -0
  100. trilogy/core/statements/execute.py +155 -0
  101. trilogy/core/table_processor.py +66 -0
  102. trilogy/core/utility.py +8 -0
  103. trilogy/core/validation/README.md +46 -0
  104. trilogy/core/validation/__init__.py +0 -0
  105. trilogy/core/validation/common.py +161 -0
  106. trilogy/core/validation/concept.py +146 -0
  107. trilogy/core/validation/datasource.py +227 -0
  108. trilogy/core/validation/environment.py +73 -0
  109. trilogy/core/validation/fix.py +256 -0
  110. trilogy/dialect/__init__.py +32 -0
  111. trilogy/dialect/base.py +1432 -0
  112. trilogy/dialect/bigquery.py +314 -0
  113. trilogy/dialect/common.py +147 -0
  114. trilogy/dialect/config.py +159 -0
  115. trilogy/dialect/dataframe.py +50 -0
  116. trilogy/dialect/duckdb.py +397 -0
  117. trilogy/dialect/enums.py +151 -0
  118. trilogy/dialect/metadata.py +173 -0
  119. trilogy/dialect/mock.py +190 -0
  120. trilogy/dialect/postgres.py +117 -0
  121. trilogy/dialect/presto.py +110 -0
  122. trilogy/dialect/results.py +89 -0
  123. trilogy/dialect/snowflake.py +129 -0
  124. trilogy/dialect/sql_server.py +137 -0
  125. trilogy/engine.py +48 -0
  126. trilogy/execution/__init__.py +17 -0
  127. trilogy/execution/config.py +119 -0
  128. trilogy/execution/state/__init__.py +0 -0
  129. trilogy/execution/state/exceptions.py +26 -0
  130. trilogy/execution/state/file_state_store.py +0 -0
  131. trilogy/execution/state/sqllite_state_store.py +0 -0
  132. trilogy/execution/state/state_store.py +406 -0
  133. trilogy/executor.py +692 -0
  134. trilogy/hooks/__init__.py +4 -0
  135. trilogy/hooks/base_hook.py +40 -0
  136. trilogy/hooks/graph_hook.py +135 -0
  137. trilogy/hooks/query_debugger.py +166 -0
  138. trilogy/metadata/__init__.py +0 -0
  139. trilogy/parser.py +10 -0
  140. trilogy/parsing/README.md +21 -0
  141. trilogy/parsing/__init__.py +0 -0
  142. trilogy/parsing/common.py +1069 -0
  143. trilogy/parsing/config.py +5 -0
  144. trilogy/parsing/exceptions.py +8 -0
  145. trilogy/parsing/helpers.py +1 -0
  146. trilogy/parsing/parse_engine.py +2876 -0
  147. trilogy/parsing/render.py +775 -0
  148. trilogy/parsing/trilogy.lark +546 -0
  149. trilogy/py.typed +0 -0
  150. trilogy/render.py +45 -0
  151. trilogy/scripts/README.md +9 -0
  152. trilogy/scripts/__init__.py +0 -0
  153. trilogy/scripts/agent.py +41 -0
  154. trilogy/scripts/agent_info.py +306 -0
  155. trilogy/scripts/common.py +432 -0
  156. trilogy/scripts/dependency/Cargo.lock +617 -0
  157. trilogy/scripts/dependency/Cargo.toml +39 -0
  158. trilogy/scripts/dependency/README.md +131 -0
  159. trilogy/scripts/dependency/build.sh +25 -0
  160. trilogy/scripts/dependency/src/directory_resolver.rs +387 -0
  161. trilogy/scripts/dependency/src/lib.rs +16 -0
  162. trilogy/scripts/dependency/src/main.rs +770 -0
  163. trilogy/scripts/dependency/src/parser.rs +435 -0
  164. trilogy/scripts/dependency/src/preql.pest +208 -0
  165. trilogy/scripts/dependency/src/python_bindings.rs +311 -0
  166. trilogy/scripts/dependency/src/resolver.rs +716 -0
  167. trilogy/scripts/dependency/tests/base.preql +3 -0
  168. trilogy/scripts/dependency/tests/cli_integration.rs +377 -0
  169. trilogy/scripts/dependency/tests/customer.preql +6 -0
  170. trilogy/scripts/dependency/tests/main.preql +9 -0
  171. trilogy/scripts/dependency/tests/orders.preql +7 -0
  172. trilogy/scripts/dependency/tests/test_data/base.preql +9 -0
  173. trilogy/scripts/dependency/tests/test_data/consumer.preql +1 -0
  174. trilogy/scripts/dependency.py +323 -0
  175. trilogy/scripts/display.py +555 -0
  176. trilogy/scripts/environment.py +59 -0
  177. trilogy/scripts/fmt.py +32 -0
  178. trilogy/scripts/ingest.py +487 -0
  179. trilogy/scripts/ingest_helpers/__init__.py +1 -0
  180. trilogy/scripts/ingest_helpers/foreign_keys.py +123 -0
  181. trilogy/scripts/ingest_helpers/formatting.py +93 -0
  182. trilogy/scripts/ingest_helpers/typing.py +161 -0
  183. trilogy/scripts/init.py +105 -0
  184. trilogy/scripts/parallel_execution.py +762 -0
  185. trilogy/scripts/plan.py +189 -0
  186. trilogy/scripts/refresh.py +161 -0
  187. trilogy/scripts/run.py +79 -0
  188. trilogy/scripts/serve.py +202 -0
  189. trilogy/scripts/serve_helpers/__init__.py +41 -0
  190. trilogy/scripts/serve_helpers/file_discovery.py +142 -0
  191. trilogy/scripts/serve_helpers/index_generation.py +206 -0
  192. trilogy/scripts/serve_helpers/models.py +38 -0
  193. trilogy/scripts/single_execution.py +131 -0
  194. trilogy/scripts/testing.py +143 -0
  195. trilogy/scripts/trilogy.py +75 -0
  196. trilogy/std/__init__.py +0 -0
  197. trilogy/std/color.preql +3 -0
  198. trilogy/std/date.preql +13 -0
  199. trilogy/std/display.preql +18 -0
  200. trilogy/std/geography.preql +22 -0
  201. trilogy/std/metric.preql +15 -0
  202. trilogy/std/money.preql +67 -0
  203. trilogy/std/net.preql +14 -0
  204. trilogy/std/ranking.preql +7 -0
  205. trilogy/std/report.preql +5 -0
  206. trilogy/std/semantic.preql +6 -0
  207. trilogy/utility.py +34 -0
trilogy/core/processing/node_generators/select_merge_node.py
@@ -0,0 +1,846 @@
+ from functools import reduce
+ from typing import TYPE_CHECKING, List, Optional
+
+ import networkx as nx
+
+ from trilogy.constants import logger
+ from trilogy.core.enums import AddressType, Derivation
+ from trilogy.core.graph_models import (
+     ReferenceGraph,
+     concept_to_node,
+     get_graph_exact_match,
+     prune_sources_for_aggregates,
+     prune_sources_for_conditions,
+ )
+ from trilogy.core.models.build import (
+     Address,
+     BuildConcept,
+     BuildDatasource,
+     BuildGrain,
+     BuildUnionDatasource,
+     BuildWhereClause,
+     CanonicalBuildConceptList,
+ )
+ from trilogy.core.models.build_environment import BuildEnvironment
+ from trilogy.core.processing.node_generators.select_helpers.datasource_injection import (
+     get_union_sources,
+ )
+ from trilogy.core.processing.nodes import (
+     ConstantNode,
+     GroupNode,
+     MergeNode,
+     SelectNode,
+     StrategyNode,
+ )
+ from trilogy.core.processing.utility import padding
+
+ if TYPE_CHECKING:
+     from trilogy.core.processing.nodes.union_node import UnionNode
+
+ LOGGER_PREFIX = "[GEN_ROOT_MERGE_NODE]"
+
+
+ def extract_address(node: str):
+     return node.split("~")[1].split("@")[0]
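+ # e.g. with a (hypothetical) node name "c~orders.customer_id@orders.id", this
+ # returns "orders.customer_id"; node names are assumed to follow the
+ # "<prefix>~<address>@<grain>" convention produced by concept_to_node.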
+
+
+ def get_graph_partial_nodes(
+     g: ReferenceGraph, conditions: BuildWhereClause | None
+ ) -> dict[str, list[str]]:
+     partial: dict[str, list[str]] = {}
+     for node, ds in g.datasources.items():
+         if not isinstance(ds, list):
+             if ds.non_partial_for and conditions == ds.non_partial_for:
+                 partial[node] = []
+                 continue
+             partial[node] = [concept_to_node(c) for c in ds.partial_concepts]
+         # assume union sources have no partial
+         else:
+             partial[node] = []
+     return partial
+
+
+ def get_graph_grains(g: ReferenceGraph) -> dict[str, set[str]]:
+     grain_length: dict[str, set[str]] = {}
+     for node, lookup in g.datasources.items():
+         base: set[str] = set()
+         if not isinstance(lookup, list):
+             flookup = [lookup]
+         else:
+             flookup = lookup
+         assert isinstance(flookup, list)
+         grain_length[node] = reduce(
+             lambda x, y: x.union(y.grain.components), flookup, base
+         )
+     return grain_length
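+ # e.g. (hypothetical) a union source with children at grains {"order_id"} and
+ # {"order_id", "line_id"} resolves to {"order_id", "line_id"}; a single
+ # datasource contributes its own grain components unchanged.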
+
+
+ def get_materialization_score(address: Address | AddressType | str) -> int:
+     """Score datasource by materialization level. Lower is better (more materialized).
+
+     - 0: TABLE - fully materialized in the database
+     - 1: Static files (CSV, TSV, PARQUET) - data files that need to be read
+     - 2: Dynamic sources (QUERY, SQL) - queries that need to be executed
+     - 3: Executable scripts (PYTHON_SCRIPT) - scripts that need to run
+     """
+     if isinstance(address, str):
+         return 0
+     elif isinstance(address, AddressType):
+         address_type = address
+     else:
+         address_type = address.type
+     if address_type == AddressType.TABLE:
+         return 0
+     if address_type in (AddressType.CSV, AddressType.TSV, AddressType.PARQUET):
+         return 1
+     if address_type in (AddressType.QUERY, AddressType.SQL):
+         return 2
+     if address_type == AddressType.PYTHON_SCRIPT:
+         return 3
+     return 2
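+ # Illustrative calls, per the branches above:
+ #   get_materialization_score(AddressType.TABLE) -> 0
+ #   get_materialization_score(AddressType.PARQUET) -> 1
+ #   get_materialization_score("my_schema.my_table") -> 0 (bare strings are treated as tables)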
+
+
+ def score_datasource_node(
+     node: str,
+     datasources: dict[str, "BuildDatasource | BuildUnionDatasource"],
+     grain_map: dict[str, set[str]],
+     concept_map: dict[str, list[str]],
+     exact_map: set[str],
+     subgraphs: dict[str, list[str]],
+ ) -> tuple[int, int, float, int, str]:
+     """Score a datasource node for selection priority. Lower score = higher priority.
+
+     Returns a tuple of:
+     - materialization_score: 0 (table) to 3 (python script)
+     - grain_score: effective grain size (lower is better)
+     - exact_match_score: 0 if exact condition match, 0.5 otherwise
+     - concept_count: number of concepts (tiebreaker)
+     - node_name: alphabetic tiebreaker
+     """
+     ds = datasources.get(node)
+
+     # materialization score
+     if ds is None:
+         mat_score = 2
+     elif isinstance(ds, BuildDatasource):
+         mat_score = get_materialization_score(ds.address)
+     elif isinstance(ds, BuildUnionDatasource):
+         mat_score = max(
+             get_materialization_score(child.address) for child in ds.children
+         )
+     else:
+         mat_score = 2
+
+     # grain score
+     grain = grain_map[node]
+     grain_score = len(grain) - sum(1 for x in concept_map[node] if x in grain)
+
+     # exact match
+     exact_score = 0 if node in exact_map else 0.5
+
+     # concept count
+     concept_count = len(subgraphs[node])
+
+     return (mat_score, grain_score, exact_score, concept_count, node)
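+ # Score tuples compare lexicographically, so materialization dominates, then
+ # grain, then exact-match, then concept count; e.g. for hypothetical nodes,
+ # (0, 2, 0.5, 3, "ds~orders") sorts ahead of (1, 0, 0, 1, "ds~events").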
+
+
+ def subgraph_is_complete(
+     nodes: list[str], targets: set[str], mapping: dict[str, str], g: nx.DiGraph
+ ) -> bool:
+     # Check if all targets are present in mapped nodes
+     mapped = {mapping.get(n, n) for n in nodes}
+     if not targets.issubset(mapped):
+         missing = targets - mapped
+         logger.debug(
+             f"Subgraph {nodes} is not complete, missing targets {missing} - mapped {mapped}"
+         )
+         return False
+
+     # Check that every target has at least one concept node with a datasource edge
+     has_ds_edge = {target: False for target in targets}
+
+     for node in nodes:
+         if node.startswith("c~"):
+             mapped_node = mapping.get(node, node)
+             if mapped_node in targets and not has_ds_edge[mapped_node]:
+                 # Only check neighbors if we haven't found a ds edge for this mapped node yet
+                 if any(
+                     neighbor.startswith("ds~") for neighbor in nx.neighbors(g, node)
+                 ):
+                     has_ds_edge[mapped_node] = True
+
+     return all(has_ds_edge.values())
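+ # e.g. (hypothetical) targets {"orders.id"} with nodes ["ds~orders",
+ # "c~orders.id@orders.id"] pass only if that concept node, once mapped to its
+ # canonical address, actually has a "ds~" neighbor in g.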
+
+
+ def create_pruned_concept_graph(
+     g: ReferenceGraph,
+     all_concepts: List[BuildConcept],
+     datasources: list[BuildDatasource],
+     accept_partial: bool = False,
+     conditions: BuildWhereClause | None = None,
+     depth: int = 0,
+ ) -> nx.DiGraph | None:
+     orig_g = g
+
+     g = g.copy()
+     union_options = get_union_sources(datasources, all_concepts)
+
+     for ds_list in union_options:
+         node_address = "ds~" + "-".join([x.name for x in ds_list])
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} injecting potentially relevant union datasource {node_address}"
+         )
+         common: set[BuildConcept] = set.intersection(
+             *[set(x.output_concepts) for x in ds_list]
+         )
+         g.datasources[node_address] = BuildUnionDatasource(children=ds_list)
+         for c in common:
+             cnode = concept_to_node(c)
+             g.add_edge(node_address, cnode)
+             g.add_edge(cnode, node_address)
+     prune_sources_for_conditions(g, accept_partial, conditions)
+     prune_sources_for_aggregates(g, all_concepts, logger)
+     target_addresses = set([c.canonical_address for c in all_concepts])
+     concepts: dict[str, BuildConcept] = orig_g.concepts
+     datasource_map: dict[str, BuildDatasource | BuildUnionDatasource] = (
+         orig_g.datasources
+     )
+     relevant_concepts_pre = {
+         n: x.canonical_address
+         for n in g.nodes()
+         # filter out synonyms
+         if (x := concepts.get(n, None)) and x.canonical_address in target_addresses
+     }
+
+     relevant_concepts: list[str] = list(relevant_concepts_pre.keys())
+     relevent_datasets: list[str] = []
+     if not accept_partial:
+         partial = get_graph_partial_nodes(g, conditions)
+         to_remove = []
+         for edge in g.edges:
+             if (
+                 edge[0] in datasource_map
+                 and (pnodes := partial.get(edge[0], []))
+                 and edge[1] in pnodes
+             ):
+                 to_remove.append(edge)
+             if (
+                 edge[1] in datasource_map
+                 and (pnodes := partial.get(edge[1], []))
+                 and edge[0] in pnodes
+             ):
+                 to_remove.append(edge)
+         for edge in to_remove:
+             g.remove_edge(*edge)
+
+     g_edges = set(g.edges)
+     for n in g.datasources:
+         if any((n, x) in g_edges for x in relevant_concepts):
+             relevent_datasets.append(n)
+     logger.debug(f"Relevant datasets after pruning: {relevent_datasets}")
+     # for injecting extra join concepts that are shared between datasets,
+     # use the original graph, pre-partial pruning
+     for n in orig_g.concepts:
+         # re-add ignoring grain; we want to join inclusive of all concepts
+         if n not in relevant_concepts:
+             n_neighbors = nx.all_neighbors(orig_g, n)
+             # check if the irrelevant concept is a join between
+             # two relevant datasets
+             neighbors = set()
+             for neighbor in n_neighbors:
+                 if neighbor in relevent_datasets:
+                     neighbors.add(neighbor)
+             if len(neighbors) > 1:
+                 relevant_concepts.append(n)
+     g.remove_nodes_from(
+         [
+             n
+             for n in g.nodes()
+             if n not in relevent_datasets and n not in relevant_concepts
+         ]
+     )
+     subgraphs = list(nx.connected_components(g.to_undirected()))
+     subgraphs = [
+         s
+         for s in subgraphs
+         if subgraph_is_complete(s, target_addresses, relevant_concepts_pre, g)
+     ]
+     if not subgraphs:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - no subgraphs after node prune"
+         )
+         return None
+
+     if len(subgraphs) != 1:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - subgraphs are split - have {len(subgraphs)} from {subgraphs}"
+         )
+         return None
+     # add back any relevant edges that might have been partially filtered
+     relevant = set(relevant_concepts + relevent_datasets)
+     for edge in orig_g.edges():
+         if edge[0] in relevant and edge[1] in relevant:
+             g.add_edge(edge[0], edge[1])
+     # if we have no ds nodes at all, for non constant, we can't find it
+     if not any(n.startswith("ds~") for n in g.nodes):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} cannot resolve root graph - No datasource nodes found"
+         )
+         return None
+     return g
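+ # On success the pruned graph is a single connected component holding only the
+ # requested concepts, any shared join concepts, and the datasources that can
+ # supply them; a None return means no single covering graph exists.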
+
+
+ # def deduplicate_nodes(subgraph: nx.DiGraph, nodes: list[str], partial_map: dict[str, list[str]], depth: int) -> list[str]:
+ #     """
+ #     Remove duplicate datasource nodes that are connected to the same concepts
+ #     and have the same partial state, keeping the one with the most unique concepts.
+
+ #     Args:
+ #         subgraph: NetworkX DiGraph containing the nodes and edges
+ #         nodes: List of node names to deduplicate
+ #         partial_map: Map of datasource to partial nodes
+
+ #     Returns:
+ #         List of deduplicated node names
+ #     """
+ #     # Filter for datasource nodes only
+ #     ds_nodes = [node for node in nodes if node.startswith("ds~")]
+ #     non_ds_nodes = [node for node in nodes if not node.startswith("ds~")]
+
+ #     if len(ds_nodes) <= 1:
+ #         return nodes  # No deduplication needed
+
+ #     # Build a map of each datasource to its connected concepts and partial state
+ #     ds_info = {}
+
+ #     for ds_node in ds_nodes:
+ #         # Get connected concept nodes (nodes starting with "c~")
+ #         connected_concepts = set()
+ #         for neighbor in subgraph.neighbors(ds_node):
+ #             if neighbor.startswith("c~"):
+ #                 connected_concepts.add(neighbor)
+
+ #         # Get partial state for this datasource
+ #         partial_state = tuple(sorted(partial_map.get(ds_node, [])))
+
+ #         ds_info[ds_node] = {
+ #             'concepts': connected_concepts,
+ #             'partial_state': partial_state
+ #         }
+
+ #     # Find datasources to remove (those that are subsets of others)
+ #     nodes_to_remove = set()
+ #     logger.info('LOOK HERE')
+ #     logger.info(ds_info)
+ #     for ds_a, info_a in ds_info.items():
+ #         for ds_b, info_b in ds_info.items():
+ #             if ds_a != ds_b and ds_a not in nodes_to_remove:
+ #                 # Check if ds_a is a subset of ds_b (same partial state and concepts are subset)
+ #                 if (info_a['partial_state'] == info_b['partial_state'] and
+ #                     info_a['concepts'].issubset(info_b['concepts']) and
+ #                     len(info_a['concepts']) < len(info_b['concepts'])):
+ #                     # ds_a connects to fewer concepts than ds_b, so remove ds_a
+ #                     nodes_to_remove.add(ds_a)
+ #                 elif (info_a['partial_state'] == info_b['partial_state'] and
+ #                       info_a['concepts'] == info_b['concepts']):
+ #                     # Exact same concepts and partial state - keep one arbitrarily
+ #                     # (keep the lexicographically smaller one for consistency)
+ #                     if ds_a > ds_b:
+ #                         nodes_to_remove.add(ds_a)
+
+ #     # Keep datasource nodes that weren't marked for removal
+ #     logger.info(f"{padding(depth)}{LOGGER_PREFIX} Removing duplicate datasource nodes: {nodes_to_remove}")
+ #     deduplicated_ds_nodes = [ds for ds in ds_nodes if ds not in nodes_to_remove]
+
+ #     # Return deduplicated datasource nodes plus all non-datasource nodes
+ #     return deduplicated_ds_nodes + non_ds_nodes
+
+
+ def filter_pseudonym_duplicates(
+     concepts: list[BuildConcept], relevant: list[BuildConcept]
+ ) -> list[BuildConcept]:
+     """Filter out concepts whose pseudonyms are also in the list, keeping the one in relevant."""
+     relevant_addrs = {c.address for c in relevant}
+     concept_addrs = {c.address for c in concepts}
+     to_remove: set[str] = set()
+     for c in concepts:
+         for p_addr in c.pseudonyms:
+             if p_addr in concept_addrs:
+                 c_in_relevant = c.address in relevant_addrs
+                 p_in_relevant = p_addr in relevant_addrs
+                 if p_in_relevant and not c_in_relevant:
+                     to_remove.add(c.address)
+                     break
+                 elif c_in_relevant and not p_in_relevant:
+                     to_remove.add(p_addr)
+     return [c for c in concepts if c.address not in to_remove]
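+ # e.g. (hypothetical) if "dates.order_date" and its pseudonym "dates.order_day"
+ # both appear but only "dates.order_date" is in relevant, the pseudonym is dropped.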
+
+
+ def resolve_subgraphs(
+     g: ReferenceGraph,
+     relevant: list[BuildConcept],
+     accept_partial: bool,
+     conditions: BuildWhereClause | None,
+     depth: int = 0,
+ ) -> dict[str, list[str]]:
+     """When we have multiple distinct subgraphs within our matched
+     nodes that can satisfy a query, resolve which one of those we should
+     ultimately use.
+     This should generally return one subgraph for each
+     unique set of sub concepts that can be referenced,
+     discarding duplicates.
+     Duplicate subgraphs will be resolved based on which
+     ones are most 'optimal' to use, a heuristic
+     that can evolve in the future but is currently based on datasource
+     cardinality."""
+     datasources = [n for n in g.nodes if n.startswith("ds~")]
+     canonical_relevant = set([c.canonical_address for c in relevant])
+     canonical_map = {c.canonical_address: c.address for c in relevant}
+     concepts: dict[str, BuildConcept] = g.concepts
+     subgraphs: dict[str, list[str]] = {
+         ds: list(set(nx.all_neighbors(g, ds))) for ds in datasources
+     }
+     # filter pseudonym duplicates from each subgraph, keeping concepts in relevant
+     for ds in subgraphs:
+         ds_concepts = [concepts[n] for n in subgraphs[ds] if n in concepts]
+         filtered = filter_pseudonym_duplicates(ds_concepts, relevant)
+         filtered_nodes = {concept_to_node(c) for c in filtered}
+         subgraphs[ds] = [
+             n for n in subgraphs[ds] if n not in concepts or n in filtered_nodes
+         ]
+     partial_map = get_graph_partial_nodes(g, conditions)
+     exact_map = get_graph_exact_match(g, accept_partial, conditions)
+     grain_length = get_graph_grains(g)
+     non_partial_map = {
+         ds: [
+             concepts[c].canonical_address
+             for c in subgraphs[ds]
+             if c not in partial_map[ds]
+         ]
+         for ds in datasources
+     }
+     concept_map = {
+         ds: [concepts[c].canonical_address for c in subgraphs[ds]] for ds in datasources
+     }
+     pruned_subgraphs = {}
+
+     def score_node(input: str):
+         logger.debug(f"{padding(depth)}{LOGGER_PREFIX} scoring node {input}")
+         score = score_datasource_node(
+             input, g.datasources, grain_length, concept_map, exact_map, subgraphs
+         )
+         logger.debug(f"{padding(depth)}{LOGGER_PREFIX} node {input} has score {score}")
+         return score
+
+     for key, nodes in subgraphs.items():
+         value = non_partial_map[key]
+         all_concepts = concept_map[key]
+         is_subset = False
+         matches = set()
+         # Compare current list with other lists
+         for other_key, other_all_concepts in concept_map.items():
+             other_value = non_partial_map[other_key]
+             # needs to be a subset of non-partial and a subset of all
+             if (
+                 key != other_key
+                 and set(value).issubset(set(other_value))
+                 and set(all_concepts).issubset(set(other_all_concepts))
+             ):
+                 if len(value) < len(other_value):
+                     is_subset = True
+                     logger.debug(
+                         f"{padding(depth)}{LOGGER_PREFIX} Dropping subgraph {key} with {value} as it is a subset of {other_key} with {other_value}"
+                     )
+                 elif len(value) == len(other_value) and len(all_concepts) == len(
+                     other_all_concepts
+                 ):
+                     matches.add(other_key)
+                     matches.add(key)
+         if matches and not is_subset:
+             min_node = min(matches, key=score_node)
+             logger.debug(
+                 f"{padding(depth)}{LOGGER_PREFIX} minimum source score is {min_node}"
+             )
+             is_subset = key != min_node
+         if not is_subset:
+             pruned_subgraphs[key] = nodes
+
+     final_nodes: set[str] = set([n for v in pruned_subgraphs.values() for n in v])
+     relevant_concepts_pre = {
+         n: x.canonical_address
+         for n in g.nodes()
+         # filter out synonyms
+         if (x := concepts.get(n, None)) and x.canonical_address in canonical_relevant
+     }
+     logger.debug(
+         f"{padding(depth)}{LOGGER_PREFIX} Final nodes before relevance pruning: {final_nodes}"
+     )
+     for node in final_nodes:
+         keep = True
+         if node.startswith("c~") and node not in relevant_concepts_pre:
+             keep = (
+                 sum(
+                     1 if node in sub_nodes else 0
+                     for _, sub_nodes in pruned_subgraphs.items()
+                 )
+                 > 1
+             )
+         if not keep:
+             logger.debug(
+                 f"{padding(depth)}{LOGGER_PREFIX} Pruning node {node} as irrelevant after subgraph resolution"
+             )
+             pruned_subgraphs = {
+                 canonical_map.get(k, k): [n for n in v if n != node]
+                 for k, v in pruned_subgraphs.items()
+             }
+
+     return pruned_subgraphs
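+ # e.g. (hypothetical) if ds~orders covers {order_id, customer_id} and
+ # ds~order_lines covers {order_id, customer_id, line_id}, the strict subset
+ # ds~orders is dropped; exact ties are broken by score_datasource_node,
+ # preferring more-materialized, lower-grain sources.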
+
+
+ def create_datasource_node(
+     datasource: BuildDatasource,
+     all_concepts: List[BuildConcept],
+     accept_partial: bool,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> tuple[StrategyNode, bool]:
+     target_grain = BuildGrain.from_concepts(all_concepts, environment=environment)
+     # datasource grain may have changed since reference graph creation
+     datasource_grain = BuildGrain.from_concepts(
+         datasource.grain.components, environment=environment
+     )
+     force_group = False
+     if not datasource_grain.issubset(target_grain):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node must be wrapped in group, {datasource_grain} not subset of target grain {target_grain} from {all_concepts}"
+         )
+         force_group = True
+     else:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX}_DS_NODE Select node grain {datasource_grain} is subset of target grain {target_grain}, no group required"
+         )
+     if not datasource_grain.components:
+         force_group = True
+     partial_concepts = [
+         c.concept
+         for c in datasource.columns
+         if not c.is_complete and c.concept.address in all_concepts
+     ]
+
+     partial_lcl = CanonicalBuildConceptList(concepts=partial_concepts)
+     nullable_concepts = [
+         c.concept
+         for c in datasource.columns
+         if c.is_nullable and c.concept.address in all_concepts
+     ]
+
+     nullable_lcl = CanonicalBuildConceptList(concepts=nullable_concepts)
+     partial_is_full = conditions and (conditions == datasource.non_partial_for)
+
+     datasource_conditions = datasource.where.conditional if datasource.where else None
+     all_inputs = [c.concept for c in datasource.columns]
+     canonical_all = CanonicalBuildConceptList(concepts=all_inputs)
+
+     # if we're binding via a canonical address association, add it here
+     for x in all_concepts:
+         if x not in all_inputs and x in canonical_all:
+             all_inputs.append(x)
+
+     rval = SelectNode(
+         input_concepts=all_inputs,
+         output_concepts=sorted(all_concepts, key=lambda x: x.address),
+         environment=environment,
+         parents=[],
+         depth=depth,
+         partial_concepts=(
+             [] if partial_is_full else [c for c in all_concepts if c in partial_lcl]
+         ),
+         nullable_concepts=[c for c in all_concepts if c in nullable_lcl],
+         accept_partial=accept_partial,
+         datasource=datasource,
+         grain=datasource.grain,
+         conditions=datasource_conditions,
+         preexisting_conditions=(
+             conditions.conditional if partial_is_full and conditions else None
+         ),
+     )
+     return (
+         rval,
+         force_group,
+     )
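+ # force_group=True tells the caller to wrap this select in a GroupNode: e.g.
+ # (hypothetical) reading {customer_id} from a source stored at {order_id} grain
+ # would duplicate customers, so rows must be re-aggregated to the target grain.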
+
+
+ def create_union_datasource(
+     datasource: BuildUnionDatasource,
+     all_concepts: List[BuildConcept],
+     accept_partial: bool,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> tuple["UnionNode", bool]:
+     from trilogy.core.processing.nodes.union_node import UnionNode
+
+     datasources = datasource.children
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} generating union node parents with condition {conditions}"
+     )
+     force_group = False
+     parents = []
+     for x in datasources:
+         subnode, fg = create_datasource_node(
+             x,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth + 1,
+             conditions=conditions,
+         )
+         parents.append(subnode)
+         force_group = force_group or fg
+     logger.info(f"{padding(depth)}{LOGGER_PREFIX} returning union node")
+     return (
+         UnionNode(
+             output_concepts=all_concepts,
+             input_concepts=all_concepts,
+             environment=environment,
+             parents=parents,
+             depth=depth,
+             partial_concepts=[],
+         ),
+         force_group,
+     )
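+ # If any child requires grouping, the whole union is marked force_group, and the
+ # caller wraps the UnionNode in a single GroupNode over the combined rows.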
+
+
+ def create_select_node(
+     ds_name: str,
+     subgraph: list[str],
+     accept_partial: bool,
+     g: ReferenceGraph,
+     environment: BuildEnvironment,
+     depth: int,
+     conditions: BuildWhereClause | None = None,
+ ) -> StrategyNode:
+     all_concepts = [
+         environment.canonical_concepts[extract_address(c)]
+         for c in subgraph
+         if c.startswith("c~")
+     ]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} all concepts: {all_concepts} from {subgraph}"
+     )
+
+     if all(c.derivation == Derivation.CONSTANT for c in all_concepts):
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} All concepts {[x.address for x in all_concepts]} are constants, returning constant node"
+         )
+         return ConstantNode(
+             output_concepts=all_concepts,
+             input_concepts=[],
+             environment=environment,
+             parents=[],
+             depth=depth,
+             # no partial for constants
+             partial_concepts=[],
+             force_group=False,
+             preexisting_conditions=conditions.conditional if conditions else None,
+         )
+
+     datasource: BuildDatasource | BuildUnionDatasource = g.datasources[ds_name]
+
+     if isinstance(datasource, BuildDatasource):
+         bcandidate, force_group = create_datasource_node(
+             datasource,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth,
+             conditions=conditions,
+         )
+     elif isinstance(datasource, BuildUnionDatasource):
+         bcandidate, force_group = create_union_datasource(
+             datasource,
+             all_concepts,
+             accept_partial,
+             environment,
+             depth,
+             conditions=conditions,
+         )
+     else:
+         raise ValueError(f"Unknown datasource type {datasource}")
+
+     # we need to nest the group node one further
+     if force_group is True:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} source requires group before consumption."
+         )
+         candidate: StrategyNode = GroupNode(
+             output_concepts=all_concepts,
+             input_concepts=all_concepts,
+             environment=environment,
+             parents=[bcandidate],
+             depth=depth + 1,
+             partial_concepts=bcandidate.partial_concepts,
+             nullable_concepts=bcandidate.nullable_concepts,
+             preexisting_conditions=bcandidate.preexisting_conditions,
+             force_group=force_group,
+         )
+     else:
+         candidate = bcandidate
+
+     return candidate
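+ # Resulting shape (hypothetical): a subgraph ["ds~orders", "c~orders.id@...",
+ # "c~orders.amount@..."] becomes SelectNode(orders), or
+ # GroupNode(parents=[SelectNode(orders)]) when grouping is forced.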
+
+
+ def gen_select_merge_node(
+     all_concepts: List[BuildConcept],
+     g: nx.DiGraph,
+     environment: BuildEnvironment,
+     depth: int,
+     accept_partial: bool = False,
+     conditions: BuildWhereClause | None = None,
+ ) -> Optional[StrategyNode]:
+     non_constant = [c for c in all_concepts if c.derivation != Derivation.CONSTANT]
+     constants = [c for c in all_concepts if c.derivation == Derivation.CONSTANT]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} generating select merge node for {all_concepts}"
+     )
+     if not non_constant and constants:
+         logger.info(
+             f"{padding(depth)}{LOGGER_PREFIX} only constant inputs to discovery ({constants}), returning constant node directly"
+         )
+         for x in constants:
+             logger.info(
+                 f"{padding(depth)}{LOGGER_PREFIX} {x} {x.lineage} {x.derivation}"
+             )
+         if conditions:
+             if not all(
+                 x.derivation == Derivation.CONSTANT for x in conditions.row_arguments
+             ):
+                 logger.info(
+                     f"{padding(depth)}{LOGGER_PREFIX} conditions being passed in to constant node {conditions}, but not all concepts are constants, cannot generate select node."
+                 )
+                 return None
+             else:
+                 constants += conditions.row_arguments
+
+         return ConstantNode(
+             output_concepts=constants,
+             input_concepts=[],
+             environment=environment,
+             parents=[],
+             depth=depth,
+             partial_concepts=[],
+             force_group=False,
+             conditions=conditions.conditional if conditions else None,
+         )
+     attempts = [False]
+     if accept_partial:
+         attempts.append(True)
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} searching for root source graph for concepts {[c.address for c in all_concepts]} and conditions {conditions}"
+     )
+     pruned_concept_graph = None
+     for attempt in attempts:
+         pruned_concept_graph = create_pruned_concept_graph(
+             g,
+             non_constant,
+             accept_partial=attempt,
+             conditions=conditions,
+             datasources=list(environment.datasources.values()),
+             depth=depth,
+         )
+         if pruned_concept_graph:
+             logger.info(
+                 f"{padding(depth)}{LOGGER_PREFIX} found covering graph w/ partial flag {attempt} {list(pruned_concept_graph.nodes)}"
+             )
+             break
+
+     if not pruned_concept_graph:
+         logger.info(f"{padding(depth)}{LOGGER_PREFIX} no covering graph found.")
+         return None
+
+     sub_nodes = resolve_subgraphs(
+         pruned_concept_graph,
+         relevant=non_constant,
+         accept_partial=accept_partial,
+         conditions=conditions,
+         depth=depth,
+     )
+
+     logger.info(f"{padding(depth)}{LOGGER_PREFIX} fetching subgraphs {sub_nodes}")
+
+     parents = [
+         create_select_node(
+             k,
+             subgraph,
+             g=pruned_concept_graph,
+             accept_partial=accept_partial,
+             environment=environment,
+             depth=depth,
+             conditions=conditions,
+         )
+         for k, subgraph in sub_nodes.items()
+     ]
+     if not parents:
+         return None
+
+     if constants:
+         parents.append(
+             ConstantNode(
+                 output_concepts=constants,
+                 input_concepts=[],
+                 environment=environment,
+                 parents=[],
+                 depth=depth,
+                 partial_concepts=[],
+                 force_group=False,
+                 preexisting_conditions=conditions.conditional if conditions else None,
+             )
+         )
+
+     if len(parents) == 1:
+         return parents[0]
+     logger.info(
+         f"{padding(depth)}{LOGGER_PREFIX} Multiple parent DS nodes resolved - {[type(x) for x in parents]}, wrapping in merge"
+     )
+
+     preexisting_conditions = None
+     if conditions and all(
+         x.preexisting_conditions
+         and x.preexisting_conditions == conditions.conditional
+         for x in parents
+     ):
+         preexisting_conditions = conditions.conditional
+
+     base = MergeNode(
+         output_concepts=all_concepts,
+         input_concepts=non_constant,
+         environment=environment,
+         depth=depth,
+         parents=parents,
+         preexisting_conditions=preexisting_conditions,
+     )
+
+     return base
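+ # Overall (hypothetical) flow: prune the reference graph to datasources covering
+ # the requested concepts, resolve one subgraph per surviving datasource, build
+ # Select/Union/Constant parents for each, and join multiple parents under a
+ # MergeNode; a None return means no covering source graph was found.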