pytrilogy 0.0.3.104__py3-none-any.whl → 0.0.3.106__py3-none-any.whl

This diff shows the published contents of two package versions as released to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.

Potentially problematic release.

This version of pytrilogy might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pytrilogy
- Version: 0.0.3.104
+ Version: 0.0.3.106
  Summary: Declarative, typed query language that compiles to SQL.
  Home-page:
  Author:
@@ -1,5 +1,5 @@
- pytrilogy-0.0.3.104.dist-info/licenses/LICENSE.md,sha256=5ZRvtTyCCFwz1THxDTjAu3Lidds9WjPvvzgVwPSYNDo,1042
- trilogy/__init__.py,sha256=HyZF9WId40s9G3BjFS2OBMeHI7XNeE-YU1cFLvNOSWk,304
+ pytrilogy-0.0.3.106.dist-info/licenses/LICENSE.md,sha256=5ZRvtTyCCFwz1THxDTjAu3Lidds9WjPvvzgVwPSYNDo,1042
+ trilogy/__init__.py,sha256=6_By-LphYYIXu7GSa5PwnchymrRabR6qiUwPJWX62EE,304
  trilogy/constants.py,sha256=g_zkVCNjGop6coZ1kM8eXXAzCnUN22ldx3TYFz0E9sc,1747
  trilogy/engine.py,sha256=3MiADf5MKcmxqiHBuRqiYdsXiLj7oitDfVvXvHrfjkA,2178
  trilogy/executor.py,sha256=KgCAQhHPT-j0rPkBbALX0f84W9-Q-bkjHayGuavg99w,16490
@@ -37,26 +37,26 @@ trilogy/core/optimizations/predicate_pushdown.py,sha256=5ubatgq1IwWQ4L2FDt4--y16
  trilogy/core/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  trilogy/core/processing/concept_strategies_v3.py,sha256=AcMU1d5uCo8I1PFCkBtmcC6iFmM9vN6xSdKxSVMGfpA,23080
  trilogy/core/processing/discovery_node_factory.py,sha256=p23jiiHyhrW-Q8ndbnRlqMHJKT8ZqPOA89SzE4xaFFo,15445
- trilogy/core/processing/discovery_utility.py,sha256=wIuLsE6yuVykeYZdIqRSagivDNU3-ooiS7z6in4yqho,11518
+ trilogy/core/processing/discovery_utility.py,sha256=ZJTTWm34zpR30p-xzElUJCTdx-XT44skKG0-LBfEOg4,12525
  trilogy/core/processing/discovery_validation.py,sha256=eZ4HfHMpqZLI8MGG2jez8arS8THs6ceuVrQFIY6gXrU,5364
  trilogy/core/processing/graph_utils.py,sha256=8QUVrkE9j-9C1AyrCb1nQEh8daCe0u1HuXl-Te85lag,1205
- trilogy/core/processing/utility.py,sha256=1_oNnk6lWiy-D7LKYr07kU_v7iAM4i6ITUAS4bIiCr4,23444
+ trilogy/core/processing/utility.py,sha256=ESs6pKqVP2c9eMdfB2JNjw7D7YnoezVwbLFx1D6OUYA,26088
  trilogy/core/processing/node_generators/__init__.py,sha256=iVJ-crowPxYeut-hFjyEjfibKIDq7PfB4LEuDAUCjGY,943
  trilogy/core/processing/node_generators/basic_node.py,sha256=74LoVZXLinRvSzk2LmI1kwza96TnuH3ELoYRIbHB29A,5578
  trilogy/core/processing/node_generators/common.py,sha256=xF32Kf6B08dZgKs2SOow1HomptSiSC057GCUCHFlS5s,9464
  trilogy/core/processing/node_generators/constant_node.py,sha256=LfpDq2WrBRZ3tGsLxw77LuigKfhbteWWh9L8BGdMGwk,1146
- trilogy/core/processing/node_generators/filter_node.py,sha256=ndPznkcFu_cdCNgaRpgot8oqnzdHv4KAIfjeUIzrE2w,10816
+ trilogy/core/processing/node_generators/filter_node.py,sha256=cJ5od1fAfvalaUDO2O4Y6Yrr2RukOCqey7f3zrKSBbI,10808
  trilogy/core/processing/node_generators/group_node.py,sha256=NdK1rl6Ze94XFWtgeC2dlRiL4pS3lh1ArKGPEltLtnw,8525
  trilogy/core/processing/node_generators/group_to_node.py,sha256=jKcNCDOY6fNblrdZwaRU0sbUSr9H0moQbAxrGgX6iGA,3832
  trilogy/core/processing/node_generators/multiselect_node.py,sha256=a505AEixjsjp5jI8Ng3H5KF_AaehkS6HfRfTef64l_o,7063
  trilogy/core/processing/node_generators/node_merge_node.py,sha256=hNcZxnDLTZyYJWfojg769zH9HB9PfZfESmpN1lcHWXg,23172
  trilogy/core/processing/node_generators/recursive_node.py,sha256=l5zdh0dURKwmAy8kK4OpMtZfyUEQRk6N-PwSWIyBpSM,2468
  trilogy/core/processing/node_generators/rowset_node.py,sha256=MuVNIexXhqGONho_mewqMOwaYXNUnjjvyPvk_RDGNYE,5943
- trilogy/core/processing/node_generators/select_merge_node.py,sha256=KQvGoNT5ZBWQ_caEomRTtG1PKZC7OPT4PKfY0QmwMGE,22270
+ trilogy/core/processing/node_generators/select_merge_node.py,sha256=ORF9H7A-yT2wzQZYVex2asmm7_y0b2_lP6U0e48asNA,25290
  trilogy/core/processing/node_generators/select_node.py,sha256=Ta1G39V94gjX_AgyZDz9OqnwLz4BjY3D6Drx9YpziMQ,3555
  trilogy/core/processing/node_generators/synonym_node.py,sha256=AnAsa_Wj50NJ_IK0HSgab_7klYmKVrv0WI1uUe-GvEY,3766
  trilogy/core/processing/node_generators/union_node.py,sha256=NxQbnRRoYMI4WjMeph41yk4E6yipj53qdGuNt-Mozxw,2818
- trilogy/core/processing/node_generators/unnest_node.py,sha256=7uOZzBidEEKeZE0VW_XlgHGhEYf_snEHtV8UgJ_ZjyY,4048
+ trilogy/core/processing/node_generators/unnest_node.py,sha256=u_hVHFYMz-ZylDdHH9mhFSRpxuKcTGvrrOP0rxrY_Xg,3901
  trilogy/core/processing/node_generators/window_node.py,sha256=A90linr4pkZtTNfn9k2YNLqrJ_SFII3lbHxB-BC6mI8,6688
  trilogy/core/processing/node_generators/select_helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  trilogy/core/processing/node_generators/select_helpers/datasource_injection.py,sha256=m2YQ4OmG0N2O61a7NEq1ZzbTa7JsCC00lxB2ymjcYRI,8224
@@ -119,8 +119,8 @@ trilogy/std/money.preql,sha256=XWwvAV3WxBsHX9zfptoYRnBigcfYwrYtBHXTME0xJuQ,2082
  trilogy/std/net.preql,sha256=WZCuvH87_rZntZiuGJMmBDMVKkdhTtxeHOkrXNwJ1EE,416
  trilogy/std/ranking.preql,sha256=LDoZrYyz4g3xsII9XwXfmstZD-_92i1Eox1UqkBIfi8,83
  trilogy/std/report.preql,sha256=LbV-XlHdfw0jgnQ8pV7acG95xrd1-p65fVpiIc-S7W4,202
- pytrilogy-0.0.3.104.dist-info/METADATA,sha256=IJmkrwnxe7gz3s89ZYVrDe6SkRY2cf6xNpmj5GTXkSE,11839
- pytrilogy-0.0.3.104.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- pytrilogy-0.0.3.104.dist-info/entry_points.txt,sha256=ewBPU2vLnVexZVnB-NrVj-p3E-4vukg83Zk8A55Wp2w,56
- pytrilogy-0.0.3.104.dist-info/top_level.txt,sha256=cAy__NW_eMAa_yT9UnUNlZLFfxcg6eimUAZ184cdNiE,8
- pytrilogy-0.0.3.104.dist-info/RECORD,,
+ pytrilogy-0.0.3.106.dist-info/METADATA,sha256=NM64Zgq3r16YOeN1tu1QI2sM-NyoLm86gnecFfRGQVs,11839
+ pytrilogy-0.0.3.106.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ pytrilogy-0.0.3.106.dist-info/entry_points.txt,sha256=ewBPU2vLnVexZVnB-NrVj-p3E-4vukg83Zk8A55Wp2w,56
+ pytrilogy-0.0.3.106.dist-info/top_level.txt,sha256=cAy__NW_eMAa_yT9UnUNlZLFfxcg6eimUAZ184cdNiE,8
+ pytrilogy-0.0.3.106.dist-info/RECORD,,
trilogy/__init__.py CHANGED
@@ -4,6 +4,6 @@ from trilogy.dialect.enums import Dialects
  from trilogy.executor import Executor
  from trilogy.parser import parse

- __version__ = "0.0.3.104"
+ __version__ = "0.0.3.106"

  __all__ = ["parse", "Executor", "Dialects", "Environment", "CONFIG"]
trilogy/core/processing/discovery_utility.py CHANGED
@@ -27,18 +27,22 @@ def calculate_effective_parent_grain(
  ) -> BuildGrain:
      # calculate the effective grain of the parent node
      # this is the union of all parent grains
-     if isinstance(node, MergeNode):
+     if isinstance(node, QueryDatasource):
          grain = BuildGrain()
-         qds = node.resolve()
+         qds = node
          if not qds.joins:
              return qds.datasources[0].grain
+         seen = set()
          for join in qds.joins:
              if isinstance(join, UnnestJoin):
+                 grain += BuildGrain(components=set([x.address for x in join.concepts]))
                  continue
              pairs = join.concept_pairs or []
              for key in pairs:
                  left = key.existing_datasource
+                 logger.info(f"adding left grain {left.grain} for join key {key.left}")
                  grain += left.grain
+                 seen.add(left.name)
              keys = [key.right for key in pairs]
              join_grain = BuildGrain.from_concepts(keys)
              if join_grain == join.right_datasource.grain:
@@ -48,6 +52,24 @@ def calculate_effective_parent_grain(
                  f"join changes grain, adding {join.right_datasource.grain} to {grain}"
              )
              grain += join.right_datasource.grain
+             seen.add(join.right_datasource.name)
+         for x in qds.datasources:
+             # if we haven't seen it, it's still contributing to grain
+             # unless used ONLY in a subselect
+             # so the existence check is a [bad] proxy for that
+             if x.name not in seen and not (
+                 qds.condition
+                 and qds.condition.existence_arguments
+                 and any(
+                     [
+                         c.address in block
+                         for c in x.output_concepts
+                         for block in qds.condition.existence_arguments
+                     ]
+                 )
+             ):
+                 logger.info(f"adding unjoined grain {x.grain} for datasource {x.name}")
+                 grain += x.grain
          return grain
      else:
          return node.grain or BuildGrain()
@@ -75,7 +97,7 @@ def check_if_group_required(
      if comp_grain.issubset(target_grain):

          logger.info(
-             f"{padding}{LOGGER_PREFIX} Group requirement check: {comp_grain}, target: {target_grain}, grain is subset of target, no group node required"
+             f"{padding}{LOGGER_PREFIX} Group requirement check: {comp_grain}, target: {target_grain}, grain is subset of target, no group node required"
          )
          return GroupRequiredResponse(target_grain, comp_grain, False)
      # find out what extra is in the comp grain vs target grain
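Note on the calculate_effective_parent_grain hunks above: the effective grain of a merged parent is now built by unioning the grains of every datasource that participates in a join, and then also folding in datasources that never appear on either side of a join, unless they are referenced only from an existence subselect. Below is a minimal, self-contained sketch of that idea; Grain and Datasource here are simplified stand-ins for trilogy's BuildGrain and datasource objects, not the real API.

from dataclasses import dataclass, field


@dataclass
class Grain:
    # hypothetical stand-in for trilogy's BuildGrain: a grain is a set of key addresses
    components: set[str] = field(default_factory=set)

    def __add__(self, other: "Grain") -> "Grain":
        return Grain(self.components | other.components)


@dataclass
class Datasource:
    name: str
    grain: Grain


def effective_grain(
    datasources: list[Datasource],
    joined: set[str],
    existence_only: set[str],
) -> Grain:
    # union the grains of every datasource that takes part in a join...
    grain = Grain()
    for ds in datasources:
        if ds.name in joined:
            grain += ds.grain
    # ...then fold in unjoined datasources too, unless they are only
    # referenced from an existence subselect (they still multiply rows)
    for ds in datasources:
        if ds.name not in joined and ds.name not in existence_only:
            grain += ds.grain
    return grain


orders = Datasource("orders", Grain({"order.id"}))
customers = Datasource("customers", Grain({"customer.id"}))
result = effective_grain([orders, customers], joined={"orders"}, existence_only=set())
print(result.components)  # both order.id and customer.id: the unjoined source widens the grain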
trilogy/core/processing/node_generators/filter_node.py CHANGED
@@ -4,7 +4,6 @@ from trilogy.constants import logger
  from trilogy.core.models.build import (
      BuildConcept,
      BuildFilterItem,
-     BuildGrain,
      BuildWhereClause,
  )
  from trilogy.core.models.build_environment import BuildEnvironment
@@ -238,10 +237,10 @@ def gen_filter_node(
          if not parent.preexisting_conditions == where.conditional:
              parent.add_condition(where.conditional)
          parent.add_existence_concepts(flattened_existence, False)
-         parent.grain = BuildGrain.from_concepts(
-             parent.output_concepts,
-             environment=environment,
-         )
+         # parent.grain = BuildGrain.from_concepts(
+         #     parent.output_concepts,
+         #     environment=environment,
+         # )
          parent.rebuild_cache()
          filter_node = parent
      else:
trilogy/core/processing/node_generators/select_merge_node.py CHANGED
@@ -224,6 +224,72 @@ def create_pruned_concept_graph(
      return g


+ # def deduplicate_nodes(subgraph: nx.DiGraph, nodes: list[str], partial_map: dict[str, list[str]], depth: int) -> list[str]:
+ #     """
+ #     Remove duplicate datasource nodes that are connected to the same concepts
+ #     and have the same partial state, keeping the one with the most unique concepts.
+
+ #     Args:
+ #         subgraph: NetworkX DiGraph containing the nodes and edges
+ #         nodes: List of node names to deduplicate
+ #         partial_map: Map of datasource to partial nodes
+
+ #     Returns:
+ #         List of deduplicated node names
+ #     """
+ #     # Filter for datasource nodes only
+ #     ds_nodes = [node for node in nodes if node.startswith("ds~")]
+ #     non_ds_nodes = [node for node in nodes if not node.startswith("ds~")]
+
+ #     if len(ds_nodes) <= 1:
+ #         return nodes  # No deduplication needed
+
+ #     # Build a map of each datasource to its connected concepts and partial state
+ #     ds_info = {}
+
+ #     for ds_node in ds_nodes:
+ #         # Get connected concept nodes (nodes starting with "c~")
+ #         connected_concepts = set()
+ #         for neighbor in subgraph.neighbors(ds_node):
+ #             if neighbor.startswith("c~"):
+ #                 connected_concepts.add(neighbor)
+
+ #         # Get partial state for this datasource
+ #         partial_state = tuple(sorted(partial_map.get(ds_node, [])))
+
+ #         ds_info[ds_node] = {
+ #             'concepts': connected_concepts,
+ #             'partial_state': partial_state
+ #         }
+
+ #     # Find datasources to remove (those that are subsets of others)
+ #     nodes_to_remove = set()
+ #     logger.info('LOOK HERE')
+ #     logger.info(ds_info)
+ #     for ds_a, info_a in ds_info.items():
+ #         for ds_b, info_b in ds_info.items():
+ #             if ds_a != ds_b and ds_a not in nodes_to_remove:
+ #                 # Check if ds_a is a subset of ds_b (same partial state and concepts are subset)
+ #                 if (info_a['partial_state'] == info_b['partial_state'] and
+ #                     info_a['concepts'].issubset(info_b['concepts']) and
+ #                     len(info_a['concepts']) < len(info_b['concepts'])):
+ #                     # ds_a connects to fewer concepts than ds_b, so remove ds_a
+ #                     nodes_to_remove.add(ds_a)
+ #                 elif (info_a['partial_state'] == info_b['partial_state'] and
+ #                       info_a['concepts'] == info_b['concepts']):
+ #                     # Exact same concepts and partial state - keep one arbitrarily
+ #                     # (keep the lexicographically smaller one for consistency)
+ #                     if ds_a > ds_b:
+ #                         nodes_to_remove.add(ds_a)
+
+ #     # Keep datasource nodes that weren't marked for removal
+ #     logger.info(f"{padding(depth)}{LOGGER_PREFIX} Removing duplicate datasource nodes: {nodes_to_remove}")
+ #     deduplicated_ds_nodes = [ds for ds in ds_nodes if ds not in nodes_to_remove]
+
+ #     # Return deduplicated datasource nodes plus all non-datasource nodes
+ #     return deduplicated_ds_nodes + non_ds_nodes
+
+
  def resolve_subgraphs(
      g: ReferenceGraph,
      relevant: list[BuildConcept],
trilogy/core/processing/node_generators/unnest_node.py CHANGED
@@ -4,7 +4,6 @@ from trilogy.constants import logger
  from trilogy.core.models.build import (
      BuildConcept,
      BuildFunction,
-     BuildGrain,
      BuildWhereClause,
  )
  from trilogy.core.models.build_environment import BuildEnvironment
@@ -104,10 +103,6 @@ def gen_unnest_node(
          preexisting_conditions=(
              conditional if conditional and local_conditions is False else None
          ),
-         grain=BuildGrain.from_concepts(
-             concepts=base.output_concepts,
-             environment=environment,
-         ),
      )
      # qds = new.resolve()
      # assert qds.source_map[concept.address] == {base.resolve()}
trilogy/core/processing/utility.py CHANGED
@@ -90,13 +90,86 @@ class GroupRequiredResponse:
      required: bool


+ def find_all_connecting_concepts(g: nx.Graph, ds1: str, ds2: str) -> set[str]:
+     """Find all concepts that connect two datasources"""
+     concepts1 = set(g.neighbors(ds1))
+     concepts2 = set(g.neighbors(ds2))
+     return concepts1 & concepts2
+
+
+ def get_connection_keys(
+     all_connections: dict[tuple[str, str], set[str]], left: str, right: str
+ ) -> set[str]:
+     """Get all concepts that connect two datasources"""
+     lookup = sorted([left, right])
+     key: tuple[str, str] = (lookup[0], lookup[1])
+     return all_connections.get(key, set())
+
+
+ def get_join_type(
+     left: str,
+     right: str,
+     partials: dict[str, list[str]],
+     nullables: dict[str, list[str]],
+     all_connecting_keys: set[str],
+ ) -> JoinType:
+     left_is_partial = any(key in partials.get(left, []) for key in all_connecting_keys)
+     left_is_nullable = any(
+         key in nullables.get(left, []) for key in all_connecting_keys
+     )
+     right_is_partial = any(
+         key in partials.get(right, []) for key in all_connecting_keys
+     )
+     right_is_nullable = any(
+         key in nullables.get(right, []) for key in all_connecting_keys
+     )
+
+     if left_is_nullable and right_is_nullable:
+         join_type = JoinType.FULL
+     elif left_is_partial and right_is_partial:
+         join_type = JoinType.FULL
+     elif left_is_partial:
+         join_type = JoinType.FULL
+     elif right_is_nullable:
+         join_type = JoinType.RIGHT_OUTER
+     elif right_is_partial or left_is_nullable:
+         join_type = JoinType.LEFT_OUTER
+     # we can't inner join if the left was an outer join
+     else:
+         join_type = JoinType.INNER
+     return join_type
+
+
+ def reduce_join_types(join_types: Set[JoinType]) -> JoinType:
+     final_join_type = JoinType.INNER
+     if any([x == JoinType.FULL for x in join_types]):
+         final_join_type = JoinType.FULL
+     elif any([x == JoinType.LEFT_OUTER for x in join_types]):
+         final_join_type = JoinType.LEFT_OUTER
+     elif any([x == JoinType.RIGHT_OUTER for x in join_types]):
+         final_join_type = JoinType.RIGHT_OUTER
+
+     return final_join_type
+
+
  def resolve_join_order_v2(
      g: nx.Graph, partials: dict[str, list[str]], nullables: dict[str, list[str]]
  ) -> list[JoinOrderOutput]:
      datasources = [x for x in g.nodes if x.startswith("ds~")]
      concepts = [x for x in g.nodes if x.startswith("c~")]

+     # Pre-compute all possible connections between datasources
+     all_connections: dict[tuple[str, str], set[str]] = {}
+     for i, ds1 in enumerate(datasources):
+         for ds2 in datasources[i + 1 :]:
+             connecting_concepts = find_all_connecting_concepts(g, ds1, ds2)
+             if connecting_concepts:
+                 key = tuple(sorted([ds1, ds2]))
+                 all_connections[key] = connecting_concepts
      output: list[JoinOrderOutput] = []
+
+     # create our map of pivots, or common join concepts
      pivot_map = {
          concept: [x for x in g.neighbors(concept) if x in datasources]
          for concept in concepts
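Note: the helpers added above factor join-type selection out of resolve_join_order_v2. A rough standalone sketch of the same rules follows (JoinType below is a local stand-in enum, not trilogy's): a join key that is partial or nullable on either side pushes the join toward an outer join, and when several candidate left tables yield different join types, the widest one wins.

from enum import Enum


class JoinType(Enum):
    # local stand-in; the real enum lives in trilogy's codebase
    INNER = "inner"
    LEFT_OUTER = "left outer"
    RIGHT_OUTER = "right outer"
    FULL = "full"


def pick_join_type(
    left_partial: bool, left_nullable: bool, right_partial: bool, right_nullable: bool
) -> JoinType:
    # mirrors get_join_type above: partial/nullable join keys force outer joins
    if left_nullable and right_nullable:
        return JoinType.FULL
    if left_partial and right_partial:
        return JoinType.FULL
    if left_partial:
        return JoinType.FULL
    if right_nullable:
        return JoinType.RIGHT_OUTER
    if right_partial or left_nullable:
        return JoinType.LEFT_OUTER
    return JoinType.INNER


def widest(join_types: set[JoinType]) -> JoinType:
    # mirrors reduce_join_types above: FULL > LEFT_OUTER > RIGHT_OUTER > INNER
    for candidate in (JoinType.FULL, JoinType.LEFT_OUTER, JoinType.RIGHT_OUTER):
        if candidate in join_types:
            return candidate
    return JoinType.INNER


assert pick_join_type(False, False, True, False) is JoinType.LEFT_OUTER
assert widest({JoinType.INNER, JoinType.LEFT_OUTER}) is JoinType.LEFT_OUTER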
@@ -108,8 +181,9 @@ def resolve_join_order_v2(
          )
      )
      solo = [x for x in pivot_map if len(pivot_map[x]) == 1]
-     eligible_left = set()
+     eligible_left: set[str] = set()

+     # while we have pivots, keep joining them in
      while pivots:
          next_pivots = [
              x for x in pivots if any(y in eligible_left for y in pivot_map[x])
@@ -120,7 +194,7 @@ def resolve_join_order_v2(
          else:
              root = pivots.pop(0)

-         # sort so less partials is last and eligible lefts are
+         # sort so less partials is last and eligible lefts are first
          def score_key(x: str) -> tuple[int, int, str]:
              base = 1
              # if it's left, higher weight
@@ -133,79 +207,56 @@ def resolve_join_order_v2(
                  base -= 1
              return (base, len(x), x)

-         # get remainig un-joined datasets
+         # get remaining un-joined datasets
          to_join = sorted(
              [x for x in pivot_map[root] if x not in eligible_left], key=score_key
          )
          while to_join:
              # need to sort this to ensure we join on the best match
-             base = sorted(
-                 [x for x in pivot_map[root] if x in eligible_left], key=score_key
-             )
+             # but check ALL left in case there are non-pivt keys to join on
+             base = sorted([x for x in eligible_left], key=score_key)
              if not base:
                  new = to_join.pop()
                  eligible_left.add(new)
                  base = [new]
              right = to_join.pop()
              # we already joined it
-             # this could happen if the same pivot is shared with multiple Dses
+             # this could happen if the same pivot is shared with multiple DSes
              if right in eligible_left:
                  continue
+
              joinkeys: dict[str, set[str]] = {}
              # sorting puts the best candidate last for pop
              # so iterate over the reversed list
              join_types = set()
+
              for left_candidate in reversed(base):
-                 common = nx.common_neighbors(g, left_candidate, right)
+                 # Get all concepts that connect these two datasources
+                 all_connecting_keys = get_connection_keys(
+                     all_connections, left_candidate, right
+                 )

-                 if not common:
+                 if not all_connecting_keys:
                      continue
+
+                 # Check if we already have this exact set of keys
                  exists = False
                  for _, v in joinkeys.items():
-                     if v == common:
+                     if v == all_connecting_keys:
                          exists = True
                  if exists:
                      continue
-                 left_is_partial = any(
-                     key in partials.get(left_candidate, []) for key in common
-                 )
-                 left_is_nullable = any(
-                     key in nullables.get(left_candidate, []) for key in common
-                 )
-                 right_is_partial = any(key in partials.get(right, []) for key in common)
-                 # we don't care if left is nullable for join type (just keys), but if we did
-                 # left_is_nullable = any(
-                 #     key in nullables.get(left_candidate, []) for key in common
-                 # )
-                 right_is_nullable = any(
-                     key in nullables.get(right, []) for key in common
-                 )
-                 if left_is_nullable and right_is_nullable:
-                     join_type = JoinType.FULL
-                 elif left_is_partial and right_is_partial:
-                     join_type = JoinType.FULL
-                 elif left_is_partial:
-                     join_type = JoinType.FULL
-                 elif right_is_nullable:
-                     join_type = JoinType.RIGHT_OUTER
-                 elif right_is_partial or left_is_nullable:
-                     join_type = JoinType.LEFT_OUTER
-                 # we can't inner join if the left was an outer join
-                 else:
-                     join_type = JoinType.INNER

+                 join_type = get_join_type(
+                     left_candidate, right, partials, nullables, all_connecting_keys
+                 )
                  join_types.add(join_type)
-                 joinkeys[left_candidate] = common
-             final_join_type = JoinType.INNER
-             if any([x == JoinType.FULL for x in join_types]):
-                 final_join_type = JoinType.FULL
-             elif any([x == JoinType.LEFT_OUTER for x in join_types]):
-                 final_join_type = JoinType.LEFT_OUTER
-             elif any([x == JoinType.RIGHT_OUTER for x in join_types]):
-                 final_join_type = JoinType.RIGHT_OUTER
+                 joinkeys[left_candidate] = all_connecting_keys
+
+             final_join_type = reduce_join_types(join_types)
+
              output.append(
                  JoinOrderOutput(
-                     # left=left_candidate,
                      right=right,
                      type=final_join_type,
                      keys=joinkeys,
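Note: the hunk above also swaps per-iteration nx.common_neighbors lookups for a one-time all_connections map from each datasource pair to the concepts they share. A minimal sketch of that precomputation, assuming networkx is installed and reusing the "ds~" / "c~" node-name prefixes seen in this module:

import networkx as nx
from itertools import combinations

g = nx.Graph()
g.add_edges_from(
    [
        ("ds~orders", "c~order_id"),
        ("ds~orders", "c~customer_id"),
        ("ds~customers", "c~customer_id"),
        ("ds~payments", "c~order_id"),
    ]
)

datasources = [n for n in g.nodes if n.startswith("ds~")]
all_connections: dict[tuple[str, str], set[str]] = {}
for ds1, ds2 in combinations(datasources, 2):
    # concepts shared by both datasources are candidate join keys
    shared = set(g.neighbors(ds1)) & set(g.neighbors(ds2))
    if shared:
        all_connections[tuple(sorted((ds1, ds2)))] = shared

print(all_connections)
# {('ds~customers', 'ds~orders'): {'c~customer_id'},
#  ('ds~orders', 'ds~payments'): {'c~order_id'}}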
@@ -216,7 +267,6 @@ def resolve_join_order_v2(
      for concept in solo:
          for ds in pivot_map[concept]:
              # if we already have it, skip it
-
              if ds in eligible_left:
                  continue
              # if we haven't had ANY left datasources yet
@@ -224,17 +274,39 @@ def resolve_join_order_v2(
              if not eligible_left:
                  eligible_left.add(ds)
                  continue
-             # otherwise do a full out join
-             output.append(
-                 JoinOrderOutput(
-                     # pick random one to be left
-                     left=list(eligible_left)[0],
-                     right=ds,
-                     type=JoinType.FULL,
-                     keys={},
+             # otherwise do a full outer join
+             # Try to find if there are any connecting keys with existing left tables
+             best_left = None
+             best_keys: set[str] = set()
+             for existing_left in eligible_left:
+                 connecting_keys = get_connection_keys(
+                     all_connections, existing_left, ds
+                 )
+                 if connecting_keys and len(connecting_keys) > len(best_keys):
+                     best_left = existing_left
+                     best_keys = connecting_keys
+
+             if best_left and best_keys:
+                 output.append(
+                     JoinOrderOutput(
+                         left=best_left,
+                         right=ds,
+                         type=JoinType.FULL,
+                         keys={best_left: best_keys},
+                     )
+                 )
+             else:
+                 output.append(
+                     JoinOrderOutput(
+                         # pick random one to be left
+                         left=list(eligible_left)[0],
+                         right=ds,
+                         type=JoinType.FULL,
+                         keys={},
+                     )
                  )
-             )
              eligible_left.add(ds)
+
      # only once we have all joins
      # do we know if some inners need to be left outers
      for review_join in output:
@@ -248,6 +320,7 @@ def resolve_join_order_v2(
              ]
          ):
              review_join.type = JoinType.LEFT_OUTER
+
      return output


@@ -352,7 +425,9 @@ def resolve_instantiated_concept(
      )


- def reduce_concept_pairs(input: list[ConceptPair]) -> list[ConceptPair]:
+ def reduce_concept_pairs(
+     input: list[ConceptPair], right_source: QueryDatasource | BuildDatasource
+ ) -> list[ConceptPair]:
      left_keys = set()
      right_keys = set()
      for pair in input:
@@ -361,7 +436,10 @@ def reduce_concept_pairs(input: list[ConceptPair]) -> list[ConceptPair]:
          if pair.right.purpose == Purpose.KEY:
              right_keys.add(pair.right.address)
      final: list[ConceptPair] = []
+     seen_right_keys = set()
      for pair in input:
+         if pair.right.address in seen_right_keys:
+             continue
          if (
              pair.left.purpose == Purpose.PROPERTY
              and pair.left.keys
@@ -374,7 +452,15 @@ def reduce_concept_pairs(input: list[ConceptPair]) -> list[ConceptPair]:
              and pair.right.keys.issubset(right_keys)
          ):
              continue
+
+         seen_right_keys.add(pair.right.address)
          final.append(pair)
+     all_keys = set([x.right.address for x in final])
+     if right_source.grain.components and right_source.grain.components.issubset(
+         all_keys
+     ):
+         return [x for x in final if x.right.address in right_source.grain.components]
+
      return final


@@ -443,7 +529,8 @@ def get_node_joins(
                  )
                  for k, v in j.keys.items()
                  for concept in v
-             ]
+             ],
+             ds_node_map[j.right],
          ),
      )
      for j in joins
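Note on the reduce_concept_pairs change above: duplicate right-hand join keys are now dropped, and when the right source's grain components are fully covered by the surviving keys, the pair list is reduced to just those grain keys. A simplified, self-contained sketch of that behavior follows; Pair is a hypothetical stand-in for ConceptPair, keyed only by the right-hand address.

from dataclasses import dataclass


@dataclass(frozen=True)
class Pair:
    # simplified stand-in for ConceptPair: just the right-hand join address
    right: str


def reduce_pairs(pairs: list[Pair], right_grain: set[str]) -> list[Pair]:
    deduped: list[Pair] = []
    seen: set[str] = set()
    for pair in pairs:
        if pair.right in seen:
            continue  # drop duplicate right-hand keys
        seen.add(pair.right)
        deduped.append(pair)
    all_keys = {p.right for p in deduped}
    # if the right source's grain is fully covered by the candidate keys,
    # joining on the grain keys alone is sufficient
    if right_grain and right_grain.issubset(all_keys):
        return [p for p in deduped if p.right in right_grain]
    return deduped


pairs = [Pair("customer.id"), Pair("customer.id"), Pair("customer.name")]
print(reduce_pairs(pairs, {"customer.id"}))
# [Pair(right='customer.id')]: the grain key alone is enough to join on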