omnata-plugin-runtime 0.11.4a320__tar.gz → 0.11.5__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,13 +1,12 @@
  Metadata-Version: 2.4
  Name: omnata-plugin-runtime
- Version: 0.11.4a320
+ Version: 0.11.5
  Summary: Classes and common runtime components for building and running Omnata Plugins
  License-File: LICENSE
  Author: James Weakley
  Author-email: james.weakley@omnata.com
- Requires-Python: >=3.8,<=3.11
+ Requires-Python: >=3.9,<=3.11
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -1,13 +1,13 @@
  [tool.poetry]
  name = "omnata-plugin-runtime"
- version = "0.11.4-a320"
+ version = "0.11.5"
  description = "Classes and common runtime components for building and running Omnata Plugins"
  authors = ["James Weakley <james.weakley@omnata.com>"]
  readme = "README.md"
  packages = [{include = "omnata_plugin_runtime", from = "src"}]

  [tool.poetry.dependencies]
- python = ">=3.8, <=3.11"
+ python = ">=3.9, <=3.11"
  snowflake-snowpark-python = ">=1.20.0,<=1.24.0" # latest version available on Snowflake Anaconda, but allow pinning to 1.20.0 for the to_pandas_batches workaround
  snowflake-connector-python = "^3, <=3.12.0" # latest version available on Snowflake Anaconda
  cryptography = "<=43.0.0"
@@ -664,9 +664,12 @@ class SnowflakeViewParts(BaseModel):
  )
  joined_parts:List[SnowflakeViewPart] = []
  # remove the joins from the main part if they are not in the raw stream locations
+ original_join_count = len(main_stream_view_part.joins)
  main_stream_view_part.joins = [join for join in main_stream_view_part.joins
  if join.join_stream_name in raw_stream_locations
  and join.join_stream_name in stream_schemas]
+ if len(main_stream_view_part.joins) < original_join_count:
+ logger.debug(f"Removed {original_join_count - len(main_stream_view_part.joins)} joins from stream: {stream_name} due to missing raw stream locations or schemas")

  for join in main_stream_view_part.joins:
  logger.debug(f"Generating view parts for join stream: {join.join_stream_name}")
@@ -679,6 +682,8 @@ class SnowflakeViewParts(BaseModel):
  column_name_expression=column_name_expression,
  plugin_app_database=plugin_app_database
  ))
+ if len(main_stream_view_part.joins) == 0:
+ logger.debug(f"No joins found for stream: {stream_name}")
  # For each column, the plugin can advise which fields (of the same stream or joined) are required for the join, which comes through as referenced_columns
  # on the SnowflakeViewColumn object.
  # Until this generate function is called with the raw stream names, we don't know which streams the user has actually selected, nor which
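Aside: referenced_columns, mentioned in the comment above and consumed by prune() later in this diff, maps a stream name (or join alias) to the list of fields a column's expression depends on. A minimal illustration of the shape, with hypothetical stream and field names (the shape is inferred from how the pruning code iterates referenced_columns.items()):

    # Hypothetical example only: real values are supplied by the plugin.
    from typing import Dict, List

    referenced_columns: Dict[str, List[str]] = {
        "ORDERS": ["ORDER_ID", "CUSTOMER_ID"],  # fields needed from this stream/alias
        "CUSTOMERS": ["CUSTOMER_ID"],
    }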
@@ -697,7 +702,8 @@ class SnowflakeViewParts(BaseModel):

  # Process all joins to build the mappings
  for part in [main_stream_view_part] + joined_parts:
- logger.debug(f"Processing joins for stream: {part.stream_name}")
+ joined_parts_names = [j.join_stream_name for j in part.joins]
+ logger.debug(f"Processing joins for stream: {part.stream_name} (joined streams: {joined_parts_names})")
  # Make sure the part's stream name is in the mappings
  if part.stream_name not in stream_to_aliases:
  stream_to_aliases[part.stream_name] = [part.stream_name]
@@ -807,19 +813,8 @@ class SnowflakeViewParts(BaseModel):
  # If we get here, no circular references were found
  logger.debug("No circular references found")

- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- prune_count = 0
- while prune(main_stream_view_part, joined_parts):
- prune_count += 1
- if prune_count > 10:
- raise ValueError("Pruning of columns from the view has entered an infinite loop")
+ # Prune columns using graph-based dependency resolution (single pass)
+ prune(main_stream_view_part, joined_parts)

  return cls(main_part=main_stream_view_part, joined_parts=joined_parts)
 
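Note on the change above: the old code re-ran prune() to a fixpoint (with a 10-iteration guard) because each removal could invalidate further columns; the new prune() needs only one call, since it propagates invalidity through the dependency graph before removing anything. A minimal sketch of that propagation step, using simplified (stream, column) keys — not the package's actual code:

    from typing import Dict, List, Set, Tuple

    Key = Tuple[str, str]  # (stream_name, column_name)

    def invalid_closure(graph: Dict[Key, List[Key]], directly_invalid: Set[Key]) -> Set[Key]:
        # Grow the invalid set until no remaining column depends on an invalid one.
        # Terminates without an arbitrary cap: each pass either adds a column
        # or stops, so it is bounded by the total column count.
        invalid = set(directly_invalid)
        changed = True
        while changed:
            changed = False
            for col, deps in graph.items():
                if col not in invalid and any(d in invalid for d in deps):
                    invalid.add(col)
                    changed = True
        return invalid

For example, with graph = {("S", "B"): [("S", "A")], ("S", "C"): [("S", "B")]} and ("S", "A") directly invalid, the closure marks B and then C, which is what lets the caller prune in a single pass.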
@@ -844,81 +839,183 @@ def find_part(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart

  def prune(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart]) -> bool:
  """
- Prunes columns from view parts that reference fields that don't exist in the referenced streams.
+ Prunes columns from view parts using graph-based dependency resolution.

- This function handles:
- 1. Direct dependencies - removing columns that directly reference non-existent columns
- 2. Transitive dependencies - removing columns that depend on columns that were removed
+ The approach:
+ 1. Build a complete dependency graph of all columns across all parts
+ 2. Identify "root" columns that must be kept (in main part or used in joins)
+ 3. Traverse dependencies to find all transitively required columns
+ 4. Remove columns that aren't needed

  Returns True if any columns were removed, False otherwise.
- Raises ValueError if a cyclic dependency is detected.
  """
- columns_removed = False
-
- # Helper function to check if a column should be kept or removed
- def should_keep_column(column: SnowflakeViewColumn, part: SnowflakeViewPart) -> bool:
- """
- Checks if a column should be kept based on its dependencies.
- Returns True if the column should be kept, False if it should be removed.
- """
- # If no references, keep the column
- if not column.referenced_columns:
- return True
+
+ all_parts = [view_part] + joined_parts
+
+ # Build column registry: (stream_name, column_name) -> column object
+ all_columns: Dict[Tuple[str, str], SnowflakeViewColumn] = {}
+ for part in all_parts:
+ for column in part.columns:
+ all_columns[(part.stream_name, column.original_name)] = column
+
+ # Build dependency graph for topological analysis
+ # Key: (stream, column), Value: list of (stream, column) dependencies
+ # Also track columns with invalid dependencies (reference non-existent columns)
+ dependency_graph: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
+ columns_with_invalid_deps: set[Tuple[str, str]] = set()
+
+ # First pass: build dependency graph and detect direct invalid references
+ for part in all_parts:
+ for column in part.columns:
+ key = (part.stream_name, column.original_name)
+ deps = []
+ has_invalid_dep = False

- # Check each referenced stream and its fields
- for ref_stream_name, ref_fields in column.referenced_columns.items():
- # Find the referenced part
- ref_part = find_part(view_part, joined_parts, ref_stream_name)
+ if column.referenced_columns:
+ for ref_stream_name, ref_fields in column.referenced_columns.items():
+ # Resolve stream alias to actual stream name
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+
+ for ref_field in ref_fields:
+ dep_key = (resolved_stream, ref_field)
+ if dep_key in all_columns:
+ deps.append(dep_key)
+ else:
+ logger.warning(
+ f"Column {column.original_name} in {part.stream_name} references "
+ f"{ref_field} in {resolved_stream}, which doesn't exist"
+ )
+ has_invalid_dep = True

- # If referenced stream doesn't exist, remove the column
- if ref_part is None:
- logger.warning(
- f"Column {column.name} in stream {part.stream_name} references stream "
- f"{ref_stream_name}, but it was not provided"
- )
- return False
-
- # Check each referenced field
- for ref_field in ref_fields:
- # Find the referenced column
- ref_column = next((c for c in ref_part.columns if c.original_name == ref_field), None)
+ dependency_graph[key] = deps
+ if has_invalid_dep:
+ columns_with_invalid_deps.add(key)
+
+ # Second pass: propagate invalidity to columns that depend on invalid columns
+ # Keep iterating until no new invalid columns are found
+ changed = True
+ while changed:
+ changed = False
+ for col_key, deps in dependency_graph.items():
+ if col_key not in columns_with_invalid_deps:
+ # Check if any dependency is invalid
+ for dep_key in deps:
+ if dep_key in columns_with_invalid_deps:
+ logger.warning(
+ f"Column {col_key[1]} in {col_key[0]} depends on "
+ f"{dep_key[1]} in {dep_key[0]}, which has invalid dependencies"
+ )
+ columns_with_invalid_deps.add(col_key)
+ changed = True
+ break
+
+ # Build alias to stream mapping
+ alias_to_stream: Dict[str, str] = {}
+ for part in all_parts:
+ alias_to_stream[part.stream_name] = part.stream_name
+ for join in part.joins:
+ alias_to_stream[join.join_stream_alias] = join.join_stream_name
+ # left_alias might be an alias for a joined stream, resolve it
+ if join.left_alias not in alias_to_stream:
+ # Try to find the stream for this alias
+ for other_part in all_parts:
+ if other_part.stream_name == join.left_alias:
+ alias_to_stream[join.left_alias] = other_part.stream_name
+ break
+
+ # Identify root columns that must be kept
+ needed_columns: set[Tuple[str, str]] = set()
+
+ # 1. All columns in the main part are needed (except those with invalid dependencies)
+ for column in view_part.columns:
+ col_key = (view_part.stream_name, column.original_name)
+ if col_key not in columns_with_invalid_deps:
+ needed_columns.add(col_key)
+
+ # 2. All columns used in join conditions are needed (except those with invalid dependencies)
+ for part in all_parts:
+ for join in part.joins:
+ # Resolve left_alias to actual stream name
+ left_stream = alias_to_stream.get(join.left_alias, join.left_alias)
+ left_key = (left_stream, join.left_column)
+ right_key = (join.join_stream_name, join.join_stream_column)
+ if left_key not in columns_with_invalid_deps:
+ needed_columns.add(left_key)
+ if right_key not in columns_with_invalid_deps:
+ needed_columns.add(right_key)
+
+ logger.debug(f"Identified {len(needed_columns)} root columns to keep (excluding {len(columns_with_invalid_deps)} with invalid deps)")
+
+ # 3. Find all transitive dependencies using recursive traversal
+ # Skip columns with invalid dependencies and their dependents
+ def collect_dependencies(col_key: Tuple[str, str], visited: set[Tuple[str, str]]) -> None:
+ """Recursively collect all columns that col_key depends on"""
+ if col_key in visited or col_key not in dependency_graph:
+ return
+ if col_key in columns_with_invalid_deps:
+ return # Don't traverse dependencies of invalid columns
+ visited.add(col_key)
+
+ for dep_key in dependency_graph[col_key]:
+ if dep_key in all_columns and dep_key not in columns_with_invalid_deps:
+ needed_columns.add(dep_key)
+ collect_dependencies(dep_key, visited)
+
+ visited_global: set[Tuple[str, str]] = set()
+ for root_col in list(needed_columns):
+ collect_dependencies(root_col, visited_global)
+
+ # Remove columns that are not needed
+ columns_removed = False
+ for part in all_parts:
+ original_count = len(part.columns)
+ removed_cols = [col for col in part.columns
+ if (part.stream_name, col.original_name) not in needed_columns]
+
+ # Log warnings for each removed column with the reason
+ for col in removed_cols:
+ # Determine why the column is being removed
+ col_key = (part.stream_name, col.original_name)
+ if col.referenced_columns:
+ # Check if any referenced columns don't exist
+ missing_refs = []
+ for ref_stream_name, ref_fields in col.referenced_columns.items():
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+ for ref_field in ref_fields:
+ if (resolved_stream, ref_field) not in all_columns:
+ missing_refs.append(f"{ref_field} in {resolved_stream}")

- # If referenced column doesn't exist, remove the column
- if ref_column is None:
+ if missing_refs:
  logger.warning(
- f"Column {column.name} in stream {part.stream_name} references field "
- f"{ref_field} in stream {ref_stream_name}, but it was not provided"
+ f"Removing column {col.original_name} from {part.stream_name} because it references "
+ f"non-existent column(s): {', '.join(missing_refs)}"
  )
- return False
-
- # All dependencies are satisfied
- return True
-
- # Process columns for removal
- for column in view_part.columns[:]: # Use a copy to allow safe removal
- if not should_keep_column(column, view_part):
- view_part.columns.remove(column)
+ else:
+ # Column is not needed (not referenced by main part)
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+ else:
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+
+ part.columns = [col for col in part.columns
+ if (part.stream_name, col.original_name) in needed_columns]
+
+ if removed_cols:
  columns_removed = True

- # Process joined parts
- for joined_part in joined_parts:
- # We have to avoid pruning columns that are referenced by joins to this stream.
- # first, we determine all aliases for this stream (multiple join paths back to the same stream are allowed)
- aliases_for_stream = [j.join_stream_alias for j in view_part.joins if j.join_stream_name == joined_part.stream_name]
- # now find all joins using this stream as the join stream
- columns_used_in_joins = [
- j.left_column for j in view_part.joins if j.left_alias in aliases_for_stream
- ]
- for column in joined_part.columns[:]: # Use a copy to allow safe removal
- # First check if the column is a join column
- if column.original_name in columns_used_in_joins:
- # If it's a join column, we need to keep it
- continue
-
- if not should_keep_column(column, joined_part):
- joined_part.columns.remove(column)
- columns_removed = True
-

  return columns_removed

  class JsonSchemaTopLevel(BaseModel):
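
End-to-end, the new prune() in this release builds a column registry and dependency graph, marks columns with missing references as invalid (propagating invalidity to their dependents), seeds a "needed" set from the main part's columns and join-key columns, collects the transitive dependencies of those roots, and drops everything else. A compressed, self-contained sketch of that flow, using hypothetical simplified types — real parts also carry joins, aliases, and SQL expressions that this omits:

    # Hedged sketch of the graph-based pruning flow, not the package's code.
    # Assumed simplification: no join aliases, and join-key roots are omitted.
    from dataclasses import dataclass, field
    from typing import Dict, List, Set, Tuple

    Key = Tuple[str, str]  # (stream_name, column_name)

    @dataclass
    class Column:
        name: str
        referenced: Dict[str, List[str]] = field(default_factory=dict)  # stream -> fields

    @dataclass
    class Part:
        stream_name: str
        columns: List[Column]

    def prune(main: Part, joined: List[Part]) -> bool:
        parts = [main] + joined
        registry: Set[Key] = {(p.stream_name, c.name) for p in parts for c in p.columns}
        graph: Dict[Key, List[Key]] = {}
        invalid: Set[Key] = set()
        for p in parts:
            for c in p.columns:
                deps = [(s, f) for s, fs in c.referenced.items() for f in fs]
                graph[(p.stream_name, c.name)] = [d for d in deps if d in registry]
                if any(d not in registry for d in deps):  # direct missing reference
                    invalid.add((p.stream_name, c.name))
        changed = True
        while changed:  # propagate invalidity transitively (bounded by column count)
            changed = False
            for k, ds in graph.items():
                if k not in invalid and any(d in invalid for d in ds):
                    invalid.add(k)
                    changed = True
        # Roots: every valid column of the main part must survive
        needed: Set[Key] = {(main.stream_name, c.name) for c in main.columns} - invalid
        stack = list(needed)
        while stack:  # collect transitive dependencies of the roots
            for d in graph.get(stack.pop(), []):
                if d not in invalid and d not in needed:
                    needed.add(d)
                    stack.append(d)
        removed = False
        for p in parts:
            kept = [c for c in p.columns if (p.stream_name, c.name) in needed]
            removed = removed or len(kept) != len(p.columns)
            p.columns = kept
        return removed

For example, if the main part's only column references ORDERS.ORDER_ID, a joined ORDERS part keeps ORDER_ID but loses an unreferenced ORDER_TOTAL, mirroring the "not referenced by the main part or any join conditions" debug message in the diff above.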