omnata-plugin-runtime 0.11.4a320__tar.gz → 0.11.5__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,13 +1,12 @@
  Metadata-Version: 2.4
  Name: omnata-plugin-runtime
- Version: 0.11.4a320
+ Version: 0.11.5
  Summary: Classes and common runtime components for building and running Omnata Plugins
  License-File: LICENSE
  Author: James Weakley
  Author-email: james.weakley@omnata.com
- Requires-Python: >=3.8,<=3.11
+ Requires-Python: >=3.9,<=3.11
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -1,13 +1,13 @@
  [tool.poetry]
  name = "omnata-plugin-runtime"
- version = "0.11.4-a320"
+ version = "0.11.5"
  description = "Classes and common runtime components for building and running Omnata Plugins"
  authors = ["James Weakley <james.weakley@omnata.com>"]
  readme = "README.md"
  packages = [{include = "omnata_plugin_runtime", from = "src"}]

  [tool.poetry.dependencies]
- python = ">=3.8, <=3.11"
+ python = ">=3.9, <=3.11"
  snowflake-snowpark-python = ">=1.20.0,<=1.24.0" # latest version available on Snowflake Anaconda, but allow pinning to 1.20.0 for the to_pandas_batches workaround
  snowflake-connector-python = "^3, <=3.12.0" # latest version available on Snowflake Anaconda
  cryptography = "<=43.0.0"
@@ -664,9 +664,12 @@ class SnowflakeViewParts(BaseModel):
  )
  joined_parts:List[SnowflakeViewPart] = []
  # remove the joins from the main part if they are not in the raw stream locations
+ original_join_count = len(main_stream_view_part.joins)
  main_stream_view_part.joins = [join for join in main_stream_view_part.joins
  if join.join_stream_name in raw_stream_locations
  and join.join_stream_name in stream_schemas]
+ if len(main_stream_view_part.joins) < original_join_count:
+ logger.debug(f"Removed {original_join_count - len(main_stream_view_part.joins)} joins from stream: {stream_name} due to missing raw stream locations or schemas")

  for join in main_stream_view_part.joins:
  logger.debug(f"Generating view parts for join stream: {join.join_stream_name}")
@@ -679,6 +682,8 @@ class SnowflakeViewParts(BaseModel):
  column_name_expression=column_name_expression,
  plugin_app_database=plugin_app_database
  ))
+ if len(main_stream_view_part.joins) == 0:
+ logger.debug(f"No joins found for stream: {stream_name}")
  # For each column, the plugin can advise which fields (of the same stream or joined) are required for the join, which comes through as referenced_columns
  # on the SnowflakeViewColumn object.
  # Until this generate function is called with the raw stream names, we don't know which streams the user has actually selected, nor which
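Aside: referenced_columns, mentioned in the comment above and consumed by prune() later in this diff, maps a stream name (or join alias) to the list of fields a column's expression depends on. A minimal illustration of the shape, with hypothetical stream and field names (the shape is inferred from how the pruning code iterates referenced_columns.items()):

    # Hypothetical example only: real values are supplied by the plugin.
    from typing import Dict, List

    referenced_columns: Dict[str, List[str]] = {
        "ORDERS": ["ORDER_ID", "CUSTOMER_ID"],  # fields needed from this stream/alias
        "CUSTOMERS": ["CUSTOMER_ID"],
    }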
@@ -697,7 +702,8 @@ class SnowflakeViewParts(BaseModel):

  # Process all joins to build the mappings
  for part in [main_stream_view_part] + joined_parts:
- logger.debug(f"Processing joins for stream: {part.stream_name}")
+ joined_parts_names = [j.join_stream_name for j in part.joins]
+ logger.debug(f"Processing joins for stream: {part.stream_name} (joined streams: {joined_parts_names})")
  # Make sure the part's stream name is in the mappings
  if part.stream_name not in stream_to_aliases:
  stream_to_aliases[part.stream_name] = [part.stream_name]
@@ -807,19 +813,8 @@ class SnowflakeViewParts(BaseModel):
  # If we get here, no circular references were found
  logger.debug("No circular references found")

- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- prune_count = 0
- while prune(main_stream_view_part, joined_parts):
- prune_count += 1
- if prune_count > 10:
- raise ValueError("Pruning of columns from the view has entered an infinite loop")
+ # Prune columns using graph-based dependency resolution (single pass)
+ prune(main_stream_view_part, joined_parts)

  return cls(main_part=main_stream_view_part, joined_parts=joined_parts)
 
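Note on the change above: the old code re-ran prune() to a fixpoint (with a 10-iteration guard) because each removal could invalidate further columns; the new prune() needs only one call, since it propagates invalidity through the dependency graph before removing anything. A minimal sketch of that propagation step, using simplified (stream, column) keys — not the package's actual code:

    from typing import Dict, List, Set, Tuple

    Key = Tuple[str, str]  # (stream_name, column_name)

    def invalid_closure(graph: Dict[Key, List[Key]], directly_invalid: Set[Key]) -> Set[Key]:
        # Grow the invalid set until no remaining column depends on an invalid one.
        # Terminates without an arbitrary cap: each pass either adds a column
        # or stops, so it is bounded by the total column count.
        invalid = set(directly_invalid)
        changed = True
        while changed:
            changed = False
            for col, deps in graph.items():
                if col not in invalid and any(d in invalid for d in deps):
                    invalid.add(col)
                    changed = True
        return invalid

For example, with graph = {("S", "B"): [("S", "A")], ("S", "C"): [("S", "B")]} and ("S", "A") directly invalid, the closure marks B and then C, which is what lets the caller prune in a single pass.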
@@ -844,81 +839,183 @@ def find_part(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart

  def prune(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart]) -> bool:
  """
- Prunes columns from view parts that reference fields that don't exist in the referenced streams.
+ Prunes columns from view parts using graph-based dependency resolution.

- This function handles:
- 1. Direct dependencies - removing columns that directly reference non-existent columns
- 2. Transitive dependencies - removing columns that depend on columns that were removed
+ The approach:
+ 1. Build a complete dependency graph of all columns across all parts
+ 2. Identify "root" columns that must be kept (in main part or used in joins)
+ 3. Traverse dependencies to find all transitively required columns
+ 4. Remove columns that aren't needed

  Returns True if any columns were removed, False otherwise.
- Raises ValueError if a cyclic dependency is detected.
  """
- columns_removed = False
-
- # Helper function to check if a column should be kept or removed
- def should_keep_column(column: SnowflakeViewColumn, part: SnowflakeViewPart) -> bool:
- """
- Checks if a column should be kept based on its dependencies.
- Returns True if the column should be kept, False if it should be removed.
- """
- # If no references, keep the column
- if not column.referenced_columns:
- return True
+
+ all_parts = [view_part] + joined_parts
+
+ # Build column registry: (stream_name, column_name) -> column object
+ all_columns: Dict[Tuple[str, str], SnowflakeViewColumn] = {}
+ for part in all_parts:
+ for column in part.columns:
+ all_columns[(part.stream_name, column.original_name)] = column
+
+ # Build dependency graph for topological analysis
+ # Key: (stream, column), Value: list of (stream, column) dependencies
+ # Also track columns with invalid dependencies (reference non-existent columns)
+ dependency_graph: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
+ columns_with_invalid_deps: set[Tuple[str, str]] = set()
+
+ # First pass: build dependency graph and detect direct invalid references
+ for part in all_parts:
+ for column in part.columns:
+ key = (part.stream_name, column.original_name)
+ deps = []
+ has_invalid_dep = False

- # Check each referenced stream and its fields
- for ref_stream_name, ref_fields in column.referenced_columns.items():
- # Find the referenced part
- ref_part = find_part(view_part, joined_parts, ref_stream_name)
+ if column.referenced_columns:
+ for ref_stream_name, ref_fields in column.referenced_columns.items():
+ # Resolve stream alias to actual stream name
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+
+ for ref_field in ref_fields:
+ dep_key = (resolved_stream, ref_field)
+ if dep_key in all_columns:
+ deps.append(dep_key)
+ else:
+ logger.warning(
+ f"Column {column.original_name} in {part.stream_name} references "
+ f"{ref_field} in {resolved_stream}, which doesn't exist"
+ )
+ has_invalid_dep = True

- # If referenced stream doesn't exist, remove the column
- if ref_part is None:
- logger.warning(
- f"Column {column.name} in stream {part.stream_name} references stream "
- f"{ref_stream_name}, but it was not provided"
- )
- return False
-
- # Check each referenced field
- for ref_field in ref_fields:
- # Find the referenced column
- ref_column = next((c for c in ref_part.columns if c.original_name == ref_field), None)
+ dependency_graph[key] = deps
+ if has_invalid_dep:
+ columns_with_invalid_deps.add(key)
+
+ # Second pass: propagate invalidity to columns that depend on invalid columns
+ # Keep iterating until no new invalid columns are found
+ changed = True
+ while changed:
+ changed = False
+ for col_key, deps in dependency_graph.items():
+ if col_key not in columns_with_invalid_deps:
+ # Check if any dependency is invalid
+ for dep_key in deps:
+ if dep_key in columns_with_invalid_deps:
+ logger.warning(
+ f"Column {col_key[1]} in {col_key[0]} depends on "
+ f"{dep_key[1]} in {dep_key[0]}, which has invalid dependencies"
+ )
+ columns_with_invalid_deps.add(col_key)
+ changed = True
+ break
+
+ # Build alias to stream mapping
+ alias_to_stream: Dict[str, str] = {}
+ for part in all_parts:
+ alias_to_stream[part.stream_name] = part.stream_name
+ for join in part.joins:
+ alias_to_stream[join.join_stream_alias] = join.join_stream_name
+ # left_alias might be an alias for a joined stream, resolve it
+ if join.left_alias not in alias_to_stream:
+ # Try to find the stream for this alias
+ for other_part in all_parts:
+ if other_part.stream_name == join.left_alias:
+ alias_to_stream[join.left_alias] = other_part.stream_name
+ break
+
+ # Identify root columns that must be kept
+ needed_columns: set[Tuple[str, str]] = set()
+
+ # 1. All columns in the main part are needed (except those with invalid dependencies)
+ for column in view_part.columns:
+ col_key = (view_part.stream_name, column.original_name)
+ if col_key not in columns_with_invalid_deps:
+ needed_columns.add(col_key)
+
+ # 2. All columns used in join conditions are needed (except those with invalid dependencies)
+ for part in all_parts:
+ for join in part.joins:
+ # Resolve left_alias to actual stream name
+ left_stream = alias_to_stream.get(join.left_alias, join.left_alias)
+ left_key = (left_stream, join.left_column)
+ right_key = (join.join_stream_name, join.join_stream_column)
+ if left_key not in columns_with_invalid_deps:
+ needed_columns.add(left_key)
+ if right_key not in columns_with_invalid_deps:
+ needed_columns.add(right_key)
+
+ logger.debug(f"Identified {len(needed_columns)} root columns to keep (excluding {len(columns_with_invalid_deps)} with invalid deps)")
+
+ # 3. Find all transitive dependencies using recursive traversal
+ # Skip columns with invalid dependencies and their dependents
+ def collect_dependencies(col_key: Tuple[str, str], visited: set[Tuple[str, str]]) -> None:
+ """Recursively collect all columns that col_key depends on"""
+ if col_key in visited or col_key not in dependency_graph:
+ return
+ if col_key in columns_with_invalid_deps:
+ return # Don't traverse dependencies of invalid columns
+ visited.add(col_key)
+
+ for dep_key in dependency_graph[col_key]:
+ if dep_key in all_columns and dep_key not in columns_with_invalid_deps:
+ needed_columns.add(dep_key)
+ collect_dependencies(dep_key, visited)
+
+ visited_global: set[Tuple[str, str]] = set()
+ for root_col in list(needed_columns):
+ collect_dependencies(root_col, visited_global)
+
+ # Remove columns that are not needed
+ columns_removed = False
+ for part in all_parts:
+ original_count = len(part.columns)
+ removed_cols = [col for col in part.columns
+ if (part.stream_name, col.original_name) not in needed_columns]
+
+ # Log warnings for each removed column with the reason
+ for col in removed_cols:
+ # Determine why the column is being removed
+ col_key = (part.stream_name, col.original_name)
+ if col.referenced_columns:
+ # Check if any referenced columns don't exist
+ missing_refs = []
+ for ref_stream_name, ref_fields in col.referenced_columns.items():
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+ for ref_field in ref_fields:
+ if (resolved_stream, ref_field) not in all_columns:
+ missing_refs.append(f"{ref_field} in {resolved_stream}")

- # If referenced column doesn't exist, remove the column
- if ref_column is None:
+ if missing_refs:
  logger.warning(
- f"Column {column.name} in stream {part.stream_name} references field "
- f"{ref_field} in stream {ref_stream_name}, but it was not provided"
+ f"Removing column {col.original_name} from {part.stream_name} because it references "
+ f"non-existent column(s): {', '.join(missing_refs)}"
  )
- return False
-
- # All dependencies are satisfied
- return True
-
- # Process columns for removal
- for column in view_part.columns[:]: # Use a copy to allow safe removal
- if not should_keep_column(column, view_part):
- view_part.columns.remove(column)
+ else:
+ # Column is not needed (not referenced by main part)
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+ else:
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+
+ part.columns = [col for col in part.columns
+ if (part.stream_name, col.original_name) in needed_columns]
+
+ if removed_cols:
  columns_removed = True

- # Process joined parts
- for joined_part in joined_parts:
- # We have to avoid pruning columns that are referenced by joins to this stream.
- # first, we determine all aliases for this stream (multiple join paths back to the same stream are allowed)
- aliases_for_stream = [j.join_stream_alias for j in view_part.joins if j.join_stream_name == joined_part.stream_name]
- # now find all joins using this stream as the join stream
- columns_used_in_joins = [
- j.left_column for j in view_part.joins if j.left_alias in aliases_for_stream
- ]
- for column in joined_part.columns[:]: # Use a copy to allow safe removal
- # First check if the column is a join column
- if column.original_name in columns_used_in_joins:
- # If it's a join column, we need to keep it
- continue
-
- if not should_keep_column(column, joined_part):
- joined_part.columns.remove(column)
- columns_removed = True
-

  return columns_removed

  class JsonSchemaTopLevel(BaseModel):
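
End-to-end, the new prune() in this release builds a column registry and dependency graph, marks columns with missing references as invalid (propagating invalidity to their dependents), seeds a "needed" set from the main part's columns and join-key columns, collects the transitive dependencies of those roots, and drops everything else. A compressed, self-contained sketch of that flow, using hypothetical simplified types — real parts also carry joins, aliases, and SQL expressions that this omits:

    # Hedged sketch of the graph-based pruning flow, not the package's code.
    # Assumed simplification: no join aliases, and join-key roots are omitted.
    from dataclasses import dataclass, field
    from typing import Dict, List, Set, Tuple

    Key = Tuple[str, str]  # (stream_name, column_name)

    @dataclass
    class Column:
        name: str
        referenced: Dict[str, List[str]] = field(default_factory=dict)  # stream -> fields

    @dataclass
    class Part:
        stream_name: str
        columns: List[Column]

    def prune(main: Part, joined: List[Part]) -> bool:
        parts = [main] + joined
        registry: Set[Key] = {(p.stream_name, c.name) for p in parts for c in p.columns}
        graph: Dict[Key, List[Key]] = {}
        invalid: Set[Key] = set()
        for p in parts:
            for c in p.columns:
                deps = [(s, f) for s, fs in c.referenced.items() for f in fs]
                graph[(p.stream_name, c.name)] = [d for d in deps if d in registry]
                if any(d not in registry for d in deps):  # direct missing reference
                    invalid.add((p.stream_name, c.name))
        changed = True
        while changed:  # propagate invalidity transitively (bounded by column count)
            changed = False
            for k, ds in graph.items():
                if k not in invalid and any(d in invalid for d in ds):
                    invalid.add(k)
                    changed = True
        # Roots: every valid column of the main part must survive
        needed: Set[Key] = {(main.stream_name, c.name) for c in main.columns} - invalid
        stack = list(needed)
        while stack:  # collect transitive dependencies of the roots
            for d in graph.get(stack.pop(), []):
                if d not in invalid and d not in needed:
                    needed.add(d)
                    stack.append(d)
        removed = False
        for p in parts:
            kept = [c for c in p.columns if (p.stream_name, c.name) in needed]
            removed = removed or len(kept) != len(p.columns)
            p.columns = kept
        return removed

For example, if the main part's only column references ORDERS.ORDER_ID, a joined ORDERS part keeps ORDER_ID but loses an unreferenced ORDER_TOTAL, mirroring the "not referenced by the main part or any join conditions" debug message in the diff above.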