omnata-plugin-runtime 0.11.0a302__tar.gz → 0.11.7a324__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Note: this release of omnata-plugin-runtime has been flagged as potentially problematic.

@@ -1,12 +1,12 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: omnata-plugin-runtime
3
- Version: 0.11.0a302
3
+ Version: 0.11.7a324
4
4
  Summary: Classes and common runtime components for building and running Omnata Plugins
5
+ License-File: LICENSE
5
6
  Author: James Weakley
6
7
  Author-email: james.weakley@omnata.com
7
- Requires-Python: >=3.8,<=3.11
8
+ Requires-Python: >=3.9,<=3.11
8
9
  Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.8
10
10
  Classifier: Programming Language :: Python :: 3.9
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
@@ -1,13 +1,13 @@
1
1
  [tool.poetry]
2
2
  name = "omnata-plugin-runtime"
3
- version = "0.11.0-a302"
3
+ version = "0.11.7-a324"
4
4
  description = "Classes and common runtime components for building and running Omnata Plugins"
5
5
  authors = ["James Weakley <james.weakley@omnata.com>"]
6
6
  readme = "README.md"
7
7
  packages = [{include = "omnata_plugin_runtime", from = "src"}]
8
8
 
9
9
  [tool.poetry.dependencies]
10
- python = ">=3.8, <=3.11"
10
+ python = ">=3.9, <=3.11"
11
11
  snowflake-snowpark-python = ">=1.20.0,<=1.24.0" # latest version available on Snowflake Anaconda, but allow pinning to 1.20.0 for to_pandas_batches workaround
12
12
  snowflake-connector-python = "^3, <=3.12.0" # latest version available on Snowflake Anaconda
13
13
  cryptography = "<=43.0.0"
@@ -7,6 +7,7 @@ from typing import Any, Dict, Optional, Literal, List, Union, Tuple
7
7
  from typing_extensions import Self
8
8
  from pydantic import BaseModel, Field, model_validator, computed_field
9
9
  from jinja2 import Environment
10
+ from graphlib import TopologicalSorter
10
11
  from .logging import logger
11
12
 
12
13
  class JsonSchemaProperty(BaseModel):
@@ -271,57 +272,60 @@ class SnowflakeViewColumn(BaseModel):
271
272
  )
272
273
 
273
274
  @classmethod
274
- def order_by_reference(cls,current_stream_name:str,columns:List[Self]) -> List[Self]:
275
+ def order_by_reference(cls, current_stream_name: str, columns: List[Self]) -> List[Self]:
275
276
  """
276
- In some situations, column expressions may reference the alias of another column
277
- This is allowed in Snowflake, as long as the aliased column is defined before it's used in a later column
278
- So we need to sort the columns so that if the name of the column appears (in quotes) in the expression of another column, it is ordered first
277
+ Uses topological sorting to order columns so that if a column references another column,
278
+ the referenced column appears first in the list. This is required by Snowflake when
279
+ column expressions reference the alias of another column.
280
+
281
+ OMNATA_ system columns are always placed at the front of the result.
279
282
  """
280
283
  logger.debug(
281
284
  f"Ordering columns by reference for stream: {current_stream_name} ({len(columns)} columns)"
282
285
  )
283
- # Collect columns to be moved
284
- columns_to_move:List[Self] = []
285
- # Collect Omnata System columns and keep them at the front
286
- omnata_system_columns_start = []
287
- for column in columns[:]:
288
- if column.original_name.startswith("OMNATA_"):
289
- columns.remove(column)
290
- omnata_system_columns_start.append(column)
291
-
286
+
287
+ # Separate OMNATA system columns - they always go first
288
+ omnata_system_columns = []
289
+ regular_columns = []
292
290
  for column in columns:
293
- for other_column in columns:
294
- if column==other_column:
295
- continue
296
- if column.original_name in (other_column.referenced_columns or {}).get(current_stream_name,[]):
297
- if column not in columns_to_move:
298
- logger.debug(
299
- f"Column {column.original_name} references {other_column.original_name}, moving it to the front"
300
- )
301
- columns_to_move.append(column)
302
- # we need to do another pass just on columns_to_move, because they may reference each other
303
- # if any do, they go to the front, otherwise they are appended
304
- columns_to_move_final:List[Self] = []
305
- for column in columns_to_move:
306
- for other_column in columns_to_move:
307
- if column==other_column:
308
- continue
309
- if column.original_name in (other_column.referenced_columns or {}).get(current_stream_name,[]):
310
- if column not in columns_to_move_final:
311
- logger.debug(
312
- f"Second pass: Column {column.original_name} is referenced by {other_column.original_name}, moving it to the front"
313
- )
314
- columns_to_move_final.insert(0, column)
315
- continue
316
- if column not in columns_to_move_final:
317
- columns_to_move_final.append(column)
291
+ if column.original_name.startswith("OMNATA_"):
292
+ omnata_system_columns.append(column)
293
+ else:
294
+ regular_columns.append(column)
318
295
 
319
- # Move collected columns to the front
320
- columns_to_move_final.reverse()
321
- for column in columns_to_move_final:
322
- columns.remove(column)
323
- columns.insert(0, column)
324
- return omnata_system_columns_start + columns
296
+ # Build dependency graph: column_name -> list of columns it depends on
297
+ # (i.e., columns that must appear BEFORE it in the final order)
298
+ graph: Dict[str, List[str]] = {}
299
+ column_by_name: Dict[str, Self] = {}
300
+
301
+ for column in regular_columns:
302
+ column_by_name[column.original_name] = column
303
+ # Initialize with empty dependencies
304
+ graph[column.original_name] = []
305
+
306
+ # Add dependencies from referenced_columns
307
+ if column.referenced_columns:
308
+ referenced_in_current_stream = column.referenced_columns.get(current_stream_name, [])
309
+ for ref_col_name in referenced_in_current_stream:
310
+ # This column depends on ref_col_name, so ref_col_name must come first
311
+ graph[column.original_name].append(ref_col_name)
312
+ logger.debug(
313
+ f"Column {column.original_name} depends on {ref_col_name}"
314
+ )
315
+
316
+ # Use TopologicalSorter to sort the columns
317
+ try:
318
+ ts = TopologicalSorter(graph)
319
+ sorted_column_names = list(ts.static_order())
320
+ except ValueError as e:
321
+ # This would indicate a circular dependency
322
+ raise ValueError(f"Circular dependency detected in column references for stream {current_stream_name}: {e}")
323
+
324
+ # Reconstruct the column list in topological order
325
+ sorted_columns = [column_by_name[name] for name in sorted_column_names if name in column_by_name]
326
+
327
+ # Return OMNATA system columns first, followed by sorted regular columns
328
+ return omnata_system_columns + sorted_columns
325
329
 
326
330
 
327
331
  class SnowflakeViewJoin(BaseModel):
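For readers unfamiliar with graphlib, the rewritten ordering logic above reduces to a small standalone sketch. The column names below are invented for illustration; only the use of TopologicalSorter.static_order(), and the fact that CycleError is a subclass of ValueError (which is why the diff catches ValueError), carry over from the change.

    # Minimal sketch of dependency-ordered columns, using made-up column names.
    from graphlib import TopologicalSorter, CycleError

    # column -> columns it depends on (which must be defined earlier in the SELECT)
    graph = {
        "FULL_NAME": ["FIRST_NAME", "LAST_NAME"],  # built from the two plain columns
        "FIRST_NAME": [],
        "LAST_NAME": [],
        "GREETING": ["FULL_NAME"],                 # depends on a derived column
    }

    try:
        ordered = list(TopologicalSorter(graph).static_order())
    except CycleError as exc:  # CycleError subclasses ValueError
        raise ValueError(f"Circular dependency detected: {exc}")

    print(ordered)  # e.g. ['FIRST_NAME', 'LAST_NAME', 'FULL_NAME', 'GREETING']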
@@ -470,10 +474,20 @@ class SnowflakeViewPart(BaseModel):
470
474
  c.name_with_comment(binding_list) for c in self.columns
471
475
  ]
472
476
 
473
- def cte_text(self,original_name: bool = False, include_only_columns:Optional[List[str]] = None) -> str:
477
+ def cte_text(self,original_name: bool = False,
478
+ include_only_columns:Optional[List[str]] = None,
479
+ include_extra_columns:Optional[List[str]] = None
480
+ ) -> str:
474
481
  """
475
482
  Returns the CTE text for this view part.
476
483
  """
484
+ if include_extra_columns is not None:
485
+ # includes direct columns plus any extra specified
486
+ return f""" "{self.stream_name}" as (
487
+ select {', '.join([c.definition(original_name=original_name,remove_stream_prefix=self.stream_name) for c in self.columns
488
+ if c.original_name in include_extra_columns or not c.is_join_column])}
489
+ from {self.raw_table_location.get_fully_qualified_name()}
490
+ ) """
477
491
  if include_only_columns is None:
478
492
  return f""" "{self.stream_name}" as (
479
493
  select {', '.join([c.definition(original_name=original_name,remove_stream_prefix=self.stream_name) for c in self.direct_columns()])}
@@ -504,6 +518,30 @@ class SnowflakeViewParts(BaseModel):
504
518
  ..., description="The other streams that are joined to the main stream"
505
519
  )
506
520
 
521
+ def column_indirectly_references_other_streams(
522
+ self,
523
+ all_view_parts:List[SnowflakeViewPart],
524
+ stream_name:str,column_name:str) -> bool:
525
+
526
+ for part in all_view_parts:
527
+ if part.stream_name == stream_name:
528
+ for col in part.columns:
529
+ if col.original_name == column_name:
530
+ if col.referenced_columns:
531
+ for ref_stream, ref_cols in col.referenced_columns.items():
532
+ if ref_stream != stream_name:
533
+ return True
534
+ else:
535
+ # we have to call this recursively in case the referenced column also references other streams
536
+ result = any(
537
+ self.column_indirectly_references_other_streams(
538
+ all_view_parts, ref_stream, ref_col
539
+ ) for ref_col in ref_cols
540
+ )
541
+ if result:
542
+ return True
543
+ return False
544
+
507
545
  def view_body(self):
508
546
  """
509
547
  Creates a view definition from the parts.
@@ -519,31 +557,55 @@ class SnowflakeViewParts(BaseModel):
519
557
 
520
558
  # first, we need to collapse all referenced columns into a single map
521
559
  all_referenced_columns:Dict[str,List[str]] = {}
560
+
561
+ # if a column references other columns, but there are no dependencies outside of its own stream, we can include those columns in the initial CTE for that stream
562
+ # because they can be calculated directly without needing joins
563
+ columns_only_referencing_own_stream:Dict[str,List[str]] = {}
564
+
522
565
  for part in [self.main_part] + self.joined_parts:
523
- # if the main part references any columns in this part in its joins, we need to include those columns
566
+ # if the main part references any columns in this part in its joins, we need to include those columns because they are used in the join condition
524
567
  aliases_for_stream = [j.join_stream_alias for j in self.main_part.joins
525
568
  if j.join_stream_name == part.stream_name]
526
569
  columns_used_in_joins = [
527
570
  j.left_column for j in self.main_part.joins if j.left_alias in aliases_for_stream
528
571
  ]
529
- if part.stream_name not in all_referenced_columns:
530
- all_referenced_columns[part.stream_name] = []
531
- all_referenced_columns[part.stream_name] += columns_used_in_joins
572
+ all_referenced_columns.setdefault(part.stream_name, []).extend(columns_used_in_joins)
573
+ # now, for each column in the part, if it references columns in other streams, we need to include those columns
532
574
  for column in part.columns:
533
575
  if column.referenced_columns:
534
576
  for stream_name, referenced_columns in column.referenced_columns.items():
535
- if stream_name not in all_referenced_columns:
536
- all_referenced_columns[stream_name] = []
537
- all_referenced_columns[stream_name] += referenced_columns
577
+ aliases_for_referenced_stream = [j.join_stream_name for j in self.main_part.joins
578
+ if j.join_stream_alias == stream_name]
579
+ all_referenced_columns.setdefault(stream_name, []).extend(referenced_columns)
580
+ # the stream name could be an alias, so we need to check if it's one of the aliases for this part
581
+ for stream_name_for_alias in aliases_for_referenced_stream:
582
+ all_referenced_columns.setdefault(stream_name_for_alias, []).extend(referenced_columns)
583
+ # populate columns_only_referencing_own_stream by following the chain of references until we reach a column that references another stream or has no references
584
+ if self.column_indirectly_references_other_streams(
585
+ [self.main_part] + self.joined_parts, part.stream_name, column.original_name
586
+ ) == False:
587
+ columns_only_referencing_own_stream.setdefault(part.stream_name, []).append(column.original_name)
588
+ else:
589
+ # if the column has no references, it can be included in the initial CTE for its own stream
590
+ # but only if no columns in other streams reference it
591
+ referenced_by_other_columns = False
592
+ for other_column in part.columns:
593
+ if other_column==column:
594
+ continue
595
+ if other_column.referenced_columns:
596
+ for ref_stream, ref_cols in other_column.referenced_columns.items():
597
+ if ref_stream != part.stream_name and column.original_name in ref_cols:
598
+ referenced_by_other_columns = True
599
+ break
600
+ if not referenced_by_other_columns:
601
+ columns_only_referencing_own_stream.setdefault(part.stream_name, []).append(column.original_name)
602
+ # if this part has joins to other streams, we need to include the join columns
538
603
  for join in part.joins:
539
- if join.join_stream_name not in all_referenced_columns:
540
- all_referenced_columns[join.join_stream_name] = []
541
- all_referenced_columns[join.join_stream_name].append(join.join_stream_column)
542
- all_referenced_columns[part.stream_name].append(join.left_column)
543
-
544
-
604
+ all_referenced_columns.setdefault(join.join_stream_name, []).append(join.join_stream_column)
605
+ all_referenced_columns.setdefault(join.join_stream_alias, []).append(join.join_stream_column)
606
+ all_referenced_columns.setdefault(part.stream_name, []).append(join.left_column)
545
607
  ctes = [
546
- self.main_part.cte_text(original_name=True)
608
+ self.main_part.cte_text(original_name=True,include_extra_columns=columns_only_referencing_own_stream.get(self.main_part.stream_name))
547
609
  ] + [
548
610
  part.cte_text(original_name=True,include_only_columns=all_referenced_columns.get(part.stream_name))
549
611
  for part in joined_parts_deduped
@@ -553,9 +615,9 @@ class SnowflakeViewParts(BaseModel):
553
615
  final_cte = f""" OMNATA_FINAL_CTE as (
554
616
  select {', '.join(
555
617
  [
556
- f'"{self.main_part.stream_name}"."{c.original_name}"' for c in self.main_part.direct_columns()
618
+ f'"{self.main_part.stream_name}"."{c.original_name}"' for c in self.main_part.columns if not c.is_join_column or c.original_name in columns_only_referencing_own_stream.get(self.main_part.stream_name,[])
557
619
  ]+[
558
- c.definition(original_name=True) for c in self.main_part.join_columns()
620
+ c.definition(original_name=True) for c in self.main_part.columns if c.is_join_column and c.original_name not in columns_only_referencing_own_stream.get(self.main_part.stream_name,[])
559
621
  ])}
560
622
  from "{self.main_part.stream_name}" """
561
623
  if len(self.main_part.joins) > 0:
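Several of the changes in this hunk replace the repeated "if key not in dict: dict[key] = []" blocks with the setdefault idiom. A tiny self-contained example of the equivalence, with an invented stream and column names:

    # Both snippets accumulate referenced columns per stream; names are illustrative only.
    all_referenced_columns = {}

    # old style
    if "orders" not in all_referenced_columns:
        all_referenced_columns["orders"] = []
    all_referenced_columns["orders"] += ["ORDER_ID"]

    # new style used in the diff: setdefault returns the (possibly newly created) list in place
    all_referenced_columns.setdefault("orders", []).extend(["CUSTOMER_ID"])

    assert all_referenced_columns == {"orders": ["ORDER_ID", "CUSTOMER_ID"]}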
@@ -602,9 +664,12 @@ class SnowflakeViewParts(BaseModel):
602
664
  )
603
665
  joined_parts:List[SnowflakeViewPart] = []
604
666
  # remove the joins from the main part if they are not in the raw stream locations
667
+ original_join_count = len(main_stream_view_part.joins)
605
668
  main_stream_view_part.joins = [join for join in main_stream_view_part.joins
606
669
  if join.join_stream_name in raw_stream_locations
607
670
  and join.join_stream_name in stream_schemas]
671
+ if len(main_stream_view_part.joins) < original_join_count:
672
+ logger.debug(f"Removed {original_join_count - len(main_stream_view_part.joins)} joins from stream: {stream_name} due to missing raw stream locations or schemas")
608
673
 
609
674
  for join in main_stream_view_part.joins:
610
675
  logger.debug(f"Generating view parts for join stream: {join.join_stream_name}")
@@ -617,6 +682,8 @@ class SnowflakeViewParts(BaseModel):
617
682
  column_name_expression=column_name_expression,
618
683
  plugin_app_database=plugin_app_database
619
684
  ))
685
+ if len(main_stream_view_part.joins) == 0:
686
+ logger.debug(f"No joins found for stream: {stream_name}")
620
687
  # For each column, the plugin can advise which fields (of the same stream or joined) are required for the join, which comes through as referenced_columns
621
688
  # on the SnowflakeViewColumn object.
622
689
  # Until this generate function is called with the raw stream names, we don't know which streams the user has actually selected, nor which
@@ -635,7 +702,8 @@ class SnowflakeViewParts(BaseModel):
635
702
 
636
703
  # Process all joins to build the mappings
637
704
  for part in [main_stream_view_part] + joined_parts:
638
- logger.debug(f"Processing joins for stream: {part.stream_name}")
705
+ joined_parts_names = [j.join_stream_name for j in part.joins]
706
+ logger.debug(f"Processing joins for stream: {part.stream_name} (joined streams: {joined_parts_names})")
639
707
  # Make sure the part's stream name is in the mappings
640
708
  if part.stream_name not in stream_to_aliases:
641
709
  stream_to_aliases[part.stream_name] = [part.stream_name]
@@ -745,105 +813,209 @@ class SnowflakeViewParts(BaseModel):
745
813
  # If we get here, no circular references were found
746
814
  logger.debug("No circular references found")
747
815
 
748
- # Now proceed with the actual pruning process
749
- # First, removing unavailable columns from other streams
750
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
751
-
752
- # Now proceed with the actual pruning process
753
- # First, removing unavailable columns from other streams
754
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
755
-
756
- prune_count = 0
757
- while prune(main_stream_view_part, joined_parts):
758
- prune_count += 1
759
- if prune_count > 10:
760
- raise ValueError("Pruning of columns from the view has entered an infinite loop")
816
+ # Prune columns using graph-based dependency resolution (single pass)
817
+ prune(main_stream_view_part, joined_parts)
761
818
 
762
819
  return cls(main_part=main_stream_view_part, joined_parts=joined_parts)
763
820
 
821
+
822
+ # Helper function to find a view part by stream name
823
+ def find_part(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart], stream_name: str) -> Optional[SnowflakeViewPart]:
824
+ if stream_name == view_part.stream_name:
825
+ return view_part
826
+ for part in joined_parts:
827
+ if part.stream_name == stream_name:
828
+ return part
829
+ for join in view_part.joins:
830
+ if join.join_stream_alias == stream_name:
831
+ # this is the join, we need to find the actual stream
832
+ for part in joined_parts:
833
+ if part.stream_name == join.join_stream_name:
834
+ return part
835
+ logger.warning(
836
+ f"Join alias {stream_name} maps to stream {join.join_stream_name}, but that stream is not in the joined parts"
837
+ )
838
+ return None
839
+
764
840
  def prune(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart]) -> bool:
765
841
  """
766
- Prunes columns from view parts that reference fields that don't exist in the referenced streams.
842
+ Prunes columns from view parts using graph-based dependency resolution.
767
843
 
768
- This function handles:
769
- 1. Direct dependencies - removing columns that directly reference non-existent columns
770
- 2. Transitive dependencies - removing columns that depend on columns that were removed
844
+ Uses TopologicalSorter to:
845
+ 1. Build a complete dependency graph of all columns across all parts
846
+ 2. Identify "root" columns that must be kept (in main part or used in joins)
847
+ 3. Traverse dependencies to find all transitively required columns
848
+ 4. Remove columns that aren't needed
771
849
 
772
850
  Returns True if any columns were removed, False otherwise.
773
- Raises ValueError if a cyclic dependency is detected.
774
851
  """
775
- columns_removed = False
776
852
 
777
- # Helper function to find a view part by stream name
778
- def find_part(stream_name: str) -> Optional[SnowflakeViewPart]:
779
- if stream_name == view_part.stream_name:
780
- return view_part
781
- return next((p for p in joined_parts if p.stream_name == stream_name), None)
782
-
783
- # Helper function to check if a column should be kept or removed
784
- def should_keep_column(column: SnowflakeViewColumn, part: SnowflakeViewPart) -> bool:
785
- """
786
- Checks if a column should be kept based on its dependencies.
787
- Returns True if the column should be kept, False if it should be removed.
788
- """
789
- # If no references, keep the column
790
- if not column.referenced_columns:
791
- return True
853
+ all_parts = [view_part] + joined_parts
854
+
855
+ # Build column registry: (stream_name, column_name) -> column object
856
+ all_columns: Dict[Tuple[str, str], SnowflakeViewColumn] = {}
857
+ for part in all_parts:
858
+ for column in part.columns:
859
+ all_columns[(part.stream_name, column.original_name)] = column
860
+
861
+ # Build dependency graph for topological analysis
862
+ # Key: (stream, column), Value: list of (stream, column) dependencies
863
+ # Also track columns with invalid dependencies (reference non-existent columns)
864
+ dependency_graph: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
865
+ columns_with_invalid_deps: set[Tuple[str, str]] = set()
866
+
867
+ # First pass: build dependency graph and detect direct invalid references
868
+ for part in all_parts:
869
+ for column in part.columns:
870
+ key = (part.stream_name, column.original_name)
871
+ deps = []
872
+ has_invalid_dep = False
792
873
 
793
- # Check each referenced stream and its fields
794
- for ref_stream_name, ref_fields in column.referenced_columns.items():
795
- # Find the referenced part
796
- ref_part = find_part(ref_stream_name)
874
+ if column.referenced_columns:
875
+ for ref_stream_name, ref_fields in column.referenced_columns.items():
876
+ # Resolve stream alias to actual stream name
877
+ resolved_stream = ref_stream_name
878
+ for join in view_part.joins:
879
+ if join.join_stream_alias == ref_stream_name:
880
+ resolved_stream = join.join_stream_name
881
+ break
882
+
883
+ for ref_field in ref_fields:
884
+ dep_key = (resolved_stream, ref_field)
885
+ if dep_key in all_columns:
886
+ deps.append(dep_key)
887
+ else:
888
+ logger.warning(
889
+ f"Column {column.original_name} in {part.stream_name} references "
890
+ f"{ref_field} in {resolved_stream}, which doesn't exist"
891
+ )
892
+ has_invalid_dep = True
797
893
 
798
- # If referenced stream doesn't exist, remove the column
799
- if ref_part is None:
800
- logger.warning(
801
- f"Column {column.name} in stream {part.stream_name} references stream "
802
- f"{ref_stream_name}, but it was not provided"
803
- )
804
- return False
805
-
806
- # Check each referenced field
807
- for ref_field in ref_fields:
808
- # Find the referenced column
809
- ref_column = next((c for c in ref_part.columns if c.original_name == ref_field), None)
894
+ dependency_graph[key] = deps
895
+ if has_invalid_dep:
896
+ columns_with_invalid_deps.add(key)
897
+
898
+ # Second pass: propagate invalidity to columns that depend on invalid columns
899
+ # Keep iterating until no new invalid columns are found
900
+ changed = True
901
+ while changed:
902
+ changed = False
903
+ for col_key, deps in dependency_graph.items():
904
+ if col_key not in columns_with_invalid_deps:
905
+ # Check if any dependency is invalid
906
+ for dep_key in deps:
907
+ if dep_key in columns_with_invalid_deps:
908
+ logger.warning(
909
+ f"Column {col_key[1]} in {col_key[0]} depends on "
910
+ f"{dep_key[1]} in {dep_key[0]}, which has invalid dependencies"
911
+ )
912
+ columns_with_invalid_deps.add(col_key)
913
+ changed = True
914
+ break
915
+
916
+ # Build alias to stream mapping
917
+ alias_to_stream: Dict[str, str] = {}
918
+ for part in all_parts:
919
+ alias_to_stream[part.stream_name] = part.stream_name
920
+ for join in part.joins:
921
+ alias_to_stream[join.join_stream_alias] = join.join_stream_name
922
+ # left_alias might be an alias for a joined stream, resolve it
923
+ if join.left_alias not in alias_to_stream:
924
+ # Try to find the stream for this alias
925
+ for other_part in all_parts:
926
+ if other_part.stream_name == join.left_alias:
927
+ alias_to_stream[join.left_alias] = other_part.stream_name
928
+ break
929
+
930
+ # Identify root columns that must be kept
931
+ needed_columns: set[Tuple[str, str]] = set()
932
+
933
+ # 1. All columns in the main part are needed (except those with invalid dependencies)
934
+ for column in view_part.columns:
935
+ col_key = (view_part.stream_name, column.original_name)
936
+ if col_key not in columns_with_invalid_deps:
937
+ needed_columns.add(col_key)
938
+
939
+ # 2. All columns used in join conditions are needed (except those with invalid dependencies)
940
+ for part in all_parts:
941
+ for join in part.joins:
942
+ # Resolve left_alias to actual stream name
943
+ left_stream = alias_to_stream.get(join.left_alias, join.left_alias)
944
+ left_key = (left_stream, join.left_column)
945
+ right_key = (join.join_stream_name, join.join_stream_column)
946
+ if left_key not in columns_with_invalid_deps:
947
+ needed_columns.add(left_key)
948
+ if right_key not in columns_with_invalid_deps:
949
+ needed_columns.add(right_key)
950
+
951
+ logger.debug(f"Identified {len(needed_columns)} root columns to keep (excluding {len(columns_with_invalid_deps)} with invalid deps)")
952
+
953
+ # 3. Find all transitive dependencies using recursive traversal
954
+ # Skip columns with invalid dependencies and their dependents
955
+ def collect_dependencies(col_key: Tuple[str, str], visited: set[Tuple[str, str]]) -> None:
956
+ """Recursively collect all columns that col_key depends on"""
957
+ if col_key in visited or col_key not in dependency_graph:
958
+ return
959
+ if col_key in columns_with_invalid_deps:
960
+ return # Don't traverse dependencies of invalid columns
961
+ visited.add(col_key)
962
+
963
+ for dep_key in dependency_graph[col_key]:
964
+ if dep_key in all_columns and dep_key not in columns_with_invalid_deps:
965
+ needed_columns.add(dep_key)
966
+ collect_dependencies(dep_key, visited)
967
+
968
+ visited_global: set[Tuple[str, str]] = set()
969
+ for root_col in list(needed_columns):
970
+ collect_dependencies(root_col, visited_global)
971
+
972
+ # Remove columns that are not needed
973
+ columns_removed = False
974
+ for part in all_parts:
975
+ original_count = len(part.columns)
976
+ removed_cols = [col for col in part.columns
977
+ if (part.stream_name, col.original_name) not in needed_columns]
978
+
979
+ # Log warnings for each removed column with the reason
980
+ for col in removed_cols:
981
+ # Determine why the column is being removed
982
+ col_key = (part.stream_name, col.original_name)
983
+ if col.referenced_columns:
984
+ # Check if any referenced columns don't exist
985
+ missing_refs = []
986
+ for ref_stream_name, ref_fields in col.referenced_columns.items():
987
+ resolved_stream = ref_stream_name
988
+ for join in view_part.joins:
989
+ if join.join_stream_alias == ref_stream_name:
990
+ resolved_stream = join.join_stream_name
991
+ break
992
+ for ref_field in ref_fields:
993
+ if (resolved_stream, ref_field) not in all_columns:
994
+ missing_refs.append(f"{ref_field} in {resolved_stream}")
810
995
 
811
- # If referenced column doesn't exist, remove the column
812
- if ref_column is None:
996
+ if missing_refs:
813
997
  logger.warning(
814
- f"Column {column.name} in stream {part.stream_name} references field "
815
- f"{ref_field} in stream {ref_stream_name}, but it was not provided"
998
+ f"Removing column {col.original_name} from {part.stream_name} because it references "
999
+ f"non-existent column(s): {', '.join(missing_refs)}"
816
1000
  )
817
- return False
818
-
819
- # All dependencies are satisfied
820
- return True
821
-
822
- # Process columns for removal
823
- for column in view_part.columns[:]: # Use a copy to allow safe removal
824
- if not should_keep_column(column, view_part):
825
- view_part.columns.remove(column)
1001
+ else:
1002
+ # Column is not needed (not referenced by main part)
1003
+ logger.debug(
1004
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
1005
+ f"referenced by the main part or any join conditions"
1006
+ )
1007
+ else:
1008
+ logger.debug(
1009
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
1010
+ f"referenced by the main part or any join conditions"
1011
+ )
1012
+
1013
+ part.columns = [col for col in part.columns
1014
+ if (part.stream_name, col.original_name) in needed_columns]
1015
+
1016
+ if removed_cols:
826
1017
  columns_removed = True
827
1018
 
828
- # Process joined parts
829
- for joined_part in joined_parts:
830
- # We have to avoid pruning columns that are referenced by joins to this stream.
831
- # first, we determine all aliases for this stream (multiple join paths back to the same stream are allowed)
832
- aliases_for_stream = [j.join_stream_alias for j in view_part.joins if j.join_stream_name == joined_part.stream_name]
833
- # now find all joins using this stream as the join stream
834
- columns_used_in_joins = [
835
- j.left_column for j in view_part.joins if j.left_alias in aliases_for_stream
836
- ]
837
- for column in joined_part.columns[:]: # Use a copy to allow safe removal
838
- # First check if the column is a join column
839
- if column.original_name in columns_used_in_joins:
840
- # If it's a join column, we need to keep it
841
- continue
842
-
843
- if not should_keep_column(column, joined_part):
844
- joined_part.columns.remove(column)
845
- columns_removed = True
846
-
847
1019
  return columns_removed
848
1020
 
849
1021
  class JsonSchemaTopLevel(BaseModel):
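Stripped of the Omnata-specific types, the new pruning approach is: index every column by (stream, column), record its dependencies, seed a "needed" set with the main-part columns and the columns used in join conditions, walk dependencies transitively, then drop everything unreachable. A minimal sketch with invented stream and column names (the real implementation above additionally tracks columns with invalid references and resolves stream aliases):

    # Minimal sketch of keep-set pruning over (stream, column) keys; all names are illustrative.
    from typing import Dict, List, Set, Tuple

    Key = Tuple[str, str]  # (stream_name, column_name)

    # column -> columns it depends on
    dependency_graph: Dict[Key, List[Key]] = {
        ("main", "TOTAL"): [("orders", "AMOUNT")],
        ("orders", "AMOUNT"): [],
        ("orders", "UNUSED"): [],
    }

    # roots: everything exposed by the main part plus columns used in join conditions
    needed: Set[Key] = {("main", "TOTAL"), ("orders", "AMOUNT")}

    def collect(key: Key, visited: Set[Key]) -> None:
        """Recursively pull in everything a needed column depends on."""
        if key in visited or key not in dependency_graph:
            return
        visited.add(key)
        for dep in dependency_graph[key]:
            needed.add(dep)
            collect(dep, visited)

    visited: Set[Key] = set()
    for root in list(needed):
        collect(root, visited)

    # anything not reachable from a root is pruned
    pruned = [k for k in dependency_graph if k not in needed]
    print(pruned)  # [('orders', 'UNUSED')]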
@@ -9,9 +9,10 @@ from typing import Dict, List, Optional
9
9
  from snowflake.snowpark import Session
10
10
  from pydantic import ValidationError
11
11
  from snowflake import telemetry
12
- from opentelemetry import trace
12
+ from opentelemetry import trace, metrics
13
13
 
14
14
  tracer = trace.get_tracer('omnata_plugin_runtime')
15
+ meter = metrics.get_meter('omnata_plugin_runtime')
15
16
 
16
17
  class CustomLoggerAdapter(logging.LoggerAdapter):
17
18
  """
@@ -48,7 +48,7 @@ from snowflake.snowpark import Session
48
48
  from snowflake.snowpark.functions import col
49
49
  from tenacity import Retrying, stop_after_attempt, wait_fixed, retry_if_exception_message
50
50
 
51
- from .logging import OmnataPluginLogHandler, logger, tracer
51
+ from .logging import OmnataPluginLogHandler, logger, tracer, meter
52
52
  from opentelemetry import context
53
53
  import math
54
54
  import numpy as np
@@ -1185,41 +1185,52 @@ class InboundSyncRequest(SyncRequest):
1185
1185
  query_id: Optional[str] = None
1186
1186
  ) -> str:
1187
1187
  binding_values = []
1188
- values_clauses = []
1188
+ select_clauses = []
1189
1189
 
1190
1190
  with self._snowflake_query_lock:
1191
1191
  if query_id is None:
1192
1192
  query_id = self._get_query_id_for_now()
1193
1193
  for stream_name, latest_state in stream_states_for_upload.items():
1194
1194
  binding_values.extend([stream_name, query_id, json.dumps(latest_state)])
1195
- values_clauses.append(
1196
- f"(?, ?, PARSE_JSON(?))"
1195
+ select_clauses.append(
1196
+ f"select ?, ?, PARSE_JSON(?)"
1197
1197
  )
1198
1198
  final_query = f"""INSERT INTO {self.state_register_table_name} (STREAM_NAME, QUERY_ID, STATE_VALUE)
1199
- VALUES {','.join(values_clauses)}"""
1199
+ {' union all '.join(select_clauses)}"""
1200
1200
  self._session.sql(final_query, binding_values).collect()
1201
+ streams_included = list(stream_states_for_upload.keys())
1202
+ logger.debug(f"Inserted state for streams: {streams_included} with query ID {query_id}")
1201
1203
 
1202
1204
  def apply_progress_updates(self, ignore_errors:bool = True):
1203
1205
  """
1204
1206
  Sends a message to the plugin with the current progress of the sync run, if it has changed since last time.
1205
1207
  """
1206
- if self._apply_results is not None:
1207
- with self._apply_results_lock:
1208
- new_progress_update = PluginMessageStreamProgressUpdate(
1209
- stream_total_counts=self._stream_record_counts,
1210
- # records could have been marked as completed, but still have results to apply
1211
- completed_streams=[s for s in self._completed_streams if s not in self._apply_results or self._apply_results[s] is None],
1212
- stream_errors=self._omnata_log_handler.stream_global_errors,
1213
- total_records_estimate=self._total_records_estimate
1214
- )
1215
- if self._last_stream_progress_update is None or new_progress_update != self._last_stream_progress_update:
1216
- result = self._plugin_message(
1217
- message=new_progress_update,
1218
- ignore_errors=ignore_errors
1208
+ with self._apply_results_lock:
1209
+ new_progress_update = PluginMessageStreamProgressUpdate(
1210
+ stream_total_counts=self._stream_record_counts,
1211
+ # records could have been marked as completed, but still have results to apply
1212
+ completed_streams=[s for s in self._completed_streams
1213
+ if s not in self._apply_results
1214
+ or self._apply_results[s] is None
1215
+ or len(self._apply_results[s]) == 0],
1216
+ stream_errors=self._omnata_log_handler.stream_global_errors,
1217
+ total_records_estimate=self._total_records_estimate
1219
1218
  )
1220
- if result is None:
1221
- return False
1222
- self._last_stream_progress_update = new_progress_update
1219
+ if self._last_stream_progress_update is None or new_progress_update != self._last_stream_progress_update:
1220
+ result = self._plugin_message(
1221
+ message=new_progress_update,
1222
+ ignore_errors=ignore_errors
1223
+ )
1224
+ if result is None:
1225
+ return False
1226
+ self._last_stream_progress_update = new_progress_update
1227
+ completed_streams_awaiting_results_upload = [
1228
+ s for s in self._completed_streams if s in self._apply_results and self._apply_results[s] is not None
1229
+ ]
1230
+ if len(completed_streams_awaiting_results_upload) > 0:
1231
+ logger.debug(
1232
+ f"Streams marked as completed but awaiting upload: {', '.join(completed_streams_awaiting_results_upload)}"
1233
+ )
1223
1234
  return True
1224
1235
 
1225
1236
  def apply_cancellation(self):
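The state upload switches from a multi-row VALUES clause to a chain of SELECTs joined with UNION ALL, presumably because expressions such as PARSE_JSON are not accepted inside a multi-row VALUES list. A sketch of the query text each variant produces for two streams, with an illustrative table name and stream names:

    # Illustrative reconstruction of the generated SQL for two streams; names are made up.
    streams = ["accounts", "contacts"]
    table = "PLUGIN_DB.STATE_REGISTER"

    # old form: one placeholder triple per stream inside a single VALUES clause
    values_clauses = ["(?, ?, PARSE_JSON(?))" for _ in streams]
    old_query = (
        f"INSERT INTO {table} (STREAM_NAME, QUERY_ID, STATE_VALUE) "
        f"VALUES {','.join(values_clauses)}"
    )

    # new form: one SELECT per stream, combined with UNION ALL
    select_clauses = ["select ?, ?, PARSE_JSON(?)" for _ in streams]
    new_query = (
        f"INSERT INTO {table} (STREAM_NAME, QUERY_ID, STATE_VALUE) "
        f"{' union all '.join(select_clauses)}"
    )

    print(old_query)
    print(new_query)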
@@ -1286,7 +1297,7 @@ class InboundSyncRequest(SyncRequest):
1286
1297
  # if the total exceeds 200MB, we apply the results immediately
1287
1298
  all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results.values())
1288
1299
  # flatten
1289
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
1300
+ all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist if isinstance(x, pandas.DataFrame)]
1290
1301
  combined_length = sum([len(x) for x in all_dfs])
1291
1302
  # first, don't bother if the count is less than 10000, since it's unlikely to be even close
1292
1303
  if combined_length > 10000:
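The new isinstance guard (applied again to the memory-size check in the next hunk) appears to defend against non-DataFrame placeholders such as None sitting in the results queue, which would otherwise raise when len() or memory_usage() is called. A tiny sketch of the filtering, with made-up queue contents:

    # Sketch of defensively flattening queued results; only DataFrame entries are counted.
    import pandas

    all_df_lists = [[pandas.DataFrame({"a": range(3)})], [None]]  # entries may hold None
    all_dfs = [x for sublist in all_df_lists for x in sublist if isinstance(x, pandas.DataFrame)]
    combined_length = sum(len(x) for x in all_dfs)
    print(combined_length)  # 3 -- the None entry is ignored rather than raising a TypeError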
@@ -1336,7 +1347,7 @@ class InboundSyncRequest(SyncRequest):
1336
1347
  combined_length = sum([len(x) for x in all_dfs])
1337
1348
  # first, don't both if the count is less than 10000, since it's unlikely to be even close
1338
1349
  if combined_length > 10000:
1339
- if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
1350
+ if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
1340
1351
  logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
1341
1352
  self.apply_results_queue()
1342
1353
 
@@ -1345,9 +1356,11 @@ class InboundSyncRequest(SyncRequest):
1345
1356
  Marks a stream as completed, this is called automatically per stream when using @managed_inbound_processing.
1346
1357
  If @managed_inbound_processing is not used, call this whenever a stream has finished recieving records.
1347
1358
  """
1348
- self._completed_streams.append(stream_name)
1349
- # dedup just in case it's called twice
1350
- self._completed_streams = list(set(self._completed_streams))
1359
+ logger.debug(f"Marking stream {stream_name} as completed locally")
1360
+ with self._apply_results_lock:
1361
+ self._completed_streams.append(stream_name)
1362
+ # dedup just in case it's called twice
1363
+ self._completed_streams = list(set(self._completed_streams))
1351
1364
 
1352
1365
  def set_stream_record_count(self, stream_name: str, count: int):
1353
1366
  """
@@ -1845,6 +1858,40 @@ class OmnataPlugin(ABC):
1845
1858
  raise NotImplementedError(
1846
1859
  "Your plugin class must implement the inbound_configuration_form method"
1847
1860
  )
1861
+
1862
+ def outbound_tuning_parameters(
1863
+ self, parameters: OutboundSyncConfigurationParameters
1864
+ ) -> OutboundSyncConfigurationForm:
1865
+ """
1866
+ Returns the form definition for declaring outbound tuning parameters.
1867
+
1868
+ The returned form should consist of static fields with default values that represent the
1869
+ plugin's recommended runtime behaviour. This form is optional and is only rendered when a
1870
+ user opts to override those defaults at sync runtime, so it must be safe to fall back to the
1871
+ provided defaults when no tuning parameters are configured.
1872
+
1873
+ :param OutboundSyncConfigurationParameters parameters the current outbound configuration
1874
+ :return: An OutboundSyncConfigurationForm describing the available tuning parameters
1875
+ :rtype: OutboundSyncConfigurationForm
1876
+ """
1877
+ return OutboundSyncConfigurationForm(fields=[])
1878
+
1879
+ def inbound_tuning_parameters(
1880
+ self, parameters: InboundSyncConfigurationParameters
1881
+ ) -> InboundSyncConfigurationForm:
1882
+ """
1883
+ Returns the form definition for declaring inbound tuning parameters.
1884
+
1885
+ The returned form should consist of static fields with default values that represent the
1886
+ plugin's recommended runtime behaviour. This form is optional and is only rendered when a
1887
+ user opts to override those defaults at sync runtime, so it must be safe to fall back to the
1888
+ provided defaults when no tuning parameters are configured.
1889
+
1890
+ :param InboundSyncConfigurationParameters parameters the current inbound configuration
1891
+ :return: An InboundSyncConfigurationForm describing the available tuning parameters
1892
+ :rtype: InboundSyncConfigurationForm
1893
+ """
1894
+ return InboundSyncConfigurationForm(fields=[])
1848
1895
 
1849
1896
  def inbound_stream_list(
1850
1897
  self, parameters: InboundSyncConfigurationParameters
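The new tuning-parameter hooks default to returning an empty form, so existing plugins keep working unchanged. A hypothetical override is sketched below; the module paths and the empty fields list are assumptions, and only the method name, parameter type and return type come from the diff above:

    # Hypothetical plugin override; import paths and field contents are assumptions.
    from omnata_plugin_runtime.omnata_plugin import OmnataPlugin
    from omnata_plugin_runtime.forms import InboundSyncConfigurationForm
    from omnata_plugin_runtime.configuration import InboundSyncConfigurationParameters

    class MyPlugin(OmnataPlugin):
        def inbound_tuning_parameters(
            self, parameters: InboundSyncConfigurationParameters
        ) -> InboundSyncConfigurationForm:
            # Static fields with recommended default values (e.g. page size, concurrency)
            # would be declared here; an empty form means "no overridable tuning".
            return InboundSyncConfigurationForm(fields=[])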
@@ -2283,6 +2330,15 @@ def __managed_inbound_processing_worker(
2283
2330
  try:
2284
2331
  stream: StoredStreamConfiguration = streams_queue.get_nowait()
2285
2332
  logger.debug(f"stream returned from queue: {stream}")
2333
+ sync_request: InboundSyncRequest = cast(
2334
+ InboundSyncRequest, plugin_class_obj._sync_request
2335
+ ) # pylint: disable=protected-access
2336
+ stream_duration_gauge = meter.create_gauge(
2337
+ name="omnata.sync_run.stream_duration",
2338
+ description="The duration of stream processing",
2339
+ unit="s",
2340
+ )
2341
+ start_time = time.time()
2286
2342
  # restore the first argument, was originally the dataframe/generator but now it's the appropriately sized dataframe
2287
2343
  try:
2288
2344
  with tracer.start_as_current_span("managed_inbound_processing") as managed_inbound_processing_span:
@@ -2294,7 +2350,7 @@ def __managed_inbound_processing_worker(
2294
2350
  logger.info(f"worker {worker_index} requested that {stream.stream_name} be not marked as complete")
2295
2351
  else:
2296
2352
  logger.info(f"worker {worker_index} marking stream {stream.stream_name} as complete")
2297
- plugin_class_obj._sync_request.mark_stream_complete(stream.stream_name)
2353
+ sync_request.mark_stream_complete(stream.stream_name)
2298
2354
  except InterruptedWhileWaitingException:
2299
2355
  # If an inbound run is cancelled while waiting for rate limiting, this should mean that
2300
2356
  # the cancellation is handled elsewhere, so we don't need to do anything special here other than stop waiting
@@ -2314,6 +2370,19 @@ def __managed_inbound_processing_worker(
2314
2370
  omnata_plugin_logger.error(f"{type(e).__name__} syncing stream {stream.stream_name}",
2315
2371
  exc_info=True,
2316
2372
  extra={'stream_name':stream.stream_name})
2373
+ finally:
2374
+ duration = time.time() - start_time
2375
+ stream_duration_gauge.set(
2376
+ amount=duration,
2377
+ attributes={
2378
+ "stream_name": stream.stream_name,
2379
+ "sync_run_id": str(sync_request._run_id),
2380
+ "sync_id": str(sync_request._sync_id),
2381
+ "branch_name": str(sync_request._branch_name) if sync_request._branch_name is not None else 'main',
2382
+ "sync_direction": "inbound",
2383
+ "plugin_id": plugin_class_obj.get_manifest().plugin_id,
2384
+ },
2385
+ )
2317
2386
  except queue.Empty:
2318
2387
  logger.debug("streams queue is empty")
2319
2388
  return
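Taken together, the three worker hunks wrap stream processing with a wall-clock timer and record the duration on the gauge in a finally block, so the measurement is emitted even when the stream fails. The essential pattern, with an invented processing function and illustrative attribute values:

    # Minimal sketch of the record-duration-in-finally pattern; attribute values are illustrative.
    import time
    from opentelemetry import metrics

    meter = metrics.get_meter("omnata_plugin_runtime")
    stream_duration_gauge = meter.create_gauge(
        name="omnata.sync_run.stream_duration",
        description="The duration of stream processing",
        unit="s",
    )

    def process_stream(stream_name: str) -> None:
        start_time = time.time()
        try:
            ...  # fetch and apply records for the stream
        finally:
            stream_duration_gauge.set(
                amount=time.time() - start_time,
                attributes={"stream_name": stream_name, "sync_direction": "inbound"},
            )

    process_stream("accounts")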
@@ -282,6 +282,11 @@ class PluginEntrypoint:
282
282
  # token is set. We throw it here as an error since that's currently how it flows back to the engine with a DELAYED state
283
283
  raise DeadlineReachedException()
284
284
  finally:
285
+ # try to upload any remaining results
286
+ try:
287
+ inbound_sync_request.apply_results_queue()
288
+ except Exception as e:
289
+ logger.warning(f"Error uploading remaining results: {str(e)}", exc_info=True)
285
290
  # cancel the thread so we don't leave anything hanging around and cop a nasty error
286
291
  try:
287
292
  inbound_sync_request._thread_cancellation_token.set() # pylint: disable=protected-access
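The new finally clause attempts one last flush of any queued results before the cancellation token is set, and deliberately downgrades failures to a warning so the remaining cleanup still runs. The shape of that best-effort pattern in isolation; the surrounding function is invented for illustration, only the apply_results_queue call and the warning come from the diff:

    # Sketch of a best-effort flush inside a finally block.
    import logging

    logger = logging.getLogger("omnata_plugin_runtime")

    def finish_run(inbound_sync_request) -> None:
        try:
            ...  # main sync work happens here
        finally:
            # try to upload any remaining results, but never let this mask the real outcome
            try:
                inbound_sync_request.apply_results_queue()
            except Exception as e:
                logger.warning(f"Error uploading remaining results: {str(e)}", exc_info=True)
            # further cleanup (e.g. setting the cancellation token) continues after this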