omnata-plugin-runtime 0.11.4__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to a supported registry, and is provided for informational purposes only.
@@ -664,9 +664,12 @@ class SnowflakeViewParts(BaseModel):
  )
  joined_parts:List[SnowflakeViewPart] = []
  # remove the joins from the main part if they are not in the raw stream locations
+ original_join_count = len(main_stream_view_part.joins)
  main_stream_view_part.joins = [join for join in main_stream_view_part.joins
  if join.join_stream_name in raw_stream_locations
  and join.join_stream_name in stream_schemas]
+ if len(main_stream_view_part.joins) < original_join_count:
+ logger.debug(f"Removed {original_join_count - len(main_stream_view_part.joins)} joins from stream: {stream_name} due to missing raw stream locations or schemas")

  for join in main_stream_view_part.joins:
  logger.debug(f"Generating view parts for join stream: {join.join_stream_name}")
@@ -679,6 +682,8 @@ class SnowflakeViewParts(BaseModel):
  column_name_expression=column_name_expression,
  plugin_app_database=plugin_app_database
  ))
+ if len(main_stream_view_part.joins) == 0:
+ logger.debug(f"No joins found for stream: {stream_name}")
  # For each column, the plugin can advise which fields (of the same stream or joined) are required for the join, which comes through as referenced_columns
  # on the SnowflakeViewColumn object.
  # Until this generate function is called with the raw stream names, we don't know which streams the user has actually selected, nor which
@@ -697,7 +702,8 @@ class SnowflakeViewParts(BaseModel):

  # Process all joins to build the mappings
  for part in [main_stream_view_part] + joined_parts:
- logger.debug(f"Processing joins for stream: {part.stream_name}")
+ joined_parts_names = [j.join_stream_name for j in part.joins]
+ logger.debug(f"Processing joins for stream: {part.stream_name} (joined streams: {joined_parts_names})")
  # Make sure the part's stream name is in the mappings
  if part.stream_name not in stream_to_aliases:
  stream_to_aliases[part.stream_name] = [part.stream_name]
@@ -807,19 +813,8 @@ class SnowflakeViewParts(BaseModel):
  # If we get here, no circular references were found
  logger.debug("No circular references found")

- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- prune_count = 0
- while prune(main_stream_view_part, joined_parts):
- prune_count += 1
- if prune_count > 10:
- raise ValueError("Pruning of columns from the view has entered an infinite loop")
+ # Prune columns using graph-based dependency resolution (single pass)
+ prune(main_stream_view_part, joined_parts)

  return cls(main_part=main_stream_view_part, joined_parts=joined_parts)

@@ -844,81 +839,183 @@ def find_part(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart

  def prune(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart]) -> bool:
  """
- Prunes columns from view parts that reference fields that don't exist in the referenced streams.
+ Prunes columns from view parts using graph-based dependency resolution.

- This function handles:
- 1. Direct dependencies - removing columns that directly reference non-existent columns
- 2. Transitive dependencies - removing columns that depend on columns that were removed
+ Uses TopologicalSorter to:
+ 1. Build a complete dependency graph of all columns across all parts
+ 2. Identify "root" columns that must be kept (in main part or used in joins)
+ 3. Traverse dependencies to find all transitively required columns
+ 4. Remove columns that aren't needed

  Returns True if any columns were removed, False otherwise.
- Raises ValueError if a cyclic dependency is detected.
  """
- columns_removed = False
-
- # Helper function to check if a column should be kept or removed
- def should_keep_column(column: SnowflakeViewColumn, part: SnowflakeViewPart) -> bool:
- """
- Checks if a column should be kept based on its dependencies.
- Returns True if the column should be kept, False if it should be removed.
- """
- # If no references, keep the column
- if not column.referenced_columns:
- return True
+
+ all_parts = [view_part] + joined_parts
+
+ # Build column registry: (stream_name, column_name) -> column object
+ all_columns: Dict[Tuple[str, str], SnowflakeViewColumn] = {}
+ for part in all_parts:
+ for column in part.columns:
+ all_columns[(part.stream_name, column.original_name)] = column
+
+ # Build dependency graph for topological analysis
+ # Key: (stream, column), Value: list of (stream, column) dependencies
+ # Also track columns with invalid dependencies (reference non-existent columns)
+ dependency_graph: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
+ columns_with_invalid_deps: set[Tuple[str, str]] = set()
+
+ # First pass: build dependency graph and detect direct invalid references
+ for part in all_parts:
+ for column in part.columns:
+ key = (part.stream_name, column.original_name)
+ deps = []
+ has_invalid_dep = False

- # Check each referenced stream and its fields
- for ref_stream_name, ref_fields in column.referenced_columns.items():
- # Find the referenced part
- ref_part = find_part(view_part, joined_parts,ref_stream_name)
+ if column.referenced_columns:
+ for ref_stream_name, ref_fields in column.referenced_columns.items():
+ # Resolve stream alias to actual stream name
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+
+ for ref_field in ref_fields:
+ dep_key = (resolved_stream, ref_field)
+ if dep_key in all_columns:
+ deps.append(dep_key)
+ else:
+ logger.warning(
+ f"Column {column.original_name} in {part.stream_name} references "
+ f"{ref_field} in {resolved_stream}, which doesn't exist"
+ )
+ has_invalid_dep = True

- # If referenced stream doesn't exist, remove the column
- if ref_part is None:
- logger.warning(
- f"Column {column.name} in stream {part.stream_name} references stream "
- f"{ref_stream_name}, but it was not provided"
- )
- return False
-
- # Check each referenced field
- for ref_field in ref_fields:
- # Find the referenced column
- ref_column = next((c for c in ref_part.columns if c.original_name == ref_field), None)
+ dependency_graph[key] = deps
+ if has_invalid_dep:
+ columns_with_invalid_deps.add(key)
+
+ # Second pass: propagate invalidity to columns that depend on invalid columns
+ # Keep iterating until no new invalid columns are found
+ changed = True
+ while changed:
+ changed = False
+ for col_key, deps in dependency_graph.items():
+ if col_key not in columns_with_invalid_deps:
+ # Check if any dependency is invalid
+ for dep_key in deps:
+ if dep_key in columns_with_invalid_deps:
+ logger.warning(
+ f"Column {col_key[1]} in {col_key[0]} depends on "
+ f"{dep_key[1]} in {dep_key[0]}, which has invalid dependencies"
+ )
+ columns_with_invalid_deps.add(col_key)
+ changed = True
+ break
+
+ # Build alias to stream mapping
+ alias_to_stream: Dict[str, str] = {}
+ for part in all_parts:
+ alias_to_stream[part.stream_name] = part.stream_name
+ for join in part.joins:
+ alias_to_stream[join.join_stream_alias] = join.join_stream_name
+ # left_alias might be an alias for a joined stream, resolve it
+ if join.left_alias not in alias_to_stream:
+ # Try to find the stream for this alias
+ for other_part in all_parts:
+ if other_part.stream_name == join.left_alias:
+ alias_to_stream[join.left_alias] = other_part.stream_name
+ break
+
+ # Identify root columns that must be kept
+ needed_columns: set[Tuple[str, str]] = set()
+
+ # 1. All columns in the main part are needed (except those with invalid dependencies)
+ for column in view_part.columns:
+ col_key = (view_part.stream_name, column.original_name)
+ if col_key not in columns_with_invalid_deps:
+ needed_columns.add(col_key)
+
+ # 2. All columns used in join conditions are needed (except those with invalid dependencies)
+ for part in all_parts:
+ for join in part.joins:
+ # Resolve left_alias to actual stream name
+ left_stream = alias_to_stream.get(join.left_alias, join.left_alias)
+ left_key = (left_stream, join.left_column)
+ right_key = (join.join_stream_name, join.join_stream_column)
+ if left_key not in columns_with_invalid_deps:
+ needed_columns.add(left_key)
+ if right_key not in columns_with_invalid_deps:
+ needed_columns.add(right_key)
+
+ logger.debug(f"Identified {len(needed_columns)} root columns to keep (excluding {len(columns_with_invalid_deps)} with invalid deps)")
+
+ # 3. Find all transitive dependencies using recursive traversal
+ # Skip columns with invalid dependencies and their dependents
+ def collect_dependencies(col_key: Tuple[str, str], visited: set[Tuple[str, str]]) -> None:
+ """Recursively collect all columns that col_key depends on"""
+ if col_key in visited or col_key not in dependency_graph:
+ return
+ if col_key in columns_with_invalid_deps:
+ return # Don't traverse dependencies of invalid columns
+ visited.add(col_key)
+
+ for dep_key in dependency_graph[col_key]:
+ if dep_key in all_columns and dep_key not in columns_with_invalid_deps:
+ needed_columns.add(dep_key)
+ collect_dependencies(dep_key, visited)
+
+ visited_global: set[Tuple[str, str]] = set()
+ for root_col in list(needed_columns):
+ collect_dependencies(root_col, visited_global)
+
+ # Remove columns that are not needed
+ columns_removed = False
+ for part in all_parts:
+ original_count = len(part.columns)
+ removed_cols = [col for col in part.columns
+ if (part.stream_name, col.original_name) not in needed_columns]
+
+ # Log warnings for each removed column with the reason
+ for col in removed_cols:
+ # Determine why the column is being removed
+ col_key = (part.stream_name, col.original_name)
+ if col.referenced_columns:
+ # Check if any referenced columns don't exist
+ missing_refs = []
+ for ref_stream_name, ref_fields in col.referenced_columns.items():
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+ for ref_field in ref_fields:
+ if (resolved_stream, ref_field) not in all_columns:
+ missing_refs.append(f"{ref_field} in {resolved_stream}")

- # If referenced column doesn't exist, remove the column
- if ref_column is None:
+ if missing_refs:
  logger.warning(
- f"Column {column.name} in stream {part.stream_name} references field "
- f"{ref_field} in stream {ref_stream_name}, but it was not provided"
+ f"Removing column {col.original_name} from {part.stream_name} because it references "
+ f"non-existent column(s): {', '.join(missing_refs)}"
  )
- return False
-
- # All dependencies are satisfied
- return True
-
- # Process columns for removal
- for column in view_part.columns[:]: # Use a copy to allow safe removal
- if not should_keep_column(column, view_part):
- view_part.columns.remove(column)
+ else:
+ # Column is not needed (not referenced by main part)
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+ else:
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+
+ part.columns = [col for col in part.columns
+ if (part.stream_name, col.original_name) in needed_columns]
+
+ if removed_cols:
  columns_removed = True

- # Process joined parts
- for joined_part in joined_parts:
- # We have to avoid pruning columns that are referenced by joins to this stream.
- # first, we determine all aliases for this stream (multiple join paths back to the same stream are allowed)
- aliases_for_stream = [j.join_stream_alias for j in view_part.joins if j.join_stream_name == joined_part.stream_name]
- # now find all joins using this stream as the join stream
- columns_used_in_joins = [
- j.left_column for j in view_part.joins if j.left_alias in aliases_for_stream
- ]
- for column in joined_part.columns[:]: # Use a copy to allow safe removal
- # First check if the column is a join column
- if column.original_name in columns_used_in_joins:
- # If it's a join column, we need to keep it
- continue
-
- if not should_keep_column(column, joined_part):
- joined_part.columns.remove(column)
- columns_removed = True
-
  return columns_removed

  class JsonSchemaTopLevel(BaseModel):
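
For readers comparing the two implementations: the new prune builds a (stream, column) dependency graph once, seeds a set of root columns (every column of the main part plus the columns used in join conditions), and keeps only what is transitively reachable from those roots, after first discarding columns whose references cannot be resolved. Because reachability is computed transitively in one traversal, the caller no longer needs the old fixed-point while loop. The sketch below illustrates the reachability idea only; the Key alias, reachable_columns helper and the ORDERS/CUSTOMERS sample graph are illustrative stand-ins, not part of the runtime.

```python
# Minimal sketch of the reachability idea behind the single-pass prune, using plain
# dicts and sets rather than the runtime's SnowflakeViewPart objects.
from typing import Dict, List, Set, Tuple

Key = Tuple[str, str]  # (stream_name, column_name)

def reachable_columns(roots: Set[Key], graph: Dict[Key, List[Key]]) -> Set[Key]:
    """Return every column reachable from the root set by following dependency edges."""
    needed: Set[Key] = set()
    stack = list(roots)
    while stack:
        key = stack.pop()
        if key in needed:
            continue
        needed.add(key)
        stack.extend(graph.get(key, []))  # transitive dependencies, no fixed-point loop needed
    return needed

# The main part selects ORDER_ID, which depends on a joined CUSTOMERS column;
# an unrelated CUSTOMERS column is dropped because nothing reachable requires it.
graph: Dict[Key, List[Key]] = {
    ("ORDERS", "ORDER_ID"): [("CUSTOMERS", "CUSTOMER_ID")],
    ("CUSTOMERS", "CUSTOMER_ID"): [],
    ("CUSTOMERS", "UNUSED_COL"): [],
}
kept = reachable_columns({("ORDERS", "ORDER_ID")}, graph)
assert ("CUSTOMERS", "UNUSED_COL") not in kept
```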
@@ -9,9 +9,10 @@ from typing import Dict, List, Optional
  from snowflake.snowpark import Session
  from pydantic import ValidationError
  from snowflake import telemetry
- from opentelemetry import trace
+ from opentelemetry import trace, metrics

  tracer = trace.get_tracer('omnata_plugin_runtime')
+ meter = metrics.get_meter('omnata_plugin_runtime')

  class CustomLoggerAdapter(logging.LoggerAdapter):
  """
@@ -15,7 +15,7 @@ if tuple(sys.version_info[:2]) >= (3, 9):
  else:
  # Python 3.8 and below
  from typing_extensions import Annotated
-
+ from dataclasses import dataclass
  import zipfile
  import datetime
  import http
@@ -48,7 +48,12 @@ from snowflake.snowpark import Session
  from snowflake.snowpark.functions import col
  from tenacity import Retrying, stop_after_attempt, wait_fixed, retry_if_exception_message

- from .logging import OmnataPluginLogHandler, logger, tracer
+ from .logging import OmnataPluginLogHandler, logger, tracer, meter
+ stream_duration_gauge = meter.create_gauge(
+ name="omnata.sync_run.stream_duration",
+ description="The duration of stream processing",
+ unit="s",
+ )
  from opentelemetry import context
  import math
  import numpy as np
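
The hunk above wires the new OpenTelemetry meter (defined in logging.py in the previous hunk) into omnata_plugin.py and creates a gauge for per-stream durations. A minimal sketch of that meter/gauge pattern follows, restricted to the opentelemetry-api calls that appear in the diff (get_meter, create_gauge, set); the recorded amount and attributes are example values, and without a configured MeterProvider the calls are no-ops.

```python
# Hedged sketch of the meter/gauge pattern introduced in this release.
from opentelemetry import metrics

meter = metrics.get_meter("omnata_plugin_runtime")
stream_duration_gauge = meter.create_gauge(
    name="omnata.sync_run.stream_duration",
    description="The duration of stream processing",
    unit="s",
)

# Recording a measurement attaches dimensions as attributes, mirroring how
# mark_stream_complete tags the duration with stream and sync identifiers.
stream_duration_gauge.set(
    amount=12.5,
    attributes={"stream_name": "ORDERS", "sync_direction": "inbound"},
)
```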
@@ -265,6 +270,29 @@ def jinja_filter(func):
  func.is_jinja_filter = True
  return func

+ @dataclass
+ class StateResult:
+ """
+ Represents the current cursor state of a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ new_state: Any
+
+ @dataclass
+ class RecordsToUploadResult:
+ """
+ Represents the records to upload for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ records: pandas.DataFrame
+
+ @dataclass
+ class CriteriaDeleteResult:
+ """
+ Represents the result of processing criteria deletes for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ criteria_deletes: pandas.DataFrame

  class SyncRequest(ABC):
  """
@@ -1057,7 +1085,6 @@ class InboundSyncRequest(SyncRequest):
  }

  # These are similar to the results, but represent requests to delete records by some criteria
- self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
  self._temp_tables = {}
  self._temp_table_lock = threading.Lock()
  self._results_exist: Dict[
@@ -1096,7 +1123,9 @@ class InboundSyncRequest(SyncRequest):
  self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
  self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
  # this is keyed on stream name, each containing a list of dataframes and state updates mixed
- self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
+ self._apply_results: Dict[str, List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = {}
+ # track the start times of each stream, so we can calculate durations. The int is a epoch (time.time()) value
+ self._stream_start_times: Dict[str, int] = {}

  def apply_results_queue(self):
  """
@@ -1105,7 +1134,8 @@ class InboundSyncRequest(SyncRequest):
  logger.debug("InboundSyncRequest apply_results_queue")
  if self._apply_results is not None:
  with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
+ records_to_upload:List[pandas.DataFrame] = []
+ criteria_deletes_to_upload:List[pandas.DataFrame] = []
  stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
  for stream_name, stream_results in self._apply_results.items():
  # the stream results contains an ordered sequence of dataframes and state updates (append only)
@@ -1113,9 +1143,9 @@ class InboundSyncRequest(SyncRequest):
  # so first, we iterate backwards to find the last state update
  last_state_index = -1
  for i in range(len(stream_results) - 1, -1, -1):
- if isinstance(stream_results[i], dict):
+ if isinstance(stream_results[i], StateResult):
  last_state_index = i
- stream_states_for_upload[stream_name] = stream_results[i]
+ stream_states_for_upload[stream_name] = stream_results[i].new_state
  break
  # if there are no state updates, we can't do anything with this stream
  if last_state_index == -1:
@@ -1124,56 +1154,54 @@ class InboundSyncRequest(SyncRequest):
  )
  continue
  assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
- # now we can take the dataframes up to the last state update
- dfs = stream_results[:last_state_index]
- non_empty_dfs = [
- x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
+ # now we can take the record dataframes up to the last state update
+ results_subset = stream_results[:last_state_index]
+ non_empty_record_dfs:List[pandas.DataFrame] = [
+ x.records for x in results_subset
+ if x is not None and isinstance(x, RecordsToUploadResult) and len(x.records) > 0
  ]
  # get the total length of all the dataframes
- total_length = sum([len(x) for x in non_empty_dfs])
+ total_length = sum([len(x) for x in non_empty_record_dfs])
  # add the count of this batch to the total for this stream
  self._stream_record_counts[
  stream_name
  ] = self._stream_record_counts[stream_name] + total_length
- results.extend(non_empty_dfs)
+ records_to_upload.extend(non_empty_record_dfs)
+ # also handle any criteria deletes
+ criteria_deletes_to_upload.extend([
+ x.criteria_deletes for x in results_subset
+ if x is not None and isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0
+ ])
  # now remove everything up to the last state update
  # we do this so that we don't apply the same state update multiple times
+ # keep everything after the last state update
  self._apply_results[stream_name] = stream_results[
  last_state_index + 1 :
- ] # keep everything after the last state update
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued results"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
- # now that the results have been updated, we need to insert records into the state register table
- # we do this by inserting the latest state for each stream
+ ]
+
+ if len(records_to_upload) > 0 or len(criteria_deletes_to_upload) > 0:
+ if len(records_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(records_to_upload)} batches of queued results"
+ )
+ # upload all cached apply results
+ records_to_upload_combined = pandas.concat(records_to_upload)
+ self._apply_results_dataframe(list(stream_states_for_upload.keys()), records_to_upload_combined)
+ # now that the results have been updated, we need to insert records into the state register table
+ # we do this by inserting the latest state for each stream
+ if len(criteria_deletes_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(criteria_deletes_to_upload)} batches of queued criteria deletes"
+ )
+ # upload all cached apply results
+ all_criteria_deletes = pandas.concat(criteria_deletes_to_upload)
+ self._apply_criteria_deletes_dataframe(all_criteria_deletes)
+
+ query_id = self._get_query_id_for_now()
  self._directly_insert_to_state_register(
  stream_states_for_upload, query_id=query_id
  )

- # also take care of uploading delete requests
- # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
- # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
- if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
- with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
- for stream_name, stream_results in self._apply_results_criteria_deletes.items():
- results.extend([
- x for x in stream_results if x is not None and len(x) > 0
- ])
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued criteria deletes"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- self._apply_criteria_deletes_dataframe(all_dfs)
- # clear the delete requests
- self._apply_results_criteria_deletes = {}
-

  # update the inbound stream record counts, so we can see progress
  # we do this last, because marking a stream as completed will cause the sync engine to process it
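
The reworked drain logic above only flushes entries recorded before each stream's most recent state checkpoint, so a state row is never registered ahead of the records it summarises. A hedged sketch of that backwards-scan rule, with a local stand-in StateResult and plain dicts in place of the runtime's record batches:

```python
# Sketch of the checkpointing rule: scan backwards for the newest state entry,
# flush only what precedes it, and keep the remainder queued.
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple

@dataclass
class StateResult:
    new_state: Any

def split_at_last_state(stream_results: List[object]) -> Tuple[List[object], Optional[Any], List[object]]:
    last_state_index = -1
    for i in range(len(stream_results) - 1, -1, -1):  # newest state wins
        if isinstance(stream_results[i], StateResult):
            last_state_index = i
            break
    if last_state_index == -1:
        return [], None, list(stream_results)  # no state yet, nothing can be flushed
    flushable = stream_results[:last_state_index]
    remainder = stream_results[last_state_index + 1:]
    return flushable, stream_results[last_state_index].new_state, remainder

queue = [{"batch": 1}, StateResult(new_state={"cursor": "a"}), {"batch": 2}]
flushable, state, remainder = split_at_last_state(queue)
assert flushable == [{"batch": 1}] and state == {"cursor": "a"} and remainder == [{"batch": 2}]
```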
@@ -1281,29 +1309,40 @@ class InboundSyncRequest(SyncRequest):
  if stream_name is None or len(stream_name) == 0:
  raise ValueError("Stream name cannot be empty")
  with self._apply_results_lock:
- existing_results: List[pandas.DataFrame] = []
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
  if stream_name in self._apply_results:
  existing_results = self._apply_results[stream_name]
- existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+ existing_results.append(RecordsToUploadResult(
+ records=self._preprocess_results_list(stream_name, results, is_delete)
+ ))
  if new_state is not None:
- existing_results.append(new_state) # append the new state at the end
+ existing_results.append(
+ StateResult(new_state=new_state)
+ ) # append the new state at the end
  self._apply_results[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
  if self.development_mode is False:
  # note: we want to do it for all values in self._apply_results, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist if isinstance(x, pandas.DataFrame)]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't bother if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
- logger.debug(f"Applying results queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()
+
+ def _apply_results_if_size_exceeded(self,):
+ # so first we need to get the list of lists from the dictionary values and flatten it
+ # then we can sum the memory usage of each dataframe
+ # if the total exceeds 200MB, we apply the results immediately
+ all_df_lists:List[List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = list(self._apply_results.values())
+ # flatten
+ all_dfs:List[pandas.DataFrame] = []
+ for sublist in all_df_lists:
+ for x in sublist:
+ if isinstance(x, RecordsToUploadResult):
+ all_dfs.append(x.records)
+ if isinstance(x, CriteriaDeleteResult):
+ all_dfs.append(x.criteria_deletes)
+ combined_length = sum([len(x) for x in all_dfs])
+ # first, don't bother if the count is less than 10000, since it's unlikely to be even close
+ if combined_length > 10000:
+ if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+ logger.debug(f"Applying results queue immediately due to combined dataframe size")
+ self.apply_results_queue()

  def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
  """
@@ -1329,27 +1368,22 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(
  f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
  )
- existing_results: List[pandas.DataFrame] = []
- if stream_name in self._apply_results_criteria_deletes:
- existing_results = self._apply_results_criteria_deletes[stream_name]
- existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
- self._apply_results_criteria_deletes[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
+ if stream_name in self._apply_results:
+ existing_results = self._apply_results[stream_name]
+ existing_results.append(
+ CriteriaDeleteResult(
+ criteria_deletes=pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}])))
+ self._apply_results[stream_name] = existing_results
  if self.development_mode is False:
- # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't both if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
- logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()
+
+ def mark_stream_started(self, stream_name: str):
+ """
+ Marks a stream as started, this is called automatically per stream when using @managed_inbound_processing.
+ """
+ logger.debug(f"Marking stream {stream_name} as started locally")
+ self._stream_start_times[stream_name] = time.time()

  def mark_stream_complete(self, stream_name: str):
  """
@@ -1357,6 +1391,20 @@ class InboundSyncRequest(SyncRequest):
  If @managed_inbound_processing is not used, call this whenever a stream has finished recieving records.
  """
  logger.debug(f"Marking stream {stream_name} as completed locally")
+ if stream_name in self._stream_start_times:
+ start_time = self._stream_start_times[stream_name]
+ duration = time.time() - start_time
+ stream_duration_gauge.set(
+ amount=duration,
+ attributes={
+ "stream_name": stream_name,
+ "sync_run_id": str(self._run_id),
+ "sync_id": str(self._sync_id),
+ "branch_name": str(self._branch_name) if self._branch_name is not None else 'main',
+ "sync_direction": "inbound",
+ "plugin_id": self.plugin_instance.get_manifest().plugin_id,
+ },
+ )
  with self._apply_results_lock:
  self._completed_streams.append(stream_name)
  # dedup just in case it's called twice
@@ -1463,7 +1511,7 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(f"Failure to convert inbound data: {str(exception)}")
  return data

- def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
+ def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]) -> pandas.DataFrame:
  """
  Creates a dataframe from the enqueued list, ready to upload.
  The result is a dataframe contain all (and only):
@@ -1608,7 +1656,7 @@ class InboundSyncRequest(SyncRequest):
  hash_object = hashlib.sha256(key_string.encode())
  return hash_object.hexdigest()

- def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
+ def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
  """
  Applies results for an inbound sync. The results are staged into a temporary
  table in Snowflake, so that we can make an atomic commit at the end.
@@ -1635,7 +1683,6 @@ class InboundSyncRequest(SyncRequest):
  raise ValueError(
  f"Failed to write results to table {self._full_results_table_name}"
  )
- query_id = self._get_query_id_for_now()
  logger.debug(
  f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
  )
@@ -1648,7 +1695,6 @@ class InboundSyncRequest(SyncRequest):
  # )
  for stream_name in stream_names:
  self._results_exist[stream_name] = True
- return query_id
  else:
  logger.debug("Results dataframe is empty, not applying")

@@ -2330,6 +2376,11 @@ def __managed_inbound_processing_worker(
  try:
  stream: StoredStreamConfiguration = streams_queue.get_nowait()
  logger.debug(f"stream returned from queue: {stream}")
+ sync_request: InboundSyncRequest = cast(
+ InboundSyncRequest, plugin_class_obj._sync_request
+ ) # pylint: disable=protected-access
+ if stream.stream_name not in sync_request._stream_start_times:
+ sync_request.mark_stream_started(stream.stream_name)
  # restore the first argument, was originally the dataframe/generator but now it's the appropriately sized dataframe
  try:
  with tracer.start_as_current_span("managed_inbound_processing") as managed_inbound_processing_span:
@@ -2341,7 +2392,7 @@ def __managed_inbound_processing_worker(
  logger.info(f"worker {worker_index} requested that {stream.stream_name} be not marked as complete")
  else:
  logger.info(f"worker {worker_index} marking stream {stream.stream_name} as complete")
- plugin_class_obj._sync_request.mark_stream_complete(stream.stream_name)
+ sync_request.mark_stream_complete(stream.stream_name)
  except InterruptedWhileWaitingException:
  # If an inbound run is cancelled while waiting for rate limiting, this should mean that
  # the cancellation is handled elsewhere, so we don't need to do anything special here other than stop waiting
@@ -0,0 +1,56 @@
+ Metadata-Version: 2.4
+ Name: omnata-plugin-runtime
+ Version: 0.12.1
+ Summary: Classes and common runtime components for building and running Omnata Plugins
+ License-File: LICENSE
+ Author: James Weakley
+ Author-email: james.weakley@omnata.com
+ Requires-Python: >=3.10,<=3.13
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: annotated-types (<=0.6.0)
+ Requires-Dist: certifi (<=2025.1.31)
+ Requires-Dist: cffi (<=2.0.0)
+ Requires-Dist: charset-normalizer (<=3.4.4)
+ Requires-Dist: cryptography (<=46.0.3)
+ Requires-Dist: filelock (<=3.20.0)
+ Requires-Dist: idna (<=3.11)
+ Requires-Dist: jinja2 (>=3.1.2,<=3.1.6)
+ Requires-Dist: markupsafe (<=3.0.2)
+ Requires-Dist: numpy (<=2.3.5)
+ Requires-Dist: opentelemetry-api (<=1.38.0)
+ Requires-Dist: packaging (<=25.0)
+ Requires-Dist: pandas (<=2.3.3)
+ Requires-Dist: platformdirs (<=4.5.0)
+ Requires-Dist: protobuf (<=6.33.0)
+ Requires-Dist: pyarrow (<=21.0.0)
+ Requires-Dist: pycparser (<=2.23)
+ Requires-Dist: pydantic (>=2,<=2.12.4)
+ Requires-Dist: pydantic-core (<=2.41.5)
+ Requires-Dist: pyjwt (<=2.10.1)
+ Requires-Dist: pyopenssl (<=225.3.0)
+ Requires-Dist: pytz (<=2025.2)
+ Requires-Dist: pyyaml (<=6.0.3)
+ Requires-Dist: requests (>=2,<=2.32.5)
+ Requires-Dist: setuptools (<=80.9.0)
+ Requires-Dist: snowflake-connector-python (>=3,<4)
+ Requires-Dist: snowflake-snowpark-python (>=1.20.0,<=1.43.0)
+ Requires-Dist: snowflake-telemetry-python (<=0.5.0)
+ Requires-Dist: tenacity (>=8,<9)
+ Requires-Dist: tomlkit (<=0.13.3)
+ Requires-Dist: urllib3 (<=2.5.0)
+ Requires-Dist: wheel (<=0.45.1)
+ Requires-Dist: wrapt (<=2.0.1)
+ Description-Content-Type: text/markdown
+
+ # omnata-plugin-runtime
+ This package is a runtime dependency for [Omnata Plugins](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins).
+
+ It contains data classes, interfaces and application logic used to perform plugin operations.
+
+ For instructions on creating plugins, visit our [docs site](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins/creating-plugins).
+
+
@@ -2,12 +2,12 @@ omnata_plugin_runtime/__init__.py,sha256=MS9d1whnfT_B3-ThqZ7l63QeC_8OEKTuaYV5wTw
  omnata_plugin_runtime/api.py,sha256=5gbjbnFy72Xjf0E3kbG23G0V2J3CorvD5kpBn_BkdlI,8084
  omnata_plugin_runtime/configuration.py,sha256=SffokJfgvy6V3kUsoEjXcK3GdNgHo6U3mgBEs0qBv4I,46972
  omnata_plugin_runtime/forms.py,sha256=Lrbr3otsFDrvHWJw7v-slsW4PvEHJ6BG1Yl8oaJfiDo,20529
- omnata_plugin_runtime/json_schema.py,sha256=HGqqsJGzKT7PSW2re4teyGTiTv-ytEhOSzuvubiz-uY,54826
- omnata_plugin_runtime/logging.py,sha256=WBuZt8lF9E5oFWM4KYQbE8dDJ_HctJ1pN3BHwU6rcd0,4461
- omnata_plugin_runtime/omnata_plugin.py,sha256=xqAIxFdb2X4ryK4VetQxI4u4UdMyN2xs4toLHKasIdU,142045
+ omnata_plugin_runtime/json_schema.py,sha256=ZfHMG-XSJBE9Smt33Y6GPpl5skF7pB1TRCf9AvWuw-Y,59705
+ omnata_plugin_runtime/logging.py,sha256=qUtRA9syQNnjfJZHA2W18K282voXX6vHwrBIPOBo1n8,4521
+ omnata_plugin_runtime/omnata_plugin.py,sha256=8FT3XNdZzty76OldvcxdKpbKrPENKjAIbwa_rxceVyg,143564
  omnata_plugin_runtime/plugin_entrypoints.py,sha256=_1pDLov3iQorGmfcae8Sw2bVjxw1vYeowBaKKNzRclQ,32629
  omnata_plugin_runtime/rate_limiting.py,sha256=qpr5esU4Ks8hMzuMpSR3gLFdor2ZUXYWCjmsQH_K6lQ,25882
- omnata_plugin_runtime-0.11.4.dist-info/METADATA,sha256=bHTXobn0dW15ESTEMBybxEN55Eu5X3UJEW-v8B-pBwM,2229
- omnata_plugin_runtime-0.11.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- omnata_plugin_runtime-0.11.4.dist-info/licenses/LICENSE,sha256=rGaMQG3R3F5-JGDp_-rlMKpDIkg5n0SI4kctTk8eZSI,56
- omnata_plugin_runtime-0.11.4.dist-info/RECORD,,
+ omnata_plugin_runtime-0.12.1.dist-info/METADATA,sha256=SCl6ee1e3Q8DN0wa47snuMAOBABw387hC54HXuYSTcs,2222
+ omnata_plugin_runtime-0.12.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ omnata_plugin_runtime-0.12.1.dist-info/licenses/LICENSE,sha256=rGaMQG3R3F5-JGDp_-rlMKpDIkg5n0SI4kctTk8eZSI,56
+ omnata_plugin_runtime-0.12.1.dist-info/RECORD,,
@@ -1,56 +0,0 @@
- Metadata-Version: 2.4
- Name: omnata-plugin-runtime
- Version: 0.11.4
- Summary: Classes and common runtime components for building and running Omnata Plugins
- License-File: LICENSE
- Author: James Weakley
- Author-email: james.weakley@omnata.com
- Requires-Python: >=3.8,<=3.11
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: annotated-types (<=0.6.0)
- Requires-Dist: certifi (<=2024.8.30)
- Requires-Dist: cffi (<=1.16.0)
- Requires-Dist: charset-normalizer (<=3.3.2)
- Requires-Dist: cryptography (<=43.0.0)
- Requires-Dist: filelock (<=3.13.1)
- Requires-Dist: idna (<=3.7)
- Requires-Dist: jinja2 (>=3.1.2,<=3.1.4)
- Requires-Dist: markupsafe (<=2.1.3)
- Requires-Dist: numpy (<=2.1.3)
- Requires-Dist: opentelemetry-api (<=1.23.0)
- Requires-Dist: packaging (<=24.1)
- Requires-Dist: pandas (<=2.2.3)
- Requires-Dist: platformdirs (<=3.10.0)
- Requires-Dist: protobuf (<=4.25.3)
- Requires-Dist: pyarrow (<=16.1.0)
- Requires-Dist: pycparser (<=2.21)
- Requires-Dist: pydantic (>=2,<=2.8.2)
- Requires-Dist: pydantic-core (<=2.21.0)
- Requires-Dist: pyjwt (<=2.8.0)
- Requires-Dist: pyopenssl (<=24.2.1)
- Requires-Dist: pytz (<=2024.1)
- Requires-Dist: pyyaml (<=6.0.1)
- Requires-Dist: requests (>=2,<=2.32.3)
- Requires-Dist: setuptools (<=72.1.0)
- Requires-Dist: snowflake-connector-python (>=3,<=3.12.0)
- Requires-Dist: snowflake-snowpark-python (>=1.20.0,<=1.24.0)
- Requires-Dist: snowflake-telemetry-python (<=0.5.0)
- Requires-Dist: tenacity (>=8,<=8.2.3)
- Requires-Dist: tomlkit (<=0.11.1)
- Requires-Dist: urllib3 (<=2.2.2)
- Requires-Dist: wheel (<=0.43.0)
- Requires-Dist: wrapt (<=1.14.1)
- Description-Content-Type: text/markdown
-
- # omnata-plugin-runtime
- This package is a runtime dependency for [Omnata Plugins](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins).
-
- It contains data classes, interfaces and application logic used to perform plugin operations.
-
- For instructions on creating plugins, visit our [docs site](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins/creating-plugins).
-
-