omnata-plugin-runtime 0.11.4__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to a supported registry, and is provided for informational purposes only.
@@ -664,9 +664,12 @@ class SnowflakeViewParts(BaseModel):
  )
  joined_parts:List[SnowflakeViewPart] = []
  # remove the joins from the main part if they are not in the raw stream locations
+ original_join_count = len(main_stream_view_part.joins)
  main_stream_view_part.joins = [join for join in main_stream_view_part.joins
  if join.join_stream_name in raw_stream_locations
  and join.join_stream_name in stream_schemas]
+ if len(main_stream_view_part.joins) < original_join_count:
+ logger.debug(f"Removed {original_join_count - len(main_stream_view_part.joins)} joins from stream: {stream_name} due to missing raw stream locations or schemas")

  for join in main_stream_view_part.joins:
  logger.debug(f"Generating view parts for join stream: {join.join_stream_name}")
@@ -679,6 +682,8 @@ class SnowflakeViewParts(BaseModel):
  column_name_expression=column_name_expression,
  plugin_app_database=plugin_app_database
  ))
+ if len(main_stream_view_part.joins) == 0:
+ logger.debug(f"No joins found for stream: {stream_name}")
  # For each column, the plugin can advise which fields (of the same stream or joined) are required for the join, which comes through as referenced_columns
  # on the SnowflakeViewColumn object.
  # Until this generate function is called with the raw stream names, we don't know which streams the user has actually selected, nor which
@@ -697,7 +702,8 @@ class SnowflakeViewParts(BaseModel):

  # Process all joins to build the mappings
  for part in [main_stream_view_part] + joined_parts:
- logger.debug(f"Processing joins for stream: {part.stream_name}")
+ joined_parts_names = [j.join_stream_name for j in part.joins]
+ logger.debug(f"Processing joins for stream: {part.stream_name} (joined streams: {joined_parts_names})")
  # Make sure the part's stream name is in the mappings
  if part.stream_name not in stream_to_aliases:
  stream_to_aliases[part.stream_name] = [part.stream_name]
@@ -807,19 +813,8 @@ class SnowflakeViewParts(BaseModel):
  # If we get here, no circular references were found
  logger.debug("No circular references found")

- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- # Now proceed with the actual pruning process
- # First, removing unavailable columns from other streams
- # then, we can do a final pass and remove columns that reference fields that are not available in the current stream
-
- prune_count = 0
- while prune(main_stream_view_part, joined_parts):
- prune_count += 1
- if prune_count > 10:
- raise ValueError("Pruning of columns from the view has entered an infinite loop")
+ # Prune columns using graph-based dependency resolution (single pass)
+ prune(main_stream_view_part, joined_parts)

  return cls(main_part=main_stream_view_part, joined_parts=joined_parts)

@@ -844,81 +839,183 @@ def find_part(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart

  def prune(view_part: SnowflakeViewPart, joined_parts: List[SnowflakeViewPart]) -> bool:
  """
- Prunes columns from view parts that reference fields that don't exist in the referenced streams.
+ Prunes columns from view parts using graph-based dependency resolution.

- This function handles:
- 1. Direct dependencies - removing columns that directly reference non-existent columns
- 2. Transitive dependencies - removing columns that depend on columns that were removed
+ Uses TopologicalSorter to:
+ 1. Build a complete dependency graph of all columns across all parts
+ 2. Identify "root" columns that must be kept (in main part or used in joins)
+ 3. Traverse dependencies to find all transitively required columns
+ 4. Remove columns that aren't needed

  Returns True if any columns were removed, False otherwise.
- Raises ValueError if a cyclic dependency is detected.
  """
- columns_removed = False
-
- # Helper function to check if a column should be kept or removed
- def should_keep_column(column: SnowflakeViewColumn, part: SnowflakeViewPart) -> bool:
- """
- Checks if a column should be kept based on its dependencies.
- Returns True if the column should be kept, False if it should be removed.
- """
- # If no references, keep the column
- if not column.referenced_columns:
- return True
+
+ all_parts = [view_part] + joined_parts
+
+ # Build column registry: (stream_name, column_name) -> column object
+ all_columns: Dict[Tuple[str, str], SnowflakeViewColumn] = {}
+ for part in all_parts:
+ for column in part.columns:
+ all_columns[(part.stream_name, column.original_name)] = column
+
+ # Build dependency graph for topological analysis
+ # Key: (stream, column), Value: list of (stream, column) dependencies
+ # Also track columns with invalid dependencies (reference non-existent columns)
+ dependency_graph: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}
+ columns_with_invalid_deps: set[Tuple[str, str]] = set()
+
+ # First pass: build dependency graph and detect direct invalid references
+ for part in all_parts:
+ for column in part.columns:
+ key = (part.stream_name, column.original_name)
+ deps = []
+ has_invalid_dep = False

- # Check each referenced stream and its fields
- for ref_stream_name, ref_fields in column.referenced_columns.items():
- # Find the referenced part
- ref_part = find_part(view_part, joined_parts,ref_stream_name)
+ if column.referenced_columns:
+ for ref_stream_name, ref_fields in column.referenced_columns.items():
+ # Resolve stream alias to actual stream name
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+
+ for ref_field in ref_fields:
+ dep_key = (resolved_stream, ref_field)
+ if dep_key in all_columns:
+ deps.append(dep_key)
+ else:
+ logger.warning(
+ f"Column {column.original_name} in {part.stream_name} references "
+ f"{ref_field} in {resolved_stream}, which doesn't exist"
+ )
+ has_invalid_dep = True

- # If referenced stream doesn't exist, remove the column
- if ref_part is None:
- logger.warning(
- f"Column {column.name} in stream {part.stream_name} references stream "
- f"{ref_stream_name}, but it was not provided"
- )
- return False
-
- # Check each referenced field
- for ref_field in ref_fields:
- # Find the referenced column
- ref_column = next((c for c in ref_part.columns if c.original_name == ref_field), None)
+ dependency_graph[key] = deps
+ if has_invalid_dep:
+ columns_with_invalid_deps.add(key)
+
+ # Second pass: propagate invalidity to columns that depend on invalid columns
+ # Keep iterating until no new invalid columns are found
+ changed = True
+ while changed:
+ changed = False
+ for col_key, deps in dependency_graph.items():
+ if col_key not in columns_with_invalid_deps:
+ # Check if any dependency is invalid
+ for dep_key in deps:
+ if dep_key in columns_with_invalid_deps:
+ logger.warning(
+ f"Column {col_key[1]} in {col_key[0]} depends on "
+ f"{dep_key[1]} in {dep_key[0]}, which has invalid dependencies"
+ )
+ columns_with_invalid_deps.add(col_key)
+ changed = True
+ break
+
+ # Build alias to stream mapping
+ alias_to_stream: Dict[str, str] = {}
+ for part in all_parts:
+ alias_to_stream[part.stream_name] = part.stream_name
+ for join in part.joins:
+ alias_to_stream[join.join_stream_alias] = join.join_stream_name
+ # left_alias might be an alias for a joined stream, resolve it
+ if join.left_alias not in alias_to_stream:
+ # Try to find the stream for this alias
+ for other_part in all_parts:
+ if other_part.stream_name == join.left_alias:
+ alias_to_stream[join.left_alias] = other_part.stream_name
+ break
+
+ # Identify root columns that must be kept
+ needed_columns: set[Tuple[str, str]] = set()
+
+ # 1. All columns in the main part are needed (except those with invalid dependencies)
+ for column in view_part.columns:
+ col_key = (view_part.stream_name, column.original_name)
+ if col_key not in columns_with_invalid_deps:
+ needed_columns.add(col_key)
+
+ # 2. All columns used in join conditions are needed (except those with invalid dependencies)
+ for part in all_parts:
+ for join in part.joins:
+ # Resolve left_alias to actual stream name
+ left_stream = alias_to_stream.get(join.left_alias, join.left_alias)
+ left_key = (left_stream, join.left_column)
+ right_key = (join.join_stream_name, join.join_stream_column)
+ if left_key not in columns_with_invalid_deps:
+ needed_columns.add(left_key)
+ if right_key not in columns_with_invalid_deps:
+ needed_columns.add(right_key)
+
+ logger.debug(f"Identified {len(needed_columns)} root columns to keep (excluding {len(columns_with_invalid_deps)} with invalid deps)")
+
+ # 3. Find all transitive dependencies using recursive traversal
+ # Skip columns with invalid dependencies and their dependents
+ def collect_dependencies(col_key: Tuple[str, str], visited: set[Tuple[str, str]]) -> None:
+ """Recursively collect all columns that col_key depends on"""
+ if col_key in visited or col_key not in dependency_graph:
+ return
+ if col_key in columns_with_invalid_deps:
+ return # Don't traverse dependencies of invalid columns
+ visited.add(col_key)
+
+ for dep_key in dependency_graph[col_key]:
+ if dep_key in all_columns and dep_key not in columns_with_invalid_deps:
+ needed_columns.add(dep_key)
+ collect_dependencies(dep_key, visited)
+
+ visited_global: set[Tuple[str, str]] = set()
+ for root_col in list(needed_columns):
+ collect_dependencies(root_col, visited_global)
+
+ # Remove columns that are not needed
+ columns_removed = False
+ for part in all_parts:
+ original_count = len(part.columns)
+ removed_cols = [col for col in part.columns
+ if (part.stream_name, col.original_name) not in needed_columns]
+
+ # Log warnings for each removed column with the reason
+ for col in removed_cols:
+ # Determine why the column is being removed
+ col_key = (part.stream_name, col.original_name)
+ if col.referenced_columns:
+ # Check if any referenced columns don't exist
+ missing_refs = []
+ for ref_stream_name, ref_fields in col.referenced_columns.items():
+ resolved_stream = ref_stream_name
+ for join in view_part.joins:
+ if join.join_stream_alias == ref_stream_name:
+ resolved_stream = join.join_stream_name
+ break
+ for ref_field in ref_fields:
+ if (resolved_stream, ref_field) not in all_columns:
+ missing_refs.append(f"{ref_field} in {resolved_stream}")

- # If referenced column doesn't exist, remove the column
- if ref_column is None:
+ if missing_refs:
  logger.warning(
- f"Column {column.name} in stream {part.stream_name} references field "
- f"{ref_field} in stream {ref_stream_name}, but it was not provided"
+ f"Removing column {col.original_name} from {part.stream_name} because it references "
+ f"non-existent column(s): {', '.join(missing_refs)}"
  )
- return False
-
- # All dependencies are satisfied
- return True
-
- # Process columns for removal
- for column in view_part.columns[:]: # Use a copy to allow safe removal
- if not should_keep_column(column, view_part):
- view_part.columns.remove(column)
+ else:
+ # Column is not needed (not referenced by main part)
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+ else:
+ logger.debug(
+ f"Removing column {col.original_name} from {part.stream_name} because it is not "
+ f"referenced by the main part or any join conditions"
+ )
+
+ part.columns = [col for col in part.columns
+ if (part.stream_name, col.original_name) in needed_columns]
+
+ if removed_cols:
  columns_removed = True

- # Process joined parts
- for joined_part in joined_parts:
- # We have to avoid pruning columns that are referenced by joins to this stream.
- # first, we determine all aliases for this stream (multiple join paths back to the same stream are allowed)
- aliases_for_stream = [j.join_stream_alias for j in view_part.joins if j.join_stream_name == joined_part.stream_name]
- # now find all joins using this stream as the join stream
- columns_used_in_joins = [
- j.left_column for j in view_part.joins if j.left_alias in aliases_for_stream
- ]
- for column in joined_part.columns[:]: # Use a copy to allow safe removal
- # First check if the column is a join column
- if column.original_name in columns_used_in_joins:
- # If it's a join column, we need to keep it
- continue
-
- if not should_keep_column(column, joined_part):
- joined_part.columns.remove(column)
- columns_removed = True
-
  return columns_removed

  class JsonSchemaTopLevel(BaseModel):
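
For readers comparing the two implementations: the new prune builds a (stream, column) dependency graph once, seeds a set of root columns (every column of the main part plus the columns used in join conditions), and keeps only what is transitively reachable from those roots, after first discarding columns whose references cannot be resolved. Because reachability is computed transitively in one traversal, the caller no longer needs the old fixed-point while loop. The sketch below illustrates the reachability idea only; the Key alias, reachable_columns helper and the ORDERS/CUSTOMERS sample graph are illustrative stand-ins, not part of the runtime.

```python
# Minimal sketch of the reachability idea behind the single-pass prune, using plain
# dicts and sets rather than the runtime's SnowflakeViewPart objects.
from typing import Dict, List, Set, Tuple

Key = Tuple[str, str]  # (stream_name, column_name)

def reachable_columns(roots: Set[Key], graph: Dict[Key, List[Key]]) -> Set[Key]:
    """Return every column reachable from the root set by following dependency edges."""
    needed: Set[Key] = set()
    stack = list(roots)
    while stack:
        key = stack.pop()
        if key in needed:
            continue
        needed.add(key)
        stack.extend(graph.get(key, []))  # transitive dependencies, no fixed-point loop needed
    return needed

# The main part selects ORDER_ID, which depends on a joined CUSTOMERS column;
# an unrelated CUSTOMERS column is dropped because nothing reachable requires it.
graph: Dict[Key, List[Key]] = {
    ("ORDERS", "ORDER_ID"): [("CUSTOMERS", "CUSTOMER_ID")],
    ("CUSTOMERS", "CUSTOMER_ID"): [],
    ("CUSTOMERS", "UNUSED_COL"): [],
}
kept = reachable_columns({("ORDERS", "ORDER_ID")}, graph)
assert ("CUSTOMERS", "UNUSED_COL") not in kept
```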
@@ -9,9 +9,10 @@ from typing import Dict, List, Optional
  from snowflake.snowpark import Session
  from pydantic import ValidationError
  from snowflake import telemetry
- from opentelemetry import trace
+ from opentelemetry import trace, metrics

  tracer = trace.get_tracer('omnata_plugin_runtime')
+ meter = metrics.get_meter('omnata_plugin_runtime')

  class CustomLoggerAdapter(logging.LoggerAdapter):
  """
@@ -15,7 +15,7 @@ if tuple(sys.version_info[:2]) >= (3, 9):
  else:
  # Python 3.8 and below
  from typing_extensions import Annotated
-
+ from dataclasses import dataclass
  import zipfile
  import datetime
  import http
@@ -48,7 +48,12 @@ from snowflake.snowpark import Session
  from snowflake.snowpark.functions import col
  from tenacity import Retrying, stop_after_attempt, wait_fixed, retry_if_exception_message

- from .logging import OmnataPluginLogHandler, logger, tracer
+ from .logging import OmnataPluginLogHandler, logger, tracer, meter
+ stream_duration_gauge = meter.create_gauge(
+ name="omnata.sync_run.stream_duration",
+ description="The duration of stream processing",
+ unit="s",
+ )
  from opentelemetry import context
  import math
  import numpy as np
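
The hunk above wires the new OpenTelemetry meter (defined in logging.py in the previous hunk) into omnata_plugin.py and creates a gauge for per-stream durations. A minimal sketch of that meter/gauge pattern follows, restricted to the opentelemetry-api calls that appear in the diff (get_meter, create_gauge, set); the recorded amount and attributes are example values, and without a configured MeterProvider the calls are no-ops.

```python
# Hedged sketch of the meter/gauge pattern introduced in this release.
from opentelemetry import metrics

meter = metrics.get_meter("omnata_plugin_runtime")
stream_duration_gauge = meter.create_gauge(
    name="omnata.sync_run.stream_duration",
    description="The duration of stream processing",
    unit="s",
)

# Recording a measurement attaches dimensions as attributes, mirroring how
# mark_stream_complete tags the duration with stream and sync identifiers.
stream_duration_gauge.set(
    amount=12.5,
    attributes={"stream_name": "ORDERS", "sync_direction": "inbound"},
)
```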
@@ -265,6 +270,29 @@ def jinja_filter(func):
  func.is_jinja_filter = True
  return func

+ @dataclass
+ class StateResult:
+ """
+ Represents the current cursor state of a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ new_state: Any
+
+ @dataclass
+ class RecordsToUploadResult:
+ """
+ Represents the records to upload for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ records: pandas.DataFrame
+
+ @dataclass
+ class CriteriaDeleteResult:
+ """
+ Represents the result of processing criteria deletes for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ criteria_deletes: pandas.DataFrame

  class SyncRequest(ABC):
  """
@@ -1057,7 +1085,6 @@ class InboundSyncRequest(SyncRequest):
  }

  # These are similar to the results, but represent requests to delete records by some criteria
- self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
  self._temp_tables = {}
  self._temp_table_lock = threading.Lock()
  self._results_exist: Dict[
@@ -1096,7 +1123,9 @@ class InboundSyncRequest(SyncRequest):
  self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
  self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
  # this is keyed on stream name, each containing a list of dataframes and state updates mixed
- self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
+ self._apply_results: Dict[str, List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = {}
+ # track the start times of each stream, so we can calculate durations. The int is a epoch (time.time()) value
+ self._stream_start_times: Dict[str, int] = {}

  def apply_results_queue(self):
  """
@@ -1105,7 +1134,8 @@ class InboundSyncRequest(SyncRequest):
  logger.debug("InboundSyncRequest apply_results_queue")
  if self._apply_results is not None:
  with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
+ records_to_upload:List[pandas.DataFrame] = []
+ criteria_deletes_to_upload:List[pandas.DataFrame] = []
  stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
  for stream_name, stream_results in self._apply_results.items():
  # the stream results contains an ordered sequence of dataframes and state updates (append only)
@@ -1113,9 +1143,9 @@ class InboundSyncRequest(SyncRequest):
  # so first, we iterate backwards to find the last state update
  last_state_index = -1
  for i in range(len(stream_results) - 1, -1, -1):
- if isinstance(stream_results[i], dict):
+ if isinstance(stream_results[i], StateResult):
  last_state_index = i
- stream_states_for_upload[stream_name] = stream_results[i]
+ stream_states_for_upload[stream_name] = stream_results[i].new_state
  break
  # if there are no state updates, we can't do anything with this stream
  if last_state_index == -1:
@@ -1124,56 +1154,54 @@ class InboundSyncRequest(SyncRequest):
  )
  continue
  assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
- # now we can take the dataframes up to the last state update
- dfs = stream_results[:last_state_index]
- non_empty_dfs = [
- x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
+ # now we can take the record dataframes up to the last state update
+ results_subset = stream_results[:last_state_index]
+ non_empty_record_dfs:List[pandas.DataFrame] = [
+ x.records for x in results_subset
+ if x is not None and isinstance(x, RecordsToUploadResult) and len(x.records) > 0
  ]
  # get the total length of all the dataframes
- total_length = sum([len(x) for x in non_empty_dfs])
+ total_length = sum([len(x) for x in non_empty_record_dfs])
  # add the count of this batch to the total for this stream
  self._stream_record_counts[
  stream_name
  ] = self._stream_record_counts[stream_name] + total_length
- results.extend(non_empty_dfs)
+ records_to_upload.extend(non_empty_record_dfs)
+ # also handle any criteria deletes
+ criteria_deletes_to_upload.extend([
+ x.criteria_deletes for x in results_subset
+ if x is not None and isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0
+ ])
  # now remove everything up to the last state update
  # we do this so that we don't apply the same state update multiple times
+ # keep everything after the last state update
  self._apply_results[stream_name] = stream_results[
  last_state_index + 1 :
- ] # keep everything after the last state update
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued results"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
- # now that the results have been updated, we need to insert records into the state register table
- # we do this by inserting the latest state for each stream
+ ]
+
+ if len(records_to_upload) > 0 or len(criteria_deletes_to_upload) > 0:
+ if len(records_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(records_to_upload)} batches of queued results"
+ )
+ # upload all cached apply results
+ records_to_upload_combined = pandas.concat(records_to_upload)
+ self._apply_results_dataframe(list(stream_states_for_upload.keys()), records_to_upload_combined)
+ # now that the results have been updated, we need to insert records into the state register table
+ # we do this by inserting the latest state for each stream
+ if len(criteria_deletes_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(criteria_deletes_to_upload)} batches of queued criteria deletes"
+ )
+ # upload all cached apply results
+ all_criteria_deletes = pandas.concat(criteria_deletes_to_upload)
+ self._apply_criteria_deletes_dataframe(all_criteria_deletes)
+
+ query_id = self._get_query_id_for_now()
  self._directly_insert_to_state_register(
  stream_states_for_upload, query_id=query_id
  )

- # also take care of uploading delete requests
- # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
- # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
- if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
- with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
- for stream_name, stream_results in self._apply_results_criteria_deletes.items():
- results.extend([
- x for x in stream_results if x is not None and len(x) > 0
- ])
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued criteria deletes"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- self._apply_criteria_deletes_dataframe(all_dfs)
- # clear the delete requests
- self._apply_results_criteria_deletes = {}
-

  # update the inbound stream record counts, so we can see progress
  # we do this last, because marking a stream as completed will cause the sync engine to process it
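
The reworked drain logic above only flushes entries recorded before each stream's most recent state checkpoint, so a state row is never registered ahead of the records it summarises. A hedged sketch of that backwards-scan rule, with a local stand-in StateResult and plain dicts in place of the runtime's record batches:

```python
# Sketch of the checkpointing rule: scan backwards for the newest state entry,
# flush only what precedes it, and keep the remainder queued.
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple

@dataclass
class StateResult:
    new_state: Any

def split_at_last_state(stream_results: List[object]) -> Tuple[List[object], Optional[Any], List[object]]:
    last_state_index = -1
    for i in range(len(stream_results) - 1, -1, -1):  # newest state wins
        if isinstance(stream_results[i], StateResult):
            last_state_index = i
            break
    if last_state_index == -1:
        return [], None, list(stream_results)  # no state yet, nothing can be flushed
    flushable = stream_results[:last_state_index]
    remainder = stream_results[last_state_index + 1:]
    return flushable, stream_results[last_state_index].new_state, remainder

queue = [{"batch": 1}, StateResult(new_state={"cursor": "a"}), {"batch": 2}]
flushable, state, remainder = split_at_last_state(queue)
assert flushable == [{"batch": 1}] and state == {"cursor": "a"} and remainder == [{"batch": 2}]
```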
@@ -1281,29 +1309,40 @@ class InboundSyncRequest(SyncRequest):
  if stream_name is None or len(stream_name) == 0:
  raise ValueError("Stream name cannot be empty")
  with self._apply_results_lock:
- existing_results: List[pandas.DataFrame] = []
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
  if stream_name in self._apply_results:
  existing_results = self._apply_results[stream_name]
- existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+ existing_results.append(RecordsToUploadResult(
+ records=self._preprocess_results_list(stream_name, results, is_delete)
+ ))
  if new_state is not None:
- existing_results.append(new_state) # append the new state at the end
+ existing_results.append(
+ StateResult(new_state=new_state)
+ ) # append the new state at the end
  self._apply_results[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
  if self.development_mode is False:
  # note: we want to do it for all values in self._apply_results, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist if isinstance(x, pandas.DataFrame)]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't bother if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
- logger.debug(f"Applying results queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()
+
+ def _apply_results_if_size_exceeded(self,):
+ # so first we need to get the list of lists from the dictionary values and flatten it
+ # then we can sum the memory usage of each dataframe
+ # if the total exceeds 200MB, we apply the results immediately
+ all_df_lists:List[List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = list(self._apply_results.values())
+ # flatten
+ all_dfs:List[pandas.DataFrame] = []
+ for sublist in all_df_lists:
+ for x in sublist:
+ if isinstance(x, RecordsToUploadResult):
+ all_dfs.append(x.records)
+ if isinstance(x, CriteriaDeleteResult):
+ all_dfs.append(x.criteria_deletes)
+ combined_length = sum([len(x) for x in all_dfs])
+ # first, don't bother if the count is less than 10000, since it's unlikely to be even close
+ if combined_length > 10000:
+ if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+ logger.debug(f"Applying results queue immediately due to combined dataframe size")
+ self.apply_results_queue()

  def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
  """
@@ -1329,27 +1368,22 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(
  f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
  )
- existing_results: List[pandas.DataFrame] = []
- if stream_name in self._apply_results_criteria_deletes:
- existing_results = self._apply_results_criteria_deletes[stream_name]
- existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
- self._apply_results_criteria_deletes[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
+ if stream_name in self._apply_results:
+ existing_results = self._apply_results[stream_name]
+ existing_results.append(
+ CriteriaDeleteResult(
+ criteria_deletes=pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}])))
+ self._apply_results[stream_name] = existing_results
  if self.development_mode is False:
- # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't both if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
- logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()
+
+ def mark_stream_started(self, stream_name: str):
+ """
+ Marks a stream as started, this is called automatically per stream when using @managed_inbound_processing.
+ """
+ logger.debug(f"Marking stream {stream_name} as started locally")
+ self._stream_start_times[stream_name] = time.time()

  def mark_stream_complete(self, stream_name: str):
  """
@@ -1357,6 +1391,20 @@ class InboundSyncRequest(SyncRequest):
  If @managed_inbound_processing is not used, call this whenever a stream has finished recieving records.
  """
  logger.debug(f"Marking stream {stream_name} as completed locally")
+ if stream_name in self._stream_start_times:
+ start_time = self._stream_start_times[stream_name]
+ duration = time.time() - start_time
+ stream_duration_gauge.set(
+ amount=duration,
+ attributes={
+ "stream_name": stream_name,
+ "sync_run_id": str(self._run_id),
+ "sync_id": str(self._sync_id),
+ "branch_name": str(self._branch_name) if self._branch_name is not None else 'main',
+ "sync_direction": "inbound",
+ "plugin_id": self.plugin_instance.get_manifest().plugin_id,
+ },
+ )
  with self._apply_results_lock:
  self._completed_streams.append(stream_name)
  # dedup just in case it's called twice
@@ -1463,7 +1511,7 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(f"Failure to convert inbound data: {str(exception)}")
  return data

- def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
+ def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]) -> pandas.DataFrame:
  """
  Creates a dataframe from the enqueued list, ready to upload.
  The result is a dataframe contain all (and only):
@@ -1608,7 +1656,7 @@ class InboundSyncRequest(SyncRequest):
  hash_object = hashlib.sha256(key_string.encode())
  return hash_object.hexdigest()

- def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
+ def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
  """
  Applies results for an inbound sync. The results are staged into a temporary
  table in Snowflake, so that we can make an atomic commit at the end.
@@ -1635,7 +1683,6 @@ class InboundSyncRequest(SyncRequest):
  raise ValueError(
  f"Failed to write results to table {self._full_results_table_name}"
  )
- query_id = self._get_query_id_for_now()
  logger.debug(
  f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
  )
@@ -1648,7 +1695,6 @@ class InboundSyncRequest(SyncRequest):
  # )
  for stream_name in stream_names:
  self._results_exist[stream_name] = True
- return query_id
  else:
  logger.debug("Results dataframe is empty, not applying")

@@ -2330,6 +2376,11 @@ def __managed_inbound_processing_worker(
  try:
  stream: StoredStreamConfiguration = streams_queue.get_nowait()
  logger.debug(f"stream returned from queue: {stream}")
+ sync_request: InboundSyncRequest = cast(
+ InboundSyncRequest, plugin_class_obj._sync_request
+ ) # pylint: disable=protected-access
+ if stream.stream_name not in sync_request._stream_start_times:
+ sync_request.mark_stream_started(stream.stream_name)
  # restore the first argument, was originally the dataframe/generator but now it's the appropriately sized dataframe
  try:
  with tracer.start_as_current_span("managed_inbound_processing") as managed_inbound_processing_span:
@@ -2341,7 +2392,7 @@ def __managed_inbound_processing_worker(
  logger.info(f"worker {worker_index} requested that {stream.stream_name} be not marked as complete")
  else:
  logger.info(f"worker {worker_index} marking stream {stream.stream_name} as complete")
- plugin_class_obj._sync_request.mark_stream_complete(stream.stream_name)
+ sync_request.mark_stream_complete(stream.stream_name)
  except InterruptedWhileWaitingException:
  # If an inbound run is cancelled while waiting for rate limiting, this should mean that
  # the cancellation is handled elsewhere, so we don't need to do anything special here other than stop waiting
@@ -0,0 +1,56 @@
+ Metadata-Version: 2.4
+ Name: omnata-plugin-runtime
+ Version: 0.12.1
+ Summary: Classes and common runtime components for building and running Omnata Plugins
+ License-File: LICENSE
+ Author: James Weakley
+ Author-email: james.weakley@omnata.com
+ Requires-Python: >=3.10,<=3.13
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: annotated-types (<=0.6.0)
+ Requires-Dist: certifi (<=2025.1.31)
+ Requires-Dist: cffi (<=2.0.0)
+ Requires-Dist: charset-normalizer (<=3.4.4)
+ Requires-Dist: cryptography (<=46.0.3)
+ Requires-Dist: filelock (<=3.20.0)
+ Requires-Dist: idna (<=3.11)
+ Requires-Dist: jinja2 (>=3.1.2,<=3.1.6)
+ Requires-Dist: markupsafe (<=3.0.2)
+ Requires-Dist: numpy (<=2.3.5)
+ Requires-Dist: opentelemetry-api (<=1.38.0)
+ Requires-Dist: packaging (<=25.0)
+ Requires-Dist: pandas (<=2.3.3)
+ Requires-Dist: platformdirs (<=4.5.0)
+ Requires-Dist: protobuf (<=6.33.0)
+ Requires-Dist: pyarrow (<=21.0.0)
+ Requires-Dist: pycparser (<=2.23)
+ Requires-Dist: pydantic (>=2,<=2.12.4)
+ Requires-Dist: pydantic-core (<=2.41.5)
+ Requires-Dist: pyjwt (<=2.10.1)
+ Requires-Dist: pyopenssl (<=225.3.0)
+ Requires-Dist: pytz (<=2025.2)
+ Requires-Dist: pyyaml (<=6.0.3)
+ Requires-Dist: requests (>=2,<=2.32.5)
+ Requires-Dist: setuptools (<=80.9.0)
+ Requires-Dist: snowflake-connector-python (>=3,<4)
+ Requires-Dist: snowflake-snowpark-python (>=1.20.0,<=1.43.0)
+ Requires-Dist: snowflake-telemetry-python (<=0.5.0)
+ Requires-Dist: tenacity (>=8,<9)
+ Requires-Dist: tomlkit (<=0.13.3)
+ Requires-Dist: urllib3 (<=2.5.0)
+ Requires-Dist: wheel (<=0.45.1)
+ Requires-Dist: wrapt (<=2.0.1)
+ Description-Content-Type: text/markdown
+
+ # omnata-plugin-runtime
+ This package is a runtime dependency for [Omnata Plugins](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins).
+
+ It contains data classes, interfaces and application logic used to perform plugin operations.
+
+ For instructions on creating plugins, visit our [docs site](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins/creating-plugins).
+
+
@@ -2,12 +2,12 @@ omnata_plugin_runtime/__init__.py,sha256=MS9d1whnfT_B3-ThqZ7l63QeC_8OEKTuaYV5wTw
  omnata_plugin_runtime/api.py,sha256=5gbjbnFy72Xjf0E3kbG23G0V2J3CorvD5kpBn_BkdlI,8084
  omnata_plugin_runtime/configuration.py,sha256=SffokJfgvy6V3kUsoEjXcK3GdNgHo6U3mgBEs0qBv4I,46972
  omnata_plugin_runtime/forms.py,sha256=Lrbr3otsFDrvHWJw7v-slsW4PvEHJ6BG1Yl8oaJfiDo,20529
- omnata_plugin_runtime/json_schema.py,sha256=HGqqsJGzKT7PSW2re4teyGTiTv-ytEhOSzuvubiz-uY,54826
- omnata_plugin_runtime/logging.py,sha256=WBuZt8lF9E5oFWM4KYQbE8dDJ_HctJ1pN3BHwU6rcd0,4461
- omnata_plugin_runtime/omnata_plugin.py,sha256=xqAIxFdb2X4ryK4VetQxI4u4UdMyN2xs4toLHKasIdU,142045
+ omnata_plugin_runtime/json_schema.py,sha256=ZfHMG-XSJBE9Smt33Y6GPpl5skF7pB1TRCf9AvWuw-Y,59705
+ omnata_plugin_runtime/logging.py,sha256=qUtRA9syQNnjfJZHA2W18K282voXX6vHwrBIPOBo1n8,4521
+ omnata_plugin_runtime/omnata_plugin.py,sha256=8FT3XNdZzty76OldvcxdKpbKrPENKjAIbwa_rxceVyg,143564
  omnata_plugin_runtime/plugin_entrypoints.py,sha256=_1pDLov3iQorGmfcae8Sw2bVjxw1vYeowBaKKNzRclQ,32629
  omnata_plugin_runtime/rate_limiting.py,sha256=qpr5esU4Ks8hMzuMpSR3gLFdor2ZUXYWCjmsQH_K6lQ,25882
- omnata_plugin_runtime-0.11.4.dist-info/METADATA,sha256=bHTXobn0dW15ESTEMBybxEN55Eu5X3UJEW-v8B-pBwM,2229
- omnata_plugin_runtime-0.11.4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- omnata_plugin_runtime-0.11.4.dist-info/licenses/LICENSE,sha256=rGaMQG3R3F5-JGDp_-rlMKpDIkg5n0SI4kctTk8eZSI,56
- omnata_plugin_runtime-0.11.4.dist-info/RECORD,,
+ omnata_plugin_runtime-0.12.1.dist-info/METADATA,sha256=SCl6ee1e3Q8DN0wa47snuMAOBABw387hC54HXuYSTcs,2222
+ omnata_plugin_runtime-0.12.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ omnata_plugin_runtime-0.12.1.dist-info/licenses/LICENSE,sha256=rGaMQG3R3F5-JGDp_-rlMKpDIkg5n0SI4kctTk8eZSI,56
+ omnata_plugin_runtime-0.12.1.dist-info/RECORD,,
@@ -1,56 +0,0 @@
- Metadata-Version: 2.4
- Name: omnata-plugin-runtime
- Version: 0.11.4
- Summary: Classes and common runtime components for building and running Omnata Plugins
- License-File: LICENSE
- Author: James Weakley
- Author-email: james.weakley@omnata.com
- Requires-Python: >=3.8,<=3.11
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: annotated-types (<=0.6.0)
- Requires-Dist: certifi (<=2024.8.30)
- Requires-Dist: cffi (<=1.16.0)
- Requires-Dist: charset-normalizer (<=3.3.2)
- Requires-Dist: cryptography (<=43.0.0)
- Requires-Dist: filelock (<=3.13.1)
- Requires-Dist: idna (<=3.7)
- Requires-Dist: jinja2 (>=3.1.2,<=3.1.4)
- Requires-Dist: markupsafe (<=2.1.3)
- Requires-Dist: numpy (<=2.1.3)
- Requires-Dist: opentelemetry-api (<=1.23.0)
- Requires-Dist: packaging (<=24.1)
- Requires-Dist: pandas (<=2.2.3)
- Requires-Dist: platformdirs (<=3.10.0)
- Requires-Dist: protobuf (<=4.25.3)
- Requires-Dist: pyarrow (<=16.1.0)
- Requires-Dist: pycparser (<=2.21)
- Requires-Dist: pydantic (>=2,<=2.8.2)
- Requires-Dist: pydantic-core (<=2.21.0)
- Requires-Dist: pyjwt (<=2.8.0)
- Requires-Dist: pyopenssl (<=24.2.1)
- Requires-Dist: pytz (<=2024.1)
- Requires-Dist: pyyaml (<=6.0.1)
- Requires-Dist: requests (>=2,<=2.32.3)
- Requires-Dist: setuptools (<=72.1.0)
- Requires-Dist: snowflake-connector-python (>=3,<=3.12.0)
- Requires-Dist: snowflake-snowpark-python (>=1.20.0,<=1.24.0)
- Requires-Dist: snowflake-telemetry-python (<=0.5.0)
- Requires-Dist: tenacity (>=8,<=8.2.3)
- Requires-Dist: tomlkit (<=0.11.1)
- Requires-Dist: urllib3 (<=2.2.2)
- Requires-Dist: wheel (<=0.43.0)
- Requires-Dist: wrapt (<=1.14.1)
- Description-Content-Type: text/markdown
-
- # omnata-plugin-runtime
- This package is a runtime dependency for [Omnata Plugins](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins).
-
- It contains data classes, interfaces and application logic used to perform plugin operations.
-
- For instructions on creating plugins, visit our [docs site](https://docs.omnata.com/omnata-product-documentation/omnata-sync-for-snowflake/plugins/creating-plugins).
-
-