omnata-plugin-runtime 0.11.8a326__tar.gz → 0.11.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/PKG-INFO +1 -1
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/pyproject.toml +1 -1
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/omnata_plugin.py +101 -83
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/LICENSE +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/README.md +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/__init__.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/api.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/configuration.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/forms.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/json_schema.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/logging.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/plugin_entrypoints.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/rate_limiting.py +0 -0
{omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "omnata-plugin-runtime"
-version = "0.11.8a326"
+version = "0.11.9"
 description = "Classes and common runtime components for building and running Omnata Plugins"
 authors = ["James Weakley <james.weakley@omnata.com>"]
 readme = "README.md"
```
{omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/omnata_plugin.py

```diff
@@ -15,7 +15,7 @@ if tuple(sys.version_info[:2]) >= (3, 9):
 else:
     # Python 3.8 and below
     from typing_extensions import Annotated
-
+from dataclasses import dataclass
 import zipfile
 import datetime
 import http
```
```diff
@@ -270,6 +270,29 @@ def jinja_filter(func):
     func.is_jinja_filter = True
     return func
 
+@dataclass
+class StateResult:
+    """
+    Represents the current cursor state of a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    new_state: Any
+
+@dataclass
+class RecordsToUploadResult:
+    """
+    Represents the records to upload for a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    records: pandas.DataFrame
+
+@dataclass
+class CriteriaDeleteResult:
+    """
+    Represents the result of processing criteria deletes for a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    criteria_deletes: pandas.DataFrame
 
 class SyncRequest(ABC):
     """
```
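The three wrapper dataclasses exist so that mixed entries in a per-stream `_apply_results` list can be told apart with `isinstance` checks. As a rough illustration only (a standalone sketch, not code from the package), such a mixed list could be partitioned like this:

```python
# Standalone sketch: partitioning a mixed results list by wrapper type.
from dataclasses import dataclass
from typing import Any, List, Union

import pandas


@dataclass
class StateResult:
    new_state: Any


@dataclass
class RecordsToUploadResult:
    records: pandas.DataFrame


@dataclass
class CriteriaDeleteResult:
    criteria_deletes: pandas.DataFrame


ApplyResult = Union[RecordsToUploadResult, StateResult, CriteriaDeleteResult]

stream_results: List[ApplyResult] = [
    RecordsToUploadResult(records=pandas.DataFrame([{"id": 1}])),
    CriteriaDeleteResult(criteria_deletes=pandas.DataFrame([{"DELETE_CRITERIA": {"id": 1}}])),
    StateResult(new_state={"cursor": "2024-01-01"}),
]

# record batches and delete batches are separated by type; the latest state is
# the last StateResult in the list
record_batches = [x.records for x in stream_results if isinstance(x, RecordsToUploadResult)]
delete_batches = [x.criteria_deletes for x in stream_results if isinstance(x, CriteriaDeleteResult)]
latest_state = next(
    (x.new_state for x in reversed(stream_results) if isinstance(x, StateResult)), None
)
print(len(record_batches), len(delete_batches), latest_state)
```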
```diff
@@ -1062,7 +1085,6 @@ class InboundSyncRequest(SyncRequest):
         }
 
         # These are similar to the results, but represent requests to delete records by some criteria
-        self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
         self._results_exist: Dict[
```
```diff
@@ -1101,7 +1123,7 @@ class InboundSyncRequest(SyncRequest):
         self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
         self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
         # this is keyed on stream name, each containing a list of dataframes and state updates mixed
-        self._apply_results: Dict[str, List[
+        self._apply_results: Dict[str, List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = {}
         # track the start times of each stream, so we can calculate durations. The int is a epoch (time.time()) value
         self._stream_start_times: Dict[str, int] = {}
```
```diff
@@ -1112,7 +1134,8 @@ class InboundSyncRequest(SyncRequest):
         logger.debug("InboundSyncRequest apply_results_queue")
         if self._apply_results is not None:
             with self._apply_results_lock:
-
+                records_to_upload:List[pandas.DataFrame] = []
+                criteria_deletes_to_upload:List[pandas.DataFrame] = []
                 stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
                 for stream_name, stream_results in self._apply_results.items():
                     # the stream results contains an ordered sequence of dataframes and state updates (append only)
```
```diff
@@ -1120,9 +1143,9 @@ class InboundSyncRequest(SyncRequest):
                     # so first, we iterate backwards to find the last state update
                     last_state_index = -1
                     for i in range(len(stream_results) - 1, -1, -1):
-                        if isinstance(stream_results[i],
+                        if isinstance(stream_results[i], StateResult):
                             last_state_index = i
-                            stream_states_for_upload[stream_name] = stream_results[i]
+                            stream_states_for_upload[stream_name] = stream_results[i].new_state
                             break
                     # if there are no state updates, we can't do anything with this stream
                     if last_state_index == -1:
```
```diff
@@ -1131,56 +1154,54 @@ class InboundSyncRequest(SyncRequest):
                         )
                         continue
                     assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
-                    # now we can take the dataframes up to the last state update
-
-
-                        x for x in
+                    # now we can take the record dataframes up to the last state update
+                    results_subset = stream_results[:last_state_index]
+                    non_empty_record_dfs:List[pandas.DataFrame] = [
+                        x.records for x in results_subset
+                        if x is not None and isinstance(x, RecordsToUploadResult) and len(x.records) > 0
                     ]
                     # get the total length of all the dataframes
-                    total_length = sum([len(x) for x in
+                    total_length = sum([len(x) for x in non_empty_record_dfs])
                     # add the count of this batch to the total for this stream
                     self._stream_record_counts[
                         stream_name
                     ] = self._stream_record_counts[stream_name] + total_length
-
+                    records_to_upload.extend(non_empty_record_dfs)
+                    # also handle any criteria deletes
+                    criteria_deletes_to_upload.extend([
+                        x.criteria_deletes for x in results_subset
+                        if x is not None and isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0
+                    ])
                     # now remove everything up to the last state update
                     # we do this so that we don't apply the same state update multiple times
+                    # keep everything after the last state update
                     self._apply_results[stream_name] = stream_results[
                         last_state_index + 1 :
-                    ]
-
-
-
-
-
-
-
-
-
+                    ]
+
+                if len(records_to_upload) > 0 or len(criteria_deletes_to_upload) > 0:
+                    if len(records_to_upload) > 0:
+                        logger.debug(
+                            f"Applying {len(records_to_upload)} batches of queued results"
+                        )
+                        # upload all cached apply results
+                        records_to_upload_combined = pandas.concat(records_to_upload)
+                        self._apply_results_dataframe(list(stream_states_for_upload.keys()), records_to_upload_combined)
+                    # now that the results have been updated, we need to insert records into the state register table
+                    # we do this by inserting the latest state for each stream
+                    if len(criteria_deletes_to_upload) > 0:
+                        logger.debug(
+                            f"Applying {len(criteria_deletes_to_upload)} batches of queued criteria deletes"
+                        )
+                        # upload all cached apply results
+                        all_criteria_deletes = pandas.concat(criteria_deletes_to_upload)
+                        self._apply_criteria_deletes_dataframe(all_criteria_deletes)
+
+                    query_id = self._get_query_id_for_now()
                     self._directly_insert_to_state_register(
                         stream_states_for_upload, query_id=query_id
                     )
 
-        # also take care of uploading delete requests
-        # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
-        # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
-        if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
-            with self._apply_results_lock:
-                results:List[pandas.DataFrame] = []
-                for stream_name, stream_results in self._apply_results_criteria_deletes.items():
-                    results.extend([
-                        x for x in stream_results if x is not None and len(x) > 0
-                    ])
-                if len(results) > 0:
-                    logger.debug(
-                        f"Applying {len(results)} batches of queued criteria deletes"
-                    )
-                    # upload all cached apply results
-                    all_dfs = pandas.concat(results)
-                    self._apply_criteria_deletes_dataframe(all_dfs)
-                    # clear the delete requests
-                    self._apply_results_criteria_deletes = {}
-
 
         # update the inbound stream record counts, so we can see progress
         # we do this last, because marking a stream as completed will cause the sync engine to process it
```
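The reworked `apply_results_queue` walks each stream's list backwards to find the last `StateResult`, flushes the record and criteria-delete batches that precede it, and keeps only the entries after that checkpoint, so the same state is never applied twice. A minimal sketch of that slicing step in isolation (the helper name `split_at_last_state` is illustrative, not part of the package):

```python
# Sketch of the checkpoint-slicing idea as a free function.
from dataclasses import dataclass
from typing import Any, List


@dataclass
class StateResult:
    # same shape as the wrapper dataclass added in the diff
    new_state: Any


def split_at_last_state(stream_results: List[Any]):
    """Return (entries_before_checkpoint, latest_state, remainder_after_checkpoint)."""
    last_state_index = -1
    latest_state = None
    # scan backwards for the most recent state checkpoint
    for i in range(len(stream_results) - 1, -1, -1):
        if isinstance(stream_results[i], StateResult):
            last_state_index = i
            latest_state = stream_results[i].new_state
            break
    if last_state_index == -1:
        # without a state checkpoint nothing can be flushed safely
        return [], None, stream_results
    return (
        stream_results[:last_state_index],       # safe to upload now
        latest_state,                            # state to register
        stream_results[last_state_index + 1:],   # stays queued for next time
    )


# usage: everything before the checkpoint gets uploaded, the tail stays queued
before, state, remainder = split_at_last_state([StateResult(new_state={"cursor": 5})])
print(len(before), state, len(remainder))
```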
```diff
@@ -1288,29 +1309,40 @@ class InboundSyncRequest(SyncRequest):
         if stream_name is None or len(stream_name) == 0:
             raise ValueError("Stream name cannot be empty")
         with self._apply_results_lock:
-            existing_results: List[
+            existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
-            existing_results.append(
+            existing_results.append(RecordsToUploadResult(
+                records=self._preprocess_results_list(stream_name, results, is_delete)
+            ))
             if new_state is not None:
-                existing_results.append(
+                existing_results.append(
+                    StateResult(new_state=new_state)
+                ) # append the new state at the end
             self._apply_results[stream_name] = existing_results
-            # if the total size of all the dataframes exceeds 200MB, apply the results immediately
-            # we'll use df.memory_usage(index=True) for this
             if self.development_mode is False:
                 # note: we want to do it for all values in self._apply_results, not just the new one
-
-
-
-
-
-
-
-
-
-
-
-
+                self._apply_results_if_size_exceeded()
+
+    def _apply_results_if_size_exceeded(self,):
+        # so first we need to get the list of lists from the dictionary values and flatten it
+        # then we can sum the memory usage of each dataframe
+        # if the total exceeds 200MB, we apply the results immediately
+        all_df_lists:List[List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = list(self._apply_results.values())
+        # flatten
+        all_dfs:List[pandas.DataFrame] = []
+        for sublist in all_df_lists:
+            for x in sublist:
+                if isinstance(x, RecordsToUploadResult):
+                    all_dfs.append(x.records)
+                if isinstance(x, CriteriaDeleteResult):
+                    all_dfs.append(x.criteria_deletes)
+        combined_length = sum([len(x) for x in all_dfs])
+        # first, don't bother if the count is less than 10000, since it's unlikely to be even close
+        if combined_length > 10000:
+            if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+                logger.debug(f"Applying results queue immediately due to combined dataframe size")
+                self.apply_results_queue()
 
     def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
         """
```
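The new `_apply_results_if_size_exceeded` helper centralises the flush heuristic for both record and criteria-delete dataframes: skip the measurement below 10,000 queued rows, otherwise flush once the combined `memory_usage` passes roughly 200 MB. The same check, sketched as a free function under those assumptions (not the class method itself):

```python
# Sketch of the size check only; thresholds mirror the diff (10,000 rows, ~200 MB).
import pandas


def should_flush(dataframes, row_threshold=10_000, byte_threshold=200_000_000):
    """Return True when the queued dataframes are large enough to flush early."""
    combined_rows = sum(len(df) for df in dataframes)
    if combined_rows <= row_threshold:
        return False  # too few rows to be anywhere near the byte limit
    total_bytes = sum(df.memory_usage(index=True).sum() for df in dataframes)
    return total_bytes > byte_threshold


print(should_flush([pandas.DataFrame({"id": range(100)})]))  # False: well under both thresholds
```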
```diff
@@ -1336,27 +1368,15 @@ class InboundSyncRequest(SyncRequest):
         logger.debug(
             f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
         )
-        existing_results: List[
-        if stream_name in self.
-            existing_results = self.
-        existing_results.append(
-
-
-
+        existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
+        if stream_name in self._apply_results:
+            existing_results = self._apply_results[stream_name]
+        existing_results.append(
+            CriteriaDeleteResult(
+                criteria_deletes=pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}])))
+        self._apply_results[stream_name] = existing_results
         if self.development_mode is False:
-
-            # so first we need to get the list of lists from the dictionary values and flatten it
-            # then we can sum the memory usage of each dataframe
-            # if the total exceeds 200MB, we apply the results immediately
-            all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
-            # flatten
-            all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
-            combined_length = sum([len(x) for x in all_dfs])
-            # first, don't both if the count is less than 10000, since it's unlikely to be even close
-            if combined_length > 10000:
-                if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
-                    logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
-                    self.apply_results_queue()
+            self._apply_results_if_size_exceeded()
 
     def mark_stream_started(self, stream_name: str):
         """
```
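With this change, `delete_by_criteria` enqueues a `CriteriaDeleteResult` into the same per-stream list rather than a separate dictionary; each call is captured as a one-row dataframe with `STREAM_NAME` and `DELETE_CRITERIA` columns. For illustration only (the stream name and criteria values below are made up):

```python
# Illustration of the one-row dataframe a delete-by-criteria request becomes.
import pandas

criteria = {"account_id": "0011x00001ABcDE"}  # hypothetical example criteria
row = pandas.DataFrame([{"STREAM_NAME": "accounts", "DELETE_CRITERIA": criteria}])
print(row.to_string(index=False))
```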
```diff
@@ -1491,7 +1511,7 @@ class InboundSyncRequest(SyncRequest):
             logger.debug(f"Failure to convert inbound data: {str(exception)}")
             return data
 
-    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
+    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]) -> pandas.DataFrame:
         """
         Creates a dataframe from the enqueued list, ready to upload.
         The result is a dataframe contain all (and only):
```
```diff
@@ -1636,7 +1656,7 @@ class InboundSyncRequest(SyncRequest):
         hash_object = hashlib.sha256(key_string.encode())
         return hash_object.hexdigest()
 
-    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame)
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
```
```diff
@@ -1663,7 +1683,6 @@ class InboundSyncRequest(SyncRequest):
                 raise ValueError(
                     f"Failed to write results to table {self._full_results_table_name}"
                 )
-            query_id = self._get_query_id_for_now()
             logger.debug(
                 f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
             )
```
```diff
@@ -1676,7 +1695,6 @@ class InboundSyncRequest(SyncRequest):
             # )
             for stream_name in stream_names:
                 self._results_exist[stream_name] = True
-            return query_id
         else:
             logger.debug("Results dataframe is empty, not applying")
```
All other files listed above are renamed only (the version directory change) and have no content changes.