omnata-plugin-runtime 0.3.13a49-py3-none-any.whl → 0.3.14a51-py3-none-any.whl
This diff shows the content changes between these publicly released package versions, as they appear in their respective public registries; it is provided for informational purposes only.
- omnata_plugin_runtime/omnata_plugin.py +147 -28
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/METADATA +1 -1
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/RECORD +5 -5
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/LICENSE +0 -0
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/WHEEL +0 -0
omnata_plugin_runtime/omnata_plugin.py
CHANGED
@@ -868,6 +868,12 @@ class InboundSyncRequest(SyncRequest):
             s.stream_name: s for s in streams
         }
         self._apply_results: Dict[str, List[pandas.DataFrame]] = {}
+        # named by convention, see SyncRunProcessor.enqueue
+        self._criteria_deletes_table_name = (
+            f"{self._source_app_name}.{self._results_schema_name}.{self._results_table_name}_CRITERIA_DELETES"
+        )
+        # These are similar to the results, but represent requests to delete records by some criteria
+        self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
         self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
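For context, a minimal sketch of the naming convention used by the new `_criteria_deletes_table_name` attribute; the app, schema and table values below are invented, and only the `_CRITERIA_DELETES` suffix pattern comes from the code above.

```python
# Hypothetical values; only the "_CRITERIA_DELETES" naming pattern comes from the diff above.
source_app_name = "MY_PLUGIN_APP"
results_schema_name = "SYNC_RESULTS"
results_table_name = "INBOUND_RUN_00123"

criteria_deletes_table_name = (
    f"{source_app_name}.{results_schema_name}.{results_table_name}_CRITERIA_DELETES"
)
# -> MY_PLUGIN_APP.SYNC_RESULTS.INBOUND_RUN_00123_CRITERIA_DELETES
```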
@@ -890,25 +896,28 @@ class InboundSyncRequest(SyncRequest):
         logger.info("InboundSyncRequest apply_results_queue ")
         if self._apply_results is not None:
             with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                stream_names:List[str] = []
                 for stream_name, stream_results in self._apply_results.items():
-                    results
+                    results.extend([
                         x for x in stream_results if x is not None and len(x) > 0
-                    ] # remove any None/empty dataframes
-
-
-
-                    )
-
-
-
-
-
-
-
-
-
-
-
+                    ]) # remove any None/empty dataframes
+                    stream_names.append(stream_name)
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued results"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_results_dataframe(stream_names, all_dfs)
+                    # add the count of this batch to the total for this stream
+                    self._stream_record_counts[
+                        stream_name
+                    ] = self._stream_record_counts[stream_name] + len(all_dfs)
+                    # update the stream state object too
+                    self._apply_latest_states()
+                    self._apply_results[stream_name] = None
                 self._apply_results = {}
             # update the inbound stream record counts, so we can see progress
             self._plugin_message(
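The reworked `apply_results_queue` now flattens every stream's queued frames into a single combined upload instead of applying each stream separately. A rough standalone sketch of that batching step, with invented stream names and data (nothing here is part of the runtime API itself):

```python
from typing import Dict, List, Optional

import pandas

# Hypothetical queue shaped like self._apply_results: one list of frames per stream.
apply_results: Dict[str, List[Optional[pandas.DataFrame]]] = {
    "contacts": [pandas.DataFrame({"APP_IDENTIFIER": ["1", "2"]}), None],
    "orders": [pandas.DataFrame({"APP_IDENTIFIER": ["9"]}), pandas.DataFrame()],
}

results: List[pandas.DataFrame] = []
stream_names: List[str] = []
for stream_name, stream_results in apply_results.items():
    # drop None/empty frames, as the reworked apply_results_queue does
    results.extend([x for x in stream_results if x is not None and len(x) > 0])
    stream_names.append(stream_name)

if results:
    all_dfs = pandas.concat(results)  # one combined frame, uploaded in a single call
    print(stream_names, len(all_dfs))  # ['contacts', 'orders'] 3
```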
@@ -918,6 +927,22 @@ class InboundSyncRequest(SyncRequest):
                     stream_errors=self._omnata_log_handler.stream_global_errors
                 )
             )
+        # also take care of uploading delete requests
+        if self._apply_results_criteria_deletes is not None:
+            with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                for stream_name, stream_results in self._apply_results_criteria_deletes.items():
+                    results.extend([
+                        x for x in stream_results if x is not None and len(x) > 0
+                    ])
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued criteria deletes"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_criteria_deletes_dataframe(all_dfs)

     def apply_cancellation(self):
         """
@@ -953,9 +978,15 @@ class InboundSyncRequest(SyncRequest):
             message=PluginMessageAbandonedStreams(abandoned_streams=abandoned_streams)
         )

-    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any):
+    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any, is_delete:Union[bool,List[bool]] = False):
         """
-        Adds some results to the queue for applying asynchronously
+        Adds some results to the queue for applying asynchronously.
+        stream_name: str, the name of the stream
+        results: List[Dict], the results to enqueue
+        new_state: Any, the new state which applies to the stream, given the new results
+        is_delete: Union[bool,List[bool]], whether the results are deletes or not
+        is_delete can be a single value, which means all results are the same, or a list of booleans, which means each result is different
+        For records where is_delete is True, you can provide the current record value if it is known, or just the identifier
         """
         logger.info(f"Enqueueing {len(results)} results for upload")
         if stream_name is None or len(stream_name) == 0:
@@ -964,7 +995,7 @@ class InboundSyncRequest(SyncRequest):
             existing_results: List[pandas.DataFrame] = []
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
-            existing_results.append(self._preprocess_results_list(stream_name, results))
+            existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
             self._apply_results[stream_name] = existing_results
             current_latest = self._latest_states or {}
             self._latest_states = {**current_latest, **{stream_name: new_state}}
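A hedged usage sketch of the extended `enqueue_results` signature, assuming `sync_request` is the `InboundSyncRequest` handed to a plugin; the stream name, helper function, and record shapes are invented for illustration.

```python
from typing import Any, Dict, List

from omnata_plugin_runtime.omnata_plugin import InboundSyncRequest


def upload_contact_changes(
    sync_request: InboundSyncRequest,
    changed: List[Dict],
    removed_ids: List[str],
    new_state: Any,
):
    # Hypothetical plugin-side helper; "contacts" and the record shapes are made up.
    # Deleted records can carry just the identifier, per the new docstring.
    records = changed + [{"id": removed_id} for removed_id in removed_ids]
    flags = [False] * len(changed) + [True] * len(removed_ids)
    sync_request.enqueue_results(
        stream_name="contacts",
        results=records,
        new_state=new_state,
        is_delete=flags,  # one flag per record; a single bool would apply to all of them
    )
```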
@@ -984,7 +1015,52 @@ class InboundSyncRequest(SyncRequest):
                 if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
                     logger.info(f"Applying results queue immediately due to combined dataframe size")
                     self.apply_results_queue()
-
+
+    def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
+        """
+        Submits some critera (field→value dict) which will cause matching records to be marked as deleted at the end of the run.
+        This feature was created primarily for array fields that become child streams.
+        The parent record is updated, which means there is a set of new children, but we need to delete the previously sync'd records and we don't know their identifiers.
+
+        The criteria is applied before the new records for the current run are applied. In other words, it will not delete any records from the current run.
+
+        For a record to be deleted, it must match fields with all the criteria supplied. At least one field value must be provided.
+        """
+        if len(criteria) == 0:
+            raise ValueError("At least one field value must be provided for deletion criteria")
+        if stream_name not in self._streams_dict:
+            raise ValueError(
+                f"Cannot delete records for stream {stream_name} as its configuration doesn't exist"
+            )
+        # append the new criteria to the self._criteria_deletes_table_name table
+        # this table has two columns:
+        # STREAM_NAME: string
+        # DELETE_CRITERIA: object
+        with self._apply_results_lock:
+            logger.info(
+                f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
+            )
+            existing_results: List[pandas.DataFrame] = []
+            if stream_name in self._apply_results_criteria_deletes:
+                existing_results = self._apply_results_criteria_deletes[stream_name]
+            existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
+            self._apply_results_criteria_deletes[stream_name] = existing_results
+            # if the total size of all the dataframes exceeds 200MB, apply the results immediately
+            # we'll use df.memory_usage(index=True) for this
+            if self.development_mode is False:
+                # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
+                # so first we need to get the list of lists from the dictionary values and flatten it
+                # then we can sum the memory usage of each dataframe
+                # if the total exceeds 200MB, we apply the results immediately
+                all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
+                # flatten
+                all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
+                combined_length = sum([len(x) for x in all_dfs])
+                # first, don't both if the count is less than 10000, since it's unlikely to be even close
+                if combined_length > 10000:
+                    if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+                        logger.info(f"Applying criteria deletes queue immediately due to combined dataframe size")
+                        self.apply_results_queue()

     def mark_stream_complete(self, stream_name: str):
         """
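A hedged usage sketch of the new `delete_by_criteria` method, built around the parent/child scenario its docstring describes; `sync_request`, the `order_lines` stream, and the helper function are all invented for illustration.

```python
from typing import Dict, List

from omnata_plugin_runtime.omnata_plugin import InboundSyncRequest


def replace_order_lines(sync_request: InboundSyncRequest, order_id: str, new_lines: List[Dict]):
    # Hypothetical parent/child scenario: "order_lines" is an invented child stream whose
    # previously synced records for this parent can't be enumerated by identifier.
    sync_request.delete_by_criteria(
        stream_name="order_lines",
        criteria={"order_id": order_id},  # a record must match every supplied field to be deleted
    )
    # the criteria only affects previously synced records, not the ones enqueued in this run
    sync_request.enqueue_results("order_lines", new_lines, new_state=None)
```

Passing an empty criteria dict raises `ValueError`, as does naming a stream that is not in the sync's configuration.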
@@ -1045,14 +1121,15 @@ class InboundSyncRequest(SyncRequest):
             logger.debug(f"Failure to convert inbound data: {str(exception)}")
             return data

-    def _preprocess_results_list(self, stream_name: str, results: List[Dict]):
+    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
         """
         Creates a dataframe from the enqueued list, ready to upload.
         The result is a dataframe contain all (and only):
         'APP_IDENTIFIER' string
         'STREAM_NAME' string
         'RETRIEVE_DATE' datetime (UTC)
-        'RECORD_DATA' object
+        'RECORD_DATA' object,
+        'IS_DELETED' boolean
         """
         # for required_column in ['RECORD_DATA']:
         #     if required_column not in results_df.columns:
@@ -1063,6 +1140,9 @@ class InboundSyncRequest(SyncRequest):
             )
         logger.info(f"preprocessing for stream: {self._streams_dict[stream_name]}")
         if len(results) > 0:
+            if isinstance(is_delete, list):
+                if len(results) != len(is_delete):
+                    raise ValueError(f"results and is_delete lists must be the same length")
             # We need to remove any values (included nesting) which are empty dicts. This is to prevent the arrow error:
             # Cannot write struct type '<field_name>' with no child field to Parquet. Consider adding a dummy child field.
             results = [remove_empty_dict_values(result) for result in results]
@@ -1108,11 +1188,20 @@ class InboundSyncRequest(SyncRequest):
             )

             results_df["APP_IDENTIFIER"] = results_df["RECORD_DATA"].apply(lambda x: get_nested_value(dict(x),primary_key_field))
-            # we jump the record data to a json string to make uploading to Snowflake less error prone
-            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(json.dumps)
             # the timestamps in Snowflake are TIMESTAMP_LTZ, so we upload in string format to ensure the
             # timezone information is present.
             results_df["RETRIEVE_DATE"] = str(datetime.datetime.now().astimezone())
+            # create the IS_DELETED column from the is_delete list
+            results_df["IS_DELETED"] = is_delete
+            # for each record, if IS_DELETED is true and RECORD_DATA only contains a single key, we assume that's the identifier
+            # in this case, we nullify the RECORD_DATA column to indicate that the delete operation does not contain the full record
+            for index, row in results_df.iterrows():
+                if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
+                    results_df.at[index, "RECORD_DATA"] = None
+            # we dump the record data to a json string to make uploading to Snowflake less error prone, but only if it's not None
+            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(
+                lambda x: json.dumps(x) if x is not None else None
+            )
             results_df["STREAM_NAME"] = stream_name
         else:
             results_df = pandas.DataFrame(
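To make the new `IS_DELETED` handling concrete, here is a toy reproduction of just that preprocessing step; the column names come from the diff, while the frame contents are invented.

```python
import json

import pandas

# Toy stand-in for results_df just before the new IS_DELETED handling runs.
results_df = pandas.DataFrame(
    {"RECORD_DATA": [{"id": "1", "name": "Alice"}, {"id": "2"}]}
)
is_delete = [False, True]

results_df["IS_DELETED"] = is_delete
# identifier-only deletes (a single key in RECORD_DATA) have their payload nulled out
for index, row in results_df.iterrows():
    if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
        results_df.at[index, "RECORD_DATA"] = None
# serialise remaining payloads to JSON strings, leaving the nulled deletes as None
results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(
    lambda x: json.dumps(x) if x is not None else None
)
# row 0 keeps a JSON payload; row 1 ends up with RECORD_DATA None and IS_DELETED True
```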
@@ -1122,16 +1211,17 @@ class InboundSyncRequest(SyncRequest):
                     "STREAM_NAME",
                     "RECORD_DATA",
                     "RETRIEVE_DATE",
+                    "IS_DELETED"
                 ],
             )
         # trim out the columns we don't need to return
         return results_df[
             results_df.columns.intersection(
-                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE"]
+                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE", "IS_DELETED"]
             )
         ]

-    def _apply_results_dataframe(self,
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
@@ -1164,7 +1254,8 @@ class InboundSyncRequest(SyncRequest):
                 #    column_order='index',
                 #    #create_temp_table=True
                 # )
-
+                for stream_name in stream_names:
+                    self._results_exist[stream_name] = True
         else:
             logger.info("Results dataframe is empty, not applying")

@@ -1175,6 +1266,34 @@ class InboundSyncRequest(SyncRequest):
         """
         self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))

+    def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
+        """
+        Applies results for an inbound sync. The results are staged into a temporary
+        table in Snowflake, so that we can make an atomic commit at the end.
+        """
+        if len(results_df) > 0:
+            with self._snowflake_query_lock:
+                logger.info(
+                    f"Applying {len(results_df)} criteria deletes to {self._criteria_deletes_table_name}"
+                )
+                # try setting parquet engine here, since the engine parameter does not seem to make it through to the write_pandas function
+                success, nchunks, nrows, _ = write_pandas(
+                    conn=self._session._conn._cursor.connection, # pylint: disable=protected-access
+                    df=results_df,
+                    table_name=self._criteria_deletes_table_name,
+                    quote_identifiers=False, # already done in get_temp_table_name
+                    table_type="transient"
+                )
+                if not success:
+                    raise ValueError(
+                        f"Failed to write results to table {self._criteria_deletes_table_name}"
+                    )
+                logger.info(
+                    f"Wrote {nrows} rows and {nchunks} chunks to table {self._criteria_deletes_table_name}"
+                )
+        else:
+            logger.info("Results dataframe is empty, not applying")
+

 class ConnectResponse(SubscriptableBaseModel):
     """
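For reference, the frame that reaches `_apply_criteria_deletes_dataframe` is built one row at a time by `delete_by_criteria`; a sketch of its shape, with invented values and the two column names taken from the comments in the diff.

```python
import pandas

# Two hypothetical queued delete requests for the same stream.
criteria_deletes_df = pandas.DataFrame([
    {"STREAM_NAME": "order_lines", "DELETE_CRITERIA": {"order_id": "ORD-1001"}},
    {"STREAM_NAME": "order_lines", "DELETE_CRITERIA": {"order_id": "ORD-1002"}},
])
# _apply_criteria_deletes_dataframe stages a frame like this into the
# *_CRITERIA_DELETES transient table via the write_pandas call shown above.
```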
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/RECORD
RENAMED
@@ -3,10 +3,10 @@ omnata_plugin_runtime/api.py,sha256=_N5ok5LN7GDO4J9n3yduXp3tpjmhpySY__U2baiygrs,
 omnata_plugin_runtime/configuration.py,sha256=at29ExowF_T4_2U9gY0BF4IVdwC-vDytmNRHL7UCWh8,34742
 omnata_plugin_runtime/forms.py,sha256=30CJB24TqfLYNnkplZdUbeqA-P9rUIBujVKXw_S-wKY,18371
 omnata_plugin_runtime/logging.py,sha256=bn7eKoNWvtuyTk7RTwBS9UARMtqkiICtgMtzq3KA2V0,3272
-omnata_plugin_runtime/omnata_plugin.py,sha256=
+omnata_plugin_runtime/omnata_plugin.py,sha256=yFZUiCPyFkCH1GRE5_J-ZgvtclWTJ0q3cGyc8x2Oi1c,98853
 omnata_plugin_runtime/plugin_entrypoints.py,sha256=_XgmWsrHoSshkl5Z2T27BAGVnBh4yH-8lni5sdGlSz8,27670
 omnata_plugin_runtime/rate_limiting.py,sha256=se6MftQI5NrVHaLb1hByPCgAESPQhkAgIG7KIU1clDU,16562
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
+omnata_plugin_runtime-0.3.14a51.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
+omnata_plugin_runtime-0.3.14a51.dist-info/METADATA,sha256=FFW46VXQvWKVO8h_y8JHDrGXVpCaXmxdrY1Y4HYT6rw,1604
+omnata_plugin_runtime-0.3.14a51.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+omnata_plugin_runtime-0.3.14a51.dist-info/RECORD,,
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/LICENSE
RENAMED
File without changes
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/WHEEL
RENAMED
File without changes