omnata-plugin-runtime 0.10.33__tar.gz → 0.11.0a298__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: omnata-plugin-runtime
-Version: 0.10.33
+Version: 0.11.0a298
 Summary: Classes and common runtime components for building and running Omnata Plugins
 Author: James Weakley
 Author-email: james.weakley@omnata.com
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "omnata-plugin-runtime"
-version = "0.10.33"
+version = "0.11.0-a298"
 description = "Classes and common runtime components for building and running Omnata Plugins"
 authors = ["James Weakley <james.weakley@omnata.com>"]
 readme = "README.md"
@@ -405,6 +405,13 @@ class FullyQualifiedTable(BaseModel):
         return self.get_fully_qualified_name(
             table_override=f"{self.table_name}_CRITERIA_DELETES"
         )
+
+    def get_fully_qualified_state_register_table_name(self) -> str:
+        """
+        Returns the fully qualified name of the state register table.
+        This is used to store state values for syncs, paired with query IDs to use with time travel.
+        """
+        return self.get_fully_qualified_name(table_override=f"{self.table_name}_STATE_REGISTER")
 
 class SnowflakeViewPart(BaseModel):
     """
@@ -93,6 +93,9 @@ from .rate_limiting import (
     RateLimitState,
     RateLimitedSession
 )
+from .json_schema import (
+    FullyQualifiedTable
+)
 
 SortDirectionType = Literal["asc", "desc"]
 
@@ -1055,7 +1058,6 @@ class InboundSyncRequest(SyncRequest):
 
         # These are similar to the results, but represent requests to delete records by some criteria
         self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
-        self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
         self._results_exist: Dict[
@@ -1085,24 +1087,47 @@ class InboundSyncRequest(SyncRequest):
             sync_id=sync_id,
             branch_name=branch_name
         )
-        # named by convention, see SyncRunProcessor.enqueue
-        self._criteria_deletes_table_name = (
-            f"{self._source_app_name}.{self._results_schema_name}.{self._results_table_name}_CRITERIA_DELETES"
+        # The results table name is also used to derive several other table/stage names
+        results_table = FullyQualifiedTable(
+            database_name=self._source_app_name,
+            schema_name=self._results_schema_name,
+            table_name=self._results_table_name
         )
-        self._apply_results: Dict[str, List[pandas.DataFrame]] = {}
+        self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
+        self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
+        # this is keyed on stream name, each entry containing a list of dataframes and state updates mixed
+        self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
 
     def apply_results_queue(self):
         """
-        Merges all of the queued results and applies them
+        Merges all of the queued results and applies them, including state updates.
         """
-        logger.debug("InboundSyncRequest apply_results_queue ")
+        logger.debug("InboundSyncRequest apply_results_queue")
         if self._apply_results is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
-                stream_names:List[str] = []
+                stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
                 for stream_name, stream_results in self._apply_results.items():
+                    # the stream results contain an ordered sequence of dataframes and state updates (append only)
+                    # we only want to apply the dataframes up until the most recent state update,
+                    # so first we iterate backwards to find the last state update
+                    last_state_index = -1
+                    for i in range(len(stream_results) - 1, -1, -1):
+                        if isinstance(stream_results[i], dict):
+                            last_state_index = i
+                            stream_states_for_upload[stream_name] = stream_results[i]
+                            break
+                    # if there are no state updates, we can't do anything with this stream
+                    if last_state_index == -1:
+                        logger.debug(
+                            f"No state updates for stream {stream_name}, skipping"
+                        )
+                        continue
+                    assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
+                    # now we can take the dataframes up to the last state update
+                    dfs = stream_results[:last_state_index]
                     non_empty_dfs = [
-                        x for x in stream_results if x is not None and len(x) > 0
+                        x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
                     ]
                     # get the total length of all the dataframes
                     total_length = sum([len(x) for x in non_empty_dfs])
@@ -1110,22 +1135,28 @@ class InboundSyncRequest(SyncRequest):
                     self._stream_record_counts[
                         stream_name
                     ] = self._stream_record_counts[stream_name] + total_length
-                    results.extend(non_empty_dfs) # remove any None/empty dataframes
-                    stream_names.append(stream_name)
+                    results.extend(non_empty_dfs)
+                    # now remove everything up to and including the last state update,
+                    # so that we don't apply the same state update multiple times
+                    self._apply_results[stream_name] = stream_results[
+                        last_state_index + 1 :
+                    ]  # keep everything after the last state update
                 if len(results) > 0:
                     logger.debug(
                         f"Applying {len(results)} batches of queued results"
                     )
                     # upload all cached apply results
                     all_dfs = pandas.concat(results)
-                    self._apply_results_dataframe(stream_names, all_dfs)
-                    # update the stream state object too
-                    self._apply_latest_states()
-                    for stream_name in stream_names:
-                        self._apply_results[stream_name] = None
-                    self._apply_results = {}
-
+                    query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
+                    # now that the results have been applied, insert the latest state for each stream
+                    # into the state register table
+                    self._directly_insert_to_state_register(
+                        stream_states_for_upload, query_id=query_id
+                    )
+
         # also take care of uploading delete requests
+        # technically these should be managed along with the state; however, there aren't any scenarios where checkpointing
+        # is done and deletes have an impact, because we only checkpoint when the target table is empty to begin with
         if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
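
Note: the per-stream queue now interleaves DataFrames with state dicts, and apply_results_queue only flushes data up to the most recent state update, so that each registered state corresponds exactly to the rows already uploaded. A small self-contained sketch of that split, equivalent to the backwards scan above (values are illustrative):

    import pandas

    df1, df2, df3, df4 = (pandas.DataFrame({"ID": [i]}) for i in range(4))
    stream_results = [df1, {"cursor": 10}, df2, df3, {"cursor": 30}, df4]

    # locate the most recent state update
    last_state_index = max(i for i, x in enumerate(stream_results) if isinstance(x, dict))
    # dataframes preceding it are uploaded (earlier dicts are filtered out by the isinstance check)
    dfs_to_apply = [x for x in stream_results[:last_state_index] if isinstance(x, pandas.DataFrame)]
    state_to_register = stream_results[last_state_index]       # {"cursor": 30}
    remaining_queue = stream_results[last_state_index + 1:]    # [df4] stays queued for the next flush
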
@@ -1149,6 +1180,26 @@ class InboundSyncRequest(SyncRequest):
             # so we need to make sure all the results are applied first
             self.apply_progress_updates()
 
+    def _directly_insert_to_state_register(
+        self, stream_states_for_upload: Dict[str, Dict[str, Any]],
+        query_id: Optional[str] = None
+    ) -> str:
+        binding_values = []
+        values_clauses = []
+
+        with self._snowflake_query_lock:
+            if query_id is None:
+                query_id = self._get_query_id_for_now()
+            for stream_name, latest_state in stream_states_for_upload.items():
+                binding_values.extend([stream_name, query_id, json.dumps(latest_state)])
+                values_clauses.append(
+                    "(?, ?, PARSE_JSON(?))"
+                )
+            final_query = f"""INSERT INTO {self.state_register_table_name} (STREAM_NAME, QUERY_ID, LATEST_STATE)
+                VALUES {','.join(values_clauses)}"""
+            self._session.sql(final_query, binding_values).collect()
+            return query_id
+
     def apply_progress_updates(self, ignore_errors:bool = True):
         """
         Sends a message to the plugin with the current progress of the sync run, if it has changed since last time.
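
Note: _directly_insert_to_state_register builds a single multi-row INSERT with one (?, ?, PARSE_JSON(?)) clause per stream and a flat list of bind values. A rough sketch of what it would produce for two streams (the table name, states and query ID are placeholders):

    import json

    state_register_table = "PLUGIN_APP.SYNC_RESULTS.SYNC_123_RECORDS_STATE_REGISTER"  # assumed name
    stream_states = {"accounts": {"cursor": "2024-01-01"}, "contacts": {"cursor": "2024-01-02"}}
    query_id = "01a2b3c4-0000-0000-0000-000000000000"  # placeholder Snowflake query ID

    binding_values, values_clauses = [], []
    for stream_name, latest_state in stream_states.items():
        binding_values.extend([stream_name, query_id, json.dumps(latest_state)])
        values_clauses.append("(?, ?, PARSE_JSON(?))")

    final_query = (
        f"INSERT INTO {state_register_table} (STREAM_NAME, QUERY_ID, LATEST_STATE) "
        f"VALUES {','.join(values_clauses)}"
    )
    # final_query    -> INSERT INTO ... VALUES (?, ?, PARSE_JSON(?)),(?, ?, PARSE_JSON(?))
    # binding_values -> ['accounts', '01a2b3c4-...', '{"cursor": "2024-01-01"}', 'contacts', ...]
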
@@ -1224,9 +1274,9 @@ class InboundSyncRequest(SyncRequest):
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
                 existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+                if new_state is not None:
+                    existing_results.append(new_state)  # append the new state at the end
                 self._apply_results[stream_name] = existing_results
-            current_latest = self._latest_states or {}
-            self._latest_states = {**current_latest, **{stream_name: new_state}}
             # if the total size of all the dataframes exceeds 200MB, apply the results immediately
             # we'll use df.memory_usage(index=True) for this
             if self.development_mode is False:
@@ -1321,9 +1371,42 @@ class InboundSyncRequest(SyncRequest):
         instead you should store state using the new_state parameter in the enqueue_results
         method to ensure it's applied along with the associated new records.
         """
+        self.enqueue_state(
+            stream_name=stream_name,
+            new_state=new_state,
+            query_id=None  # query_id will be generated automatically if not provided
+        )
+
+    def enqueue_state(self, stream_name: str, new_state: Any, query_id: Optional[str] = None):
+        """
+        Enqueues some new stream state to be stored. This method should be called whenever the state of a stream changes.
+
+        If there have been records enqueued here for this stream, it is assumed that the state is related to those records.
+        In this case, the state will be applied after the records are applied.
+        If there are no records enqueued for this stream, the state will be applied immediately, as it is assumed that the results
+        were directly inserted and therefore we need to capture the current query ID before more results are inserted.
+        """
         with self._apply_results_lock:
-            current_latest = self._latest_states or {}
-            self._latest_states = {**current_latest, **{stream_name: new_state}}
+            if stream_name in self._apply_results:
+                if len(self._apply_results[stream_name]) > 0:
+                    self._apply_results[stream_name].append(new_state)
+                    return
+
+            self._directly_insert_to_state_register(
+                {
+                    stream_name: new_state
+                }, query_id=query_id
+            )
+
+
+    def _get_query_id_for_now(self):
+        """
+        Gets a Snowflake query ID right now. Note that this does not acquire the Snowflake query lock; the caller
+        should ensure that it is called in a thread-safe manner.
+        """
+        job = self._session.sql("select 1").collect_nowait()
+        job.result()
+        return job.query_id
 
     def get_queued_results(self, stream_name: str):
         """
@@ -1337,7 +1420,8 @@ class InboundSyncRequest(SyncRequest):
                 "get_queued_results was called, but no results have been queued"
             )
-        concat_results = pandas.concat(self._apply_results[stream_name])
-        return concat_results
+        # the queued list now contains state update dicts as well as dataframes, so only concatenate the dataframes
+        queued_dfs = [c for c in self._apply_results[stream_name] if c is not None and isinstance(c, pandas.DataFrame) and len(c) > 0]
+        return pandas.concat(queued_dfs)
 
     def _convert_by_json_schema(
         self, stream_name: str, data: Dict, json_schema: Dict
@@ -1512,10 +1595,11 @@ class InboundSyncRequest(SyncRequest):
         hash_object = hashlib.sha256(key_string.encode())
         return hash_object.hexdigest()
 
-    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
+        Returns a query ID that can be used for checkpointing after the copy into command has run.
         """
         if len(results_df) > 0:
             with self._snowflake_query_lock:
@@ -1538,6 +1622,7 @@ class InboundSyncRequest(SyncRequest):
                     raise ValueError(
                         f"Failed to write results to table {self._full_results_table_name}"
                     )
+                query_id = self._get_query_id_for_now()
                 logger.debug(
                     f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
                 )
@@ -1550,19 +1635,10 @@ class InboundSyncRequest(SyncRequest):
                 # )
                 for stream_name in stream_names:
                     self._results_exist[stream_name] = True
+                return query_id
         else:
             logger.debug("Results dataframe is empty, not applying")
 
-    def _apply_latest_states(self):
-        """
-        Updates the SYNC table to have the latest stream states.
-        TODO: This should be done in concert with the results, revisit
-        """
-        if self._last_states_update is None or json.dumps(self._latest_states) != json.dumps(self._last_states_update):
-            self._last_states_update = json.loads(json.dumps(self._latest_states))
-            self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))
-
-
     def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
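
Note: the QUERY_ID captured alongside each state row is what makes checkpointing possible, since Snowflake time travel can read the results table exactly as it was when that statement ran. A minimal sketch of how a consumer might use a registered query ID (the session variable and table name are assumptions; the AT (STATEMENT => ...) clause is standard Snowflake syntax):

    # assumes a Snowpark session and the table naming used elsewhere in this diff
    query_id = "01a2b3c4-0000-0000-0000-000000000000"  # a QUERY_ID read from the state register table
    rows_at_checkpoint = session.sql(
        "SELECT * FROM PLUGIN_APP.SYNC_RESULTS.SYNC_123_RECORDS "
        f"AT (STATEMENT => '{query_id}')"
    ).collect()
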
@@ -250,7 +250,7 @@ class PluginEntrypoint:
         self._plugin_instance._configuration_parameters = parameters
 
         inbound_sync_request.update_activity("Invoking plugin")
-        logger.info(f"inbound sync request: {inbound_sync_request}")
+        logger.info(f"Inbound sync request: {json.dumps(to_jsonable_python(inbound_sync_request))}")
         # plugin_instance._inbound_sync_request = outbound_sync_request
         with tracer.start_as_current_span("invoke_plugin"):
             with HttpRateLimiting(inbound_sync_request, parameters):