omnata-plugin-runtime 0.10.33__tar.gz → 0.11.0a298__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: omnata-plugin-runtime
-Version: 0.10.33
+Version: 0.11.0a298
 Summary: Classes and common runtime components for building and running Omnata Plugins
 Author: James Weakley
 Author-email: james.weakley@omnata.com
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "omnata-plugin-runtime"
-version = "0.10.33"
+version = "0.11.0-a298"
 description = "Classes and common runtime components for building and running Omnata Plugins"
 authors = ["James Weakley <james.weakley@omnata.com>"]
 readme = "README.md"
@@ -405,6 +405,13 @@ class FullyQualifiedTable(BaseModel):
         return self.get_fully_qualified_name(
             table_override=f"{self.table_name}_CRITERIA_DELETES"
         )
+
+    def get_fully_qualified_state_register_table_name(self) -> str:
+        """
+        Returns the fully qualified name of the state register table.
+        This is used to store state values for syncs, paired with query IDs to use with time travel.
+        """
+        return self.get_fully_qualified_name(table_override=f"{self.table_name}_STATE_REGISTER")
 
 class SnowflakeViewPart(BaseModel):
     """
@@ -93,6 +93,9 @@ from .rate_limiting import (
     RateLimitState,
     RateLimitedSession
 )
+from .json_schema import (
+    FullyQualifiedTable
+)
 
 SortDirectionType = Literal["asc", "desc"]
 
@@ -1055,7 +1058,6 @@ class InboundSyncRequest(SyncRequest):
 
         # These are similar to the results, but represent requests to delete records by some criteria
         self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
-        self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
         self._results_exist: Dict[
@@ -1085,24 +1087,47 @@ class InboundSyncRequest(SyncRequest):
             sync_id=sync_id,
             branch_name=branch_name
         )
-        # named by convention, see SyncRunProcessor.enqueue
-        self._criteria_deletes_table_name = (
-            f"{self._source_app_name}.{self._results_schema_name}.{self._results_table_name}_CRITERIA_DELETES"
+        # The results table name is also used to derive several other table/stage names
+        results_table = FullyQualifiedTable(
+            database_name=self._source_app_name,
+            schema_name=self._results_schema_name,
+            table_name=self._results_table_name
         )
-        self._apply_results: Dict[str, List[pandas.DataFrame]] = {}
+        self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
+        self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
+        # this is keyed on stream name, each entry containing a list of dataframes and state updates mixed
+        self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
 
     def apply_results_queue(self):
         """
-        Merges all of the queued results and applies them
+        Merges all of the queued results and applies them, including state updates.
         """
-        logger.debug("InboundSyncRequest apply_results_queue ")
+        logger.debug("InboundSyncRequest apply_results_queue")
         if self._apply_results is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
-                stream_names:List[str] = []
+                stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
                 for stream_name, stream_results in self._apply_results.items():
+                    # the stream results contain an ordered sequence of dataframes and state updates (append only)
+                    # we only want to apply the dataframes up until the most recent state update,
+                    # so first we iterate backwards to find the last state update
+                    last_state_index = -1
+                    for i in range(len(stream_results) - 1, -1, -1):
+                        if isinstance(stream_results[i], dict):
+                            last_state_index = i
+                            stream_states_for_upload[stream_name] = stream_results[i]
+                            break
+                    # if there are no state updates, we can't do anything with this stream
+                    if last_state_index == -1:
+                        logger.debug(
+                            f"No state updates for stream {stream_name}, skipping"
+                        )
+                        continue
+                    assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
+                    # now we can take the dataframes up to the last state update
+                    dfs = stream_results[:last_state_index]
                     non_empty_dfs = [
-                        x for x in stream_results if x is not None and len(x) > 0
+                        x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
                     ]
                     # get the total length of all the dataframes
                     total_length = sum([len(x) for x in non_empty_dfs])
@@ -1110,22 +1135,28 @@ class InboundSyncRequest(SyncRequest):
                     self._stream_record_counts[
                         stream_name
                     ] = self._stream_record_counts[stream_name] + total_length
-                    results.extend(non_empty_dfs) # remove any None/empty dataframes
-                    stream_names.append(stream_name)
+                    results.extend(non_empty_dfs)
+                    # now remove everything up to and including the last state update,
+                    # so that we don't apply the same state update multiple times
+                    self._apply_results[stream_name] = stream_results[
+                        last_state_index + 1 :
+                    ]  # keep everything after the last state update
                 if len(results) > 0:
                     logger.debug(
                         f"Applying {len(results)} batches of queued results"
                     )
                     # upload all cached apply results
                     all_dfs = pandas.concat(results)
-                    self._apply_results_dataframe(stream_names, all_dfs)
-                    # update the stream state object too
-                    self._apply_latest_states()
-                    for stream_name in stream_names:
-                        self._apply_results[stream_name] = None
-                    self._apply_results = {}
-
+                    query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
+                    # now that the results have been applied, insert the latest state for each stream
+                    # into the state register table
+                    self._directly_insert_to_state_register(
+                        stream_states_for_upload, query_id=query_id
+                    )
+
         # also take care of uploading delete requests
+        # technically these should be managed along with the state; however, there aren't any scenarios where checkpointing
+        # is done and deletes have an impact, because we only checkpoint when the target table is empty to begin with
         if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
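
Note: the per-stream queue now interleaves DataFrames with state dicts, and apply_results_queue only flushes data up to the most recent state update, so that each registered state corresponds exactly to the rows already uploaded. A small self-contained sketch of that split, equivalent to the backwards scan above (values are illustrative):

    import pandas

    df1, df2, df3, df4 = (pandas.DataFrame({"ID": [i]}) for i in range(4))
    stream_results = [df1, {"cursor": 10}, df2, df3, {"cursor": 30}, df4]

    # locate the most recent state update
    last_state_index = max(i for i, x in enumerate(stream_results) if isinstance(x, dict))
    # dataframes preceding it are uploaded (earlier dicts are filtered out by the isinstance check)
    dfs_to_apply = [x for x in stream_results[:last_state_index] if isinstance(x, pandas.DataFrame)]
    state_to_register = stream_results[last_state_index]       # {"cursor": 30}
    remaining_queue = stream_results[last_state_index + 1:]    # [df4] stays queued for the next flush
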
@@ -1149,6 +1180,26 @@ class InboundSyncRequest(SyncRequest):
             # so we need to make sure all the results are applied first
             self.apply_progress_updates()
 
+    def _directly_insert_to_state_register(
+        self, stream_states_for_upload: Dict[str, Dict[str, Any]],
+        query_id: Optional[str] = None
+    ) -> str:
+        binding_values = []
+        values_clauses = []
+
+        with self._snowflake_query_lock:
+            if query_id is None:
+                query_id = self._get_query_id_for_now()
+            for stream_name, latest_state in stream_states_for_upload.items():
+                binding_values.extend([stream_name, query_id, json.dumps(latest_state)])
+                values_clauses.append(
+                    "(?, ?, PARSE_JSON(?))"
+                )
+            final_query = f"""INSERT INTO {self.state_register_table_name} (STREAM_NAME, QUERY_ID, LATEST_STATE)
+                VALUES {','.join(values_clauses)}"""
+            self._session.sql(final_query, binding_values).collect()
+            return query_id
+
     def apply_progress_updates(self, ignore_errors:bool = True):
         """
         Sends a message to the plugin with the current progress of the sync run, if it has changed since last time.
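
Note: _directly_insert_to_state_register builds a single multi-row INSERT with one (?, ?, PARSE_JSON(?)) clause per stream and a flat list of bind values. A rough sketch of what it would produce for two streams (the table name, states and query ID are placeholders):

    import json

    state_register_table = "PLUGIN_APP.SYNC_RESULTS.SYNC_123_RECORDS_STATE_REGISTER"  # assumed name
    stream_states = {"accounts": {"cursor": "2024-01-01"}, "contacts": {"cursor": "2024-01-02"}}
    query_id = "01a2b3c4-0000-0000-0000-000000000000"  # placeholder Snowflake query ID

    binding_values, values_clauses = [], []
    for stream_name, latest_state in stream_states.items():
        binding_values.extend([stream_name, query_id, json.dumps(latest_state)])
        values_clauses.append("(?, ?, PARSE_JSON(?))")

    final_query = (
        f"INSERT INTO {state_register_table} (STREAM_NAME, QUERY_ID, LATEST_STATE) "
        f"VALUES {','.join(values_clauses)}"
    )
    # final_query    -> INSERT INTO ... VALUES (?, ?, PARSE_JSON(?)),(?, ?, PARSE_JSON(?))
    # binding_values -> ['accounts', '01a2b3c4-...', '{"cursor": "2024-01-01"}', 'contacts', ...]
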
@@ -1224,9 +1274,9 @@ class InboundSyncRequest(SyncRequest):
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
                 existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+                if new_state is not None:
+                    existing_results.append(new_state)  # append the new state at the end
                 self._apply_results[stream_name] = existing_results
-            current_latest = self._latest_states or {}
-            self._latest_states = {**current_latest, **{stream_name: new_state}}
             # if the total size of all the dataframes exceeds 200MB, apply the results immediately
             # we'll use df.memory_usage(index=True) for this
             if self.development_mode is False:
@@ -1321,9 +1371,42 @@ class InboundSyncRequest(SyncRequest):
         instead you should store state using the new_state parameter in the enqueue_results
         method to ensure it's applied along with the associated new records.
         """
+        self.enqueue_state(
+            stream_name=stream_name,
+            new_state=new_state,
+            query_id=None  # query_id will be generated automatically if not provided
+        )
+
+    def enqueue_state(self, stream_name: str, new_state: Any, query_id: Optional[str] = None):
+        """
+        Enqueues some new stream state to be stored. This method should be called whenever the state of a stream changes.
+
+        If there have been records enqueued here for this stream, it is assumed that the state is related to those records.
+        In this case, the state will be applied after the records are applied.
+        If there are no records enqueued for this stream, the state will be applied immediately, as it is assumed that the results
+        were directly inserted and therefore we need to capture the current query ID before more results are inserted.
+        """
         with self._apply_results_lock:
-            current_latest = self._latest_states or {}
-            self._latest_states = {**current_latest, **{stream_name: new_state}}
+            if stream_name in self._apply_results:
+                if len(self._apply_results[stream_name]) > 0:
+                    self._apply_results[stream_name].append(new_state)
+                    return
+
+            self._directly_insert_to_state_register(
+                {
+                    stream_name: new_state
+                }, query_id=query_id
+            )
+
+
+    def _get_query_id_for_now(self):
+        """
+        Gets a Snowflake query ID right now. Note that this does not acquire the Snowflake query lock; the caller
+        should ensure that it is called in a thread-safe manner.
+        """
+        job = self._session.sql("select 1").collect_nowait()
+        job.result()
+        return job.query_id
 
     def get_queued_results(self, stream_name: str):
         """
@@ -1337,7 +1420,8 @@ class InboundSyncRequest(SyncRequest):
                 "get_queued_results was called, but no results have been queued"
             )
-        concat_results = pandas.concat(self._apply_results[stream_name])
-        return concat_results
+        # the queued list now contains state update dicts as well as dataframes, so only concatenate the dataframes
+        queued_dfs = [c for c in self._apply_results[stream_name] if c is not None and isinstance(c, pandas.DataFrame) and len(c) > 0]
+        return pandas.concat(queued_dfs)
 
     def _convert_by_json_schema(
         self, stream_name: str, data: Dict, json_schema: Dict
@@ -1512,10 +1595,11 @@ class InboundSyncRequest(SyncRequest):
         hash_object = hashlib.sha256(key_string.encode())
         return hash_object.hexdigest()
 
-    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
+        Returns a query ID that can be used for checkpointing after the copy into command has run.
         """
         if len(results_df) > 0:
             with self._snowflake_query_lock:
@@ -1538,6 +1622,7 @@ class InboundSyncRequest(SyncRequest):
                     raise ValueError(
                         f"Failed to write results to table {self._full_results_table_name}"
                     )
+                query_id = self._get_query_id_for_now()
                 logger.debug(
                     f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
                 )
@@ -1550,19 +1635,10 @@ class InboundSyncRequest(SyncRequest):
                 # )
                 for stream_name in stream_names:
                     self._results_exist[stream_name] = True
+                return query_id
         else:
             logger.debug("Results dataframe is empty, not applying")
 
-    def _apply_latest_states(self):
-        """
-        Updates the SYNC table to have the latest stream states.
-        TODO: This should be done in concert with the results, revisit
-        """
-        if self._last_states_update is None or json.dumps(self._latest_states) != json.dumps(self._last_states_update):
-            self._last_states_update = json.loads(json.dumps(self._latest_states))
-            self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))
-
-
     def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
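
Note: the QUERY_ID captured alongside each state row is what makes checkpointing possible, since Snowflake time travel can read the results table exactly as it was when that statement ran. A minimal sketch of how a consumer might use a registered query ID (the session variable and table name are assumptions; the AT (STATEMENT => ...) clause is standard Snowflake syntax):

    # assumes a Snowpark session and the table naming used elsewhere in this diff
    query_id = "01a2b3c4-0000-0000-0000-000000000000"  # a QUERY_ID read from the state register table
    rows_at_checkpoint = session.sql(
        "SELECT * FROM PLUGIN_APP.SYNC_RESULTS.SYNC_123_RECORDS "
        f"AT (STATEMENT => '{query_id}')"
    ).collect()
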
@@ -250,7 +250,7 @@ class PluginEntrypoint:
         self._plugin_instance._configuration_parameters = parameters
 
         inbound_sync_request.update_activity("Invoking plugin")
-        logger.info(f"inbound sync request: {inbound_sync_request}")
+        logger.info(f"Inbound sync request: {json.dumps(to_jsonable_python(inbound_sync_request))}")
         # plugin_instance._inbound_sync_request = outbound_sync_request
         with tracer.start_as_current_span("invoke_plugin"):
             with HttpRateLimiting(inbound_sync_request, parameters):