omnata-plugin-runtime 0.11.7a325__tar.gz → 0.11.9__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: omnata-plugin-runtime
- Version: 0.11.7a325
+ Version: 0.11.9
  Summary: Classes and common runtime components for building and running Omnata Plugins
  License-File: LICENSE
  Author: James Weakley
@@ -20,7 +20,7 @@ Requires-Dist: idna (<=3.7)
  Requires-Dist: jinja2 (>=3.1.2,<=3.1.4)
  Requires-Dist: markupsafe (<=2.1.3)
  Requires-Dist: numpy (<=2.1.3)
- Requires-Dist: opentelemetry-api (<=1.23.0)
+ Requires-Dist: opentelemetry-api (<=1.37.0)
  Requires-Dist: packaging (<=24.1)
  Requires-Dist: pandas (<=2.2.3)
  Requires-Dist: platformdirs (<=3.10.0)
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "omnata-plugin-runtime"
- version = "0.11.7-a325"
+ version = "0.11.9"
  description = "Classes and common runtime components for building and running Omnata Plugins"
  authors = ["James Weakley <james.weakley@omnata.com>"]
  readme = "README.md"
@@ -39,7 +39,7 @@ pyyaml = "<=6.0.1" # latest version available on Snowflake Anaconda
  cffi = "<=1.16.0" # latest version available on Snowflake Anaconda
  pyarrow = "<=16.1.0" # latest version available on Snowflake Anaconda
  wrapt = "<=1.14.1" # latest version available on Snowflake Anaconda
- opentelemetry-api = "<=1.23.0" # latest version available on Snowflake Anaconda
+ opentelemetry-api = "<=1.37.0" # latest version available on Snowflake Anaconda
  snowflake-telemetry-python = "<=0.5.0" # latest version available on Snowflake Anaconda
  protobuf = "<=4.25.3" # latest version available on Snowflake Anaconda

@@ -15,7 +15,7 @@ if tuple(sys.version_info[:2]) >= (3, 9):
  else:
  # Python 3.8 and below
  from typing_extensions import Annotated
-
+ from dataclasses import dataclass
  import zipfile
  import datetime
  import http
@@ -270,6 +270,29 @@ def jinja_filter(func):
  func.is_jinja_filter = True
  return func

+ @dataclass
+ class StateResult:
+ """
+ Represents the current cursor state of a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ new_state: Any
+
+ @dataclass
+ class RecordsToUploadResult:
+ """
+ Represents the records to upload for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ records: pandas.DataFrame
+
+ @dataclass
+ class CriteriaDeleteResult:
+ """
+ Represents the result of processing criteria deletes for a stream. This simple wrapper just helps us identify what type of
+ object is in the apply_results list.
+ """
+ criteria_deletes: pandas.DataFrame

  class SyncRequest(ABC):
  """
@@ -1062,7 +1085,6 @@ class InboundSyncRequest(SyncRequest):
  }

  # These are similar to the results, but represent requests to delete records by some criteria
- self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
  self._temp_tables = {}
  self._temp_table_lock = threading.Lock()
  self._results_exist: Dict[
@@ -1101,7 +1123,7 @@ class InboundSyncRequest(SyncRequest):
  self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
  self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
  # this is keyed on stream name, each containing a list of dataframes and state updates mixed
- self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
+ self._apply_results: Dict[str, List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = {}
  # track the start times of each stream, so we can calculate durations. The int is a epoch (time.time()) value
  self._stream_start_times: Dict[str, int] = {}

@@ -1112,7 +1134,8 @@ class InboundSyncRequest(SyncRequest):
  logger.debug("InboundSyncRequest apply_results_queue")
  if self._apply_results is not None:
  with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
+ records_to_upload:List[pandas.DataFrame] = []
+ criteria_deletes_to_upload:List[pandas.DataFrame] = []
  stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
  for stream_name, stream_results in self._apply_results.items():
  # the stream results contains an ordered sequence of dataframes and state updates (append only)
@@ -1120,9 +1143,9 @@ class InboundSyncRequest(SyncRequest):
  # so first, we iterate backwards to find the last state update
  last_state_index = -1
  for i in range(len(stream_results) - 1, -1, -1):
- if isinstance(stream_results[i], dict):
+ if isinstance(stream_results[i], StateResult):
  last_state_index = i
- stream_states_for_upload[stream_name] = stream_results[i]
+ stream_states_for_upload[stream_name] = stream_results[i].new_state
  break
  # if there are no state updates, we can't do anything with this stream
  if last_state_index == -1:
@@ -1131,56 +1154,54 @@ class InboundSyncRequest(SyncRequest):
  )
  continue
  assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
- # now we can take the dataframes up to the last state update
- dfs = stream_results[:last_state_index]
- non_empty_dfs = [
- x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
+ # now we can take the record dataframes up to the last state update
+ results_subset = stream_results[:last_state_index]
+ non_empty_record_dfs:List[pandas.DataFrame] = [
+ x.records for x in results_subset
+ if x is not None and isinstance(x, RecordsToUploadResult) and len(x.records) > 0
  ]
  # get the total length of all the dataframes
- total_length = sum([len(x) for x in non_empty_dfs])
+ total_length = sum([len(x) for x in non_empty_record_dfs])
  # add the count of this batch to the total for this stream
  self._stream_record_counts[
  stream_name
  ] = self._stream_record_counts[stream_name] + total_length
- results.extend(non_empty_dfs)
+ records_to_upload.extend(non_empty_record_dfs)
+ # also handle any criteria deletes
+ criteria_deletes_to_upload.extend([
+ x.criteria_deletes for x in results_subset
+ if x is not None and isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0
+ ])
  # now remove everything up to the last state update
  # we do this so that we don't apply the same state update multiple times
+ # keep everything after the last state update
  self._apply_results[stream_name] = stream_results[
  last_state_index + 1 :
- ] # keep everything after the last state update
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued results"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
- # now that the results have been updated, we need to insert records into the state register table
- # we do this by inserting the latest state for each stream
+ ]
+
+ if len(records_to_upload) > 0 or len(criteria_deletes_to_upload) > 0:
+ if len(records_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(records_to_upload)} batches of queued results"
+ )
+ # upload all cached apply results
+ records_to_upload_combined = pandas.concat(records_to_upload)
+ self._apply_results_dataframe(list(stream_states_for_upload.keys()), records_to_upload_combined)
+ # now that the results have been updated, we need to insert records into the state register table
+ # we do this by inserting the latest state for each stream
+ if len(criteria_deletes_to_upload) > 0:
+ logger.debug(
+ f"Applying {len(criteria_deletes_to_upload)} batches of queued criteria deletes"
+ )
+ # upload all cached apply results
+ all_criteria_deletes = pandas.concat(criteria_deletes_to_upload)
+ self._apply_criteria_deletes_dataframe(all_criteria_deletes)
+
+ query_id = self._get_query_id_for_now()
  self._directly_insert_to_state_register(
  stream_states_for_upload, query_id=query_id
  )

- # also take care of uploading delete requests
- # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
- # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
- if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
- with self._apply_results_lock:
- results:List[pandas.DataFrame] = []
- for stream_name, stream_results in self._apply_results_criteria_deletes.items():
- results.extend([
- x for x in stream_results if x is not None and len(x) > 0
- ])
- if len(results) > 0:
- logger.debug(
- f"Applying {len(results)} batches of queued criteria deletes"
- )
- # upload all cached apply results
- all_dfs = pandas.concat(results)
- self._apply_criteria_deletes_dataframe(all_dfs)
- # clear the delete requests
- self._apply_results_criteria_deletes = {}
-

  # update the inbound stream record counts, so we can see progress
  # we do this last, because marking a stream as completed will cause the sync engine to process it
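
The reworked apply_results_queue above flushes each stream only up to its most recent state update: it scans the queue backwards for the last StateResult, uploads the preceding records and criteria deletes together with that state, and keeps anything queued after it for the next flush. A condensed standalone sketch of that split using the same wrapper types (split_at_last_state and Entry are names chosen here for illustration, not part of the package):

    from dataclasses import dataclass
    from typing import Any, List, Tuple, Union

    import pandas


    @dataclass
    class StateResult:
        new_state: Any


    @dataclass
    class RecordsToUploadResult:
        records: pandas.DataFrame


    @dataclass
    class CriteriaDeleteResult:
        criteria_deletes: pandas.DataFrame


    Entry = Union[RecordsToUploadResult, StateResult, CriteriaDeleteResult]


    def split_at_last_state(
        stream_results: List[Entry],
    ) -> Tuple[List[pandas.DataFrame], List[pandas.DataFrame], Any, List[Entry]]:
        # Scan backwards for the most recent state update; without one, nothing is safe to flush.
        last_state_index = -1
        latest_state = None
        for i in range(len(stream_results) - 1, -1, -1):
            if isinstance(stream_results[i], StateResult):
                last_state_index = i
                latest_state = stream_results[i].new_state
                break
        if last_state_index == -1:
            return [], [], None, stream_results

        flushable = stream_results[:last_state_index]
        records = [x.records for x in flushable
                   if isinstance(x, RecordsToUploadResult) and len(x.records) > 0]
        deletes = [x.criteria_deletes for x in flushable
                   if isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0]
        # Everything after the last state update is retained, so the same state is never re-applied.
        return records, deletes, latest_state, stream_results[last_state_index + 1:]

In the method itself, the flushed dataframes are concatenated and written via _apply_results_dataframe and _apply_criteria_deletes_dataframe, after which the latest states are inserted into the state register with a query id obtained from _get_query_id_for_now.
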
@@ -1288,29 +1309,40 @@ class InboundSyncRequest(SyncRequest):
  if stream_name is None or len(stream_name) == 0:
  raise ValueError("Stream name cannot be empty")
  with self._apply_results_lock:
- existing_results: List[pandas.DataFrame] = []
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
  if stream_name in self._apply_results:
  existing_results = self._apply_results[stream_name]
- existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+ existing_results.append(RecordsToUploadResult(
+ records=self._preprocess_results_list(stream_name, results, is_delete)
+ ))
  if new_state is not None:
- existing_results.append(new_state) # append the new state at the end
+ existing_results.append(
+ StateResult(new_state=new_state)
+ ) # append the new state at the end
  self._apply_results[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
  if self.development_mode is False:
  # note: we want to do it for all values in self._apply_results, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist if isinstance(x, pandas.DataFrame)]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't bother if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
- logger.debug(f"Applying results queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()
+
+ def _apply_results_if_size_exceeded(self,):
+ # so first we need to get the list of lists from the dictionary values and flatten it
+ # then we can sum the memory usage of each dataframe
+ # if the total exceeds 200MB, we apply the results immediately
+ all_df_lists:List[List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = list(self._apply_results.values())
+ # flatten
+ all_dfs:List[pandas.DataFrame] = []
+ for sublist in all_df_lists:
+ for x in sublist:
+ if isinstance(x, RecordsToUploadResult):
+ all_dfs.append(x.records)
+ if isinstance(x, CriteriaDeleteResult):
+ all_dfs.append(x.criteria_deletes)
+ combined_length = sum([len(x) for x in all_dfs])
+ # first, don't bother if the count is less than 10000, since it's unlikely to be even close
+ if combined_length > 10000:
+ if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+ logger.debug(f"Applying results queue immediately due to combined dataframe size")
+ self.apply_results_queue()

  def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
  """
@@ -1336,27 +1368,15 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(
  f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
  )
- existing_results: List[pandas.DataFrame] = []
- if stream_name in self._apply_results_criteria_deletes:
- existing_results = self._apply_results_criteria_deletes[stream_name]
- existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
- self._apply_results_criteria_deletes[stream_name] = existing_results
- # if the total size of all the dataframes exceeds 200MB, apply the results immediately
- # we'll use df.memory_usage(index=True) for this
+ existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
+ if stream_name in self._apply_results:
+ existing_results = self._apply_results[stream_name]
+ existing_results.append(
+ CriteriaDeleteResult(
+ criteria_deletes=pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}])))
+ self._apply_results[stream_name] = existing_results
  if self.development_mode is False:
- # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
- # so first we need to get the list of lists from the dictionary values and flatten it
- # then we can sum the memory usage of each dataframe
- # if the total exceeds 200MB, we apply the results immediately
- all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
- # flatten
- all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
- combined_length = sum([len(x) for x in all_dfs])
- # first, don't both if the count is less than 10000, since it's unlikely to be even close
- if combined_length > 10000:
- if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
- logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
- self.apply_results_queue()
+ self._apply_results_if_size_exceeded()

  def mark_stream_started(self, stream_name: str):
  """
@@ -1491,7 +1511,7 @@ class InboundSyncRequest(SyncRequest):
  logger.debug(f"Failure to convert inbound data: {str(exception)}")
  return data

- def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
+ def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]) -> pandas.DataFrame:
  """
  Creates a dataframe from the enqueued list, ready to upload.
  The result is a dataframe contain all (and only):
@@ -1636,7 +1656,7 @@ class InboundSyncRequest(SyncRequest):
  hash_object = hashlib.sha256(key_string.encode())
  return hash_object.hexdigest()

- def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
+ def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
  """
  Applies results for an inbound sync. The results are staged into a temporary
  table in Snowflake, so that we can make an atomic commit at the end.
@@ -1663,7 +1683,6 @@ class InboundSyncRequest(SyncRequest):
  raise ValueError(
  f"Failed to write results to table {self._full_results_table_name}"
  )
- query_id = self._get_query_id_for_now()
  logger.debug(
  f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
  )
@@ -1676,7 +1695,6 @@ class InboundSyncRequest(SyncRequest):
  # )
  for stream_name in stream_names:
  self._results_exist[stream_name] = True
- return query_id
  else:
  logger.debug("Results dataframe is empty, not applying")