omnata-plugin-runtime 0.11.8a326__tar.gz → 0.11.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/PKG-INFO +1 -1
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/pyproject.toml +1 -1
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/omnata_plugin.py +101 -83
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/LICENSE +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/README.md +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/__init__.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/api.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/configuration.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/forms.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/json_schema.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/logging.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/plugin_entrypoints.py +0 -0
- {omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/rate_limiting.py +0 -0
{omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "omnata-plugin-runtime"
-version = "0.11.8a326"
+version = "0.11.9"
 description = "Classes and common runtime components for building and running Omnata Plugins"
 authors = ["James Weakley <james.weakley@omnata.com>"]
 readme = "README.md"
```
{omnata_plugin_runtime-0.11.8a326 → omnata_plugin_runtime-0.11.9}/src/omnata_plugin_runtime/omnata_plugin.py

```diff
@@ -15,7 +15,7 @@ if tuple(sys.version_info[:2]) >= (3, 9):
 else:
     # Python 3.8 and below
     from typing_extensions import Annotated
-
+from dataclasses import dataclass
 import zipfile
 import datetime
 import http
```
```diff
@@ -270,6 +270,29 @@ def jinja_filter(func):
     func.is_jinja_filter = True
     return func
 
+@dataclass
+class StateResult:
+    """
+    Represents the current cursor state of a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    new_state: Any
+
+@dataclass
+class RecordsToUploadResult:
+    """
+    Represents the records to upload for a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    records: pandas.DataFrame
+
+@dataclass
+class CriteriaDeleteResult:
+    """
+    Represents the result of processing criteria deletes for a stream. This simple wrapper just helps us identify what type of
+    object is in the apply_results list.
+    """
+    criteria_deletes: pandas.DataFrame
 
 class SyncRequest(ABC):
     """
```
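The three wrapper dataclasses exist so that mixed entries in a per-stream `_apply_results` list can be told apart with `isinstance` checks. As a rough illustration only (a standalone sketch, not code from the package), such a mixed list could be partitioned like this:

```python
# Standalone sketch: partitioning a mixed results list by wrapper type.
from dataclasses import dataclass
from typing import Any, List, Union

import pandas


@dataclass
class StateResult:
    new_state: Any


@dataclass
class RecordsToUploadResult:
    records: pandas.DataFrame


@dataclass
class CriteriaDeleteResult:
    criteria_deletes: pandas.DataFrame


ApplyResult = Union[RecordsToUploadResult, StateResult, CriteriaDeleteResult]

stream_results: List[ApplyResult] = [
    RecordsToUploadResult(records=pandas.DataFrame([{"id": 1}])),
    CriteriaDeleteResult(criteria_deletes=pandas.DataFrame([{"DELETE_CRITERIA": {"id": 1}}])),
    StateResult(new_state={"cursor": "2024-01-01"}),
]

# record batches and delete batches are separated by type; the latest state is
# the last StateResult in the list
record_batches = [x.records for x in stream_results if isinstance(x, RecordsToUploadResult)]
delete_batches = [x.criteria_deletes for x in stream_results if isinstance(x, CriteriaDeleteResult)]
latest_state = next(
    (x.new_state for x in reversed(stream_results) if isinstance(x, StateResult)), None
)
print(len(record_batches), len(delete_batches), latest_state)
```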
```diff
@@ -1062,7 +1085,6 @@ class InboundSyncRequest(SyncRequest):
         }
 
         # These are similar to the results, but represent requests to delete records by some criteria
-        self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
         self._results_exist: Dict[
```
```diff
@@ -1101,7 +1123,7 @@ class InboundSyncRequest(SyncRequest):
         self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
         self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
         # this is keyed on stream name, each containing a list of dataframes and state updates mixed
-        self._apply_results: Dict[str, List[
+        self._apply_results: Dict[str, List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = {}
         # track the start times of each stream, so we can calculate durations. The int is a epoch (time.time()) value
         self._stream_start_times: Dict[str, int] = {}
```
```diff
@@ -1112,7 +1134,8 @@ class InboundSyncRequest(SyncRequest):
         logger.debug("InboundSyncRequest apply_results_queue")
         if self._apply_results is not None:
             with self._apply_results_lock:
-
+                records_to_upload:List[pandas.DataFrame] = []
+                criteria_deletes_to_upload:List[pandas.DataFrame] = []
                 stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
                 for stream_name, stream_results in self._apply_results.items():
                     # the stream results contains an ordered sequence of dataframes and state updates (append only)
```
```diff
@@ -1120,9 +1143,9 @@ class InboundSyncRequest(SyncRequest):
                     # so first, we iterate backwards to find the last state update
                     last_state_index = -1
                     for i in range(len(stream_results) - 1, -1, -1):
-                        if isinstance(stream_results[i],
+                        if isinstance(stream_results[i], StateResult):
                             last_state_index = i
-                            stream_states_for_upload[stream_name] = stream_results[i]
+                            stream_states_for_upload[stream_name] = stream_results[i].new_state
                             break
                     # if there are no state updates, we can't do anything with this stream
                     if last_state_index == -1:
```
```diff
@@ -1131,56 +1154,54 @@ class InboundSyncRequest(SyncRequest):
                         )
                         continue
                     assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
-                    # now we can take the dataframes up to the last state update
-
-
-                        x for x in
+                    # now we can take the record dataframes up to the last state update
+                    results_subset = stream_results[:last_state_index]
+                    non_empty_record_dfs:List[pandas.DataFrame] = [
+                        x.records for x in results_subset
+                        if x is not None and isinstance(x, RecordsToUploadResult) and len(x.records) > 0
                     ]
                     # get the total length of all the dataframes
-                    total_length = sum([len(x) for x in
+                    total_length = sum([len(x) for x in non_empty_record_dfs])
                     # add the count of this batch to the total for this stream
                     self._stream_record_counts[
                         stream_name
                     ] = self._stream_record_counts[stream_name] + total_length
-
+                    records_to_upload.extend(non_empty_record_dfs)
+                    # also handle any criteria deletes
+                    criteria_deletes_to_upload.extend([
+                        x.criteria_deletes for x in results_subset
+                        if x is not None and isinstance(x, CriteriaDeleteResult) and len(x.criteria_deletes) > 0
+                    ])
                     # now remove everything up to the last state update
                     # we do this so that we don't apply the same state update multiple times
+                    # keep everything after the last state update
                     self._apply_results[stream_name] = stream_results[
                         last_state_index + 1 :
-                    ]
-
-
-
-
-
-
-
-
-
+                    ]
+
+                if len(records_to_upload) > 0 or len(criteria_deletes_to_upload) > 0:
+                    if len(records_to_upload) > 0:
+                        logger.debug(
+                            f"Applying {len(records_to_upload)} batches of queued results"
+                        )
+                        # upload all cached apply results
+                        records_to_upload_combined = pandas.concat(records_to_upload)
+                        self._apply_results_dataframe(list(stream_states_for_upload.keys()), records_to_upload_combined)
+                    # now that the results have been updated, we need to insert records into the state register table
+                    # we do this by inserting the latest state for each stream
+                    if len(criteria_deletes_to_upload) > 0:
+                        logger.debug(
+                            f"Applying {len(criteria_deletes_to_upload)} batches of queued criteria deletes"
+                        )
+                        # upload all cached apply results
+                        all_criteria_deletes = pandas.concat(criteria_deletes_to_upload)
+                        self._apply_criteria_deletes_dataframe(all_criteria_deletes)
+
+                    query_id = self._get_query_id_for_now()
                     self._directly_insert_to_state_register(
                         stream_states_for_upload, query_id=query_id
                     )
 
-        # also take care of uploading delete requests
-        # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
-        # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
-        if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
-            with self._apply_results_lock:
-                results:List[pandas.DataFrame] = []
-                for stream_name, stream_results in self._apply_results_criteria_deletes.items():
-                    results.extend([
-                        x for x in stream_results if x is not None and len(x) > 0
-                    ])
-                if len(results) > 0:
-                    logger.debug(
-                        f"Applying {len(results)} batches of queued criteria deletes"
-                    )
-                    # upload all cached apply results
-                    all_dfs = pandas.concat(results)
-                    self._apply_criteria_deletes_dataframe(all_dfs)
-                    # clear the delete requests
-                    self._apply_results_criteria_deletes = {}
-
 
         # update the inbound stream record counts, so we can see progress
         # we do this last, because marking a stream as completed will cause the sync engine to process it
```
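The reworked `apply_results_queue` walks each stream's list backwards to find the last `StateResult`, flushes the record and criteria-delete batches that precede it, and keeps only the entries after that checkpoint, so the same state is never applied twice. A minimal sketch of that slicing step in isolation (the helper name `split_at_last_state` is illustrative, not part of the package):

```python
# Sketch of the checkpoint-slicing idea as a free function.
from dataclasses import dataclass
from typing import Any, List


@dataclass
class StateResult:
    # same shape as the wrapper dataclass added in the diff
    new_state: Any


def split_at_last_state(stream_results: List[Any]):
    """Return (entries_before_checkpoint, latest_state, remainder_after_checkpoint)."""
    last_state_index = -1
    latest_state = None
    # scan backwards for the most recent state checkpoint
    for i in range(len(stream_results) - 1, -1, -1):
        if isinstance(stream_results[i], StateResult):
            last_state_index = i
            latest_state = stream_results[i].new_state
            break
    if last_state_index == -1:
        # without a state checkpoint nothing can be flushed safely
        return [], None, stream_results
    return (
        stream_results[:last_state_index],       # safe to upload now
        latest_state,                            # state to register
        stream_results[last_state_index + 1:],   # stays queued for next time
    )


# usage: everything before the checkpoint gets uploaded, the tail stays queued
before, state, remainder = split_at_last_state([StateResult(new_state={"cursor": 5})])
print(len(before), state, len(remainder))
```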
```diff
@@ -1288,29 +1309,40 @@ class InboundSyncRequest(SyncRequest):
         if stream_name is None or len(stream_name) == 0:
             raise ValueError("Stream name cannot be empty")
         with self._apply_results_lock:
-            existing_results: List[
+            existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
-            existing_results.append(
+            existing_results.append(RecordsToUploadResult(
+                records=self._preprocess_results_list(stream_name, results, is_delete)
+            ))
             if new_state is not None:
-                existing_results.append(
+                existing_results.append(
+                    StateResult(new_state=new_state)
+                ) # append the new state at the end
             self._apply_results[stream_name] = existing_results
-            # if the total size of all the dataframes exceeds 200MB, apply the results immediately
-            # we'll use df.memory_usage(index=True) for this
             if self.development_mode is False:
                 # note: we want to do it for all values in self._apply_results, not just the new one
-
-
-
-
-
-
-
-
-
-
-
-
+                self._apply_results_if_size_exceeded()
+
+    def _apply_results_if_size_exceeded(self,):
+        # so first we need to get the list of lists from the dictionary values and flatten it
+        # then we can sum the memory usage of each dataframe
+        # if the total exceeds 200MB, we apply the results immediately
+        all_df_lists:List[List[RecordsToUploadResult | StateResult | CriteriaDeleteResult]] = list(self._apply_results.values())
+        # flatten
+        all_dfs:List[pandas.DataFrame] = []
+        for sublist in all_df_lists:
+            for x in sublist:
+                if isinstance(x, RecordsToUploadResult):
+                    all_dfs.append(x.records)
+                if isinstance(x, CriteriaDeleteResult):
+                    all_dfs.append(x.criteria_deletes)
+        combined_length = sum([len(x) for x in all_dfs])
+        # first, don't bother if the count is less than 10000, since it's unlikely to be even close
+        if combined_length > 10000:
+            if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+                logger.debug(f"Applying results queue immediately due to combined dataframe size")
+                self.apply_results_queue()
 
     def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
         """
```
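The new `_apply_results_if_size_exceeded` helper centralises the flush heuristic for both record and criteria-delete dataframes: skip the measurement below 10,000 queued rows, otherwise flush once the combined `memory_usage` passes roughly 200 MB. The same check, sketched as a free function under those assumptions (not the class method itself):

```python
# Sketch of the size check only; thresholds mirror the diff (10,000 rows, ~200 MB).
import pandas


def should_flush(dataframes, row_threshold=10_000, byte_threshold=200_000_000):
    """Return True when the queued dataframes are large enough to flush early."""
    combined_rows = sum(len(df) for df in dataframes)
    if combined_rows <= row_threshold:
        return False  # too few rows to be anywhere near the byte limit
    total_bytes = sum(df.memory_usage(index=True).sum() for df in dataframes)
    return total_bytes > byte_threshold


print(should_flush([pandas.DataFrame({"id": range(100)})]))  # False: well under both thresholds
```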
```diff
@@ -1336,27 +1368,15 @@ class InboundSyncRequest(SyncRequest):
         logger.debug(
             f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
         )
-        existing_results: List[
-        if stream_name in self.
-            existing_results = self.
-        existing_results.append(
-
-
-
+        existing_results: List[RecordsToUploadResult | StateResult | CriteriaDeleteResult] = []
+        if stream_name in self._apply_results:
+            existing_results = self._apply_results[stream_name]
+        existing_results.append(
+            CriteriaDeleteResult(
+                criteria_deletes=pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}])))
+        self._apply_results[stream_name] = existing_results
         if self.development_mode is False:
-
-            # so first we need to get the list of lists from the dictionary values and flatten it
-            # then we can sum the memory usage of each dataframe
-            # if the total exceeds 200MB, we apply the results immediately
-            all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
-            # flatten
-            all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
-            combined_length = sum([len(x) for x in all_dfs])
-            # first, don't both if the count is less than 10000, since it's unlikely to be even close
-            if combined_length > 10000:
-                if sum([x.memory_usage(index=True).sum() for x in all_dfs if isinstance(x, pandas.DataFrame)]) > 200000000:
-                    logger.debug(f"Applying criteria deletes queue immediately due to combined dataframe size")
-                    self.apply_results_queue()
+            self._apply_results_if_size_exceeded()
 
     def mark_stream_started(self, stream_name: str):
         """
```
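With this change, `delete_by_criteria` enqueues a `CriteriaDeleteResult` into the same per-stream list rather than a separate dictionary; each call is captured as a one-row dataframe with `STREAM_NAME` and `DELETE_CRITERIA` columns. For illustration only (the stream name and criteria values below are made up):

```python
# Illustration of the one-row dataframe a delete-by-criteria request becomes.
import pandas

criteria = {"account_id": "0011x00001ABcDE"}  # hypothetical example criteria
row = pandas.DataFrame([{"STREAM_NAME": "accounts", "DELETE_CRITERIA": criteria}])
print(row.to_string(index=False))
```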
```diff
@@ -1491,7 +1511,7 @@ class InboundSyncRequest(SyncRequest):
             logger.debug(f"Failure to convert inbound data: {str(exception)}")
             return data
 
-    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
+    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]) -> pandas.DataFrame:
         """
         Creates a dataframe from the enqueued list, ready to upload.
         The result is a dataframe contain all (and only):
```
```diff
@@ -1636,7 +1656,7 @@ class InboundSyncRequest(SyncRequest):
         hash_object = hashlib.sha256(key_string.encode())
         return hash_object.hexdigest()
 
-    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame)
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
```
```diff
@@ -1663,7 +1683,6 @@ class InboundSyncRequest(SyncRequest):
                 raise ValueError(
                     f"Failed to write results to table {self._full_results_table_name}"
                 )
-            query_id = self._get_query_id_for_now()
             logger.debug(
                 f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
             )
```
```diff
@@ -1676,7 +1695,6 @@ class InboundSyncRequest(SyncRequest):
             # )
             for stream_name in stream_names:
                 self._results_exist[stream_name] = True
-            return query_id
         else:
             logger.debug("Results dataframe is empty, not applying")
```
All other files listed above are renamed only (the version directory change) and have no content changes.