omnata-plugin-runtime 0.10.33a297__tar.gz → 0.11.0a298__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/PKG-INFO +1 -1
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/pyproject.toml +1 -1
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/json_schema.py +7 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/omnata_plugin.py +110 -34
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/plugin_entrypoints.py +1 -1
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/LICENSE +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/README.md +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/__init__.py +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/api.py +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/configuration.py +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/forms.py +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/logging.py +0 -0
- {omnata_plugin_runtime-0.10.33a297 → omnata_plugin_runtime-0.11.0a298}/src/omnata_plugin_runtime/rate_limiting.py +0 -0
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "omnata-plugin-runtime"
-version = "0.10.33-a297"
+version = "0.11.0-a298"
 description = "Classes and common runtime components for building and running Omnata Plugins"
 authors = ["James Weakley <james.weakley@omnata.com>"]
 readme = "README.md"
src/omnata_plugin_runtime/json_schema.py
@@ -405,6 +405,13 @@ class FullyQualifiedTable(BaseModel):
         return self.get_fully_qualified_name(
             table_override=f"{self.table_name}_CRITERIA_DELETES"
         )
+
+    def get_fully_qualified_state_register_table_name(self) -> str:
+        """
+        Returns the fully qualified name of the state register table.
+        This is used to store state values for syncs, paired with query IDs to use with time travel.
+        """
+        return self.get_fully_qualified_name(table_override=f"{self.table_name}_STATE_REGISTER")
 
 class SnowflakeViewPart(BaseModel):
     """
src/omnata_plugin_runtime/omnata_plugin.py
@@ -93,6 +93,9 @@ from .rate_limiting import (
     RateLimitState,
     RateLimitedSession
 )
+from .json_schema import (
+    FullyQualifiedTable
+)
 
 SortDirectionType = Literal["asc", "desc"]
 
@@ -1055,7 +1058,6 @@ class InboundSyncRequest(SyncRequest):
 
         # These are similar to the results, but represent requests to delete records by some criteria
         self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
-        self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
         self._results_exist: Dict[
@@ -1085,24 +1087,47 @@ class InboundSyncRequest(SyncRequest):
             sync_id=sync_id,
             branch_name=branch_name
         )
-        #
-
-
+        # The results table name is also used to derive several other table/stage names
+        results_table = FullyQualifiedTable(
+            database_name= self._source_app_name,
+            schema_name= self._results_schema_name,
+            table_name= self._results_table_name
         )
-        self.
+        self._criteria_deletes_table_name = results_table.get_fully_qualified_criteria_deletes_table_name()
+        self.state_register_table_name = results_table.get_fully_qualified_state_register_table_name()
+        # this is keyed on stream name, each containing a list of dataframes and state updates mixed
+        self._apply_results: Dict[str, List[pandas.DataFrame | Dict]] = {}
 
     def apply_results_queue(self):
         """
-        Merges all of the queued results and applies them
+        Merges all of the queued results and applies them, including state updates.
         """
-        logger.debug("InboundSyncRequest apply_results_queue
+        logger.debug("InboundSyncRequest apply_results_queue")
         if self._apply_results is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
-
+                stream_states_for_upload:Dict[str, Dict[str, Any]] = {}
                 for stream_name, stream_results in self._apply_results.items():
+                    # the stream results contains an ordered sequence of dataframes and state updates (append only)
+                    # we only want to apply the dataframes up until the most recent state update
+                    # so first, we iterate backwards to find the last state update
+                    last_state_index = -1
+                    for i in range(len(stream_results) - 1, -1, -1):
+                        if isinstance(stream_results[i], dict):
+                            last_state_index = i
+                            stream_states_for_upload[stream_name] = stream_results[i]
+                            break
+                    # if there are no state updates, we can't do anything with this stream
+                    if last_state_index == -1:
+                        logger.debug(
+                            f"No state updates for stream {stream_name}, skipping"
+                        )
+                        continue
+                    assert isinstance(stream_states_for_upload[stream_name], dict), "Latest state must be a dictionary"
+                    # now we can take the dataframes up to the last state update
+                    dfs = stream_results[:last_state_index]
                     non_empty_dfs = [
-                        x for x in
+                        x for x in dfs if x is not None and isinstance(x, pandas.DataFrame) and len(x) > 0
                     ]
                     # get the total length of all the dataframes
                     total_length = sum([len(x) for x in non_empty_dfs])
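The comments added above describe the new queue layout: each stream's queue is an append-only mix of DataFrames and dict state checkpoints, and only the frames preceding the most recent checkpoint are flushed. A self-contained sketch of that backward scan, using invented sample data:

```python
# Standalone sketch of the "find the last state update, apply everything before it" scan.
# The queue layout (DataFrames interleaved with dict state checkpoints) follows the diff
# above; the sample data is invented for illustration.
from typing import Any, Dict, List, Union
import pandas

queue: List[Union[pandas.DataFrame, Dict[str, Any]]] = [
    pandas.DataFrame({"id": [1, 2]}),
    pandas.DataFrame({"id": [3]}),
    {"cursor": "2024-01-01T00:00:00Z"},   # state checkpoint covering the frames above
    pandas.DataFrame({"id": [4]}),        # arrived after the checkpoint, kept for the next flush
]

last_state_index = -1
latest_state: Dict[str, Any] = {}
for i in range(len(queue) - 1, -1, -1):   # walk backwards to the most recent state dict
    if isinstance(queue[i], dict):
        last_state_index = i
        latest_state = queue[i]
        break

if last_state_index >= 0:
    ready = [x for x in queue[:last_state_index] if isinstance(x, pandas.DataFrame) and len(x) > 0]
    remainder = queue[last_state_index + 1:]  # re-queued so the checkpoint isn't applied twice
    print(len(ready), latest_state, len(remainder))
    # 2 {'cursor': '2024-01-01T00:00:00Z'} 1
```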
@@ -1110,22 +1135,28 @@ class InboundSyncRequest(SyncRequest):
                     self._stream_record_counts[
                         stream_name
                     ] = self._stream_record_counts[stream_name] + total_length
-                    results.extend(non_empty_dfs)
-
+                    results.extend(non_empty_dfs)
+                    # now remove everything up to the last state update
+                    # we do this so that we don't apply the same state update multiple times
+                    self._apply_results[stream_name] = stream_results[
+                        last_state_index + 1 :
+                    ] # keep everything after the last state update
                 if len(results) > 0:
                     logger.debug(
                         f"Applying {len(results)} batches of queued results"
                     )
                     # upload all cached apply results
                     all_dfs = pandas.concat(results)
-                    self._apply_results_dataframe(
-
-
-
-
-
+                    query_id = self._apply_results_dataframe(list(stream_states_for_upload.keys()), all_dfs)
+                    # now that the results have been updated, we need to insert records into the state register table
+                    # we do this by inserting the latest state for each stream
+                    self._directly_insert_to_state_register(
+                        stream_states_for_upload, query_id=query_id
+                    )
+
         # also take care of uploading delete requests
+        # technically these should be managed along with the state, however there aren't any scenarios where checkpointing is done
+        # and deletes have an impact. This is because we only checkpoint in scenarios where the target table is empty first
         if hasattr(self,'_apply_results_criteria_deletes') and self._apply_results_criteria_deletes is not None:
             with self._apply_results_lock:
                 results:List[pandas.DataFrame] = []
@@ -1149,6 +1180,25 @@ class InboundSyncRequest(SyncRequest):
         # so we need to make sure all the results are applied first
         self.apply_progress_updates()
 
+    def _directly_insert_to_state_register(
+        self, stream_states_for_upload: Dict[str, Dict[str, Any]],
+        query_id: Optional[str] = None
+    ) -> str:
+        binding_values = []
+        values_clauses = []
+
+        with self._snowflake_query_lock:
+            if query_id is None:
+                query_id = self._get_query_id_for_now()
+            for stream_name, latest_state in stream_states_for_upload.items():
+                binding_values.append(stream_name, query_id, json.dumps(latest_state))
+                values_clauses.append(
+                    f"(?, ?, PARSE_JSON(?))"
+                )
+            final_query = f"""INSERT INTO {self.state_register_table_name} (STREAM_NAME, QUERY_ID, LATEST_STATE)
+            VALUES {','.join(values_clauses)}"""
+            self._session.sql(final_query, binding_values).collect()
+
     def apply_progress_updates(self, ignore_errors:bool = True):
         """
         Sends a message to the plugin with the current progress of the sync run, if it has changed since last time.
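For context, a sketch of the statement that `_directly_insert_to_state_register` assembles. The column list and the `(?, ?, PARSE_JSON(?))` placeholder triples come from the hunk above; the table name, stream names, and query id below are invented, and the flat binding list is one way to lay out the three bound values per clause.

```python
# Illustrative only: builds the same multi-row INSERT shape as the hunk above.
# Table/stream names and the query id are hypothetical.
import json

state_register_table_name = '"MY_APP_DB"."SYNC_RESULTS"."SYNC_123_STATE_REGISTER"'
query_id = "01a2b3c4-0000-0000-0000-000000000000"
stream_states = {
    "accounts": {"cursor": "2024-01-01"},
    "contacts": {"cursor": "2024-01-02"},
}

values_clauses = []
bindings = []
for stream_name, latest_state in stream_states.items():
    values_clauses.append("(?, ?, PARSE_JSON(?))")
    bindings.extend([stream_name, query_id, json.dumps(latest_state)])  # one triple per clause

final_query = (
    f"INSERT INTO {state_register_table_name} (STREAM_NAME, QUERY_ID, LATEST_STATE) "
    f"VALUES {','.join(values_clauses)}"
)
print(final_query)   # one placeholder triple per stream
print(bindings)      # the values bound to those placeholders, in order
```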
@@ -1224,9 +1274,9 @@ class InboundSyncRequest(SyncRequest):
         if stream_name in self._apply_results:
             existing_results = self._apply_results[stream_name]
             existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
+            if new_state is not None:
+                existing_results.append(new_state) # append the new state at the end
             self._apply_results[stream_name] = existing_results
-        current_latest = self._latest_states or {}
-        self._latest_states = {**current_latest, **{stream_name: new_state}}
         # if the total size of all the dataframes exceeds 200MB, apply the results immediately
         # we'll use df.memory_usage(index=True) for this
         if self.development_mode is False:
@@ -1321,9 +1371,42 @@ class InboundSyncRequest(SyncRequest):
         instead you should store state using the new_state parameter in the enqueue_results
         method to ensure it's applied along with the associated new records.
         """
+        self.enqueue_state(
+            stream_name=stream_name,
+            new_state=new_state,
+            query_id=None # query_id will be generated automatically if not provided
+        )
+
+    def enqueue_state(self, stream_name: str, new_state: Any, query_id: Optional[str] = None):
+        """
+        Enqueues some new stream state to be stored. This method should be called whenever the state of a stream changes.
+
+        If there have been records enqueued here for this stream, it is assumed that the state is related to those records.
+        In this case, the state will be applied after the records are applied.
+        If there are no records enqueued for this stream, the state will be applied immediately as it is assumed that the results
+        were directly inserted, and therefore we need to capture the current query ID before more results are inserted.
+        """
         with self._apply_results_lock:
-
-
+            if stream_name in self._apply_results:
+                if len(self._apply_results[stream_name]) > 0:
+                    self._apply_results[stream_name].append(new_state)
+                    return
+
+            self._directly_insert_to_state_register(
+                {
+                    stream_name: new_state
+                }, query_id=query_id
+            )
+
+
+    def _get_query_id_for_now(self):
+        """
+        Gets a Snowflake query ID right now. Note that this does not require a Snowflake lock, the caller
+        should ensure that this is called in a thread-safe manner.
+        """
+        job=self._session.sql("select 1").collect_nowait()
+        job.result()
+        return job.query_id
 
     def get_queued_results(self, stream_name: str):
         """
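The routing described in the `enqueue_state` docstring above (append behind queued records, otherwise write straight to the state register against a fresh query id) can be illustrated in isolation. Everything below is invented stand-in data, not the runtime's internals:

```python
# Minimal sketch of the routing rule enqueue_state follows, per the docstring above.
# Stream names, states and the query id are illustrative.
from typing import Any, Dict, List

apply_results: Dict[str, List[Any]] = {"accounts": [object()], "contacts": []}
registered: List[tuple] = []

def enqueue_state_sketch(stream_name: str, new_state: Dict[str, Any], query_id: str = "<query-id>"):
    queued = apply_results.get(stream_name)
    if queued:                      # records pending: append so the state is applied after them
        queued.append(new_state)
        return
    registered.append((stream_name, query_id, new_state))  # nothing pending: write straight away

enqueue_state_sketch("accounts", {"cursor": "2024-01-01"})   # rides behind the queued records
enqueue_state_sketch("contacts", {"cursor": "2024-01-02"})   # goes straight to the register
print(len(apply_results["accounts"]), registered)
# 2 [('contacts', '<query-id>', {'cursor': '2024-01-02'})]
```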
@@ -1337,7 +1420,7 @@ class InboundSyncRequest(SyncRequest):
                 "get_queued_results was called, but no results have been queued"
             )
         concat_results = pandas.concat(self._apply_results[stream_name])
-        return concat_results
+        return [c for c in concat_results if c is not None and isinstance(c, pandas.DataFrame) and len(c) > 0]
 
     def _convert_by_json_schema(
         self, stream_name: str, data: Dict, json_schema: Dict
@@ -1512,10 +1595,11 @@ class InboundSyncRequest(SyncRequest):
         hash_object = hashlib.sha256(key_string.encode())
         return hash_object.hexdigest()
 
-    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame) -> Optional[str]:
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
+        Returns a query ID that can be used for checkpointing after the copy into command has run.
         """
         if len(results_df) > 0:
             with self._snowflake_query_lock:
@@ -1538,6 +1622,7 @@ class InboundSyncRequest(SyncRequest):
                     raise ValueError(
                         f"Failed to write results to table {self._full_results_table_name}"
                     )
+                query_id = self._get_query_id_for_now()
                 logger.debug(
                     f"Wrote {nrows} rows and {nchunks} chunks to table {self._full_results_table_name}"
                 )
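The query id captured immediately after the write exists so the results can later be read back as of that statement, per the "paired with query IDs to use with time travel" note in the json_schema.py hunk. A hedged sketch of such a checkpoint query; the table name and query id are invented, while `AT(STATEMENT => ...)` is standard Snowflake time-travel syntax:

```python
# Illustration of how a captured query id pairs with Snowflake time travel.
# Table name and query id are hypothetical.
full_results_table_name = '"MY_APP_DB"."SYNC_RESULTS"."SYNC_123"'
query_id = "01a2b3c4-0000-0000-0000-000000000000"

checkpoint_query = (
    f"SELECT * FROM {full_results_table_name} "
    f"AT(STATEMENT => '{query_id}')"
)
print(checkpoint_query)
```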
@@ -1550,19 +1635,10 @@ class InboundSyncRequest(SyncRequest):
             # )
             for stream_name in stream_names:
                 self._results_exist[stream_name] = True
+            return query_id
         else:
             logger.debug("Results dataframe is empty, not applying")
 
-    def _apply_latest_states(self):
-        """
-        Updates the SYNC table to have the latest stream states.
-        TODO: This should be done in concert with the results, revisit
-        """
-        if self._last_states_update is None or json.dumps(self._latest_states) != json.dumps(self._last_states_update):
-            self._last_states_update = json.loads(json.dumps(self._latest_states))
-            self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))
-
-
     def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
src/omnata_plugin_runtime/plugin_entrypoints.py
@@ -250,7 +250,7 @@ class PluginEntrypoint:
         self._plugin_instance._configuration_parameters = parameters
 
         inbound_sync_request.update_activity("Invoking plugin")
-        logger.info(f"
+        logger.info(f"Inbound sync request: {json.dumps(to_jsonable_python(inbound_sync_request))}")
         # plugin_instance._inbound_sync_request = outbound_sync_request
         with tracer.start_as_current_span("invoke_plugin"):
             with HttpRateLimiting(inbound_sync_request, parameters):