omnata-plugin-runtime 0.3.13a49-py3-none-any.whl → 0.3.14a51-py3-none-any.whl
This diff shows the content changes between these publicly released package versions, as they appear in their respective public registries; it is provided for informational purposes only.
- omnata_plugin_runtime/omnata_plugin.py +147 -28
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/METADATA +1 -1
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/RECORD +5 -5
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/LICENSE +0 -0
- {omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/WHEEL +0 -0
omnata_plugin_runtime/omnata_plugin.py
CHANGED
@@ -868,6 +868,12 @@ class InboundSyncRequest(SyncRequest):
             s.stream_name: s for s in streams
         }
         self._apply_results: Dict[str, List[pandas.DataFrame]] = {}
+        # named by convention, see SyncRunProcessor.enqueue
+        self._criteria_deletes_table_name = (
+            f"{self._source_app_name}.{self._results_schema_name}.{self._results_table_name}_CRITERIA_DELETES"
+        )
+        # These are similar to the results, but represent requests to delete records by some criteria
+        self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
         self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
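For context, a minimal sketch of the naming convention used by the new `_criteria_deletes_table_name` attribute; the app, schema and table values below are invented, and only the `_CRITERIA_DELETES` suffix pattern comes from the code above.

```python
# Hypothetical values; only the "_CRITERIA_DELETES" naming pattern comes from the diff above.
source_app_name = "MY_PLUGIN_APP"
results_schema_name = "SYNC_RESULTS"
results_table_name = "INBOUND_RUN_00123"

criteria_deletes_table_name = (
    f"{source_app_name}.{results_schema_name}.{results_table_name}_CRITERIA_DELETES"
)
# -> MY_PLUGIN_APP.SYNC_RESULTS.INBOUND_RUN_00123_CRITERIA_DELETES
```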
@@ -890,25 +896,28 @@ class InboundSyncRequest(SyncRequest):
         logger.info("InboundSyncRequest apply_results_queue ")
         if self._apply_results is not None:
             with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                stream_names:List[str] = []
                 for stream_name, stream_results in self._apply_results.items():
-                    results
+                    results.extend([
                         x for x in stream_results if x is not None and len(x) > 0
-                    ] # remove any None/empty dataframes
-
-
-
-                    )
-
-
-
-
-
-
-
-
-
-
-
+                    ]) # remove any None/empty dataframes
+                    stream_names.append(stream_name)
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued results"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_results_dataframe(stream_names, all_dfs)
+                    # add the count of this batch to the total for this stream
+                    self._stream_record_counts[
+                        stream_name
+                    ] = self._stream_record_counts[stream_name] + len(all_dfs)
+                    # update the stream state object too
+                    self._apply_latest_states()
+                    self._apply_results[stream_name] = None
                 self._apply_results = {}
             # update the inbound stream record counts, so we can see progress
             self._plugin_message(
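The reworked `apply_results_queue` now flattens every stream's queued frames into a single combined upload instead of applying each stream separately. A rough standalone sketch of that batching step, with invented stream names and data (nothing here is part of the runtime API itself):

```python
from typing import Dict, List, Optional

import pandas

# Hypothetical queue shaped like self._apply_results: one list of frames per stream.
apply_results: Dict[str, List[Optional[pandas.DataFrame]]] = {
    "contacts": [pandas.DataFrame({"APP_IDENTIFIER": ["1", "2"]}), None],
    "orders": [pandas.DataFrame({"APP_IDENTIFIER": ["9"]}), pandas.DataFrame()],
}

results: List[pandas.DataFrame] = []
stream_names: List[str] = []
for stream_name, stream_results in apply_results.items():
    # drop None/empty frames, as the reworked apply_results_queue does
    results.extend([x for x in stream_results if x is not None and len(x) > 0])
    stream_names.append(stream_name)

if results:
    all_dfs = pandas.concat(results)  # one combined frame, uploaded in a single call
    print(stream_names, len(all_dfs))  # ['contacts', 'orders'] 3
```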
@@ -918,6 +927,22 @@ class InboundSyncRequest(SyncRequest):
                     stream_errors=self._omnata_log_handler.stream_global_errors
                 )
             )
+        # also take care of uploading delete requests
+        if self._apply_results_criteria_deletes is not None:
+            with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                for stream_name, stream_results in self._apply_results_criteria_deletes.items():
+                    results.extend([
+                        x for x in stream_results if x is not None and len(x) > 0
+                    ])
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued criteria deletes"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_criteria_deletes_dataframe(all_dfs)

     def apply_cancellation(self):
         """
@@ -953,9 +978,15 @@ class InboundSyncRequest(SyncRequest):
             message=PluginMessageAbandonedStreams(abandoned_streams=abandoned_streams)
         )

-    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any):
+    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any, is_delete:Union[bool,List[bool]] = False):
         """
-        Adds some results to the queue for applying asynchronously
+        Adds some results to the queue for applying asynchronously.
+        stream_name: str, the name of the stream
+        results: List[Dict], the results to enqueue
+        new_state: Any, the new state which applies to the stream, given the new results
+        is_delete: Union[bool,List[bool]], whether the results are deletes or not
+        is_delete can be a single value, which means all results are the same, or a list of booleans, which means each result is different
+        For records where is_delete is True, you can provide the current record value if it is known, or just the identifier
         """
         logger.info(f"Enqueueing {len(results)} results for upload")
         if stream_name is None or len(stream_name) == 0:
@@ -964,7 +995,7 @@ class InboundSyncRequest(SyncRequest):
             existing_results: List[pandas.DataFrame] = []
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
-            existing_results.append(self._preprocess_results_list(stream_name, results))
+            existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
             self._apply_results[stream_name] = existing_results
             current_latest = self._latest_states or {}
             self._latest_states = {**current_latest, **{stream_name: new_state}}
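A hedged usage sketch of the extended `enqueue_results` signature, assuming `sync_request` is the `InboundSyncRequest` handed to a plugin; the stream name, helper function, and record shapes are invented for illustration.

```python
from typing import Any, Dict, List

from omnata_plugin_runtime.omnata_plugin import InboundSyncRequest


def upload_contact_changes(
    sync_request: InboundSyncRequest,
    changed: List[Dict],
    removed_ids: List[str],
    new_state: Any,
):
    # Hypothetical plugin-side helper; "contacts" and the record shapes are made up.
    # Deleted records can carry just the identifier, per the new docstring.
    records = changed + [{"id": removed_id} for removed_id in removed_ids]
    flags = [False] * len(changed) + [True] * len(removed_ids)
    sync_request.enqueue_results(
        stream_name="contacts",
        results=records,
        new_state=new_state,
        is_delete=flags,  # one flag per record; a single bool would apply to all of them
    )
```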
@@ -984,7 +1015,52 @@ class InboundSyncRequest(SyncRequest):
                 if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
                     logger.info(f"Applying results queue immediately due to combined dataframe size")
                     self.apply_results_queue()
-
+
+    def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
+        """
+        Submits some critera (field→value dict) which will cause matching records to be marked as deleted at the end of the run.
+        This feature was created primarily for array fields that become child streams.
+        The parent record is updated, which means there is a set of new children, but we need to delete the previously sync'd records and we don't know their identifiers.
+
+        The criteria is applied before the new records for the current run are applied. In other words, it will not delete any records from the current run.
+
+        For a record to be deleted, it must match fields with all the criteria supplied. At least one field value must be provided.
+        """
+        if len(criteria) == 0:
+            raise ValueError("At least one field value must be provided for deletion criteria")
+        if stream_name not in self._streams_dict:
+            raise ValueError(
+                f"Cannot delete records for stream {stream_name} as its configuration doesn't exist"
+            )
+        # append the new criteria to the self._criteria_deletes_table_name table
+        # this table has two columns:
+        # STREAM_NAME: string
+        # DELETE_CRITERIA: object
+        with self._apply_results_lock:
+            logger.info(
+                f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
+            )
+            existing_results: List[pandas.DataFrame] = []
+            if stream_name in self._apply_results_criteria_deletes:
+                existing_results = self._apply_results_criteria_deletes[stream_name]
+            existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
+            self._apply_results_criteria_deletes[stream_name] = existing_results
+            # if the total size of all the dataframes exceeds 200MB, apply the results immediately
+            # we'll use df.memory_usage(index=True) for this
+            if self.development_mode is False:
+                # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
+                # so first we need to get the list of lists from the dictionary values and flatten it
+                # then we can sum the memory usage of each dataframe
+                # if the total exceeds 200MB, we apply the results immediately
+                all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
+                # flatten
+                all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
+                combined_length = sum([len(x) for x in all_dfs])
+                # first, don't both if the count is less than 10000, since it's unlikely to be even close
+                if combined_length > 10000:
+                    if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+                        logger.info(f"Applying criteria deletes queue immediately due to combined dataframe size")
+                        self.apply_results_queue()

     def mark_stream_complete(self, stream_name: str):
         """
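A hedged usage sketch of the new `delete_by_criteria` method, built around the parent/child scenario its docstring describes; `sync_request`, the `order_lines` stream, and the helper function are all invented for illustration.

```python
from typing import Dict, List

from omnata_plugin_runtime.omnata_plugin import InboundSyncRequest


def replace_order_lines(sync_request: InboundSyncRequest, order_id: str, new_lines: List[Dict]):
    # Hypothetical parent/child scenario: "order_lines" is an invented child stream whose
    # previously synced records for this parent can't be enumerated by identifier.
    sync_request.delete_by_criteria(
        stream_name="order_lines",
        criteria={"order_id": order_id},  # a record must match every supplied field to be deleted
    )
    # the criteria only affects previously synced records, not the ones enqueued in this run
    sync_request.enqueue_results("order_lines", new_lines, new_state=None)
```

Passing an empty criteria dict raises `ValueError`, as does naming a stream that is not in the sync's configuration.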
@@ -1045,14 +1121,15 @@ class InboundSyncRequest(SyncRequest):
             logger.debug(f"Failure to convert inbound data: {str(exception)}")
             return data

-    def _preprocess_results_list(self, stream_name: str, results: List[Dict]):
+    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
         """
         Creates a dataframe from the enqueued list, ready to upload.
         The result is a dataframe contain all (and only):
         'APP_IDENTIFIER' string
         'STREAM_NAME' string
         'RETRIEVE_DATE' datetime (UTC)
-        'RECORD_DATA' object
+        'RECORD_DATA' object,
+        'IS_DELETED' boolean
         """
         # for required_column in ['RECORD_DATA']:
         #     if required_column not in results_df.columns:
@@ -1063,6 +1140,9 @@ class InboundSyncRequest(SyncRequest):
             )
         logger.info(f"preprocessing for stream: {self._streams_dict[stream_name]}")
         if len(results) > 0:
+            if isinstance(is_delete, list):
+                if len(results) != len(is_delete):
+                    raise ValueError(f"results and is_delete lists must be the same length")
             # We need to remove any values (included nesting) which are empty dicts. This is to prevent the arrow error:
             # Cannot write struct type '<field_name>' with no child field to Parquet. Consider adding a dummy child field.
             results = [remove_empty_dict_values(result) for result in results]
@@ -1108,11 +1188,20 @@ class InboundSyncRequest(SyncRequest):
             )

             results_df["APP_IDENTIFIER"] = results_df["RECORD_DATA"].apply(lambda x: get_nested_value(dict(x),primary_key_field))
-            # we jump the record data to a json string to make uploading to Snowflake less error prone
-            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(json.dumps)
             # the timestamps in Snowflake are TIMESTAMP_LTZ, so we upload in string format to ensure the
             # timezone information is present.
             results_df["RETRIEVE_DATE"] = str(datetime.datetime.now().astimezone())
+            # create the IS_DELETED column from the is_delete list
+            results_df["IS_DELETED"] = is_delete
+            # for each record, if IS_DELETED is true and RECORD_DATA only contains a single key, we assume that's the identifier
+            # in this case, we nullify the RECORD_DATA column to indicate that the delete operation does not contain the full record
+            for index, row in results_df.iterrows():
+                if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
+                    results_df.at[index, "RECORD_DATA"] = None
+            # we dump the record data to a json string to make uploading to Snowflake less error prone, but only if it's not None
+            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(
+                lambda x: json.dumps(x) if x is not None else None
+            )
             results_df["STREAM_NAME"] = stream_name
         else:
             results_df = pandas.DataFrame(
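To make the new `IS_DELETED` handling concrete, here is a toy reproduction of just that preprocessing step; the column names come from the diff, while the frame contents are invented.

```python
import json

import pandas

# Toy stand-in for results_df just before the new IS_DELETED handling runs.
results_df = pandas.DataFrame(
    {"RECORD_DATA": [{"id": "1", "name": "Alice"}, {"id": "2"}]}
)
is_delete = [False, True]

results_df["IS_DELETED"] = is_delete
# identifier-only deletes (a single key in RECORD_DATA) have their payload nulled out
for index, row in results_df.iterrows():
    if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
        results_df.at[index, "RECORD_DATA"] = None
# serialise remaining payloads to JSON strings, leaving the nulled deletes as None
results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(
    lambda x: json.dumps(x) if x is not None else None
)
# row 0 keeps a JSON payload; row 1 ends up with RECORD_DATA None and IS_DELETED True
```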
@@ -1122,16 +1211,17 @@ class InboundSyncRequest(SyncRequest):
                     "STREAM_NAME",
                     "RECORD_DATA",
                     "RETRIEVE_DATE",
+                    "IS_DELETED"
                 ],
             )
         # trim out the columns we don't need to return
         return results_df[
             results_df.columns.intersection(
-                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE"]
+                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE", "IS_DELETED"]
             )
         ]

-    def _apply_results_dataframe(self,
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
         """
         Applies results for an inbound sync. The results are staged into a temporary
         table in Snowflake, so that we can make an atomic commit at the end.
@@ -1164,7 +1254,8 @@ class InboundSyncRequest(SyncRequest):
                 #    column_order='index',
                 #    #create_temp_table=True
                 # )
-
+                for stream_name in stream_names:
+                    self._results_exist[stream_name] = True
         else:
             logger.info("Results dataframe is empty, not applying")

@@ -1175,6 +1266,34 @@ class InboundSyncRequest(SyncRequest):
         """
         self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))

+    def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
+        """
+        Applies results for an inbound sync. The results are staged into a temporary
+        table in Snowflake, so that we can make an atomic commit at the end.
+        """
+        if len(results_df) > 0:
+            with self._snowflake_query_lock:
+                logger.info(
+                    f"Applying {len(results_df)} criteria deletes to {self._criteria_deletes_table_name}"
+                )
+                # try setting parquet engine here, since the engine parameter does not seem to make it through to the write_pandas function
+                success, nchunks, nrows, _ = write_pandas(
+                    conn=self._session._conn._cursor.connection, # pylint: disable=protected-access
+                    df=results_df,
+                    table_name=self._criteria_deletes_table_name,
+                    quote_identifiers=False, # already done in get_temp_table_name
+                    table_type="transient"
+                )
+                if not success:
+                    raise ValueError(
+                        f"Failed to write results to table {self._criteria_deletes_table_name}"
+                    )
+                logger.info(
+                    f"Wrote {nrows} rows and {nchunks} chunks to table {self._criteria_deletes_table_name}"
+                )
+        else:
+            logger.info("Results dataframe is empty, not applying")
+

 class ConnectResponse(SubscriptableBaseModel):
     """
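For reference, the frame that reaches `_apply_criteria_deletes_dataframe` is built one row at a time by `delete_by_criteria`; a sketch of its shape, with invented values and the two column names taken from the comments in the diff.

```python
import pandas

# Two hypothetical queued delete requests for the same stream.
criteria_deletes_df = pandas.DataFrame([
    {"STREAM_NAME": "order_lines", "DELETE_CRITERIA": {"order_id": "ORD-1001"}},
    {"STREAM_NAME": "order_lines", "DELETE_CRITERIA": {"order_id": "ORD-1002"}},
])
# _apply_criteria_deletes_dataframe stages a frame like this into the
# *_CRITERIA_DELETES transient table via the write_pandas call shown above.
```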
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/RECORD
RENAMED
@@ -3,10 +3,10 @@ omnata_plugin_runtime/api.py,sha256=_N5ok5LN7GDO4J9n3yduXp3tpjmhpySY__U2baiygrs,
 omnata_plugin_runtime/configuration.py,sha256=at29ExowF_T4_2U9gY0BF4IVdwC-vDytmNRHL7UCWh8,34742
 omnata_plugin_runtime/forms.py,sha256=30CJB24TqfLYNnkplZdUbeqA-P9rUIBujVKXw_S-wKY,18371
 omnata_plugin_runtime/logging.py,sha256=bn7eKoNWvtuyTk7RTwBS9UARMtqkiICtgMtzq3KA2V0,3272
-omnata_plugin_runtime/omnata_plugin.py,sha256=
+omnata_plugin_runtime/omnata_plugin.py,sha256=yFZUiCPyFkCH1GRE5_J-ZgvtclWTJ0q3cGyc8x2Oi1c,98853
 omnata_plugin_runtime/plugin_entrypoints.py,sha256=_XgmWsrHoSshkl5Z2T27BAGVnBh4yH-8lni5sdGlSz8,27670
 omnata_plugin_runtime/rate_limiting.py,sha256=se6MftQI5NrVHaLb1hByPCgAESPQhkAgIG7KIU1clDU,16562
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
-omnata_plugin_runtime-0.3.
+omnata_plugin_runtime-0.3.14a51.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
+omnata_plugin_runtime-0.3.14a51.dist-info/METADATA,sha256=FFW46VXQvWKVO8h_y8JHDrGXVpCaXmxdrY1Y4HYT6rw,1604
+omnata_plugin_runtime-0.3.14a51.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+omnata_plugin_runtime-0.3.14a51.dist-info/RECORD,,
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/LICENSE
RENAMED
File without changes
{omnata_plugin_runtime-0.3.13a49.dist-info → omnata_plugin_runtime-0.3.14a51.dist-info}/WHEEL
RENAMED
File without changes