omnata-plugin-runtime 0.3.13a49__py3-none-any.whl → 0.3.14a51__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -868,6 +868,12 @@ class InboundSyncRequest(SyncRequest):
             s.stream_name: s for s in streams
         }
         self._apply_results: Dict[str, List[pandas.DataFrame]] = {}
+        # named by convention, see SyncRunProcessor.enqueue
+        self._criteria_deletes_table_name = (
+            f"{self._source_app_name}.{self._results_schema_name}.{self._results_table_name}_CRITERIA_DELETES"
+        )
+        # These are similar to the results, but represent requests to delete records by some criteria
+        self._apply_results_criteria_deletes: Dict[str, List[pandas.DataFrame]] = {}
         self._latest_states: Dict[str, Any] = {}
         self._temp_tables = {}
         self._temp_table_lock = threading.Lock()
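
The two new attributes give the sync request somewhere to accumulate criteria-based delete requests separately from ordinary results, plus the fully-qualified name of the Snowflake table they will be staged into. A minimal sketch of the naming convention, using made-up values for the app, schema and results table names:

    # Illustrative only: these values are hypothetical, not taken from the package.
    source_app_name = "MY_PLUGIN_APP"
    results_schema_name = "SYNC_DATA"
    results_table_name = "INBOUND_RESULTS_123"

    # mirrors the f-string added in __init__ above
    criteria_deletes_table_name = (
        f"{source_app_name}.{results_schema_name}.{results_table_name}_CRITERIA_DELETES"
    )
    print(criteria_deletes_table_name)
    # MY_PLUGIN_APP.SYNC_DATA.INBOUND_RESULTS_123_CRITERIA_DELETES
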
@@ -890,25 +896,28 @@ class InboundSyncRequest(SyncRequest):
         logger.info("InboundSyncRequest apply_results_queue ")
         if self._apply_results is not None:
             with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                stream_names:List[str] = []
                 for stream_name, stream_results in self._apply_results.items():
-                    results = [
+                    results.extend([
                         x for x in stream_results if x is not None and len(x) > 0
-                    ] # remove any None/empty dataframes
-                    if len(results) > 0:
-                        logger.info(
-                            f"Applying {len(results)} batches of queued results"
-                        )
-                        # upload all cached apply results
-                        all_dfs = pandas.concat(results)
-                        #logger.info(f"applying: {all_dfs}")
-                        self._apply_results_dataframe(stream_name, all_dfs)
-                        # add the count of this batch to the total for this stream
-                        self._stream_record_counts[
-                            stream_name
-                        ] = self._stream_record_counts[stream_name] + len(all_dfs)
-                        # update the stream state object too
-                        self._apply_latest_states()
-                        self._apply_results[stream_name] = None
+                    ]) # remove any None/empty dataframes
+                    stream_names.append(stream_name)
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued results"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_results_dataframe(stream_names, all_dfs)
+                    # add the count of this batch to the total for this stream
+                    self._stream_record_counts[
+                        stream_name
+                    ] = self._stream_record_counts[stream_name] + len(all_dfs)
+                    # update the stream state object too
+                    self._apply_latest_states()
+                    self._apply_results[stream_name] = None
                 self._apply_results = {}
         # update the inbound stream record counts, so we can see progress
         self._plugin_message(
@@ -918,6 +927,22 @@ class InboundSyncRequest(SyncRequest):
                 stream_errors=self._omnata_log_handler.stream_global_errors
             )
         )
+        # also take care of uploading delete requests
+        if self._apply_results_criteria_deletes is not None:
+            with self._apply_results_lock:
+                results:List[pandas.DataFrame] = []
+                for stream_name, stream_results in self._apply_results_criteria_deletes.items():
+                    results.extend([
+                        x for x in stream_results if x is not None and len(x) > 0
+                    ])
+                if len(results) > 0:
+                    logger.info(
+                        f"Applying {len(results)} batches of queued criteria deletes"
+                    )
+                    # upload all cached apply results
+                    all_dfs = pandas.concat(results)
+                    #logger.info(f"applying: {all_dfs}")
+                    self._apply_criteria_deletes_dataframe(all_dfs)

     def apply_cancellation(self):
         """
@@ -953,9 +978,15 @@ class InboundSyncRequest(SyncRequest):
             message=PluginMessageAbandonedStreams(abandoned_streams=abandoned_streams)
         )

-    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any):
+    def enqueue_results(self, stream_name: str, results: List[Dict], new_state: Any, is_delete:Union[bool,List[bool]] = False):
         """
-        Adds some results to the queue for applying asynchronously
+        Adds some results to the queue for applying asynchronously.
+        stream_name: str, the name of the stream
+        results: List[Dict], the results to enqueue
+        new_state: Any, the new state which applies to the stream, given the new results
+        is_delete: Union[bool,List[bool]], whether the results are deletes or not
+        is_delete can be a single value, which means all results are the same, or a list of booleans, which means each result is different
+        For records where is_delete is True, you can provide the current record value if it is known, or just the identifier
         """
         logger.info(f"Enqueueing {len(results)} results for upload")
         if stream_name is None or len(stream_name) == 0:
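
A hedged usage sketch of the extended signature, as it might be called from a plugin's fetch logic; the stream name, records and state below are illustrative and not part of the package:

    from typing import Any

    def fetch_contacts(sync_request: Any) -> None:
        records = [
            {"id": "c-1", "email": "a@example.com"},  # a normal upserted record
            {"id": "c-2"},                            # a delete where only the identifier is known
        ]
        # one boolean per record: the first is an upsert, the second a delete
        sync_request.enqueue_results(
            stream_name="contacts",
            results=records,
            new_state={"last_modified": "2024-01-01T00:00:00Z"},
            is_delete=[False, True],
        )
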
@@ -964,7 +995,7 @@ class InboundSyncRequest(SyncRequest):
             existing_results: List[pandas.DataFrame] = []
             if stream_name in self._apply_results:
                 existing_results = self._apply_results[stream_name]
-            existing_results.append(self._preprocess_results_list(stream_name, results))
+            existing_results.append(self._preprocess_results_list(stream_name, results, is_delete))
             self._apply_results[stream_name] = existing_results
             current_latest = self._latest_states or {}
             self._latest_states = {**current_latest, **{stream_name: new_state}}
@@ -984,7 +1015,52 @@ class InboundSyncRequest(SyncRequest):
                 if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
                     logger.info(f"Applying results queue immediately due to combined dataframe size")
                     self.apply_results_queue()
-
+
+    def delete_by_criteria(self, stream_name: str, criteria: Dict[str, Any]):
+        """
+        Submits some critera (field→value dict) which will cause matching records to be marked as deleted at the end of the run.
+        This feature was created primarily for array fields that become child streams.
+        The parent record is updated, which means there is a set of new children, but we need to delete the previously sync'd records and we don't know their identifiers.
+
+        The criteria is applied before the new records for the current run are applied. In other words, it will not delete any records from the current run.
+
+        For a record to be deleted, it must match fields with all the criteria supplied. At least one field value must be provided.
+        """
+        if len(criteria) == 0:
+            raise ValueError("At least one field value must be provided for deletion criteria")
+        if stream_name not in self._streams_dict:
+            raise ValueError(
+                f"Cannot delete records for stream {stream_name} as its configuration doesn't exist"
+            )
+        # append the new criteria to the self._criteria_deletes_table_name table
+        # this table has two columns:
+        # STREAM_NAME: string
+        # DELETE_CRITERIA: object
+        with self._apply_results_lock:
+            logger.info(
+                f"Enqueuing {len(criteria)} delete criteria for stream {stream_name} for upload"
+            )
+            existing_results: List[pandas.DataFrame] = []
+            if stream_name in self._apply_results_criteria_deletes:
+                existing_results = self._apply_results_criteria_deletes[stream_name]
+            existing_results.append(pandas.DataFrame([{"STREAM_NAME":stream_name,"DELETE_CRITERIA": criteria}]))
+            self._apply_results_criteria_deletes[stream_name] = existing_results
+        # if the total size of all the dataframes exceeds 200MB, apply the results immediately
+        # we'll use df.memory_usage(index=True) for this
+        if self.development_mode is False:
+            # note: we want to do it for all values in self._apply_results_criteria_deletes, not just the new one
+            # so first we need to get the list of lists from the dictionary values and flatten it
+            # then we can sum the memory usage of each dataframe
+            # if the total exceeds 200MB, we apply the results immediately
+            all_df_lists:List[List[pandas.DataFrame]] = list(self._apply_results_criteria_deletes.values())
+            # flatten
+            all_dfs:List[pandas.DataFrame] = [x for sublist in all_df_lists for x in sublist]
+            combined_length = sum([len(x) for x in all_dfs])
+            # first, don't both if the count is less than 10000, since it's unlikely to be even close
+            if combined_length > 10000:
+                if sum([x.memory_usage(index=True).sum() for x in all_dfs]) > 200000000:
+                    logger.info(f"Applying criteria deletes queue immediately due to combined dataframe size")
+                    self.apply_results_queue()

     def mark_stream_complete(self, stream_name: str):
         """
@@ -1045,14 +1121,15 @@ class InboundSyncRequest(SyncRequest):
            logger.debug(f"Failure to convert inbound data: {str(exception)}")
            return data

-    def _preprocess_results_list(self, stream_name: str, results: List[Dict]):
+    def _preprocess_results_list(self, stream_name: str, results: List[Dict],is_delete:Union[bool,List[bool]]):
         """
         Creates a dataframe from the enqueued list, ready to upload.
         The result is a dataframe contain all (and only):
         'APP_IDENTIFIER' string
         'STREAM_NAME' string
         'RETRIEVE_DATE' datetime (UTC)
-        'RECORD_DATA' object
+        'RECORD_DATA' object,
+        'IS_DELETED' boolean
         """
         # for required_column in ['RECORD_DATA']:
         #     if required_column not in results_df.columns:
@@ -1063,6 +1140,9 @@ class InboundSyncRequest(SyncRequest):
            )
        logger.info(f"preprocessing for stream: {self._streams_dict[stream_name]}")
        if len(results) > 0:
+            if isinstance(is_delete, list):
+                if len(results) != len(is_delete):
+                    raise ValueError(f"results and is_delete lists must be the same length")
            # We need to remove any values (included nesting) which are empty dicts. This is to prevent the arrow error:
            # Cannot write struct type '<field_name>' with no child field to Parquet. Consider adding a dummy child field.
            results = [remove_empty_dict_values(result) for result in results]
@@ -1108,11 +1188,20 @@ class InboundSyncRequest(SyncRequest):
                )

            results_df["APP_IDENTIFIER"] = results_df["RECORD_DATA"].apply(lambda x: get_nested_value(dict(x),primary_key_field))
-            # we jump the record data to a json string to make uploading to Snowflake less error prone
-            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(json.dumps)
            # the timestamps in Snowflake are TIMESTAMP_LTZ, so we upload in string format to ensure the
            # timezone information is present.
            results_df["RETRIEVE_DATE"] = str(datetime.datetime.now().astimezone())
+            # create the IS_DELETED column from the is_delete list
+            results_df["IS_DELETED"] = is_delete
+            # for each record, if IS_DELETED is true and RECORD_DATA only contains a single key, we assume that's the identifier
+            # in this case, we nullify the RECORD_DATA column to indicate that the delete operation does not contain the full record
+            for index, row in results_df.iterrows():
+                if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
+                    results_df.at[index, "RECORD_DATA"] = None
+            # we dump the record data to a json string to make uploading to Snowflake less error prone, but only if it's not None
+            results_df["RECORD_DATA"] = results_df["RECORD_DATA"].apply(
+                lambda x: json.dumps(x) if x is not None else None
+            )
            results_df["STREAM_NAME"] = stream_name
        else:
            results_df = pandas.DataFrame(
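
A self-contained sketch of the new IS_DELETED handling, using made-up records; it mirrors the column assignment, the nulling of single-key delete payloads, and the conditional JSON dump above:

    import json
    import pandas

    results = [{"id": "r-1", "name": "Widget"}, {"id": "r-2"}]
    is_delete = [False, True]

    df = pandas.DataFrame({"RECORD_DATA": results})
    df["IS_DELETED"] = is_delete
    # a delete whose payload only carries the identifier has its RECORD_DATA nulled out
    for index, row in df.iterrows():
        if row["IS_DELETED"] and len(row["RECORD_DATA"]) == 1:
            df.at[index, "RECORD_DATA"] = None
    df["RECORD_DATA"] = df["RECORD_DATA"].apply(lambda x: json.dumps(x) if x is not None else None)
    # row 0 keeps its JSON payload; row 1 ends up with RECORD_DATA = None and IS_DELETED = True
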
@@ -1122,16 +1211,17 @@ class InboundSyncRequest(SyncRequest):
                    "STREAM_NAME",
                    "RECORD_DATA",
                    "RETRIEVE_DATE",
+                    "IS_DELETED"
                ],
            )
        # trim out the columns we don't need to return
        return results_df[
            results_df.columns.intersection(
-                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE"]
+                ["APP_IDENTIFIER", "STREAM_NAME", "RECORD_DATA", "RETRIEVE_DATE", "IS_DELETED"]
            )
        ]

-    def _apply_results_dataframe(self, stream_name: str, results_df: pandas.DataFrame):
+    def _apply_results_dataframe(self, stream_names: List[str], results_df: pandas.DataFrame):
        """
        Applies results for an inbound sync. The results are staged into a temporary
        table in Snowflake, so that we can make an atomic commit at the end.
@@ -1164,7 +1254,8 @@ class InboundSyncRequest(SyncRequest):
                # column_order='index',
                # #create_temp_table=True
                # )
-                self._results_exist[stream_name] = True
+                for stream_name in stream_names:
+                    self._results_exist[stream_name] = True
        else:
            logger.info("Results dataframe is empty, not applying")

@@ -1175,6 +1266,34 @@ class InboundSyncRequest(SyncRequest):
        """
        self._plugin_message(PluginMessageStreamState(stream_state=self._latest_states))

+    def _apply_criteria_deletes_dataframe(self, results_df: pandas.DataFrame):
+        """
+        Applies results for an inbound sync. The results are staged into a temporary
+        table in Snowflake, so that we can make an atomic commit at the end.
+        """
+        if len(results_df) > 0:
+            with self._snowflake_query_lock:
+                logger.info(
+                    f"Applying {len(results_df)} criteria deletes to {self._criteria_deletes_table_name}"
+                )
+                # try setting parquet engine here, since the engine parameter does not seem to make it through to the write_pandas function
+                success, nchunks, nrows, _ = write_pandas(
+                    conn=self._session._conn._cursor.connection, # pylint: disable=protected-access
+                    df=results_df,
+                    table_name=self._criteria_deletes_table_name,
+                    quote_identifiers=False, # already done in get_temp_table_name
+                    table_type="transient"
+                )
+                if not success:
+                    raise ValueError(
+                        f"Failed to write results to table {self._criteria_deletes_table_name}"
+                    )
+                logger.info(
+                    f"Wrote {nrows} rows and {nchunks} chunks to table {self._criteria_deletes_table_name}"
+                )
+        else:
+            logger.info("Results dataframe is empty, not applying")
+

class ConnectResponse(SubscriptableBaseModel):
    """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: omnata-plugin-runtime
-Version: 0.3.13a49
+Version: 0.3.14a51
 Summary: Classes and common runtime components for building and running Omnata Plugins
 Author: James Weakley
 Author-email: james.weakley@omnata.com
@@ -3,10 +3,10 @@ omnata_plugin_runtime/api.py,sha256=_N5ok5LN7GDO4J9n3yduXp3tpjmhpySY__U2baiygrs,
 omnata_plugin_runtime/configuration.py,sha256=at29ExowF_T4_2U9gY0BF4IVdwC-vDytmNRHL7UCWh8,34742
 omnata_plugin_runtime/forms.py,sha256=30CJB24TqfLYNnkplZdUbeqA-P9rUIBujVKXw_S-wKY,18371
 omnata_plugin_runtime/logging.py,sha256=bn7eKoNWvtuyTk7RTwBS9UARMtqkiICtgMtzq3KA2V0,3272
-omnata_plugin_runtime/omnata_plugin.py,sha256=JSxz2Q5j1s6Fkawfl2rZf0GnkBaJlLMJ_PH13899xxg,91321
+omnata_plugin_runtime/omnata_plugin.py,sha256=yFZUiCPyFkCH1GRE5_J-ZgvtclWTJ0q3cGyc8x2Oi1c,98853
 omnata_plugin_runtime/plugin_entrypoints.py,sha256=_XgmWsrHoSshkl5Z2T27BAGVnBh4yH-8lni5sdGlSz8,27670
 omnata_plugin_runtime/rate_limiting.py,sha256=se6MftQI5NrVHaLb1hByPCgAESPQhkAgIG7KIU1clDU,16562
-omnata_plugin_runtime-0.3.13a49.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
-omnata_plugin_runtime-0.3.13a49.dist-info/METADATA,sha256=9aa8ni0R9o2Nx4JRmc235lUOKwYdyO8ZtGdkla8424o,1604
-omnata_plugin_runtime-0.3.13a49.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-omnata_plugin_runtime-0.3.13a49.dist-info/RECORD,,
+omnata_plugin_runtime-0.3.14a51.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
+omnata_plugin_runtime-0.3.14a51.dist-info/METADATA,sha256=FFW46VXQvWKVO8h_y8JHDrGXVpCaXmxdrY1Y4HYT6rw,1604
+omnata_plugin_runtime-0.3.14a51.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+omnata_plugin_runtime-0.3.14a51.dist-info/RECORD,,