sapiopycommons 2024.3.18a156__py3-none-any.whl → 2025.1.17a402__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sapiopycommons might be problematic. Click here for more details.

Files changed (52) hide show
  1. sapiopycommons/callbacks/__init__.py +0 -0
  2. sapiopycommons/callbacks/callback_util.py +2041 -0
  3. sapiopycommons/callbacks/field_builder.py +545 -0
  4. sapiopycommons/chem/IndigoMolecules.py +52 -5
  5. sapiopycommons/chem/Molecules.py +114 -30
  6. sapiopycommons/customreport/__init__.py +0 -0
  7. sapiopycommons/customreport/column_builder.py +60 -0
  8. sapiopycommons/customreport/custom_report_builder.py +137 -0
  9. sapiopycommons/customreport/term_builder.py +315 -0
  10. sapiopycommons/datatype/attachment_util.py +17 -15
  11. sapiopycommons/datatype/data_fields.py +61 -0
  12. sapiopycommons/datatype/pseudo_data_types.py +440 -0
  13. sapiopycommons/eln/experiment_handler.py +390 -90
  14. sapiopycommons/eln/experiment_report_util.py +649 -0
  15. sapiopycommons/eln/plate_designer.py +152 -0
  16. sapiopycommons/files/complex_data_loader.py +31 -0
  17. sapiopycommons/files/file_bridge.py +153 -25
  18. sapiopycommons/files/file_bridge_handler.py +555 -0
  19. sapiopycommons/files/file_data_handler.py +633 -0
  20. sapiopycommons/files/file_util.py +270 -158
  21. sapiopycommons/files/file_validator.py +569 -0
  22. sapiopycommons/files/file_writer.py +377 -0
  23. sapiopycommons/flowcyto/flow_cyto.py +77 -0
  24. sapiopycommons/flowcyto/flowcyto_data.py +75 -0
  25. sapiopycommons/general/accession_service.py +375 -0
  26. sapiopycommons/general/aliases.py +259 -18
  27. sapiopycommons/general/audit_log.py +185 -0
  28. sapiopycommons/general/custom_report_util.py +252 -31
  29. sapiopycommons/general/directive_util.py +86 -0
  30. sapiopycommons/general/exceptions.py +69 -7
  31. sapiopycommons/general/popup_util.py +85 -18
  32. sapiopycommons/general/sapio_links.py +50 -0
  33. sapiopycommons/general/storage_util.py +148 -0
  34. sapiopycommons/general/time_util.py +97 -7
  35. sapiopycommons/multimodal/multimodal.py +146 -0
  36. sapiopycommons/multimodal/multimodal_data.py +490 -0
  37. sapiopycommons/processtracking/__init__.py +0 -0
  38. sapiopycommons/processtracking/custom_workflow_handler.py +406 -0
  39. sapiopycommons/processtracking/endpoints.py +192 -0
  40. sapiopycommons/recordmodel/record_handler.py +653 -149
  41. sapiopycommons/rules/eln_rule_handler.py +89 -8
  42. sapiopycommons/rules/on_save_rule_handler.py +89 -12
  43. sapiopycommons/sftpconnect/__init__.py +0 -0
  44. sapiopycommons/sftpconnect/sftp_builder.py +70 -0
  45. sapiopycommons/webhook/webhook_context.py +39 -0
  46. sapiopycommons/webhook/webhook_handlers.py +617 -69
  47. sapiopycommons/webhook/webservice_handlers.py +317 -0
  48. {sapiopycommons-2024.3.18a156.dist-info → sapiopycommons-2025.1.17a402.dist-info}/METADATA +5 -4
  49. sapiopycommons-2025.1.17a402.dist-info/RECORD +60 -0
  50. {sapiopycommons-2024.3.18a156.dist-info → sapiopycommons-2025.1.17a402.dist-info}/WHEEL +1 -1
  51. sapiopycommons-2024.3.18a156.dist-info/RECORD +0 -28
  52. {sapiopycommons-2024.3.18a156.dist-info → sapiopycommons-2025.1.17a402.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,633 @@
1
+ import re
2
+ from typing import Any, Callable, Iterable
3
+
4
+ from sapiopycommons.general.aliases import SapioRecord
5
+ from sapiopycommons.general.exceptions import SapioException
6
+ from sapiopycommons.general.time_util import TimeUtil
7
+ from sapiopycommons.recordmodel.record_handler import RecordHandler
8
+
9
+ FilterList = Iterable[int] | range | Callable[[int, dict[str, Any]], bool] | None
10
+ """A FilterList is an object used to determine if a row in the file data should be skipped over. This can take the
11
+ form of an iterable (e.g. list, set) of ints or a range where row indices in the list or range are skipped, or it can be
12
+ a callable function where rows are skipped if the function returns true. Callable functions have two input parameters;
13
+ the first is the index of the row and the second is the dict for that row. If None, then the list isn't used as a
14
+ filter."""
15
+
16
+
17
+ class FileDataHandler:
18
+ """
19
+ A FileDataHandler takes in a list of dictionaries, presumably from a tokenized CSV or XLSX file, and allows for the
20
+ filtering and modification of its contents. This can be used for simpler querying of values from a file, such as
21
+ getting every value under a header in order to use it to query records in the system.
22
+
23
+ Look into using this in combination with FileValidator to prepare files for the FileValidator and for use in
24
+ data record fields.
25
+ """
26
+ file_data: list[dict[str, Any]]
27
+
28
def __init__(self, file_data: list[dict[str, Any]]):
    """
    Create a handler around a list of row dictionaries.

    :param file_data: A list of dictionaries. Every dictionary in the list is expected to have the same keys.
        FileUtil.tokenize_csv and tokenize_xlsx can be used to convert a file into such a list.
        CustomReportUtil can also generate lists of dictionaries that match this criteria.
    """
    # The list is stored as given (no copy is made), so methods that modify rows modify the caller's data.
    self.file_data = file_data
35
+
36
def get_row(self, index: int) -> dict[str, Any]:
    """
    Look up a single row of the file data by its position.

    :param index: The index of the row to return.
    :return: The file data for the corresponding row.
    """
    rows: list[dict[str, Any]] = self.file_data
    return rows[index]
44
+
45
def get_rows(self, indices: list[int]) -> list[dict[str, Any]]:
    """
    Look up several rows of the file data by position. The result preserves the order of the index list.

    :param indices: The indices of the rows to return.
    :return: The file data for the corresponding rows, in the same order as the input.
    """
    selected: list[dict[str, Any]] = []
    for position in indices:
        selected.append(self.file_data[position])
    return selected
53
+
54
def has_headers(self, headers: list[str]) -> list[str]:
    """
    Determine if the file data for this handler has all the provided headers (dict keys). This assumes that every
    dict in the file data list has the same keys, so only the first row is used to verify the headers. The file data
    may have extra headers than just those in the provided list.

    If the file data contains no rows, there is nothing to check against, so every provided header is reported as
    missing instead of raising an IndexError.

    :param headers: A list of headers/key names to check.
    :return: A list of all the headers that are in the provided headers list but not in the file data.
    """
    if not self.file_data:
        # No first row exists to inspect; none of the requested headers can be present.
        return list(headers)
    first_row: dict[str, Any] = self.file_data[0]
    return [header for header in headers if header not in first_row]
69
+
70
def empty_cells(self, header: str,
                *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Find the index of all rows whose cell under a particular header is empty. A cell counts as empty when its value
    is None, or when the value has the __len__ special method and len(value) is zero (e.g. empty strings and lists).
    bool(value) is deliberately not used so that explicit zero numerical values count as occupied.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with an empty cell under the specified header.
    """
    def is_empty(cell: Any) -> bool:
        if cell is None:
            return True
        return hasattr(cell, "__len__") and len(cell) == 0

    return [position for position, row in enumerate(self.file_data)
            if not self.skip_row(position, row, whitelist, blacklist) and is_empty(row.get(header))]
93
+
94
def occupied_cells(self, header: str,
                   *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Find the index of all rows whose cell under a particular header is occupied. A cell counts as occupied when its
    value is not None and, should the value have the __len__ special method, len(value) is not zero; this avoids
    returning rows with empty strings and lists. bool(value) is deliberately not used so that explicit zero
    numerical values count as occupied.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with an occupied cell under the specified header.
    """
    occupied: list[int] = []
    for position, row in enumerate(self.file_data):
        if self.skip_row(position, row, whitelist, blacklist):
            continue
        cell: Any = row.get(header)
        if cell is None:
            continue
        # Sized values (strings, lists, ...) are only occupied when they have content.
        if hasattr(cell, "__len__") and len(cell) == 0:
            continue
        occupied.append(position)
    return occupied
117
+
118
def find_values(self, header: str, values: Iterable[Any],
                *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of the rows with a value under the header matching the input values.

    :param header: The header of the column to check the contents of.
    :param values: A collection of values that the cells in the column must be within. One-shot iterables (e.g.
        generators) are materialized into a list once so that every row is checked against the full set of values.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value in the input collection under the specified header.
    """
    from collections.abc import Container

    # An iterator has no __contains__, so "x in iterator" consumes it; after the first few rows it would be
    # exhausted and later matches would be silently missed. Real containers are used as-is to keep their
    # native membership semantics and speed.
    candidates: Any = values if isinstance(values, Container) else list(values)

    indices: list[int] = []
    for i, row in enumerate(self.file_data):
        if self.skip_row(i, row, whitelist, blacklist):
            continue
        if row.get(header) in candidates:
            indices.append(i)
    return indices
138
+
139
def get_values_list(self, header: str,
                    *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[Any]:
    """
    Collect every value under a specific header, in order of appearance. Rows lacking the header contribute None.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: A list of the values under the specified header.
    """
    return [row.get(header) for position, row in enumerate(self.file_data)
            if not self.skip_row(position, row, whitelist, blacklist)]
157
+
158
def get_values_set(self, header: str,
                   *, whitelist: FilterList = None, blacklist: FilterList = None) -> set[Any]:
    """
    Collect the distinct values under a specified header. Rows lacking the header contribute None.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: A set of the values under the specified header.
    """
    return {row.get(header) for position, row in enumerate(self.file_data)
            if not self.skip_row(position, row, whitelist, blacklist)}
176
+
177
def get_values_dict(self, header: str,
                    *, whitelist: FilterList = None, blacklist: FilterList = None) \
        -> dict[Any, list[int]]:
    """
    Group row indices by the value found under a specified header.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: A dict of the values under the specified header, mapping the cell values to the indices that they come
        from.
    """
    grouped: dict[Any, list[int]] = {}
    for position, row in enumerate(self.file_data):
        if self.skip_row(position, row, whitelist, blacklist):
            continue
        cell: Any = row.get(header)
        if cell not in grouped:
            grouped[cell] = []
        grouped[cell].append(position)
    return grouped
197
+
198
def get_duplicates(self, header: str,
                   *, whitelist: FilterList = None, blacklist: FilterList = None) \
        -> dict[Any, list[int]]:
    """
    Find the values under a specific header that appear in more than one row, mapped to their rows of origin.

    :param header: The header of the column to check the contents of.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: A dict of the values under the specified header, mapping the cell values to the indices that they come
        from. Only values with more than one row of origin are included.
    """
    by_value: dict[Any, list[int]] = self.get_values_dict(header, whitelist=whitelist, blacklist=blacklist)
    return {cell: positions for cell, positions in by_value.items() if len(positions) > 1}
219
+
220
def get_by_function(self, func: Callable[[int, dict[str, Any]], bool],
                    *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row where some function returns true. This can be used for more complex validation than
    checking that a value is present or avoiding duplicates. Look at functions in this class like get_inside_range
    or get_in_list to see how this can be used. These example functions only check the contents of a single header,
    but you could create a function that checks across multiple headers as well.

    :param func: A callable function where the inputs are the index of a row and the data for that row, and the
        output is a boolean.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row where the provided function returns true.
    """
    # The skip check runs first so that func is never called on filtered-out rows.
    return [position for position, row in enumerate(self.file_data)
            if not self.skip_row(position, row, whitelist, blacklist) and func(position, row)]
242
+
243
def get_inside_range(self, header: str, min_val: float | int, max_val: float | int,
                     *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the header inside a range defined by the min and max values.
    This range check is inclusive (i.e. min <= x <= max).

    :param header: The header of the column to check the contents of.
    :param min_val: The minimum allowed value of the cell.
    :param max_val: The maximum allowed value of the cell.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header inside the range.
    """
    def within_bounds(index: int, row: dict[str, Any]) -> bool:
        cell: Any = row.get(header)
        return min_val <= cell and cell <= max_val

    return self.get_by_function(within_bounds, whitelist=whitelist, blacklist=blacklist)
263
+
264
def get_outside_range(self, header: str, min_val: float | int, max_val: float | int,
                      *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the header outside a range defined by the min and max values.
    This range check is exclusive (i.e. x < min or max < x).

    :param header: The header of the column to check the contents of.
    :param min_val: The value that the cell may be lesser than.
    :param max_val: The value that the cell may be greater than.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header outside the range.
    """
    def beyond_bounds(index: int, row: dict[str, Any]) -> bool:
        cell: Any = row.get(header)
        if cell < min_val:
            return True
        return cell > max_val

    return self.get_by_function(beyond_bounds, whitelist=whitelist, blacklist=blacklist)
284
+
285
def get_in_list(self, header: str, values: list[Any],
                *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the header that is within a given list.

    :param header: The header of the column to check the contents of.
    :param values: A list of values to check the header cells against.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header inside the list.
    """
    def is_listed(index: int, row: dict[str, Any]) -> bool:
        return row.get(header) in values

    return self.get_by_function(is_listed, whitelist=whitelist, blacklist=blacklist)
299
+
300
def get_not_in_list(self, header: str, values: list[Any],
                    *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the header that is not within a given list.

    :param header: The header of the column to check the contents of.
    :param values: A list of values to check the header cells against.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header not inside the list.
    """
    def is_unlisted(index: int, row: dict[str, Any]) -> bool:
        return row.get(header) not in values

    return self.get_by_function(is_unlisted, whitelist=whitelist, blacklist=blacklist)
314
+
315
def get_matches(self, header: str, pattern: str | re.Pattern[str],
                *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the given header that matches a regex pattern. Unless you set up
    your regex pattern to require that the entire string must match, this can return rows where only a substring of
    the cell matches.

    :param header: The header of the column to check the contents of.
    :param pattern: A regex pattern to run on the cells.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header that matches the regex pattern.
    """
    def has_match(index: int, row: dict[str, Any]) -> bool:
        # A single hit anywhere in the cell is enough.
        return re.search(pattern, row.get(header)) is not None

    return self.get_by_function(has_match, whitelist=whitelist, blacklist=blacklist)
334
+
335
def get_mismatches(self, header: str, pattern: str | re.Pattern[str],
                   *, whitelist: FilterList = None, blacklist: FilterList = None) -> list[int]:
    """
    Get the index of every row with a value under the given header that doesn't match a regex pattern. A cell only
    counts as a mismatch when the pattern matches nowhere within it.

    :param header: The header of the column to check the contents of.
    :param pattern: A regex pattern to run on the cells.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    :return: The index of every row with a value under the specified header that doesn't match the regex pattern.
    """
    def lacks_match(index: int, row: dict[str, Any]) -> bool:
        return re.search(pattern, row.get(header)) is None

    return self.get_by_function(lacks_match, whitelist=whitelist, blacklist=blacklist)
352
+
353
def set_defaults(self, header: str, value: Any,
                 *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Fill every cell reported by the empty_cells function with some default value. Useful for when a file may have
    "optional" headers and some default value is assumed if no value is given in the file. Rows are modified in
    place.

    :param header: The header of the column to update the contents of.
    :param value: The value to set empty cells to.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    empty_positions: list[int] = self.empty_cells(header, whitelist=whitelist, blacklist=blacklist)
    for row in self.get_rows(empty_positions):
        row[header] = value
370
+
371
def for_each(self, func: Callable[[int, dict[str, Any]], None],
             *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Run a function on rows in the file data, in row order. It is expected that the function updates values of the
    row, although this doesn't need to be the case. Look at functions in this class like update_timestamps or
    update_lists to see how this can be used.

    :param func: A callable function where the input is the index and data for a row. Has no output.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    for position, row in enumerate(self.file_data):
        if not self.skip_row(position, row, whitelist, blacklist):
            func(position, row)
388
+
389
def update_timestamps(self, header: str, time_format: str, timezone: str | int | None = None,
                      *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Given a header whose contents contain integer timestamps since the epoch, convert them to human-readable
    date/time strings for a given format. Each cell is replaced in place with the result of
    TimeUtil.millis_to_format.

    :param header: The header of the column to update the contents of.
    :param time_format: The time format to convert the timestamp to. See TimeUtil for specifics.
    :param timezone: The timezone handed to TimeUtil.millis_to_format for the conversion. If not specified, uses
        whatever timezone was set as the default by the TimeUtil class elsewhere in your project.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    def to_text(index: int, row: dict[str, Any]) -> None:
        row[header] = TimeUtil.millis_to_format(row.get(header), time_format, timezone)

    self.for_each(to_text, whitelist=whitelist, blacklist=blacklist)
408
+
409
def update_dates(self, header: str, time_format: str, timezone: str | int | None = None,
                 *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Given a header whose contents contain human-readable date/time strings for a given format, convert them to
    integer timestamps since the epoch. Each cell is replaced in place with the result of
    TimeUtil.format_to_millis.

    :param header: The header of the column to update the contents of.
    :param time_format: The time format to convert the timestamp from. See TimeUtil for specifics.
    :param timezone: The timezone of the time point to convert from. If not specified, uses whatever timezone was
        set as the default by the TimeUtil class elsewhere in your project.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    def to_millis(index: int, row: dict[str, Any]) -> None:
        row[header] = TimeUtil.format_to_millis(row.get(header), time_format, timezone)

    self.for_each(to_millis, whitelist=whitelist, blacklist=blacklist)
428
+
429
def update_lists(self, header: str, separator: str = ",",
                 *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Given a header whose contents contain strings representing a list of values, convert them to a list of strings
    by splitting the contents on a separator. Cells are replaced in place.

    :param header: The header of the column to update the contents of.
    :param separator: The separator to split the cell values on.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    def split_cell(index: int, row: dict[str, Any]) -> None:
        pieces: list[str] = row.get(header).split(separator)
        row[header] = pieces

    self.for_each(split_cell, whitelist=whitelist, blacklist=blacklist)
446
+
447
def replace_values(self, header: str, replacements: dict[Any, Any],
                   *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    For every cell under a header, if the value in the cell is equivalent to a key in the replacements dict, then
    set that cell to the value for that key. Cells whose value is not a key are left untouched.

    :param header: The header of the column to update the contents of.
    :param replacements: A dictionary of values to replace to values to replace them with.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    def swap(index: int, row: dict[str, Any]) -> None:
        current: Any = row.get(header)
        if current not in replacements:
            return
        row[header] = replacements[current]

    self.for_each(swap, whitelist=whitelist, blacklist=blacklist)
466
+
467
def expand_values(self, header: str, expansions: dict[str, str],
                  *, whitelist: FilterList = None, blacklist: FilterList = None) -> None:
    """
    Given a header whose contents contain strings, if the string contains any of the keys in the expansions dict,
    replace that substring of the cell with the value for that key. For example, a cell that contains "horse" with
    a dict {"r": "u"} would result in a cell being updated to read "house".

    Expansions are applied one after another in the dict's iteration order, each working on the result of the
    previous one; e.g. "horse" with the dict {"r": "u", "u": "r"} would replace the "r" with "u" and would then
    replace that "u" with "r", resulting in no change.

    :param header: The header of the column to update the contents of.
    :param expansions: A dictionary of substrings to replace to the text to replace them with.
    :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
        description for the forms that a whitelist can take.
    :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
        description for the forms that a blacklist can take.
    """
    def expand(index: int, row: dict[str, Any]) -> None:
        text: str = row.get(header)
        for target, substitute in expansions.items():
            if target in text:
                text = text.replace(target, substitute)
        row[header] = text

    self.for_each(expand, whitelist=whitelist, blacklist=blacklist)
492
+
493
def match_records(self, header: str, field: str, records: list[SapioRecord]) -> dict[SapioRecord, dict[str, Any]]:
    """
    Match a list of records to rows in the file given a header and field to match on.

    The expectation is that no two records have the same value for the given field and that no two rows in the file
    map to the same record. It is allowable that a record may have no matching row in the file, and that a row may
    have no matching record.

    :param header: The header of the column to check against.
    :param field: The data field name on the records to check against.
    :param records: The records to map to the file rows.
    :return: A dict of record to the row where that record's field value matches the row's value under the header.
    :raises SapioException: If more than one row in the file matches the same record.
    """
    # NOTE(review): map_by_unique_field presumably enforces the one-record-per-value expectation; confirm in
    # RecordHandler.
    mapped_records: dict[Any, SapioRecord] = RecordHandler.map_by_unique_field(records, field)
    matches: dict[SapioRecord, dict[str, Any]] = {}
    for row in self.file_data:
        value: Any = row.get(header)
        record: SapioRecord | None = mapped_records.get(value)
        # Compare against None explicitly so that a matched record is never dropped because of its truthiness.
        if record is None:
            continue
        if record in matches:
            # An earlier row already claimed this record, i.e. the value is duplicated across rows of the file.
            raise SapioException(f"The header {header} value {value} appears in multiple rows of the file, so "
                                 f"more than one row matches the same record.")
        matches[record] = row
    return matches
516
+
517
def update_records(self, match_header: str, match_field: str, records: list[SapioRecord],
                   header_to_fields: dict[str, str]) -> None:
    """
    Copy values from the file onto the records that match its rows.

    Records are paired with rows by match_records using the given match header and match field. For every matched
    record, each header in header_to_fields is read from the matching row and written to the mapped field on the
    record. Records that have no matching row are not touched.

    The expectation is that no two records have the same value for the match field and that no two rows in the file
    map to the same record.

    :param match_header: The header of the column to match against.
    :param match_field: The data field name on the records to match against.
    :param records: The records to update.
    :param header_to_fields: A dict of file header to the name of the record field that the header's value should
        be written to.
    """
    matched_rows: dict[SapioRecord, dict[str, Any]] = self.match_records(match_header, match_field, records)
    for matched_record, matched_row in matched_rows.items():
        for file_header, record_field in header_to_fields.items():
            new_value: Any = matched_row.get(file_header)
            matched_record.set_field_value(record_field, new_value)
535
+
536
def find_missing_values(self, header: str, field: str, records: list[SapioRecord]) -> list[int]:
    """
    Find the rows of the file whose value under a header does not appear on any of the given records.

    Each record's value for the given field is collected, and every row whose value under the header is not among
    those collected values is reported by its index.

    :param header: The header of the column to check against.
    :param field: The data field name on the records to check against.
    :param records: The records whose field values are compared with the file.
    :return: A list of row indices for rows that contain a value that isn't present in the given records.
    """
    known_values: list[Any] = [record.get_field_value(field) for record in records]
    return [row_index for row_index, file_row in enumerate(self.file_data)
            if file_row.get(header) not in known_values]
552
+
553
+ def get_differences(self, diff_check: list[dict[str, Any]], headers: list[str]) -> dict[str, list[int]]:
554
+ """
555
+ Given a list of dictionaries and a list of headers to check, return an index of every row where the values
556
+ in the file and the values in the given list differ. The given list should be the same size as the file. Rows
557
+ in the file are compared against the matching index in the diff check list.
558
+
559
+ :param diff_check: A list of dictionaries similar to the list used to initialize this FileDataHandler. The
560
+ number of elements in this list should be equivalent to the number of elements in hte initializing list.
561
+ This list's dicts do not necessarily need to have all the same headers as the initializing dicts, but both
562
+ should have the headers that are in the headers parameter.
563
+ :param headers: A list of the specific headers that should be difference checked.
564
+ :return: A dictionary of headers to a list of indices for the rows that differed between the file and input.
565
+ """
566
+ differences: dict[str, list[int]] = {}
567
+ for i, (row, other_row) in enumerate(zip(self.file_data, diff_check, strict=True)):
568
+ for header in headers:
569
+ if row.get(header) != other_row.get(header):
570
+ differences.setdefault(header, []).append(i)
571
+ return differences
572
+
573
def get_differences_for_index(self, index: int, diff_check: dict[str, Any], headers: list[str]) -> list[str]:
    """
    Compare a single row of the file against a dictionary and report which of the given headers differ.

    The row is fetched with get_row for the given index, and only the requested headers are compared.

    :param index: The index of the row to check against the given dict.
    :param diff_check: A dictionary similar to the dictionaries in the list used to initialize this FileDataHandler.
        This dict does not necessarily need to have all the same headers as the initializing dicts, but both
        should have the headers that are in the headers parameter.
    :param headers: A list of the specific headers that should be difference checked.
    :return: A list of headers that differed between the file and input, in the order they appear in headers.
    """
    file_row: dict[str, Any] = self.get_row(index)
    return [name for name in headers if file_row.get(name) != diff_check.get(name)]
591
+
592
def get_differences_for_record(self, row: dict[str, Any] | int, record: SapioRecord,
                               field_mappings: dict[str, str]) -> list[str]:
    """
    Compare a row of the file against a record and report the headers whose values disagree.

    Each header in the mapping is read from the row and compared with the record's value for the mapped field.

    :param row: A row from the file, either as a dictionary or an integer for the row's index. An integer is
        resolved to a row with get_row.
    :param record: A record to compare against.
    :param field_mappings: A dictionary that maps header names in the file to field names on the record.
    :return: A list of file headers where the file and record have different values.
    """
    file_row: dict[str, Any] = self.get_row(row) if isinstance(row, int) else row
    return [file_header for file_header, record_field in field_mappings.items()
            if file_row.get(file_header) != record.get_field_value(record_field)]
611
+
612
+ @staticmethod
613
+ def skip_row(i: int, row: dict[str, Any], whitelist: FilterList, blacklist: FilterList) -> bool:
614
+ """
615
+ Determine whether a row should be skipped given whitelist and blacklist filters.
616
+
617
+ :param i: The index of the row.
618
+ :param row: The file data of the row.
619
+ :param whitelist: If a row doesn't match the whitelist, it will be skipped over. See the FilterList alias
620
+ description for the forms that a whitelist can take.
621
+ :param blacklist: If a row matches the blacklist, it will be skipped over. See the FilterList alias
622
+ description for the forms that a blacklist can take.
623
+ :return: Whether the row should be skipped.
624
+ """
625
+ if whitelist is not None:
626
+ if isinstance(whitelist, (list, range)):
627
+ return i not in whitelist
628
+ return whitelist(i, row)
629
+ if blacklist is not None:
630
+ if isinstance(blacklist, (list, range)):
631
+ return i in blacklist
632
+ return blacklist(i, row)
633
+ return False