sapiopycommons 2025.6.19a564__py3-none-any.whl → 2026.1.22a847__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sapiopycommons/ai/__init__.py +0 -0
  2. sapiopycommons/ai/agent_service_base.py +2051 -0
  3. sapiopycommons/ai/converter_service_base.py +163 -0
  4. sapiopycommons/ai/external_credentials.py +131 -0
  5. sapiopycommons/ai/protoapi/agent/agent_pb2.py +87 -0
  6. sapiopycommons/ai/protoapi/agent/agent_pb2.pyi +282 -0
  7. sapiopycommons/ai/protoapi/agent/agent_pb2_grpc.py +154 -0
  8. sapiopycommons/ai/protoapi/agent/entry_pb2.py +49 -0
  9. sapiopycommons/ai/protoapi/agent/entry_pb2.pyi +40 -0
  10. sapiopycommons/ai/protoapi/agent/entry_pb2_grpc.py +24 -0
  11. sapiopycommons/ai/protoapi/agent/item/item_container_pb2.py +61 -0
  12. sapiopycommons/ai/protoapi/agent/item/item_container_pb2.pyi +181 -0
  13. sapiopycommons/ai/protoapi/agent/item/item_container_pb2_grpc.py +24 -0
  14. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2.py +41 -0
  15. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2.pyi +36 -0
  16. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2_grpc.py +24 -0
  17. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2.py +51 -0
  18. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2.pyi +59 -0
  19. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2_grpc.py +24 -0
  20. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2.py +123 -0
  21. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2.pyi +599 -0
  22. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2_grpc.py +24 -0
  23. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2.py +59 -0
  24. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2.pyi +68 -0
  25. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2_grpc.py +149 -0
  26. sapiopycommons/ai/protoapi/pipeline/script/script_pb2.py +69 -0
  27. sapiopycommons/ai/protoapi/pipeline/script/script_pb2.pyi +109 -0
  28. sapiopycommons/ai/protoapi/pipeline/script/script_pb2_grpc.py +153 -0
  29. sapiopycommons/ai/protoapi/pipeline/step_output_pb2.py +49 -0
  30. sapiopycommons/ai/protoapi/pipeline/step_output_pb2.pyi +56 -0
  31. sapiopycommons/ai/protoapi/pipeline/step_output_pb2_grpc.py +24 -0
  32. sapiopycommons/ai/protoapi/pipeline/step_pb2.py +43 -0
  33. sapiopycommons/ai/protoapi/pipeline/step_pb2.pyi +44 -0
  34. sapiopycommons/ai/protoapi/pipeline/step_pb2_grpc.py +24 -0
  35. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2.py +39 -0
  36. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2.pyi +33 -0
  37. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2_grpc.py +24 -0
  38. sapiopycommons/ai/protobuf_utils.py +583 -0
  39. sapiopycommons/ai/request_validation.py +561 -0
  40. sapiopycommons/ai/server.py +152 -0
  41. sapiopycommons/ai/test_client.py +534 -0
  42. sapiopycommons/callbacks/callback_util.py +53 -24
  43. sapiopycommons/eln/experiment_handler.py +12 -5
  44. sapiopycommons/files/assay_plate_reader.py +93 -0
  45. sapiopycommons/files/file_text_converter.py +207 -0
  46. sapiopycommons/files/file_util.py +128 -1
  47. sapiopycommons/files/temp_files.py +82 -0
  48. sapiopycommons/flowcyto/flow_cyto.py +2 -24
  49. sapiopycommons/general/accession_service.py +2 -28
  50. sapiopycommons/general/aliases.py +4 -1
  51. sapiopycommons/general/macros.py +172 -0
  52. sapiopycommons/general/time_util.py +199 -4
  53. sapiopycommons/multimodal/multimodal.py +2 -24
  54. sapiopycommons/recordmodel/record_handler.py +200 -111
  55. sapiopycommons/rules/eln_rule_handler.py +3 -0
  56. sapiopycommons/rules/on_save_rule_handler.py +3 -0
  57. sapiopycommons/webhook/webhook_handlers.py +6 -4
  58. sapiopycommons/webhook/webservice_handlers.py +1 -1
  59. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/METADATA +2 -2
  60. sapiopycommons-2026.1.22a847.dist-info/RECORD +113 -0
  61. sapiopycommons-2025.6.19a564.dist-info/RECORD +0 -68
  62. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/WHEEL +0 -0
  63. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/licenses/LICENSE +0 -0
@@ -780,7 +780,7 @@ class CallbackUtil:
780
780
  # FR-47690: Set default values for fields that aren't present.
781
781
  for row in values:
782
782
  for field in fields:
783
- if field.data_field_name not in values:
783
+ if field.data_field_name not in row:
784
784
  row[field.data_field_name] = field.default_value
785
785
 
786
786
  # Convert the group_by parameter to a field name.
@@ -858,9 +858,9 @@ class CallbackUtil:
858
858
  raise SapioException("No records provided.")
859
859
  data_type: str = AliasUtil.to_singular_data_type_name(records)
860
860
  if index_field is not None:
861
- field_map_list: list[FieldMap] = self.__get_indexed_field_maps(records, index_field)
861
+ field_map_list: list[FieldMap] = self.__get_indexed_field_maps(records, index_field, True)
862
862
  else:
863
- field_map_list: list[FieldMap] = AliasUtil.to_field_map_list(records)
863
+ field_map_list: list[FieldMap] = AliasUtil.to_field_map_list(records, True)
864
864
 
865
865
  # Convert the group_by parameter to a field name.
866
866
  if group_by is not None:
@@ -882,6 +882,18 @@ class CallbackUtil:
882
882
  temp_dt = self.__temp_dt_from_field_names(data_type, fields, None, default_modifier, field_modifiers)
883
883
  temp_dt.record_image_assignable = bool(image_data)
884
884
 
885
+ # PR-47894: If the RecordId field is not present in the layout, then it should not be included in the field
886
+ # maps, as otherwise selection list fields can break.
887
+ remove_record_id: bool = True
888
+ for field_def in temp_dt.get_field_def_list():
889
+ if field_def.data_field_name == "RecordId":
890
+ remove_record_id = False
891
+ break
892
+ if remove_record_id:
893
+ for field_map in field_map_list:
894
+ if "RecordId" in field_map:
895
+ del field_map["RecordId"]
896
+
885
897
  # Send the request to the user.
886
898
  request = TableEntryDialogRequest(title, msg, temp_dt, field_map_list,
887
899
  record_image_data_list=image_data, group_by_field=group_by,
@@ -1765,8 +1777,11 @@ class CallbackUtil:
1765
1777
  blank_result_handling = BlankResultHandling.REPEAT
1766
1778
  def not_blank_func(r: list[DataRecord]) -> bool:
1767
1779
  return bool(r)
1768
- return self.__send_dialog_blank_results(request, self.callback.show_input_selection_dialog, not_blank_func,
1769
- blank_result_handling, repeat_message, cancel_message)
1780
+ response: list[DataRecord] = self.__send_dialog_blank_results(request,
1781
+ self.callback.show_input_selection_dialog,
1782
+ not_blank_func, blank_result_handling,
1783
+ repeat_message, cancel_message)
1784
+ return self.rec_handler.wrap_models(response, wrapper_type)
1770
1785
 
1771
1786
  # FR-47690: Deprecated the require_authentication parameter.
1772
1787
  # noinspection PyUnusedLocal
@@ -1812,7 +1827,8 @@ class CallbackUtil:
1812
1827
  return response
1813
1828
 
1814
1829
  def request_file(self, title: str, exts: Iterable[str] | None = None,
1815
- show_image_editor: bool = False, show_camera_button: bool = False) -> tuple[str, bytes]:
1830
+ show_image_editor: bool = False, show_camera_button: bool = False,
1831
+ *, enforce_file_extensions: bool = True) -> tuple[str, bytes]:
1816
1832
  """
1817
1833
  Request a single file from the user.
1818
1834
 
@@ -1822,6 +1838,8 @@ class CallbackUtil:
1822
1838
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1823
1839
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1824
1840
  rather than selecting an existing file.
1841
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1842
+ enforced. If false, then the user may upload any file type.
1825
1843
  :return: The file name and bytes of the uploaded file.
1826
1844
  """
1827
1845
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1841,11 +1859,12 @@ class CallbackUtil:
1841
1859
  file_path: str = self.__send_dialog(request, self.callback.show_file_dialog, data_sink=do_consume)
1842
1860
 
1843
1861
  # Verify that each of the file given matches the expected extension(s).
1844
- self.__verify_file(file_path, sink.data, exts)
1862
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1845
1863
  return file_path, sink.data
1846
1864
 
1847
1865
  def request_files(self, title: str, exts: Iterable[str] | None = None,
1848
- show_image_editor: bool = False, show_camera_button: bool = False) -> dict[str, bytes]:
1866
+ show_image_editor: bool = False, show_camera_button: bool = False,
1867
+ *, enforce_file_extensions: bool = True) -> dict[str, bytes]:
1849
1868
  """
1850
1869
  Request multiple files from the user.
1851
1870
 
@@ -1855,6 +1874,8 @@ class CallbackUtil:
1855
1874
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1856
1875
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1857
1876
  rather than selecting an existing file.
1877
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1878
+ enforced. If false, then the user may upload any file type.
1858
1879
  :return: A dictionary of file name to file bytes for each file the user uploaded.
1859
1880
  """
1860
1881
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1870,7 +1891,7 @@ class CallbackUtil:
1870
1891
  for file_path in file_paths:
1871
1892
  sink = InMemoryRecordDataSink(self.user)
1872
1893
  sink.consume_client_callback_file_path_data(file_path)
1873
- self.__verify_file(file_path, sink.data, exts)
1894
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1874
1895
  ret_dict.update({file_path: sink.data})
1875
1896
 
1876
1897
  return ret_dict
@@ -1887,16 +1908,17 @@ class CallbackUtil:
1887
1908
  """
1888
1909
  if file_path is None or len(file_path) == 0 or file_bytes is None or len(file_bytes) == 0:
1889
1910
  raise SapioUserErrorException("Empty file provided or file unable to be read.")
1890
- if allowed_extensions:
1891
- matches: bool = False
1892
- for ext in allowed_extensions:
1893
- # FR-47690: Changed to a case-insensitive match.
1894
- if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1895
- matches = True
1896
- break
1897
- if matches is False:
1898
- raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1899
- + (",".join(allowed_extensions)))
1911
+ if not allowed_extensions:
1912
+ return
1913
+ matches: bool = False
1914
+ for ext in allowed_extensions:
1915
+ # FR-47690: Changed to a case-insensitive match.
1916
+ if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1917
+ matches = True
1918
+ break
1919
+ if not matches:
1920
+ raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1921
+ + (",".join(allowed_extensions)))
1900
1922
 
1901
1923
  def write_file(self, file_name: str, file_data: str | bytes) -> None:
1902
1924
  """
@@ -1918,7 +1940,8 @@ class CallbackUtil:
1918
1940
  self.write_file(zip_name, FileUtil.zip_files(files))
1919
1941
 
1920
1942
  @staticmethod
1921
- def __get_indexed_field_maps(records: Iterable[SapioRecord], index_field: str) -> list[FieldMap]:
1943
+ def __get_indexed_field_maps(records: Iterable[SapioRecord], index_field: str, include_record_id: bool = False) \
1944
+ -> list[FieldMap]:
1922
1945
  """
1923
1946
  For dialogs that accept multiple records, we may want to be able to match the returned results back to the
1924
1947
  records that they're for. In this case, we need to add an index to each record so that we can match them back
@@ -1928,12 +1951,13 @@ class CallbackUtil:
1928
1951
  :param records: The records to return indexed field maps of.
1929
1952
  :param index_field: The name of the field to use as the index. Make sure that this field doesn't exist on the
1930
1953
  records, as then it will overwrite the existing value.
1954
+ :param include_record_id: Whether to include the RecordId field in the field maps.
1931
1955
  :return: A list of field maps for the records, with an index field added to each. The value of the index on
1932
1956
  each field map is the record's record ID (even if it's a record model with a negative ID).
1933
1957
  """
1934
1958
  ret_val: list[FieldMap] = []
1935
1959
  for record in records:
1936
- field_map: FieldMap = AliasUtil.to_field_map(record)
1960
+ field_map: FieldMap = AliasUtil.to_field_map(record, include_record_id)
1937
1961
  field_map[index_field] = AliasUtil.to_record_id(record)
1938
1962
  ret_val.append(field_map)
1939
1963
  return ret_val
@@ -1974,7 +1998,10 @@ class CallbackUtil:
1974
1998
  if field_def.key_field:
1975
1999
  field_def = modifier.modify_field(field_def)
1976
2000
  builder.add_field(field_def, column, span)
1977
- return builder.get_temporary_data_type()
2001
+ # PR-47917: Set fill_view to false on the layout of temp data types created by CallbackUtil.
2002
+ temp_dt = builder.get_temporary_data_type()
2003
+ temp_dt.data_type_layout.fill_view = False
2004
+ return temp_dt
1978
2005
 
1979
2006
  def __temp_dt_from_field_names(self, data_type: str, fields: Iterable[FieldIdentifier | FieldFilterCriteria],
1980
2007
  column_positions: dict[str, tuple[int, int]] | None,
@@ -2045,8 +2072,10 @@ class CallbackUtil:
2045
2072
  modifier: FieldModifier = field_modifiers.get(field_name, default_modifier)
2046
2073
  builder.add_field(modifier.modify_field(field_def), current_column, span)
2047
2074
  current_column += span
2048
-
2049
- return builder.get_temporary_data_type()
2075
+ # PR-47917: Set fill_view to false on the layout of temp data types created by CallbackUtil.
2076
+ temp_dt = builder.get_temporary_data_type()
2077
+ temp_dt.data_type_layout.fill_view = False
2078
+ return temp_dt
2050
2079
 
2051
2080
  # CR-47309: Allow layouts to be provided in place of field names for record dialogs.
2052
2081
  def __temp_dt_from_layout(self, data_type: str, layout: DataTypeLayoutIdentifier,
@@ -206,12 +206,11 @@ class ExperimentHandler:
206
206
  else:
207
207
  user = context
208
208
  context = None
209
- if context is not None and context.eln_experiment is not None and experiment is None:
210
- experiment = context.eln_experiment
211
209
  # FR-46495 - Allow the init function of ExperimentHandler to take in an ElnExperiment that is separate from the
212
210
  # context.
213
211
  # CR-37038 - Allow other experiment object types to be provided. Convert them all down to ElnExperiment.
214
- if (context is None or context.eln_experiment is None) and experiment is not None:
212
+ # PR-47793 - Fix cases where both a SapioWebhookContext and an experiment parameter are provided.
213
+ if experiment is not None:
215
214
  eln_manager = DataMgmtServer.get_eln_manager(user)
216
215
  # If this object is already an ElnExperiment, do nothing.
217
216
  if isinstance(experiment, ElnExperiment):
@@ -227,13 +226,19 @@ class ExperimentHandler:
227
226
  raise SapioException(f"No experiment with notebook ID {notebook_id} located in the system.")
228
227
  # If this object is a record, assume it is an experiment record that we can query the system with.
229
228
  else:
230
- record_id: int = AliasUtil.to_record_ids([experiment])[0]
229
+ record_id: int = AliasUtil.to_record_id(experiment)
231
230
  experiment: ElnExperiment = eln_manager.get_eln_experiment_by_record_id(record_id)
232
231
  if not experiment:
233
232
  raise SapioException(f"No experiment with record ID {record_id} located in the system.")
233
+ elif context is not None and context.eln_experiment is not None:
234
+ experiment = context.eln_experiment
235
+
234
236
  if experiment is None:
235
237
  raise SapioException("Cannot initialize ExperimentHandler. No ELN Experiment found in the provided "
236
238
  "parameters.")
239
+ elif not isinstance(experiment, ElnExperiment):
240
+ raise SapioException("Cannot initialize ExperimentHandler. The experiment variable is not an "
241
+ "ElnExperiment!")
237
242
 
238
243
  return user, context, experiment
239
244
 
@@ -1425,7 +1430,9 @@ class ExperimentHandler:
1425
1430
  :return: The map of options for the input step.
1426
1431
  """
1427
1432
  step: ElnEntryStep = self.get_step(step)
1428
- if step not in self._step_options:
1433
+ # PR-47796: Fix the get_step_options function making a webservice query every time it is called instead of
1434
+ # properly checking its cache of entry options.
1435
+ if step.get_id() not in self._step_options:
1429
1436
  self._step_options.update(ExperimentReportUtil.get_experiment_entry_options(self.user,
1430
1437
  self.get_all_steps()))
1431
1438
  return self._step_options[step.get_id()]
@@ -0,0 +1,93 @@
1
+ import base64
2
+ import dataclasses
3
+ from typing import Any
4
+
5
+ from databind.core.dataclasses import dataclass
6
+ from databind.json import loads
7
+ from sapiopylib.rest.utils.singletons import SapioContextManager
8
+
9
+
10
+ @dataclasses.dataclass
11
+ class ProcessAssayPlateRequest:
12
+ """
13
+ A request to process the results of assay plate reader with a configuration set in Sapio.
14
+
15
+ Attributes:
16
+ num_rows (int): The number of rows in the plate.
17
+ num_columns (int): The number of columns in the plate.
18
+ plate_ids_in_context (list[str]): List of plate IDs that are in context for this request.
19
+ filename (str): The name of the file containing the assay data.
20
+ file_data (bytes): The binary content of the file.
21
+ plate_reader_config_name (str): The name of the plate reader configuration to use.
22
+ """
23
+ num_rows: int
24
+ num_columns: int
25
+ plate_ids_in_context: list[str] | None
26
+ filename: str
27
+ file_data: bytes
28
+ plate_reader_config_name: str
29
+
30
+ def to_json(self) -> dict[str, Any]:
31
+ return {
32
+ "numRows": self.num_rows,
33
+ "numCols": self.num_columns,
34
+ "plateIdsInContext": self.plate_ids_in_context,
35
+ "fileName": self.filename,
36
+ "fileDataBase64": base64.b64encode(self.file_data).decode('utf-8'),
37
+ "plateReaderName": self.plate_reader_config_name
38
+ }
39
+
40
+
41
@dataclass
class AssayPlateResultIdent:
    """
    Identifies a single plate's block of results within an assay plate reader file.
    """
    # The ID of the plate this result block belongs to.
    plateId: str
    # The channel ID or block label this result was read under — presumably instrument-specific; verify.
    channelIdOrBlock: str
    # NOTE(review): looks like this is the elapsed time of a kinetic read in seconds, and None for
    # non-kinetic reads — confirm against the server-side serializer.
    kineticAssaySeconds: float | None
46
+
47
+
48
@dataclass
class AssayResultDatum:
    """
    Describes the data received from an assay plate reader.
    Most of the time, the data is a single value, but sometimes it can be multiple values, especially for kinetic data.
    """
    # FIX: this constant was previously declared as `DEFAULT_PROPERTY_NAME: str = "read"`. An annotated
    # class attribute with a default value is treated by dataclass machinery as a *field* with a default,
    # which both makes the constant an instance attribute and places a defaulted field ahead of the
    # required fields below (a TypeError under the stdlib dataclass transform). Leaving the assignment
    # unannotated keeps it a plain class-level constant, which is the intent.
    DEFAULT_PROPERTY_NAME = "read"
    # Row position of the well this datum was read from (e.g. "A").
    rowPosition: str
    # Column position of the well this datum was read from (e.g. "1").
    columnPosition: str
    # Numeric reads keyed by property name; single-value reads use DEFAULT_PROPERTY_NAME — TODO confirm.
    valueByPropertyName: dict[str, float]
    # Text reads keyed by property name.
    textValueByPropertyName: dict[str, str]
59
+
60
+
61
@dataclass
class AssayPlateResult:
    """
    Assay plate load result for a single plate in a file. A file can contain more than one of these
    results if it holds multiple plates of data in a single file.
    """
    # Identifies which plate/channel (and kinetic time point, if any) this result block is for.
    resultIdent: AssayPlateResultIdent
    # Plate dimensions for this result block.
    numRows: int
    numColumns: int
    # One datum per well read from the plate.
    resultDatum: list[AssayResultDatum]
70
+
71
+
72
@dataclass
class AssayFileLoadResult:
    """
    The entire top-level file loading result for an assay plate reader file.
    """
    # The name of the file that was processed.
    filename: str
    # One entry per plate of data found in the file.
    plateResultList: list[AssayPlateResult]
79
+
80
+
81
class AssayPlateReader(SapioContextManager):
    """
    This class contains services for Sapio Assay Plate Reader.
    """

    def process_plate_reader_data(self, request: ProcessAssayPlateRequest) -> AssayFileLoadResult:
        """
        Processes the assay plate reader data using the provided request into a structured result
        using configuration defined in Sapio.

        :param request: The plate processing request to send to the server.
        :return: The structured file load result parsed from the server's JSON response.
        """
        # POST the JSON payload to the plate reader endpoint, fail fast on an error status, then
        # deserialize the response body into the result dataclass.
        response = self.user.plugin_post("assayplatereader/process", payload=request.to_json())
        self.user.raise_for_status(response)
        result: AssayFileLoadResult = loads(response.text, AssayFileLoadResult)
        return result
@@ -0,0 +1,207 @@
1
+ import io
2
+ import os
3
+ import tempfile
4
+ from enum import Enum, auto
5
+
6
class FileType(Enum):
    """
    Supported file types for conversion.

    UNKNOWN is the fallback returned by the converter's MIME-type and file-extension lookup
    helpers when the input is not recognized.
    """
    # Plain-text formats (decoded directly as UTF-8 by the converter).
    TXT = auto()
    MD = auto()
    CSV = auto()
    # Microsoft Office formats.
    DOC = auto()
    DOCX = auto()
    XLS = auto()
    XLSX = auto()
    PPT = auto()
    PPTX = auto()
    # Portable Document Format.
    PDF = auto()
    # Fallback for unrecognized types.
    UNKNOWN = auto()
19
+
20
+
21
class FileToTextConverter:
    """
    A class for converting various file types to raw text.

    Office formats, PDFs, and legacy binary formats are parsed with lazily-imported third-party
    libraries (python-docx, openpyxl, xlrd, python-pptx, PyMuPDF, textract), so each dependency is
    only required for the file types actually parsed.
    """
    @staticmethod
    def mime_type_to_enum(mime_type: str) -> FileType:
        """
        Converts a MIME type to a FileType enum.

        :param mime_type: The MIME type string to convert.
        :return: The corresponding FileType enum, or UNKNOWN if not recognized.
        """
        if not mime_type or not mime_type.strip():
            return FileType.UNKNOWN

        # FIX: MIME types are case-insensitive and may carry parameters (e.g.
        # "text/plain; charset=utf-8" per RFC 2045). Reduce the input to its bare lowercase
        # type/subtype before the lookup; every value that previously matched still matches.
        normalized: str = mime_type.split(";")[0].strip().lower()

        mime_map = {
            "text/plain": FileType.TXT,
            "text/markdown": FileType.MD,
            "text/csv": FileType.CSV,
            "application/msword": FileType.DOC,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
            "application/vnd.ms-excel": FileType.XLS,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
            "application/vnd.ms-powerpoint": FileType.PPT,
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
            "application/pdf": FileType.PDF,
        }
        return mime_map.get(normalized, FileType.UNKNOWN)

    @staticmethod
    def file_extension_to_enum(file_path: str) -> FileType:
        """
        Converts a file path or extension to a FileType enum.

        :param file_path: The file path or extension to convert.
        :return: The corresponding FileType enum, or UNKNOWN if not recognized.
        """
        if not file_path or not file_path.strip():
            return FileType.UNKNOWN

        # Extract the file extension, removing the leading dot and making it lowercase.
        file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

        ext_map = {
            "txt": FileType.TXT,
            "md": FileType.MD,
            "csv": FileType.CSV,
            "doc": FileType.DOC,
            "docx": FileType.DOCX,
            "xls": FileType.XLS,
            "xlsx": FileType.XLSX,
            "ppt": FileType.PPT,
            "pptx": FileType.PPTX,
            "pdf": FileType.PDF,
        }
        return ext_map.get(file_extension, FileType.UNKNOWN)

    @classmethod
    def parse_file(cls, file_type: FileType, file_bytes: bytes) -> str | None:
        """
        Parses file bytes based on the FileType and returns the text content.

        :param file_type: The type of the file to parse.
        :param file_bytes: The raw bytes of the file to parse.
        :return: The text content of the file, or None if the file type is not supported or parsing fails.
        """
        if file_type is None or file_bytes is None:
            return None
        # An empty (but non-None) file parses to an empty string rather than None.
        if not file_bytes:
            return ""

        # Dispatch to the correct parser method.
        parser_map = {
            FileType.TXT: cls._parse_plain_text,
            FileType.MD: cls._parse_plain_text,
            FileType.CSV: cls._parse_plain_text,
            FileType.DOC: cls._parse_doc,
            FileType.DOCX: cls._parse_docx,
            FileType.XLS: cls._parse_xls,
            FileType.XLSX: cls._parse_xlsx,
            FileType.PPT: cls._parse_ppt,
            FileType.PPTX: cls._parse_pptx,
            FileType.PDF: cls._parse_pdf,
        }

        parser_func = parser_map.get(file_type)

        if parser_func:
            return parser_func(file_bytes)

        # FileType.UNKNOWN (or any unmapped member) is unsupported.
        return None

    @staticmethod
    def _parse_plain_text(file_bytes: bytes) -> str:
        """Decode plain-text content (TXT/MD/CSV) as UTF-8."""
        return file_bytes.decode('utf-8')

    @staticmethod
    def _run_textract(file_bytes: bytes, extension: str) -> str:
        """
        Helper to run textract on in-memory bytes by writing to a temp file.
        Note: textract may require external system dependencies.
        """
        import textract
        # FIX: with delete=True the temp file is still held open while textract re-opens it by
        # name, which fails on Windows (see tempfile.NamedTemporaryFile docs). Write, close, let
        # textract read it, then remove the file ourselves.
        temp_file = tempfile.NamedTemporaryFile(suffix=f".{extension}", delete=False)
        try:
            temp_file.write(file_bytes)
            temp_file.close()  # Ensure all bytes are flushed and the handle is released.
            return textract.process(temp_file.name).decode('utf-8')
        finally:
            os.remove(temp_file.name)

    @classmethod
    def _parse_doc(cls, file_bytes: bytes) -> str:
        """Parse a legacy .doc file via textract."""
        return cls._run_textract(file_bytes, 'doc')

    @staticmethod
    def _parse_docx(file_bytes: bytes) -> str:
        """Parse a .docx file, joining all non-blank paragraphs with newlines."""
        import docx
        with io.BytesIO(file_bytes) as stream:
            document = docx.Document(stream)
            return "\n".join(para.text for para in document.paragraphs if para.text.strip())

    @staticmethod
    def _parse_xls(file_bytes: bytes) -> str:
        """Parse a legacy .xls workbook: one "Sheet: <name>" header per sheet, tab-suffixed cells."""
        import xlrd
        workbook = xlrd.open_workbook(file_contents=file_bytes)
        text_parts = []
        for sheet in workbook.sheets():
            text_parts.append(f"Sheet: {sheet.name}\n")
            for row_idx in range(sheet.nrows):
                row_cells = []
                for col_idx in range(sheet.ncols):
                    cell_text = str(sheet.cell_value(row_idx, col_idx))
                    if cell_text.strip():
                        # Each non-blank cell is followed by a tab separator.
                        row_cells.append(cell_text + "\t")
                if row_cells:
                    text_parts.append("".join(row_cells))
                    text_parts.append("\n")
            text_parts.append("\n")
        return "".join(text_parts)

    @staticmethod
    def _parse_xlsx(file_bytes: bytes) -> str:
        """Parse a .xlsx workbook with the same layout conventions as _parse_xls."""
        import openpyxl
        with io.BytesIO(file_bytes) as stream:
            workbook = openpyxl.load_workbook(stream, read_only=True)
            # FIX: read-only workbooks keep the underlying stream/resources open until closed.
            try:
                text_parts = []
                for sheet in workbook.worksheets:
                    text_parts.append(f"Sheet: {sheet.title}\n")
                    for row in sheet.iter_rows():
                        row_cells = []
                        for cell in row:
                            cell_text = str(cell.value) if cell.value is not None else ""
                            if cell_text.strip():
                                row_cells.append(cell_text + "\t")
                        if row_cells:
                            text_parts.append("".join(row_cells))
                            text_parts.append("\n")
                    text_parts.append("\n")
                return "".join(text_parts)
            finally:
                workbook.close()

    @classmethod
    def _parse_ppt(cls, file_bytes: bytes) -> str:
        """Parse a legacy .ppt file via textract."""
        return cls._run_textract(file_bytes, 'ppt')

    @staticmethod
    def _parse_pptx(file_bytes: bytes) -> str:
        """Parse a .pptx file, joining the text of every non-blank text frame with newlines."""
        import pptx
        with io.BytesIO(file_bytes) as stream:
            presentation = pptx.Presentation(stream)
            text_parts = []
            for slide in presentation.slides:
                for shape in slide.shapes:
                    if shape.has_text_frame:
                        text = shape.text_frame.text
                        if text and text.strip():
                            text_parts.append(text)
            return "\n".join(text_parts)

    @staticmethod
    def _parse_pdf(file_bytes: bytes) -> str:
        """Parses a PDF file's bytes and extracts text using PyMuPDF."""
        import pymupdf
        text_parts = []
        with io.BytesIO(file_bytes) as stream:
            with pymupdf.open(stream=stream) as doc:
                for page in doc:
                    text_parts.append(page.get_text())
        return "\n".join(text_parts)
@@ -1,4 +1,7 @@
1
+ import gzip
1
2
  import io
3
+ import tarfile
4
+ import time
2
5
  import warnings
3
6
  import zipfile
4
7
 
@@ -322,7 +325,7 @@ class FileUtil:
322
325
  @staticmethod
323
326
  def zip_files(files: dict[str, str | bytes]) -> bytes:
324
327
  """
325
- Create a zip file for a collection of files.
328
+ Create a .zip file for a collection of files.
326
329
 
327
330
  :param files: A dictionary of file name to file data as a string or bytes.
328
331
  :return: The bytes for a zip file containing the input files.
@@ -335,6 +338,130 @@ class FileUtil:
335
338
  # throws an I/O exception.
336
339
  return zip_buffer.getvalue()
337
340
 
341
+ # FR-47422: Add a function for unzipping files that may have been zipped by the above function.
342
+ @staticmethod
343
+ def unzip_files(zip_file: bytes) -> dict[str, bytes]:
344
+ """
345
+ Decompress a .zip file from an in-memory bytes object and extracts all files into a dictionary.
346
+
347
+ :param zip_file: The bytes of the zip file to be decompressed.
348
+ :return: A dictionary of file name to file bytes for each file in the zip.
349
+ """
350
+ extracted_files: dict[str, bytes] = {}
351
+ with io.BytesIO(zip_file) as zip_buffer:
352
+ with zipfile.ZipFile(zip_buffer, "r") as zip_file:
353
+ for file_name in zip_file.namelist():
354
+ with zip_file.open(file_name) as file:
355
+ extracted_files[file_name] = file.read()
356
+ return extracted_files
357
+
358
+ # FR-47422: Add functions for compressing and decompressing .gz, .tar, and .tar.gz files.
359
+ @staticmethod
360
+ def gzip_file(file_data: bytes | str) -> bytes:
361
+ """
362
+ Create a .gz file for a single file.
363
+
364
+ :param file_data: The file data to be compressed as bytes or a string.
365
+ :return: The bytes of the gzip-compressed file.
366
+ """
367
+ return gzip.compress(file_data.encode() if isinstance(file_data, str) else file_data)
368
+
369
+ @staticmethod
370
+ def ungzip_file(gzip_file: bytes) -> bytes:
371
+ """
372
+ Decompress a .gz file.
373
+
374
+ :param gzip_file: The bytes of the gzip-compressed file.
375
+ :return: The decompressed file data as bytes.
376
+ """
377
+ return gzip.decompress(gzip_file)
378
+
379
+ @staticmethod
380
+ def tar_files(files: dict[str, str | bytes]) -> bytes:
381
+ """
382
+ Create a .tar file for a collection of files.
383
+
384
+ :param files: A dictionary of file name to file data as a string or bytes.
385
+ :return: The bytes for a tar file containing the input files.
386
+ """
387
+ with io.BytesIO() as tar_buffer:
388
+ with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
389
+ for name, data in files.items():
390
+ if isinstance(data, str):
391
+ data: bytes = data.encode('utf-8')
392
+
393
+ tarinfo = tarfile.TarInfo(name=name)
394
+ tarinfo.size = len(data)
395
+ tarinfo.mtime = int(time.time())
396
+
397
+ with io.BytesIO(data) as file:
398
+ tar.addfile(tarinfo=tarinfo, fileobj=file)
399
+
400
+ tar_buffer.seek(0)
401
+ return tar_buffer.getvalue()
402
+
403
+ @staticmethod
404
+ def untar_files(tar_file: bytes) -> dict[str, bytes]:
405
+ """
406
+ Decompress a .tar file from an in-memory bytes object and extracts all files into a dictionary.
407
+
408
+ :param tar_file: The bytes of the tar file to be decompressed.
409
+ :return: A dictionary of file name to file bytes for each file in the tar.
410
+ """
411
+ extracted_files: dict[str, bytes] = {}
412
+ with io.BytesIO(tar_file) as tar_buffer:
413
+ with tarfile.open(fileobj=tar_buffer, mode="r") as tar:
414
+ for member in tar.getmembers():
415
+ if member.isfile():
416
+ file_obj = tar.extractfile(member)
417
+ if file_obj:
418
+ with file_obj:
419
+ extracted_files[member.name] = file_obj.read()
420
+ return extracted_files
421
+
422
+ @staticmethod
423
+ def tar_gzip_files(files: dict[str, str | bytes]) -> bytes:
424
+ """
425
+ Create a .tar.gz file for a collection of files.
426
+
427
+ :param files: A dictionary of file name to file data as a string or bytes.
428
+ :return: The bytes for a tar.gz file containing the input files.
429
+ """
430
+ with io.BytesIO() as tar_buffer:
431
+ with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar:
432
+ for name, data in files.items():
433
+ if isinstance(data, str):
434
+ data: bytes = data.encode('utf-8')
435
+
436
+ tarinfo = tarfile.TarInfo(name=name)
437
+ tarinfo.size = len(data)
438
+ tarinfo.mtime = int(time.time())
439
+
440
+ with io.BytesIO(data) as file:
441
+ tar.addfile(tarinfo=tarinfo, fileobj=file)
442
+
443
+ tar_buffer.seek(0)
444
+ return tar_buffer.getvalue()
445
+
446
+ @staticmethod
447
+ def untar_gzip_files(tar_gzip_file: bytes) -> dict[str, bytes]:
448
+ """
449
+ Decompress a .tar.gz file from an in-memory bytes object and extracts all files into a dictionary.
450
+
451
+ :param tar_gzip_file: The bytes of the tar.gz file to be decompressed.
452
+ :return: A dictionary of file name to file bytes for each file in the tar.gz
453
+ """
454
+ extracted_files: dict[str, bytes] = {}
455
+ with io.BytesIO(tar_gzip_file) as tar_buffer:
456
+ with tarfile.open(fileobj=tar_buffer, mode="r:gz") as tar:
457
+ for member in tar.getmembers():
458
+ if member.isfile():
459
+ file_obj = tar.extractfile(member)
460
+ if file_obj:
461
+ with file_obj:
462
+ extracted_files[member.name] = file_obj.read()
463
+ return extracted_files
464
+
338
465
  # Deprecated functions:
339
466
 
340
467
  # FR-46097 - Add write file request shorthand functions to FileUtil.