sapiopycommons 2025.6.19a564__py3-none-any.whl → 2026.1.22a847__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sapiopycommons/ai/__init__.py +0 -0
  2. sapiopycommons/ai/agent_service_base.py +2051 -0
  3. sapiopycommons/ai/converter_service_base.py +163 -0
  4. sapiopycommons/ai/external_credentials.py +131 -0
  5. sapiopycommons/ai/protoapi/agent/agent_pb2.py +87 -0
  6. sapiopycommons/ai/protoapi/agent/agent_pb2.pyi +282 -0
  7. sapiopycommons/ai/protoapi/agent/agent_pb2_grpc.py +154 -0
  8. sapiopycommons/ai/protoapi/agent/entry_pb2.py +49 -0
  9. sapiopycommons/ai/protoapi/agent/entry_pb2.pyi +40 -0
  10. sapiopycommons/ai/protoapi/agent/entry_pb2_grpc.py +24 -0
  11. sapiopycommons/ai/protoapi/agent/item/item_container_pb2.py +61 -0
  12. sapiopycommons/ai/protoapi/agent/item/item_container_pb2.pyi +181 -0
  13. sapiopycommons/ai/protoapi/agent/item/item_container_pb2_grpc.py +24 -0
  14. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2.py +41 -0
  15. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2.pyi +36 -0
  16. sapiopycommons/ai/protoapi/externalcredentials/external_credentials_pb2_grpc.py +24 -0
  17. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2.py +51 -0
  18. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2.pyi +59 -0
  19. sapiopycommons/ai/protoapi/fielddefinitions/fields_pb2_grpc.py +24 -0
  20. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2.py +123 -0
  21. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2.pyi +599 -0
  22. sapiopycommons/ai/protoapi/fielddefinitions/velox_field_def_pb2_grpc.py +24 -0
  23. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2.py +59 -0
  24. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2.pyi +68 -0
  25. sapiopycommons/ai/protoapi/pipeline/converter/converter_pb2_grpc.py +149 -0
  26. sapiopycommons/ai/protoapi/pipeline/script/script_pb2.py +69 -0
  27. sapiopycommons/ai/protoapi/pipeline/script/script_pb2.pyi +109 -0
  28. sapiopycommons/ai/protoapi/pipeline/script/script_pb2_grpc.py +153 -0
  29. sapiopycommons/ai/protoapi/pipeline/step_output_pb2.py +49 -0
  30. sapiopycommons/ai/protoapi/pipeline/step_output_pb2.pyi +56 -0
  31. sapiopycommons/ai/protoapi/pipeline/step_output_pb2_grpc.py +24 -0
  32. sapiopycommons/ai/protoapi/pipeline/step_pb2.py +43 -0
  33. sapiopycommons/ai/protoapi/pipeline/step_pb2.pyi +44 -0
  34. sapiopycommons/ai/protoapi/pipeline/step_pb2_grpc.py +24 -0
  35. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2.py +39 -0
  36. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2.pyi +33 -0
  37. sapiopycommons/ai/protoapi/session/sapio_conn_info_pb2_grpc.py +24 -0
  38. sapiopycommons/ai/protobuf_utils.py +583 -0
  39. sapiopycommons/ai/request_validation.py +561 -0
  40. sapiopycommons/ai/server.py +152 -0
  41. sapiopycommons/ai/test_client.py +534 -0
  42. sapiopycommons/callbacks/callback_util.py +53 -24
  43. sapiopycommons/eln/experiment_handler.py +12 -5
  44. sapiopycommons/files/assay_plate_reader.py +93 -0
  45. sapiopycommons/files/file_text_converter.py +207 -0
  46. sapiopycommons/files/file_util.py +128 -1
  47. sapiopycommons/files/temp_files.py +82 -0
  48. sapiopycommons/flowcyto/flow_cyto.py +2 -24
  49. sapiopycommons/general/accession_service.py +2 -28
  50. sapiopycommons/general/aliases.py +4 -1
  51. sapiopycommons/general/macros.py +172 -0
  52. sapiopycommons/general/time_util.py +199 -4
  53. sapiopycommons/multimodal/multimodal.py +2 -24
  54. sapiopycommons/recordmodel/record_handler.py +200 -111
  55. sapiopycommons/rules/eln_rule_handler.py +3 -0
  56. sapiopycommons/rules/on_save_rule_handler.py +3 -0
  57. sapiopycommons/webhook/webhook_handlers.py +6 -4
  58. sapiopycommons/webhook/webservice_handlers.py +1 -1
  59. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/METADATA +2 -2
  60. sapiopycommons-2026.1.22a847.dist-info/RECORD +113 -0
  61. sapiopycommons-2025.6.19a564.dist-info/RECORD +0 -68
  62. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/WHEEL +0 -0
  63. {sapiopycommons-2025.6.19a564.dist-info → sapiopycommons-2026.1.22a847.dist-info}/licenses/LICENSE +0 -0
@@ -780,7 +780,7 @@ class CallbackUtil:
780
780
  # FR-47690: Set default values for fields that aren't present.
781
781
  for row in values:
782
782
  for field in fields:
783
- if field.data_field_name not in values:
783
+ if field.data_field_name not in row:
784
784
  row[field.data_field_name] = field.default_value
785
785
 
786
786
  # Convert the group_by parameter to a field name.
@@ -858,9 +858,9 @@ class CallbackUtil:
858
858
  raise SapioException("No records provided.")
859
859
  data_type: str = AliasUtil.to_singular_data_type_name(records)
860
860
  if index_field is not None:
861
- field_map_list: list[FieldMap] = self.__get_indexed_field_maps(records, index_field)
861
+ field_map_list: list[FieldMap] = self.__get_indexed_field_maps(records, index_field, True)
862
862
  else:
863
- field_map_list: list[FieldMap] = AliasUtil.to_field_map_list(records)
863
+ field_map_list: list[FieldMap] = AliasUtil.to_field_map_list(records, True)
864
864
 
865
865
  # Convert the group_by parameter to a field name.
866
866
  if group_by is not None:
@@ -882,6 +882,18 @@ class CallbackUtil:
882
882
  temp_dt = self.__temp_dt_from_field_names(data_type, fields, None, default_modifier, field_modifiers)
883
883
  temp_dt.record_image_assignable = bool(image_data)
884
884
 
885
+ # PR-47894: If the RecordId field is not present in the layout, then it should not be included in the field
886
+ # maps, as otherwise selection list fields can break.
887
+ remove_record_id: bool = True
888
+ for field_def in temp_dt.get_field_def_list():
889
+ if field_def.data_field_name == "RecordId":
890
+ remove_record_id = False
891
+ break
892
+ if remove_record_id:
893
+ for field_map in field_map_list:
894
+ if "RecordId" in field_map:
895
+ del field_map["RecordId"]
896
+
885
897
  # Send the request to the user.
886
898
  request = TableEntryDialogRequest(title, msg, temp_dt, field_map_list,
887
899
  record_image_data_list=image_data, group_by_field=group_by,
@@ -1765,8 +1777,11 @@ class CallbackUtil:
1765
1777
  blank_result_handling = BlankResultHandling.REPEAT
1766
1778
  def not_blank_func(r: list[DataRecord]) -> bool:
1767
1779
  return bool(r)
1768
- return self.__send_dialog_blank_results(request, self.callback.show_input_selection_dialog, not_blank_func,
1769
- blank_result_handling, repeat_message, cancel_message)
1780
+ response: list[DataRecord] = self.__send_dialog_blank_results(request,
1781
+ self.callback.show_input_selection_dialog,
1782
+ not_blank_func, blank_result_handling,
1783
+ repeat_message, cancel_message)
1784
+ return self.rec_handler.wrap_models(response, wrapper_type)
1770
1785
 
1771
1786
  # FR-47690: Deprecated the require_authentication parameter.
1772
1787
  # noinspection PyUnusedLocal
@@ -1812,7 +1827,8 @@ class CallbackUtil:
1812
1827
  return response
1813
1828
 
1814
1829
  def request_file(self, title: str, exts: Iterable[str] | None = None,
1815
- show_image_editor: bool = False, show_camera_button: bool = False) -> tuple[str, bytes]:
1830
+ show_image_editor: bool = False, show_camera_button: bool = False,
1831
+ *, enforce_file_extensions: bool = True) -> tuple[str, bytes]:
1816
1832
  """
1817
1833
  Request a single file from the user.
1818
1834
 
@@ -1822,6 +1838,8 @@ class CallbackUtil:
1822
1838
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1823
1839
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1824
1840
  rather than selecting an existing file.
1841
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1842
+ enforced. If false, then the user may upload any file type.
1825
1843
  :return: The file name and bytes of the uploaded file.
1826
1844
  """
1827
1845
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1841,11 +1859,12 @@ class CallbackUtil:
1841
1859
  file_path: str = self.__send_dialog(request, self.callback.show_file_dialog, data_sink=do_consume)
1842
1860
 
1843
1861
  # Verify that each of the file given matches the expected extension(s).
1844
- self.__verify_file(file_path, sink.data, exts)
1862
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1845
1863
  return file_path, sink.data
1846
1864
 
1847
1865
  def request_files(self, title: str, exts: Iterable[str] | None = None,
1848
- show_image_editor: bool = False, show_camera_button: bool = False) -> dict[str, bytes]:
1866
+ show_image_editor: bool = False, show_camera_button: bool = False,
1867
+ *, enforce_file_extensions: bool = True) -> dict[str, bytes]:
1849
1868
  """
1850
1869
  Request multiple files from the user.
1851
1870
 
@@ -1855,6 +1874,8 @@ class CallbackUtil:
1855
1874
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1856
1875
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1857
1876
  rather than selecting an existing file.
1877
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1878
+ enforced. If false, then the user may upload any file type.
1858
1879
  :return: A dictionary of file name to file bytes for each file the user uploaded.
1859
1880
  """
1860
1881
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1870,7 +1891,7 @@ class CallbackUtil:
1870
1891
  for file_path in file_paths:
1871
1892
  sink = InMemoryRecordDataSink(self.user)
1872
1893
  sink.consume_client_callback_file_path_data(file_path)
1873
- self.__verify_file(file_path, sink.data, exts)
1894
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1874
1895
  ret_dict.update({file_path: sink.data})
1875
1896
 
1876
1897
  return ret_dict
@@ -1887,16 +1908,17 @@ class CallbackUtil:
1887
1908
  """
1888
1909
  if file_path is None or len(file_path) == 0 or file_bytes is None or len(file_bytes) == 0:
1889
1910
  raise SapioUserErrorException("Empty file provided or file unable to be read.")
1890
- if allowed_extensions:
1891
- matches: bool = False
1892
- for ext in allowed_extensions:
1893
- # FR-47690: Changed to a case-insensitive match.
1894
- if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1895
- matches = True
1896
- break
1897
- if matches is False:
1898
- raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1899
- + (",".join(allowed_extensions)))
1911
+ if not allowed_extensions:
1912
+ return
1913
+ matches: bool = False
1914
+ for ext in allowed_extensions:
1915
+ # FR-47690: Changed to a case-insensitive match.
1916
+ if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1917
+ matches = True
1918
+ break
1919
+ if not matches:
1920
+ raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1921
+ + (",".join(allowed_extensions)))
1900
1922
 
1901
1923
  def write_file(self, file_name: str, file_data: str | bytes) -> None:
1902
1924
  """
@@ -1918,7 +1940,8 @@ class CallbackUtil:
1918
1940
  self.write_file(zip_name, FileUtil.zip_files(files))
1919
1941
 
1920
1942
  @staticmethod
1921
- def __get_indexed_field_maps(records: Iterable[SapioRecord], index_field: str) -> list[FieldMap]:
1943
+ def __get_indexed_field_maps(records: Iterable[SapioRecord], index_field: str, include_record_id: bool = False) \
1944
+ -> list[FieldMap]:
1922
1945
  """
1923
1946
  For dialogs that accept multiple records, we may want to be able to match the returned results back to the
1924
1947
  records that they're for. In this case, we need to add an index to each record so that we can match them back
@@ -1928,12 +1951,13 @@ class CallbackUtil:
1928
1951
  :param records: The records to return indexed field maps of.
1929
1952
  :param index_field: The name of the field to use as the index. Make sure that this field doesn't exist on the
1930
1953
  records, as then it will overwrite the existing value.
1954
+ :param include_record_id: Whether to include the RecordId field in the field maps.
1931
1955
  :return: A list of field maps for the records, with an index field added to each. The value of the index on
1932
1956
  each field map is the record's record ID (even if it's a record model with a negative ID).
1933
1957
  """
1934
1958
  ret_val: list[FieldMap] = []
1935
1959
  for record in records:
1936
- field_map: FieldMap = AliasUtil.to_field_map(record)
1960
+ field_map: FieldMap = AliasUtil.to_field_map(record, include_record_id)
1937
1961
  field_map[index_field] = AliasUtil.to_record_id(record)
1938
1962
  ret_val.append(field_map)
1939
1963
  return ret_val
@@ -1974,7 +1998,10 @@ class CallbackUtil:
1974
1998
  if field_def.key_field:
1975
1999
  field_def = modifier.modify_field(field_def)
1976
2000
  builder.add_field(field_def, column, span)
1977
- return builder.get_temporary_data_type()
2001
+ # PR-47917: Set fill_view to false on the layout of temp data types created by CallbackUtil.
2002
+ temp_dt = builder.get_temporary_data_type()
2003
+ temp_dt.data_type_layout.fill_view = False
2004
+ return temp_dt
1978
2005
 
1979
2006
  def __temp_dt_from_field_names(self, data_type: str, fields: Iterable[FieldIdentifier | FieldFilterCriteria],
1980
2007
  column_positions: dict[str, tuple[int, int]] | None,
@@ -2045,8 +2072,10 @@ class CallbackUtil:
2045
2072
  modifier: FieldModifier = field_modifiers.get(field_name, default_modifier)
2046
2073
  builder.add_field(modifier.modify_field(field_def), current_column, span)
2047
2074
  current_column += span
2048
-
2049
- return builder.get_temporary_data_type()
2075
+ # PR-47917: Set fill_view to false on the layout of temp data types created by CallbackUtil.
2076
+ temp_dt = builder.get_temporary_data_type()
2077
+ temp_dt.data_type_layout.fill_view = False
2078
+ return temp_dt
2050
2079
 
2051
2080
  # CR-47309: Allow layouts to be provided in place of field names for record dialogs.
2052
2081
  def __temp_dt_from_layout(self, data_type: str, layout: DataTypeLayoutIdentifier,
@@ -206,12 +206,11 @@ class ExperimentHandler:
206
206
  else:
207
207
  user = context
208
208
  context = None
209
- if context is not None and context.eln_experiment is not None and experiment is None:
210
- experiment = context.eln_experiment
211
209
  # FR-46495 - Allow the init function of ExperimentHandler to take in an ElnExperiment that is separate from the
212
210
  # context.
213
211
  # CR-37038 - Allow other experiment object types to be provided. Convert them all down to ElnExperiment.
214
- if (context is None or context.eln_experiment is None) and experiment is not None:
212
+ # PR-47793 - Fix cases where both a SapioWebhookContext and an experiment parameter are provided.
213
+ if experiment is not None:
215
214
  eln_manager = DataMgmtServer.get_eln_manager(user)
216
215
  # If this object is already an ElnExperiment, do nothing.
217
216
  if isinstance(experiment, ElnExperiment):
@@ -227,13 +226,19 @@ class ExperimentHandler:
227
226
  raise SapioException(f"No experiment with notebook ID {notebook_id} located in the system.")
228
227
  # If this object is a record, assume it is an experiment record that we can query the system with.
229
228
  else:
230
- record_id: int = AliasUtil.to_record_ids([experiment])[0]
229
+ record_id: int = AliasUtil.to_record_id(experiment)
231
230
  experiment: ElnExperiment = eln_manager.get_eln_experiment_by_record_id(record_id)
232
231
  if not experiment:
233
232
  raise SapioException(f"No experiment with record ID {record_id} located in the system.")
233
+ elif context is not None and context.eln_experiment is not None:
234
+ experiment = context.eln_experiment
235
+
234
236
  if experiment is None:
235
237
  raise SapioException("Cannot initialize ExperimentHandler. No ELN Experiment found in the provided "
236
238
  "parameters.")
239
+ elif not isinstance(experiment, ElnExperiment):
240
+ raise SapioException("Cannot initialize ExperimentHandler. The experiment variable is not an "
241
+ "ElnExperiment!")
237
242
 
238
243
  return user, context, experiment
239
244
 
@@ -1425,7 +1430,9 @@ class ExperimentHandler:
1425
1430
  :return: The map of options for the input step.
1426
1431
  """
1427
1432
  step: ElnEntryStep = self.get_step(step)
1428
- if step not in self._step_options:
1433
+ # PR-47796: Fix the get_step_options function making a webservice query every time it is called instead of
1434
+ # properly checking its cache of entry options.
1435
+ if step.get_id() not in self._step_options:
1429
1436
  self._step_options.update(ExperimentReportUtil.get_experiment_entry_options(self.user,
1430
1437
  self.get_all_steps()))
1431
1438
  return self._step_options[step.get_id()]
@@ -0,0 +1,93 @@
1
+ import base64
2
+ import dataclasses
3
+ from typing import Any
4
+
5
+ from databind.core.dataclasses import dataclass
6
+ from databind.json import loads
7
+ from sapiopylib.rest.utils.singletons import SapioContextManager
8
+
9
+
10
+ @dataclasses.dataclass
11
+ class ProcessAssayPlateRequest:
12
+ """
13
+ A request to process the results of assay plate reader with a configuration set in Sapio.
14
+
15
+ Attributes:
16
+ num_rows (int): The number of rows in the plate.
17
+ num_columns (int): The number of columns in the plate.
18
+ plate_ids_in_context (list[str]): List of plate IDs that are in context for this request.
19
+ filename (str): The name of the file containing the assay data.
20
+ file_data (bytes): The binary content of the file.
21
+ plate_reader_config_name (str): The name of the plate reader configuration to use.
22
+ """
23
+ num_rows: int
24
+ num_columns: int
25
+ plate_ids_in_context: list[str] | None
26
+ filename: str
27
+ file_data: bytes
28
+ plate_reader_config_name: str
29
+
30
+ def to_json(self) -> dict[str, Any]:
31
+ return {
32
+ "numRows": self.num_rows,
33
+ "numCols": self.num_columns,
34
+ "plateIdsInContext": self.plate_ids_in_context,
35
+ "fileName": self.filename,
36
+ "fileDataBase64": base64.b64encode(self.file_data).decode('utf-8'),
37
+ "plateReaderName": self.plate_reader_config_name
38
+ }
39
+
40
+
41
@dataclass
class AssayPlateResultIdent:
    """
    Identifies a single plate's block of results within an assay plate reader file.
    """
    # The ID of the plate this result block belongs to.
    plateId: str
    # The channel ID or block label this result was read under — presumably instrument-specific; verify.
    channelIdOrBlock: str
    # NOTE(review): looks like this is the elapsed time of a kinetic read in seconds, and None for
    # non-kinetic reads — confirm against the server-side serializer.
    kineticAssaySeconds: float | None
46
+
47
+
48
@dataclass
class AssayResultDatum:
    """
    Describes the data received from an assay plate reader.
    Most of the time, the data is a single value, but sometimes it can be multiple values, especially for kinetic data.
    """
    # FIX: this constant was previously declared as `DEFAULT_PROPERTY_NAME: str = "read"`. An annotated
    # class attribute with a default value is treated by dataclass machinery as a *field* with a default,
    # which both makes the constant an instance attribute and places a defaulted field ahead of the
    # required fields below (a TypeError under the stdlib dataclass transform). Leaving the assignment
    # unannotated keeps it a plain class-level constant, which is the intent.
    DEFAULT_PROPERTY_NAME = "read"
    # Row position of the well this datum was read from (e.g. "A").
    rowPosition: str
    # Column position of the well this datum was read from (e.g. "1").
    columnPosition: str
    # Numeric reads keyed by property name; single-value reads use DEFAULT_PROPERTY_NAME — TODO confirm.
    valueByPropertyName: dict[str, float]
    # Text reads keyed by property name.
    textValueByPropertyName: dict[str, str]
59
+
60
+
61
@dataclass
class AssayPlateResult:
    """
    Assay plate load result for a single plate in a file. A file can contain more than one of these
    results if it holds multiple plates of data in a single file.
    """
    # Identifies which plate/channel (and kinetic time point, if any) this result block is for.
    resultIdent: AssayPlateResultIdent
    # Plate dimensions for this result block.
    numRows: int
    numColumns: int
    # One datum per well read from the plate.
    resultDatum: list[AssayResultDatum]
70
+
71
+
72
@dataclass
class AssayFileLoadResult:
    """
    The entire top-level file loading result for an assay plate reader file.
    """
    # The name of the file that was processed.
    filename: str
    # One entry per plate of data found in the file.
    plateResultList: list[AssayPlateResult]
79
+
80
+
81
class AssayPlateReader(SapioContextManager):
    """
    This class contains services for Sapio Assay Plate Reader.
    """

    def process_plate_reader_data(self, request: ProcessAssayPlateRequest) -> AssayFileLoadResult:
        """
        Processes the assay plate reader data using the provided request into a structured result
        using configuration defined in Sapio.

        :param request: The plate processing request to send to the server.
        :return: The structured file load result parsed from the server's JSON response.
        """
        # POST the JSON payload to the plate reader endpoint, fail fast on an error status, then
        # deserialize the response body into the result dataclass.
        response = self.user.plugin_post("assayplatereader/process", payload=request.to_json())
        self.user.raise_for_status(response)
        result: AssayFileLoadResult = loads(response.text, AssayFileLoadResult)
        return result
@@ -0,0 +1,207 @@
1
+ import io
2
+ import os
3
+ import tempfile
4
+ from enum import Enum, auto
5
+
6
class FileType(Enum):
    """
    Supported file types for conversion.

    UNKNOWN is the fallback returned by the converter's MIME-type and file-extension lookup
    helpers when the input is not recognized.
    """
    # Plain-text formats (decoded directly as UTF-8 by the converter).
    TXT = auto()
    MD = auto()
    CSV = auto()
    # Microsoft Office formats.
    DOC = auto()
    DOCX = auto()
    XLS = auto()
    XLSX = auto()
    PPT = auto()
    PPTX = auto()
    # Portable Document Format.
    PDF = auto()
    # Fallback for unrecognized types.
    UNKNOWN = auto()
19
+
20
+
21
class FileToTextConverter:
    """
    A class for converting various file types to raw text.

    Office formats, PDFs, and legacy binary formats are parsed with lazily-imported third-party
    libraries (python-docx, openpyxl, xlrd, python-pptx, PyMuPDF, textract), so each dependency is
    only required for the file types actually parsed.
    """
    @staticmethod
    def mime_type_to_enum(mime_type: str) -> FileType:
        """
        Converts a MIME type to a FileType enum.

        :param mime_type: The MIME type string to convert.
        :return: The corresponding FileType enum, or UNKNOWN if not recognized.
        """
        if not mime_type or not mime_type.strip():
            return FileType.UNKNOWN

        # FIX: MIME types are case-insensitive and may carry parameters (e.g.
        # "text/plain; charset=utf-8" per RFC 2045). Reduce the input to its bare lowercase
        # type/subtype before the lookup; every value that previously matched still matches.
        normalized: str = mime_type.split(";")[0].strip().lower()

        mime_map = {
            "text/plain": FileType.TXT,
            "text/markdown": FileType.MD,
            "text/csv": FileType.CSV,
            "application/msword": FileType.DOC,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
            "application/vnd.ms-excel": FileType.XLS,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
            "application/vnd.ms-powerpoint": FileType.PPT,
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
            "application/pdf": FileType.PDF,
        }
        return mime_map.get(normalized, FileType.UNKNOWN)

    @staticmethod
    def file_extension_to_enum(file_path: str) -> FileType:
        """
        Converts a file path or extension to a FileType enum.

        :param file_path: The file path or extension to convert.
        :return: The corresponding FileType enum, or UNKNOWN if not recognized.
        """
        if not file_path or not file_path.strip():
            return FileType.UNKNOWN

        # Extract the file extension, removing the leading dot and making it lowercase.
        file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

        ext_map = {
            "txt": FileType.TXT,
            "md": FileType.MD,
            "csv": FileType.CSV,
            "doc": FileType.DOC,
            "docx": FileType.DOCX,
            "xls": FileType.XLS,
            "xlsx": FileType.XLSX,
            "ppt": FileType.PPT,
            "pptx": FileType.PPTX,
            "pdf": FileType.PDF,
        }
        return ext_map.get(file_extension, FileType.UNKNOWN)

    @classmethod
    def parse_file(cls, file_type: FileType, file_bytes: bytes) -> str | None:
        """
        Parses file bytes based on the FileType and returns the text content.

        :param file_type: The type of the file to parse.
        :param file_bytes: The raw bytes of the file to parse.
        :return: The text content of the file, or None if the file type is not supported or parsing fails.
        """
        if file_type is None or file_bytes is None:
            return None
        # An empty (but non-None) file parses to an empty string rather than None.
        if not file_bytes:
            return ""

        # Dispatch to the correct parser method.
        parser_map = {
            FileType.TXT: cls._parse_plain_text,
            FileType.MD: cls._parse_plain_text,
            FileType.CSV: cls._parse_plain_text,
            FileType.DOC: cls._parse_doc,
            FileType.DOCX: cls._parse_docx,
            FileType.XLS: cls._parse_xls,
            FileType.XLSX: cls._parse_xlsx,
            FileType.PPT: cls._parse_ppt,
            FileType.PPTX: cls._parse_pptx,
            FileType.PDF: cls._parse_pdf,
        }

        parser_func = parser_map.get(file_type)

        if parser_func:
            return parser_func(file_bytes)

        # FileType.UNKNOWN (or any unmapped member) is unsupported.
        return None

    @staticmethod
    def _parse_plain_text(file_bytes: bytes) -> str:
        """Decode plain-text content (TXT/MD/CSV) as UTF-8."""
        return file_bytes.decode('utf-8')

    @staticmethod
    def _run_textract(file_bytes: bytes, extension: str) -> str:
        """
        Helper to run textract on in-memory bytes by writing to a temp file.
        Note: textract may require external system dependencies.
        """
        import textract
        # FIX: with delete=True the temp file is still held open while textract re-opens it by
        # name, which fails on Windows (see tempfile.NamedTemporaryFile docs). Write, close, let
        # textract read it, then remove the file ourselves.
        temp_file = tempfile.NamedTemporaryFile(suffix=f".{extension}", delete=False)
        try:
            temp_file.write(file_bytes)
            temp_file.close()  # Ensure all bytes are flushed and the handle is released.
            return textract.process(temp_file.name).decode('utf-8')
        finally:
            os.remove(temp_file.name)

    @classmethod
    def _parse_doc(cls, file_bytes: bytes) -> str:
        """Parse a legacy .doc file via textract."""
        return cls._run_textract(file_bytes, 'doc')

    @staticmethod
    def _parse_docx(file_bytes: bytes) -> str:
        """Parse a .docx file, joining all non-blank paragraphs with newlines."""
        import docx
        with io.BytesIO(file_bytes) as stream:
            document = docx.Document(stream)
            return "\n".join(para.text for para in document.paragraphs if para.text.strip())

    @staticmethod
    def _parse_xls(file_bytes: bytes) -> str:
        """Parse a legacy .xls workbook: one "Sheet: <name>" header per sheet, tab-suffixed cells."""
        import xlrd
        workbook = xlrd.open_workbook(file_contents=file_bytes)
        text_parts = []
        for sheet in workbook.sheets():
            text_parts.append(f"Sheet: {sheet.name}\n")
            for row_idx in range(sheet.nrows):
                row_cells = []
                for col_idx in range(sheet.ncols):
                    cell_text = str(sheet.cell_value(row_idx, col_idx))
                    if cell_text.strip():
                        # Each non-blank cell is followed by a tab separator.
                        row_cells.append(cell_text + "\t")
                if row_cells:
                    text_parts.append("".join(row_cells))
                    text_parts.append("\n")
            text_parts.append("\n")
        return "".join(text_parts)

    @staticmethod
    def _parse_xlsx(file_bytes: bytes) -> str:
        """Parse a .xlsx workbook with the same layout conventions as _parse_xls."""
        import openpyxl
        with io.BytesIO(file_bytes) as stream:
            workbook = openpyxl.load_workbook(stream, read_only=True)
            # FIX: read-only workbooks keep the underlying stream/resources open until closed.
            try:
                text_parts = []
                for sheet in workbook.worksheets:
                    text_parts.append(f"Sheet: {sheet.title}\n")
                    for row in sheet.iter_rows():
                        row_cells = []
                        for cell in row:
                            cell_text = str(cell.value) if cell.value is not None else ""
                            if cell_text.strip():
                                row_cells.append(cell_text + "\t")
                        if row_cells:
                            text_parts.append("".join(row_cells))
                            text_parts.append("\n")
                    text_parts.append("\n")
                return "".join(text_parts)
            finally:
                workbook.close()

    @classmethod
    def _parse_ppt(cls, file_bytes: bytes) -> str:
        """Parse a legacy .ppt file via textract."""
        return cls._run_textract(file_bytes, 'ppt')

    @staticmethod
    def _parse_pptx(file_bytes: bytes) -> str:
        """Parse a .pptx file, joining the text of every non-blank text frame with newlines."""
        import pptx
        with io.BytesIO(file_bytes) as stream:
            presentation = pptx.Presentation(stream)
            text_parts = []
            for slide in presentation.slides:
                for shape in slide.shapes:
                    if shape.has_text_frame:
                        text = shape.text_frame.text
                        if text and text.strip():
                            text_parts.append(text)
            return "\n".join(text_parts)

    @staticmethod
    def _parse_pdf(file_bytes: bytes) -> str:
        """Parses a PDF file's bytes and extracts text using PyMuPDF."""
        import pymupdf
        text_parts = []
        with io.BytesIO(file_bytes) as stream:
            with pymupdf.open(stream=stream) as doc:
                for page in doc:
                    text_parts.append(page.get_text())
        return "\n".join(text_parts)
@@ -1,4 +1,7 @@
1
+ import gzip
1
2
  import io
3
+ import tarfile
4
+ import time
2
5
  import warnings
3
6
  import zipfile
4
7
 
@@ -322,7 +325,7 @@ class FileUtil:
322
325
  @staticmethod
323
326
  def zip_files(files: dict[str, str | bytes]) -> bytes:
324
327
  """
325
- Create a zip file for a collection of files.
328
+ Create a .zip file for a collection of files.
326
329
 
327
330
  :param files: A dictionary of file name to file data as a string or bytes.
328
331
  :return: The bytes for a zip file containing the input files.
@@ -335,6 +338,130 @@ class FileUtil:
335
338
  # throws an I/O exception.
336
339
  return zip_buffer.getvalue()
337
340
 
341
+ # FR-47422: Add a function for unzipping files that may have been zipped by the above function.
342
+ @staticmethod
343
+ def unzip_files(zip_file: bytes) -> dict[str, bytes]:
344
+ """
345
+ Decompress a .zip file from an in-memory bytes object and extracts all files into a dictionary.
346
+
347
+ :param zip_file: The bytes of the zip file to be decompressed.
348
+ :return: A dictionary of file name to file bytes for each file in the zip.
349
+ """
350
+ extracted_files: dict[str, bytes] = {}
351
+ with io.BytesIO(zip_file) as zip_buffer:
352
+ with zipfile.ZipFile(zip_buffer, "r") as zip_file:
353
+ for file_name in zip_file.namelist():
354
+ with zip_file.open(file_name) as file:
355
+ extracted_files[file_name] = file.read()
356
+ return extracted_files
357
+
358
+ # FR-47422: Add functions for compressing and decompressing .gz, .tar, and .tar.gz files.
359
+ @staticmethod
360
+ def gzip_file(file_data: bytes | str) -> bytes:
361
+ """
362
+ Create a .gz file for a single file.
363
+
364
+ :param file_data: The file data to be compressed as bytes or a string.
365
+ :return: The bytes of the gzip-compressed file.
366
+ """
367
+ return gzip.compress(file_data.encode() if isinstance(file_data, str) else file_data)
368
+
369
+ @staticmethod
370
+ def ungzip_file(gzip_file: bytes) -> bytes:
371
+ """
372
+ Decompress a .gz file.
373
+
374
+ :param gzip_file: The bytes of the gzip-compressed file.
375
+ :return: The decompressed file data as bytes.
376
+ """
377
+ return gzip.decompress(gzip_file)
378
+
379
+ @staticmethod
380
+ def tar_files(files: dict[str, str | bytes]) -> bytes:
381
+ """
382
+ Create a .tar file for a collection of files.
383
+
384
+ :param files: A dictionary of file name to file data as a string or bytes.
385
+ :return: The bytes for a tar file containing the input files.
386
+ """
387
+ with io.BytesIO() as tar_buffer:
388
+ with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
389
+ for name, data in files.items():
390
+ if isinstance(data, str):
391
+ data: bytes = data.encode('utf-8')
392
+
393
+ tarinfo = tarfile.TarInfo(name=name)
394
+ tarinfo.size = len(data)
395
+ tarinfo.mtime = int(time.time())
396
+
397
+ with io.BytesIO(data) as file:
398
+ tar.addfile(tarinfo=tarinfo, fileobj=file)
399
+
400
+ tar_buffer.seek(0)
401
+ return tar_buffer.getvalue()
402
+
403
+ @staticmethod
404
+ def untar_files(tar_file: bytes) -> dict[str, bytes]:
405
+ """
406
+ Decompress a .tar file from an in-memory bytes object and extracts all files into a dictionary.
407
+
408
+ :param tar_file: The bytes of the tar file to be decompressed.
409
+ :return: A dictionary of file name to file bytes for each file in the tar.
410
+ """
411
+ extracted_files: dict[str, bytes] = {}
412
+ with io.BytesIO(tar_file) as tar_buffer:
413
+ with tarfile.open(fileobj=tar_buffer, mode="r") as tar:
414
+ for member in tar.getmembers():
415
+ if member.isfile():
416
+ file_obj = tar.extractfile(member)
417
+ if file_obj:
418
+ with file_obj:
419
+ extracted_files[member.name] = file_obj.read()
420
+ return extracted_files
421
+
422
+ @staticmethod
423
+ def tar_gzip_files(files: dict[str, str | bytes]) -> bytes:
424
+ """
425
+ Create a .tar.gz file for a collection of files.
426
+
427
+ :param files: A dictionary of file name to file data as a string or bytes.
428
+ :return: The bytes for a tar.gz file containing the input files.
429
+ """
430
+ with io.BytesIO() as tar_buffer:
431
+ with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar:
432
+ for name, data in files.items():
433
+ if isinstance(data, str):
434
+ data: bytes = data.encode('utf-8')
435
+
436
+ tarinfo = tarfile.TarInfo(name=name)
437
+ tarinfo.size = len(data)
438
+ tarinfo.mtime = int(time.time())
439
+
440
+ with io.BytesIO(data) as file:
441
+ tar.addfile(tarinfo=tarinfo, fileobj=file)
442
+
443
+ tar_buffer.seek(0)
444
+ return tar_buffer.getvalue()
445
+
446
+ @staticmethod
447
+ def untar_gzip_files(tar_gzip_file: bytes) -> dict[str, bytes]:
448
+ """
449
+ Decompress a .tar.gz file from an in-memory bytes object and extracts all files into a dictionary.
450
+
451
+ :param tar_gzip_file: The bytes of the tar.gz file to be decompressed.
452
+ :return: A dictionary of file name to file bytes for each file in the tar.gz
453
+ """
454
+ extracted_files: dict[str, bytes] = {}
455
+ with io.BytesIO(tar_gzip_file) as tar_buffer:
456
+ with tarfile.open(fileobj=tar_buffer, mode="r:gz") as tar:
457
+ for member in tar.getmembers():
458
+ if member.isfile():
459
+ file_obj = tar.extractfile(member)
460
+ if file_obj:
461
+ with file_obj:
462
+ extracted_files[member.name] = file_obj.read()
463
+ return extracted_files
464
+
338
465
  # Deprecated functions:
339
466
 
340
467
  # FR-46097 - Add write file request shorthand functions to FileUtil.