sapiopycommons 2025.7.14a610__py3-none-any.whl → 2025.7.15a611__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sapiopycommons might be problematic. Click here for more details.

@@ -780,7 +780,7 @@ class CallbackUtil:
780
780
  # FR-47690: Set default values for fields that aren't present.
781
781
  for row in values:
782
782
  for field in fields:
783
- if field.data_field_name not in values:
783
+ if field.data_field_name not in row:
784
784
  row[field.data_field_name] = field.default_value
785
785
 
786
786
  # Convert the group_by parameter to a field name.
@@ -1812,7 +1812,8 @@ class CallbackUtil:
1812
1812
  return response
1813
1813
 
1814
1814
  def request_file(self, title: str, exts: Iterable[str] | None = None,
1815
- show_image_editor: bool = False, show_camera_button: bool = False) -> tuple[str, bytes]:
1815
+ show_image_editor: bool = False, show_camera_button: bool = False,
1816
+ *, enforce_file_extensions: bool = True) -> tuple[str, bytes]:
1816
1817
  """
1817
1818
  Request a single file from the user.
1818
1819
 
@@ -1822,6 +1823,8 @@ class CallbackUtil:
1822
1823
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1823
1824
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1824
1825
  rather than selecting an existing file.
1826
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1827
+ enforced. If false, then the user may upload any file type.
1825
1828
  :return: The file name and bytes of the uploaded file.
1826
1829
  """
1827
1830
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1841,11 +1844,12 @@ class CallbackUtil:
1841
1844
  file_path: str = self.__send_dialog(request, self.callback.show_file_dialog, data_sink=do_consume)
1842
1845
 
1843
1846
  # Verify that each of the file given matches the expected extension(s).
1844
- self.__verify_file(file_path, sink.data, exts)
1847
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1845
1848
  return file_path, sink.data
1846
1849
 
1847
1850
  def request_files(self, title: str, exts: Iterable[str] | None = None,
1848
- show_image_editor: bool = False, show_camera_button: bool = False) -> dict[str, bytes]:
1851
+ show_image_editor: bool = False, show_camera_button: bool = False,
1852
+ *, enforce_file_extensions: bool = True) -> dict[str, bytes]:
1849
1853
  """
1850
1854
  Request multiple files from the user.
1851
1855
 
@@ -1855,6 +1859,8 @@ class CallbackUtil:
1855
1859
  :param show_image_editor: Whether the user will see an image editor when image is uploaded in this file prompt.
1856
1860
  :param show_camera_button: Whether the user will be able to use camera to take a picture as an upload request,
1857
1861
  rather than selecting an existing file.
1862
+ :param enforce_file_extensions: If true, then the file extensions provided in the exts parameter will be
1863
+ enforced. If false, then the user may upload any file type.
1858
1864
  :return: A dictionary of file name to file bytes for each file the user uploaded.
1859
1865
  """
1860
1866
  # If no extensions were provided, use an empty list for the extensions instead.
@@ -1870,7 +1876,7 @@ class CallbackUtil:
1870
1876
  for file_path in file_paths:
1871
1877
  sink = InMemoryRecordDataSink(self.user)
1872
1878
  sink.consume_client_callback_file_path_data(file_path)
1873
- self.__verify_file(file_path, sink.data, exts)
1879
+ self.__verify_file(file_path, sink.data, exts if enforce_file_extensions else None)
1874
1880
  ret_dict.update({file_path: sink.data})
1875
1881
 
1876
1882
  return ret_dict
@@ -1887,16 +1893,17 @@ class CallbackUtil:
1887
1893
  """
1888
1894
  if file_path is None or len(file_path) == 0 or file_bytes is None or len(file_bytes) == 0:
1889
1895
  raise SapioUserErrorException("Empty file provided or file unable to be read.")
1890
- if allowed_extensions:
1891
- matches: bool = False
1892
- for ext in allowed_extensions:
1893
- # FR-47690: Changed to a case-insensitive match.
1894
- if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1895
- matches = True
1896
- break
1897
- if matches is False:
1898
- raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1899
- + (",".join(allowed_extensions)))
1896
+ if not allowed_extensions:
1897
+ return
1898
+ matches: bool = False
1899
+ for ext in allowed_extensions:
1900
+ # FR-47690: Changed to a case-insensitive match.
1901
+ if file_path.casefold().endswith("." + ext.lstrip(".").casefold()):
1902
+ matches = True
1903
+ break
1904
+ if not matches:
1905
+ raise SapioUserErrorException("Unsupported file type. Expecting the following extension(s): "
1906
+ + (",".join(allowed_extensions)))
1900
1907
 
1901
1908
  def write_file(self, file_name: str, file_data: str | bytes) -> None:
1902
1909
  """
@@ -6,8 +6,6 @@ indigo = Indigo()
6
6
  renderer = IndigoRenderer(indigo)
7
7
  indigo.setOption("render-output-format", "svg")
8
8
  indigo.setOption("ignore-stereochemistry-errors", True)
9
- # Ignore only if loading as non-query object. That is the meaning of this flag. Does nothing if it's query molecule.
10
- indigo.setOption("ignore-noncritical-query-features", True)
11
9
  indigo.setOption("render-stereo-style", "ext")
12
10
  indigo.setOption("aromaticity-model", "generic")
13
11
  indigo.setOption("render-coloring", True)
@@ -0,0 +1,207 @@
1
+ import io
2
+ import os
3
+ import tempfile
4
+ from enum import Enum, auto
5
+
6
+ class FileType(Enum):
7
+ """Supported file types for conversion."""
8
+ TXT = auto()
9
+ MD = auto()
10
+ CSV = auto()
11
+ DOC = auto()
12
+ DOCX = auto()
13
+ XLS = auto()
14
+ XLSX = auto()
15
+ PPT = auto()
16
+ PPTX = auto()
17
+ PDF = auto()
18
+ UNKNOWN = auto()
19
+
20
+
21
+ class FileToTextConverter:
22
+ """
23
+ A class for converting various file types to raw text.
24
+ """
25
+ @staticmethod
26
+ def mime_type_to_enum(mime_type: str) -> FileType:
27
+ """
28
+ Converts a MIME type to a FileType enum.
29
+
30
+ :param mime_type: The MIME type string to convert.
31
+ :return: The corresponding FileType enum, or UNKNOWN if not recognized.
32
+ """
33
+ if not mime_type or not mime_type.strip():
34
+ return FileType.UNKNOWN
35
+
36
+ mime_map = {
37
+ "text/plain": FileType.TXT,
38
+ "text/markdown": FileType.MD,
39
+ "text/csv": FileType.CSV,
40
+ "application/msword": FileType.DOC,
41
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
42
+ "application/vnd.ms-excel": FileType.XLS,
43
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
44
+ "application/vnd.ms-powerpoint": FileType.PPT,
45
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
46
+ "application/pdf": FileType.PDF,
47
+ }
48
+ return mime_map.get(mime_type, FileType.UNKNOWN)
49
+
50
+ @staticmethod
51
+ def file_extension_to_enum(file_path: str) -> FileType:
52
+ """
53
+ Converts a file path or extension to a FileType enum.
54
+
55
+ :param file_path: The file path or extension to convert.
56
+ :return: The corresponding FileType enum, or UNKNOWN if not recognized.
57
+ """
58
+ if not file_path or not file_path.strip():
59
+ return FileType.UNKNOWN
60
+
61
+ # Extract the file extension, removing the leading dot and making it lowercase
62
+ file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()
63
+
64
+ ext_map = {
65
+ "txt": FileType.TXT,
66
+ "md": FileType.MD,
67
+ "csv": FileType.CSV,
68
+ "doc": FileType.DOC,
69
+ "docx": FileType.DOCX,
70
+ "xls": FileType.XLS,
71
+ "xlsx": FileType.XLSX,
72
+ "ppt": FileType.PPT,
73
+ "pptx": FileType.PPTX,
74
+ "pdf": FileType.PDF,
75
+ }
76
+ return ext_map.get(file_extension, FileType.UNKNOWN)
77
+
78
+ @classmethod
79
+ def parse_file(cls, file_type: FileType, file_bytes: bytes) -> str | None:
80
+ """
81
+ Parses file bytes based on the FileType and returns the text content.
82
+
83
+ :param file_type: The type of the file to parse.
84
+ :param file_bytes: The raw bytes of the file to parse.
85
+ :return: The text content of the file, or None if the file type is not supported or parsing fails.
86
+ """
87
+ if file_type is None or file_bytes is None:
88
+ return None
89
+ if not file_bytes:
90
+ return ""
91
+
92
+ # Dispatch to the correct parser method
93
+ parser_map = {
94
+ FileType.TXT: cls._parse_plain_text,
95
+ FileType.MD: cls._parse_plain_text,
96
+ FileType.CSV: cls._parse_plain_text,
97
+ FileType.DOC: cls._parse_doc,
98
+ FileType.DOCX: cls._parse_docx,
99
+ FileType.XLS: cls._parse_xls,
100
+ FileType.XLSX: cls._parse_xlsx,
101
+ FileType.PPT: cls._parse_ppt,
102
+ FileType.PPTX: cls._parse_pptx,
103
+ FileType.PDF: cls._parse_pdf,
104
+ }
105
+
106
+ parser_func = parser_map.get(file_type)
107
+
108
+ if parser_func:
109
+ return parser_func(file_bytes)
110
+
111
+ return None
112
+
113
+ @staticmethod
114
+ def _parse_plain_text(file_bytes: bytes) -> str:
115
+ return file_bytes.decode('utf-8')
116
+
117
+ @staticmethod
118
+ def _run_textract(file_bytes: bytes, extension: str) -> str:
119
+ """
120
+ Helper to run textract on in-memory bytes by writing to a temp file.
121
+ Note: textract may require external system dependencies.
122
+ """
123
+ import textract
124
+ with tempfile.NamedTemporaryFile(suffix=f".{extension}", delete=True) as temp_file:
125
+ temp_file.write(file_bytes)
126
+ temp_file.flush() # Ensure all bytes are written to disk
127
+ text = textract.process(temp_file.name).decode('utf-8')
128
+ return text
129
+
130
+ @classmethod
131
+ def _parse_doc(cls, file_bytes: bytes) -> str:
132
+ return cls._run_textract(file_bytes, 'doc')
133
+
134
+ @staticmethod
135
+ def _parse_docx(file_bytes: bytes) -> str:
136
+ import docx
137
+ with io.BytesIO(file_bytes) as stream:
138
+ document = docx.Document(stream)
139
+ return "\n".join(para.text for para in document.paragraphs if para.text.strip())
140
+
141
+ @staticmethod
142
+ def _parse_xls(file_bytes: bytes) -> str:
143
+ import xlrd
144
+ workbook = xlrd.open_workbook(file_contents=file_bytes)
145
+ text_parts = []
146
+ for sheet in workbook.sheets():
147
+ text_parts.append(f"Sheet: {sheet.name}\n")
148
+ for row_idx in range(sheet.nrows):
149
+ row_cells = []
150
+ for col_idx in range(sheet.ncols):
151
+ cell_text = str(sheet.cell_value(row_idx, col_idx))
152
+ if cell_text.strip():
153
+ row_cells.append(cell_text + "\t")
154
+ if row_cells:
155
+ text_parts.append("".join(row_cells))
156
+ text_parts.append("\n")
157
+ text_parts.append("\n")
158
+ return "".join(text_parts)
159
+
160
+ @staticmethod
161
+ def _parse_xlsx(file_bytes: bytes) -> str:
162
+ import openpyxl
163
+ with io.BytesIO(file_bytes) as stream:
164
+ workbook = openpyxl.load_workbook(stream, read_only=True)
165
+ text_parts = []
166
+ for sheet in workbook.worksheets:
167
+ text_parts.append(f"Sheet: {sheet.title}\n")
168
+ for row in sheet.iter_rows():
169
+ row_cells = []
170
+ for cell in row:
171
+ cell_text = str(cell.value) if cell.value is not None else ""
172
+ if cell_text.strip():
173
+ row_cells.append(cell_text + "\t")
174
+ if row_cells:
175
+ text_parts.append("".join(row_cells))
176
+ text_parts.append("\n")
177
+ text_parts.append("\n")
178
+ return "".join(text_parts)
179
+
180
+ @classmethod
181
+ def _parse_ppt(cls, file_bytes: bytes) -> str:
182
+ return cls._run_textract(file_bytes, 'ppt')
183
+
184
+ @staticmethod
185
+ def _parse_pptx(file_bytes: bytes) -> str:
186
+ import pptx
187
+ with io.BytesIO(file_bytes) as stream:
188
+ presentation = pptx.Presentation(stream)
189
+ text_parts = []
190
+ for slide in presentation.slides:
191
+ for shape in slide.shapes:
192
+ if shape.has_text_frame:
193
+ text = shape.text_frame.text
194
+ if text and text.strip():
195
+ text_parts.append(text)
196
+ return "\n".join(text_parts)
197
+
198
+ @staticmethod
199
+ def _parse_pdf(file_bytes: bytes) -> str:
200
+ """Parses a PDF file's bytes and extracts text using PyMuPDF."""
201
+ import pymupdf
202
+ text_parts = []
203
+ with io.BytesIO(file_bytes) as stream:
204
+ with pymupdf.open(stream=stream) as doc:
205
+ for page in doc:
206
+ text_parts.append(page.get_text())
207
+ return "\n".join(text_parts)