sapiopycommons 2024.9.20a333__tar.gz → 2024.9.30a335__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sapiopycommons might be problematic. Click here for more details.

Files changed (69) hide show
  1. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/PKG-INFO +1 -1
  2. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/pyproject.toml +1 -1
  3. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/IndigoMolecules.py +0 -1
  4. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/Molecules.py +19 -77
  5. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_util.py +17 -7
  6. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/multimodal/multimodal_data.py +3 -6
  7. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/data_type_models.py +17495 -38169
  8. sapiopycommons-2024.9.20a333/src/sapiopycommons/flowcyto/flow_cyto.py +0 -77
  9. sapiopycommons-2024.9.20a333/src/sapiopycommons/flowcyto/flowcyto_data.py +0 -75
  10. sapiopycommons-2024.9.20a333/tests/chem_test_curation_queue.py +0 -31
  11. sapiopycommons-2024.9.20a333/tests/curation_queue_test.sdf +0 -168
  12. sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E01_008_clean.fcs +0 -0
  13. sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E03_009_clean.fcs +0 -0
  14. sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E05_010_clean.fcs +0 -0
  15. sapiopycommons-2024.9.20a333/tests/flowcyto/8_color_ICS.wsp +0 -4833
  16. sapiopycommons-2024.9.20a333/tests/flowcyto/COVID19_W_001_O.fcs +0 -0
  17. sapiopycommons-2024.9.20a333/tests/flowcyto_test.py +0 -71
  18. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/.gitignore +0 -0
  19. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/LICENSE +0 -0
  20. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/README.md +0 -0
  21. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/__init__.py +0 -0
  22. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/callbacks/__init__.py +0 -0
  23. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/callbacks/callback_util.py +0 -0
  24. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/__init__.py +0 -0
  25. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/__init__.py +0 -0
  26. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/column_builder.py +0 -0
  27. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/custom_report_builder.py +0 -0
  28. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/term_builder.py +0 -0
  29. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/datatype/__init__.py +0 -0
  30. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/datatype/attachment_util.py +0 -0
  31. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/__init__.py +0 -0
  32. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/experiment_handler.py +0 -0
  33. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/experiment_report_util.py +0 -0
  34. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/plate_designer.py +0 -0
  35. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/__init__.py +0 -0
  36. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/complex_data_loader.py +0 -0
  37. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_bridge.py +0 -0
  38. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_bridge_handler.py +0 -0
  39. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_data_handler.py +0 -0
  40. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_validator.py +0 -0
  41. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_writer.py +0 -0
  42. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/__init__.py +0 -0
  43. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/accession_service.py +0 -0
  44. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/aliases.py +0 -0
  45. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/audit_log.py +0 -0
  46. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/custom_report_util.py +0 -0
  47. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/exceptions.py +0 -0
  48. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/popup_util.py +0 -0
  49. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/sapio_links.py +0 -0
  50. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/storage_util.py +0 -0
  51. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/time_util.py +0 -0
  52. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/multimodal/multimodal.py +0 -0
  53. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/processtracking/__init__.py +0 -0
  54. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/processtracking/endpoints.py +0 -0
  55. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/recordmodel/__init__.py +0 -0
  56. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/recordmodel/record_handler.py +0 -0
  57. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/__init__.py +0 -0
  58. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/eln_rule_handler.py +0 -0
  59. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/on_save_rule_handler.py +0 -0
  60. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/__init__.py +0 -0
  61. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/webhook_handlers.py +0 -0
  62. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/webservice_handlers.py +0 -0
  63. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/_do_not_add_init_py_here +0 -0
  64. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/accession_test.py +0 -0
  65. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/bio_reg_test.py +0 -0
  66. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/chem_test.py +0 -0
  67. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/kappa.chains.fasta +0 -0
  68. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/mafft_test.py +0 -0
  69. {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/test.gb +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: sapiopycommons
3
- Version: 2024.9.20a333
3
+ Version: 2024.9.30a335
4
4
  Summary: Official Sapio Python API Utilities Package
5
5
  Project-URL: Homepage, https://github.com/sapiosciences
6
6
  Author-email: Jonathan Steck <jsteck@sapiosciences.com>, Yechen Qiao <yqiao@sapiosciences.com>
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "sapiopycommons"
7
- version='2024.09.20a333'
7
+ version='2024.09.30a335'
8
8
  authors = [
9
9
  { name="Jonathan Steck", email="jsteck@sapiosciences.com" },
10
10
  { name="Yechen Qiao", email="yqiao@sapiosciences.com" },
@@ -9,7 +9,6 @@ indigo.setOption("ignore-stereochemistry-errors", True)
9
9
  indigo.setOption("render-stereo-style", "ext")
10
10
  indigo.setOption("aromaticity-model", "generic")
11
11
  indigo.setOption("render-coloring", True)
12
- indigo.setOption("molfile-saving-mode", "3000")
13
12
  indigo_inchi = IndigoInchi(indigo);
14
13
 
15
14
 
@@ -1,6 +1,5 @@
1
1
  # Author Yechen Qiao
2
2
  # Common Molecule Utilities for Molecule Transfers with Sapio
3
- from typing import cast
4
3
 
5
4
  from rdkit import Chem
6
5
  from rdkit.Chem import Crippen, MolToInchi
@@ -21,25 +20,6 @@ tautomer_params.tautomerReassignStereo = False
21
20
  tautomer_params.tautomerRemoveIsotopicHs = True
22
21
  enumerator = rdMolStandardize.TautomerEnumerator(tautomer_params)
23
22
 
24
-
25
- def get_enhanced_stereo_reg_hash(mol: Mol, enhanced_stereo: bool) -> str:
26
- """
27
- Get the Registration Hash for the molecule by the current registration configuration.
28
- When we are running if we are canonicalization of tautomers or cleaning up any other way, do they first before calling.
29
- :param mol: The molecule to obtain hash for.
30
- :param canonical_tautomer: Whether the registry system canonicalize the tautomers.
31
- :param enhanced_stereo: Whether we are computing enhanced stereo at all.
32
- :return: The enhanced stereo hash.
33
- """
34
- if enhanced_stereo:
35
- from rdkit.Chem.RegistrationHash import GetMolLayers, GetMolHash, HashScheme
36
- layers = GetMolLayers(mol, enable_tautomer_hash_v2=True)
37
- hash_scheme: HashScheme = HashScheme.TAUTOMER_INSENSITIVE_LAYERS
38
- return GetMolHash(layers, hash_scheme=hash_scheme)
39
- else:
40
- return ""
41
-
42
-
43
23
  def neutralize_atoms(mol) -> Mol:
44
24
  """
45
25
  Neutralize atoms per https://baoilleach.blogspot.com/2019/12/no-charge-simple-approach-to.html
@@ -106,6 +86,7 @@ def mol_to_img(mol_str: str) -> str:
106
86
  return renderer.renderToString(mol)
107
87
 
108
88
 
89
+
109
90
  def mol_to_sapio_partial_pojo(mol: Mol):
110
91
  """
111
92
  Get the minimum information about molecule to Sapio, just its SMILES, V3000, and image data.
@@ -115,7 +96,7 @@ def mol_to_sapio_partial_pojo(mol: Mol):
115
96
  Chem.SanitizeMol(mol)
116
97
  mol.UpdatePropertyCache()
117
98
  smiles = Chem.MolToSmiles(mol)
118
- molBlock = Chem.MolToMolBlock(mol, forceV3000=True)
99
+ molBlock = Chem.MolToMolBlock(mol)
119
100
  img = mol_to_img(mol)
120
101
  molecule = dict()
121
102
  molecule["smiles"] = smiles
@@ -124,52 +105,23 @@ def mol_to_sapio_partial_pojo(mol: Mol):
124
105
  return molecule
125
106
 
126
107
 
127
- def get_cxs_smiles_hash(mol: Mol, enhanced_stereo: bool) -> str:
128
- """
129
- Return the SHA1 CXS Smiles hash for the canonical, isomeric CXS SMILES of the molecule.
130
- """
131
- if not enhanced_stereo:
132
- return ""
133
- import hashlib
134
- return hashlib.sha1(Chem.MolToCXSmiles(mol, canonical=True, isomericSmiles=True).encode()).hexdigest()
135
-
136
-
137
- def get_has_or_group(mol: Mol, enhanced_stereo: bool) -> bool:
138
- """
139
- Return true if and only if: enhanced stereochemistry is enabled and there is at least one OR group in mol.
140
- """
141
- if not enhanced_stereo:
142
- return False
143
- from rdkit.Chem import StereoGroup_vect, STEREO_OR
144
- stereo_groups: StereoGroup_vect = mol.GetStereoGroups()
145
- for stereo_group in stereo_groups:
146
- if stereo_group.GetGroupType() == STEREO_OR:
147
- return True
148
- return False
149
-
150
-
151
- def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
108
+ def mol_to_sapio_substance(mol: Mol, include_stereoisomers: bool = False,
152
109
  normalize: bool = False, remove_salt: bool = False, make_images: bool = False,
153
- salt_def: str | None = None, canonical_tautomer: bool = True,
154
- enhanced_stereo: bool = False, remove_atom_map: bool = True):
110
+ salt_def: str | None = None, canonical_tautomer: bool = True):
155
111
  """
156
112
  Convert a molecule in RDKit to a molecule POJO in Sapio.
157
113
 
158
114
  :param mol: The molecule in RDKit.
115
+ :param include_stereoisomers: If true, will compute all stereoisomer permutations of this molecule.
159
116
  :param normalize If true, will normalize the functional groups and return normalized result.
160
117
  :param remove_salt If true, we will remove salts iteratively from the molecule before returning their data.
161
118
  We will also populate desaltedList with molecules we deleted.
162
- :param make_images Whether to make images as part of the result without having another script to resolve it.
163
119
  :param salt_def: if not none, specifies custom salt to be used during the desalt process.
164
120
  :param canonical_tautomer: if True, we will attempt to compute canonical tautomer for the molecule. Slow!
165
121
  This is needed for a registry. Note it stops after enumeration of 1000.
166
- :param enhanced_stereo: If enabled, enhanced stereo hash will be produced.
167
- :param remove_atom_map: When set, clear all atom AAM maps that were set had it been merged into some reactions earlier.
168
122
  :return: The molecule POJO for Sapio.
169
123
  """
170
124
  molecule = dict()
171
- if remove_atom_map:
172
- [a.SetAtomMapNum(0) for a in mol.GetAtoms()]
173
125
  Chem.SanitizeMol(mol)
174
126
  mol.UpdatePropertyCache()
175
127
  Chem.GetSymmSSSR(mol)
@@ -205,7 +157,7 @@ def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
205
157
  exactMass = Descriptors.ExactMolWt(mol)
206
158
  molFormula = rdMolDescriptors.CalcMolFormula(mol)
207
159
  charge = Chem.GetFormalCharge(mol)
208
- molBlock = Chem.MolToMolBlock(mol, forceV3000=True)
160
+ molBlock = Chem.MolToMolBlock(mol)
209
161
 
210
162
  molecule["cLogP"] = cLogP
211
163
  molecule["tpsa"] = tpsa
@@ -229,38 +181,28 @@ def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
229
181
  # We need to test the INCHI can be loaded back to indigo.
230
182
  indigo_mol = indigo.loadMolecule(molBlock)
231
183
  indigo_mol.aromatize()
232
- if enhanced_stereo:
233
- # Remove enhanced stereo layer when generating InChI as the stereo hash is generated separately for reg.
234
- mol_copy: Mol = Chem.MolFromMolBlock(Chem.MolToMolBlock(mol))
235
- Chem.CanonicalizeEnhancedStereo(mol_copy)
236
- molecule["inchi"] = Chem.MolToInchi(mol_copy)
237
- molecule["inchiKey"] = Chem.MolToInchiKey(mol_copy)
238
- else:
239
- indigo_inchi.resetOptions()
240
- indigo_inchi_str = indigo_inchi.getInchi(indigo_mol)
241
- molecule["inchi"] = indigo_inchi_str
242
- indigo_inchi_key_str = indigo_inchi.getInchiKey(indigo_inchi_str)
243
- molecule["inchiKey"] = indigo_inchi_key_str
184
+ indigo_inchi.resetOptions()
185
+ indigo_inchi_str = indigo_inchi.getInchi(indigo_mol)
186
+ molecule["inchi"] = indigo_inchi_str
187
+ indigo_inchi_key_str = indigo_inchi.getInchiKey(indigo_inchi_str)
188
+ molecule["inchiKey"] = indigo_inchi_key_str
244
189
  molecule["smiles"] = indigo_mol.smiles()
245
- molecule["reg_hash"] = get_enhanced_stereo_reg_hash(mol, enhanced_stereo=enhanced_stereo)
246
- molecule["cxsmiles_hash"] = get_cxs_smiles_hash(mol, enhanced_stereo=enhanced_stereo)
247
- molecule["has_or_group"] = get_has_or_group(mol, enhanced_stereo=enhanced_stereo)
248
190
 
191
+ if include_stereoisomers and has_chiral_centers(mol):
192
+ stereoisomers = find_all_possible_stereoisomers(mol, only_unassigned=False, try_embedding=False, unique=True)
193
+ molecule["stereoisomers"] = [mol_to_sapio_partial_pojo(x) for x in stereoisomers]
249
194
  return molecule
250
195
 
251
196
 
252
- def mol_to_sapio_compound(mol: Mol, include_stereoisomers=False, enhanced_stereo: bool = False,
197
+ def mol_to_sapio_compound(mol: Mol, include_stereoisomers: bool = False,
253
198
  salt_def: str | None = None, resolve_canonical: bool = True,
254
- make_images: bool = False, canonical_tautomer: bool = True,
255
- remove_atom_map: bool = True):
199
+ make_images: bool = False, canonical_tautomer: bool = True):
256
200
  ret = dict()
257
- ret['originalMol'] = mol_to_sapio_substance(mol, include_stereoisomers=False,
201
+ ret['originalMol'] = mol_to_sapio_substance(mol, include_stereoisomers,
258
202
  normalize=False, remove_salt=False, make_images=make_images,
259
- canonical_tautomer=canonical_tautomer,
260
- enhanced_stereo=enhanced_stereo, remove_atom_map=remove_atom_map)
203
+ canonical_tautomer=canonical_tautomer)
261
204
  if resolve_canonical:
262
205
  ret['canonicalMol'] = mol_to_sapio_substance(mol, include_stereoisomers=False,
263
206
  normalize=True, remove_salt=True, make_images=make_images,
264
- salt_def=salt_def, canonical_tautomer=canonical_tautomer,
265
- enhanced_stereo=enhanced_stereo, remove_atom_map=remove_atom_map)
207
+ salt_def=salt_def, canonical_tautomer=canonical_tautomer)
266
208
  return ret
@@ -23,8 +23,8 @@ class FileUtil:
23
23
  """
24
24
  @staticmethod
25
25
  def tokenize_csv(file_bytes: bytes, required_headers: list[str] | None = None, header_row_index: int | None = 0,
26
- seperator: str = ",", *, encoding: str | None = None, exception_on_empty: bool = True) \
27
- -> tuple[list[dict[str, str]], list[list[str]]]:
26
+ seperator: str = ",", *, encoding: str | None = None, encoding_error: str | None = "strict",
27
+ exception_on_empty: bool = True) -> tuple[list[dict[str, str]], list[list[str]]]:
28
28
  """
29
29
  Tokenize a CSV file. The provided file must be uniform. That is, if row 1 has 10 cells, all the rows in the file
30
30
  must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
@@ -39,7 +39,11 @@ class FileUtil:
39
39
  :param seperator: The character that separates cells in the table.
40
40
  :param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
41
41
  contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
42
- ISO-8859-1 as the encoding.
42
+ ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
43
+ :param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
44
+ is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
45
+ characters or "replace" to replace invalid characters with a ? character. For a full list of options, see
46
+ https://docs.python.org/3/library/codecs.html#error-handlers
43
47
  :param exception_on_empty: Throw a user error exception if the provided file bytes result in an empty list in
44
48
  the first element of the returned tuple.
45
49
  :return: The CSV parsed into a list of dicts where each dict is a row, mapping the headers to the cells for
@@ -49,7 +53,7 @@ class FileUtil:
49
53
  # Parse the file bytes into two DataFrames. The first is metadata of the file located above the header row,
50
54
  # while the second is the body of the file below the header row.
51
55
  file_body, file_metadata = FileUtil.csv_to_data_frames(file_bytes, header_row_index, seperator,
52
- encoding=encoding)
56
+ encoding=encoding, encoding_error=encoding_error)
53
57
  # Parse the metadata from above the header row index into a list of lists.
54
58
  metadata: list[list[str]] = FileUtil.data_frame_to_lists(file_metadata)
55
59
  # Parse the data from the file body into a list of dicts.
@@ -90,7 +94,8 @@ class FileUtil:
90
94
 
91
95
  @staticmethod
92
96
  def csv_to_data_frames(file_bytes: bytes, header_row_index: int | None = 0, seperator: str = ",",
93
- *, encoding: str | None = None) -> tuple[DataFrame, DataFrame | None]:
97
+ *, encoding: str | None = None, encoding_error: str | None = "strict") \
98
+ -> tuple[DataFrame, DataFrame | None]:
94
99
  """
95
100
  Parse the file bytes for a CSV into DataFrames. The provided file must be uniform. That is, if row 1 has 10
96
101
  cells, all the rows in the file must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
@@ -103,7 +108,11 @@ class FileUtil:
103
108
  :param seperator: The character that separates cells in the table.
104
109
  :param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
105
110
  contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
106
- ISO-8859-1 as the encoding.
111
+ ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
112
+ :param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
113
+ is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
114
+ characters or "replace" to replace invalid characters with a ? character. For a full list of options, see
115
+ https://docs.python.org/3/library/codecs.html#error-handlers
107
116
  :return: A tuple of two DataFrames. The first is the frame for the CSV table body, while the second is for the
108
117
  metadata from above the header row, or None if there is no metadata.
109
118
  """
@@ -115,7 +124,8 @@ class FileUtil:
115
124
  # can throw off the header row index.
116
125
  file_metadata = pandas.read_csv(file_io, header=None, dtype=dtype(str),
117
126
  skiprows=lambda x: x >= header_row_index,
118
- skip_blank_lines=False, sep=seperator, encoding=encoding)
127
+ skip_blank_lines=False, sep=seperator, encoding=encoding,
128
+ encoding_errors=encoding_error)
119
129
  with io.BytesIO(file_bytes) as file_io:
120
130
  # The use of the dtype argument is to ensure that everything from the file gets read as a string. Added
121
131
  # because some numerical values would get ".0" appended to them, even when casting the DataFrame cell to a
@@ -38,9 +38,6 @@ class PyMolecule:
38
38
  normError: str | None
39
39
  desaltError: str | None
40
40
  desaltedList: list[str] | None
41
- registrationHash: str | None
42
- hasOrGroup: bool
43
- CXSMILESHash: str | None
44
41
 
45
42
 
46
43
  @dataclass
@@ -103,9 +100,9 @@ class PyMoleculeLoaderResult:
103
100
  compoundList: the compounds successfully loaded.
104
101
  errorList: an error record is added here for each one we failed to load in Sapio.
105
102
  """
106
- compoundByStr: dict[str, PyCompound] | None
107
- compoundList: list[PyCompound] | None
108
- errorList: list[ChemLoadingError] | None
103
+ compoundByStr: dict[str, PyCompound]
104
+ compoundList: list[PyCompound]
105
+ errorList: list[ChemLoadingError]
109
106
 
110
107
 
111
108
  @dataclass