sapiopycommons 2024.9.20a333__tar.gz → 2024.9.30a335__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sapiopycommons might be problematic. Click here for more details.
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/PKG-INFO +1 -1
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/pyproject.toml +1 -1
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/IndigoMolecules.py +0 -1
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/Molecules.py +19 -77
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_util.py +17 -7
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/multimodal/multimodal_data.py +3 -6
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/data_type_models.py +17495 -38169
- sapiopycommons-2024.9.20a333/src/sapiopycommons/flowcyto/flow_cyto.py +0 -77
- sapiopycommons-2024.9.20a333/src/sapiopycommons/flowcyto/flowcyto_data.py +0 -75
- sapiopycommons-2024.9.20a333/tests/chem_test_curation_queue.py +0 -31
- sapiopycommons-2024.9.20a333/tests/curation_queue_test.sdf +0 -168
- sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E01_008_clean.fcs +0 -0
- sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E03_009_clean.fcs +0 -0
- sapiopycommons-2024.9.20a333/tests/flowcyto/101_DEN084Y5_15_E05_010_clean.fcs +0 -0
- sapiopycommons-2024.9.20a333/tests/flowcyto/8_color_ICS.wsp +0 -4833
- sapiopycommons-2024.9.20a333/tests/flowcyto/COVID19_W_001_O.fcs +0 -0
- sapiopycommons-2024.9.20a333/tests/flowcyto_test.py +0 -71
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/.gitignore +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/LICENSE +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/README.md +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/callbacks/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/callbacks/callback_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/column_builder.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/custom_report_builder.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/customreport/term_builder.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/datatype/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/datatype/attachment_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/experiment_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/experiment_report_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/eln/plate_designer.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/complex_data_loader.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_bridge.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_bridge_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_data_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_validator.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_writer.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/accession_service.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/aliases.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/audit_log.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/custom_report_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/exceptions.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/popup_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/sapio_links.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/storage_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/general/time_util.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/multimodal/multimodal.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/processtracking/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/processtracking/endpoints.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/recordmodel/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/recordmodel/record_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/eln_rule_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/rules/on_save_rule_handler.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/__init__.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/webhook_handlers.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/webhook/webservice_handlers.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/_do_not_add_init_py_here +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/accession_test.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/bio_reg_test.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/chem_test.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/kappa.chains.fasta +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/mafft_test.py +0 -0
- {sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/tests/test.gb +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: sapiopycommons
|
|
3
|
-
Version: 2024.9.20a333
|
|
3
|
+
Version: 2024.9.30a335
|
|
4
4
|
Summary: Official Sapio Python API Utilities Package
|
|
5
5
|
Project-URL: Homepage, https://github.com/sapiosciences
|
|
6
6
|
Author-email: Jonathan Steck <jsteck@sapiosciences.com>, Yechen Qiao <yqiao@sapiosciences.com>
|
|
@@ -9,7 +9,6 @@ indigo.setOption("ignore-stereochemistry-errors", True)
|
|
|
9
9
|
indigo.setOption("render-stereo-style", "ext")
|
|
10
10
|
indigo.setOption("aromaticity-model", "generic")
|
|
11
11
|
indigo.setOption("render-coloring", True)
|
|
12
|
-
indigo.setOption("molfile-saving-mode", "3000")
|
|
13
12
|
indigo_inchi = IndigoInchi(indigo);
|
|
14
13
|
|
|
15
14
|
|
{sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/chem/Molecules.py
RENAMED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
# Author Yechen Qiao
|
|
2
2
|
# Common Molecule Utilities for Molecule Transfers with Sapio
|
|
3
|
-
from typing import cast
|
|
4
3
|
|
|
5
4
|
from rdkit import Chem
|
|
6
5
|
from rdkit.Chem import Crippen, MolToInchi
|
|
@@ -21,25 +20,6 @@ tautomer_params.tautomerReassignStereo = False
|
|
|
21
20
|
tautomer_params.tautomerRemoveIsotopicHs = True
|
|
22
21
|
enumerator = rdMolStandardize.TautomerEnumerator(tautomer_params)
|
|
23
22
|
|
|
24
|
-
|
|
25
|
-
def get_enhanced_stereo_reg_hash(mol: Mol, enhanced_stereo: bool) -> str:
|
|
26
|
-
"""
|
|
27
|
-
Get the Registration Hash for the molecule by the current registration configuration.
|
|
28
|
-
When we are running if we are canonicalization of tautomers or cleaning up any other way, do they first before calling.
|
|
29
|
-
:param mol: The molecule to obtain hash for.
|
|
30
|
-
:param canonical_tautomer: Whether the registry system canonicalize the tautomers.
|
|
31
|
-
:param enhanced_stereo: Whether we are computing enhanced stereo at all.
|
|
32
|
-
:return: The enhanced stereo hash.
|
|
33
|
-
"""
|
|
34
|
-
if enhanced_stereo:
|
|
35
|
-
from rdkit.Chem.RegistrationHash import GetMolLayers, GetMolHash, HashScheme
|
|
36
|
-
layers = GetMolLayers(mol, enable_tautomer_hash_v2=True)
|
|
37
|
-
hash_scheme: HashScheme = HashScheme.TAUTOMER_INSENSITIVE_LAYERS
|
|
38
|
-
return GetMolHash(layers, hash_scheme=hash_scheme)
|
|
39
|
-
else:
|
|
40
|
-
return ""
|
|
41
|
-
|
|
42
|
-
|
|
43
23
|
def neutralize_atoms(mol) -> Mol:
|
|
44
24
|
"""
|
|
45
25
|
Neutralize atoms per https://baoilleach.blogspot.com/2019/12/no-charge-simple-approach-to.html
|
|
@@ -106,6 +86,7 @@ def mol_to_img(mol_str: str) -> str:
|
|
|
106
86
|
return renderer.renderToString(mol)
|
|
107
87
|
|
|
108
88
|
|
|
89
|
+
|
|
109
90
|
def mol_to_sapio_partial_pojo(mol: Mol):
|
|
110
91
|
"""
|
|
111
92
|
Get the minimum information about molecule to Sapio, just its SMILES, V3000, and image data.
|
|
@@ -115,7 +96,7 @@ def mol_to_sapio_partial_pojo(mol: Mol):
|
|
|
115
96
|
Chem.SanitizeMol(mol)
|
|
116
97
|
mol.UpdatePropertyCache()
|
|
117
98
|
smiles = Chem.MolToSmiles(mol)
|
|
118
|
-
molBlock = Chem.MolToMolBlock(mol
|
|
99
|
+
molBlock = Chem.MolToMolBlock(mol)
|
|
119
100
|
img = mol_to_img(mol)
|
|
120
101
|
molecule = dict()
|
|
121
102
|
molecule["smiles"] = smiles
|
|
@@ -124,52 +105,23 @@ def mol_to_sapio_partial_pojo(mol: Mol):
|
|
|
124
105
|
return molecule
|
|
125
106
|
|
|
126
107
|
|
|
127
|
-
def get_cxs_smiles_hash(mol: Mol, enhanced_stereo: bool) -> str:
|
|
128
|
-
"""
|
|
129
|
-
Return the SHA1 CXS Smiles hash for the canonical, isomeric CXS SMILES of the molecule.
|
|
130
|
-
"""
|
|
131
|
-
if not enhanced_stereo:
|
|
132
|
-
return ""
|
|
133
|
-
import hashlib
|
|
134
|
-
return hashlib.sha1(Chem.MolToCXSmiles(mol, canonical=True, isomericSmiles=True).encode()).hexdigest()
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
def get_has_or_group(mol: Mol, enhanced_stereo: bool) -> bool:
|
|
138
|
-
"""
|
|
139
|
-
Return true if and only if: enhanced stereochemistry is enabled and there is at least one OR group in mol.
|
|
140
|
-
"""
|
|
141
|
-
if not enhanced_stereo:
|
|
142
|
-
return False
|
|
143
|
-
from rdkit.Chem import StereoGroup_vect, STEREO_OR
|
|
144
|
-
stereo_groups: StereoGroup_vect = mol.GetStereoGroups()
|
|
145
|
-
for stereo_group in stereo_groups:
|
|
146
|
-
if stereo_group.GetGroupType() == STEREO_OR:
|
|
147
|
-
return True
|
|
148
|
-
return False
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
|
|
108
|
+
def mol_to_sapio_substance(mol: Mol, include_stereoisomers: bool = False,
|
|
152
109
|
normalize: bool = False, remove_salt: bool = False, make_images: bool = False,
|
|
153
|
-
salt_def: str | None = None, canonical_tautomer: bool = True,
|
|
154
|
-
enhanced_stereo: bool = False, remove_atom_map: bool = True):
|
|
110
|
+
salt_def: str | None = None, canonical_tautomer: bool = True):
|
|
155
111
|
"""
|
|
156
112
|
Convert a molecule in RDKit to a molecule POJO in Sapio.
|
|
157
113
|
|
|
158
114
|
:param mol: The molecule in RDKit.
|
|
115
|
+
:param include_stereoisomers: If true, will compute all stereoisomer permutations of this molecule.
|
|
159
116
|
:param normalize If true, will normalize the functional groups and return normalized result.
|
|
160
117
|
:param remove_salt If true, we will remove salts iteratively from the molecule before returning their data.
|
|
161
118
|
We will also populate desaltedList with molecules we deleted.
|
|
162
|
-
:param make_images Whether to make images as part of the result without having another script to resolve it.
|
|
163
119
|
:param salt_def: if not none, specifies custom salt to be used during the desalt process.
|
|
164
120
|
:param canonical_tautomer: if True, we will attempt to compute canonical tautomer for the molecule. Slow!
|
|
165
121
|
This is needed for a registry. Note it stops after enumeration of 1000.
|
|
166
|
-
:param enhanced_stereo: If enabled, enhanced stereo hash will be produced.
|
|
167
|
-
:param remove_atom_map: When set, clear all atom AAM maps that were set had it been merged into some reactions earlier.
|
|
168
122
|
:return: The molecule POJO for Sapio.
|
|
169
123
|
"""
|
|
170
124
|
molecule = dict()
|
|
171
|
-
if remove_atom_map:
|
|
172
|
-
[a.SetAtomMapNum(0) for a in mol.GetAtoms()]
|
|
173
125
|
Chem.SanitizeMol(mol)
|
|
174
126
|
mol.UpdatePropertyCache()
|
|
175
127
|
Chem.GetSymmSSSR(mol)
|
|
@@ -205,7 +157,7 @@ def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
|
|
|
205
157
|
exactMass = Descriptors.ExactMolWt(mol)
|
|
206
158
|
molFormula = rdMolDescriptors.CalcMolFormula(mol)
|
|
207
159
|
charge = Chem.GetFormalCharge(mol)
|
|
208
|
-
molBlock = Chem.MolToMolBlock(mol
|
|
160
|
+
molBlock = Chem.MolToMolBlock(mol)
|
|
209
161
|
|
|
210
162
|
molecule["cLogP"] = cLogP
|
|
211
163
|
molecule["tpsa"] = tpsa
|
|
@@ -229,38 +181,28 @@ def mol_to_sapio_substance(mol: Mol, include_stereoisomers=False,
|
|
|
229
181
|
# We need to test the INCHI can be loaded back to indigo.
|
|
230
182
|
indigo_mol = indigo.loadMolecule(molBlock)
|
|
231
183
|
indigo_mol.aromatize()
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
molecule["inchiKey"] = Chem.MolToInchiKey(mol_copy)
|
|
238
|
-
else:
|
|
239
|
-
indigo_inchi.resetOptions()
|
|
240
|
-
indigo_inchi_str = indigo_inchi.getInchi(indigo_mol)
|
|
241
|
-
molecule["inchi"] = indigo_inchi_str
|
|
242
|
-
indigo_inchi_key_str = indigo_inchi.getInchiKey(indigo_inchi_str)
|
|
243
|
-
molecule["inchiKey"] = indigo_inchi_key_str
|
|
184
|
+
indigo_inchi.resetOptions()
|
|
185
|
+
indigo_inchi_str = indigo_inchi.getInchi(indigo_mol)
|
|
186
|
+
molecule["inchi"] = indigo_inchi_str
|
|
187
|
+
indigo_inchi_key_str = indigo_inchi.getInchiKey(indigo_inchi_str)
|
|
188
|
+
molecule["inchiKey"] = indigo_inchi_key_str
|
|
244
189
|
molecule["smiles"] = indigo_mol.smiles()
|
|
245
|
-
molecule["reg_hash"] = get_enhanced_stereo_reg_hash(mol, enhanced_stereo=enhanced_stereo)
|
|
246
|
-
molecule["cxsmiles_hash"] = get_cxs_smiles_hash(mol, enhanced_stereo=enhanced_stereo)
|
|
247
|
-
molecule["has_or_group"] = get_has_or_group(mol, enhanced_stereo=enhanced_stereo)
|
|
248
190
|
|
|
191
|
+
if include_stereoisomers and has_chiral_centers(mol):
|
|
192
|
+
stereoisomers = find_all_possible_stereoisomers(mol, only_unassigned=False, try_embedding=False, unique=True)
|
|
193
|
+
molecule["stereoisomers"] = [mol_to_sapio_partial_pojo(x) for x in stereoisomers]
|
|
249
194
|
return molecule
|
|
250
195
|
|
|
251
196
|
|
|
252
|
-
def mol_to_sapio_compound(mol: Mol, include_stereoisomers
|
|
197
|
+
def mol_to_sapio_compound(mol: Mol, include_stereoisomers: bool = False,
|
|
253
198
|
salt_def: str | None = None, resolve_canonical: bool = True,
|
|
254
|
-
make_images: bool = False, canonical_tautomer: bool = True,
|
|
255
|
-
remove_atom_map: bool = True):
|
|
199
|
+
make_images: bool = False, canonical_tautomer: bool = True):
|
|
256
200
|
ret = dict()
|
|
257
|
-
ret['originalMol'] = mol_to_sapio_substance(mol, include_stereoisomers
|
|
201
|
+
ret['originalMol'] = mol_to_sapio_substance(mol, include_stereoisomers,
|
|
258
202
|
normalize=False, remove_salt=False, make_images=make_images,
|
|
259
|
-
canonical_tautomer=canonical_tautomer,
|
|
260
|
-
enhanced_stereo=enhanced_stereo, remove_atom_map=remove_atom_map)
|
|
203
|
+
canonical_tautomer=canonical_tautomer)
|
|
261
204
|
if resolve_canonical:
|
|
262
205
|
ret['canonicalMol'] = mol_to_sapio_substance(mol, include_stereoisomers=False,
|
|
263
206
|
normalize=True, remove_salt=True, make_images=make_images,
|
|
264
|
-
salt_def=salt_def, canonical_tautomer=canonical_tautomer,
|
|
265
|
-
enhanced_stereo=enhanced_stereo, remove_atom_map=remove_atom_map)
|
|
207
|
+
salt_def=salt_def, canonical_tautomer=canonical_tautomer)
|
|
266
208
|
return ret
|
{sapiopycommons-2024.9.20a333 → sapiopycommons-2024.9.30a335}/src/sapiopycommons/files/file_util.py
RENAMED
|
@@ -23,8 +23,8 @@ class FileUtil:
|
|
|
23
23
|
"""
|
|
24
24
|
@staticmethod
|
|
25
25
|
def tokenize_csv(file_bytes: bytes, required_headers: list[str] | None = None, header_row_index: int | None = 0,
|
|
26
|
-
seperator: str = ",", *, encoding: str | None = None,
|
|
27
|
-
|
|
26
|
+
seperator: str = ",", *, encoding: str | None = None, encoding_error: str | None = "strict",
|
|
27
|
+
exception_on_empty: bool = True) -> tuple[list[dict[str, str]], list[list[str]]]:
|
|
28
28
|
"""
|
|
29
29
|
Tokenize a CSV file. The provided file must be uniform. That is, if row 1 has 10 cells, all the rows in the file
|
|
30
30
|
must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
|
|
@@ -39,7 +39,11 @@ class FileUtil:
|
|
|
39
39
|
:param seperator: The character that separates cells in the table.
|
|
40
40
|
:param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
|
|
41
41
|
contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
|
|
42
|
-
ISO-8859-1 as the encoding.
|
|
42
|
+
ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
|
|
43
|
+
:param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
|
|
44
|
+
is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
|
|
45
|
+
characters or "replace" to replace invalid characters with a ? character. For a full list of options, see
|
|
46
|
+
https://docs.python.org/3/library/codecs.html#error-handlers
|
|
43
47
|
:param exception_on_empty: Throw a user error exception if the provided file bytes result in an empty list in
|
|
44
48
|
the first element of the returned tuple.
|
|
45
49
|
:return: The CSV parsed into a list of dicts where each dict is a row, mapping the headers to the cells for
|
|
@@ -49,7 +53,7 @@ class FileUtil:
|
|
|
49
53
|
# Parse the file bytes into two DataFrames. The first is metadata of the file located above the header row,
|
|
50
54
|
# while the second is the body of the file below the header row.
|
|
51
55
|
file_body, file_metadata = FileUtil.csv_to_data_frames(file_bytes, header_row_index, seperator,
|
|
52
|
-
encoding=encoding)
|
|
56
|
+
encoding=encoding, encoding_error=encoding_error)
|
|
53
57
|
# Parse the metadata from above the header row index into a list of lists.
|
|
54
58
|
metadata: list[list[str]] = FileUtil.data_frame_to_lists(file_metadata)
|
|
55
59
|
# Parse the data from the file body into a list of dicts.
|
|
@@ -90,7 +94,8 @@ class FileUtil:
|
|
|
90
94
|
|
|
91
95
|
@staticmethod
|
|
92
96
|
def csv_to_data_frames(file_bytes: bytes, header_row_index: int | None = 0, seperator: str = ",",
|
|
93
|
-
*, encoding: str | None = None) -> tuple[DataFrame, DataFrame | None]:
|
|
97
|
+
*, encoding: str | None = None, encoding_error: str | None = "strict") \
|
|
98
|
+
-> tuple[DataFrame, DataFrame | None]:
|
|
94
99
|
"""
|
|
95
100
|
Parse the file bytes for a CSV into DataFrames. The provided file must be uniform. That is, if row 1 has 10
|
|
96
101
|
cells, all the rows in the file must have 10 cells. Otherwise, the Pandas parser throws a tokenizer exception.
|
|
@@ -103,7 +108,11 @@ class FileUtil:
|
|
|
103
108
|
:param seperator: The character that separates cells in the table.
|
|
104
109
|
:param encoding: The encoding used to read the given file bytes. If not provided, uses utf-8. If your file
|
|
105
110
|
contains a non-utf-8 character, then a UnicodeDecodeError will be thrown. If this happens, consider using
|
|
106
|
-
ISO-8859-1 as the encoding.
|
|
111
|
+
ISO-8859-1 as the encoding, or investigate what encoding would handle the characters in your file.
|
|
112
|
+
:param encoding_error: The error handling behavior if an encoding error is encountered. By default, the behavior
|
|
113
|
+
is "strict", meaning that encoding errors raise an exception. Change this to "ignore" to skip over invalid
|
|
114
|
+
characters or "replace" to replace invalid characters with a ? character. For a full list of options, see
|
|
115
|
+
https://docs.python.org/3/library/codecs.html#error-handlers
|
|
107
116
|
:return: A tuple of two DataFrames. The first is the frame for the CSV table body, while the second is for the
|
|
108
117
|
metadata from above the header row, or None if there is no metadata.
|
|
109
118
|
"""
|
|
@@ -115,7 +124,8 @@ class FileUtil:
|
|
|
115
124
|
# can throw off the header row index.
|
|
116
125
|
file_metadata = pandas.read_csv(file_io, header=None, dtype=dtype(str),
|
|
117
126
|
skiprows=lambda x: x >= header_row_index,
|
|
118
|
-
skip_blank_lines=False, sep=seperator, encoding=encoding)
|
|
127
|
+
skip_blank_lines=False, sep=seperator, encoding=encoding,
|
|
128
|
+
encoding_errors=encoding_error)
|
|
119
129
|
with io.BytesIO(file_bytes) as file_io:
|
|
120
130
|
# The use of the dtype argument is to ensure that everything from the file gets read as a string. Added
|
|
121
131
|
# because some numerical values would get ".0" appended to them, even when casting the DataFrame cell to a
|
|
@@ -38,9 +38,6 @@ class PyMolecule:
|
|
|
38
38
|
normError: str | None
|
|
39
39
|
desaltError: str | None
|
|
40
40
|
desaltedList: list[str] | None
|
|
41
|
-
registrationHash: str | None
|
|
42
|
-
hasOrGroup: bool
|
|
43
|
-
CXSMILESHash: str | None
|
|
44
41
|
|
|
45
42
|
|
|
46
43
|
@dataclass
|
|
@@ -103,9 +100,9 @@ class PyMoleculeLoaderResult:
|
|
|
103
100
|
compoundList: the compounds successfully loaded.
|
|
104
101
|
errorList: an error record is added here for each one we failed to load in Sapio.
|
|
105
102
|
"""
|
|
106
|
-
compoundByStr: dict[str, PyCompound]
|
|
107
|
-
compoundList: list[PyCompound]
|
|
108
|
-
errorList: list[ChemLoadingError]
|
|
103
|
+
compoundByStr: dict[str, PyCompound]
|
|
104
|
+
compoundList: list[PyCompound]
|
|
105
|
+
errorList: list[ChemLoadingError]
|
|
109
106
|
|
|
110
107
|
|
|
111
108
|
@dataclass
|