smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +9 -4
- smftools/_version.py +1 -1
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +0 -2
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/fast5_to_pod5.py +4 -1
- smftools/informatics/helpers/__init__.py +3 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
- smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
- smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +29 -3
- smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
- smftools/informatics/helpers/find_conversion_sites.py +5 -4
- smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
- smftools/informatics/helpers/split_and_index_BAM.py +1 -5
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/general_plotting.py +566 -89
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +13 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +849 -43
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
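The moves above amount to a repackaging: the model code that lived under smftools/tools now sits in dedicated smftools/hmm and smftools/machine_learning subpackages, each with its own __init__.py, alongside the new smftools/config package and the top-level smftools/load_adata.py. A minimal import sketch of the 0.2.1 layout implied by this listing; which names each __init__.py actually re-exports is not visible here, so only subpackage- and module-level imports are assumed:

from smftools import hmm, machine_learning, plotting, preprocessing, readwrite
from smftools.config import experiment_config  # new config module listed above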
smftools/readwrite.py
CHANGED
@@ -123,54 +123,840 @@ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
     else:
         print('Keeping input files')

-def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./"):
+def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./uns_backups", verbose=True):
     """
-
+    Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.

-
-        adata (AnnData): The AnnData object to save.
-        path (str): Output .h5ad file path.
-        compression (str): Compression method for h5ad file.
-        backup (bool): If True, saves problematic columns to CSV files.
-        backup_dir (str): Directory to store backups if backup=True.
+    Returns a report dict and prints a summary of what was converted/backed up/skipped.
     """
-    import
+    import os, json, pickle
+    import numpy as np
     import pandas as pd
-    import
+    import warnings
+    import anndata as _ad

     os.makedirs(backup_dir, exist_ok=True)

  [... 31 removed lines (the previous safe_write_h5ad body) are not rendered in the source diff ...]
+    # report structure
+    report = {
+        "obs_converted_columns": [],
+        "obs_backed_up_columns": [],
+        "var_converted_columns": [],
+        "var_backed_up_columns": [],
+        "uns_backed_up_keys": [],
+        "uns_json_keys": [],
+        "layers_converted": [],
+        "layers_skipped": [],
+        "obsm_converted": [],
+        "obsm_skipped": [],
+        "X_replaced_or_converted": None,
+        "errors": [],
+    }
+
+    def _backup(obj, name):
+        """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
+        fname = os.path.join(backup_dir, f"{name}.pkl")
+        try:
+            with open(fname, "wb") as fh:
+                pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
+            if verbose:
+                print(f" backed up {name} -> {fname}")
+            return fname
+        except Exception as e:
+            msg = f"failed to back up {name}: {e}"
+            if verbose:
+                print(" " + msg)
+            report["errors"].append(msg)
+            return None
+
+    def _make_obs_var_safe(df: pd.DataFrame, which: str):
+        """
+        Return a sanitized copy of df where:
+        - object columns converted to strings (with backup)
+        - categorical columns' categories coerced to str (with backup)
+        """
+        df = df.copy()
+        for col in list(df.columns):
+            ser = df[col]
+            # categorical handling
+            try:
+                is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
+            except Exception:
+                is_cat = False
+
+            if is_cat:
+                try:
+                    cats = ser.cat.categories
+                    cats_str = cats.astype(str)
+                    df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
+                    if verbose:
+                        print(f" coerced categorical column '{which}.{col}' -> string categories")
+                    if which == "obs":
+                        report["obs_converted_columns"].append(col)
+                    else:
+                        report["var_converted_columns"].append(col)
+                except Exception:
+                    # backup then coerce
+                    if backup:
+                        _backup(ser, f"{which}.{col}_categorical_backup")
+                        if which == "obs":
+                            report["obs_backed_up_columns"].append(col)
+                        else:
+                            report["var_backed_up_columns"].append(col)
+                    df[col] = ser.astype(str)
+                    if verbose:
+                        print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
+                continue
+
+            # object dtype handling: try to coerce each element to string
+            try:
+                is_obj = ser.dtype == object or pd.api.types.is_object_dtype(ser.dtype)
+            except Exception:
+                is_obj = False
+
+            if is_obj:
+                # test whether converting to string succeeds for all elements
+                try:
+                    _ = np.array(ser.values.astype(str))
+                    if backup:
+                        _backup(ser.values, f"{which}.{col}_backup")
+                        if which == "obs":
+                            report["obs_backed_up_columns"].append(col)
+                        else:
+                            report["var_backed_up_columns"].append(col)
+                    df[col] = ser.values.astype(str)
+                    if verbose:
+                        print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
+                    if which == "obs":
+                        report["obs_converted_columns"].append(col)
+                    else:
+                        report["var_converted_columns"].append(col)
+                except Exception:
+                    # fallback: attempt per-element json.dumps; if fails mark as backed-up and coerce via str()
+                    convertible = True
+                    for val in ser.values:
+                        try:
+                            json.dumps(val, default=str)
+                        except Exception:
+                            convertible = False
+                            break
+                    if convertible:
+                        if backup:
+                            _backup(ser.values, f"{which}.{col}_backup")
+                            if which == "obs":
+                                report["obs_backed_up_columns"].append(col)
+                            else:
+                                report["var_backed_up_columns"].append(col)
+                        df[col] = [json.dumps(v, default=str) for v in ser.values]
+                        if verbose:
+                            print(f" json-stringified object column '{which}.{col}' (backup={backup})")
+                        if which == "obs":
+                            report["obs_converted_columns"].append(col)
+                        else:
+                            report["var_converted_columns"].append(col)
+                    else:
+                        # fallback to string repr and backup
+                        if backup:
+                            _backup(ser.values, f"{which}.{col}_backup")
+                            if which == "obs":
+                                report["obs_backed_up_columns"].append(col)
+                            else:
+                                report["var_backed_up_columns"].append(col)
+                        df[col] = ser.astype(str)
+                        if verbose:
+                            print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
+                        if which == "obs":
+                            report["obs_converted_columns"].append(col)
+                        else:
+                            report["var_converted_columns"].append(col)
+        return df
+
+    def _sanitize_uns(uns: dict):
+        """
+        For each key/value in uns:
+        - if json.dumps(value) works: keep it
+        - else: pickle value to backup dir, and add a JSON-stringified representation under key+'_json'
+        """
+        clean = {}
+        backed_up = []
+        for k, v in uns.items():
+            try:
+                json.dumps(v)
+                clean[k] = v
+            except Exception:
+                try:
+                    s = json.dumps(v, default=str)
+                    clean[k + "_json"] = s
+                    if backup:
+                        _backup(v, f"uns_{k}_backup")
+                        backed_up.append(k)
+                    if verbose:
+                        print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
+                    report["uns_json_keys"].append(k)
+                except Exception:
+                    try:
+                        if backup:
+                            _backup(v, f"uns_{k}_backup")
+                        clean[k + "_str"] = str(v)
+                        backed_up.append(k)
+                        if verbose:
+                            print(f" uns['{k}'] stored as string under '{k}_str' (backed up).")
+                        report["uns_backed_up_keys"].append(k)
+                    except Exception as e:
+                        msg = f"uns['{k}'] could not be preserved: {e}"
+                        report["errors"].append(msg)
+                        if verbose:
+                            print(" " + msg)
+        if backed_up and verbose:
+            print(f"Sanitized .uns keys (backed up): {backed_up}")
+        return clean
+
+    def _sanitize_layers_obsm(src_dict, which: str):
+        """
+        Ensure arrays in layers/obsm are numeric and non-object dtype.
+        Returns a cleaned dict suitable to pass into AnnData(...)
+        If an entry is not convertible, it is backed up & skipped.
+        """
+        cleaned = {}
+        for k, v in src_dict.items():
+            try:
+                arr = np.asarray(v)
+                if arr.dtype == object:
+                    try:
+                        arr_f = arr.astype(float)
+                        cleaned[k] = arr_f
+                        report_key = f"{which}.{k}"
+                        report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                        if verbose:
+                            print(f" {which}.{k} object array coerced to float.")
+                    except Exception:
+                        try:
+                            arr_i = arr.astype(int)
+                            cleaned[k] = arr_i
+                            report_key = f"{which}.{k}"
+                            report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                            if verbose:
+                                print(f" {which}.{k} object array coerced to int.")
+                        except Exception:
+                            if backup:
+                                _backup(v, f"{which}_{k}_backup")
+                            if which == "layers":
+                                report["layers_skipped"].append(k)
+                            else:
+                                report["obsm_skipped"].append(k)
+                            if verbose:
+                                print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
+                            continue
+                else:
+                    cleaned[k] = arr
+            except Exception as e:
+                if backup:
+                    _backup(v, f"{which}_{k}_backup")
+                if which == "layers":
+                    report["layers_skipped"].append(k)
+                else:
+                    report["obsm_skipped"].append(k)
+                msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
+                report["errors"].append(msg)
+                if verbose:
+                    print(msg)
+                continue
+        return cleaned
+
+    # ---------- sanitize obs, var ----------
+    try:
+        obs_clean = _make_obs_var_safe(adata.obs, "obs")
+    except Exception as e:
+        msg = f"Failed to sanitize obs: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        obs_clean = adata.obs.copy()
+
+    try:
+        var_clean = _make_obs_var_safe(adata.var, "var")
+    except Exception as e:
+        msg = f"Failed to sanitize var: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        var_clean = adata.var.copy()
+
+    # ---------- sanitize uns ----------
+    try:
+        uns_clean = _sanitize_uns(adata.uns)
+    except Exception as e:
+        msg = f"Failed to sanitize uns: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        uns_clean = {}
+
+    # ---------- sanitize layers and obsm ----------
+    layers_src = getattr(adata, "layers", {})
+    obsm_src = getattr(adata, "obsm", {})
+
+    try:
+        layers_clean = _sanitize_layers_obsm(layers_src, "layers")
+    except Exception as e:
+        msg = f"Failed to sanitize layers: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        layers_clean = {}
+
+    try:
+        obsm_clean = _sanitize_layers_obsm(obsm_src, "obsm")
+    except Exception as e:
+        msg = f"Failed to sanitize obsm: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        obsm_clean = {}
+
+    # ---------- handle X ----------
+    X_to_use = adata.X
+    try:
+        X_arr = np.asarray(adata.X)
+        if X_arr.dtype == object:
+            try:
+                X_to_use = X_arr.astype(float)
+                report["X_replaced_or_converted"] = "converted_to_float"
+                if verbose:
+                    print("Converted adata.X object-dtype -> float")
+            except Exception:
+                if backup:
+                    _backup(adata.X, "X_backup")
+                X_to_use = np.zeros_like(X_arr, dtype=float)
+                report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
+                if verbose:
+                    print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
+    except Exception as e:
+        msg = f"Error handling adata.X: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        X_to_use = adata.X
+
+    # ---------- build lightweight AnnData copy ----------
+    try:
+        adata_copy = _ad.AnnData(
+            X=X_to_use,
+            obs=obs_clean,
+            var=var_clean,
+            layers=layers_clean,
+            uns=uns_clean,
+            obsm=obsm_clean,
+            varm=getattr(adata, "varm", None),
+        )
+
+        # preserve names (as strings)
+        try:
+            adata_copy.obs_names = adata.obs_names.astype(str)
+            adata_copy.var_names = adata.var_names.astype(str)
+        except Exception:
+            adata_copy.obs_names = adata.obs_names
+            adata_copy.var_names = adata.var_names
+
+        # --- write
+        adata_copy.write_h5ad(path, compression=compression)
+        if verbose:
+            print(f"Saved safely to {path}")
+    except Exception as e:
+        msg = f"Failed to write h5ad: {e}"
+        report["errors"].append(msg)
+        if verbose:
+            print(msg)
+        raise
+
+    # Print a concise interactive report
+    print("\n=== safe_write_h5ad REPORT ===")
+    print(f"Saved file: {path}")
+    print(f"Adata shape: {adata.shape}")
+    if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
+        print(f"obs: converted columns -> {report['obs_converted_columns']}")
+        print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
+    else:
+        print("obs: no problematic columns found.")
+
+    if report["var_converted_columns"] or report["var_backed_up_columns"]:
+        print(f"var: converted columns -> {report['var_converted_columns']}")
+        print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
+    else:
+        print("var: no problematic columns found.")
+
+    if report["uns_json_keys"] or report["uns_backed_up_keys"]:
+        print(f".uns: jsonified keys -> {report['uns_json_keys']}")
+        print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
+    else:
+        print(".uns: no problematic keys found.")
+
+    if report["layers_converted"] or report["layers_skipped"]:
+        print(f"layers: converted -> {report['layers_converted']}")
+        print(f"layers: skipped -> {report['layers_skipped']}")
+    else:
+        print("layers: no problematic entries found.")
+
+    if report["obsm_converted"] or report["obsm_skipped"]:
+        print(f"obsm: converted -> {report['obsm_converted']}")
+        print(f"obsm: skipped -> {report['obsm_skipped']}")
+    else:
+        print("obsm: no problematic entries found.")
+
+    if report["X_replaced_or_converted"]:
+        print(f"adata.X handled: {report['X_replaced_or_converted']}")
+    else:
+        print("adata.X: no changes.")
+
+    if report["errors"]:
+        print("\nWarnings / errors encountered:")
+        for e in report["errors"]:
+            print(" -", e)
+
+    print("=== end report ===\n")
+    return report
+
+def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
+    """
+    Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
+    from the backup_dir produced during save.
+
+    Parameters
+    ----------
+    path : str
+        Path to the cleaned .h5ad produced by safe_write_h5ad.
+    backup_dir : str
+        Directory where safe_write_h5ad stored pickled backups (default "./uns_backups").
+    restore_backups : bool
+        If True, attempt to load pickled backups and restore original objects into adata.
+    re_categorize : bool
+        If True, try to coerce small unique-count string columns back into pandas.Categorical.
+    categorical_threshold : int
+        Max unique values for a column to be considered categorical for automatic recasting.
+    verbose : bool
+        Print progress/summary.
+
+    Returns
+    -------
+    (adata, report) :
+        adata : AnnData
+            The reloaded (and possibly restored) AnnData instance.
+        report : dict
+            A report describing restored items, parsed JSON keys, and any failures.
+    """
+    import os
+    import json
+    import pickle
+    import numpy as np
+    import pandas as pd
+    import anndata as _ad
+
+    report = {
+        "restored_obs_columns": [],
+        "restored_var_columns": [],
+        "restored_uns_keys": [],
+        "parsed_uns_json_keys": [],
+        "restored_layers": [],
+        "restored_obsm": [],
+        "recategorized_obs": [],
+        "recategorized_var": [],
+        "missing_backups": [],
+        "errors": [],
+    }
+
+    if verbose:
+        print(f"[safe_read_h5ad] loading {path}")
+
+    # 1) load the cleaned h5ad
+    try:
+        adata = _ad.read_h5ad(path)
+    except Exception as e:
+        raise RuntimeError(f"Failed to read h5ad at {path}: {e}")
+
+    # Ensure backup_dir exists (may be relative to cwd)
+    backup_dir = os.path.abspath(backup_dir)
+    if verbose:
+        print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
+
+    def _load_pickle_if_exists(fname):
+        if os.path.exists(fname):
+            try:
+                with open(fname, "rb") as fh:
+                    val = pickle.load(fh)
+                return val
+            except Exception as e:
+                report["errors"].append(f"Failed to load pickle {fname}: {e}")
+                if verbose:
+                    print(f" error loading {fname}: {e}")
+                return None
+        return None
+
+    # 2) Restore obs columns
+    for col in list(adata.obs.columns):
+        # Look for backup with exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
+        bname1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
+        bname2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
+        restored = False
+
+        if restore_backups:
+            val = _load_pickle_if_exists(bname2)
+            if val is not None:
+                # val may be the categorical series or categories
+                try:
+                    # If pickled numpy array or pandas Series, coerce to same index alignment
+                    if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+                        adata.obs[col] = pd.Series(val, index=adata.obs.index)
+                    else:
+                        # fallback: place pickled object directly
+                        adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+                    report["restored_obs_columns"].append((col, bname2))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
+                    restored = False
+
+            if not restored:
+                val = _load_pickle_if_exists(bname1)
+                if val is not None:
+                    try:
+                        if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+                            adata.obs[col] = pd.Series(val, index=adata.obs.index)
+                        else:
+                            adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+                        report["restored_obs_columns"].append((col, bname1))
+                        restored = True
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
+                        restored = False
+
+        # If not restored and column dtype is object but contains JSON-like strings, try json.loads per element
+        if (not restored) and (adata.obs[col].dtype == object):
+            sample_vals = adata.obs[col].dropna().astype(str).head(20).tolist()
+            looks_like_json = False
+            for sv in sample_vals:
+                svs = sv.strip()
+                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                    looks_like_json = True
+                    break
+            if looks_like_json:
+                parsed = []
+                success_parse = True
+                for v in adata.obs[col].astype(str).values:
+                    try:
+                        parsed.append(json.loads(v))
+                    except Exception:
+                        # if any element fails, don't convert whole column
+                        success_parse = False
+                        break
+                if success_parse:
+                    adata.obs[col] = pd.Series(parsed, index=adata.obs.index)
+                    report["restored_obs_columns"].append((col, "parsed_json"))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
+
+        # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
+        if (not restored) and re_categorize and adata.obs[col].dtype == object:
+            try:
+                nunique = adata.obs[col].dropna().astype(str).nunique()
+                if nunique > 0 and nunique <= categorical_threshold:
+                    # cast to category
+                    adata.obs[col] = adata.obs[col].astype(str).astype("category")
+                    report["recategorized_obs"].append(col)
+                    if verbose:
+                        print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
+            except Exception as e:
+                report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
+
+    # 3) Restore var columns (same logic)
+    for col in list(adata.var.columns):
+        bname1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+        bname2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+        restored = False
+
+        if restore_backups:
+            val = _load_pickle_if_exists(bname2)
+            if val is not None:
+                try:
+                    if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+                        adata.var[col] = pd.Series(val, index=adata.var.index)
+                    else:
+                        adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+                    report["restored_var_columns"].append((col, bname2))
+                    restored = True
+                    if verbose:
+                        print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
+                except Exception as e:
+                    report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
+
+            if not restored:
+                val = _load_pickle_if_exists(bname1)
+                if val is not None:
+                    try:
+                        if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+                            adata.var[col] = pd.Series(val, index=adata.var.index)
+                        else:
+                            adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+                        report["restored_var_columns"].append((col, bname1))
+                        restored = True
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
+
+        if (not restored) and (adata.var[col].dtype == object):
+            # try JSON parsing
+            sample_vals = adata.var[col].dropna().astype(str).head(20).tolist()
+            looks_like_json = False
+            for sv in sample_vals:
+                svs = sv.strip()
+                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                    looks_like_json = True
+                    break
+            if looks_like_json:
+                parsed = []
+                success_parse = True
+                for v in adata.var[col].astype(str).values:
+                    try:
+                        parsed.append(json.loads(v))
+                    except Exception:
+                        success_parse = False
+                        break
+                if success_parse:
+                    adata.var[col] = pd.Series(parsed, index=adata.var.index)
+                    report["restored_var_columns"].append((col, "parsed_json"))
+                    if verbose:
+                        print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
+
+        if (not restored) and re_categorize and adata.var[col].dtype == object:
+            try:
+                nunique = adata.var[col].dropna().astype(str).nunique()
+                if nunique > 0 and nunique <= categorical_threshold:
+                    adata.var[col] = adata.var[col].astype(str).astype("category")
+                    report["recategorized_var"].append(col)
+                    if verbose:
+                        print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
+            except Exception as e:
+                report["errors"].append(f"Failed to recategorize var.{col}: {e}")
+
+    # 4) Restore uns: look for uns_{k}_backup.pkl, or keys like "<k>_json"
+    uns_keys = list(adata.uns.keys())
+    # First, if we have "<k>_json", convert back into k
+    for k in uns_keys:
+        if k.endswith("_json"):
+            base = k[:-5]
+            sval = adata.uns.get(k)
+            try:
+                parsed = json.loads(sval)
+                adata.uns[base] = parsed
+                report["parsed_uns_json_keys"].append(base)
+                if verbose:
+                    print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
+                # remove the _json entry
+                try:
+                    del adata.uns[k]
+                except KeyError:
+                    pass
+            except Exception as e:
+                report["errors"].append(f"Failed to json-parse uns['{k}']: {e}")
+
+    # Now try to restore pickled backups for uns keys
+    # Look for files named uns_<key>_backup.pkl
+    # We will attempt to restore into adata.uns[key] if backup exists
+    for fname in os.listdir(backup_dir) if os.path.isdir(backup_dir) else []:
+        if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
+            continue
+        # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
+        key = fname[len("uns_"):-len("_backup.pkl")]
+        full = os.path.join(backup_dir, fname)
+        val = _load_pickle_if_exists(full)
+        if val is not None:
+            adata.uns[key] = val
+            report["restored_uns_keys"].append((key, full))
+            if verbose:
+                print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
+
+    # 5) Restore layers and obsm from backups if present
+    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+    if os.path.isdir(backup_dir):
+        for fname in os.listdir(backup_dir):
+            if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
+                layer_name = fname[len("layers_"):-len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.layers[layer_name] = np.asarray(val)
+                        report["restored_layers"].append((layer_name, full))
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
+
+            if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
+                obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
+                full = os.path.join(backup_dir, fname)
+                val = _load_pickle_if_exists(full)
+                if val is not None:
+                    try:
+                        adata.obsm[obsm_name] = np.asarray(val)
+                        report["restored_obsm"].append((obsm_name, full))
+                        if verbose:
+                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
+                    except Exception as e:
+                        report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
+
+    # 6) If restore_backups True but some expected backups missing, note them
+    if restore_backups and os.path.isdir(backup_dir):
+        # detect common expected names from obs/var/uns/layers in adata
+        expected_missing = []
+        # obs/var columns
+        for col in list(adata.obs.columns):
+            p1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
+            p2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
+            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+                # we don't require backups for every column; only record if column still looks like placeholder strings
+                if adata.obs[col].dtype == object:
+                    expected_missing.append(("obs", col))
+        for col in list(adata.var.columns):
+            p1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+            p2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+                if adata.var[col].dtype == object:
+                    expected_missing.append(("var", col))
+        # uns keys
+        for k in adata.uns.keys():
+            # if we have *_json or *_str variants we expect backups optionally
+            if k.endswith("_json") or k.endswith("_str"):
+                b = os.path.join(backup_dir, f"uns_{k[:-5]}_backup.pkl")
+                if not os.path.exists(b):
+                    report["missing_backups"].append(("uns", k))
+        if expected_missing and verbose:
+            n = len(expected_missing)
+            if verbose:
+                print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
+            # add to report
+            report["missing_backups"].extend(expected_missing)
+
+    # final summary print
+    if verbose:
+        print("\n=== safe_read_h5ad summary ===")
+        if report["restored_obs_columns"]:
+            print("Restored obs columns:", report["restored_obs_columns"])
+        if report["restored_var_columns"]:
+            print("Restored var columns:", report["restored_var_columns"])
+        if report["restored_uns_keys"]:
+            print("Restored uns keys:", report["restored_uns_keys"])
+        if report["parsed_uns_json_keys"]:
+            print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
+        if report["restored_layers"]:
+            print("Restored layers:", report["restored_layers"])
+        if report["restored_obsm"]:
+            print("Restored obsm:", report["restored_obsm"])
+        if report["recategorized_obs"] or report["recategorized_var"]:
+            print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
+        if report["missing_backups"]:
+            print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
+        if report["errors"]:
+            print("Errors encountered (see report['errors']):")
+            for e in report["errors"]:
+                print(" -", e)
+        print("=== end summary ===\n")
+
+    return adata, report
+
+
+# def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./", verbose=True):
+#     """
+#     Saves an AnnData object safely by omitting problematic columns from .obs and .var.
+#
+#     Parameters:
+#         adata (AnnData): The AnnData object to save.
+#         path (str): Output .h5ad file path.
+#         compression (str): Compression method for h5ad file.
+#         backup (bool): If True, saves problematic columns to CSV files.
+#         backup_dir (str): Directory to store backups if backup=True.
+#     """
+#     import anndata as ad
+#     import pandas as pd
+#     import os
+#     import numpy as np
+#     import json
+#
+#     os.makedirs(backup_dir, exist_ok=True)
+#
+#     def filter_df(df, df_name):
+#         bad_cols = []
+#         for col in df.columns:
+#             if df[col].dtype == 'object':
+#                 if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
+#                     bad_cols.append(col)
+#             elif pd.api.types.is_categorical_dtype(df[col]):
+#                 if not all(isinstance(x, (str, type(None))) for x in df[col].cat.categories):
+#                     bad_cols.append(col)
+#         if bad_cols and verbose:
+#             print(f"Skipping columns from {df_name}: {bad_cols}")
+#         if backup and bad_cols:
+#             df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
+#             if verbose:
+#                 print(f"Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
+#         return df.drop(columns=bad_cols)
+#
+#     def is_serializable(val):
+#         try:
+#             json.dumps(val)
+#             return True
+#         except (TypeError, OverflowError):
+#             return False
+#
+#     def clean_uns(uns_dict):
+#         clean_uns = {}
+#         bad_keys = []
+#         for k, v in uns_dict.items():
+#             if isinstance(v, (str, int, float, type(None), list, np.ndarray, pd.DataFrame, dict)):
+#                 clean_uns[k] = v
+#             elif is_serializable(v):
+#                 clean_uns[k] = v
+#             else:
+#                 bad_keys.append(k)
+#                 if backup:
+#                     try:
+#                         with open(os.path.join(backup_dir, f"uns_{k}_backup.txt"), "w") as f:
+#                             f.write(str(v))
+#                     except Exception:
+#                         pass
+#         if bad_keys and verbose:
+#             print(f"Skipping entries from .uns: {bad_keys}")
+#         return clean_uns
+#
+#     # Clean obs and var and uns
+#     obs_clean = filter_df(adata.obs, "obs")
+#     var_clean = filter_df(adata.var, "var")
+#     uns_clean = clean_uns(adata.uns)
+#
+#     # Save clean version
+#     adata_copy = ad.AnnData(
+#         X=adata.X,
+#         obs=obs_clean,
+#         var=var_clean,
+#         layers=adata.layers,
+#         uns=uns_clean,
+#         obsm=adata.obsm,
+#         varm=adata.varm
+#     )
+#
+#     adata_copy.obs_names = adata_copy.obs_names.astype(str)
+#     adata_copy.var_names = adata_copy.var_names.astype(str)
+#
+#     adata_copy.write_h5ad(path, compression=compression)
+#
+#     print(f"Saved safely to {path}")
+
+def merge_barcoded_anndatas_core(adata_single, adata_double):
     import numpy as np
     import anndata as ad

@@ -194,5 +980,25 @@ def merge_barcoded_anndatas(adata_single, adata_double):
     adata_merged.uns = {**adata_single.uns, **adata_double.uns}

     return adata_merged
-
-
+######################################################################################################
+
+### File conversion misc ###
+import argparse
+from Bio import SeqIO
+def genbank_to_gff(genbank_file, output_file, record_id):
+    with open(output_file, "w") as out:
+        for record in SeqIO.parse(genbank_file, "genbank"):
+            for feature in record.features:
+                # Skip features without location information
+                if feature.location is None:
+                    continue
+                # Extract feature information
+                start = feature.location.start + 1  # Convert to 1-based index
+                end = feature.location.end
+                strand = "+" if feature.location.strand == 1 else "-"
+                feature_type = feature.type
+                # Format attributes
+                attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
+                # Write GFF3 line
+                gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
+                out.write(gff3_line + "\n")
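The new safe_write_h5ad / safe_read_h5ad pair in readwrite.py is designed as a round trip: the writer sanitizes .obs, .var, .uns, .layers, and .obsm (pickling anything it cannot store into backup_dir when backup=True) and returns a report dict, while the reader reloads the cleaned file, restores the pickled backups, optionally re-categorizes small string columns, and returns (adata, report). A minimal usage sketch, assuming the two functions are imported from smftools.readwrite as laid out in this diff; the toy AnnData, file name, and backup directory below are illustrative only:

import numpy as np
import anndata as ad
from smftools.readwrite import safe_write_h5ad, safe_read_h5ad

adata = ad.AnnData(X=np.random.rand(10, 5))      # toy object for illustration
adata.uns["run_info"] = {"model": object()}      # a value that is not JSON-serializable

# Sanitize and write; the non-serializable .uns entry is pickled into backup_dir.
write_report = safe_write_h5ad(adata, "example.h5ad.gz", backup=True, backup_dir="./uns_backups")

# Reload and restore the backed-up objects; returns the AnnData plus a report dict.
adata_restored, read_report = safe_read_h5ad("example.h5ad.gz", backup_dir="./uns_backups")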