smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. smftools/__init__.py +9 -4
  2. smftools/_version.py +1 -1
  3. smftools/cli.py +184 -0
  4. smftools/config/__init__.py +1 -0
  5. smftools/config/conversion.yaml +33 -0
  6. smftools/config/deaminase.yaml +56 -0
  7. smftools/config/default.yaml +253 -0
  8. smftools/config/direct.yaml +17 -0
  9. smftools/config/experiment_config.py +1191 -0
  10. smftools/hmm/HMM.py +1576 -0
  11. smftools/hmm/__init__.py +20 -0
  12. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  13. smftools/hmm/call_hmm_peaks.py +106 -0
  14. smftools/{tools → hmm}/display_hmm.py +3 -3
  15. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  16. smftools/{tools → hmm}/train_hmm.py +1 -1
  17. smftools/informatics/__init__.py +0 -2
  18. smftools/informatics/archived/deaminase_smf.py +132 -0
  19. smftools/informatics/fast5_to_pod5.py +4 -1
  20. smftools/informatics/helpers/__init__.py +3 -4
  21. smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
  22. smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
  23. smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
  24. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
  25. smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
  26. smftools/informatics/helpers/discover_input_files.py +100 -0
  27. smftools/informatics/helpers/extract_base_identities.py +29 -3
  28. smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
  29. smftools/informatics/helpers/find_conversion_sites.py +5 -4
  30. smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
  31. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  32. smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
  33. smftools/informatics/helpers/split_and_index_BAM.py +1 -5
  34. smftools/load_adata.py +1346 -0
  35. smftools/machine_learning/__init__.py +12 -0
  36. smftools/machine_learning/data/__init__.py +2 -0
  37. smftools/machine_learning/data/anndata_data_module.py +234 -0
  38. smftools/machine_learning/evaluation/__init__.py +2 -0
  39. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  40. smftools/machine_learning/evaluation/evaluators.py +223 -0
  41. smftools/machine_learning/inference/__init__.py +3 -0
  42. smftools/machine_learning/inference/inference_utils.py +27 -0
  43. smftools/machine_learning/inference/lightning_inference.py +68 -0
  44. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  45. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  46. smftools/machine_learning/models/base.py +295 -0
  47. smftools/machine_learning/models/cnn.py +138 -0
  48. smftools/machine_learning/models/lightning_base.py +345 -0
  49. smftools/machine_learning/models/mlp.py +26 -0
  50. smftools/{tools → machine_learning}/models/positional.py +3 -2
  51. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  52. smftools/machine_learning/models/sklearn_models.py +273 -0
  53. smftools/machine_learning/models/transformer.py +303 -0
  54. smftools/machine_learning/training/__init__.py +2 -0
  55. smftools/machine_learning/training/train_lightning_model.py +135 -0
  56. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  57. smftools/plotting/__init__.py +4 -1
  58. smftools/plotting/autocorrelation_plotting.py +611 -0
  59. smftools/plotting/general_plotting.py +566 -89
  60. smftools/plotting/hmm_plotting.py +260 -0
  61. smftools/plotting/qc_plotting.py +270 -0
  62. smftools/preprocessing/__init__.py +13 -8
  63. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  64. smftools/preprocessing/append_base_context.py +122 -0
  65. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  66. smftools/preprocessing/calculate_complexity_II.py +248 -0
  67. smftools/preprocessing/calculate_coverage.py +10 -1
  68. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  69. smftools/preprocessing/clean_NaN.py +17 -1
  70. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  71. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  72. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  73. smftools/preprocessing/invert_adata.py +12 -5
  74. smftools/preprocessing/load_sample_sheet.py +19 -4
  75. smftools/readwrite.py +849 -43
  76. smftools/tools/__init__.py +3 -32
  77. smftools/tools/calculate_umap.py +5 -5
  78. smftools/tools/general_tools.py +3 -3
  79. smftools/tools/position_stats.py +468 -106
  80. smftools/tools/read_stats.py +115 -1
  81. smftools/tools/spatial_autocorrelation.py +562 -0
  82. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
  83. smftools-0.2.1.dist-info/RECORD +161 -0
  84. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  85. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  86. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  87. smftools/informatics/load_adata.py +0 -182
  88. smftools/preprocessing/append_C_context.py +0 -82
  89. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  90. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  91. smftools/preprocessing/filter_reads_on_length.py +0 -51
  92. smftools/tools/call_hmm_peaks.py +0 -105
  93. smftools/tools/data/__init__.py +0 -2
  94. smftools/tools/data/anndata_data_module.py +0 -90
  95. smftools/tools/evaluation/__init__.py +0 -0
  96. smftools/tools/inference/__init__.py +0 -1
  97. smftools/tools/inference/lightning_inference.py +0 -41
  98. smftools/tools/models/base.py +0 -14
  99. smftools/tools/models/cnn.py +0 -34
  100. smftools/tools/models/lightning_base.py +0 -41
  101. smftools/tools/models/mlp.py +0 -17
  102. smftools/tools/models/sklearn_models.py +0 -40
  103. smftools/tools/models/transformer.py +0 -133
  104. smftools/tools/training/__init__.py +0 -1
  105. smftools/tools/training/train_lightning_model.py +0 -47
  106. smftools-0.1.7.dist-info/RECORD +0 -136
  107. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  108. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  109. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  110. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  111. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  112. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  113. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  114. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  115. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  116. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  117. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  118. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  119. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  120. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
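The file moves above amount to a package reorganization: the HMM code leaves smftools.tools for a new smftools.hmm subpackage, and the model/training/inference code moves into smftools.machine_learning. A minimal sketch of how imports would presumably need updating; the module paths are inferred from the renames listed above and are not verified against the released wheel:

    # 0.1.7 (old layout)
    # from smftools.tools.apply_hmm_batched import ...
    # from smftools.tools.models.rnn import ...

    # 0.2.1 (assumed layout, following the {tools -> hmm} and {tools -> machine_learning} moves)
    # from smftools.hmm.apply_hmm_batched import ...
    # from smftools.machine_learning.models.rnn import ...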
smftools/readwrite.py CHANGED
@@ -123,54 +123,840 @@ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
  else:
  print('Keeping input files')

- def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./"):
+ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./uns_backups", verbose=True):
  """
- Saves an AnnData object safely by omitting problematic columns from .obs and .var.
+ Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.

- Parameters:
- adata (AnnData): The AnnData object to save.
- path (str): Output .h5ad file path.
- compression (str): Compression method for h5ad file.
- backup (bool): If True, saves problematic columns to CSV files.
- backup_dir (str): Directory to store backups if backup=True.
+ Returns a report dict and prints a summary of what was converted/backed up/skipped.
  """
- import anndata as ad
+ import os, json, pickle
+ import numpy as np
  import pandas as pd
- import os
+ import warnings
+ import anndata as _ad

  os.makedirs(backup_dir, exist_ok=True)

- def filter_df(df, df_name):
- bad_cols = []
- for col in df.columns:
- if df[col].dtype == 'object':
- if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
- bad_cols.append(col)
- if bad_cols:
- print(f"⚠️ Skipping columns from {df_name}: {bad_cols}")
- if backup:
- df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
- print(f"📝 Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
- return df.drop(columns=bad_cols)
-
- # Clean obs and var
- obs_clean = filter_df(adata.obs, "obs")
- var_clean = filter_df(adata.var, "var")
-
- # Save clean version
- adata_copy = ad.AnnData(
- X=adata.X,
- obs=obs_clean,
- var=var_clean,
- layers=adata.layers,
- uns=adata.uns,
- obsm=adata.obsm,
- varm=adata.varm
- )
- adata_copy.write_h5ad(path, compression=compression)
- print(f" Saved safely to {path}")
-
- def merge_barcoded_anndatas(adata_single, adata_double):
+ # report structure
+ report = {
+ "obs_converted_columns": [],
+ "obs_backed_up_columns": [],
+ "var_converted_columns": [],
+ "var_backed_up_columns": [],
+ "uns_backed_up_keys": [],
+ "uns_json_keys": [],
+ "layers_converted": [],
+ "layers_skipped": [],
+ "obsm_converted": [],
+ "obsm_skipped": [],
+ "X_replaced_or_converted": None,
+ "errors": [],
+ }
+
+ def _backup(obj, name):
+ """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
+ fname = os.path.join(backup_dir, f"{name}.pkl")
+ try:
+ with open(fname, "wb") as fh:
+ pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
+ if verbose:
+ print(f" backed up {name} -> {fname}")
+ return fname
+ except Exception as e:
+ msg = f"failed to back up {name}: {e}"
+ if verbose:
+ print(" " + msg)
+ report["errors"].append(msg)
+ return None
+
+ def _make_obs_var_safe(df: pd.DataFrame, which: str):
+ """
+ Return a sanitized copy of df where:
+ - object columns converted to strings (with backup)
+ - categorical columns' categories coerced to str (with backup)
+ """
+ df = df.copy()
+ for col in list(df.columns):
+ ser = df[col]
+ # categorical handling
+ try:
+ is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
+ except Exception:
+ is_cat = False
+
+ if is_cat:
+ try:
+ cats = ser.cat.categories
+ cats_str = cats.astype(str)
+ df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
+ if verbose:
+ print(f" coerced categorical column '{which}.{col}' -> string categories")
+ if which == "obs":
+ report["obs_converted_columns"].append(col)
+ else:
+ report["var_converted_columns"].append(col)
+ except Exception:
+ # backup then coerce
+ if backup:
+ _backup(ser, f"{which}.{col}_categorical_backup")
+ if which == "obs":
+ report["obs_backed_up_columns"].append(col)
+ else:
+ report["var_backed_up_columns"].append(col)
+ df[col] = ser.astype(str)
+ if verbose:
+ print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
+ continue
+
+ # object dtype handling: try to coerce each element to string
+ try:
+ is_obj = ser.dtype == object or pd.api.types.is_object_dtype(ser.dtype)
+ except Exception:
+ is_obj = False
+
+ if is_obj:
+ # test whether converting to string succeeds for all elements
+ try:
+ _ = np.array(ser.values.astype(str))
+ if backup:
+ _backup(ser.values, f"{which}.{col}_backup")
+ if which == "obs":
+ report["obs_backed_up_columns"].append(col)
+ else:
+ report["var_backed_up_columns"].append(col)
+ df[col] = ser.values.astype(str)
+ if verbose:
+ print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
+ if which == "obs":
+ report["obs_converted_columns"].append(col)
+ else:
+ report["var_converted_columns"].append(col)
+ except Exception:
+ # fallback: attempt per-element json.dumps; if fails mark as backed-up and coerce via str()
+ convertible = True
+ for val in ser.values:
+ try:
+ json.dumps(val, default=str)
+ except Exception:
+ convertible = False
+ break
+ if convertible:
+ if backup:
+ _backup(ser.values, f"{which}.{col}_backup")
+ if which == "obs":
+ report["obs_backed_up_columns"].append(col)
+ else:
+ report["var_backed_up_columns"].append(col)
+ df[col] = [json.dumps(v, default=str) for v in ser.values]
+ if verbose:
+ print(f" json-stringified object column '{which}.{col}' (backup={backup})")
+ if which == "obs":
+ report["obs_converted_columns"].append(col)
+ else:
+ report["var_converted_columns"].append(col)
+ else:
+ # fallback to string repr and backup
+ if backup:
+ _backup(ser.values, f"{which}.{col}_backup")
+ if which == "obs":
+ report["obs_backed_up_columns"].append(col)
+ else:
+ report["var_backed_up_columns"].append(col)
+ df[col] = ser.astype(str)
+ if verbose:
+ print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
+ if which == "obs":
+ report["obs_converted_columns"].append(col)
+ else:
+ report["var_converted_columns"].append(col)
+ return df
+
+ def _sanitize_uns(uns: dict):
+ """
+ For each key/value in uns:
+ - if json.dumps(value) works: keep it
+ - else: pickle value to backup dir, and add a JSON-stringified representation under key+'_json'
+ """
+ clean = {}
+ backed_up = []
+ for k, v in uns.items():
+ try:
+ json.dumps(v)
+ clean[k] = v
+ except Exception:
+ try:
+ s = json.dumps(v, default=str)
+ clean[k + "_json"] = s
+ if backup:
+ _backup(v, f"uns_{k}_backup")
+ backed_up.append(k)
+ if verbose:
+ print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
+ report["uns_json_keys"].append(k)
+ except Exception:
+ try:
+ if backup:
+ _backup(v, f"uns_{k}_backup")
+ clean[k + "_str"] = str(v)
+ backed_up.append(k)
+ if verbose:
+ print(f" uns['{k}'] stored as string under '{k}_str' (backed up).")
+ report["uns_backed_up_keys"].append(k)
+ except Exception as e:
+ msg = f"uns['{k}'] could not be preserved: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(" " + msg)
+ if backed_up and verbose:
+ print(f"Sanitized .uns keys (backed up): {backed_up}")
+ return clean
+
+ def _sanitize_layers_obsm(src_dict, which: str):
+ """
+ Ensure arrays in layers/obsm are numeric and non-object dtype.
+ Returns a cleaned dict suitable to pass into AnnData(...)
+ If an entry is not convertible, it is backed up & skipped.
+ """
+ cleaned = {}
+ for k, v in src_dict.items():
+ try:
+ arr = np.asarray(v)
+ if arr.dtype == object:
+ try:
+ arr_f = arr.astype(float)
+ cleaned[k] = arr_f
+ report_key = f"{which}.{k}"
+ report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+ if verbose:
+ print(f" {which}.{k} object array coerced to float.")
+ except Exception:
+ try:
+ arr_i = arr.astype(int)
+ cleaned[k] = arr_i
+ report_key = f"{which}.{k}"
+ report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+ if verbose:
+ print(f" {which}.{k} object array coerced to int.")
+ except Exception:
+ if backup:
+ _backup(v, f"{which}_{k}_backup")
+ if which == "layers":
+ report["layers_skipped"].append(k)
+ else:
+ report["obsm_skipped"].append(k)
+ if verbose:
+ print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
+ continue
+ else:
+ cleaned[k] = arr
+ except Exception as e:
+ if backup:
+ _backup(v, f"{which}_{k}_backup")
+ if which == "layers":
+ report["layers_skipped"].append(k)
+ else:
+ report["obsm_skipped"].append(k)
+ msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ continue
+ return cleaned
+
+ # ---------- sanitize obs, var ----------
+ try:
+ obs_clean = _make_obs_var_safe(adata.obs, "obs")
+ except Exception as e:
+ msg = f"Failed to sanitize obs: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ obs_clean = adata.obs.copy()
+
+ try:
+ var_clean = _make_obs_var_safe(adata.var, "var")
+ except Exception as e:
+ msg = f"Failed to sanitize var: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ var_clean = adata.var.copy()
+
+ # ---------- sanitize uns ----------
+ try:
+ uns_clean = _sanitize_uns(adata.uns)
+ except Exception as e:
+ msg = f"Failed to sanitize uns: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ uns_clean = {}
+
+ # ---------- sanitize layers and obsm ----------
+ layers_src = getattr(adata, "layers", {})
+ obsm_src = getattr(adata, "obsm", {})
+
+ try:
+ layers_clean = _sanitize_layers_obsm(layers_src, "layers")
+ except Exception as e:
+ msg = f"Failed to sanitize layers: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ layers_clean = {}
+
+ try:
+ obsm_clean = _sanitize_layers_obsm(obsm_src, "obsm")
+ except Exception as e:
+ msg = f"Failed to sanitize obsm: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ obsm_clean = {}
+
+ # ---------- handle X ----------
+ X_to_use = adata.X
+ try:
+ X_arr = np.asarray(adata.X)
+ if X_arr.dtype == object:
+ try:
+ X_to_use = X_arr.astype(float)
+ report["X_replaced_or_converted"] = "converted_to_float"
+ if verbose:
+ print("Converted adata.X object-dtype -> float")
+ except Exception:
+ if backup:
+ _backup(adata.X, "X_backup")
+ X_to_use = np.zeros_like(X_arr, dtype=float)
+ report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
+ if verbose:
+ print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
+ except Exception as e:
+ msg = f"Error handling adata.X: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ X_to_use = adata.X
+
+ # ---------- build lightweight AnnData copy ----------
+ try:
+ adata_copy = _ad.AnnData(
+ X=X_to_use,
+ obs=obs_clean,
+ var=var_clean,
+ layers=layers_clean,
+ uns=uns_clean,
+ obsm=obsm_clean,
+ varm=getattr(adata, "varm", None),
+ )
+
+ # preserve names (as strings)
+ try:
+ adata_copy.obs_names = adata.obs_names.astype(str)
+ adata_copy.var_names = adata.var_names.astype(str)
+ except Exception:
+ adata_copy.obs_names = adata.obs_names
+ adata_copy.var_names = adata.var_names
+
+ # --- write
+ adata_copy.write_h5ad(path, compression=compression)
+ if verbose:
+ print(f"Saved safely to {path}")
+ except Exception as e:
+ msg = f"Failed to write h5ad: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ raise
+
+ # Print a concise interactive report
+ print("\n=== safe_write_h5ad REPORT ===")
+ print(f"Saved file: {path}")
+ print(f"Adata shape: {adata.shape}")
+ if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
+ print(f"obs: converted columns -> {report['obs_converted_columns']}")
+ print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
+ else:
+ print("obs: no problematic columns found.")
+
+ if report["var_converted_columns"] or report["var_backed_up_columns"]:
+ print(f"var: converted columns -> {report['var_converted_columns']}")
+ print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
+ else:
+ print("var: no problematic columns found.")
+
+ if report["uns_json_keys"] or report["uns_backed_up_keys"]:
+ print(f".uns: jsonified keys -> {report['uns_json_keys']}")
+ print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
+ else:
+ print(".uns: no problematic keys found.")
+
+ if report["layers_converted"] or report["layers_skipped"]:
+ print(f"layers: converted -> {report['layers_converted']}")
+ print(f"layers: skipped -> {report['layers_skipped']}")
+ else:
+ print("layers: no problematic entries found.")
+
+ if report["obsm_converted"] or report["obsm_skipped"]:
+ print(f"obsm: converted -> {report['obsm_converted']}")
+ print(f"obsm: skipped -> {report['obsm_skipped']}")
+ else:
+ print("obsm: no problematic entries found.")
+
+ if report["X_replaced_or_converted"]:
+ print(f"adata.X handled: {report['X_replaced_or_converted']}")
+ else:
+ print("adata.X: no changes.")
+
+ if report["errors"]:
+ print("\nWarnings / errors encountered:")
+ for e in report["errors"]:
+ print(" -", e)
+
+ print("=== end report ===\n")
+ return report
+
+ def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
+ """
+ Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
+ from the backup_dir produced during save.
+
+ Parameters
+ ----------
+ path : str
+ Path to the cleaned .h5ad produced by safe_write_h5ad.
+ backup_dir : str
+ Directory where safe_write_h5ad stored pickled backups (default "./uns_backups").
+ restore_backups : bool
+ If True, attempt to load pickled backups and restore original objects into adata.
+ re_categorize : bool
+ If True, try to coerce small unique-count string columns back into pandas.Categorical.
+ categorical_threshold : int
+ Max unique values for a column to be considered categorical for automatic recasting.
+ verbose : bool
+ Print progress/summary.
+
+ Returns
+ -------
+ (adata, report) :
+ adata : AnnData
+ The reloaded (and possibly restored) AnnData instance.
+ report : dict
+ A report describing restored items, parsed JSON keys, and any failures.
+ """
+ import os
+ import json
+ import pickle
+ import numpy as np
+ import pandas as pd
+ import anndata as _ad
+
+ report = {
+ "restored_obs_columns": [],
+ "restored_var_columns": [],
+ "restored_uns_keys": [],
+ "parsed_uns_json_keys": [],
+ "restored_layers": [],
+ "restored_obsm": [],
+ "recategorized_obs": [],
+ "recategorized_var": [],
+ "missing_backups": [],
+ "errors": [],
+ }
+
+ if verbose:
+ print(f"[safe_read_h5ad] loading {path}")
+
+ # 1) load the cleaned h5ad
+ try:
+ adata = _ad.read_h5ad(path)
+ except Exception as e:
+ raise RuntimeError(f"Failed to read h5ad at {path}: {e}")
+
+ # Ensure backup_dir exists (may be relative to cwd)
+ backup_dir = os.path.abspath(backup_dir)
+ if verbose:
+ print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
+
+ def _load_pickle_if_exists(fname):
+ if os.path.exists(fname):
+ try:
+ with open(fname, "rb") as fh:
+ val = pickle.load(fh)
+ return val
+ except Exception as e:
+ report["errors"].append(f"Failed to load pickle {fname}: {e}")
+ if verbose:
+ print(f" error loading {fname}: {e}")
+ return None
+ return None
+
+ # 2) Restore obs columns
+ for col in list(adata.obs.columns):
+ # Look for backup with exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
+ bname1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
+ bname2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
+ restored = False
+
+ if restore_backups:
+ val = _load_pickle_if_exists(bname2)
+ if val is not None:
+ # val may be the categorical series or categories
+ try:
+ # If pickled numpy array or pandas Series, coerce to same index alignment
+ if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+ adata.obs[col] = pd.Series(val, index=adata.obs.index)
+ else:
+ # fallback: place pickled object directly
+ adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+ report["restored_obs_columns"].append((col, bname2))
+ restored = True
+ if verbose:
+ print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
+ restored = False
+
+ if not restored:
+ val = _load_pickle_if_exists(bname1)
+ if val is not None:
+ try:
+ if hasattr(val, "shape") and (len(val) == adata.shape[0]):
+ adata.obs[col] = pd.Series(val, index=adata.obs.index)
+ else:
+ adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+ report["restored_obs_columns"].append((col, bname1))
+ restored = True
+ if verbose:
+ print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
+ restored = False
+
+ # If not restored and column dtype is object but contains JSON-like strings, try json.loads per element
+ if (not restored) and (adata.obs[col].dtype == object):
+ sample_vals = adata.obs[col].dropna().astype(str).head(20).tolist()
+ looks_like_json = False
+ for sv in sample_vals:
+ svs = sv.strip()
+ if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+ looks_like_json = True
+ break
+ if looks_like_json:
+ parsed = []
+ success_parse = True
+ for v in adata.obs[col].astype(str).values:
+ try:
+ parsed.append(json.loads(v))
+ except Exception:
+ # if any element fails, don't convert whole column
+ success_parse = False
+ break
+ if success_parse:
+ adata.obs[col] = pd.Series(parsed, index=adata.obs.index)
+ report["restored_obs_columns"].append((col, "parsed_json"))
+ restored = True
+ if verbose:
+ print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
+
+ # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
+ if (not restored) and re_categorize and adata.obs[col].dtype == object:
+ try:
+ nunique = adata.obs[col].dropna().astype(str).nunique()
+ if nunique > 0 and nunique <= categorical_threshold:
+ # cast to category
+ adata.obs[col] = adata.obs[col].astype(str).astype("category")
+ report["recategorized_obs"].append(col)
+ if verbose:
+ print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
+ except Exception as e:
+ report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
+
+ # 3) Restore var columns (same logic)
+ for col in list(adata.var.columns):
+ bname1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+ bname2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+ restored = False
+
+ if restore_backups:
+ val = _load_pickle_if_exists(bname2)
+ if val is not None:
+ try:
+ if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+ adata.var[col] = pd.Series(val, index=adata.var.index)
+ else:
+ adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+ report["restored_var_columns"].append((col, bname2))
+ restored = True
+ if verbose:
+ print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
+
+ if not restored:
+ val = _load_pickle_if_exists(bname1)
+ if val is not None:
+ try:
+ if hasattr(val, "shape") and (len(val) == adata.shape[1]):
+ adata.var[col] = pd.Series(val, index=adata.var.index)
+ else:
+ adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+ report["restored_var_columns"].append((col, bname1))
+ restored = True
+ if verbose:
+ print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
+
+ if (not restored) and (adata.var[col].dtype == object):
+ # try JSON parsing
+ sample_vals = adata.var[col].dropna().astype(str).head(20).tolist()
+ looks_like_json = False
+ for sv in sample_vals:
+ svs = sv.strip()
+ if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+ looks_like_json = True
+ break
+ if looks_like_json:
+ parsed = []
+ success_parse = True
+ for v in adata.var[col].astype(str).values:
+ try:
+ parsed.append(json.loads(v))
+ except Exception:
+ success_parse = False
+ break
+ if success_parse:
+ adata.var[col] = pd.Series(parsed, index=adata.var.index)
+ report["restored_var_columns"].append((col, "parsed_json"))
+ if verbose:
+ print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
+
+ if (not restored) and re_categorize and adata.var[col].dtype == object:
+ try:
+ nunique = adata.var[col].dropna().astype(str).nunique()
+ if nunique > 0 and nunique <= categorical_threshold:
+ adata.var[col] = adata.var[col].astype(str).astype("category")
+ report["recategorized_var"].append(col)
+ if verbose:
+ print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
+ except Exception as e:
+ report["errors"].append(f"Failed to recategorize var.{col}: {e}")
+
+ # 4) Restore uns: look for uns_{k}_backup.pkl, or keys like "<k>_json"
+ uns_keys = list(adata.uns.keys())
+ # First, if we have "<k>_json", convert back into k
+ for k in uns_keys:
+ if k.endswith("_json"):
+ base = k[:-5]
+ sval = adata.uns.get(k)
+ try:
+ parsed = json.loads(sval)
+ adata.uns[base] = parsed
+ report["parsed_uns_json_keys"].append(base)
+ if verbose:
+ print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
+ # remove the _json entry
+ try:
+ del adata.uns[k]
+ except KeyError:
+ pass
+ except Exception as e:
+ report["errors"].append(f"Failed to json-parse uns['{k}']: {e}")
+
+ # Now try to restore pickled backups for uns keys
+ # Look for files named uns_<key>_backup.pkl
+ # We will attempt to restore into adata.uns[key] if backup exists
+ for fname in os.listdir(backup_dir) if os.path.isdir(backup_dir) else []:
+ if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
+ continue
+ # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
+ key = fname[len("uns_"):-len("_backup.pkl")]
+ full = os.path.join(backup_dir, fname)
+ val = _load_pickle_if_exists(full)
+ if val is not None:
+ adata.uns[key] = val
+ report["restored_uns_keys"].append((key, full))
+ if verbose:
+ print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
+
+ # 5) Restore layers and obsm from backups if present
+ # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
+ if os.path.isdir(backup_dir):
+ for fname in os.listdir(backup_dir):
+ if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
+ layer_name = fname[len("layers_"):-len("_backup.pkl")]
+ full = os.path.join(backup_dir, fname)
+ val = _load_pickle_if_exists(full)
+ if val is not None:
+ try:
+ adata.layers[layer_name] = np.asarray(val)
+ report["restored_layers"].append((layer_name, full))
+ if verbose:
+ print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
+
+ if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
+ obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
+ full = os.path.join(backup_dir, fname)
+ val = _load_pickle_if_exists(full)
+ if val is not None:
+ try:
+ adata.obsm[obsm_name] = np.asarray(val)
+ report["restored_obsm"].append((obsm_name, full))
+ if verbose:
+ print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
+ except Exception as e:
+ report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
+
+ # 6) If restore_backups True but some expected backups missing, note them
+ if restore_backups and os.path.isdir(backup_dir):
+ # detect common expected names from obs/var/uns/layers in adata
+ expected_missing = []
+ # obs/var columns
+ for col in list(adata.obs.columns):
+ p1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
+ p2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
+ if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+ # we don't require backups for every column; only record if column still looks like placeholder strings
+ if adata.obs[col].dtype == object:
+ expected_missing.append(("obs", col))
+ for col in list(adata.var.columns):
+ p1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
+ p2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
+ if (not os.path.exists(p1)) and (not os.path.exists(p2)):
+ if adata.var[col].dtype == object:
+ expected_missing.append(("var", col))
+ # uns keys
+ for k in adata.uns.keys():
+ # if we have *_json or *_str variants we expect backups optionally
+ if k.endswith("_json") or k.endswith("_str"):
+ b = os.path.join(backup_dir, f"uns_{k[:-5]}_backup.pkl")
+ if not os.path.exists(b):
+ report["missing_backups"].append(("uns", k))
+ if expected_missing and verbose:
+ n = len(expected_missing)
+ if verbose:
+ print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
+ # add to report
+ report["missing_backups"].extend(expected_missing)
+
+ # final summary print
+ if verbose:
+ print("\n=== safe_read_h5ad summary ===")
+ if report["restored_obs_columns"]:
+ print("Restored obs columns:", report["restored_obs_columns"])
+ if report["restored_var_columns"]:
+ print("Restored var columns:", report["restored_var_columns"])
+ if report["restored_uns_keys"]:
+ print("Restored uns keys:", report["restored_uns_keys"])
+ if report["parsed_uns_json_keys"]:
+ print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
+ if report["restored_layers"]:
+ print("Restored layers:", report["restored_layers"])
+ if report["restored_obsm"]:
+ print("Restored obsm:", report["restored_obsm"])
+ if report["recategorized_obs"] or report["recategorized_var"]:
+ print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
+ if report["missing_backups"]:
+ print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
+ if report["errors"]:
+ print("Errors encountered (see report['errors']):")
+ for e in report["errors"]:
+ print(" -", e)
+ print("=== end summary ===\n")
+
+ return adata, report
+
+
+ # def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./", verbose=True):
+ # """
+ # Saves an AnnData object safely by omitting problematic columns from .obs and .var.
+
+ # Parameters:
+ # adata (AnnData): The AnnData object to save.
+ # path (str): Output .h5ad file path.
+ # compression (str): Compression method for h5ad file.
+ # backup (bool): If True, saves problematic columns to CSV files.
+ # backup_dir (str): Directory to store backups if backup=True.
+ # """
+ # import anndata as ad
+ # import pandas as pd
+ # import os
+ # import numpy as np
+ # import json
+
+ # os.makedirs(backup_dir, exist_ok=True)
+
+ # def filter_df(df, df_name):
+ # bad_cols = []
+ # for col in df.columns:
+ # if df[col].dtype == 'object':
+ # if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
+ # bad_cols.append(col)
+ # elif pd.api.types.is_categorical_dtype(df[col]):
+ # if not all(isinstance(x, (str, type(None))) for x in df[col].cat.categories):
+ # bad_cols.append(col)
+ # if bad_cols and verbose:
+ # print(f"Skipping columns from {df_name}: {bad_cols}")
+ # if backup and bad_cols:
+ # df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
+ # if verbose:
+ # print(f"Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
+ # return df.drop(columns=bad_cols)
+
+ # def is_serializable(val):
+ # try:
+ # json.dumps(val)
+ # return True
+ # except (TypeError, OverflowError):
+ # return False
+
+ # def clean_uns(uns_dict):
+ # clean_uns = {}
+ # bad_keys = []
+ # for k, v in uns_dict.items():
+ # if isinstance(v, (str, int, float, type(None), list, np.ndarray, pd.DataFrame, dict)):
+ # clean_uns[k] = v
+ # elif is_serializable(v):
+ # clean_uns[k] = v
+ # else:
+ # bad_keys.append(k)
+ # if backup:
+ # try:
+ # with open(os.path.join(backup_dir, f"uns_{k}_backup.txt"), "w") as f:
+ # f.write(str(v))
+ # except Exception:
+ # pass
+ # if bad_keys and verbose:
+ # print(f"Skipping entries from .uns: {bad_keys}")
+ # return clean_uns
+
+ # # Clean obs and var and uns
+ # obs_clean = filter_df(adata.obs, "obs")
+ # var_clean = filter_df(adata.var, "var")
+ # uns_clean = clean_uns(adata.uns)
+
+ # # Save clean version
+ # adata_copy = ad.AnnData(
+ # X=adata.X,
+ # obs=obs_clean,
+ # var=var_clean,
+ # layers=adata.layers,
+ # uns=uns_clean,
+ # obsm=adata.obsm,
+ # varm=adata.varm
+ # )
+
+ # adata_copy.obs_names = adata_copy.obs_names.astype(str)
+ # adata_copy.var_names = adata_copy.var_names.astype(str)
+
+ # adata_copy.write_h5ad(path, compression=compression)
+
+ # print(f"Saved safely to {path}")
+
+ def merge_barcoded_anndatas_core(adata_single, adata_double):
  import numpy as np
  import anndata as ad

@@ -194,5 +980,25 @@ def merge_barcoded_anndatas(adata_single, adata_double):
  adata_merged.uns = {**adata_single.uns, **adata_double.uns}

  return adata_merged
-
- ######################################################################################################
+ ######################################################################################################
+
+ ### File conversion misc ###
+ import argparse
+ from Bio import SeqIO
+ def genbank_to_gff(genbank_file, output_file, record_id):
+ with open(output_file, "w") as out:
+ for record in SeqIO.parse(genbank_file, "genbank"):
+ for feature in record.features:
+ # Skip features without location information
+ if feature.location is None:
+ continue
+ # Extract feature information
+ start = feature.location.start + 1 # Convert to 1-based index
+ end = feature.location.end
+ strand = "+" if feature.location.strand == 1 else "-"
+ feature_type = feature.type
+ # Format attributes
+ attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
+ # Write GFF3 line
+ gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
+ out.write(gff3_line + "\n")
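
Taken together, the new safe_write_h5ad and safe_read_h5ad form a round trip: entries that HDF5 cannot store are coerced or pickled into backup_dir on write, then restored (or JSON-parsed) on read. A minimal usage sketch based on the signatures in the diff above; the toy AnnData and its object-dtype obs column are illustrative only, and the smftools.readwrite import path is assumed from this file's location in the package:

    import numpy as np
    import anndata as ad
    from smftools.readwrite import safe_write_h5ad, safe_read_h5ad  # assumed import path

    # toy AnnData with an object-dtype obs column that a plain write_h5ad would reject
    adata = ad.AnnData(X=np.random.rand(4, 3))
    adata.obs["intervals"] = [[0, 10], [5, 20], [7, 9], [1, 2]]  # lists -> object dtype

    # problematic values are coerced and pickled; a report dict is returned and printed
    write_report = safe_write_h5ad(adata, "example.h5ad.gz", backup=True, backup_dir="./uns_backups")

    # reload; pickled backups found in backup_dir are restored where possible
    adata2, read_report = safe_read_h5ad("example.h5ad.gz", backup_dir="./uns_backups")

Backups follow the naming shown above (obs.<col>_backup.pkl, uns_<key>_backup.pkl, layers_<name>_backup.pkl, obsm_<name>_backup.pkl), which is exactly what safe_read_h5ad scans for when restoring.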