smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py CHANGED
@@ -1,15 +1,14 @@
  ## readwrite ##
  from __future__ import annotations

+ import warnings
  from pathlib import Path
- from typing import Union, Iterable
-
- from pathlib import Path
- from typing import Iterable, Sequence, Optional
+ from typing import Iterable, List, Sequence, Union

- import warnings
- import pandas as pd
  import anndata as ad
+ import pandas as pd
+ from Bio import SeqIO
+

  ######################################################################################################
  ## Datetime functionality
@@ -18,20 +17,26 @@ def date_string():
  Each time this is called, it returns the current date string
  """
  from datetime import datetime
+
  current_date = datetime.now()
  date_string = current_date.strftime("%Y%m%d")
  date_string = date_string[2:]
  return date_string

+
  def time_string():
  """
  Each time this is called, it returns the current time string
  """
  from datetime import datetime
+
  current_time = datetime.now()
  return current_time.strftime("%H:%M:%S")
+
+
  ######################################################################################################

+
  ######################################################################################################
  ## General file and directory handling
  def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
@@ -57,11 +62,12 @@ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None
  p = Path(d)

  # If someone passes in a file path, make its parent
- if p.suffix: # p.suffix != "" means it's a file
+ if p.suffix:  # p.suffix != "" means it's a file
  p = p.parent

  p.mkdir(parents=True, exist_ok=True)

+
  def add_or_update_column_in_csv(
  csv_path: str | Path,
  column_name: str,
@@ -117,19 +123,20 @@ def add_or_update_column_in_csv(
  # Sequence case: lengths must match
  if len(values) != len(df):
  raise ValueError(
- f"Length mismatch: CSV has {len(df)} rows "
- f"but values has {len(values)} entries."
+ f"Length mismatch: CSV has {len(df)} rows but values has {len(values)} entries."
  )

  df[column_name] = list(values)
  df.to_csv(csv_path, index=index)
  return df

+
  ######################################################################################################

  ######################################################################################################
  ## Numpy, Pandas, Anndata functionality

+
  def adata_to_df(adata, layer=None):
  """
  Convert an AnnData object into a Pandas DataFrame.
@@ -142,8 +149,6 @@ def adata_to_df(adata, layer=None):
  pd.DataFrame: A DataFrame where rows are observations and columns are positions.
  """
  import pandas as pd
- import anndata as ad
- import numpy as np

  # Validate that the requested layer exists
  if layer and layer not in adata.layers:
@@ -153,28 +158,83 @@ def adata_to_df(adata, layer=None):
  data_matrix = adata.layers.get(layer, adata.X)

  # Ensure matrix is dense (handle sparse formats)
- if hasattr(data_matrix, "toarray"):
+ if hasattr(data_matrix, "toarray"):
  data_matrix = data_matrix.toarray()

  # Ensure obs and var have unique indices
  if adata.obs.index.duplicated().any():
- raise ValueError("Duplicate values found in `adata.obs.index`. Ensure unique observation indices.")
-
+ raise ValueError(
+ "Duplicate values found in `adata.obs.index`. Ensure unique observation indices."
+ )
+
  if adata.var.index.duplicated().any():
- raise ValueError("Duplicate values found in `adata.var.index`. Ensure unique variable indices.")
+ raise ValueError(
+ "Duplicate values found in `adata.var.index`. Ensure unique variable indices."
+ )

  # Convert to DataFrame
  df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)

  return df

+
  def save_matrix(matrix, save_name):
  """
  Input: A numpy matrix and a save_name
  Output: A txt file representation of the data matrix
  """
  import numpy as np
- np.savetxt(f'{save_name}.txt', matrix)
+
+ np.savetxt(f"{save_name}.txt", matrix)
+
+
+ def _harmonize_var_schema(adatas: List[ad.AnnData]) -> None:
+ """
+ In-place:
+ - Make every AnnData.var have the *union* of columns.
+ - Normalize dtypes so columns can hold NaN and round-trip via HDF5:
+ * ints -> float64 (to support NaN)
+ * objects -> try numeric->float64, else pandas 'string'
+ """
+ import numpy as np
+
+ # 1) Union of all .var columns
+ all_cols = set()
+ for a in adatas:
+ all_cols.update(a.var.columns)
+ all_cols = list(all_cols)
+
+ # 2) Add any missing columns as float64 NaN
+ for a in adatas:
+ missing = [c for c in all_cols if c not in a.var.columns]
+ for c in missing:
+ a.var[c] = np.nan # becomes float64 by default
+
+ # 3) Normalize dtypes per AnnData so concat doesn't create mixed/object columns
+ for a in adatas:
+ for c in a.var.columns:
+ s = a.var[c]
+ dt = s.dtype
+
+ # Integer/unsigned -> float64 (so NaN fits)
+ if dt.kind in ("i", "u"):
+ a.var[c] = s.astype("float64")
+ continue
+
+ # Object -> numeric if possible; else pandas 'string'
+ if dt == "O":
+ try:
+ s_num = pd.to_numeric(s, errors="raise")
+ a.var[c] = s_num.astype("float64")
+ except Exception:
+ a.var[c] = s.astype("string")
+
+ # Optional: ensure consistent column order (sorted + stable)
+ # Not required, but can make diffs easier to read:
+ all_cols_sorted = sorted(all_cols)
+ for a in adatas:
+ a.var = a.var.reindex(columns=all_cols_sorted)
+

  def concatenate_h5ads(
  output_path: str | Path,
@@ -243,8 +303,7 @@ def concatenate_h5ads(
  # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
  suffixes_lower = tuple(s.lower() for s in file_suffixes)
  h5_paths = sorted(
- p for p in input_dir.iterdir()
- if p.is_file() and p.suffix.lower() in suffixes_lower
+ p for p in input_dir.iterdir() if p.is_file() and p.suffix.lower() in suffixes_lower
  )

  else:
@@ -255,9 +314,7 @@ def concatenate_h5ads(

  df = pd.read_csv(csv_path, dtype=str)
  if csv_column not in df.columns:
- raise ValueError(
- f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
- )
+ raise ValueError(f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths.")
  paths = df[csv_column].dropna().astype(str).tolist()
  if not paths:
  raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
@@ -280,27 +337,41 @@ def concatenate_h5ads(
  for p in h5_paths:
  print(f" - {p}")

- final_adata: Optional[ad.AnnData] = None
-
+ # Load all first so we can harmonize schemas before concat
+ loaded: List[ad.AnnData] = []
  for p in h5_paths:
  print(f"{time_string()}: Reading {p}")
- temp_adata, read_report = safe_read_h5ad(p, restore_backups=restore_backups)
-
- if final_adata is None:
- print(f"{time_string()}: Initializing final AnnData with {p}")
- final_adata = temp_adata
- else:
- print(f"{time_string()}: Concatenating {p} into final AnnData")
- final_adata = ad.concat(
- [final_adata, temp_adata],
- join="outer",
- merge='unique',
- uns_merge='unique',
- index_unique=None,
- )
+ a, _ = safe_read_h5ad(p, restore_backups=restore_backups)
+ loaded.append(a)
+
+ # Critical: make every .var share the same columns + safe dtypes
+ _harmonize_var_schema(loaded)
+
+ print(f"{time_string()}: Concatenating {len(loaded)} AnnData objects")
+ final_adata = ad.concat(
+ loaded,
+ axis=0, # stack observations
+ join="outer", # keep union of variables
+ merge="unique",
+ uns_merge="unique",
+ index_unique=None,
+ )
+
+ # Defensive pass: ensure final var dtypes are write-safe
+ for c in final_adata.var.columns:
+ s = final_adata.var[c]
+ dt = s.dtype
+ if dt.kind in ("i", "u"):
+ final_adata.var[c] = s.astype("float64")
+ elif dt == "O":
+ try:
+ s_num = pd.to_numeric(s, errors="raise")
+ final_adata.var[c] = s_num.astype("float64")
+ except Exception:
+ final_adata.var[c] = s.astype("string")

- if final_adata is None:
- raise RuntimeError("Unexpected: no AnnData objects loaded.")
+ # Let anndata write pandas StringArray reliably
+ ad.settings.allow_write_nullable_strings = True

  print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
  safe_write_h5ad(final_adata, output_path, backup=restore_backups)
@@ -325,18 +396,21 @@ def concatenate_h5ads(

  return output_path

+
  def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
  """
  Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.

  Returns a report dict and prints a summary of what was converted/backed up/skipped.
  """
- import os, json, pickle
+ import json
+ import os
+ import pickle
  from pathlib import Path
+
+ import anndata as _ad
  import numpy as np
  import pandas as pd
- import warnings
- import anndata as _ad

  path = Path(path)

@@ -413,7 +487,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  report["var_backed_up_columns"].append(col)
  df[col] = ser.astype(str)
  if verbose:
- print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
+ print(
+ f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
+ )
  continue

  # object dtype handling: try to coerce each element to string
@@ -434,7 +510,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  report["var_backed_up_columns"].append(col)
  df[col] = ser.values.astype(str)
  if verbose:
- print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
+ print(
+ f" converted object column '{which}.{col}' -> strings (backup={backup})"
+ )
  if which == "obs":
  report["obs_converted_columns"].append(col)
  else:
@@ -457,7 +535,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  report["var_backed_up_columns"].append(col)
  df[col] = [json.dumps(v, default=str) for v in ser.values]
  if verbose:
- print(f" json-stringified object column '{which}.{col}' (backup={backup})")
+ print(
+ f" json-stringified object column '{which}.{col}' (backup={backup})"
+ )
  if which == "obs":
  report["obs_converted_columns"].append(col)
  else:
@@ -472,7 +552,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  report["var_backed_up_columns"].append(col)
  df[col] = ser.astype(str)
  if verbose:
- print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
+ print(
+ f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
+ )
  if which == "obs":
  report["obs_converted_columns"].append(col)
  else:
@@ -499,7 +581,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  _backup(v, f"uns_{k}_backup")
  backed_up.append(k)
  if verbose:
- print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
+ print(
+ f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
+ )
  report["uns_json_keys"].append(k)
  except Exception:
  try:
@@ -534,7 +618,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  arr_f = arr.astype(float)
  cleaned[k] = arr_f
  report_key = f"{which}.{k}"
- report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+ report["layers_converted"].append(
+ report_key
+ ) if which == "layers" else report["obsm_converted"].append(report_key)
  if verbose:
  print(f" {which}.{k} object array coerced to float.")
  except Exception:
@@ -542,7 +628,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  arr_i = arr.astype(int)
  cleaned[k] = arr_i
  report_key = f"{which}.{k}"
- report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+ report["layers_converted"].append(
+ report_key
+ ) if which == "layers" else report["obsm_converted"].append(report_key)
  if verbose:
  print(f" {which}.{k} object array coerced to int.")
  except Exception:
@@ -553,7 +641,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  else:
  report["obsm_skipped"].append(k)
  if verbose:
- print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
+ print(
+ f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
+ )
  continue
  else:
  cleaned[k] = arr
@@ -638,7 +728,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  X_to_use = np.zeros_like(X_arr, dtype=float)
  report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
  if verbose:
- print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
+ print(
+ "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
+ )
  except Exception as e:
  msg = f"Error handling adata.X: {e}"
  report["errors"].append(msg)
@@ -722,9 +814,121 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
  print(" -", e)

  print("=== end report ===\n")
+
+ # ---------- create CSV output directory ----------
+ try:
+ csv_dir = path.parent / "csvs"
+ csv_dir.mkdir(exist_ok=True)
+ if verbose:
+ print(f"CSV outputs will be written to: {csv_dir}")
+ except Exception as e:
+ msg = f"Failed to create CSV output directory: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+ csv_dir = path.parent # fallback just in case
+
+ # ---------- write keys summary CSV ----------
+ try:
+ meta_rows = []
+
+ # obs columns
+ for col in adata_copy.obs.columns:
+ meta_rows.append(
+ {
+ "kind": "obs",
+ "name": col,
+ "dtype": str(adata_copy.obs[col].dtype),
+ }
+ )
+
+ # var columns
+ for col in adata_copy.var.columns:
+ meta_rows.append(
+ {
+ "kind": "var",
+ "name": col,
+ "dtype": str(adata_copy.var[col].dtype),
+ }
+ )
+
+ # layers
+ for k, v in adata_copy.layers.items():
+ meta_rows.append(
+ {
+ "kind": "layer",
+ "name": k,
+ "dtype": str(np.asarray(v).dtype),
+ }
+ )
+
+ # obsm
+ for k, v in adata_copy.obsm.items():
+ meta_rows.append(
+ {
+ "kind": "obsm",
+ "name": k,
+ "dtype": str(np.asarray(v).dtype),
+ }
+ )
+
+ # uns
+ for k, v in adata_copy.uns.items():
+ meta_rows.append(
+ {
+ "kind": "uns",
+ "name": k,
+ "dtype": type(v).__name__,
+ }
+ )
+
+ meta_df = pd.DataFrame(meta_rows)
+
+ # same base name, inside csvs/
+ base = path.stem # removes .h5ad
+ meta_path = csv_dir / f"{base}.keys.csv"
+
+ meta_df.to_csv(meta_path, index=False)
+ if verbose:
+ print(f"Wrote keys summary CSV to {meta_path}")
+
+ except Exception as e:
+ msg = f"Failed to write keys CSV: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+
+ # ---------- write full obs and var dataframes ----------
+ try:
+ base = path.stem
+
+ obs_path = csv_dir / f"{base}.obs.csv"
+ var_path = csv_dir / f"{base}.var.csv"
+
+ adata_copy.obs.to_csv(obs_path, index=True)
+ adata_copy.var.to_csv(var_path, index=True)
+
+ if verbose:
+ print(f"Wrote obs DataFrame to {obs_path}")
+ print(f"Wrote var DataFrame to {var_path}")
+
+ except Exception as e:
+ msg = f"Failed to write obs/var CSVs: {e}"
+ report["errors"].append(msg)
+ if verbose:
+ print(msg)
+
  return report

- def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
+
+ def safe_read_h5ad(
+ path,
+ backup_dir=None,
+ restore_backups=True,
+ re_categorize=True,
+ categorical_threshold=100,
+ verbose=True,
+ ):
  """
  Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
  from the backup_dir produced during save.
@@ -752,13 +956,14 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  report : dict
  A report describing restored items, parsed JSON keys, and any failures.
  """
- import os
- from pathlib import Path
  import json
+ import os
  import pickle
+ from pathlib import Path
+
+ import anndata as _ad
  import numpy as np
  import pandas as pd
- import anndata as _ad

  path = Path(path)

@@ -837,7 +1042,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if hasattr(val, "shape") and (len(val) == adata.shape[0]):
  adata.obs[col] = pd.Series(val, index=adata.obs.index)
  else:
- adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+ adata.obs[col] = pd.Series(
+ [val] * adata.shape[0], index=adata.obs.index
+ )
  report["restored_obs_columns"].append((col, bname1))
  restored = True
  if verbose:
@@ -852,7 +1059,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  looks_like_json = False
  for sv in sample_vals:
  svs = sv.strip()
- if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+ if (svs.startswith("{") and svs.endswith("}")) or (
+ svs.startswith("[") and svs.endswith("]")
+ ):
  looks_like_json = True
  break
  if looks_like_json:
@@ -870,7 +1079,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  report["restored_obs_columns"].append((col, "parsed_json"))
  restored = True
  if verbose:
- print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
+ print(
+ f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
+ )

  # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
  if (not restored) and re_categorize and adata.obs[col].dtype == object:
@@ -881,7 +1092,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  adata.obs[col] = adata.obs[col].astype(str).astype("category")
  report["recategorized_obs"].append(col)
  if verbose:
- print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
+ print(
+ f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
+ )
  except Exception as e:
  report["errors"].append(f"Failed to recategorize obs.{col}: {e}")

@@ -913,7 +1126,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if hasattr(val, "shape") and (len(val) == adata.shape[1]):
  adata.var[col] = pd.Series(val, index=adata.var.index)
  else:
- adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+ adata.var[col] = pd.Series(
+ [val] * adata.shape[1], index=adata.var.index
+ )
  report["restored_var_columns"].append((col, bname1))
  restored = True
  if verbose:
@@ -927,7 +1142,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  looks_like_json = False
  for sv in sample_vals:
  svs = sv.strip()
- if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+ if (svs.startswith("{") and svs.endswith("}")) or (
+ svs.startswith("[") and svs.endswith("]")
+ ):
  looks_like_json = True
  break
  if looks_like_json:
@@ -943,7 +1160,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  adata.var[col] = pd.Series(parsed, index=adata.var.index)
  report["restored_var_columns"].append((col, "parsed_json"))
  if verbose:
- print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
+ print(
+ f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
+ )

  if (not restored) and re_categorize and adata.var[col].dtype == object:
  try:
@@ -952,7 +1171,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  adata.var[col] = adata.var[col].astype(str).astype("category")
  report["recategorized_var"].append(col)
  if verbose:
- print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
+ print(
+ f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
+ )
  except Exception as e:
  report["errors"].append(f"Failed to recategorize var.{col}: {e}")

@@ -984,7 +1205,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
  continue
  # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
- key = fname[len("uns_"):-len("_backup.pkl")]
+ key = fname[len("uns_") : -len("_backup.pkl")]
  full = os.path.join(backup_dir, fname)
  val = _load_pickle_if_exists(full)
  if val is not None:
@@ -998,7 +1219,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if os.path.isdir(backup_dir):
  for fname in os.listdir(backup_dir):
  if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
- layer_name = fname[len("layers_"):-len("_backup.pkl")]
+ layer_name = fname[len("layers_") : -len("_backup.pkl")]
  full = os.path.join(backup_dir, fname)
  val = _load_pickle_if_exists(full)
  if val is not None:
@@ -1008,10 +1229,12 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if verbose:
  print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
  except Exception as e:
- report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
+ report["errors"].append(
+ f"Failed to restore layers['{layer_name}'] from {full}: {e}"
+ )

  if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
- obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
+ obsm_name = fname[len("obsm_") : -len("_backup.pkl")]
  full = os.path.join(backup_dir, fname)
  val = _load_pickle_if_exists(full)
  if val is not None:
@@ -1021,7 +1244,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if verbose:
  print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
  except Exception as e:
- report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
+ report["errors"].append(
+ f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
+ )

  # 6) If restore_backups True but some expected backups missing, note them
  if restore_backups and os.path.isdir(backup_dir):
@@ -1051,7 +1276,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if expected_missing and verbose:
  n = len(expected_missing)
  if verbose:
- print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
+ print(
+ f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
+ )
  # add to report
  report["missing_backups"].extend(expected_missing)

@@ -1071,9 +1298,16 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
  if report["restored_obsm"]:
  print("Restored obsm:", report["restored_obsm"])
  if report["recategorized_obs"] or report["recategorized_var"]:
- print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
+ print(
+ "Recategorized columns (obs/var):",
+ report["recategorized_obs"],
+ report["recategorized_var"],
+ )
  if report["missing_backups"]:
- print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
+ print(
+ "Missing backups or object columns without backups (investigate):",
+ report["missing_backups"],
+ )
  if report["errors"]:
  print("Errors encountered (see report['errors']):")
  for e in report["errors"]:
@@ -1082,9 +1316,10 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr

  return adata, report

+
  def merge_barcoded_anndatas_core(adata_single, adata_double):
- import numpy as np
  import anndata as ad
+ import numpy as np

  # Step 1: Identify overlap
  overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)
@@ -1093,24 +1328,25 @@ def merge_barcoded_anndatas_core(adata_single, adata_double):
  adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()

  # Step 3: Add source tag
- adata_single_filtered.obs['source'] = 'single_barcode'
- adata_double.obs['source'] = 'double_barcode'
+ adata_single_filtered.obs["source"] = "single_barcode"
+ adata_double.obs["source"] = "double_barcode"

  # Step 4: Concatenate all components
- adata_merged = ad.concat([
- adata_single_filtered,
- adata_double
- ], join='outer', merge='same') # merge='same' preserves matching layers, obsm, etc.
+ adata_merged = ad.concat(
+ [adata_single_filtered, adata_double], join="outer", merge="same"
+ ) # merge='same' preserves matching layers, obsm, etc.

  # Step 5: Merge `.uns`
  adata_merged.uns = {**adata_single.uns, **adata_double.uns}

  return adata_merged
+
+
  ######################################################################################################

  ### File conversion misc ###
- import argparse
- from Bio import SeqIO
+
+
  def genbank_to_gff(genbank_file, output_file, record_id):
  with open(output_file, "w") as out:
  for record in SeqIO.parse(genbank_file, "genbank"):
@@ -1126,5 +1362,18 @@ def genbank_to_gff(genbank_file, output_file, record_id):
  # Format attributes
  attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
  # Write GFF3 line
- gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
- out.write(gff3_line + "\n")
+ gff3_line = "\t".join(
+ str(x)
+ for x in [
+ record_id,
+ feature.type,
+ feature_type,
+ start,
+ end,
+ ".",
+ strand,
+ ".",
+ attributes,
+ ]
+ )
+ out.write(gff3_line + "\n")
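The largest functional change in readwrite.py above is that concatenate_h5ads now loads every .h5ad up front, runs _harmonize_var_schema to give all .var tables the same columns and NaN-safe dtypes, and only then calls ad.concat once. The sketch below is a minimal, hypothetical reproduction of that harmonization idea on two toy AnnData objects; the helper name harmonize_var, the toy data, and the column names gc_count and context are invented for illustration and are not part of the package.

import anndata as ad
import numpy as np
import pandas as pd


def harmonize_var(adatas):
    """Illustrative stand-in for smftools' _harmonize_var_schema: give every .var
    the union of columns, with NaN-safe, HDF5-friendly dtypes."""
    all_cols = sorted(set().union(*(a.var.columns for a in adatas)))
    for a in adatas:
        for c in all_cols:
            if c not in a.var.columns:
                a.var[c] = np.nan  # missing columns become float64 NaN
        for c in all_cols:
            s = a.var[c]
            if s.dtype.kind in ("i", "u"):  # ints -> float64 so NaN can be held
                a.var[c] = s.astype("float64")
            elif s.dtype == object:  # objects -> numeric if possible, else pandas string
                try:
                    a.var[c] = pd.to_numeric(s, errors="raise").astype("float64")
                except (ValueError, TypeError):
                    a.var[c] = s.astype("string")
        a.var = a.var.reindex(columns=all_cols)


# Two toy AnnData objects whose .var columns and dtypes disagree.
a1 = ad.AnnData(
    X=np.ones((2, 3)),
    obs=pd.DataFrame(index=["read1", "read2"]),
    var=pd.DataFrame({"gc_count": [1, 2, 3]}, index=["p1", "p2", "p3"]),
)
a2 = ad.AnnData(
    X=np.zeros((2, 3)),
    obs=pd.DataFrame(index=["read3", "read4"]),
    var=pd.DataFrame({"context": ["CpG", "GpC", "CpG"]}, index=["p1", "p2", "p3"]),
)

harmonize_var([a1, a2])
print(a1.var.dtypes)  # gc_count and context are both float64 here (context is all NaN)
print(a2.var.dtypes)  # gc_count is float64 NaN, context is a nullable string column

# Single outer concatenation along obs, mirroring the arguments the new
# concatenate_h5ads passes to ad.concat.
merged = ad.concat(
    [a1, a2], axis=0, join="outer", merge="unique", uns_merge="unique", index_unique=None
)
print(merged.shape)  # (4, 3)

The point of the dtype pass is that integer and object columns cannot hold NaN or round-trip through HDF5 cleanly once schemas diverge, so everything is coerced to float64 or a nullable string column before the single concat and subsequent safe_write_h5ad.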