smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py CHANGED
@@ -1,15 +1,14 @@
 ## readwrite ##
 from __future__ import annotations
 
+import warnings
 from pathlib import Path
-from typing import Union, Iterable
-
-from pathlib import Path
-from typing import Iterable, Sequence, Optional
+from typing import Iterable, List, Sequence, Union
 
-import warnings
-import pandas as pd
 import anndata as ad
+import pandas as pd
+from Bio import SeqIO
+
 
 ######################################################################################################
 ## Datetime functionality
@@ -18,20 +17,26 @@ def date_string():
     Each time this is called, it returns the current date string
     """
     from datetime import datetime
+
     current_date = datetime.now()
     date_string = current_date.strftime("%Y%m%d")
     date_string = date_string[2:]
     return date_string
 
+
 def time_string():
     """
     Each time this is called, it returns the current time string
     """
     from datetime import datetime
+
     current_time = datetime.now()
     return current_time.strftime("%H:%M:%S")
+
+
 ######################################################################################################
 
+
 ######################################################################################################
 ## General file and directory handling
 def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
@@ -57,11 +62,12 @@ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None
         p = Path(d)
 
         # If someone passes in a file path, make its parent
-        if p.suffix: # p.suffix != "" means it's a file
+        if p.suffix:  # p.suffix != "" means it's a file
             p = p.parent
 
         p.mkdir(parents=True, exist_ok=True)
 
+
 def add_or_update_column_in_csv(
     csv_path: str | Path,
     column_name: str,
@@ -117,19 +123,20 @@ def add_or_update_column_in_csv(
     # Sequence case: lengths must match
     if len(values) != len(df):
         raise ValueError(
-            f"Length mismatch: CSV has {len(df)} rows "
-            f"but values has {len(values)} entries."
+            f"Length mismatch: CSV has {len(df)} rows but values has {len(values)} entries."
         )
 
     df[column_name] = list(values)
     df.to_csv(csv_path, index=index)
     return df
 
+
 ######################################################################################################
 
 ######################################################################################################
 ## Numpy, Pandas, Anndata functionality
 
+
 def adata_to_df(adata, layer=None):
@@ -142,8 +149,6 @@ def adata_to_df(adata, layer=None):
         pd.DataFrame: A DataFrame where rows are observations and columns are positions.
     """
     import pandas as pd
-    import anndata as ad
-    import numpy as np
 
     # Validate that the requested layer exists
     if layer and layer not in adata.layers:
@@ -153,28 +158,83 @@ def adata_to_df(adata, layer=None):
     data_matrix = adata.layers.get(layer, adata.X)
 
     # Ensure matrix is dense (handle sparse formats)
-    if hasattr(data_matrix, "toarray"):
+    if hasattr(data_matrix, "toarray"):
         data_matrix = data_matrix.toarray()
 
     # Ensure obs and var have unique indices
     if adata.obs.index.duplicated().any():
-        raise ValueError("Duplicate values found in `adata.obs.index`. Ensure unique observation indices.")
-
+        raise ValueError(
+            "Duplicate values found in `adata.obs.index`. Ensure unique observation indices."
+        )
+
     if adata.var.index.duplicated().any():
-        raise ValueError("Duplicate values found in `adata.var.index`. Ensure unique variable indices.")
+        raise ValueError(
+            "Duplicate values found in `adata.var.index`. Ensure unique variable indices."
+        )
 
     # Convert to DataFrame
     df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
 
     return df
 
+
 def save_matrix(matrix, save_name):
     """
     Input: A numpy matrix and a save_name
     Output: A txt file representation of the data matrix
     """
     import numpy as np
-    np.savetxt(f'{save_name}.txt', matrix)
+
+    np.savetxt(f"{save_name}.txt", matrix)
+
+
+def _harmonize_var_schema(adatas: List[ad.AnnData]) -> None:
+    """
+    In-place:
+      - Make every AnnData.var have the *union* of columns.
+      - Normalize dtypes so columns can hold NaN and round-trip via HDF5:
+          * ints -> float64 (to support NaN)
+          * objects -> try numeric->float64, else pandas 'string'
+    """
+    import numpy as np
+
+    # 1) Union of all .var columns
+    all_cols = set()
+    for a in adatas:
+        all_cols.update(a.var.columns)
+    all_cols = list(all_cols)
+
+    # 2) Add any missing columns as float64 NaN
+    for a in adatas:
+        missing = [c for c in all_cols if c not in a.var.columns]
+        for c in missing:
+            a.var[c] = np.nan  # becomes float64 by default
+
+    # 3) Normalize dtypes per AnnData so concat doesn't create mixed/object columns
+    for a in adatas:
+        for c in a.var.columns:
+            s = a.var[c]
+            dt = s.dtype
+
+            # Integer/unsigned -> float64 (so NaN fits)
+            if dt.kind in ("i", "u"):
+                a.var[c] = s.astype("float64")
+                continue
+
+            # Object -> numeric if possible; else pandas 'string'
+            if dt == "O":
+                try:
+                    s_num = pd.to_numeric(s, errors="raise")
+                    a.var[c] = s_num.astype("float64")
+                except Exception:
+                    a.var[c] = s.astype("string")
+
+    # Optional: ensure consistent column order (sorted + stable)
+    # Not required, but can make diffs easier to read:
+    all_cols_sorted = sorted(all_cols)
+    for a in adatas:
+        a.var = a.var.reindex(columns=all_cols_sorted)
+
 
 def concatenate_h5ads(
     output_path: str | Path,
@@ -243,8 +303,7 @@ def concatenate_h5ads(
         # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
         suffixes_lower = tuple(s.lower() for s in file_suffixes)
         h5_paths = sorted(
-            p for p in input_dir.iterdir()
-            if p.is_file() and p.suffix.lower() in suffixes_lower
+            p for p in input_dir.iterdir() if p.is_file() and p.suffix.lower() in suffixes_lower
        )
 
     else:
@@ -255,9 +314,7 @@ def concatenate_h5ads(
 
         df = pd.read_csv(csv_path, dtype=str)
         if csv_column not in df.columns:
-            raise ValueError(
-                f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
-            )
+            raise ValueError(f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths.")
         paths = df[csv_column].dropna().astype(str).tolist()
         if not paths:
             raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
@@ -280,27 +337,41 @@ def concatenate_h5ads(
     for p in h5_paths:
         print(f" - {p}")
 
-    final_adata: Optional[ad.AnnData] = None
-
+    # Load all first so we can harmonize schemas before concat
+    loaded: List[ad.AnnData] = []
     for p in h5_paths:
         print(f"{time_string()}: Reading {p}")
-        temp_adata, read_report = safe_read_h5ad(p, restore_backups=restore_backups)
-
-        if final_adata is None:
-            print(f"{time_string()}: Initializing final AnnData with {p}")
-            final_adata = temp_adata
-        else:
-            print(f"{time_string()}: Concatenating {p} into final AnnData")
-            final_adata = ad.concat(
-                [final_adata, temp_adata],
-                join="outer",
-                merge='unique',
-                uns_merge='unique',
-                index_unique=None,
-            )
+        a, _ = safe_read_h5ad(p, restore_backups=restore_backups)
+        loaded.append(a)
+
+    # Critical: make every .var share the same columns + safe dtypes
+    _harmonize_var_schema(loaded)
+
+    print(f"{time_string()}: Concatenating {len(loaded)} AnnData objects")
+    final_adata = ad.concat(
+        loaded,
+        axis=0,  # stack observations
+        join="outer",  # keep union of variables
+        merge="unique",
+        uns_merge="unique",
+        index_unique=None,
+    )
+
+    # Defensive pass: ensure final var dtypes are write-safe
+    for c in final_adata.var.columns:
+        s = final_adata.var[c]
+        dt = s.dtype
+        if dt.kind in ("i", "u"):
+            final_adata.var[c] = s.astype("float64")
+        elif dt == "O":
+            try:
+                s_num = pd.to_numeric(s, errors="raise")
+                final_adata.var[c] = s_num.astype("float64")
+            except Exception:
+                final_adata.var[c] = s.astype("string")
 
-    if final_adata is None:
-        raise RuntimeError("Unexpected: no AnnData objects loaded.")
+    # Let anndata write pandas StringArray reliably
+    ad.settings.allow_write_nullable_strings = True
 
     print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
     safe_write_h5ad(final_adata, output_path, backup=restore_backups)
@@ -325,18 +396,21 @@ def concatenate_h5ads(
 
     return output_path
 
+
 def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
     """
     Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.
 
     Returns a report dict and prints a summary of what was converted/backed up/skipped.
     """
-    import os, json, pickle
+    import json
+    import os
+    import pickle
     from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import warnings
-    import anndata as _ad
 
     path = Path(path)
 
@@ -413,7 +487,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
             df[col] = ser.astype(str)
             if verbose:
-                print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
+                print(
+                    f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
+                )
             continue
 
         # object dtype handling: try to coerce each element to string
@@ -434,7 +510,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
             df[col] = ser.values.astype(str)
             if verbose:
-                print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
+                print(
+                    f" converted object column '{which}.{col}' -> strings (backup={backup})"
+                )
             if which == "obs":
                 report["obs_converted_columns"].append(col)
             else:
@@ -457,7 +535,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
             df[col] = [json.dumps(v, default=str) for v in ser.values]
             if verbose:
-                print(f" json-stringified object column '{which}.{col}' (backup={backup})")
+                print(
+                    f" json-stringified object column '{which}.{col}' (backup={backup})"
+                )
             if which == "obs":
                 report["obs_converted_columns"].append(col)
             else:
@@ -472,7 +552,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 report["var_backed_up_columns"].append(col)
             df[col] = ser.astype(str)
             if verbose:
-                print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
+                print(
+                    f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
+                )
             if which == "obs":
                 report["obs_converted_columns"].append(col)
             else:
@@ -499,7 +581,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     _backup(v, f"uns_{k}_backup")
                     backed_up.append(k)
                 if verbose:
-                    print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
+                    print(
+                        f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
+                    )
                 report["uns_json_keys"].append(k)
         except Exception:
             try:
@@ -534,7 +618,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 arr_f = arr.astype(float)
                 cleaned[k] = arr_f
                 report_key = f"{which}.{k}"
-                report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                report["layers_converted"].append(
+                    report_key
+                ) if which == "layers" else report["obsm_converted"].append(report_key)
                 if verbose:
                     print(f" {which}.{k} object array coerced to float.")
             except Exception:
@@ -542,7 +628,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     arr_i = arr.astype(int)
                     cleaned[k] = arr_i
                     report_key = f"{which}.{k}"
-                    report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
+                    report["layers_converted"].append(
+                        report_key
+                    ) if which == "layers" else report["obsm_converted"].append(report_key)
                     if verbose:
                         print(f" {which}.{k} object array coerced to int.")
                 except Exception:
@@ -553,7 +641,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     else:
                         report["obsm_skipped"].append(k)
                     if verbose:
-                        print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
+                        print(
+                            f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
+                        )
                     continue
             else:
                 cleaned[k] = arr
@@ -638,7 +728,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 X_to_use = np.zeros_like(X_arr, dtype=float)
                 report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
                 if verbose:
-                    print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
+                    print(
+                        "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
+                    )
     except Exception as e:
         msg = f"Error handling adata.X: {e}"
         report["errors"].append(msg)
@@ -731,7 +823,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             print(f"CSV outputs will be written to: {csv_dir}")
     except Exception as e:
         msg = f"Failed to create CSV output directory: {e}"
-        report['errors'].append(msg)
+        report["errors"].append(msg)
        if verbose:
            print(msg)
        csv_dir = path.parent  # fallback just in case
@@ -742,48 +834,58 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
     # obs columns
     for col in adata_copy.obs.columns:
-        meta_rows.append({
-            "kind": "obs",
-            "name": col,
-            "dtype": str(adata_copy.obs[col].dtype),
-        })
+        meta_rows.append(
+            {
+                "kind": "obs",
+                "name": col,
+                "dtype": str(adata_copy.obs[col].dtype),
+            }
+        )
 
     # var columns
     for col in adata_copy.var.columns:
-        meta_rows.append({
-            "kind": "var",
-            "name": col,
-            "dtype": str(adata_copy.var[col].dtype),
-        })
+        meta_rows.append(
+            {
+                "kind": "var",
+                "name": col,
+                "dtype": str(adata_copy.var[col].dtype),
+            }
+        )
 
     # layers
     for k, v in adata_copy.layers.items():
-        meta_rows.append({
-            "kind": "layer",
-            "name": k,
-            "dtype": str(np.asarray(v).dtype),
-        })
+        meta_rows.append(
+            {
+                "kind": "layer",
+                "name": k,
+                "dtype": str(np.asarray(v).dtype),
+            }
+        )
 
     # obsm
     for k, v in adata_copy.obsm.items():
-        meta_rows.append({
-            "kind": "obsm",
-            "name": k,
-            "dtype": str(np.asarray(v).dtype),
-        })
+        meta_rows.append(
+            {
+                "kind": "obsm",
+                "name": k,
+                "dtype": str(np.asarray(v).dtype),
+            }
+        )
 
     # uns
     for k, v in adata_copy.uns.items():
-        meta_rows.append({
-            "kind": "uns",
-            "name": k,
-            "dtype": type(v).__name__,
-        })
+        meta_rows.append(
+            {
+                "kind": "uns",
+                "name": k,
+                "dtype": type(v).__name__,
+            }
+        )
 
     meta_df = pd.DataFrame(meta_rows)
 
     # same base name, inside csvs/
-    base = path.stem # removes .h5ad
+    base = path.stem  # removes .h5ad
     meta_path = csv_dir / f"{base}.keys.csv"
 
     meta_df.to_csv(meta_path, index=False)
@@ -818,7 +920,15 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
     return report
 
-def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
+
+def safe_read_h5ad(
+    path,
+    backup_dir=None,
+    restore_backups=True,
+    re_categorize=True,
+    categorical_threshold=100,
+    verbose=True,
+):
     """
     Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
     from the backup_dir produced during save.
@@ -846,13 +956,14 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
     report : dict
         A report describing restored items, parsed JSON keys, and any failures.
     """
-    import os
-    from pathlib import Path
     import json
+    import os
     import pickle
+    from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import anndata as _ad
 
     path = Path(path)
 
@@ -931,7 +1042,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                     if hasattr(val, "shape") and (len(val) == adata.shape[0]):
                         adata.obs[col] = pd.Series(val, index=adata.obs.index)
                     else:
-                        adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
+                        adata.obs[col] = pd.Series(
+                            [val] * adata.shape[0], index=adata.obs.index
+                        )
                     report["restored_obs_columns"].append((col, bname1))
                     restored = True
                     if verbose:
@@ -946,7 +1059,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
             looks_like_json = False
             for sv in sample_vals:
                 svs = sv.strip()
-                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                if (svs.startswith("{") and svs.endswith("}")) or (
+                    svs.startswith("[") and svs.endswith("]")
+                ):
                     looks_like_json = True
                     break
             if looks_like_json:
@@ -964,7 +1079,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                     report["restored_obs_columns"].append((col, "parsed_json"))
                     restored = True
                     if verbose:
-                        print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
+                        print(
+                            f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
+                        )
 
         # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
         if (not restored) and re_categorize and adata.obs[col].dtype == object:
@@ -975,7 +1092,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                     adata.obs[col] = adata.obs[col].astype(str).astype("category")
                     report["recategorized_obs"].append(col)
                     if verbose:
-                        print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
+                        print(
+                            f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
+                        )
             except Exception as e:
                 report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
 
@@ -1007,7 +1126,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                    if hasattr(val, "shape") and (len(val) == adata.shape[1]):
                        adata.var[col] = pd.Series(val, index=adata.var.index)
                    else:
-                        adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
+                        adata.var[col] = pd.Series(
+                            [val] * adata.shape[1], index=adata.var.index
+                        )
                    report["restored_var_columns"].append((col, bname1))
                    restored = True
                    if verbose:
@@ -1021,7 +1142,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
            looks_like_json = False
            for sv in sample_vals:
                svs = sv.strip()
-                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
+                if (svs.startswith("{") and svs.endswith("}")) or (
+                    svs.startswith("[") and svs.endswith("]")
+                ):
                    looks_like_json = True
                    break
            if looks_like_json:
@@ -1037,7 +1160,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                    adata.var[col] = pd.Series(parsed, index=adata.var.index)
                    report["restored_var_columns"].append((col, "parsed_json"))
                    if verbose:
-                        print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
+                        print(
+                            f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
+                        )
 
        if (not restored) and re_categorize and adata.var[col].dtype == object:
            try:
@@ -1046,7 +1171,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                    adata.var[col] = adata.var[col].astype(str).astype("category")
                    report["recategorized_var"].append(col)
                    if verbose:
-                        print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
+                        print(
+                            f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
+                        )
            except Exception as e:
                report["errors"].append(f"Failed to recategorize var.{col}: {e}")
 
@@ -1078,7 +1205,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
        if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
            continue
        # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
-        key = fname[len("uns_"):-len("_backup.pkl")]
+        key = fname[len("uns_") : -len("_backup.pkl")]
        full = os.path.join(backup_dir, fname)
        val = _load_pickle_if_exists(full)
        if val is not None:
@@ -1092,7 +1219,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
    if os.path.isdir(backup_dir):
        for fname in os.listdir(backup_dir):
            if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
-                layer_name = fname[len("layers_"):-len("_backup.pkl")]
+                layer_name = fname[len("layers_") : -len("_backup.pkl")]
                full = os.path.join(backup_dir, fname)
                val = _load_pickle_if_exists(full)
                if val is not None:
@@ -1102,10 +1229,12 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                        if verbose:
                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
                    except Exception as e:
-                        report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
+                        report["errors"].append(
+                            f"Failed to restore layers['{layer_name}'] from {full}: {e}"
+                        )
 
            if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
-                obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
+                obsm_name = fname[len("obsm_") : -len("_backup.pkl")]
                full = os.path.join(backup_dir, fname)
                val = _load_pickle_if_exists(full)
                if val is not None:
@@ -1115,7 +1244,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                        if verbose:
                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
                    except Exception as e:
-                        report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
+                        report["errors"].append(
+                            f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
+                        )
 
    # 6) If restore_backups True but some expected backups missing, note them
    if restore_backups and os.path.isdir(backup_dir):
@@ -1145,7 +1276,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
    if expected_missing and verbose:
        n = len(expected_missing)
        if verbose:
-            print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
+            print(
+                f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
+            )
        # add to report
        report["missing_backups"].extend(expected_missing)
 
@@ -1165,9 +1298,16 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
        if report["restored_obsm"]:
            print("Restored obsm:", report["restored_obsm"])
        if report["recategorized_obs"] or report["recategorized_var"]:
-            print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
+            print(
+                "Recategorized columns (obs/var):",
+                report["recategorized_obs"],
+                report["recategorized_var"],
+            )
        if report["missing_backups"]:
-            print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
+            print(
+                "Missing backups or object columns without backups (investigate):",
+                report["missing_backups"],
+            )
        if report["errors"]:
            print("Errors encountered (see report['errors']):")
            for e in report["errors"]:
@@ -1176,9 +1316,10 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
 
    return adata, report
 
+
 def merge_barcoded_anndatas_core(adata_single, adata_double):
-    import numpy as np
    import anndata as ad
+    import numpy as np
 
    # Step 1: Identify overlap
    overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)
@@ -1187,24 +1328,25 @@ def merge_barcoded_anndatas_core(adata_single, adata_double):
    adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()
 
    # Step 3: Add source tag
-    adata_single_filtered.obs['source'] = 'single_barcode'
-    adata_double.obs['source'] = 'double_barcode'
+    adata_single_filtered.obs["source"] = "single_barcode"
+    adata_double.obs["source"] = "double_barcode"
 
    # Step 4: Concatenate all components
-    adata_merged = ad.concat([
-        adata_single_filtered,
-        adata_double
-    ], join='outer', merge='same') # merge='same' preserves matching layers, obsm, etc.
+    adata_merged = ad.concat(
+        [adata_single_filtered, adata_double], join="outer", merge="same"
+    )  # merge='same' preserves matching layers, obsm, etc.
 
    # Step 5: Merge `.uns`
    adata_merged.uns = {**adata_single.uns, **adata_double.uns}
 
    return adata_merged
+
+
 ######################################################################################################
 
 ### File conversion misc ###
-import argparse
-from Bio import SeqIO
+
+
 def genbank_to_gff(genbank_file, output_file, record_id):
    with open(output_file, "w") as out:
        for record in SeqIO.parse(genbank_file, "genbank"):
@@ -1220,5 +1362,18 @@ def genbank_to_gff(genbank_file, output_file, record_id):
                # Format attributes
                attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
                # Write GFF3 line
-                gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
-                out.write(gff3_line + "\n")
+                gff3_line = "\t".join(
+                    str(x)
+                    for x in [
+                        record_id,
+                        feature.type,
+                        feature_type,
+                        start,
+                        end,
+                        ".",
+                        strand,
+                        ".",
+                        attributes,
+                    ]
+                )
+                out.write(gff3_line + "\n")
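Example (not part of the published diff): a minimal usage sketch of the reworked 0.3.0 concatenation path shown above. It relies only on what this diff exposes — concatenate_h5ads takes output_path and returns it, and safe_read_h5ad(path, ..., restore_backups=...) returns an (AnnData, report) tuple with a report["errors"] list. The input_dir keyword, the file paths, and the directory layout are illustrative assumptions, not documented API.

    # Hedged sketch based on the readwrite.py diff above; paths and the
    # `input_dir` keyword are assumptions for illustration only.
    from pathlib import Path

    from smftools.readwrite import concatenate_h5ads, safe_read_h5ad

    # Gather per-sample .h5ad files, harmonize their .var schemas, and write one file.
    merged_path = concatenate_h5ads(
        output_path=Path("merged.h5ad"),
        input_dir=Path("per_sample_h5ads"),
    )

    # Round-trip the result; safe_read_h5ad returns the AnnData plus a report dict.
    adata, report = safe_read_h5ad(merged_path, restore_backups=True, verbose=True)
    if report["errors"]:
        print("issues during load:", report["errors"])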