smftools 0.1.7 (py3-none-any.whl) → 0.2.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py CHANGED
@@ -1,4 +1,15 @@
  ## readwrite ##
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Union, Iterable
+
+ from pathlib import Path
+ from typing import Iterable, Sequence, Optional
+
+ import warnings
+ import pandas as pd
+ import anndata as ad

  ######################################################################################################
  ## Datetime functionality
@@ -21,6 +32,101 @@ def time_string():
      return current_time.strftime("%H:%M:%S")
  ######################################################################################################

+ ######################################################################################################
+ ## General file and directory handling
+ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
+     """
+     Create one or multiple directories.
+
+     Parameters
+     ----------
+     directories : str | Path | list/iterable of str | Path
+         Paths of directories to create. If a file path is passed,
+         the parent directory is created.
+
+     Returns
+     -------
+     None
+     """
+
+     # allow user to pass a single string/Path
+     if isinstance(directories, (str, Path)):
+         directories = [directories]
+
+     for d in directories:
+         p = Path(d)
+
+         # If someone passes in a file path, make its parent
+         if p.suffix: # p.suffix != "" means it's a file
+             p = p.parent
+
+         p.mkdir(parents=True, exist_ok=True)
+
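For orientation, a minimal usage sketch of the new make_dirs helper (the paths below are hypothetical examples, not taken from the package):

    from smftools.readwrite import make_dirs

    make_dirs("results/plots")                      # single directory
    make_dirs(["results/tables", "results/logs"])   # several at once
    make_dirs("results/summary.csv")                # file path: creates results/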
+ def add_or_update_column_in_csv(
+     csv_path: str | Path,
+     column_name: str,
+     values,
+     index: bool = False,
+ ):
+     """
+     Add (or overwrite) a column in a CSV file.
+     If the CSV does not exist, create it containing only that column.
+
+     Parameters
+     ----------
+     csv_path : str | Path
+         Path to the CSV file.
+     column_name : str
+         Name of the column to add or update.
+     values : list | scalar | callable
+         - If list/Series: must match the number of rows.
+         - If scalar: broadcast to all rows (or single-row CSV if new file).
+         - If callable(df): function should return the column values based on df.
+     index : bool
+         Whether to write the pandas index into the CSV. Default False.
+
+     Returns
+     -------
+     pd.DataFrame : the updated DataFrame.
+     """
+     csv_path = Path(csv_path)
+     csv_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Case 1 — CSV does not exist → create it
+     if not csv_path.exists():
+         if hasattr(values, "__len__") and not isinstance(values, str):
+             df = pd.DataFrame({column_name: list(values)})
+         else:
+             df = pd.DataFrame({column_name: [values]})
+         df.to_csv(csv_path, index=index)
+         return df
+
+     # Case 2 — CSV exists → load + modify
+     df = pd.read_csv(csv_path)
+
+     # If values is callable, call it with df
+     if callable(values):
+         values = values(df)
+
+     # Broadcast scalar
+     if not hasattr(values, "__len__") or isinstance(values, str):
+         df[column_name] = values
+         df.to_csv(csv_path, index=index)
+         return df
+
+     # Sequence case: lengths must match
+     if len(values) != len(df):
+         raise ValueError(
+             f"Length mismatch: CSV has {len(df)} rows "
+             f"but values has {len(values)} entries."
+         )
+
+     df[column_name] = list(values)
+     df.to_csv(csv_path, index=index)
+     return df
+
+ ######################################################################################################
+
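A minimal sketch of the three value types described in the docstring above (the sample-sheet path and column names are hypothetical):

    from smftools.readwrite import add_or_update_column_in_csv

    # list: one value per row (creates samples.csv if it does not exist)
    add_or_update_column_in_csv("samples.csv", "barcode", ["bc01", "bc02", "bc03"])
    # scalar: broadcast to every existing row
    add_or_update_column_in_csv("samples.csv", "batch", "run1")
    # callable: derives the new column from the existing DataFrame
    add_or_update_column_in_csv("samples.csv", "is_control", lambda df: df["barcode"] == "bc01")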

  ######################################################################################################
  ## Numpy, Pandas, Anndata functionality
@@ -62,7 +168,6 @@ def adata_to_df(adata, layer=None):

      return df

-
  def save_matrix(matrix, save_name):
      """
      Input: A numpy matrix and a save_name
@@ -71,106 +176,913 @@ def save_matrix(matrix, save_name):
      import numpy as np
      np.savetxt(f'{save_name}.txt', matrix)

- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
+ def concatenate_h5ads(
+     output_path: str | Path,
+     *,
+     input_dir: str | Path | None = None,
+     csv_path: str | Path | None = None,
+     csv_column: str = "h5ad_path",
+     file_suffixes: Sequence[str] = (".h5ad", ".h5ad.gz"),
+     delete_inputs: bool = False,
+     restore_backups: bool = True,
+ ) -> Path:
      """
-     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-     Input: an output file path relative to the directory in which the function is called
+     Concatenate multiple .h5ad files into one AnnData and write it safely.
+
+     Two input modes (choose ONE):
+       1) Directory mode: use all *.h5ad / *.h5ad.gz in `input_dir`.
+       2) CSV mode: use file paths from column `csv_column` in `csv_path`.
+
+     Parameters
+     ----------
+     output_path
+         Path to the final concatenated .h5ad (can be .h5ad or .h5ad.gz).
+     input_dir
+         Directory containing .h5ad files to concatenate. If None and csv_path
+         is also None, defaults to the current working directory.
+     csv_path
+         Path to a CSV containing file paths to concatenate (in column `csv_column`).
+     csv_column
+         Name of the column in the CSV containing .h5ad paths.
+     file_suffixes
+         Tuple of allowed suffixes (default: (".h5ad", ".h5ad.gz")).
+     delete_inputs
+         If True, delete the input .h5ad files after successful write of output.
+     restore_backups
+         Passed through to `safe_read_h5ad(restore_backups=...)`.
+
+     Returns
+     -------
+     Path
+         The path to the written concatenated .h5ad file.
+
+     Raises
+     ------
+     ValueError
+         If both `input_dir` and `csv_path` are provided, or none contain files.
+     FileNotFoundError
+         If specified CSV or directory does not exist.
      """
-     import os
-     import anndata as ad
-     # Runtime warnings
-     import warnings
-     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-     # List all files in the directory
-     files = os.listdir(os.getcwd())
-     # get current working directory
-     cwd = os.getcwd()
-     suffix = file_suffix
-     # Filter file names that contain the search string in their filename and keep them in a list
-     hdfs = [hdf for hdf in files if suffix in hdf]
-     # Sort file list by names and print the list of file names
-     hdfs.sort()
-     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-     # Iterate over all of the hdf5 files and concatenate them.
-     final_adata = None
-     for hdf in hdfs:
-         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-         temp_adata = ad.read_h5ad(hdf)
-         if final_adata:
-             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-         else:
-             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
+
+     # ------------------------------------------------------------------
+     # Setup and input resolution
+     # ------------------------------------------------------------------
+     output_path = Path(output_path)
+
+     if input_dir is not None and csv_path is not None:
+         raise ValueError("Provide either `input_dir` OR `csv_path`, not both.")
+
+     if csv_path is None:
+         # Directory mode
+         input_dir = Path(input_dir) if input_dir is not None else Path.cwd()
+         if not input_dir.exists():
+             raise FileNotFoundError(f"Input directory does not exist: {input_dir}")
+         if not input_dir.is_dir():
+             raise ValueError(f"input_dir is not a directory: {input_dir}")
+
+         # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
+         suffixes_lower = tuple(s.lower() for s in file_suffixes)
+         h5_paths = sorted(
+             p for p in input_dir.iterdir()
+             if p.is_file() and p.suffix.lower() in suffixes_lower
+         )
+
+     else:
+         # CSV mode
+         csv_path = Path(csv_path)
+         if not csv_path.exists():
+             raise FileNotFoundError(f"CSV path does not exist: {csv_path}")
+
+         df = pd.read_csv(csv_path, dtype=str)
+         if csv_column not in df.columns:
+             raise ValueError(
+                 f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
+             )
+         paths = df[csv_column].dropna().astype(str).tolist()
+         if not paths:
+             raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
+
+         h5_paths = [Path(p).expanduser() for p in paths]
+
+     if not h5_paths:
+         raise ValueError("No input .h5ad files found to concatenate.")
+
+     # Ensure directory for output exists
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # ------------------------------------------------------------------
+     # Concatenate
+     # ------------------------------------------------------------------
+     warnings.filterwarnings("ignore", category=UserWarning, module="anndata")
+     warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
+
+     print(f"{time_string()}: Found {len(h5_paths)} input h5ad files:")
+     for p in h5_paths:
+         print(f" - {p}")
+
+     final_adata: Optional[ad.AnnData] = None
+
+     for p in h5_paths:
+         print(f"{time_string()}: Reading {p}")
+         temp_adata, read_report = safe_read_h5ad(p, restore_backups=restore_backups)
+
+         if final_adata is None:
+             print(f"{time_string()}: Initializing final AnnData with {p}")
              final_adata = temp_adata
-     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-     final_adata.write_h5ad(output_file, compression='gzip')
+         else:
+             print(f"{time_string()}: Concatenating {p} into final AnnData")
+             final_adata = ad.concat(
+                 [final_adata, temp_adata],
+                 join="outer",
+                 merge='unique',
+                 uns_merge='unique',
+                 index_unique=None,
+             )
+
+     if final_adata is None:
+         raise RuntimeError("Unexpected: no AnnData objects loaded.")

-     # Delete the individual h5ad files and only keep the final concatenated file
+     print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
+     safe_write_h5ad(final_adata, output_path, backup=restore_backups)
+
+     # ------------------------------------------------------------------
+     # Optional cleanup (delete inputs)
+     # ------------------------------------------------------------------
      if delete_inputs:
-         files = os.listdir(os.getcwd())
-         hdfs = [hdf for hdf in files if suffix in hdf]
-         if output_file in hdfs:
-             hdfs.remove(output_file)
-         # Iterate over the files and delete them
-         for hdf in hdfs:
-             try:
-                 os.remove(hdf)
-                 print(f"Deleted file: {hdf}")
-             except OSError as e:
-                 print(f"Error deleting file {hdf}: {e}")
+         out_resolved = output_path.resolve()
+         for p in h5_paths:
+             try:
+                 # Don't delete the output file if it happens to be in the list
+                 if p.resolve() == out_resolved:
+                     continue
+                 if p.exists():
+                     p.unlink()
+                     print(f"Deleted input file: {p}")
+             except OSError as e:
+                 print(f"Error deleting file {p}: {e}")
      else:
-         print('Keeping input files')
+         print("Keeping input files.")

- def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./"):
+     return output_path
+
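A minimal sketch of the two input modes described in the docstring above (directory, output, and sample-sheet names are hypothetical):

    from smftools.readwrite import concatenate_h5ads

    # Directory mode: every .h5ad / .h5ad.gz under one folder
    concatenate_h5ads("merged.h5ad.gz", input_dir="per_sample_h5ads")

    # CSV mode: paths taken from the 'h5ad_path' column of a sample sheet
    concatenate_h5ads("merged.h5ad.gz", csv_path="samples.csv", csv_column="h5ad_path")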
328
+ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
127
329
  """
128
- Saves an AnnData object safely by omitting problematic columns from .obs and .var.
330
+ Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.
129
331
 
130
- Parameters:
131
- adata (AnnData): The AnnData object to save.
132
- path (str): Output .h5ad file path.
133
- compression (str): Compression method for h5ad file.
134
- backup (bool): If True, saves problematic columns to CSV files.
135
- backup_dir (str): Directory to store backups if backup=True.
332
+ Returns a report dict and prints a summary of what was converted/backed up/skipped.
136
333
  """
137
- import anndata as ad
334
+ import os, json, pickle
335
+ from pathlib import Path
336
+ import numpy as np
138
337
  import pandas as pd
139
- import os
338
+ import warnings
339
+ import anndata as _ad
340
+
341
+ path = Path(path)
342
+
343
+ if not backup_dir:
344
+ backup_dir = path.parent / str(path.name).split(".")[0]
140
345
 
141
346
  os.makedirs(backup_dir, exist_ok=True)
142
347
 
143
- def filter_df(df, df_name):
144
- bad_cols = []
145
- for col in df.columns:
146
- if df[col].dtype == 'object':
147
- if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
148
- bad_cols.append(col)
149
- if bad_cols:
150
- print(f"⚠️ Skipping columns from {df_name}: {bad_cols}")
151
- if backup:
152
- df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
153
- print(f"📝 Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
154
- return df.drop(columns=bad_cols)
155
-
156
- # Clean obs and var
157
- obs_clean = filter_df(adata.obs, "obs")
158
- var_clean = filter_df(adata.var, "var")
159
-
160
- # Save clean version
161
- adata_copy = ad.AnnData(
162
- X=adata.X,
163
- obs=obs_clean,
164
- var=var_clean,
165
- layers=adata.layers,
166
- uns=adata.uns,
167
- obsm=adata.obsm,
168
- varm=adata.varm
169
- )
170
- adata_copy.write_h5ad(path, compression=compression)
171
- print(f" Saved safely to {path}")
172
-
173
- def merge_barcoded_anndatas(adata_single, adata_double):
348
+ # report structure
349
+ report = {
350
+ "obs_converted_columns": [],
351
+ "obs_backed_up_columns": [],
352
+ "var_converted_columns": [],
353
+ "var_backed_up_columns": [],
354
+ "uns_backed_up_keys": [],
355
+ "uns_json_keys": [],
356
+ "layers_converted": [],
357
+ "layers_skipped": [],
358
+ "obsm_converted": [],
359
+ "obsm_skipped": [],
360
+ "X_replaced_or_converted": None,
361
+ "errors": [],
362
+ }
363
+
364
+ def _backup(obj, name):
365
+ """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
366
+ fname = backup_dir / f"{name}.pkl"
367
+ try:
368
+ with open(fname, "wb") as fh:
369
+ pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
370
+ if verbose:
371
+ print(f" backed up {name} -> {fname}")
372
+ return fname
373
+ except Exception as e:
374
+ msg = f"failed to back up {name}: {e}"
375
+ if verbose:
376
+ print(" " + msg)
377
+ report["errors"].append(msg)
378
+ return None
379
+
380
+ def _make_obs_var_safe(df: pd.DataFrame, which: str):
381
+ """
382
+ Return a sanitized copy of df where:
383
+ - object columns converted to strings (with backup)
384
+ - categorical columns' categories coerced to str (with backup)
385
+ """
386
+ df = df.copy()
387
+ for col in list(df.columns):
388
+ ser = df[col]
389
+ # categorical handling
390
+ try:
391
+ is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
392
+ except Exception:
393
+ is_cat = False
394
+
395
+ if is_cat:
396
+ try:
397
+ cats = ser.cat.categories
398
+ cats_str = cats.astype(str)
399
+ df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
400
+ if verbose:
401
+ print(f" coerced categorical column '{which}.{col}' -> string categories")
402
+ if which == "obs":
403
+ report["obs_converted_columns"].append(col)
404
+ else:
405
+ report["var_converted_columns"].append(col)
406
+ except Exception:
407
+ # backup then coerce
408
+ if backup:
409
+ _backup(ser, f"{which}.{col}_categorical_backup")
410
+ if which == "obs":
411
+ report["obs_backed_up_columns"].append(col)
412
+ else:
413
+ report["var_backed_up_columns"].append(col)
414
+ df[col] = ser.astype(str)
415
+ if verbose:
416
+ print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
417
+ continue
418
+
419
+ # object dtype handling: try to coerce each element to string
420
+ try:
421
+ is_obj = ser.dtype == object or pd.api.types.is_object_dtype(ser.dtype)
422
+ except Exception:
423
+ is_obj = False
424
+
425
+ if is_obj:
426
+ # test whether converting to string succeeds for all elements
427
+ try:
428
+ _ = np.array(ser.values.astype(str))
429
+ if backup:
430
+ _backup(ser.values, f"{which}.{col}_backup")
431
+ if which == "obs":
432
+ report["obs_backed_up_columns"].append(col)
433
+ else:
434
+ report["var_backed_up_columns"].append(col)
435
+ df[col] = ser.values.astype(str)
436
+ if verbose:
437
+ print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
438
+ if which == "obs":
439
+ report["obs_converted_columns"].append(col)
440
+ else:
441
+ report["var_converted_columns"].append(col)
442
+ except Exception:
443
+ # fallback: attempt per-element json.dumps; if fails mark as backed-up and coerce via str()
444
+ convertible = True
445
+ for val in ser.values:
446
+ try:
447
+ json.dumps(val, default=str)
448
+ except Exception:
449
+ convertible = False
450
+ break
451
+ if convertible:
452
+ if backup:
453
+ _backup(ser.values, f"{which}.{col}_backup")
454
+ if which == "obs":
455
+ report["obs_backed_up_columns"].append(col)
456
+ else:
457
+ report["var_backed_up_columns"].append(col)
458
+ df[col] = [json.dumps(v, default=str) for v in ser.values]
459
+ if verbose:
460
+ print(f" json-stringified object column '{which}.{col}' (backup={backup})")
461
+ if which == "obs":
462
+ report["obs_converted_columns"].append(col)
463
+ else:
464
+ report["var_converted_columns"].append(col)
465
+ else:
466
+ # fallback to string repr and backup
467
+ if backup:
468
+ _backup(ser.values, f"{which}.{col}_backup")
469
+ if which == "obs":
470
+ report["obs_backed_up_columns"].append(col)
471
+ else:
472
+ report["var_backed_up_columns"].append(col)
473
+ df[col] = ser.astype(str)
474
+ if verbose:
475
+ print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
476
+ if which == "obs":
477
+ report["obs_converted_columns"].append(col)
478
+ else:
479
+ report["var_converted_columns"].append(col)
480
+ return df
481
+
482
+ def _sanitize_uns(uns: dict):
483
+ """
484
+ For each key/value in uns:
485
+ - if json.dumps(value) works: keep it
486
+ - else: pickle value to backup dir, and add a JSON-stringified representation under key+'_json'
487
+ """
488
+ clean = {}
489
+ backed_up = []
490
+ for k, v in uns.items():
491
+ try:
492
+ json.dumps(v)
493
+ clean[k] = v
494
+ except Exception:
495
+ try:
496
+ s = json.dumps(v, default=str)
497
+ clean[k + "_json"] = s
498
+ if backup:
499
+ _backup(v, f"uns_{k}_backup")
500
+ backed_up.append(k)
501
+ if verbose:
502
+ print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
503
+ report["uns_json_keys"].append(k)
504
+ except Exception:
505
+ try:
506
+ if backup:
507
+ _backup(v, f"uns_{k}_backup")
508
+ clean[k + "_str"] = str(v)
509
+ backed_up.append(k)
510
+ if verbose:
511
+ print(f" uns['{k}'] stored as string under '{k}_str' (backed up).")
512
+ report["uns_backed_up_keys"].append(k)
513
+ except Exception as e:
514
+ msg = f"uns['{k}'] could not be preserved: {e}"
515
+ report["errors"].append(msg)
516
+ if verbose:
517
+ print(" " + msg)
518
+ if backed_up and verbose:
519
+ print(f"Sanitized .uns keys (backed up): {backed_up}")
520
+ return clean
521
+
522
+ def _sanitize_layers_obsm(src_dict, which: str):
523
+ """
524
+ Ensure arrays in layers/obsm are numeric and non-object dtype.
525
+ Returns a cleaned dict suitable to pass into AnnData(...)
526
+ If an entry is not convertible, it is backed up & skipped.
527
+ """
528
+ cleaned = {}
529
+ for k, v in src_dict.items():
530
+ try:
531
+ arr = np.asarray(v)
532
+ if arr.dtype == object:
533
+ try:
534
+ arr_f = arr.astype(float)
535
+ cleaned[k] = arr_f
536
+ report_key = f"{which}.{k}"
537
+ report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
538
+ if verbose:
539
+ print(f" {which}.{k} object array coerced to float.")
540
+ except Exception:
541
+ try:
542
+ arr_i = arr.astype(int)
543
+ cleaned[k] = arr_i
544
+ report_key = f"{which}.{k}"
545
+ report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
546
+ if verbose:
547
+ print(f" {which}.{k} object array coerced to int.")
548
+ except Exception:
549
+ if backup:
550
+ _backup(v, f"{which}_{k}_backup")
551
+ if which == "layers":
552
+ report["layers_skipped"].append(k)
553
+ else:
554
+ report["obsm_skipped"].append(k)
555
+ if verbose:
556
+ print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
557
+ continue
558
+ else:
559
+ cleaned[k] = arr
560
+ except Exception as e:
561
+ if backup:
562
+ _backup(v, f"{which}_{k}_backup")
563
+ if which == "layers":
564
+ report["layers_skipped"].append(k)
565
+ else:
566
+ report["obsm_skipped"].append(k)
567
+ msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
568
+ report["errors"].append(msg)
569
+ if verbose:
570
+ print(msg)
571
+ continue
572
+ return cleaned
573
+
574
+ # ---------- sanitize obs, var ----------
575
+ try:
576
+ obs_clean = _make_obs_var_safe(adata.obs, "obs")
577
+ except Exception as e:
578
+ msg = f"Failed to sanitize obs: {e}"
579
+ report["errors"].append(msg)
580
+ if verbose:
581
+ print(msg)
582
+ obs_clean = adata.obs.copy()
583
+
584
+ try:
585
+ var_clean = _make_obs_var_safe(adata.var, "var")
586
+ except Exception as e:
587
+ msg = f"Failed to sanitize var: {e}"
588
+ report["errors"].append(msg)
589
+ if verbose:
590
+ print(msg)
591
+ var_clean = adata.var.copy()
592
+
593
+ # ---------- sanitize uns ----------
594
+ try:
595
+ uns_clean = _sanitize_uns(adata.uns)
596
+ except Exception as e:
597
+ msg = f"Failed to sanitize uns: {e}"
598
+ report["errors"].append(msg)
599
+ if verbose:
600
+ print(msg)
601
+ uns_clean = {}
602
+
603
+ # ---------- sanitize layers and obsm ----------
604
+ layers_src = getattr(adata, "layers", {})
605
+ obsm_src = getattr(adata, "obsm", {})
606
+
607
+ try:
608
+ layers_clean = _sanitize_layers_obsm(layers_src, "layers")
609
+ except Exception as e:
610
+ msg = f"Failed to sanitize layers: {e}"
611
+ report["errors"].append(msg)
612
+ if verbose:
613
+ print(msg)
614
+ layers_clean = {}
615
+
616
+ try:
617
+ obsm_clean = _sanitize_layers_obsm(obsm_src, "obsm")
618
+ except Exception as e:
619
+ msg = f"Failed to sanitize obsm: {e}"
620
+ report["errors"].append(msg)
621
+ if verbose:
622
+ print(msg)
623
+ obsm_clean = {}
624
+
625
+ # ---------- handle X ----------
626
+ X_to_use = adata.X
627
+ try:
628
+ X_arr = np.asarray(adata.X)
629
+ if X_arr.dtype == object:
630
+ try:
631
+ X_to_use = X_arr.astype(float)
632
+ report["X_replaced_or_converted"] = "converted_to_float"
633
+ if verbose:
634
+ print("Converted adata.X object-dtype -> float")
635
+ except Exception:
636
+ if backup:
637
+ _backup(adata.X, "X_backup")
638
+ X_to_use = np.zeros_like(X_arr, dtype=float)
639
+ report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
640
+ if verbose:
641
+ print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
642
+ except Exception as e:
643
+ msg = f"Error handling adata.X: {e}"
644
+ report["errors"].append(msg)
645
+ if verbose:
646
+ print(msg)
647
+ X_to_use = adata.X
648
+
649
+ # ---------- build lightweight AnnData copy ----------
650
+ try:
651
+ adata_copy = _ad.AnnData(
652
+ X=X_to_use,
653
+ obs=obs_clean,
654
+ var=var_clean,
655
+ layers=layers_clean,
656
+ uns=uns_clean,
657
+ obsm=obsm_clean,
658
+ varm=getattr(adata, "varm", None),
659
+ )
660
+
661
+ # preserve names (as strings)
662
+ try:
663
+ adata_copy.obs_names = adata.obs_names.astype(str)
664
+ adata_copy.var_names = adata.var_names.astype(str)
665
+ except Exception:
666
+ adata_copy.obs_names = adata.obs_names
667
+ adata_copy.var_names = adata.var_names
668
+
669
+ # --- write
670
+ adata_copy.write_h5ad(path, compression=compression)
671
+ if verbose:
672
+ print(f"Saved safely to {path}")
673
+ except Exception as e:
674
+ msg = f"Failed to write h5ad: {e}"
675
+ report["errors"].append(msg)
676
+ if verbose:
677
+ print(msg)
678
+ raise
679
+
680
+ # Print a concise interactive report
681
+ print("\n=== safe_write_h5ad REPORT ===")
682
+ print(f"Saved file: {path}")
683
+ print(f"Adata shape: {adata.shape}")
684
+ if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
685
+ print(f"obs: converted columns -> {report['obs_converted_columns']}")
686
+ print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
687
+ else:
688
+ print("obs: no problematic columns found.")
689
+
690
+ if report["var_converted_columns"] or report["var_backed_up_columns"]:
691
+ print(f"var: converted columns -> {report['var_converted_columns']}")
692
+ print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
693
+ else:
694
+ print("var: no problematic columns found.")
695
+
696
+ if report["uns_json_keys"] or report["uns_backed_up_keys"]:
697
+ print(f".uns: jsonified keys -> {report['uns_json_keys']}")
698
+ print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
699
+ else:
700
+ print(".uns: no problematic keys found.")
701
+
702
+ if report["layers_converted"] or report["layers_skipped"]:
703
+ print(f"layers: converted -> {report['layers_converted']}")
704
+ print(f"layers: skipped -> {report['layers_skipped']}")
705
+ else:
706
+ print("layers: no problematic entries found.")
707
+
708
+ if report["obsm_converted"] or report["obsm_skipped"]:
709
+ print(f"obsm: converted -> {report['obsm_converted']}")
710
+ print(f"obsm: skipped -> {report['obsm_skipped']}")
711
+ else:
712
+ print("obsm: no problematic entries found.")
713
+
714
+ if report["X_replaced_or_converted"]:
715
+ print(f"adata.X handled: {report['X_replaced_or_converted']}")
716
+ else:
717
+ print("adata.X: no changes.")
718
+
719
+ if report["errors"]:
720
+ print("\nWarnings / errors encountered:")
721
+ for e in report["errors"]:
722
+ print(" -", e)
723
+
724
+ print("=== end report ===\n")
725
+ return report
726
+
727
+ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
728
+ """
729
+ Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
730
+ from the backup_dir produced during save.
731
+
732
+ Parameters
733
+ ----------
734
+ path : str
735
+ Path to the cleaned .h5ad produced by safe_write_h5ad.
736
+ backup_dir : str
737
+ Directory where safe_write_h5ad stored pickled backups (default "./uns_backups").
738
+ restore_backups : bool
739
+ If True, attempt to load pickled backups and restore original objects into adata.
740
+ re_categorize : bool
741
+ If True, try to coerce small unique-count string columns back into pandas.Categorical.
742
+ categorical_threshold : int
743
+ Max unique values for a column to be considered categorical for automatic recasting.
744
+ verbose : bool
745
+ Print progress/summary.
746
+
747
+ Returns
748
+ -------
749
+ (adata, report) :
750
+ adata : AnnData
751
+ The reloaded (and possibly restored) AnnData instance.
752
+ report : dict
753
+ A report describing restored items, parsed JSON keys, and any failures.
754
+ """
755
+ import os
756
+ from pathlib import Path
757
+ import json
758
+ import pickle
759
+ import numpy as np
760
+ import pandas as pd
761
+ import anndata as _ad
762
+
763
+ path = Path(path)
764
+
765
+ if not backup_dir:
766
+ backup_dir = path.parent / str(path.name).split(".")[0]
767
+
768
+ report = {
769
+ "restored_obs_columns": [],
770
+ "restored_var_columns": [],
771
+ "restored_uns_keys": [],
772
+ "parsed_uns_json_keys": [],
773
+ "restored_layers": [],
774
+ "restored_obsm": [],
775
+ "recategorized_obs": [],
776
+ "recategorized_var": [],
777
+ "missing_backups": [],
778
+ "errors": [],
779
+ }
780
+
781
+ if verbose:
782
+ print(f"[safe_read_h5ad] loading {path}")
783
+
784
+ # 1) load the cleaned h5ad
785
+ try:
786
+ adata = _ad.read_h5ad(path)
787
+ except Exception as e:
788
+ raise RuntimeError(f"Failed to read h5ad at {path}: {e}")
789
+
790
+ # Ensure backup_dir exists (may be relative to cwd)
791
+ if verbose:
792
+ print(f"[safe_read_h5ad] looking for backups in {backup_dir}")
793
+
794
+ def _load_pickle_if_exists(fname):
795
+ if os.path.exists(fname):
796
+ try:
797
+ with open(fname, "rb") as fh:
798
+ val = pickle.load(fh)
799
+ return val
800
+ except Exception as e:
801
+ report["errors"].append(f"Failed to load pickle {fname}: {e}")
802
+ if verbose:
803
+ print(f" error loading {fname}: {e}")
804
+ return None
805
+ return None
806
+
807
+ # 2) Restore obs columns
808
+ for col in list(adata.obs.columns):
809
+ # Look for backup with exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
810
+ bname1 = backup_dir / f"obs.{col}_backup.pkl"
811
+ bname2 = backup_dir / f"obs.{col}_categorical_backup.pkl"
812
+ restored = False
813
+
814
+ if restore_backups:
815
+ val = _load_pickle_if_exists(bname2)
816
+ if val is not None:
817
+ # val may be the categorical series or categories
818
+ try:
819
+ # If pickled numpy array or pandas Series, coerce to same index alignment
820
+ if hasattr(val, "shape") and (len(val) == adata.shape[0]):
821
+ adata.obs[col] = pd.Series(val, index=adata.obs.index)
822
+ else:
823
+ # fallback: place pickled object directly
824
+ adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
825
+ report["restored_obs_columns"].append((col, bname2))
826
+ restored = True
827
+ if verbose:
828
+ print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
829
+ except Exception as e:
830
+ report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
831
+ restored = False
832
+
833
+ if not restored:
834
+ val = _load_pickle_if_exists(bname1)
835
+ if val is not None:
836
+ try:
837
+ if hasattr(val, "shape") and (len(val) == adata.shape[0]):
838
+ adata.obs[col] = pd.Series(val, index=adata.obs.index)
839
+ else:
840
+ adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
841
+ report["restored_obs_columns"].append((col, bname1))
842
+ restored = True
843
+ if verbose:
844
+ print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
845
+ except Exception as e:
846
+ report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
847
+ restored = False
848
+
849
+ # If not restored and column dtype is object but contains JSON-like strings, try json.loads per element
850
+ if (not restored) and (adata.obs[col].dtype == object):
851
+ sample_vals = adata.obs[col].dropna().astype(str).head(20).tolist()
852
+ looks_like_json = False
853
+ for sv in sample_vals:
854
+ svs = sv.strip()
855
+ if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
856
+ looks_like_json = True
857
+ break
858
+ if looks_like_json:
859
+ parsed = []
860
+ success_parse = True
861
+ for v in adata.obs[col].astype(str).values:
862
+ try:
863
+ parsed.append(json.loads(v))
864
+ except Exception:
865
+ # if any element fails, don't convert whole column
866
+ success_parse = False
867
+ break
868
+ if success_parse:
869
+ adata.obs[col] = pd.Series(parsed, index=adata.obs.index)
870
+ report["restored_obs_columns"].append((col, "parsed_json"))
871
+ restored = True
872
+ if verbose:
873
+ print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")
874
+
875
+ # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
876
+ if (not restored) and re_categorize and adata.obs[col].dtype == object:
877
+ try:
878
+ nunique = adata.obs[col].dropna().astype(str).nunique()
879
+ if nunique > 0 and nunique <= categorical_threshold:
880
+ # cast to category
881
+ adata.obs[col] = adata.obs[col].astype(str).astype("category")
882
+ report["recategorized_obs"].append(col)
883
+ if verbose:
884
+ print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
885
+ except Exception as e:
886
+ report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
887
+
888
+ # 3) Restore var columns (same logic)
889
+ for col in list(adata.var.columns):
890
+ bname1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
891
+ bname2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
892
+ restored = False
893
+
894
+ if restore_backups:
895
+ val = _load_pickle_if_exists(bname2)
896
+ if val is not None:
897
+ try:
898
+ if hasattr(val, "shape") and (len(val) == adata.shape[1]):
899
+ adata.var[col] = pd.Series(val, index=adata.var.index)
900
+ else:
901
+ adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
902
+ report["restored_var_columns"].append((col, bname2))
903
+ restored = True
904
+ if verbose:
905
+ print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
906
+ except Exception as e:
907
+ report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")
908
+
909
+ if not restored:
910
+ val = _load_pickle_if_exists(bname1)
911
+ if val is not None:
912
+ try:
913
+ if hasattr(val, "shape") and (len(val) == adata.shape[1]):
914
+ adata.var[col] = pd.Series(val, index=adata.var.index)
915
+ else:
916
+ adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
917
+ report["restored_var_columns"].append((col, bname1))
918
+ restored = True
919
+ if verbose:
920
+ print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
921
+ except Exception as e:
922
+ report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")
923
+
924
+ if (not restored) and (adata.var[col].dtype == object):
925
+ # try JSON parsing
926
+ sample_vals = adata.var[col].dropna().astype(str).head(20).tolist()
927
+ looks_like_json = False
928
+ for sv in sample_vals:
929
+ svs = sv.strip()
930
+ if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
931
+ looks_like_json = True
932
+ break
933
+ if looks_like_json:
934
+ parsed = []
935
+ success_parse = True
936
+ for v in adata.var[col].astype(str).values:
937
+ try:
938
+ parsed.append(json.loads(v))
939
+ except Exception:
940
+ success_parse = False
941
+ break
942
+ if success_parse:
943
+ adata.var[col] = pd.Series(parsed, index=adata.var.index)
944
+ report["restored_var_columns"].append((col, "parsed_json"))
945
+ if verbose:
946
+ print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")
947
+
948
+ if (not restored) and re_categorize and adata.var[col].dtype == object:
949
+ try:
950
+ nunique = adata.var[col].dropna().astype(str).nunique()
951
+ if nunique > 0 and nunique <= categorical_threshold:
952
+ adata.var[col] = adata.var[col].astype(str).astype("category")
953
+ report["recategorized_var"].append(col)
954
+ if verbose:
955
+ print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
956
+ except Exception as e:
957
+ report["errors"].append(f"Failed to recategorize var.{col}: {e}")
958
+
959
+ # 4) Restore uns: look for uns_{k}_backup.pkl, or keys like "<k>_json"
960
+ uns_keys = list(adata.uns.keys())
961
+ # First, if we have "<k>_json", convert back into k
962
+ for k in uns_keys:
963
+ if k.endswith("_json"):
964
+ base = k[:-5]
965
+ sval = adata.uns.get(k)
966
+ try:
967
+ parsed = json.loads(sval)
968
+ adata.uns[base] = parsed
969
+ report["parsed_uns_json_keys"].append(base)
970
+ if verbose:
971
+ print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
972
+ # remove the _json entry
973
+ try:
974
+ del adata.uns[k]
975
+ except KeyError:
976
+ pass
977
+ except Exception as e:
978
+ report["errors"].append(f"Failed to json-parse uns['{k}']: {e}")
979
+
980
+ # Now try to restore pickled backups for uns keys
981
+ # Look for files named uns_<key>_backup.pkl
982
+ # We will attempt to restore into adata.uns[key] if backup exists
983
+ for fname in os.listdir(backup_dir) if os.path.isdir(backup_dir) else []:
984
+ if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
985
+ continue
986
+ # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
987
+ key = fname[len("uns_"):-len("_backup.pkl")]
988
+ full = os.path.join(backup_dir, fname)
989
+ val = _load_pickle_if_exists(full)
990
+ if val is not None:
991
+ adata.uns[key] = val
992
+ report["restored_uns_keys"].append((key, full))
993
+ if verbose:
994
+ print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")
995
+
996
+ # 5) Restore layers and obsm from backups if present
997
+ # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
998
+ if os.path.isdir(backup_dir):
999
+ for fname in os.listdir(backup_dir):
1000
+ if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
1001
+ layer_name = fname[len("layers_"):-len("_backup.pkl")]
1002
+ full = os.path.join(backup_dir, fname)
1003
+ val = _load_pickle_if_exists(full)
1004
+ if val is not None:
1005
+ try:
1006
+ adata.layers[layer_name] = np.asarray(val)
1007
+ report["restored_layers"].append((layer_name, full))
1008
+ if verbose:
1009
+ print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
1010
+ except Exception as e:
1011
+ report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")
1012
+
1013
+ if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
1014
+ obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
1015
+ full = os.path.join(backup_dir, fname)
1016
+ val = _load_pickle_if_exists(full)
1017
+ if val is not None:
1018
+ try:
1019
+ adata.obsm[obsm_name] = np.asarray(val)
1020
+ report["restored_obsm"].append((obsm_name, full))
1021
+ if verbose:
1022
+ print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
1023
+ except Exception as e:
1024
+ report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")
1025
+
1026
+ # 6) If restore_backups True but some expected backups missing, note them
1027
+ if restore_backups and os.path.isdir(backup_dir):
1028
+ # detect common expected names from obs/var/uns/layers in adata
1029
+ expected_missing = []
1030
+ # obs/var columns
1031
+ for col in list(adata.obs.columns):
1032
+ p1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
1033
+ p2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
1034
+ if (not os.path.exists(p1)) and (not os.path.exists(p2)):
1035
+ # we don't require backups for every column; only record if column still looks like placeholder strings
1036
+ if adata.obs[col].dtype == object:
1037
+ expected_missing.append(("obs", col))
1038
+ for col in list(adata.var.columns):
1039
+ p1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
1040
+ p2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
1041
+ if (not os.path.exists(p1)) and (not os.path.exists(p2)):
1042
+ if adata.var[col].dtype == object:
1043
+ expected_missing.append(("var", col))
1044
+ # uns keys
1045
+ for k in adata.uns.keys():
1046
+ # if we have *_json or *_str variants we expect backups optionally
1047
+ if k.endswith("_json") or k.endswith("_str"):
1048
+ b = os.path.join(backup_dir, f"uns_{k[:-5]}_backup.pkl")
1049
+ if not os.path.exists(b):
1050
+ report["missing_backups"].append(("uns", k))
1051
+ if expected_missing and verbose:
1052
+ n = len(expected_missing)
1053
+ if verbose:
1054
+ print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
1055
+ # add to report
1056
+ report["missing_backups"].extend(expected_missing)
1057
+
1058
+ # final summary print
1059
+ if verbose:
1060
+ print("\n=== safe_read_h5ad summary ===")
1061
+ if report["restored_obs_columns"]:
1062
+ print("Restored obs columns:", report["restored_obs_columns"])
1063
+ if report["restored_var_columns"]:
1064
+ print("Restored var columns:", report["restored_var_columns"])
1065
+ if report["restored_uns_keys"]:
1066
+ print("Restored uns keys:", report["restored_uns_keys"])
1067
+ if report["parsed_uns_json_keys"]:
1068
+ print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
1069
+ if report["restored_layers"]:
1070
+ print("Restored layers:", report["restored_layers"])
1071
+ if report["restored_obsm"]:
1072
+ print("Restored obsm:", report["restored_obsm"])
1073
+ if report["recategorized_obs"] or report["recategorized_var"]:
1074
+ print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
1075
+ if report["missing_backups"]:
1076
+ print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
1077
+ if report["errors"]:
1078
+ print("Errors encountered (see report['errors']):")
1079
+ for e in report["errors"]:
1080
+ print(" -", e)
1081
+ print("=== end summary ===\n")
1082
+
1083
+ return adata, report
1084
+
1085
+ def merge_barcoded_anndatas_core(adata_single, adata_double):
174
1086
  import numpy as np
175
1087
  import anndata as ad
176
1088
 
@@ -194,5 +1106,25 @@ def merge_barcoded_anndatas(adata_single, adata_double):
      adata_merged.uns = {**adata_single.uns, **adata_double.uns}

      return adata_merged
-
- ######################################################################################################
+ ######################################################################################################
+
+ ### File conversion misc ###
+ import argparse
+ from Bio import SeqIO
+ def genbank_to_gff(genbank_file, output_file, record_id):
+     with open(output_file, "w") as out:
+         for record in SeqIO.parse(genbank_file, "genbank"):
+             for feature in record.features:
+                 # Skip features without location information
+                 if feature.location is None:
+                     continue
+                 # Extract feature information
+                 start = feature.location.start + 1 # Convert to 1-based index
+                 end = feature.location.end
+                 strand = "+" if feature.location.strand == 1 else "-"
+                 feature_type = feature.type
+                 # Format attributes
+                 attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
+                 # Write GFF3 line
+                 gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
+                 out.write(gff3_line + "\n")
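The diff above also introduces safe_write_h5ad and safe_read_h5ad, which sanitize problematic .obs/.var/.uns/.layers/.obsm entries on write and attempt to restore pickled backups on read. A minimal round-trip sketch, assuming `adata` is an existing AnnData and the file name is hypothetical:

    from smftools.readwrite import safe_write_h5ad, safe_read_h5ad

    # returns a report dict describing converted, backed-up, or skipped entries
    write_report = safe_write_h5ad(adata, "experiment.h5ad.gz", backup=True)

    # returns (adata, report); backups, if any, are restored from the sidecar backup directory
    adata2, read_report = safe_read_h5ad("experiment.h5ad.gz", restore_backups=True)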