smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
ADDED
@@ -0,0 +1,1004 @@
## readwrite ##

######################################################################################################
## Datetime functionality
def date_string():
    """
    Each time this is called, it returns the current date string
    """
    from datetime import datetime
    current_date = datetime.now()
    date_string = current_date.strftime("%Y%m%d")
    date_string = date_string[2:]
    return date_string

def time_string():
    """
    Each time this is called, it returns the current time string
    """
    from datetime import datetime
    current_time = datetime.now()
    return current_time.strftime("%H:%M:%S")
######################################################################################################

######################################################################################################
## Numpy, Pandas, Anndata functionality

def adata_to_df(adata, layer=None):
    """
    Convert an AnnData object into a Pandas DataFrame.

    Parameters:
        adata (AnnData): The input AnnData object.
        layer (str, optional): The layer to extract. If None, uses adata.X.

    Returns:
        pd.DataFrame: A DataFrame where rows are observations and columns are positions.
    """
    import pandas as pd
    import anndata as ad
    import numpy as np

    # Validate that the requested layer exists
    if layer and layer not in adata.layers:
        raise ValueError(f"Layer '{layer}' not found in adata.layers.")

    # Extract the data matrix
    data_matrix = adata.layers.get(layer, adata.X)

    # Ensure matrix is dense (handle sparse formats)
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()

    # Ensure obs and var have unique indices
    if adata.obs.index.duplicated().any():
        raise ValueError("Duplicate values found in `adata.obs.index`. Ensure unique observation indices.")

    if adata.var.index.duplicated().any():
        raise ValueError("Duplicate values found in `adata.var.index`. Ensure unique variable indices.")

    # Convert to DataFrame
    df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)

    return df
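Example (not part of readwrite.py): a minimal sketch of adata_to_df in use, assuming the module is importable as smftools.readwrite; the read and position names are made up.

# Usage sketch; read/position names are hypothetical.
import numpy as np
import pandas as pd
import anndata as ad
from smftools import readwrite

# Build a tiny AnnData: 3 reads x 4 positions, plus a binary layer.
X = np.random.rand(3, 4)
adata = ad.AnnData(
    X=X,
    obs=pd.DataFrame(index=["read1", "read2", "read3"]),
    var=pd.DataFrame(index=["100", "101", "102", "103"]),
)
adata.layers["binary"] = (X > 0.5).astype(float)

df = readwrite.adata_to_df(adata, layer="binary")  # rows = reads, columns = positions
print(df.shape)  # (3, 4)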


def save_matrix(matrix, save_name):
    """
    Input: A numpy matrix and a save_name
    Output: A txt file representation of the data matrix
    """
    import numpy as np
    np.savetxt(f'{save_name}.txt', matrix)

def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
    """
    Concatenate all h5ad files in a directory and delete them after the final adata is written out.
    Input: an output file path relative to the directory in which the function is called
    """
    import os
    import anndata as ad
    # Runtime warnings
    import warnings
    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')

    # List all files in the directory
    files = os.listdir(os.getcwd())
    # get current working directory
    cwd = os.getcwd()
    suffix = file_suffix
    # Filter file names that contain the search string in their filename and keep them in a list
    hdfs = [hdf for hdf in files if suffix in hdf]
    # Sort file list by names and print the list of file names
    hdfs.sort()
    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
    # Iterate over all of the hdf5 files and concatenate them.
    final_adata = None
    for hdf in hdfs:
        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
        temp_adata = ad.read_h5ad(hdf)
        if final_adata is not None:
            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
        else:
            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = temp_adata
    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
    final_adata.write_h5ad(output_file, compression='gzip')

    # Delete the individual h5ad files and only keep the final concatenated file
    if delete_inputs:
        files = os.listdir(os.getcwd())
        hdfs = [hdf for hdf in files if suffix in hdf]
        if output_file in hdfs:
            hdfs.remove(output_file)
        # Iterate over the files and delete them
        for hdf in hdfs:
            try:
                os.remove(hdf)
                print(f"Deleted file: {hdf}")
            except OSError as e:
                print(f"Error deleting file {hdf}: {e}")
    else:
        print('Keeping input files')
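Example (not part of readwrite.py): concatenate_h5ads operates on the current working directory, so a typical call changes into the folder holding the per-sample files first. A minimal sketch with hypothetical file and directory names:

# Usage sketch; directory and file names are hypothetical.
import os
from smftools import readwrite

os.chdir("per_sample_h5ads")  # directory containing e.g. sample1.h5ad.gz, sample2.h5ad.gz, ...
# Concatenates every file whose name contains 'h5ad.gz'; delete_inputs=False keeps
# the per-sample inputs alongside the merged output.
readwrite.concatenate_h5ads("combined.h5ad.gz", file_suffix="h5ad.gz", delete_inputs=False)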

def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./uns_backups", verbose=True):
    """
    Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.

    Returns a report dict and prints a summary of what was converted/backed up/skipped.
    """
    import os, json, pickle
    import numpy as np
    import pandas as pd
    import warnings
    import anndata as _ad

    os.makedirs(backup_dir, exist_ok=True)

    # report structure
    report = {
        "obs_converted_columns": [],
        "obs_backed_up_columns": [],
        "var_converted_columns": [],
        "var_backed_up_columns": [],
        "uns_backed_up_keys": [],
        "uns_json_keys": [],
        "layers_converted": [],
        "layers_skipped": [],
        "obsm_converted": [],
        "obsm_skipped": [],
        "X_replaced_or_converted": None,
        "errors": [],
    }

    def _backup(obj, name):
        """Pickle obj to backup_dir/name.pkl and return filename (or None)."""
        fname = os.path.join(backup_dir, f"{name}.pkl")
        try:
            with open(fname, "wb") as fh:
                pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
            if verbose:
                print(f" backed up {name} -> {fname}")
            return fname
        except Exception as e:
            msg = f"failed to back up {name}: {e}"
            if verbose:
                print(" " + msg)
            report["errors"].append(msg)
            return None

    def _make_obs_var_safe(df: pd.DataFrame, which: str):
        """
        Return a sanitized copy of df where:
        - object columns converted to strings (with backup)
        - categorical columns' categories coerced to str (with backup)
        """
        df = df.copy()
        for col in list(df.columns):
            ser = df[col]
            # categorical handling
            try:
                is_cat = pd.api.types.is_categorical_dtype(ser.dtype)
            except Exception:
                is_cat = False

            if is_cat:
                try:
                    cats = ser.cat.categories
                    cats_str = cats.astype(str)
                    df[col] = pd.Categorical(ser.astype(str), categories=cats_str)
                    if verbose:
                        print(f" coerced categorical column '{which}.{col}' -> string categories")
                    if which == "obs":
                        report["obs_converted_columns"].append(col)
                    else:
                        report["var_converted_columns"].append(col)
                except Exception:
                    # backup then coerce
                    if backup:
                        _backup(ser, f"{which}.{col}_categorical_backup")
                        if which == "obs":
                            report["obs_backed_up_columns"].append(col)
                        else:
                            report["var_backed_up_columns"].append(col)
                    df[col] = ser.astype(str)
                    if verbose:
                        print(f" coerced categorical column '{which}.{col}' -> strings (backup={backup})")
                continue

            # object dtype handling: try to coerce each element to string
            try:
                is_obj = ser.dtype == object or pd.api.types.is_object_dtype(ser.dtype)
            except Exception:
                is_obj = False

            if is_obj:
                # test whether converting to string succeeds for all elements
                try:
                    _ = np.array(ser.values.astype(str))
                    if backup:
                        _backup(ser.values, f"{which}.{col}_backup")
                        if which == "obs":
                            report["obs_backed_up_columns"].append(col)
                        else:
                            report["var_backed_up_columns"].append(col)
                    df[col] = ser.values.astype(str)
                    if verbose:
                        print(f" converted object column '{which}.{col}' -> strings (backup={backup})")
                    if which == "obs":
                        report["obs_converted_columns"].append(col)
                    else:
                        report["var_converted_columns"].append(col)
                except Exception:
                    # fallback: attempt per-element json.dumps; if fails mark as backed-up and coerce via str()
                    convertible = True
                    for val in ser.values:
                        try:
                            json.dumps(val, default=str)
                        except Exception:
                            convertible = False
                            break
                    if convertible:
                        if backup:
                            _backup(ser.values, f"{which}.{col}_backup")
                            if which == "obs":
                                report["obs_backed_up_columns"].append(col)
                            else:
                                report["var_backed_up_columns"].append(col)
                        df[col] = [json.dumps(v, default=str) for v in ser.values]
                        if verbose:
                            print(f" json-stringified object column '{which}.{col}' (backup={backup})")
                        if which == "obs":
                            report["obs_converted_columns"].append(col)
                        else:
                            report["var_converted_columns"].append(col)
                    else:
                        # fallback to string repr and backup
                        if backup:
                            _backup(ser.values, f"{which}.{col}_backup")
                            if which == "obs":
                                report["obs_backed_up_columns"].append(col)
                            else:
                                report["var_backed_up_columns"].append(col)
                        df[col] = ser.astype(str)
                        if verbose:
                            print(f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up).")
                        if which == "obs":
                            report["obs_converted_columns"].append(col)
                        else:
                            report["var_converted_columns"].append(col)
        return df

    def _sanitize_uns(uns: dict):
        """
        For each key/value in uns:
        - if json.dumps(value) works: keep it
        - else: pickle value to backup dir, and add a JSON-stringified representation under key+'_json'
        """
        clean = {}
        backed_up = []
        for k, v in uns.items():
            try:
                json.dumps(v)
                clean[k] = v
            except Exception:
                try:
                    s = json.dumps(v, default=str)
                    clean[k + "_json"] = s
                    if backup:
                        _backup(v, f"uns_{k}_backup")
                    backed_up.append(k)
                    if verbose:
                        print(f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})")
                    report["uns_json_keys"].append(k)
                except Exception:
                    try:
                        if backup:
                            _backup(v, f"uns_{k}_backup")
                        clean[k + "_str"] = str(v)
                        backed_up.append(k)
                        if verbose:
                            print(f" uns['{k}'] stored as string under '{k}_str' (backed up).")
                        report["uns_backed_up_keys"].append(k)
                    except Exception as e:
                        msg = f"uns['{k}'] could not be preserved: {e}"
                        report["errors"].append(msg)
                        if verbose:
                            print(" " + msg)
        if backed_up and verbose:
            print(f"Sanitized .uns keys (backed up): {backed_up}")
        return clean

    def _sanitize_layers_obsm(src_dict, which: str):
        """
        Ensure arrays in layers/obsm are numeric and non-object dtype.
        Returns a cleaned dict suitable to pass into AnnData(...)
        If an entry is not convertible, it is backed up & skipped.
        """
        cleaned = {}
        for k, v in src_dict.items():
            try:
                arr = np.asarray(v)
                if arr.dtype == object:
                    try:
                        arr_f = arr.astype(float)
                        cleaned[k] = arr_f
                        report_key = f"{which}.{k}"
                        report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
                        if verbose:
                            print(f" {which}.{k} object array coerced to float.")
                    except Exception:
                        try:
                            arr_i = arr.astype(int)
                            cleaned[k] = arr_i
                            report_key = f"{which}.{k}"
                            report["layers_converted"].append(report_key) if which == "layers" else report["obsm_converted"].append(report_key)
                            if verbose:
                                print(f" {which}.{k} object array coerced to int.")
                        except Exception:
                            if backup:
                                _backup(v, f"{which}_{k}_backup")
                            if which == "layers":
                                report["layers_skipped"].append(k)
                            else:
                                report["obsm_skipped"].append(k)
                            if verbose:
                                print(f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}")
                            continue
                else:
                    cleaned[k] = arr
            except Exception as e:
                if backup:
                    _backup(v, f"{which}_{k}_backup")
                if which == "layers":
                    report["layers_skipped"].append(k)
                else:
                    report["obsm_skipped"].append(k)
                msg = f" SKIPPING {which}.{k} due to conversion error: {e}"
                report["errors"].append(msg)
                if verbose:
                    print(msg)
                continue
        return cleaned

    # ---------- sanitize obs, var ----------
    try:
        obs_clean = _make_obs_var_safe(adata.obs, "obs")
    except Exception as e:
        msg = f"Failed to sanitize obs: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        obs_clean = adata.obs.copy()

    try:
        var_clean = _make_obs_var_safe(adata.var, "var")
    except Exception as e:
        msg = f"Failed to sanitize var: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        var_clean = adata.var.copy()

    # ---------- sanitize uns ----------
    try:
        uns_clean = _sanitize_uns(adata.uns)
    except Exception as e:
        msg = f"Failed to sanitize uns: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        uns_clean = {}

    # ---------- sanitize layers and obsm ----------
    layers_src = getattr(adata, "layers", {})
    obsm_src = getattr(adata, "obsm", {})

    try:
        layers_clean = _sanitize_layers_obsm(layers_src, "layers")
    except Exception as e:
        msg = f"Failed to sanitize layers: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        layers_clean = {}

    try:
        obsm_clean = _sanitize_layers_obsm(obsm_src, "obsm")
    except Exception as e:
        msg = f"Failed to sanitize obsm: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        obsm_clean = {}

    # ---------- handle X ----------
    X_to_use = adata.X
    try:
        X_arr = np.asarray(adata.X)
        if X_arr.dtype == object:
            try:
                X_to_use = X_arr.astype(float)
                report["X_replaced_or_converted"] = "converted_to_float"
                if verbose:
                    print("Converted adata.X object-dtype -> float")
            except Exception:
                if backup:
                    _backup(adata.X, "X_backup")
                X_to_use = np.zeros_like(X_arr, dtype=float)
                report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
                if verbose:
                    print("adata.X had object dtype and couldn't be converted; replaced with zeros (backup set).")
    except Exception as e:
        msg = f"Error handling adata.X: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        X_to_use = adata.X

    # ---------- build lightweight AnnData copy ----------
    try:
        adata_copy = _ad.AnnData(
            X=X_to_use,
            obs=obs_clean,
            var=var_clean,
            layers=layers_clean,
            uns=uns_clean,
            obsm=obsm_clean,
            varm=getattr(adata, "varm", None),
        )

        # preserve names (as strings)
        try:
            adata_copy.obs_names = adata.obs_names.astype(str)
            adata_copy.var_names = adata.var_names.astype(str)
        except Exception:
            adata_copy.obs_names = adata.obs_names
            adata_copy.var_names = adata.var_names

        # --- write
        adata_copy.write_h5ad(path, compression=compression)
        if verbose:
            print(f"Saved safely to {path}")
    except Exception as e:
        msg = f"Failed to write h5ad: {e}"
        report["errors"].append(msg)
        if verbose:
            print(msg)
        raise

    # Print a concise interactive report
    print("\n=== safe_write_h5ad REPORT ===")
    print(f"Saved file: {path}")
    print(f"Adata shape: {adata.shape}")
    if report["obs_converted_columns"] or report["obs_backed_up_columns"]:
        print(f"obs: converted columns -> {report['obs_converted_columns']}")
        print(f"obs: backed-up columns -> {report['obs_backed_up_columns']}")
    else:
        print("obs: no problematic columns found.")

    if report["var_converted_columns"] or report["var_backed_up_columns"]:
        print(f"var: converted columns -> {report['var_converted_columns']}")
        print(f"var: backed-up columns -> {report['var_backed_up_columns']}")
    else:
        print("var: no problematic columns found.")

    if report["uns_json_keys"] or report["uns_backed_up_keys"]:
        print(f".uns: jsonified keys -> {report['uns_json_keys']}")
        print(f".uns: backed-up keys -> {report['uns_backed_up_keys']}")
    else:
        print(".uns: no problematic keys found.")

    if report["layers_converted"] or report["layers_skipped"]:
        print(f"layers: converted -> {report['layers_converted']}")
        print(f"layers: skipped -> {report['layers_skipped']}")
    else:
        print("layers: no problematic entries found.")

    if report["obsm_converted"] or report["obsm_skipped"]:
        print(f"obsm: converted -> {report['obsm_converted']}")
        print(f"obsm: skipped -> {report['obsm_skipped']}")
    else:
        print("obsm: no problematic entries found.")

    if report["X_replaced_or_converted"]:
        print(f"adata.X handled: {report['X_replaced_or_converted']}")
    else:
        print("adata.X: no changes.")

    if report["errors"]:
        print("\nWarnings / errors encountered:")
        for e in report["errors"]:
            print(" -", e)

    print("=== end report ===\n")
    return report
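Example (not part of readwrite.py): a sketch of calling safe_write_h5ad with backup=True on an AnnData whose .uns holds a non-serializable object, then inspecting the returned report. Paths and key names are illustrative.

# Usage sketch; paths and key names are hypothetical.
import numpy as np
import anndata as ad
from smftools import readwrite

adata = ad.AnnData(X=np.random.rand(5, 3))
# Not HDF5/JSON-serializable: gets JSON-stringified under 'model_json' and pickled to backup_dir.
adata.uns["model"] = object()

report = readwrite.safe_write_h5ad(
    adata,
    "clean.h5ad.gz",
    backup=True,
    backup_dir="./uns_backups",
)
print(report["uns_json_keys"])  # e.g. ['model']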

def safe_read_h5ad(path, backup_dir="./uns_backups", restore_backups=True, re_categorize=True, categorical_threshold=100, verbose=True):
    """
    Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
    from the backup_dir produced during save.

    Parameters
    ----------
    path : str
        Path to the cleaned .h5ad produced by safe_write_h5ad.
    backup_dir : str
        Directory where safe_write_h5ad stored pickled backups (default "./uns_backups").
    restore_backups : bool
        If True, attempt to load pickled backups and restore original objects into adata.
    re_categorize : bool
        If True, try to coerce small unique-count string columns back into pandas.Categorical.
    categorical_threshold : int
        Max unique values for a column to be considered categorical for automatic recasting.
    verbose : bool
        Print progress/summary.

    Returns
    -------
    (adata, report) :
        adata : AnnData
            The reloaded (and possibly restored) AnnData instance.
        report : dict
            A report describing restored items, parsed JSON keys, and any failures.
    """
    import os
    import json
    import pickle
    import numpy as np
    import pandas as pd
    import anndata as _ad

    report = {
        "restored_obs_columns": [],
        "restored_var_columns": [],
        "restored_uns_keys": [],
        "parsed_uns_json_keys": [],
        "restored_layers": [],
        "restored_obsm": [],
        "recategorized_obs": [],
        "recategorized_var": [],
        "missing_backups": [],
        "errors": [],
    }

    if verbose:
        print(f"[safe_read_h5ad] loading {path}")

    # 1) load the cleaned h5ad
    try:
        adata = _ad.read_h5ad(path)
    except Exception as e:
        raise RuntimeError(f"Failed to read h5ad at {path}: {e}")

    # Ensure backup_dir exists (may be relative to cwd)
    backup_dir = os.path.abspath(backup_dir)
    if verbose:
        print(f"[safe_read_h5ad] looking for backups in {backup_dir}")

    def _load_pickle_if_exists(fname):
        if os.path.exists(fname):
            try:
                with open(fname, "rb") as fh:
                    val = pickle.load(fh)
                return val
            except Exception as e:
                report["errors"].append(f"Failed to load pickle {fname}: {e}")
                if verbose:
                    print(f" error loading {fname}: {e}")
                return None
        return None

    # 2) Restore obs columns
    for col in list(adata.obs.columns):
        # Look for backups with the exact naming from safe_write_h5ad: "obs.<col>_backup.pkl" or "obs.<col>_categorical_backup.pkl"
        bname1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
        bname2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
        restored = False

        if restore_backups:
            val = _load_pickle_if_exists(bname2)
            if val is not None:
                # val may be the categorical series or categories
                try:
                    # If pickled numpy array or pandas Series, coerce to same index alignment
                    if hasattr(val, "shape") and (len(val) == adata.shape[0]):
                        adata.obs[col] = pd.Series(val, index=adata.obs.index)
                    else:
                        # fallback: place pickled object directly
                        adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
                    report["restored_obs_columns"].append((col, bname2))
                    restored = True
                    if verbose:
                        print(f"[safe_read_h5ad] restored obs.{col} from {bname2}")
                except Exception as e:
                    report["errors"].append(f"Failed to restore obs.{col} from {bname2}: {e}")
                    restored = False

            if not restored:
                val = _load_pickle_if_exists(bname1)
                if val is not None:
                    try:
                        if hasattr(val, "shape") and (len(val) == adata.shape[0]):
                            adata.obs[col] = pd.Series(val, index=adata.obs.index)
                        else:
                            adata.obs[col] = pd.Series([val] * adata.shape[0], index=adata.obs.index)
                        report["restored_obs_columns"].append((col, bname1))
                        restored = True
                        if verbose:
                            print(f"[safe_read_h5ad] restored obs.{col} from {bname1}")
                    except Exception as e:
                        report["errors"].append(f"Failed to restore obs.{col} from {bname1}: {e}")
                        restored = False

        # If not restored and column dtype is object but contains JSON-like strings, try json.loads per element
        if (not restored) and (adata.obs[col].dtype == object):
            sample_vals = adata.obs[col].dropna().astype(str).head(20).tolist()
            looks_like_json = False
            for sv in sample_vals:
                svs = sv.strip()
                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
                    looks_like_json = True
                    break
            if looks_like_json:
                parsed = []
                success_parse = True
                for v in adata.obs[col].astype(str).values:
                    try:
                        parsed.append(json.loads(v))
                    except Exception:
                        # if any element fails, don't convert whole column
                        success_parse = False
                        break
                if success_parse:
                    adata.obs[col] = pd.Series(parsed, index=adata.obs.index)
                    report["restored_obs_columns"].append((col, "parsed_json"))
                    restored = True
                    if verbose:
                        print(f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects")

        # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
        if (not restored) and re_categorize and adata.obs[col].dtype == object:
            try:
                nunique = adata.obs[col].dropna().astype(str).nunique()
                if nunique > 0 and nunique <= categorical_threshold:
                    # cast to category
                    adata.obs[col] = adata.obs[col].astype(str).astype("category")
                    report["recategorized_obs"].append(col)
                    if verbose:
                        print(f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})")
            except Exception as e:
                report["errors"].append(f"Failed to recategorize obs.{col}: {e}")

    # 3) Restore var columns (same logic)
    for col in list(adata.var.columns):
        bname1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
        bname2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
        restored = False

        if restore_backups:
            val = _load_pickle_if_exists(bname2)
            if val is not None:
                try:
                    if hasattr(val, "shape") and (len(val) == adata.shape[1]):
                        adata.var[col] = pd.Series(val, index=adata.var.index)
                    else:
                        adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
                    report["restored_var_columns"].append((col, bname2))
                    restored = True
                    if verbose:
                        print(f"[safe_read_h5ad] restored var.{col} from {bname2}")
                except Exception as e:
                    report["errors"].append(f"Failed to restore var.{col} from {bname2}: {e}")

            if not restored:
                val = _load_pickle_if_exists(bname1)
                if val is not None:
                    try:
                        if hasattr(val, "shape") and (len(val) == adata.shape[1]):
                            adata.var[col] = pd.Series(val, index=adata.var.index)
                        else:
                            adata.var[col] = pd.Series([val] * adata.shape[1], index=adata.var.index)
                        report["restored_var_columns"].append((col, bname1))
                        restored = True
                        if verbose:
                            print(f"[safe_read_h5ad] restored var.{col} from {bname1}")
                    except Exception as e:
                        report["errors"].append(f"Failed to restore var.{col} from {bname1}: {e}")

        if (not restored) and (adata.var[col].dtype == object):
            # try JSON parsing
            sample_vals = adata.var[col].dropna().astype(str).head(20).tolist()
            looks_like_json = False
            for sv in sample_vals:
                svs = sv.strip()
                if (svs.startswith("{") and svs.endswith("}")) or (svs.startswith("[") and svs.endswith("]")):
                    looks_like_json = True
                    break
            if looks_like_json:
                parsed = []
                success_parse = True
                for v in adata.var[col].astype(str).values:
                    try:
                        parsed.append(json.loads(v))
                    except Exception:
                        success_parse = False
                        break
                if success_parse:
                    adata.var[col] = pd.Series(parsed, index=adata.var.index)
                    report["restored_var_columns"].append((col, "parsed_json"))
                    if verbose:
                        print(f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects")

        if (not restored) and re_categorize and adata.var[col].dtype == object:
            try:
                nunique = adata.var[col].dropna().astype(str).nunique()
                if nunique > 0 and nunique <= categorical_threshold:
                    adata.var[col] = adata.var[col].astype(str).astype("category")
                    report["recategorized_var"].append(col)
                    if verbose:
                        print(f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})")
            except Exception as e:
                report["errors"].append(f"Failed to recategorize var.{col}: {e}")

    # 4) Restore uns: look for uns_{k}_backup.pkl, or keys like "<k>_json"
    uns_keys = list(adata.uns.keys())
    # First, if we have "<k>_json", convert back into k
    for k in uns_keys:
        if k.endswith("_json"):
            base = k[:-5]
            sval = adata.uns.get(k)
            try:
                parsed = json.loads(sval)
                adata.uns[base] = parsed
                report["parsed_uns_json_keys"].append(base)
                if verbose:
                    print(f"[safe_read_h5ad] parsed adata.uns['{k}'] -> adata.uns['{base}']")
                # remove the _json entry
                try:
                    del adata.uns[k]
                except KeyError:
                    pass
            except Exception as e:
                report["errors"].append(f"Failed to json-parse uns['{k}']: {e}")

    # Now try to restore pickled backups for uns keys
    # Look for files named uns_<key>_backup.pkl
    # We will attempt to restore into adata.uns[key] if backup exists
    for fname in os.listdir(backup_dir) if os.path.isdir(backup_dir) else []:
        if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
            continue
        # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
        key = fname[len("uns_"):-len("_backup.pkl")]
        full = os.path.join(backup_dir, fname)
        val = _load_pickle_if_exists(full)
        if val is not None:
            adata.uns[key] = val
            report["restored_uns_keys"].append((key, full))
            if verbose:
                print(f"[safe_read_h5ad] restored adata.uns['{key}'] from {full}")

    # 5) Restore layers and obsm from backups if present
    # expected backup names: layers_<name>_backup.pkl, obsm_<name>_backup.pkl
    if os.path.isdir(backup_dir):
        for fname in os.listdir(backup_dir):
            if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
                layer_name = fname[len("layers_"):-len("_backup.pkl")]
                full = os.path.join(backup_dir, fname)
                val = _load_pickle_if_exists(full)
                if val is not None:
                    try:
                        adata.layers[layer_name] = np.asarray(val)
                        report["restored_layers"].append((layer_name, full))
                        if verbose:
                            print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
                    except Exception as e:
                        report["errors"].append(f"Failed to restore layers['{layer_name}'] from {full}: {e}")

            if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
                obsm_name = fname[len("obsm_"):-len("_backup.pkl")]
                full = os.path.join(backup_dir, fname)
                val = _load_pickle_if_exists(full)
                if val is not None:
                    try:
                        adata.obsm[obsm_name] = np.asarray(val)
                        report["restored_obsm"].append((obsm_name, full))
                        if verbose:
                            print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
                    except Exception as e:
                        report["errors"].append(f"Failed to restore obsm['{obsm_name}'] from {full}: {e}")

    # 6) If restore_backups True but some expected backups are missing, note them
    if restore_backups and os.path.isdir(backup_dir):
        # detect common expected names from obs/var/uns/layers in adata
        expected_missing = []
        # obs/var columns
        for col in list(adata.obs.columns):
            p1 = os.path.join(backup_dir, f"obs.{col}_backup.pkl")
            p2 = os.path.join(backup_dir, f"obs.{col}_categorical_backup.pkl")
            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
                # we don't require backups for every column; only record if column still looks like placeholder strings
                if adata.obs[col].dtype == object:
                    expected_missing.append(("obs", col))
        for col in list(adata.var.columns):
            p1 = os.path.join(backup_dir, f"var.{col}_backup.pkl")
            p2 = os.path.join(backup_dir, f"var.{col}_categorical_backup.pkl")
            if (not os.path.exists(p1)) and (not os.path.exists(p2)):
                if adata.var[col].dtype == object:
                    expected_missing.append(("var", col))
        # uns keys
        for k in adata.uns.keys():
            # if we have *_json or *_str variants we expect backups optionally
            if k.endswith("_json") or k.endswith("_str"):
                base = k[:-5] if k.endswith("_json") else k[:-4]
                b = os.path.join(backup_dir, f"uns_{base}_backup.pkl")
                if not os.path.exists(b):
                    report["missing_backups"].append(("uns", k))
        if expected_missing and verbose:
            n = len(expected_missing)
            print(f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable.")
        # add to report
        report["missing_backups"].extend(expected_missing)

    # final summary print
    if verbose:
        print("\n=== safe_read_h5ad summary ===")
        if report["restored_obs_columns"]:
            print("Restored obs columns:", report["restored_obs_columns"])
        if report["restored_var_columns"]:
            print("Restored var columns:", report["restored_var_columns"])
        if report["restored_uns_keys"]:
            print("Restored uns keys:", report["restored_uns_keys"])
        if report["parsed_uns_json_keys"]:
            print("Parsed uns JSON keys:", report["parsed_uns_json_keys"])
        if report["restored_layers"]:
            print("Restored layers:", report["restored_layers"])
        if report["restored_obsm"]:
            print("Restored obsm:", report["restored_obsm"])
        if report["recategorized_obs"] or report["recategorized_var"]:
            print("Recategorized columns (obs/var):", report["recategorized_obs"], report["recategorized_var"])
        if report["missing_backups"]:
            print("Missing backups or object columns without backups (investigate):", report["missing_backups"])
        if report["errors"]:
            print("Errors encountered (see report['errors']):")
            for e in report["errors"]:
                print(" -", e)
        print("=== end summary ===\n")

    return adata, report
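Example (not part of readwrite.py): the matching read side, continuing the sketch above. safe_read_h5ad reloads the sanitized file, re-parses any '<key>_json' entries, and restores pickled backups from backup_dir.

# Usage sketch; continues the safe_write_h5ad example above.
from smftools import readwrite

adata, report = readwrite.safe_read_h5ad(
    "clean.h5ad.gz",
    backup_dir="./uns_backups",
    restore_backups=True,
    re_categorize=True,
)
print(report["restored_uns_keys"])     # pickled .uns entries restored from backup_dir
print(report["parsed_uns_json_keys"])  # '<key>_json' strings parsed back into objects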


# def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./", verbose=True):
#     """
#     Saves an AnnData object safely by omitting problematic columns from .obs and .var.
#
#     Parameters:
#         adata (AnnData): The AnnData object to save.
#         path (str): Output .h5ad file path.
#         compression (str): Compression method for h5ad file.
#         backup (bool): If True, saves problematic columns to CSV files.
#         backup_dir (str): Directory to store backups if backup=True.
#     """
#     import anndata as ad
#     import pandas as pd
#     import os
#     import numpy as np
#     import json
#
#     os.makedirs(backup_dir, exist_ok=True)
#
#     def filter_df(df, df_name):
#         bad_cols = []
#         for col in df.columns:
#             if df[col].dtype == 'object':
#                 if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
#                     bad_cols.append(col)
#             elif pd.api.types.is_categorical_dtype(df[col]):
#                 if not all(isinstance(x, (str, type(None))) for x in df[col].cat.categories):
#                     bad_cols.append(col)
#         if bad_cols and verbose:
#             print(f"Skipping columns from {df_name}: {bad_cols}")
#         if backup and bad_cols:
#             df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
#             if verbose:
#                 print(f"Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
#         return df.drop(columns=bad_cols)
#
#     def is_serializable(val):
#         try:
#             json.dumps(val)
#             return True
#         except (TypeError, OverflowError):
#             return False
#
#     def clean_uns(uns_dict):
#         clean_uns = {}
#         bad_keys = []
#         for k, v in uns_dict.items():
#             if isinstance(v, (str, int, float, type(None), list, np.ndarray, pd.DataFrame, dict)):
#                 clean_uns[k] = v
#             elif is_serializable(v):
#                 clean_uns[k] = v
#             else:
#                 bad_keys.append(k)
#                 if backup:
#                     try:
#                         with open(os.path.join(backup_dir, f"uns_{k}_backup.txt"), "w") as f:
#                             f.write(str(v))
#                     except Exception:
#                         pass
#         if bad_keys and verbose:
#             print(f"Skipping entries from .uns: {bad_keys}")
#         return clean_uns
#
#     # Clean obs and var and uns
#     obs_clean = filter_df(adata.obs, "obs")
#     var_clean = filter_df(adata.var, "var")
#     uns_clean = clean_uns(adata.uns)
#
#     # Save clean version
#     adata_copy = ad.AnnData(
#         X=adata.X,
#         obs=obs_clean,
#         var=var_clean,
#         layers=adata.layers,
#         uns=uns_clean,
#         obsm=adata.obsm,
#         varm=adata.varm
#     )
#
#     adata_copy.obs_names = adata_copy.obs_names.astype(str)
#     adata_copy.var_names = adata_copy.var_names.astype(str)
#
#     adata_copy.write_h5ad(path, compression=compression)
#
#     print(f"Saved safely to {path}")

def merge_barcoded_anndatas_core(adata_single, adata_double):
    import numpy as np
    import anndata as ad

    # Step 1: Identify overlap
    overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)

    # Step 2: Filter out overlaps from adata_single
    adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()

    # Step 3: Add source tag
    adata_single_filtered.obs['source'] = 'single_barcode'
    adata_double.obs['source'] = 'double_barcode'

    # Step 4: Concatenate all components
    adata_merged = ad.concat([
        adata_single_filtered,
        adata_double
    ], join='outer', merge='same')  # merge='same' preserves matching layers, obsm, etc.

    # Step 5: Merge `.uns`
    adata_merged.uns = {**adata_single.uns, **adata_double.uns}

    return adata_merged
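Example (not part of readwrite.py): a minimal sketch of merge_barcoded_anndatas_core with made-up read names; reads present in both objects are kept from the double-barcode input and provenance is recorded in .obs['source'].

# Usage sketch; read names are hypothetical.
import numpy as np
import pandas as pd
import anndata as ad
from smftools import readwrite

single = ad.AnnData(X=np.zeros((3, 2)), obs=pd.DataFrame(index=["read_a", "read_b", "read_c"]))
double = ad.AnnData(X=np.ones((2, 2)), obs=pd.DataFrame(index=["read_b", "read_d"]))

merged = readwrite.merge_barcoded_anndatas_core(single, double)
# 'read_b' appears once, taken from the double-barcode object.
print(merged.obs["source"].value_counts())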
######################################################################################################

### File conversion misc ###
import argparse
from Bio import SeqIO
def genbank_to_gff(genbank_file, output_file, record_id):
    with open(output_file, "w") as out:
        for record in SeqIO.parse(genbank_file, "genbank"):
            for feature in record.features:
                # Skip features without location information
                if feature.location is None:
                    continue
                # Extract feature information
                start = feature.location.start + 1  # Convert to 1-based index
                end = feature.location.end
                strand = "+" if feature.location.strand == 1 else "-"
                feature_type = feature.type
                # Format attributes
                attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
                # Write GFF3 line
                gff3_line = "\t".join(str(x) for x in [record_id, feature.type, feature_type, start, end, ".", strand, ".", attributes])
                out.write(gff3_line + "\n")
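Example (not part of readwrite.py): the module imports argparse alongside Bio.SeqIO but defines no CLI here, so the wrapper below is only a sketch of how genbank_to_gff might be exposed on the command line; the flag names are hypothetical.

# CLI wrapper sketch; flag names are hypothetical.
import argparse
from smftools import readwrite

def main():
    parser = argparse.ArgumentParser(description="Convert a GenBank file to a simple GFF3.")
    parser.add_argument("genbank_file", help="Input GenBank (.gb/.gbk) file")
    parser.add_argument("output_file", help="Output GFF3 path")
    parser.add_argument("--record-id", required=True, help="Sequence ID to write in column 1")
    args = parser.parse_args()
    readwrite.genbank_to_gff(args.genbank_file, args.output_file, args.record_id)

if __name__ == "__main__":
    main()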