smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py
CHANGED
@@ -1,15 +1,14 @@
 ## readwrite ##
 from __future__ import annotations
 
+import warnings
 from pathlib import Path
-from typing import
-
-from pathlib import Path
-from typing import Iterable, Sequence, Optional
+from typing import Iterable, List, Sequence, Union
 
-import warnings
-import pandas as pd
 import anndata as ad
+import pandas as pd
+from Bio import SeqIO
+
 
 ######################################################################################################
 ## Datetime functionality
@@ -18,20 +17,26 @@ def date_string():
     Each time this is called, it returns the current date string
     """
     from datetime import datetime
+
     current_date = datetime.now()
     date_string = current_date.strftime("%Y%m%d")
     date_string = date_string[2:]
     return date_string
 
+
 def time_string():
     """
     Each time this is called, it returns the current time string
     """
     from datetime import datetime
+
     current_time = datetime.now()
     return current_time.strftime("%H:%M:%S")
+
+
 ######################################################################################################
 
+
 ######################################################################################################
 ## General file and directory handling
 def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None:
@@ -57,11 +62,12 @@ def make_dirs(directories: Union[str, Path, Iterable[Union[str, Path]]]) -> None
         p = Path(d)
 
         # If someone passes in a file path, make its parent
-        if p.suffix:
+        if p.suffix:  # p.suffix != "" means it's a file
            p = p.parent
 
        p.mkdir(parents=True, exist_ok=True)
 
+
 def add_or_update_column_in_csv(
     csv_path: str | Path,
     column_name: str,
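
The hunk above only documents the existing behavior: a path with a suffix is treated as a file, so only its parent directory is created. A minimal sketch of that behavior, assuming `make_dirs` is importable from `smftools.readwrite`:

```python
# Sketch only; assumes make_dirs is exposed as smftools.readwrite.make_dirs.
from smftools.readwrite import make_dirs

make_dirs(["results/plots", "results/tables/summary.csv"])
# "results/plots" has no suffix -> created as a directory.
# "results/tables/summary.csv" has a suffix -> treated as a file, so only
# "results/tables/" is created; no directory named "summary.csv" appears.
```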
@@ -117,19 +123,20 @@ def add_or_update_column_in_csv(
     # Sequence case: lengths must match
     if len(values) != len(df):
         raise ValueError(
-            f"Length mismatch: CSV has {len(df)} rows "
-            f"but values has {len(values)} entries."
+            f"Length mismatch: CSV has {len(df)} rows but values has {len(values)} entries."
         )
 
     df[column_name] = list(values)
     df.to_csv(csv_path, index=index)
     return df
 
+
 ######################################################################################################
 
 ######################################################################################################
 ## Numpy, Pandas, Anndata functionality
 
+
 def adata_to_df(adata, layer=None):
     """
     Convert an AnnData object into a Pandas DataFrame.
@@ -142,8 +149,6 @@ def adata_to_df(adata, layer=None):
         pd.DataFrame: A DataFrame where rows are observations and columns are positions.
     """
     import pandas as pd
-    import anndata as ad
-    import numpy as np
 
     # Validate that the requested layer exists
     if layer and layer not in adata.layers:
@@ -153,28 +158,83 @@ def adata_to_df(adata, layer=None):
     data_matrix = adata.layers.get(layer, adata.X)
 
     # Ensure matrix is dense (handle sparse formats)
-    if hasattr(data_matrix, "toarray"):
+    if hasattr(data_matrix, "toarray"):
         data_matrix = data_matrix.toarray()
 
     # Ensure obs and var have unique indices
     if adata.obs.index.duplicated().any():
-        raise ValueError(
-
+        raise ValueError(
+            "Duplicate values found in `adata.obs.index`. Ensure unique observation indices."
+        )
+
     if adata.var.index.duplicated().any():
-        raise ValueError(
+        raise ValueError(
+            "Duplicate values found in `adata.var.index`. Ensure unique variable indices."
+        )
 
     # Convert to DataFrame
     df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
 
     return df
 
+
 def save_matrix(matrix, save_name):
     """
     Input: A numpy matrix and a save_name
     Output: A txt file representation of the data matrix
     """
     import numpy as np
-
+
+    np.savetxt(f"{save_name}.txt", matrix)
+
+
+def _harmonize_var_schema(adatas: List[ad.AnnData]) -> None:
+    """
+    In-place:
+      - Make every AnnData.var have the *union* of columns.
+      - Normalize dtypes so columns can hold NaN and round-trip via HDF5:
+          * ints -> float64 (to support NaN)
+          * objects -> try numeric->float64, else pandas 'string'
+    """
+    import numpy as np
+
+    # 1) Union of all .var columns
+    all_cols = set()
+    for a in adatas:
+        all_cols.update(a.var.columns)
+    all_cols = list(all_cols)
+
+    # 2) Add any missing columns as float64 NaN
+    for a in adatas:
+        missing = [c for c in all_cols if c not in a.var.columns]
+        for c in missing:
+            a.var[c] = np.nan  # becomes float64 by default
+
+    # 3) Normalize dtypes per AnnData so concat doesn't create mixed/object columns
+    for a in adatas:
+        for c in a.var.columns:
+            s = a.var[c]
+            dt = s.dtype
+
+            # Integer/unsigned -> float64 (so NaN fits)
+            if dt.kind in ("i", "u"):
+                a.var[c] = s.astype("float64")
+                continue
+
+            # Object -> numeric if possible; else pandas 'string'
+            if dt == "O":
+                try:
+                    s_num = pd.to_numeric(s, errors="raise")
+                    a.var[c] = s_num.astype("float64")
+                except Exception:
+                    a.var[c] = s.astype("string")
+
+    # Optional: ensure consistent column order (sorted + stable)
+    # Not required, but can make diffs easier to read:
+    all_cols_sorted = sorted(all_cols)
+    for a in adatas:
+        a.var = a.var.reindex(columns=all_cols_sorted)
+
 
 def concatenate_h5ads(
     output_path: str | Path,
@@ -243,8 +303,7 @@ def concatenate_h5ads(
         # collect all *.h5ad / *.h5ad.gz (or whatever file_suffixes specify)
         suffixes_lower = tuple(s.lower() for s in file_suffixes)
         h5_paths = sorted(
-            p for p in input_dir.iterdir()
-            if p.is_file() and p.suffix.lower() in suffixes_lower
+            p for p in input_dir.iterdir() if p.is_file() and p.suffix.lower() in suffixes_lower
         )
 
     else:
@@ -255,9 +314,7 @@ def concatenate_h5ads(
 
         df = pd.read_csv(csv_path, dtype=str)
         if csv_column not in df.columns:
-            raise ValueError(
-                f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths."
-            )
+            raise ValueError(f"CSV {csv_path} must contain column '{csv_column}' with .h5ad paths.")
         paths = df[csv_column].dropna().astype(str).tolist()
         if not paths:
             raise ValueError(f"No non-empty paths in column '{csv_column}' of {csv_path}.")
@@ -280,27 +337,41 @@ def concatenate_h5ads(
     for p in h5_paths:
         print(f" - {p}")
 
-
-
+    # Load all first so we can harmonize schemas before concat
+    loaded: List[ad.AnnData] = []
     for p in h5_paths:
         print(f"{time_string()}: Reading {p}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        a, _ = safe_read_h5ad(p, restore_backups=restore_backups)
+        loaded.append(a)
+
+    # Critical: make every .var share the same columns + safe dtypes
+    _harmonize_var_schema(loaded)
+
+    print(f"{time_string()}: Concatenating {len(loaded)} AnnData objects")
+    final_adata = ad.concat(
+        loaded,
+        axis=0,  # stack observations
+        join="outer",  # keep union of variables
+        merge="unique",
+        uns_merge="unique",
+        index_unique=None,
+    )
+
+    # Defensive pass: ensure final var dtypes are write-safe
+    for c in final_adata.var.columns:
+        s = final_adata.var[c]
+        dt = s.dtype
+        if dt.kind in ("i", "u"):
+            final_adata.var[c] = s.astype("float64")
+        elif dt == "O":
+            try:
+                s_num = pd.to_numeric(s, errors="raise")
+                final_adata.var[c] = s_num.astype("float64")
+            except Exception:
+                final_adata.var[c] = s.astype("string")
 
-
-
+    # Let anndata write pandas StringArray reliably
+    ad.settings.allow_write_nullable_strings = True
 
     print(f"{time_string()}: Writing concatenated AnnData to {output_path}")
     safe_write_h5ad(final_adata, output_path, backup=restore_backups)
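
The block above is the heart of the new `concatenate_h5ads` path: read every AnnData, harmonize the `.var` schemas, then `ad.concat` with an outer join and a defensive dtype pass. The sketch below reproduces that pattern on two toy AnnData objects so the dtype normalization is easy to see; the variable names and toy data are illustrative, not taken from the package.

```python
import anndata as ad
import numpy as np
import pandas as pd

# Two toy AnnData objects whose .var frames disagree on columns and dtypes.
a = ad.AnnData(X=np.zeros((2, 3)), var=pd.DataFrame({"site_id": [1, 2, 3]}, index=list("ABC")))
b = ad.AnnData(X=np.ones((2, 3)), var=pd.DataFrame({"context": ["CpG", "GpC", "CpG"]}, index=list("ABC")))
a.obs_names = ["read1", "read2"]
b.obs_names = ["read3", "read4"]
adatas = [a, b]

# Same idea as _harmonize_var_schema: give every .var the union of columns,
# push ints to float64 (NaN-safe) and objects to numeric or pandas "string".
all_cols = sorted(set().union(*(x.var.columns for x in adatas)))
for x in adatas:
    for c in all_cols:
        if c not in x.var.columns:
            x.var[c] = np.nan
        s = x.var[c]
        if s.dtype.kind in ("i", "u"):
            x.var[c] = s.astype("float64")
        elif s.dtype == object:
            try:
                x.var[c] = pd.to_numeric(s, errors="raise").astype("float64")
            except Exception:
                x.var[c] = s.astype("string")
    x.var = x.var.reindex(columns=all_cols)

# Outer join keeps the union of variables, as in the diff above.
merged = ad.concat(adatas, axis=0, join="outer", merge="unique", uns_merge="unique", index_unique=None)
print(merged)
```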
@@ -325,18 +396,21 @@ def concatenate_h5ads(
 
     return output_path
 
+
 def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=None, verbose=True):
     """
     Save an AnnData safely by sanitizing .obs, .var, .uns, .layers, and .obsm.
 
     Returns a report dict and prints a summary of what was converted/backed up/skipped.
     """
-    import
+    import json
+    import os
+    import pickle
     from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import warnings
-    import anndata as _ad
 
     path = Path(path)
 
@@ -413,7 +487,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    print(
+                        f" coerced categorical column '{which}.{col}' -> strings (backup={backup})"
+                    )
                 continue
 
             # object dtype handling: try to coerce each element to string
@@ -434,7 +510,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = ser.values.astype(str)
                 if verbose:
-                    print(
+                    print(
+                        f" converted object column '{which}.{col}' -> strings (backup={backup})"
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:
@@ -457,7 +535,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = [json.dumps(v, default=str) for v in ser.values]
                 if verbose:
-                    print(
+                    print(
+                        f" json-stringified object column '{which}.{col}' (backup={backup})"
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:
@@ -472,7 +552,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     report["var_backed_up_columns"].append(col)
                 df[col] = ser.astype(str)
                 if verbose:
-                    print(
+                    print(
+                        f" WARNING: column '{which}.{col}' was complex; coerced via str() (backed up)."
+                    )
                 if which == "obs":
                     report["obs_converted_columns"].append(col)
                 else:
@@ -499,7 +581,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 _backup(v, f"uns_{k}_backup")
                 backed_up.append(k)
                 if verbose:
-                    print(
+                    print(
+                        f" uns['{k}'] non-JSON -> stored '{k}_json' and backed up (backup={backup})"
+                    )
                 report["uns_json_keys"].append(k)
             except Exception:
                 try:
@@ -534,7 +618,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                 arr_f = arr.astype(float)
                 cleaned[k] = arr_f
                 report_key = f"{which}.{k}"
-                report["layers_converted"].append(
+                report["layers_converted"].append(
+                    report_key
+                ) if which == "layers" else report["obsm_converted"].append(report_key)
                 if verbose:
                     print(f" {which}.{k} object array coerced to float.")
             except Exception:
@@ -542,7 +628,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
                     arr_i = arr.astype(int)
                     cleaned[k] = arr_i
                     report_key = f"{which}.{k}"
-                    report["layers_converted"].append(
+                    report["layers_converted"].append(
+                        report_key
+                    ) if which == "layers" else report["obsm_converted"].append(report_key)
                     if verbose:
                         print(f" {which}.{k} object array coerced to int.")
                 except Exception:
@@ -553,7 +641,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             else:
                 report["obsm_skipped"].append(k)
                 if verbose:
-                    print(
+                    print(
+                        f" SKIPPING {which}.{k} (object dtype not numeric). Backed up: {backup}"
+                    )
                 continue
         else:
             cleaned[k] = arr
@@ -638,7 +728,9 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             X_to_use = np.zeros_like(X_arr, dtype=float)
             report["X_replaced_or_converted"] = "replaced_with_zeros_backup"
             if verbose:
-                print(
+                print(
+                    "adata.X had object dtype and couldn't be converted; replaced with zeros (backup set)."
+                )
     except Exception as e:
         msg = f"Error handling adata.X: {e}"
         report["errors"].append(msg)
@@ -731,7 +823,7 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
             print(f"CSV outputs will be written to: {csv_dir}")
     except Exception as e:
         msg = f"Failed to create CSV output directory: {e}"
-        report[
+        report["errors"].append(msg)
         if verbose:
             print(msg)
         csv_dir = path.parent  # fallback just in case
@@ -742,48 +834,58 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
     # obs columns
     for col in adata_copy.obs.columns:
-        meta_rows.append(
-
-
-
-
+        meta_rows.append(
+            {
+                "kind": "obs",
+                "name": col,
+                "dtype": str(adata_copy.obs[col].dtype),
+            }
+        )
 
     # var columns
     for col in adata_copy.var.columns:
-        meta_rows.append(
-
-
-
-
+        meta_rows.append(
+            {
+                "kind": "var",
+                "name": col,
+                "dtype": str(adata_copy.var[col].dtype),
+            }
+        )
 
     # layers
     for k, v in adata_copy.layers.items():
-        meta_rows.append(
-
-
-
-
+        meta_rows.append(
+            {
+                "kind": "layer",
+                "name": k,
+                "dtype": str(np.asarray(v).dtype),
+            }
+        )
 
     # obsm
     for k, v in adata_copy.obsm.items():
-        meta_rows.append(
-
-
-
-
+        meta_rows.append(
+            {
+                "kind": "obsm",
+                "name": k,
+                "dtype": str(np.asarray(v).dtype),
+            }
+        )
 
     # uns
     for k, v in adata_copy.uns.items():
-        meta_rows.append(
-
-
-
-
+        meta_rows.append(
+            {
+                "kind": "uns",
+                "name": k,
+                "dtype": type(v).__name__,
+            }
+        )
 
     meta_df = pd.DataFrame(meta_rows)
 
     # same base name, inside csvs/
-    base = path.stem
+    base = path.stem  # removes .h5ad
     meta_path = csv_dir / f"{base}.keys.csv"
 
     meta_df.to_csv(meta_path, index=False)
@@ -818,7 +920,15 @@ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir=No
 
     return report
 
-
+
+def safe_read_h5ad(
+    path,
+    backup_dir=None,
+    restore_backups=True,
+    re_categorize=True,
+    categorical_threshold=100,
+    verbose=True,
+):
     """
     Safely load an AnnData saved by safe_write_h5ad and attempt to restore complex objects
     from the backup_dir produced during save.
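
Together with `safe_write_h5ad` above, the new keyword-per-line signature spells out the read-side options (backup restoration, re-categorization, a categorical threshold). A usage sketch of the round trip, assuming both functions are importable from `smftools.readwrite` and behave as the signatures and return values in this diff indicate:

```python
import anndata as ad
import numpy as np

# Assumed import path; both functions are defined in smftools/readwrite.py.
from smftools.readwrite import safe_read_h5ad, safe_write_h5ad

adata = ad.AnnData(X=np.random.rand(4, 3))
adata.uns["run_info"] = {"model": "hac", "threshold": 0.8}  # non-trivial .uns entry

# Sanitize and write; the returned dict reports coerced/backed-up fields.
write_report = safe_write_h5ad(adata, "demo.h5ad", backup=True, verbose=True)

# Read back; JSON-ified columns are re-parsed and pickled backups restored.
adata2, read_report = safe_read_h5ad("demo.h5ad", restore_backups=True)
print(read_report["errors"])
```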
@@ -846,13 +956,14 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
     report : dict
         A report describing restored items, parsed JSON keys, and any failures.
     """
-    import os
-    from pathlib import Path
     import json
+    import os
     import pickle
+    from pathlib import Path
+
+    import anndata as _ad
     import numpy as np
     import pandas as pd
-    import anndata as _ad
 
     path = Path(path)
 
@@ -931,7 +1042,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 if hasattr(val, "shape") and (len(val) == adata.shape[0]):
                     adata.obs[col] = pd.Series(val, index=adata.obs.index)
                 else:
-                    adata.obs[col] = pd.Series(
+                    adata.obs[col] = pd.Series(
+                        [val] * adata.shape[0], index=adata.obs.index
+                    )
                 report["restored_obs_columns"].append((col, bname1))
                 restored = True
                 if verbose:
@@ -946,7 +1059,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 looks_like_json = False
                 for sv in sample_vals:
                     svs = sv.strip()
-                    if (svs.startswith("{") and svs.endswith("}")) or (
+                    if (svs.startswith("{") and svs.endswith("}")) or (
+                        svs.startswith("[") and svs.endswith("]")
+                    ):
                         looks_like_json = True
                         break
                 if looks_like_json:
@@ -964,7 +1079,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                         report["restored_obs_columns"].append((col, "parsed_json"))
                         restored = True
                         if verbose:
-                            print(
+                            print(
+                                f"[safe_read_h5ad] parsed obs.{col} JSON strings back to Python objects"
+                            )
 
         # If still not restored and re_categorize=True, try to convert small unique string columns back to categorical
         if (not restored) and re_categorize and adata.obs[col].dtype == object:
@@ -975,7 +1092,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                     adata.obs[col] = adata.obs[col].astype(str).astype("category")
                     report["recategorized_obs"].append(col)
                     if verbose:
-                        print(
+                        print(
+                            f"[safe_read_h5ad] recast obs.{col} -> categorical (n_unique={nunique})"
+                        )
             except Exception as e:
                 report["errors"].append(f"Failed to recategorize obs.{col}: {e}")
 
@@ -1007,7 +1126,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 if hasattr(val, "shape") and (len(val) == adata.shape[1]):
                     adata.var[col] = pd.Series(val, index=adata.var.index)
                 else:
-                    adata.var[col] = pd.Series(
+                    adata.var[col] = pd.Series(
+                        [val] * adata.shape[1], index=adata.var.index
+                    )
                 report["restored_var_columns"].append((col, bname1))
                 restored = True
                 if verbose:
@@ -1021,7 +1142,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                 looks_like_json = False
                 for sv in sample_vals:
                     svs = sv.strip()
-                    if (svs.startswith("{") and svs.endswith("}")) or (
+                    if (svs.startswith("{") and svs.endswith("}")) or (
+                        svs.startswith("[") and svs.endswith("]")
+                    ):
                         looks_like_json = True
                         break
                 if looks_like_json:
@@ -1037,7 +1160,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                         adata.var[col] = pd.Series(parsed, index=adata.var.index)
                         report["restored_var_columns"].append((col, "parsed_json"))
                         if verbose:
-                            print(
+                            print(
+                                f"[safe_read_h5ad] parsed var.{col} JSON strings back to Python objects"
+                            )
 
         if (not restored) and re_categorize and adata.var[col].dtype == object:
             try:
@@ -1046,7 +1171,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                     adata.var[col] = adata.var[col].astype(str).astype("category")
                     report["recategorized_var"].append(col)
                     if verbose:
-                        print(
+                        print(
+                            f"[safe_read_h5ad] recast var.{col} -> categorical (n_unique={nunique})"
+                        )
             except Exception as e:
                 report["errors"].append(f"Failed to recategorize var.{col}: {e}")
 
@@ -1078,7 +1205,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
             if not fname.startswith("uns_") or not fname.endswith("_backup.pkl"):
                 continue
             # fname example: "uns_clustermap_results_backup.pkl" -> key name between 'uns_' and '_backup.pkl'
-            key = fname[len("uns_")
+            key = fname[len("uns_") : -len("_backup.pkl")]
             full = os.path.join(backup_dir, fname)
             val = _load_pickle_if_exists(full)
             if val is not None:
@@ -1092,7 +1219,7 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
     if os.path.isdir(backup_dir):
         for fname in os.listdir(backup_dir):
             if fname.startswith("layers_") and fname.endswith("_backup.pkl"):
-                layer_name = fname[len("layers_")
+                layer_name = fname[len("layers_") : -len("_backup.pkl")]
                 full = os.path.join(backup_dir, fname)
                 val = _load_pickle_if_exists(full)
                 if val is not None:
@@ -1102,10 +1229,12 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                         if verbose:
                             print(f"[safe_read_h5ad] restored layers['{layer_name}'] from {full}")
                     except Exception as e:
-                        report["errors"].append(
+                        report["errors"].append(
+                            f"Failed to restore layers['{layer_name}'] from {full}: {e}"
+                        )
 
             if fname.startswith("obsm_") and fname.endswith("_backup.pkl"):
-                obsm_name = fname[len("obsm_")
+                obsm_name = fname[len("obsm_") : -len("_backup.pkl")]
                 full = os.path.join(backup_dir, fname)
                 val = _load_pickle_if_exists(full)
                 if val is not None:
@@ -1115,7 +1244,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
                         if verbose:
                             print(f"[safe_read_h5ad] restored obsm['{obsm_name}'] from {full}")
                     except Exception as e:
-                        report["errors"].append(
+                        report["errors"].append(
+                            f"Failed to restore obsm['{obsm_name}'] from {full}: {e}"
+                        )
 
     # 6) If restore_backups True but some expected backups missing, note them
     if restore_backups and os.path.isdir(backup_dir):
@@ -1145,7 +1276,9 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
     if expected_missing and verbose:
         n = len(expected_missing)
         if verbose:
-            print(
+            print(
+                f"[safe_read_h5ad] note: {n} obs/var object columns may not have backups; check if their content is acceptable."
+            )
     # add to report
     report["missing_backups"].extend(expected_missing)
 
@@ -1165,9 +1298,16 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
         if report["restored_obsm"]:
             print("Restored obsm:", report["restored_obsm"])
         if report["recategorized_obs"] or report["recategorized_var"]:
-            print(
+            print(
+                "Recategorized columns (obs/var):",
+                report["recategorized_obs"],
+                report["recategorized_var"],
+            )
         if report["missing_backups"]:
-            print(
+            print(
+                "Missing backups or object columns without backups (investigate):",
+                report["missing_backups"],
+            )
         if report["errors"]:
             print("Errors encountered (see report['errors']):")
             for e in report["errors"]:
@@ -1176,9 +1316,10 @@ def safe_read_h5ad(path, backup_dir=None, restore_backups=True, re_categorize=Tr
 
     return adata, report
 
+
 def merge_barcoded_anndatas_core(adata_single, adata_double):
-    import numpy as np
     import anndata as ad
+    import numpy as np
 
     # Step 1: Identify overlap
     overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)
@@ -1187,24 +1328,25 @@ def merge_barcoded_anndatas_core(adata_single, adata_double):
     adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()
 
     # Step 3: Add source tag
-    adata_single_filtered.obs[
-    adata_double.obs[
+    adata_single_filtered.obs["source"] = "single_barcode"
+    adata_double.obs["source"] = "double_barcode"
 
     # Step 4: Concatenate all components
-    adata_merged = ad.concat(
-        adata_single_filtered,
-
-    ], join='outer', merge='same') # merge='same' preserves matching layers, obsm, etc.
+    adata_merged = ad.concat(
+        [adata_single_filtered, adata_double], join="outer", merge="same"
+    )  # merge='same' preserves matching layers, obsm, etc.
 
     # Step 5: Merge `.uns`
     adata_merged.uns = {**adata_single.uns, **adata_double.uns}
 
     return adata_merged
+
+
 ######################################################################################################
 
 ### File conversion misc ###
-
-
+
+
 def genbank_to_gff(genbank_file, output_file, record_id):
     with open(output_file, "w") as out:
         for record in SeqIO.parse(genbank_file, "genbank"):
@@ -1220,5 +1362,18 @@ def genbank_to_gff(genbank_file, output_file, record_id):
                 # Format attributes
                 attributes = ";".join(f"{k}={v}" for k, v in feature.qualifiers.items())
                 # Write GFF3 line
-                gff3_line = "\t".join(
-
+                gff3_line = "\t".join(
+                    str(x)
+                    for x in [
+                        record_id,
+                        feature.type,
+                        feature_type,
+                        start,
+                        end,
+                        ".",
+                        strand,
+                        ".",
+                        attributes,
+                    ]
+                )
+                out.write(gff3_line + "\n")