smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,8 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import glob
|
|
4
|
+
import os
|
|
5
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
1
6
|
from pathlib import Path
|
|
2
|
-
import
|
|
7
|
+
from typing import Dict, List, Optional, Union
|
|
8
|
+
|
|
3
9
|
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
4
11
|
import scipy.sparse as sp
|
|
5
|
-
|
|
12
|
+
|
|
13
|
+
from smftools.logging_utils import get_logger
|
|
14
|
+
from smftools.optional_imports import require
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
17
|
+
|
|
6
18
|
|
|
7
19
|
def add_demux_type_annotation(
|
|
8
20
|
adata,
|
|
@@ -71,14 +83,16 @@ def add_demux_type_annotation(
|
|
|
71
83
|
|
|
72
84
|
return adata
|
|
73
85
|
|
|
86
|
+
|
|
74
87
|
def add_read_length_and_mapping_qc(
|
|
75
88
|
adata,
|
|
76
89
|
bam_files: Optional[List[str]] = None,
|
|
77
90
|
read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
|
|
78
91
|
uns_flag: str = "add_read_length_and_mapping_qc_performed",
|
|
79
|
-
extract_read_features_from_bam_callable
|
|
92
|
+
extract_read_features_from_bam_callable=None,
|
|
80
93
|
bypass: bool = False,
|
|
81
|
-
force_redo: bool = True
|
|
94
|
+
force_redo: bool = True,
|
|
95
|
+
samtools_backend: str | None = "auto",
|
|
82
96
|
):
|
|
83
97
|
"""
|
|
84
98
|
Populate adata.obs with read/mapping QC columns.
|
|
@@ -98,6 +112,7 @@ def add_read_length_and_mapping_qc(
|
|
|
98
112
|
Optional callable(bam_path) -> dict mapping read_name -> list/tuple of metrics.
|
|
99
113
|
If not provided and bam_files is given, function will attempt to call `extract_read_features_from_bam`
|
|
100
114
|
from the global namespace (your existing helper).
|
|
115
|
+
|
|
101
116
|
Returns
|
|
102
117
|
-------
|
|
103
118
|
None (mutates final_adata in-place)
|
|
@@ -113,11 +128,15 @@ def add_read_length_and_mapping_qc(
|
|
|
113
128
|
if read_metrics is None:
|
|
114
129
|
read_metrics = {}
|
|
115
130
|
if bam_files:
|
|
116
|
-
extractor = extract_read_features_from_bam_callable or globals().get(
|
|
131
|
+
extractor = extract_read_features_from_bam_callable or globals().get(
|
|
132
|
+
"extract_read_features_from_bam"
|
|
133
|
+
)
|
|
117
134
|
if extractor is None:
|
|
118
|
-
raise ValueError(
|
|
135
|
+
raise ValueError(
|
|
136
|
+
"No `read_metrics` provided and `extract_read_features_from_bam` not found."
|
|
137
|
+
)
|
|
119
138
|
for bam in bam_files:
|
|
120
|
-
bam_read_metrics = extractor(bam)
|
|
139
|
+
bam_read_metrics = extractor(bam, samtools_backend)
|
|
121
140
|
if not isinstance(bam_read_metrics, dict):
|
|
122
141
|
raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
|
|
123
142
|
read_metrics.update(bam_read_metrics)
|
|
@@ -130,11 +149,11 @@ def add_read_length_and_mapping_qc(
|
|
|
130
149
|
if len(read_metrics) == 0:
|
|
131
150
|
# fill with NaNs
|
|
132
151
|
n = adata.n_obs
|
|
133
|
-
adata.obs[
|
|
134
|
-
adata.obs[
|
|
135
|
-
adata.obs[
|
|
136
|
-
adata.obs[
|
|
137
|
-
adata.obs[
|
|
152
|
+
adata.obs["read_length"] = np.full(n, np.nan)
|
|
153
|
+
adata.obs["mapped_length"] = np.full(n, np.nan)
|
|
154
|
+
adata.obs["reference_length"] = np.full(n, np.nan)
|
|
155
|
+
adata.obs["read_quality"] = np.full(n, np.nan)
|
|
156
|
+
adata.obs["mapping_quality"] = np.full(n, np.nan)
|
|
138
157
|
else:
|
|
139
158
|
# Build DF robustly
|
|
140
159
|
# Convert values to lists where possible, else to [val, val, val...]
|
|
@@ -151,35 +170,45 @@ def add_read_length_and_mapping_qc(
|
|
|
151
170
|
vals = vals + [np.nan] * (max_cols - len(vals))
|
|
152
171
|
rows[k] = vals[:max_cols]
|
|
153
172
|
|
|
154
|
-
df = pd.DataFrame.from_dict(
|
|
155
|
-
|
|
156
|
-
|
|
173
|
+
df = pd.DataFrame.from_dict(
|
|
174
|
+
rows,
|
|
175
|
+
orient="index",
|
|
176
|
+
columns=[
|
|
177
|
+
"read_length",
|
|
178
|
+
"read_quality",
|
|
179
|
+
"reference_length",
|
|
180
|
+
"mapped_length",
|
|
181
|
+
"mapping_quality",
|
|
182
|
+
],
|
|
183
|
+
)
|
|
157
184
|
|
|
158
185
|
# Reindex to final_adata.obs_names so order matches adata
|
|
159
186
|
# If obs_names are not present as keys in df, the results will be NaN
|
|
160
187
|
df_reindexed = df.reindex(adata.obs_names).astype(float)
|
|
161
188
|
|
|
162
|
-
adata.obs[
|
|
163
|
-
adata.obs[
|
|
164
|
-
adata.obs[
|
|
165
|
-
adata.obs[
|
|
166
|
-
adata.obs[
|
|
189
|
+
adata.obs["read_length"] = df_reindexed["read_length"].values
|
|
190
|
+
adata.obs["mapped_length"] = df_reindexed["mapped_length"].values
|
|
191
|
+
adata.obs["reference_length"] = df_reindexed["reference_length"].values
|
|
192
|
+
adata.obs["read_quality"] = df_reindexed["read_quality"].values
|
|
193
|
+
adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
|
|
167
194
|
|
|
168
195
|
# Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
|
|
169
196
|
# read_length_to_reference_length_ratio
|
|
170
|
-
rl = pd.to_numeric(adata.obs[
|
|
171
|
-
ref_len = pd.to_numeric(adata.obs[
|
|
172
|
-
mapped_len = pd.to_numeric(adata.obs[
|
|
197
|
+
rl = pd.to_numeric(adata.obs["read_length"], errors="coerce").to_numpy(dtype=float)
|
|
198
|
+
ref_len = pd.to_numeric(adata.obs["reference_length"], errors="coerce").to_numpy(dtype=float)
|
|
199
|
+
mapped_len = pd.to_numeric(adata.obs["mapped_length"], errors="coerce").to_numpy(dtype=float)
|
|
173
200
|
|
|
174
201
|
# safe divisions: use np.where to avoid warnings and replace inf with nan
|
|
175
|
-
with np.errstate(divide=
|
|
202
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
|
176
203
|
rl_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), rl / ref_len, np.nan)
|
|
177
|
-
mapped_to_ref = np.where(
|
|
204
|
+
mapped_to_ref = np.where(
|
|
205
|
+
(ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan
|
|
206
|
+
)
|
|
178
207
|
mapped_to_read = np.where((rl != 0) & np.isfinite(rl), mapped_len / rl, np.nan)
|
|
179
208
|
|
|
180
|
-
adata.obs[
|
|
181
|
-
adata.obs[
|
|
182
|
-
adata.obs[
|
|
209
|
+
adata.obs["read_length_to_reference_length_ratio"] = rl_to_ref
|
|
210
|
+
adata.obs["mapped_length_to_reference_length_ratio"] = mapped_to_ref
|
|
211
|
+
adata.obs["mapped_length_to_read_length_ratio"] = mapped_to_read
|
|
183
212
|
|
|
184
213
|
# Add read level raw modification signal: sum over X rows
|
|
185
214
|
X = adata.X
|
|
@@ -189,9 +218,152 @@ def add_read_length_and_mapping_qc(
|
|
|
189
218
|
else:
|
|
190
219
|
raw_sig = np.asarray(X.sum(axis=1)).ravel()
|
|
191
220
|
|
|
192
|
-
adata.obs[
|
|
221
|
+
adata.obs["Raw_modification_signal"] = raw_sig
|
|
193
222
|
|
|
194
223
|
# mark as done
|
|
195
224
|
adata.uns[uns_flag] = True
|
|
196
225
|
|
|
197
|
-
return None
|
|
226
|
+
return None
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _collect_read_origins_from_pod5(pod5_path: str, target_ids: set[str]) -> dict[str, str]:
|
|
230
|
+
"""
|
|
231
|
+
Worker function: scan one POD5 file and return a mapping
|
|
232
|
+
{read_id: pod5_basename} only for read_ids in `target_ids`.
|
|
233
|
+
"""
|
|
234
|
+
p5 = require("pod5", extra="ont", purpose="POD5 metadata")
|
|
235
|
+
Reader = p5.Reader
|
|
236
|
+
|
|
237
|
+
basename = os.path.basename(pod5_path)
|
|
238
|
+
mapping: dict[str, str] = {}
|
|
239
|
+
|
|
240
|
+
with Reader(pod5_path) as reader:
|
|
241
|
+
for read in reader.reads():
|
|
242
|
+
# Cast read id to string
|
|
243
|
+
rid = str(read.read_id)
|
|
244
|
+
if rid in target_ids:
|
|
245
|
+
mapping[rid] = basename
|
|
246
|
+
|
|
247
|
+
return mapping
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def annotate_pod5_origin(
|
|
251
|
+
adata,
|
|
252
|
+
pod5_path_or_dir: str | Path,
|
|
253
|
+
pattern: str = "*.pod5",
|
|
254
|
+
n_jobs: int | None = None,
|
|
255
|
+
fill_value: str | None = "unknown",
|
|
256
|
+
verbose: bool = True,
|
|
257
|
+
csv_path: str | None = None,
|
|
258
|
+
):
|
|
259
|
+
"""
|
|
260
|
+
Add `pod5_origin` column to `adata.obs`, containing the POD5 basename
|
|
261
|
+
each read came from.
|
|
262
|
+
|
|
263
|
+
Parameters
|
|
264
|
+
----------
|
|
265
|
+
adata
|
|
266
|
+
AnnData with obs_names == read_ids (as strings).
|
|
267
|
+
pod5_path_or_dir
|
|
268
|
+
Directory containing POD5 files or path to a single POD5 file.
|
|
269
|
+
pattern
|
|
270
|
+
Glob pattern for POD5 files inside `pod5_dir`.
|
|
271
|
+
n_jobs
|
|
272
|
+
Number of worker processes. If None or <=1, runs serially.
|
|
273
|
+
fill_value
|
|
274
|
+
Value to use when a read_id is not found in any POD5 file.
|
|
275
|
+
If None, leaves missing as NaN.
|
|
276
|
+
verbose
|
|
277
|
+
Print progress info.
|
|
278
|
+
csv_path
|
|
279
|
+
Path to a csv of the read to pod5 origin mapping
|
|
280
|
+
|
|
281
|
+
Returns
|
|
282
|
+
-------
|
|
283
|
+
None (modifies `adata` in-place).
|
|
284
|
+
"""
|
|
285
|
+
pod5_path_or_dir = Path(pod5_path_or_dir)
|
|
286
|
+
|
|
287
|
+
# --- Resolve input into a list of pod5 files ---
|
|
288
|
+
if pod5_path_or_dir.is_dir():
|
|
289
|
+
pod5_files = sorted(str(p) for p in pod5_path_or_dir.glob(pattern))
|
|
290
|
+
if not pod5_files:
|
|
291
|
+
raise FileNotFoundError(
|
|
292
|
+
f"No POD5 files matching {pattern!r} in {str(pod5_path_or_dir)!r}"
|
|
293
|
+
)
|
|
294
|
+
elif pod5_path_or_dir.is_file():
|
|
295
|
+
if pod5_path_or_dir.suffix.lower() != ".pod5":
|
|
296
|
+
raise ValueError(f"Expected a .pod5 file, got: {pod5_path_or_dir}")
|
|
297
|
+
pod5_files = [str(pod5_path_or_dir)]
|
|
298
|
+
else:
|
|
299
|
+
raise FileNotFoundError(f"Path does not exist: {pod5_path_or_dir}")
|
|
300
|
+
|
|
301
|
+
# Make sure obs_names are strings
|
|
302
|
+
obs_names = adata.obs_names.astype(str)
|
|
303
|
+
target_ids = set(obs_names) # only these are interesting
|
|
304
|
+
|
|
305
|
+
if verbose:
|
|
306
|
+
logger.info(f"Found {len(pod5_files)} POD5 files.")
|
|
307
|
+
logger.info(f"Tracking {len(target_ids)} read IDs from AnnData.")
|
|
308
|
+
|
|
309
|
+
# --- Collect mappings (possibly multiprocessed) ---
|
|
310
|
+
global_mapping: dict[str, str] = {}
|
|
311
|
+
|
|
312
|
+
if n_jobs is None or n_jobs <= 1:
|
|
313
|
+
# Serial version (less overhead, useful for debugging)
|
|
314
|
+
if verbose:
|
|
315
|
+
logger.debug("Running in SERIAL mode.")
|
|
316
|
+
for f in pod5_files:
|
|
317
|
+
if verbose:
|
|
318
|
+
logger.debug(f" Scanning {os.path.basename(f)} ...")
|
|
319
|
+
part = _collect_read_origins_from_pod5(f, target_ids)
|
|
320
|
+
global_mapping.update(part)
|
|
321
|
+
else:
|
|
322
|
+
if verbose:
|
|
323
|
+
logger.debug(f"Running in PARALLEL mode with {n_jobs} workers.")
|
|
324
|
+
with ProcessPoolExecutor(max_workers=n_jobs) as ex:
|
|
325
|
+
futures = {
|
|
326
|
+
ex.submit(_collect_read_origins_from_pod5, f, target_ids): f for f in pod5_files
|
|
327
|
+
}
|
|
328
|
+
for fut in as_completed(futures):
|
|
329
|
+
f = futures[fut]
|
|
330
|
+
try:
|
|
331
|
+
part = fut.result()
|
|
332
|
+
except Exception as e:
|
|
333
|
+
logger.warning(f"Error while processing {f}: {e}")
|
|
334
|
+
continue
|
|
335
|
+
global_mapping.update(part)
|
|
336
|
+
if verbose:
|
|
337
|
+
logger.info(f" Finished {os.path.basename(f)} ({len(part)} matching reads)")
|
|
338
|
+
|
|
339
|
+
if verbose:
|
|
340
|
+
logger.info(f"Total reads matched: {len(global_mapping)}")
|
|
341
|
+
|
|
342
|
+
# --- Populate obs['pod5_origin'] in AnnData order, memory-efficiently ---
|
|
343
|
+
origin = np.empty(adata.n_obs, dtype=object)
|
|
344
|
+
default = None if fill_value is None else fill_value
|
|
345
|
+
for i, rid in enumerate(obs_names):
|
|
346
|
+
origin[i] = global_mapping.get(rid, default)
|
|
347
|
+
|
|
348
|
+
adata.obs["pod5_origin"] = origin
|
|
349
|
+
if verbose:
|
|
350
|
+
logger.info("Assigned `pod5_origin` to adata.obs.")
|
|
351
|
+
|
|
352
|
+
# --- Optionally write a CSV ---
|
|
353
|
+
if csv_path is not None:
|
|
354
|
+
if verbose:
|
|
355
|
+
logger.info(f"Writing CSV mapping to: {csv_path}")
|
|
356
|
+
|
|
357
|
+
# Create DataFrame in AnnData order for easier cross-referencing
|
|
358
|
+
df = pd.DataFrame(
|
|
359
|
+
{
|
|
360
|
+
"read_id": obs_names,
|
|
361
|
+
"pod5_origin": origin,
|
|
362
|
+
}
|
|
363
|
+
)
|
|
364
|
+
df.to_csv(csv_path, index=False)
|
|
365
|
+
|
|
366
|
+
if verbose:
|
|
367
|
+
logger.info("CSV saved.")
|
|
368
|
+
|
|
369
|
+
return global_mapping
|