smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
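Two of the new top-level modules listed above, smftools/logging_utils.py and smftools/optional_imports.py, are consumed throughout the refactored informatics code, including the module diffed below. A minimal sketch of that import pattern, using only the names visible in the diff (get_logger, setup_logging, require); the exact behavior of require — returning the module when the optional dependency is installed and otherwise raising an error that names the pip extra — is an assumption, not something this diff confirms:

    import logging

    from smftools.logging_utils import get_logger, setup_logging
    from smftools.optional_imports import require

    # Configure the package-level "smftools" logger once (keyword arguments as seen in the diff below).
    setup_logging(level=logging.INFO, log_file=None)
    logger = get_logger(__name__)  # module-level logger, as in the refactored modules

    # Resolve an optional heavy dependency lazily; "extra" and "purpose" mirror the call in the diff.
    torch = require("torch", extra="torch", purpose="converted BAM processing")
    logger.info("torch resolved: %r", torch)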
smftools/informatics/converted_BAM_to_adata.py

@@ -1,61 +1,70 @@
-
-import time
-import os
-import gc
-import pandas as pd
-import anndata as ad
-from tqdm import tqdm
-import multiprocessing
-from multiprocessing import Manager, Lock, current_process, Pool
-import traceback
-import gzip
-import torch
+from __future__ import annotations
 
+import gc
+import logging
 import shutil
+import time
+import traceback
+from multiprocessing import Manager, Pool, current_process
 from pathlib import Path
-from typing import
+from typing import TYPE_CHECKING, Iterable, Optional, Union
+
+import anndata as ad
+import numpy as np
+import pandas as pd
 
-from
+from smftools.logging_utils import get_logger, setup_logging
+from smftools.optional_imports import require
+
+from ..readwrite import make_dirs
+from .bam_functions import count_aligned_reads, extract_base_identities
 from .binarize_converted_base_identities import binarize_converted_base_identities
 from .fasta_functions import find_conversion_sites
-from .bam_functions import count_aligned_reads, extract_base_identities
 from .ohe import ohe_batching
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    import torch
+
+torch = require("torch", extra="torch", purpose="converted BAM processing")
+
+
+def converted_BAM_to_adata(
+    converted_FASTA: str | Path,
+    split_dir: Path,
+    output_dir: Path,
+    input_already_demuxed: bool,
+    mapping_threshold: float,
+    experiment_name: str,
+    conversions: list[str],
+    bam_suffix: str,
+    device: str | torch.device = "cpu",
+    num_threads: int = 8,
+    deaminase_footprinting: bool = False,
+    delete_intermediates: bool = True,
+    double_barcoded_path: Path | None = None,
+    samtools_backend: str | None = "auto",
+) -> tuple[ad.AnnData | None, Path]:
+    """Convert BAM files into an AnnData object by binarizing modified base identities.
+
+    Args:
+        converted_FASTA: Path to the converted FASTA reference.
+        split_dir: Directory containing converted BAM files.
+        output_dir: Output directory for intermediate and final files.
+        input_already_demuxed: Whether input reads were originally demultiplexed.
+        mapping_threshold: Minimum fraction of aligned reads required for inclusion.
+        experiment_name: Name for the output AnnData object.
+        conversions: List of modification types (e.g., ``["unconverted", "5mC", "6mA"]``).
+        bam_suffix: File suffix for BAM files.
+        device: Torch device or device string.
+        num_threads: Number of parallel processing threads.
+        deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
+        delete_intermediates: Whether to remove intermediate files after processing.
+        double_barcoded_path: Path to dorado demux summary file of double-ended barcodes.
 
     Returns:
-
+        tuple[anndata.AnnData | None, Path]: The AnnData object (if generated) and its path.
     """
     if torch.cuda.is_available():
         device = torch.device("cuda")
@@ -64,69 +73,91 @@ def converted_BAM_to_adata(converted_FASTA,
     else:
         device = torch.device("cpu")
 
-
+    logger.debug(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-    h5_dir = output_dir /
-    tmp_dir = output_dir /
+    h5_dir = output_dir / "h5ads"
+    tmp_dir = output_dir / "tmp"
     final_adata = None
-    final_adata_path = h5_dir / f
+    final_adata_path = h5_dir / f"{experiment_name}.h5ad.gz"
 
     if final_adata_path.exists():
-
+        logger.debug(f"{final_adata_path} already exists. Using existing AnnData object.")
        return final_adata, final_adata_path
 
    make_dirs([h5_dir, tmp_dir])
 
    bam_files = sorted(
-        p
-
-        and p.suffix == ".bam"
-        and "unclassified" not in p.name
+        p
+        for p in split_dir.iterdir()
+        if p.is_file() and p.suffix == ".bam" and "unclassified" not in p.name
    )
 
-    bam_path_list =
-
+    bam_path_list = bam_files
+
+    bam_names = [bam.name for bam in bam_files]
+    logger.info(f"Found {len(bam_files)} BAM files within {split_dir}: {bam_names}")
 
    ## Process Conversion Sites
-    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(
+    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(
+        converted_FASTA, conversions, deaminase_footprinting
+    )
 
    ## Filter BAM Files by Mapping Threshold
-    records_to_analyze = filter_bams_by_mapping_threshold(
+    records_to_analyze = filter_bams_by_mapping_threshold(
+        bam_path_list, bam_files, mapping_threshold, samtools_backend
+    )
 
    ## Process BAMs in Parallel
-    final_adata = process_bams_parallel(
+    final_adata = process_bams_parallel(
+        bam_path_list,
+        records_to_analyze,
+        record_FASTA_dict,
+        chromosome_FASTA_dict,
+        tmp_dir,
+        h5_dir,
+        num_threads,
+        max_reference_length,
+        device,
+        deaminase_footprinting,
+        samtools_backend,
+    )
 
-    final_adata.uns[
+    final_adata.uns["References"] = {}
    for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
-        final_adata.var[f
-        final_adata.var[f
-        final_adata.uns[f
-        final_adata.uns[
+        final_adata.var[f"{chromosome}_top_strand_FASTA_base"] = list(seq)
+        final_adata.var[f"{chromosome}_bottom_strand_FASTA_base"] = list(comp)
+        final_adata.uns[f"{chromosome}_FASTA_sequence"] = seq
+        final_adata.uns["References"][f"{chromosome}_FASTA_sequence"] = seq
 
    final_adata.obs_names_make_unique()
    cols = final_adata.obs.columns
 
    # Make obs cols categorical
    for col in cols:
-        final_adata.obs[col] = final_adata.obs[col].astype(
+        final_adata.obs[col] = final_adata.obs[col].astype("category")
 
    if input_already_demuxed:
        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
    else:
        from .h5ad_functions import add_demux_type_annotation
+
        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        logger.info("Adding demux type to each read")
        add_demux_type_annotation(final_adata, double_barcoded_reads)
 
    ## Delete intermediate h5ad files and temp directories
    if delete_intermediates:
+        logger.info("Deleting intermediate h5ad files")
        delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)
-
+
    return final_adata, final_adata_path
 
 
-def process_conversion_sites(
+def process_conversion_sites(
+    converted_FASTA, conversions=["unconverted", "5mC"], deaminase_footprinting=False
+):
    """
    Extracts conversion sites and determines the max reference length.
 
@@ -147,7 +178,9 @@ def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC']
    conversion_types = conversions[1:]
 
    # Process the unconverted sequence once
-    modification_dict[unconverted] = find_conversion_sites(
+    modification_dict[unconverted] = find_conversion_sites(
+        converted_FASTA, unconverted, conversions, deaminase_footprinting
+    )
    # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
    # Get **max sequence length** from unconverted records
@@ -166,15 +199,25 @@ def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC']
        record_FASTA_dict[record] = [
            sequence + "N" * (max_reference_length - sequence_length),
            complement + "N" * (max_reference_length - sequence_length),
-            chromosome,
+            chromosome,
+            record,
+            sequence_length,
+            max_reference_length - sequence_length,
+            unconverted,
+            "top",
        ]
 
        if chromosome not in chromosome_FASTA_dict:
-            chromosome_FASTA_dict[chromosome] = [
+            chromosome_FASTA_dict[chromosome] = [
+                sequence + "N" * (max_reference_length - sequence_length),
+                complement + "N" * (max_reference_length - sequence_length),
+            ]
 
    # Process converted records
    for conversion in conversion_types:
-        modification_dict[conversion] = find_conversion_sites(
+        modification_dict[conversion] = find_conversion_sites(
+            converted_FASTA, conversion, conversions, deaminase_footprinting
+        )
        # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
        for record, values in modification_dict[conversion].items():
@@ -193,32 +236,47 @@ def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC']
            record_FASTA_dict[converted_name] = [
                sequence + "N" * (max_reference_length - sequence_length),
                complement + "N" * (max_reference_length - sequence_length),
-                chromosome,
-
+                chromosome,
+                unconverted_name,
+                sequence_length,
+                max_reference_length - sequence_length,
+                conversion,
+                strand,
            ]
 
-
+    logger.debug("Updated record_FASTA_dict Keys:", list(record_FASTA_dict.keys()))
    return max_reference_length, record_FASTA_dict, chromosome_FASTA_dict
 
 
-def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold):
+def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold, samtools_backend):
    """Filters BAM files based on mapping threshold."""
    records_to_analyze = set()
 
    for i, bam in enumerate(bam_path_list):
-        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam)
+        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam, samtools_backend)
        aligned_percent = aligned_reads * 100 / (aligned_reads + unaligned_reads)
-
+        logger.info(f"{aligned_percent:.2f}% of reads in {bam_files[i].name} aligned successfully.")
 
        for record, (count, percent) in record_counts.items():
            if percent >= mapping_threshold:
                records_to_analyze.add(record)
 
-
+    logger.info(f"Analyzing the following FASTA records: {records_to_analyze}")
    return records_to_analyze
 
 
-def process_single_bam(
+def process_single_bam(
+    bam_index,
+    bam,
+    records_to_analyze,
+    record_FASTA_dict,
+    chromosome_FASTA_dict,
+    tmp_dir,
+    max_reference_length,
+    device,
+    deaminase_footprinting,
+    samtools_backend,
+):
    """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
    adata_list = []
 
@@ -230,34 +288,58 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
        sequence = chromosome_FASTA_dict[chromosome][0]
 
        # Extract Base Identities
-        fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read =
+        fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = (
+            extract_base_identities(
+                bam, record, range(current_length), max_reference_length, sequence, samtools_backend
+            )
+        )
        mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
        # Skip processing if both forward and reverse base identities are empty
        if not fwd_bases and not rev_bases:
-
+            logger.debug(
+                f"[Worker {current_process().pid}] Skipping {sample} - No valid base identities for {record}."
+            )
            continue
 
        merged_bin = {}
 
        # Binarize the Base Identities if they exist
        if fwd_bases:
-            fwd_bin = binarize_converted_base_identities(
+            fwd_bin = binarize_converted_base_identities(
+                fwd_bases,
+                strand,
+                mod_type,
+                bam,
+                device,
+                deaminase_footprinting,
+                mismatch_trend_per_read,
+            )
            merged_bin.update(fwd_bin)
 
        if rev_bases:
-            rev_bin = binarize_converted_base_identities(
+            rev_bin = binarize_converted_base_identities(
+                rev_bases,
+                strand,
+                mod_type,
+                bam,
+                device,
+                deaminase_footprinting,
+                mismatch_trend_per_read,
+            )
            merged_bin.update(rev_bin)
 
        # Skip if merged_bin is empty (no valid binarized data)
        if not merged_bin:
-
+            logger.debug(
+                f"[Worker {current_process().pid}] Skipping {sample} - No valid binarized data for {record}."
+            )
            continue
 
        # Convert to DataFrame
        # for key in merged_bin:
        #     merged_bin[key] = merged_bin[key].cpu().numpy()  # Move to CPU & convert to NumPy
-        bin_df = pd.DataFrame.from_dict(merged_bin, orient=
+        bin_df = pd.DataFrame.from_dict(merged_bin, orient="index")
        sorted_index = sorted(bin_df.index)
        bin_df = bin_df.reindex(sorted_index)
 
@@ -265,14 +347,18 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
        one_hot_reads = {}
 
        if fwd_bases:
-            fwd_ohe_files = ohe_batching(
+            fwd_ohe_files = ohe_batching(
+                fwd_bases, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000
+            )
            for ohe_file in fwd_ohe_files:
                tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
                one_hot_reads.update(tmp_ohe_dict)
                del tmp_ohe_dict
 
        if rev_bases:
-            rev_ohe_files = ohe_batching(
+            rev_ohe_files = ohe_batching(
+                rev_bases, tmp_dir, record, f"{bam_index}_rev", batch_size=100000
+            )
            for ohe_file in rev_ohe_files:
                tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
                one_hot_reads.update(tmp_ohe_dict)
@@ -280,7 +366,9 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
 
        # Skip if one_hot_reads is empty
        if not one_hot_reads:
-
+            logger.debug(
+                f"[Worker {current_process().pid}] Skipping {sample} - No valid one-hot encoded data for {record}."
+            )
            continue
 
        gc.collect()
@@ -291,11 +379,15 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
 
        # Skip if no read names exist
        if not read_names:
-
+            logger.debug(
+                f"[Worker {current_process().pid}] Skipping {sample} - No reads found in one-hot encoded data for {record}."
+            )
            continue
 
        sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
-        df_A, df_C, df_G, df_T, df_N = [
+        df_A, df_C, df_G, df_T, df_N = [
+            np.zeros((len(sorted_index), sequence_length), dtype=int) for _ in range(5)
+        ]
 
        # Populate One-Hot Arrays
        for j, read_name in enumerate(sorted_index):
@@ -310,8 +402,8 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
        adata.var_names = bin_df.columns.astype(str)
        adata.obs["Sample"] = [sample] * len(adata)
        try:
-            barcode = sample.split(
-        except:
+            barcode = sample.split("barcode")[1]
+        except Exception:
            barcode = np.nan
        adata.obs["Barcode"] = [int(barcode)] * len(adata)
        adata.obs["Barcode"] = adata.obs["Barcode"].astype(str)
@@ -323,49 +415,81 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
        adata.obs["Read_mismatch_trend"] = adata.obs_names.map(mismatch_trend_series)
 
        # Attach One-Hot Encodings to Layers
-        adata.layers["
-        adata.layers["
-        adata.layers["
-        adata.layers["
-        adata.layers["
+        adata.layers["A_binary_sequence_encoding"] = df_A
+        adata.layers["C_binary_sequence_encoding"] = df_C
+        adata.layers["G_binary_sequence_encoding"] = df_G
+        adata.layers["T_binary_sequence_encoding"] = df_T
+        adata.layers["N_binary_sequence_encoding"] = df_N
 
        adata_list.append(adata)
 
    return ad.concat(adata_list, join="outer") if adata_list else None
 
+
def timestamp():
    """Returns a formatted timestamp for logging."""
    return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
-def worker_function(
+def worker_function(
+    bam_index,
+    bam,
+    records_to_analyze,
+    shared_record_FASTA_dict,
+    chromosome_FASTA_dict,
+    tmp_dir,
+    h5_dir,
+    max_reference_length,
+    device,
+    deaminase_footprinting,
+    samtools_backend,
+    progress_queue,
+    log_level,
+    log_file,
+):
    """Worker function that processes a single BAM and writes the output to an H5AD file."""
+    _ensure_worker_logging(log_level, log_file)
    worker_id = current_process().pid  # Get worker process ID
    sample = bam.stem
 
    try:
-
+        logger.info(f"[Worker {worker_id}] Processing BAM: {sample}")
 
        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
        if h5ad_path.exists():
-
+            logger.debug(f"[Worker {worker_id}] Skipping {sample}: Already processed.")
            progress_queue.put(sample)
            return
 
        # Filter records specific to this BAM
-        bam_records_to_analyze = {
+        bam_records_to_analyze = {
+            record for record in records_to_analyze if record in shared_record_FASTA_dict
+        }
 
        if not bam_records_to_analyze:
-
+            logger.debug(
+                f"[Worker {worker_id}] No valid records to analyze for {sample}. Skipping."
+            )
            progress_queue.put(sample)
            return
 
        # Process BAM
-        adata = process_single_bam(
+        adata = process_single_bam(
+            bam_index,
+            bam,
+            bam_records_to_analyze,
+            shared_record_FASTA_dict,
+            chromosome_FASTA_dict,
+            tmp_dir,
+            max_reference_length,
+            device,
+            deaminase_footprinting,
+            samtools_backend,
+        )
 
        if adata is not None:
            adata.write_h5ad(str(h5ad_path))
-
+            logger.info(f"[Worker {worker_id}] Completed processing for BAM: {sample}")
 
        # Free memory
        del adata
@@ -373,22 +497,31 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
 
        progress_queue.put(sample)
 
-    except Exception
-
+    except Exception:
+        logger.warning(
+            f"[Worker {worker_id}] ERROR while processing {sample}:\n{traceback.format_exc()}"
+        )
        progress_queue.put(sample)  # Still signal completion to prevent deadlock
 
-
+
+def process_bams_parallel(
+    bam_path_list,
+    records_to_analyze,
+    record_FASTA_dict,
+    chromosome_FASTA_dict,
+    tmp_dir,
+    h5_dir,
+    num_threads,
+    max_reference_length,
+    device,
+    deaminase_footprinting,
+    samtools_backend,
+):
    """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
    make_dirs(h5_dir)  # Ensure h5_dir exists
 
-
-
-    # Ensure macOS uses forkserver to avoid spawning issues
-    try:
-        import multiprocessing
-        multiprocessing.set_start_method("forkserver", force=True)
-    except RuntimeError:
-        print(f"{timestamp()} [WARNING] Multiprocessing context already set. Skipping set_start_method.")
+    logger.info(f"Starting parallel BAM processing with {num_threads} threads...")
+    log_level, log_file = _get_logger_config()
 
    with Manager() as manager:
        progress_queue = manager.Queue()
@@ -396,11 +529,29 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
 
        with Pool(processes=num_threads) as pool:
            results = [
-                pool.apply_async(
+                pool.apply_async(
+                    worker_function,
+                    (
+                        i,
+                        bam,
+                        records_to_analyze,
+                        shared_record_FASTA_dict,
+                        chromosome_FASTA_dict,
+                        tmp_dir,
+                        h5_dir,
+                        max_reference_length,
+                        device,
+                        deaminase_footprinting,
+                        samtools_backend,
+                        progress_queue,
+                        log_level,
+                        log_file,
+                    ),
+                )
                for i, bam in enumerate(bam_path_list)
            ]
 
-
+            logger.info(f"Submitting {len(results)} BAMs for processing.")
 
            # Track completed BAMs
            completed_bams = set()
@@ -409,24 +560,58 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
                    processed_bam = progress_queue.get(timeout=2400)  # Wait for a finished BAM
                    completed_bams.add(processed_bam)
                except Exception as e:
-
+                    logger.error(f"Timeout waiting for worker process. Possible crash? {e}")
+                    _log_async_result_errors(results, bam_path_list)
 
            pool.close()
            pool.join()  # Ensure all workers finish
 
+            _log_async_result_errors(results, bam_path_list)
+
        # Final Concatenation Step
-        h5ad_files = [
+        h5ad_files = [f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
        if not h5ad_files:
-
+            logger.warning(f"No valid H5AD files generated. Exiting.")
            return None
 
-
+        logger.info(f"Concatenating {len(h5ad_files)} H5AD files into final output...")
        final_adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files], join="outer")
 
-
+        logger.info(f"Successfully generated final AnnData object.")
        return final_adata
 
+
+def _log_async_result_errors(results, bam_path_list):
+    """Log worker failures captured by multiprocessing AsyncResult objects."""
+    for bam, result in zip(bam_path_list, results):
+        if not result.ready():
+            continue
+        try:
+            result.get()
+        except Exception as exc:
+            logger.error("Worker process failed for %s: %s", bam, exc)
+
+
+def _get_logger_config() -> tuple[int, Path | None]:
+    smftools_logger = logging.getLogger("smftools")
+    level = smftools_logger.level
+    if level == logging.NOTSET:
+        level = logging.INFO
+    log_file: Path | None = None
+    for handler in smftools_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            log_file = Path(handler.baseFilename)
+            break
+    return level, log_file
+
+
+def _ensure_worker_logging(log_level: int, log_file: Path | None) -> None:
+    smftools_logger = logging.getLogger("smftools")
+    if not smftools_logger.handlers:
+        setup_logging(level=log_level, log_file=log_file)
+
+
def delete_intermediate_h5ads_and_tmpdir(
    h5_dir: Union[str, Path, Iterable[str], None],
    tmp_dir: Optional[Union[str, Path]] = None,
@@ -450,25 +635,27 @@ def delete_intermediate_h5ads_and_tmpdir(
    verbose : bool
        Print progress / warnings.
    """
+
    # Helper: remove a single file path (Path-like or string)
    def _maybe_unlink(p: Path):
+        """Remove a file path if it exists and is a file."""
        if not p.exists():
            if verbose:
-
+                logger.debug(f"[skip] not found: {p}")
            return
        if not p.is_file():
            if verbose:
-
+                logger.debug(f"[skip] not a file: {p}")
            return
        if dry_run:
-
+            logger.debug(f"[dry-run] would remove file: {p}")
            return
        try:
            p.unlink()
            if verbose:
-
+                logger.info(f"Removed file: {p}")
        except Exception as e:
-
+            logger.warning(f"[error] failed to remove file {p}: {e}")
 
    # Handle h5_dir input (directory OR iterable of file paths)
    if h5_dir is not None:
@@ -483,7 +670,7 @@ def delete_intermediate_h5ads_and_tmpdir(
                else:
                    if verbose:
                        # optional: comment this out if too noisy
-
+                        logger.debug(f"[skip] not matching pattern: {p.name}")
        else:
            # treat as iterable of file paths
            for f in h5_dir:
@@ -493,25 +680,25 @@ def delete_intermediate_h5ads_and_tmpdir(
                    _maybe_unlink(p)
                else:
                    if verbose:
-
+                        logger.debug(f"[skip] not matching pattern or not a file: {p}")
 
    # Remove tmp_dir recursively (if provided)
    if tmp_dir is not None:
        td = Path(tmp_dir)
        if not td.exists():
            if verbose:
-
+                logger.debug(f"[skip] tmp_dir not found: {td}")
        else:
            if not td.is_dir():
                if verbose:
-
+                    logger.debug(f"[skip] tmp_dir is not a directory: {td}")
            else:
                if dry_run:
-
+                    logger.debug(f"[dry-run] would remove directory tree: {td}")
                else:
                    try:
                        shutil.rmtree(td)
                        if verbose:
-
+                            logger.info(f"Removed directory tree: {td}")
                    except Exception as e:
-
+                        logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
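For orientation, the refactored converted_BAM_to_adata entry point shown above now takes explicit Path arguments and returns both the AnnData object (or None) and its on-disk path. A hypothetical call sketch; every argument value below is an illustrative placeholder, not a value shipped with the package:

    from pathlib import Path

    from smftools.informatics.converted_BAM_to_adata import converted_BAM_to_adata

    # Placeholder paths and settings for illustration only.
    adata, adata_path = converted_BAM_to_adata(
        converted_FASTA=Path("refs/converted.fasta"),
        split_dir=Path("out/split_bams"),
        output_dir=Path("out"),
        input_already_demuxed=True,   # skips the dorado barcoding_summary.txt lookup in the diff
        mapping_threshold=0.8,        # illustrative value; compared against per-record mapping stats
        experiment_name="example_experiment",
        conversions=["unconverted", "5mC"],
        bam_suffix=".bam",
        device="cpu",
        num_threads=4,
    )
    print(adata_path)  # out/h5ads/example_experiment.h5ad.gz per the path construction above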