smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py}

@@ -11,29 +11,48 @@ import traceback
 import gzip
 import torch
 
-
+import shutil
+from pathlib import Path
+from typing import Union, Iterable, Optional
+
+from ..readwrite import make_dirs, safe_write_h5ad
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .
-from .
-from .
-from .make_dirs import make_dirs
-from .ohe_batching import ohe_batching
+from .fasta_functions import find_conversion_sites
+from .bam_functions import count_aligned_reads, extract_base_identities
+from .ohe import ohe_batching
 
 if __name__ == "__main__":
     multiprocessing.set_start_method("forkserver", force=True)
 
-def
+def converted_BAM_to_adata(converted_FASTA,
+                           split_dir,
+                           output_dir,
+                           input_already_demuxed,
+                           mapping_threshold,
+                           experiment_name,
+                           conversions,
+                           bam_suffix,
+                           device='cpu',
+                           num_threads=8,
+                           deaminase_footprinting=False,
+                           delete_intermediates=True,
+                           double_barcoded_path=None,
+                           ):
     """
     Converts BAM files into an AnnData object by binarizing modified base identities.
 
     Parameters:
-        converted_FASTA (
-        split_dir (
+        converted_FASTA (Path): Path to the converted FASTA reference.
+        split_dir (Path): Directory containing converted BAM files.
+        output_dir (Path): Path to the output directory.
+        input_already_demuxed (bool): Whether input reads were already demultiplexed.
         mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
         experiment_name (str): Name for the output AnnData object.
-
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
         bam_suffix (str): File suffix for BAM files.
         num_threads (int): Number of parallel processing threads.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+        double_barcoded_path (Path): Path to the dorado demux summary file for double-ended barcodes.
 
     Returns:
         str: Path to the final AnnData object.
@@ -48,50 +67,73 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
     print(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-
-
-
-    final_adata_path =
+    h5_dir = output_dir / 'h5ads'
+    tmp_dir = output_dir / 'tmp'
+    final_adata = None
+    final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'
 
-    if
+    if final_adata_path.exists():
         print(f"{final_adata_path} already exists. Using existing AnnData object.")
-        return final_adata_path
+        return final_adata, final_adata_path
 
     make_dirs([h5_dir, tmp_dir])
 
-
-
-
-
+    bam_files = sorted(
+        p for p in split_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+    )
+
+    bam_path_list = [split_dir / f for f in bam_files]
     print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
     ## Process Conversion Sites
-    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA,
+    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversions, deaminase_footprinting)
 
     ## Filter BAM Files by Mapping Threshold
     records_to_analyze = filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold)
 
     ## Process BAMs in Parallel
-    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device)
+    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
+    final_adata.uns['References'] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
         final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
         final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
         final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+        final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq
+
+    final_adata.obs_names_make_unique()
+    cols = final_adata.obs.columns
 
-
-
-
+    # Make obs cols categorical
+    for col in cols:
+        final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)
+
+    ## Delete intermediate h5ad files and temp directories
+    if delete_intermediates:
+        delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)
+
     return final_adata, final_adata_path
 
 
-def process_conversion_sites(converted_FASTA,
+def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC'], deaminase_footprinting=False):
     """
     Extracts conversion sites and determines the max reference length.
 
     Parameters:
         converted_FASTA (str): Path to the converted reference FASTA.
-
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
         max_reference_length (int): The length of the longest sequence.
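The rewritten entry point now returns the AnnData object together with its path and takes the conversion list plus output directory directly. A minimal invocation sketch (paths, experiment name, and the import path are hypothetical; the keyword names come from the new signature above):

from pathlib import Path
from smftools.informatics.converted_BAM_to_adata import converted_BAM_to_adata

# Hypothetical inputs; only the parameter names are taken from the diff.
final_adata, final_adata_path = converted_BAM_to_adata(
    converted_FASTA=Path("refs/converted.fasta"),
    split_dir=Path("out/split_bams"),        # demuxed BAMs; "unclassified" files are skipped
    output_dir=Path("out"),                  # h5ads/ and tmp/ are created under here
    input_already_demuxed=False,
    mapping_threshold=0.01,
    experiment_name="exp01",
    conversions=["unconverted", "5mC"],
    bam_suffix=".bam",
    device="cpu",
    num_threads=8,
    deaminase_footprinting=False,
    delete_intermediates=True,               # triggers delete_intermediate_h5ads_and_tmpdir at the end
    double_barcoded_path=Path("out/demux"),  # directory holding barcoding_summary.txt
)
print(final_adata_path)                      # out/h5ads/exp01.h5ad.gz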
@@ -101,11 +143,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     record_FASTA_dict = {}
     chromosome_FASTA_dict = {}
     max_reference_length = 0
-    unconverted =
-
+    unconverted = conversions[0]
+    conversion_types = conversions[1:]
 
     # Process the unconverted sequence once
-    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted,
+    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversions, deaminase_footprinting)
     # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
     # Get **max sequence length** from unconverted records
@@ -114,7 +156,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     # Add **unconverted records** to `record_FASTA_dict`
     for record, values in modification_dict[unconverted].items():
         sequence_length, top_coords, bottom_coords, sequence, complement = values
-
+
+        if not deaminase_footprinting:
+            chromosome = record.replace(f"_{unconverted}_top", "")
+        else:
+            chromosome = record
 
         # Store **original sequence**
         record_FASTA_dict[record] = [
@@ -127,13 +173,17 @@ def process_conversion_sites(converted_FASTA, conversion_types):
         chromosome_FASTA_dict[chromosome] = [sequence + "N" * (max_reference_length - sequence_length), complement + "N" * (max_reference_length - sequence_length)]
 
     # Process converted records
-    for conversion in
-        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion,
+    for conversion in conversion_types:
+        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversions, deaminase_footprinting)
         # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
         for record, values in modification_dict[conversion].items():
             sequence_length, top_coords, bottom_coords, sequence, complement = values
-
+
+            if not deaminase_footprinting:
+                chromosome = record.split(f"_{unconverted}_")[0]  # Extract chromosome name
+            else:
+                chromosome = record
 
             # Add **both strands** for converted records
             for strand in ["top", "bottom"]:
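The chromosome extraction above leans on the record-naming convention used by generate_converted_FASTA, where each record ID carries a modification/strand tag; for deaminase footprinting the record ID is used as-is. A toy illustration (the reference name is hypothetical):

unconverted = "unconverted"
record = "pUC19_unconverted_top"           # record ID emitted for the unconverted reference

# Unconverted branch: strip the trailing tag to recover the chromosome name.
record.replace(f"_{unconverted}_top", "")  # -> "pUC19"

# Converted branch: everything before the "_{unconverted}_" tag.
record.split(f"_{unconverted}_")[0]        # -> "pUC19"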
@@ -168,18 +218,20 @@ def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold
     return records_to_analyze
 
 
-def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tmp_dir, max_reference_length, device):
+def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting):
     """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
     adata_list = []
 
     for record in records_to_analyze:
-        sample =
+        sample = bam.stem
         chromosome = record_FASTA_dict[record][2]
         current_length = record_FASTA_dict[record][4]
         mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
+        sequence = chromosome_FASTA_dict[chromosome][0]
 
         # Extract Base Identities
-        fwd_bases, rev_bases = extract_base_identities(bam, record, range(current_length), max_reference_length)
+        fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = extract_base_identities(bam, record, range(current_length), max_reference_length, sequence)
+        mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
         # Skip processing if both forward and reverse base identities are empty
         if not fwd_bases and not rev_bases:
@@ -190,11 +242,11 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
 
         # Binarize the Base Identities if they exist
         if fwd_bases:
-            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device)
+            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
             merged_bin.update(fwd_bin)
 
         if rev_bases:
-            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device)
+            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
             merged_bin.update(rev_bin)
 
         # Skip if merged_bin is empty (no valid binarized data)
@@ -257,11 +309,18 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
         adata.obs_names = bin_df.index.astype(str)
         adata.var_names = bin_df.columns.astype(str)
         adata.obs["Sample"] = [sample] * len(adata)
+        try:
+            barcode = sample.split('barcode')[1]
+        except:
+            barcode = np.nan
+        adata.obs["Barcode"] = [int(barcode)] * len(adata)
+        adata.obs["Barcode"] = adata.obs["Barcode"].astype(str)
         adata.obs["Reference"] = [chromosome] * len(adata)
         adata.obs["Strand"] = [strand] * len(adata)
         adata.obs["Dataset"] = [mod_type] * len(adata)
         adata.obs["Reference_dataset_strand"] = [f"{chromosome}_{mod_type}_{strand}"] * len(adata)
         adata.obs["Reference_strand"] = [f"{chromosome}_{strand}"] * len(adata)
+        adata.obs["Read_mismatch_trend"] = adata.obs_names.map(mismatch_trend_series)
 
         # Attach One-Hot Encodings to Layers
         adata.layers["A_binary_encoding"] = df_A
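The new Barcode column is parsed from the sample name, which is now simply the BAM file stem. A quick sketch of the parsing rule above (the file name is hypothetical):

import numpy as np

sample = "barcode01"                      # stem of e.g. split_dir/barcode01.bam
try:
    barcode = sample.split('barcode')[1]  # -> "01"
except IndexError:
    barcode = np.nan                      # stems without "barcode" fall back to NaN

int(barcode)                              # -> 1; stored per read, then the column is cast to str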
@@ -279,16 +338,16 @@ def timestamp():
     return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
-def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue):
+def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
     worker_id = current_process().pid  # Get worker process ID
-    sample =
+    sample = bam.stem
 
     try:
         print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")
 
-        h5ad_path =
-        if
+        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+        if h5ad_path.exists():
             print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
             progress_queue.put(sample)
             return
@@ -302,10 +361,10 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
             return
 
         # Process BAM
-        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, tmp_dir, max_reference_length, device)
+        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
         if adata is not None:
-            adata.write_h5ad(h5ad_path)
+            adata.write_h5ad(str(h5ad_path))
             print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")
 
             # Free memory
@@ -318,9 +377,9 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
         print(f"{timestamp()} [Worker {worker_id}] ERROR while processing {sample}:\n{traceback.format_exc()}")
         progress_queue.put(sample)  # Still signal completion to prevent deadlock
 
-def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device):
+def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-
+    make_dirs(h5_dir)  # Ensure h5_dir exists
 
     print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")
 
@@ -337,7 +396,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
 
     with Pool(processes=num_threads) as pool:
         results = [
-            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue))
+            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue))
             for i, bam in enumerate(bam_path_list)
         ]
 
@@ -356,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
         pool.join()  # Ensure all workers finish
 
     # Final Concatenation Step
-    h5ad_files = [
+    h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
     if not h5ad_files:
         print(f"{timestamp()} No valid H5AD files generated. Exiting.")
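The concatenation step relies on anndata's outer join, so reads from references of different lengths keep the union of positions. A minimal sketch of that behavior (toy shapes, assuming anndata is installed):

import anndata as ad
import numpy as np

a = ad.AnnData(np.ones((2, 3)))           # 2 reads x 3 positions
a.var_names = ["p0", "p1", "p2"]
b = ad.AnnData(np.zeros((1, 2)))          # 1 read x 2 positions
b.var_names = ["p1", "p3"]

merged = ad.concat([a, b], join="outer")  # union of var_names: p0, p1, p2, p3
print(merged.shape)                       # (3, 4); unobserved entries are NaN-filled for dense X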
@@ -366,4 +425,93 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
     final_adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files], join="outer")
 
     print(f"{timestamp()} Successfully generated final AnnData object.")
-    return final_adata
+    return final_adata
+
+def delete_intermediate_h5ads_and_tmpdir(
+    h5_dir: Union[str, Path, Iterable[str], None],
+    tmp_dir: Optional[Union[str, Path]] = None,
+    *,
+    dry_run: bool = False,
+    verbose: bool = True,
+):
+    """
+    Delete intermediate .h5ad files and a temporary directory.
+
+    Parameters
+    ----------
+    h5_dir : str | Path | iterable[str] | None
+        If a directory path is given, all files directly inside it will be considered.
+        If an iterable of file paths is given, those files will be considered.
+        Only files ending with '.h5ad' (and not ending with '.gz') are removed.
+    tmp_dir : str | Path | None
+        Path to a directory to remove recursively (e.g. a temp dir created earlier).
+    dry_run : bool
+        If True, print what *would* be removed but do not actually delete.
+    verbose : bool
+        Print progress / warnings.
+    """
+    # Helper: remove a single file path (Path-like or string)
+    def _maybe_unlink(p: Path):
+        if not p.exists():
+            if verbose:
+                print(f"[skip] not found: {p}")
+            return
+        if not p.is_file():
+            if verbose:
+                print(f"[skip] not a file: {p}")
+            return
+        if dry_run:
+            print(f"[dry-run] would remove file: {p}")
+            return
+        try:
+            p.unlink()
+            if verbose:
+                print(f"Removed file: {p}")
+        except Exception as e:
+            print(f"[error] failed to remove file {p}: {e}")
+
+    # Handle h5_dir input (directory OR iterable of file paths)
+    if h5_dir is not None:
+        # If it's a path to a directory, iterate its children
+        if isinstance(h5_dir, (str, Path)) and Path(h5_dir).is_dir():
+            dpath = Path(h5_dir)
+            for p in dpath.iterdir():
+                # only target top-level files (not recursing); require '.h5ad' suffix and exclude gz
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        # optional: comment this out if too noisy
+                        print(f"[skip] not matching pattern: {p.name}")
+        else:
+            # treat as iterable of file paths
+            for f in h5_dir:
+                p = Path(f)
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        print(f"[skip] not matching pattern or not a file: {p}")
+
+    # Remove tmp_dir recursively (if provided)
+    if tmp_dir is not None:
+        td = Path(tmp_dir)
+        if not td.exists():
+            if verbose:
+                print(f"[skip] tmp_dir not found: {td}")
+        else:
+            if not td.is_dir():
+                if verbose:
+                    print(f"[skip] tmp_dir is not a directory: {td}")
+            else:
+                if dry_run:
+                    print(f"[dry-run] would remove directory tree: {td}")
+                else:
+                    try:
+                        shutil.rmtree(td)
+                        if verbose:
+                            print(f"Removed directory tree: {td}")
+                    except Exception as e:
+                        print(f"[error] failed to remove tmp dir {td}: {e}")
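Because this helper actually unlinks files, the keyword-only dry_run flag is the safe way to preview what the delete_intermediates=True path will remove; the suffix filter leaves gzipped final outputs in place. A usage sketch (paths are hypothetical):

from pathlib import Path

# Preview only: prints "[dry-run] would remove ..." lines, deletes nothing.
delete_intermediate_h5ads_and_tmpdir(
    h5_dir=Path("out/h5ads"),   # per-BAM *.h5ad intermediates; exp01.h5ad.gz is kept
    tmp_dir=Path("out/tmp"),
    dry_run=True,
)

# Real cleanup once the final .h5ad.gz has been verified.
delete_intermediate_h5ads_and_tmpdir(Path("out/h5ads"), Path("out/tmp"))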
smftools/informatics/fasta_functions.py (new file)

@@ -0,0 +1,255 @@
+from ..readwrite import make_dirs, time_string
+
+import os
+import subprocess
+from pathlib import Path
+
+from typing import Union, List, Dict, Tuple
+
+import numpy as np
+import gzip
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from pyfaidx import Fasta
+import pysam
+
+from concurrent.futures import ProcessPoolExecutor
+from itertools import chain
+
+def _convert_FASTA_record(record, modification_type, strand, unconverted):
+    """ Converts a FASTA record based on modification type and strand. """
+    conversion_maps = {
+        ('5mC', 'top'): ('C', 'T'),
+        ('5mC', 'bottom'): ('G', 'A'),
+        ('6mA', 'top'): ('A', 'G'),
+        ('6mA', 'bottom'): ('T', 'C')
+    }
+
+    sequence = str(record.seq).upper()
+
+    if modification_type == unconverted:
+        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+    if (modification_type, strand) not in conversion_maps:
+        raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+    original_base, converted_base = conversion_maps[(modification_type, strand)]
+    new_seq = sequence.replace(original_base, converted_base)
+
+    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+def _process_fasta_record(args):
+    """
+    Processes a single FASTA record for parallel execution.
+    Args:
+        args (tuple): (record, modification_types, strands, unconverted)
+    Returns:
+        list of modified SeqRecord objects.
+    """
+    record, modification_types, strands, unconverted = args
+    modified_records = []
+
+    for modification_type in modification_types:
+        for i, strand in enumerate(strands):
+            if i > 0 and modification_type == unconverted:
+                continue  # Ensure unconverted is added only once
+
+            modified_records.append(_convert_FASTA_record(record, modification_type, strand, unconverted))
+
+    return modified_records
+
+def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+    """
+    Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+    Parameters:
+        input_fasta (str): Path to the unconverted FASTA file.
+        modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+        strands (list): List of strands ('top', 'bottom').
+        output_fasta (str): Path to the converted FASTA output file.
+        num_threads (int): Number of parallel threads to use.
+        chunk_size (int): Number of records to process per write batch.
+
+    Returns:
+        None (Writes the converted FASTA file).
+    """
+    unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
+
+    # Detect if input is gzipped
+    open_func = gzip.open if input_fasta.endswith('.gz') else open
+    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+    def _fasta_record_generator():
+        """ Lazily yields FASTA records from file. """
+        with open_func(input_fasta, file_mode) as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                yield record
+
+    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+        # Process records in parallel using a named function (avoiding lambda)
+        results = executor.map(
+            _process_fasta_record,
+            ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
+        )
+
+        buffer = []
+        for modified_records in results:
+            buffer.extend(modified_records)
+
+            # Write out in chunks to save memory
+            if len(buffer) >= chunk_size:
+                SeqIO.write(buffer, output_handle, 'fasta')
+                buffer.clear()
+
+        # Write any remaining records
+        if buffer:
+            SeqIO.write(buffer, output_handle, 'fasta')
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # creates <fasta>.fai
+
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if write_chrom_sizes:
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with fai.open() as f_in, chrom_sizes.open("w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+    return fai
+
+def get_chromosome_lengths(fasta: str | Path) -> Path:
+    """
+    Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
+    """
+    fasta = Path(fasta)
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if not fai.exists():
+        index_fasta(fasta, write_chrom_sizes=True)  # will also create .chrom.sizes
+    chrom_sizes = fasta.with_suffix(".chrom.sizes")
+    if chrom_sizes.exists():
+        print(f"Using existing chrom length file: {chrom_sizes}")
+        return chrom_sizes
+
+    # Build chrom.sizes from .fai
+    with fai.open() as f_in, chrom_sizes.open("w") as out:
+        for line in f_in:
+            chrom, size = line.split()[:2]
+            out.write(f"{chrom}\t{size}\n")
+    return chrom_sizes
+
+def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
+    """
+    Return {record_id: (length, sequence)} from a FASTA.
+    Direct methylation specific
+    """
+    fasta_file = Path(fasta_file)
+    print(f"{time_string()}: Opening FASTA file {fasta_file}")
+    record_dict: Dict[str, Tuple[int, str]] = {}
+    with fasta_file.open("r") as f:
+        for rec in SeqIO.parse(f, "fasta"):
+            seq = str(rec.seq).upper()
+            record_dict[rec.id] = (len(seq), seq)
+    return record_dict
+
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+    """
+    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+    Parameters:
+        fasta_file (str): Path to the converted reference FASTA.
+        modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+    Returns:
+        dict: Dictionary where keys are **both unconverted & converted record names**.
+              Values contain:
+              [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+    """
+    unconverted = conversions[0]
+    record_dict = {}
+
+    # Define base mapping based on modification type
+    base_mappings = {
+        '5mC': ('C', 'G'),  # Cytosine and Guanine
+        '6mA': ('A', 'T')   # Adenine and Thymine
+    }
+
+    # Read FASTA file and process records
+    with open(fasta_file, "r") as f:
+        for record in SeqIO.parse(f, "fasta"):
+            if unconverted in record.id or deaminase_footprinting:
+                sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
+                sequence_length = len(sequence)
+
+                # Unconverted case: store the full sequence without coordinate filtering
+                if modification_type == unconverted:
+                    record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                # Process converted records: extract modified base positions
+                elif modification_type in base_mappings:
+                    top_base, bottom_base = base_mappings[modification_type]
+                    seq_array = np.array(list(sequence))
+                    top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                    bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                    record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                else:
+                    raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+    return record_dict
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])    # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")