smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,7 +1,17 @@
|
|
|
1
1
|
import subprocess
|
|
2
|
-
from pathlib import Path
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
|
|
4
|
+
def canoncall(
|
|
5
|
+
model_dir,
|
|
6
|
+
model,
|
|
7
|
+
pod5_dir,
|
|
8
|
+
barcode_kit,
|
|
9
|
+
bam,
|
|
10
|
+
bam_suffix,
|
|
11
|
+
barcode_both_ends=True,
|
|
12
|
+
trim=False,
|
|
13
|
+
device="auto",
|
|
14
|
+
):
|
|
5
15
|
"""
|
|
6
16
|
Wrapper function for dorado canonical base calling.
|
|
7
17
|
|
|
@@ -15,13 +25,24 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
|
|
|
15
25
|
barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
|
|
16
26
|
trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
|
|
17
27
|
device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
|
|
18
|
-
|
|
28
|
+
|
|
19
29
|
Returns:
|
|
20
30
|
None
|
|
21
31
|
Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
|
|
22
32
|
"""
|
|
23
33
|
output = bam + bam_suffix
|
|
24
|
-
command = [
|
|
34
|
+
command = [
|
|
35
|
+
"dorado",
|
|
36
|
+
"basecaller",
|
|
37
|
+
"--models-directory",
|
|
38
|
+
model_dir,
|
|
39
|
+
"--kit-name",
|
|
40
|
+
barcode_kit,
|
|
41
|
+
"--device",
|
|
42
|
+
device,
|
|
43
|
+
"--batchsize",
|
|
44
|
+
"0",
|
|
45
|
+
]
|
|
25
46
|
if barcode_both_ends:
|
|
26
47
|
command.append("--barcode-both-ends")
|
|
27
48
|
if not trim:
|
|
@@ -32,7 +53,19 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
|
|
|
32
53
|
with open(output, "w") as outfile:
|
|
33
54
|
subprocess.run(command, stdout=outfile)
|
|
34
55
|
|
|
35
|
-
|
|
56
|
+
|
|
57
|
+
def modcall(
|
|
58
|
+
model_dir,
|
|
59
|
+
model,
|
|
60
|
+
pod5_dir,
|
|
61
|
+
barcode_kit,
|
|
62
|
+
mod_list,
|
|
63
|
+
bam,
|
|
64
|
+
bam_suffix,
|
|
65
|
+
barcode_both_ends=True,
|
|
66
|
+
trim=False,
|
|
67
|
+
device="auto",
|
|
68
|
+
):
|
|
36
69
|
"""
|
|
37
70
|
Wrapper function for dorado modified base calling.
|
|
38
71
|
|
|
@@ -47,14 +80,23 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
|
|
|
47
80
|
barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
|
|
48
81
|
trim (bool): Whether to trim barcodes, adapters, and primers from read ends
|
|
49
82
|
device (str): Device to use for basecalling. auto, metal, cpu, cuda.
|
|
50
|
-
|
|
83
|
+
|
|
51
84
|
Returns:
|
|
52
85
|
None
|
|
53
86
|
Outputs a BAM file holding the modified base calls output by the dorado basecaller.
|
|
54
87
|
"""
|
|
55
88
|
import subprocess
|
|
89
|
+
|
|
56
90
|
output = bam + bam_suffix
|
|
57
|
-
command = [
|
|
91
|
+
command = [
|
|
92
|
+
"dorado",
|
|
93
|
+
"basecaller",
|
|
94
|
+
"--models-directory",
|
|
95
|
+
model_dir,
|
|
96
|
+
"--kit-name",
|
|
97
|
+
barcode_kit,
|
|
98
|
+
"--modified-bases",
|
|
99
|
+
]
|
|
58
100
|
command += mod_list
|
|
59
101
|
command += ["--device", device, "--batchsize", "0"]
|
|
60
102
|
if barcode_both_ends:
|
|
@@ -62,6 +104,6 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
|
|
|
62
104
|
if not trim:
|
|
63
105
|
command.append("--no-trim")
|
|
64
106
|
command += [model, pod5_dir]
|
|
65
|
-
print(f
|
|
107
|
+
print(f"Running: {' '.join(command)}")
|
|
66
108
|
with open(output, "w") as outfile:
|
|
67
|
-
subprocess.run(command, stdout=outfile)
|
|
109
|
+
subprocess.run(command, stdout=outfile)
|
|
@@ -1,20 +1,22 @@
|
|
|
1
|
-
|
|
1
|
+
import concurrent.futures
|
|
2
2
|
import os
|
|
3
|
-
import
|
|
4
|
-
from
|
|
5
|
-
import pysam
|
|
6
|
-
import pybedtools
|
|
7
|
-
import pyBigWig
|
|
3
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
4
|
+
from pathlib import Path
|
|
8
5
|
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
9
7
|
import numpy as np
|
|
10
8
|
import pandas as pd
|
|
11
|
-
import
|
|
12
|
-
|
|
9
|
+
import pybedtools
|
|
10
|
+
import pyBigWig
|
|
11
|
+
import pysam
|
|
13
12
|
|
|
14
|
-
|
|
13
|
+
from smftools.logging_utils import get_logger
|
|
15
14
|
|
|
16
15
|
from ..readwrite import make_dirs
|
|
17
16
|
|
|
17
|
+
logger = get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
18
20
|
def _bed_to_bigwig(fasta: str, bed: str) -> str:
|
|
19
21
|
"""
|
|
20
22
|
BED → bedGraph → bigWig
|
|
@@ -33,14 +35,14 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
|
|
|
33
35
|
bigwig = parent / f"{stem}.bw"
|
|
34
36
|
|
|
35
37
|
# 1) Compute coverage → bedGraph
|
|
36
|
-
|
|
38
|
+
logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
|
|
37
39
|
bt = pybedtools.BedTool(str(bed))
|
|
38
40
|
# bedtools genomecov -bg
|
|
39
41
|
coverage = bt.genome_coverage(bg=True, genome=str(fai))
|
|
40
42
|
coverage.saveas(str(bedgraph))
|
|
41
43
|
|
|
42
44
|
# 2) Convert bedGraph → BigWig via pyBigWig
|
|
43
|
-
|
|
45
|
+
logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
|
|
44
46
|
|
|
45
47
|
# read chrom sizes from the FASTA .fai index
|
|
46
48
|
chrom_sizes = {}
|
|
@@ -61,9 +63,10 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
|
|
|
61
63
|
|
|
62
64
|
bw.close()
|
|
63
65
|
|
|
64
|
-
|
|
66
|
+
logger.debug(f"BigWig written: {bigwig}")
|
|
65
67
|
return str(bigwig)
|
|
66
68
|
|
|
69
|
+
|
|
67
70
|
def _plot_bed_histograms(
|
|
68
71
|
bed_file,
|
|
69
72
|
plotting_directory,
|
|
@@ -71,9 +74,9 @@ def _plot_bed_histograms(
|
|
|
71
74
|
*,
|
|
72
75
|
bins=60,
|
|
73
76
|
clip_quantiles=(0.0, 0.995),
|
|
74
|
-
cov_bin_size=1000,
|
|
75
|
-
rows_per_fig=6,
|
|
76
|
-
include_mapq_quality=True,
|
|
77
|
+
cov_bin_size=1000, # coverage bin size in bp
|
|
78
|
+
rows_per_fig=6, # paginate if many chromosomes
|
|
79
|
+
include_mapq_quality=True, # add MAPQ + avg read quality columns to grid
|
|
77
80
|
coordinate_mode="one_based", # "one_based" (your BED-like) or "zero_based"
|
|
78
81
|
):
|
|
79
82
|
"""
|
|
@@ -113,19 +116,30 @@ def _plot_bed_histograms(
|
|
|
113
116
|
os.makedirs(plotting_directory, exist_ok=True)
|
|
114
117
|
|
|
115
118
|
bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
|
|
116
|
-
|
|
119
|
+
logger.debug(f"[plot_bed_histograms] Loading: {bed_file}")
|
|
117
120
|
|
|
118
121
|
# Load BED-like table
|
|
119
|
-
cols = [
|
|
120
|
-
df = pd.read_csv(
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
122
|
+
cols = ["chrom", "start", "end", "read_len", "qname", "mapq", "avg_q"]
|
|
123
|
+
df = pd.read_csv(
|
|
124
|
+
bed_file,
|
|
125
|
+
sep="\t",
|
|
126
|
+
header=None,
|
|
127
|
+
names=cols,
|
|
128
|
+
dtype={
|
|
129
|
+
"chrom": str,
|
|
130
|
+
"start": int,
|
|
131
|
+
"end": int,
|
|
132
|
+
"read_len": int,
|
|
133
|
+
"qname": str,
|
|
134
|
+
"mapq": float,
|
|
135
|
+
"avg_q": float,
|
|
136
|
+
},
|
|
137
|
+
)
|
|
124
138
|
|
|
125
139
|
# Drop unaligned records (chrom == '*') if present
|
|
126
|
-
df = df[df[
|
|
140
|
+
df = df[df["chrom"] != "*"].copy()
|
|
127
141
|
if df.empty:
|
|
128
|
-
|
|
142
|
+
logger.debug("[plot_bed_histograms] No aligned reads found; nothing to plot.")
|
|
129
143
|
return
|
|
130
144
|
|
|
131
145
|
# Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
|
|
@@ -135,15 +149,16 @@ def _plot_bed_histograms(
|
|
|
135
149
|
|
|
136
150
|
if coordinate_mode == "one_based":
|
|
137
151
|
# convert to 0-based half-open [start0, end0)
|
|
138
|
-
start0 = df[
|
|
139
|
-
end0
|
|
152
|
+
start0 = df["start"].to_numpy() - 1
|
|
153
|
+
end0 = df["end"].to_numpy() # inclusive in input -> +1 already handled by not subtracting
|
|
140
154
|
else:
|
|
141
155
|
# already 0-based half-open (assumption)
|
|
142
|
-
start0 = df[
|
|
143
|
-
end0
|
|
156
|
+
start0 = df["start"].to_numpy()
|
|
157
|
+
end0 = df["end"].to_numpy()
|
|
144
158
|
|
|
145
159
|
# Clip helper for hist tails
|
|
146
160
|
def _clip_series(s, q=(0.0, 0.995)):
|
|
161
|
+
"""Clip a Series to quantile bounds for plotting."""
|
|
147
162
|
if q is None:
|
|
148
163
|
return s.to_numpy()
|
|
149
164
|
lo = s.quantile(q[0]) if q[0] is not None else s.min()
|
|
@@ -157,42 +172,42 @@ def _plot_bed_histograms(
|
|
|
157
172
|
ref_lengths = dict(zip(ref_names, fa.lengths))
|
|
158
173
|
|
|
159
174
|
# Keep only chroms present in FASTA and with at least one read
|
|
160
|
-
chroms = [c for c in df[
|
|
175
|
+
chroms = [c for c in df["chrom"].unique() if c in ref_lengths]
|
|
161
176
|
# Order chromosomes by FASTA order
|
|
162
177
|
chrom_order = [c for c in ref_names if c in chroms]
|
|
163
178
|
|
|
164
179
|
if not chrom_order:
|
|
165
|
-
|
|
180
|
+
logger.debug(
|
|
181
|
+
"[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting."
|
|
182
|
+
)
|
|
166
183
|
return
|
|
167
184
|
|
|
168
185
|
# Pagination
|
|
169
186
|
def _sanitize(name: str) -> str:
|
|
187
|
+
"""Sanitize a string for use in filenames."""
|
|
170
188
|
return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
|
|
171
189
|
|
|
172
190
|
cols_per_fig = 4 if include_mapq_quality else 2
|
|
173
191
|
|
|
174
192
|
for start_idx in range(0, len(chrom_order), rows_per_fig):
|
|
175
|
-
chunk = chrom_order[start_idx:start_idx + rows_per_fig]
|
|
193
|
+
chunk = chrom_order[start_idx : start_idx + rows_per_fig]
|
|
176
194
|
nrows = len(chunk)
|
|
177
195
|
ncols = cols_per_fig
|
|
178
196
|
|
|
179
197
|
fig, axes = plt.subplots(
|
|
180
|
-
nrows=nrows, ncols=ncols,
|
|
181
|
-
figsize=(4.0 * ncols, 2.6 * nrows),
|
|
182
|
-
dpi=160,
|
|
183
|
-
squeeze=False
|
|
198
|
+
nrows=nrows, ncols=ncols, figsize=(4.0 * ncols, 2.6 * nrows), dpi=160, squeeze=False
|
|
184
199
|
)
|
|
185
200
|
|
|
186
201
|
for r, chrom in enumerate(chunk):
|
|
187
202
|
chrom_len = ref_lengths[chrom]
|
|
188
|
-
mask =
|
|
203
|
+
mask = df["chrom"].to_numpy() == chrom
|
|
189
204
|
|
|
190
205
|
# Slice per-chrom arrays for speed
|
|
191
206
|
s0 = start0[mask]
|
|
192
207
|
e0 = end0[mask]
|
|
193
|
-
len_arr = df.loc[mask,
|
|
194
|
-
mapq_arr = df.loc[mask,
|
|
195
|
-
q_arr = df.loc[mask,
|
|
208
|
+
len_arr = df.loc[mask, "read_len"]
|
|
209
|
+
mapq_arr = df.loc[mask, "mapq"]
|
|
210
|
+
q_arr = df.loc[mask, "avg_q"]
|
|
196
211
|
|
|
197
212
|
# --- Col 1: Read length histogram (clipped) ---
|
|
198
213
|
ax = axes[r, 0]
|
|
@@ -222,7 +237,7 @@ def _plot_bed_histograms(
|
|
|
222
237
|
|
|
223
238
|
# Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
|
|
224
239
|
for lo, hi in zip(b_lo, b_hi):
|
|
225
|
-
cov[lo:hi + 1] += 1
|
|
240
|
+
cov[lo : hi + 1] += 1
|
|
226
241
|
|
|
227
242
|
x_mid = (edges[:-1] + edges[1:]) / 2.0
|
|
228
243
|
ax.plot(x_mid, cov)
|
|
@@ -237,7 +252,12 @@ def _plot_bed_histograms(
|
|
|
237
252
|
# --- Col 3: MAPQ ---
|
|
238
253
|
ax = axes[r, 2]
|
|
239
254
|
# Clip MAPQ upper tail if needed (usually 60)
|
|
240
|
-
ax.hist(
|
|
255
|
+
ax.hist(
|
|
256
|
+
_clip_series(mapq_arr.fillna(0), clip_quantiles),
|
|
257
|
+
bins=bins,
|
|
258
|
+
edgecolor="black",
|
|
259
|
+
alpha=0.7,
|
|
260
|
+
)
|
|
241
261
|
if r == 0:
|
|
242
262
|
ax.set_title("MAPQ")
|
|
243
263
|
ax.set_xlabel("MAPQ")
|
|
@@ -245,7 +265,12 @@ def _plot_bed_histograms(
|
|
|
245
265
|
|
|
246
266
|
# --- Col 4: Avg base quality ---
|
|
247
267
|
ax = axes[r, 3]
|
|
248
|
-
ax.hist(
|
|
268
|
+
ax.hist(
|
|
269
|
+
_clip_series(q_arr.fillna(np.nan), clip_quantiles),
|
|
270
|
+
bins=bins,
|
|
271
|
+
edgecolor="black",
|
|
272
|
+
alpha=0.7,
|
|
273
|
+
)
|
|
249
274
|
if r == 0:
|
|
250
275
|
ax.set_title("Avg base qual")
|
|
251
276
|
ax.set_xlabel("Phred")
|
|
@@ -254,7 +279,8 @@ def _plot_bed_histograms(
|
|
|
254
279
|
fig.suptitle(
|
|
255
280
|
f"{bed_basename} — per-chromosome QC "
|
|
256
281
|
f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
|
|
257
|
-
y=0.995,
|
|
282
|
+
y=0.995,
|
|
283
|
+
fontsize=11,
|
|
258
284
|
)
|
|
259
285
|
fig.tight_layout(rect=[0, 0, 1, 0.98])
|
|
260
286
|
|
|
@@ -263,7 +289,8 @@ def _plot_bed_histograms(
|
|
|
263
289
|
plt.savefig(out_png, bbox_inches="tight")
|
|
264
290
|
plt.close(fig)
|
|
265
291
|
|
|
266
|
-
|
|
292
|
+
logger.debug("[plot_bed_histograms] Done.")
|
|
293
|
+
|
|
267
294
|
|
|
268
295
|
def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
269
296
|
"""
|
|
@@ -287,9 +314,9 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
287
314
|
bed_dir = out_dir / "beds"
|
|
288
315
|
make_dirs([plotting_dir, bed_dir])
|
|
289
316
|
|
|
290
|
-
bed_output = bed_dir /
|
|
317
|
+
bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
|
|
291
318
|
|
|
292
|
-
|
|
319
|
+
logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
|
|
293
320
|
|
|
294
321
|
with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
|
|
295
322
|
for read in bam.fetch(until_eof=True):
|
|
@@ -317,20 +344,24 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
317
344
|
|
|
318
345
|
out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
|
|
319
346
|
|
|
320
|
-
|
|
347
|
+
logger.debug(f"BED-like file created: {bed_output}")
|
|
321
348
|
|
|
322
349
|
def split_bed(bed):
|
|
323
350
|
"""Splits into aligned and unaligned reads (chrom == '*')."""
|
|
324
351
|
bed = str(bed)
|
|
325
352
|
aligned = bed.replace(".bed", "_aligned.bed")
|
|
326
353
|
unaligned = bed.replace(".bed", "_unaligned.bed")
|
|
327
|
-
with
|
|
354
|
+
with (
|
|
355
|
+
open(bed, "r") as infile,
|
|
356
|
+
open(aligned, "w") as aligned_out,
|
|
357
|
+
open(unaligned, "w") as unaligned_out,
|
|
358
|
+
):
|
|
328
359
|
for line in infile:
|
|
329
360
|
(unaligned_out if line.startswith("*\t") else aligned_out).write(line)
|
|
330
361
|
os.remove(bed)
|
|
331
362
|
return aligned
|
|
332
363
|
|
|
333
|
-
|
|
364
|
+
logger.debug(f"Splitting: {bed_output}")
|
|
334
365
|
aligned_bed = split_bed(bed_output)
|
|
335
366
|
|
|
336
367
|
with ProcessPoolExecutor() as executor:
|
|
@@ -340,7 +371,8 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
|
|
|
340
371
|
futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
|
|
341
372
|
concurrent.futures.wait(futures)
|
|
342
373
|
|
|
343
|
-
|
|
374
|
+
logger.debug("Processing completed successfully.")
|
|
375
|
+
|
|
344
376
|
|
|
345
377
|
def extract_read_lengths_from_bed(file_path):
|
|
346
378
|
"""
|
|
@@ -352,15 +384,16 @@ def extract_read_lengths_from_bed(file_path):
|
|
|
352
384
|
read_dict (dict)
|
|
353
385
|
"""
|
|
354
386
|
import pandas as pd
|
|
355
|
-
|
|
356
|
-
|
|
387
|
+
|
|
388
|
+
columns = ["chrom", "start", "end", "length", "name"]
|
|
389
|
+
df = pd.read_csv(file_path, sep="\t", header=None, names=columns, comment="#")
|
|
357
390
|
read_dict = {}
|
|
358
391
|
for _, row in df.iterrows():
|
|
359
|
-
chrom = row[
|
|
360
|
-
start = row[
|
|
361
|
-
end = row[
|
|
362
|
-
name = row[
|
|
363
|
-
length = row[
|
|
392
|
+
chrom = row["chrom"]
|
|
393
|
+
start = row["start"]
|
|
394
|
+
end = row["end"]
|
|
395
|
+
name = row["name"]
|
|
396
|
+
length = row["length"]
|
|
364
397
|
read_dict[name] = length
|
|
365
398
|
|
|
366
|
-
return read_dict
|
|
399
|
+
return read_dict
|
|
@@ -1,4 +1,13 @@
|
|
|
1
|
-
def binarize_converted_base_identities(
|
|
1
|
+
def binarize_converted_base_identities(
|
|
2
|
+
base_identities,
|
|
3
|
+
strand,
|
|
4
|
+
modification_type,
|
|
5
|
+
bam,
|
|
6
|
+
device="cpu",
|
|
7
|
+
deaminase_footprinting=False,
|
|
8
|
+
mismatch_trend_per_read={},
|
|
9
|
+
on_missing="nan",
|
|
10
|
+
):
|
|
2
11
|
"""
|
|
3
12
|
Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
|
|
4
13
|
|
|
@@ -10,7 +19,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
|
|
|
10
19
|
deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
|
|
11
20
|
mismatch_trend_per_read (dict): For deaminase footprinting, indicates the type of conversion relative to the top strand reference for each read. (C->T or G->A if bottom strand was converted)
|
|
12
21
|
on_missing (str): Error handling if a read is missing
|
|
13
|
-
|
|
22
|
+
|
|
14
23
|
Returns:
|
|
15
24
|
dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
|
|
16
25
|
If deaminase_footprinting, 1 represents deaminated sites, while 0 represents non-deaminated sites.
|
|
@@ -64,14 +73,16 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
|
|
|
64
73
|
|
|
65
74
|
# Non-deaminase mapping (bisulfite-style for 5mC; 6mA mapping is protocol dependent)
|
|
66
75
|
bin_maps = {
|
|
67
|
-
("top", "5mC"):
|
|
76
|
+
("top", "5mC"): {"C": 1.0, "T": 0.0},
|
|
68
77
|
("bottom", "5mC"): {"G": 1.0, "A": 0.0},
|
|
69
|
-
("top", "6mA"):
|
|
78
|
+
("top", "6mA"): {"A": 1.0, "G": 0.0},
|
|
70
79
|
("bottom", "6mA"): {"T": 1.0, "C": 0.0},
|
|
71
80
|
}
|
|
72
81
|
key = (strand, modification_type)
|
|
73
82
|
if key not in bin_maps:
|
|
74
|
-
raise ValueError(
|
|
83
|
+
raise ValueError(
|
|
84
|
+
f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'"
|
|
85
|
+
)
|
|
75
86
|
|
|
76
87
|
base_map = bin_maps[key]
|
|
77
88
|
|
|
@@ -110,7 +121,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
|
|
|
110
121
|
# binarized_base_identities[key] = binarized
|
|
111
122
|
|
|
112
123
|
# return binarized_base_identities
|
|
113
|
-
|
|
124
|
+
|
|
114
125
|
# else:
|
|
115
126
|
# binarization_maps = {
|
|
116
127
|
# ('top', '5mC'): {'C': 1, 'T': 0},
|
|
@@ -152,7 +163,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
|
|
|
152
163
|
|
|
153
164
|
# # Fetch the appropriate mapping
|
|
154
165
|
# base_map = binarization_maps[(strand, modification_type)]
|
|
155
|
-
|
|
166
|
+
|
|
156
167
|
# # Convert mapping to tensor
|
|
157
168
|
# base_keys = list(base_map.keys())
|
|
158
169
|
# base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# complement_base_list
|
|
2
2
|
|
|
3
|
+
|
|
3
4
|
def complement_base_list(sequence):
|
|
4
5
|
"""
|
|
5
6
|
Takes a list of DNA base identities and returns their complement.
|
|
@@ -11,11 +12,11 @@ def complement_base_list(sequence):
|
|
|
11
12
|
complement (list): A list of complementary DNA bases.
|
|
12
13
|
"""
|
|
13
14
|
complement_mapping = {
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
15
|
+
"A": "T",
|
|
16
|
+
"T": "A",
|
|
17
|
+
"C": "G",
|
|
18
|
+
"G": "C",
|
|
19
|
+
"N": "N", # Handling ambiguous bases like 'N'
|
|
19
20
|
}
|
|
20
21
|
|
|
21
|
-
return [complement_mapping[base] for base in sequence]
|
|
22
|
+
return [complement_mapping[base] for base in sequence]
|