smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ b/smftools/informatics/basecalling.py
@@ -0,0 +1,67 @@
+import subprocess
+from pathlib import Path
+
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+    """
+    Wrapper function for dorado canonical base calling.
+
+    Parameters:
+        model_dir (str): File path to the dorado basecalling model directory.
+        model (str): The dorado basecalling model to use.
+        pod5_dir (str): File path to the experiment directory containing the POD5 files.
+        barcode_kit (str): The barcoding kit used in the experiment.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. Defaults to 'auto', which autodetects the device; can also be 'metal', 'cpu', or 'cuda'.
+
+    Returns:
+        None
+        Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+    """
+    output = bam + bam_suffix
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
+    command_string = " ".join(command)
+    print(f"Running {command_string}\n to generate {output}")
+    with open(output, "w") as outfile:
+        subprocess.run(command, stdout=outfile)
+
+def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+    """
+    Wrapper function for dorado modified base calling.
+
+    Parameters:
+        model_dir (str): File path to the dorado basecalling model directory.
+        model (str): The dorado basecalling model to use.
+        pod5_dir (str): File path to the experiment directory containing the POD5 files.
+        barcode_kit (str): The barcoding kit used in the experiment.
+        mod_list (list): A list of modification types to use in the analysis.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): Device to use for basecalling. One of 'auto', 'metal', 'cpu', 'cuda'.
+
+    Returns:
+        None
+        Outputs a BAM file holding the modified base calls output by the dorado basecaller.
+    """
+    import subprocess
+    output = bam + bam_suffix
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+    command += mod_list
+    command += ["--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
+    print(f'Running: {" ".join(command)}')
+    with open(output, "w") as outfile:
+        subprocess.run(command, stdout=outfile)
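For orientation, a minimal usage sketch of the new `canoncall` wrapper. Only the signature comes from the diff above; every argument value (paths, model name, kit name) is illustrative:

```python
from smftools.informatics.basecalling import canoncall  # module path per the file list above

# Illustrative values only. The wrapper writes dorado's stdout BAM to
# bam + bam_suffix, i.e. "runs/exp1/calls.bam" here.
canoncall(
    model_dir="/opt/dorado/models",              # hypothetical model directory
    model="dna_r10.4.1_e8.2_400bps_sup@v5.0.0",  # hypothetical dorado model
    pod5_dir="runs/exp1/pod5",
    barcode_kit="SQK-NBD114-24",                 # hypothetical barcoding kit
    bam="runs/exp1/calls",
    bam_suffix=".bam",
    barcode_both_ends=True,  # passes --barcode-both-ends
    trim=False,              # passes --no-trim
    device="auto",
)
```

`modcall` behaves the same way but inserts `--modified-bases` plus the entries of `mod_list` into the dorado command.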
--- /dev/null
+++ b/smftools/informatics/bed_functions.py
@@ -0,0 +1,366 @@
+from pathlib import Path
+import os
+import subprocess
+from typing import List, Optional, Union
+import pysam
+import pybedtools
+import pyBigWig
+
+import numpy as np
+import pandas as pd
+import concurrent.futures
+from concurrent.futures import ProcessPoolExecutor
+
+import matplotlib.pyplot as plt
+
+from ..readwrite import make_dirs
+
+def _bed_to_bigwig(fasta: str, bed: str) -> str:
+    """
+    BED → bedGraph → bigWig
+    Requires:
+      - FASTA must have a .fai index present
+    """
+
+    bed = Path(bed)
+    fa = Path(fasta)  # path to .fa
+    parent = bed.parent
+    stem = bed.stem
+    fa_stem = fa.stem
+    fai = parent / f"{fa_stem}.fai"
+
+    bedgraph = parent / f"{stem}.bedgraph"
+    bigwig = parent / f"{stem}.bw"
+
+    # 1) Compute coverage → bedGraph
+    print(f"[pybedtools] generating coverage bedgraph from {bed}")
+    bt = pybedtools.BedTool(str(bed))
+    # bedtools genomecov -bg
+    coverage = bt.genome_coverage(bg=True, genome=str(fai))
+    coverage.saveas(str(bedgraph))
+
+    # 2) Convert bedGraph → BigWig via pyBigWig
+    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+    # read chrom sizes from the FASTA .fai index
+    chrom_sizes = {}
+    with open(fai) as f:
+        for line in f:
+            fields = line.strip().split("\t")
+            chrom = fields[0]
+            size = int(fields[1])
+            chrom_sizes[chrom] = size
+
+    bw = pyBigWig.open(str(bigwig), "w")
+    bw.addHeader(list(chrom_sizes.items()))
+
+    with open(bedgraph) as f:
+        for line in f:
+            chrom, start, end, coverage = line.strip().split()
+            bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+    bw.close()
+
+    print(f"BigWig written: {bigwig}")
+    return str(bigwig)
+
+def _plot_bed_histograms(
+    bed_file,
+    plotting_directory,
+    fasta,
+    *,
+    bins=60,
+    clip_quantiles=(0.0, 0.995),
+    cov_bin_size=1000,            # coverage bin size in bp
+    rows_per_fig=6,               # paginate if many chromosomes
+    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    coordinate_mode="one_based",  # "one_based" (the BED-like files written here) or "zero_based"
+):
+    """
+    Plot per-chromosome QC grids from a BED-like file.
+
+    Expects columns:
+        chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+    For each chromosome:
+      - Column 1: Read length histogram
+      - Column 2: Coverage across the chromosome (binned)
+      - (optional) Column 3: MAPQ histogram
+      - (optional) Column 4: Avg base quality histogram
+
+    The figure is paginated: rows = chromosomes (up to rows_per_fig); columns depend on include_mapq_quality.
+    Saves one PNG per page under `plotting_directory`.
+
+    Parameters
+    ----------
+    bed_file : str
+    plotting_directory : str
+    fasta : str
+        Reference FASTA (used to get chromosome lengths).
+    bins : int
+        Histogram bins for read length / MAPQ / quality.
+    clip_quantiles : (float, float)
+        Clip hist tails for readability (e.g., (0, 0.995)).
+    cov_bin_size : int
+        Bin size (bp) for the coverage plot; bigger = faster/coarser.
+    rows_per_fig : int
+        Number of chromosomes per page.
+    include_mapq_quality : bool
+        If True, add MAPQ and avg base quality histograms as extra columns.
+    coordinate_mode : {"one_based","zero_based"}
+        One-based, inclusive (as written by aligned_BAM_to_bed) vs BED-standard zero-based, half-open.
+    """
+    os.makedirs(plotting_directory, exist_ok=True)
+
+    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+    print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+    # Load BED-like table
+    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+        'mapq': float, 'avg_q': float
+    })
+
+    # Drop unaligned records (chrom == '*') if present
+    df = df[df['chrom'] != '*'].copy()
+    if df.empty:
+        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        return
+
+    # Keep the coordinate mode consistent; convert to 0-based half-open for bin math internally.
+    # Input is typically one_based inclusive (as written by aligned_BAM_to_bed).
+    if coordinate_mode not in {"one_based", "zero_based"}:
+        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+    if coordinate_mode == "one_based":
+        # convert to 0-based half-open [start0, end0)
+        start0 = df['start'].to_numpy() - 1
+        end0 = df['end'].to_numpy()  # a 1-based inclusive end equals the 0-based exclusive end, so no shift
+    else:
+        # already 0-based half-open (assumption)
+        start0 = df['start'].to_numpy()
+        end0 = df['end'].to_numpy()
+
+    # Clip helper for hist tails
+    def _clip_series(s, q=(0.0, 0.995)):
+        if q is None:
+            return s.to_numpy()
+        lo = s.quantile(q[0]) if q[0] is not None else s.min()
+        hi = s.quantile(q[1]) if q[1] is not None else s.max()
+        x = s.to_numpy(dtype=float)
+        return np.clip(x, lo, hi)
+
+    # Load chromosome order/lengths from FASTA
+    with pysam.FastaFile(fasta) as fa:
+        ref_names = list(fa.references)
+        ref_lengths = dict(zip(ref_names, fa.lengths))
+
+    # Keep only chroms present in FASTA and with at least one read
+    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    # Order chromosomes by FASTA order
+    chrom_order = [c for c in ref_names if c in chroms]
+
+    if not chrom_order:
+        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        return
+
+    # Pagination
+    def _sanitize(name: str) -> str:
+        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+    cols_per_fig = 4 if include_mapq_quality else 2
+
+    for start_idx in range(0, len(chrom_order), rows_per_fig):
+        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        nrows = len(chunk)
+        ncols = cols_per_fig
+
+        fig, axes = plt.subplots(
+            nrows=nrows, ncols=ncols,
+            figsize=(4.0 * ncols, 2.6 * nrows),
+            dpi=160,
+            squeeze=False
+        )
+
+        for r, chrom in enumerate(chunk):
+            chrom_len = ref_lengths[chrom]
+            mask = (df['chrom'].to_numpy() == chrom)
+
+            # Slice per-chrom arrays for speed
+            s0 = start0[mask]
+            e0 = end0[mask]
+            len_arr = df.loc[mask, 'read_len']
+            mapq_arr = df.loc[mask, 'mapq']
+            q_arr = df.loc[mask, 'avg_q']
+
+            # --- Col 1: Read length histogram (clipped) ---
+            ax = axes[r, 0]
+            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            if r == 0:
+                ax.set_title("Read length")
+            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+            ax.set_xlabel("bp")
+            ax.grid(alpha=0.25)
+
+            # --- Col 2: Coverage (binned over genome) ---
+            ax = axes[r, 1]
+            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+            # Bin edges in 0-based coords
+            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+            # Compute per-bin "read count coverage": number of reads overlapping each bin.
+            # Approximate by incrementing all bins touched by the interval.
+            # (Fast and memory-light; for exact base coverage use a smaller cov_bin_size.)
+            cov = np.zeros(nb, dtype=np.int32)
+            # bin indices overlapped by each read (0-based half-open)
+            b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+            b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+            # ensure valid ordering
+            b_lo = np.minimum(b0, b1)
+            b_hi = np.maximum(b0, b1)
+
+            # Increment all bins in range; a loop, but at bin resolution (fast for reasonable cov_bin_size).
+            for lo, hi in zip(b_lo, b_hi):
+                cov[lo:hi + 1] += 1
+
+            x_mid = (edges[:-1] + edges[1:]) / 2.0
+            ax.plot(x_mid, cov)
+            if r == 0:
+                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+            ax.set_xlim(0, chrom_len)
+            ax.set_xlabel("Position (bp)")
+            ax.set_ylabel("")  # chrom is already shown on col 1
+            ax.grid(alpha=0.25)
+
+            if include_mapq_quality:
+                # --- Col 3: MAPQ ---
+                ax = axes[r, 2]
+                # Clip MAPQ upper tail if needed (usually 60)
+                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("MAPQ")
+                ax.set_xlabel("MAPQ")
+                ax.grid(alpha=0.25)
+
+                # --- Col 4: Avg base quality ---
+                ax = axes[r, 3]
+                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("Avg base qual")
+                ax.set_xlabel("Phred")
+                ax.grid(alpha=0.25)
+
+        fig.suptitle(
+            f"{bed_basename} — per-chromosome QC "
+            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+            y=0.995, fontsize=11
+        )
+        fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+        page = start_idx // rows_per_fig + 1
+        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+        plt.savefig(out_png, bbox_inches="tight")
+        plt.close(fig)
+
+    print("[plot_bed_histograms] Done.")
+
+def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+    """
+    Takes an aligned BAM as input and writes a BED-like file of reads as output.
+    Columns are: record name, start position, end position, read length, read name, mapping quality, read quality.
+
+    Parameters:
+        aligned_BAM (Path): Path to an input aligned BAM to extract to a BED file.
+        out_dir (Path): Directory to output files.
+        fasta (str): File path to the reference genome.
+        make_bigwigs (bool): Whether to generate bigwig files.
+        threads (int): Number of threads to use.
+
+    Returns:
+        None
+    """
+    threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+    # Create necessary directories
+    plotting_dir = out_dir / "bed_cov_histograms"
+    bed_dir = out_dir / "beds"
+    make_dirs([plotting_dir, bed_dir])
+
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+        for read in bam.fetch(until_eof=True):
+            if read.is_unmapped:
+                chrom = "*"
+                start1 = 1
+                rl = read.query_length or 0
+                mapq = 0
+            else:
+                chrom = bam.get_reference_name(read.reference_id)
+                # pysam reference_start is 0-based → +1 for a 1-based SAM-like start
+                start1 = int(read.reference_start) + 1
+                rl = read.query_length or 0
+                mapq = int(read.mapping_quality)
+
+            # End position in 1-based inclusive coords
+            end1 = start1 + (rl or 0) - 1
+
+            qname = read.query_name
+            quals = read.query_qualities
+            if quals is None or rl == 0:
+                avg_q = float("nan")
+            else:
+                avg_q = float(np.mean(quals))
+
+            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+    print(f"BED-like file created: {bed_output}")
+
+    def split_bed(bed):
+        """Split the BED into aligned and unaligned reads (chrom == '*')."""
+        bed = str(bed)
+        aligned = bed.replace(".bed", "_aligned.bed")
+        unaligned = bed.replace(".bed", "_unaligned.bed")
+        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+            for line in infile:
+                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+        os.remove(bed)
+        return aligned
+
+    print(f"Splitting: {bed_output}")
+    aligned_bed = split_bed(bed_output)
+
+    with ProcessPoolExecutor() as executor:
+        futures = []
+        futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+        if make_bigwigs:
+            futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+        concurrent.futures.wait(futures)
+
+    print("Processing completed successfully.")
+
+def extract_read_lengths_from_bed(file_path):
+    """
+    Load a dict mapping read names to read lengths.
+
+    Params:
+        file_path (str): file path to a bed file
+    Returns:
+        read_dict (dict)
+    """
+    import pandas as pd
+    columns = ['chrom', 'start', 'end', 'length', 'name']
+    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+    read_dict = {}
+    for _, row in df.iterrows():
+        chrom = row['chrom']
+        start = row['start']
+        end = row['end']
+        name = row['name']
+        length = row['length']
+        read_dict[name] = length
+
+    return read_dict
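A minimal usage sketch for the new `aligned_BAM_to_bed` entry point (argument values illustrative). Two practical details are visible in the code above: `aligned_BAM` and `out_dir` must be `pathlib.Path` objects, since the function uses the `/` operator and `.name` on them directly, and `_bed_to_bigwig` resolves the FASTA's `.fai` index relative to the BED's directory (`out_dir/beds/<fasta_stem>.fai`), so the index must be reachable there when `make_bigwigs=True`:

```python
from pathlib import Path
from smftools.informatics.bed_functions import aligned_BAM_to_bed  # module path per the file list above

aligned_BAM_to_bed(
    aligned_BAM=Path("out/aligned/sample1.bam"),  # illustrative paths
    out_dir=Path("out"),
    fasta="refs/genome.fa",
    make_bigwigs=True,
    threads=4,
)
# Expected outputs: out/beds/sample1_bed_aligned.bed (and _unaligned.bed),
# QC pages under out/bed_cov_histograms/, and, with make_bigwigs,
# out/beds/sample1_bed_aligned.bw.
```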
--- a/smftools/informatics/helpers/converted_BAM_to_adata_II.py
+++ b/smftools/informatics/converted_BAM_to_adata.py
@@ -15,19 +15,19 @@ import shutil
 from pathlib import Path
 from typing import Union, Iterable, Optional
 
-from
+from ..readwrite import make_dirs, safe_write_h5ad
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .
-from .
-from .
-from .make_dirs import make_dirs
-from .ohe_batching import ohe_batching
+from .fasta_functions import find_conversion_sites
+from .bam_functions import count_aligned_reads, extract_base_identities
+from .ohe import ohe_batching
 
 if __name__ == "__main__":
     multiprocessing.set_start_method("forkserver", force=True)
 
-def
+def converted_BAM_to_adata(converted_FASTA,
                            split_dir,
+                           output_dir,
+                           input_already_demuxed,
                            mapping_threshold,
                            experiment_name,
                            conversions,
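Downstream code that imported from the old `helpers` package (now moved under `archived`, per the file list above) needs updated import paths. A sketch of the before/after, with the 0.2.4 module paths inferred from the relative imports in this hunk (verify against the installed wheel):

```python
# smftools 0.2.1
# from smftools.informatics.helpers.make_dirs import make_dirs
# from smftools.informatics.helpers.ohe_batching import ohe_batching

# smftools 0.2.4 (inferred)
from smftools.readwrite import make_dirs
from smftools.informatics.ohe import ohe_batching
from smftools.informatics.fasta_functions import find_conversion_sites
from smftools.informatics.bam_functions import count_aligned_reads, extract_base_identities
```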
@@ -35,20 +35,24 @@ def converted_BAM_to_adata_II(converted_FASTA,
                            device='cpu',
                            num_threads=8,
                            deaminase_footprinting=False,
-                           delete_intermediates=True
+                           delete_intermediates=True,
+                           double_barcoded_path=None,
                            ):
     """
     Converts BAM files into an AnnData object by binarizing modified base identities.
 
     Parameters:
-        converted_FASTA (
-        split_dir (
+        converted_FASTA (Path): Path to the converted FASTA reference.
+        split_dir (Path): Directory containing converted BAM files.
+        output_dir (Path): Path to the output directory.
+        input_already_demuxed (bool): Whether the input reads were already demultiplexed.
         mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
         experiment_name (str): Name for the output AnnData object.
         conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
         bam_suffix (str): File suffix for BAM files.
         num_threads (int): Number of parallel processing threads.
         deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+        double_barcoded_path (Path): Path to the dorado demux summary file for double-ended barcodes.
 
     Returns:
         str: Path to the final AnnData object.
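A sketch of a call under the widened signature. Only the arguments visible in this diff are shown; values are illustrative, and the remaining keyword arguments keep their defaults:

```python
from pathlib import Path

final_adata, final_adata_path = converted_BAM_to_adata(
    converted_FASTA=Path("refs/converted.fa"),  # illustrative paths
    split_dir=Path("out/split_bams"),
    output_dir=Path("out"),                     # new in 0.2.4
    input_already_demuxed=False,                # new in 0.2.4
    mapping_threshold=0.01,
    experiment_name="exp1",
    conversions=['unconverted', '5mC', '6mA'],
    bam_suffix=".bam",
    double_barcoded_path=Path("out/demux"),     # new in 0.2.4; directory holding
)                                               # dorado's barcoding_summary.txt
```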
@@ -63,22 +67,25 @@ def converted_BAM_to_adata_II(converted_FASTA,
     print(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-
-
-    tmp_dir = os.path.join(split_dir, 'tmp')
+    h5_dir = output_dir / 'h5ads'
+    tmp_dir = output_dir / 'tmp'
     final_adata = None
-    final_adata_path =
+    final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'
 
-    if
+    if final_adata_path.exists():
         print(f"{final_adata_path} already exists. Using existing AnnData object.")
         return final_adata, final_adata_path
 
     make_dirs([h5_dir, tmp_dir])
 
-
-
-
-
+    bam_files = sorted(
+        p for p in split_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+    )
+
+    bam_path_list = [split_dir / f for f in bam_files]
     print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
     ## Process Conversion Sites
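The hunk above relocates intermediates from `split_dir` to `output_dir` and makes reruns cheap: if the gzipped AnnData already exists, the function returns early (note that `final_adata` is still `None` on that path). The resulting layout, inferred from this hunk:

```python
# Layout under output_dir after this change (inferred; not spelled out in the diff):
#
#   output_dir/
#   ├── h5ads/
#   │   └── {experiment_name}.h5ad.gz   # final AnnData; if present, the function
#   │                                   # returns (None, path) without reprocessing
#   └── tmp/                            # per-BAM intermediates (removed when
#                                       # delete_intermediates=True)
```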
@@ -90,10 +97,12 @@ def converted_BAM_to_adata_II(converted_FASTA,
     ## Process BAMs in Parallel
     final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
+    final_adata.uns['References'] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
         final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
         final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
         final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+        final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq
 
     final_adata.obs_names_make_unique()
     cols = final_adata.obs.columns
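After this change each reference sequence is recorded twice in `uns`: under the legacy flat key and under the new consolidated `References` mapping. A sketch of reading it back:

```python
chromosome = "my_record"  # illustrative record name

# New consolidated location:
seq = final_adata.uns["References"][f"{chromosome}_FASTA_sequence"]

# Legacy flat key, still written for backward compatibility:
assert seq == final_adata.uns[f"{chromosome}_FASTA_sequence"]
```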
@@ -102,10 +111,13 @@ def converted_BAM_to_adata_II(converted_FASTA,
     for col in cols:
         final_adata.obs[col] = final_adata.obs[col].astype('category')
 
-
-
-
-
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)
 
     ## Delete intermediate h5ad files and temp directories
     if delete_intermediates:
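The new `demux_type` column gives every read a categorical demultiplexing annotation: uniformly "already" for pre-demultiplexed inputs, otherwise derived from dorado's `barcoding_summary.txt` via `add_demux_type_annotation`. A quick inspection sketch:

```python
# Per-category read counts for the new annotation. Note that when
# input_already_demuxed is False, double_barcoded_path must be set, or the
# "/" join above fails on None.
print(final_adata.obs["demux_type"].value_counts())
```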
@@ -211,7 +223,7 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
     adata_list = []
 
     for record in records_to_analyze:
-        sample =
+        sample = bam.stem
         chromosome = record_FASTA_dict[record][2]
         current_length = record_FASTA_dict[record][4]
         mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
@@ -329,13 +341,13 @@ def timestamp():
 def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
     worker_id = current_process().pid  # Get worker process ID
-    sample =
+    sample = bam.stem
 
     try:
         print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")
 
-        h5ad_path =
-        if
+        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+        if h5ad_path.exists():
             print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
             progress_queue.put(sample)
             return
@@ -352,7 +364,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
         adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
         if adata is not None:
-            adata.write_h5ad(h5ad_path)
+            adata.write_h5ad(str(h5ad_path))
             print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")
 
             # Free memory
@@ -367,7 +379,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
 
 def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-
+    make_dirs(h5_dir)  # Ensure h5_dir exists
 
     print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")
 
@@ -403,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
     pool.join()  # Ensure all workers finish
 
     # Final Concatenation Step
-    h5ad_files = [
+    h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
     if not h5ad_files:
         print(f"{timestamp()} No valid H5AD files generated. Exiting.")