smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/basecalling.py
@@ -0,0 +1,67 @@
+ import subprocess
+ from pathlib import Path
+
+ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado canonical base calling.
+
+     Parameters:
+         model_dir (str): A string representing the file path to the dorado basecalling model directory.
+         model (str): A string representing the dorado basecalling model.
+         pod5_dir (str): A string representing the file path to the experiment directory containing the POD5 files.
+         barcode_kit (str): A string representing the barcoding kit used in the experiment.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): The device to use. Defaults to 'auto', which auto-detects the device. Can also specify metal, cpu, or cuda.
+
+     Returns:
+         None
+         Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+     """
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     command_string = " ".join(command)
+     print(f"Running {command_string}\n to generate {output}")
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
+
+ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado modified base calling.
+
+     Parameters:
+         model_dir (str): A string representing the file path to the dorado basecalling model directory.
+         model (str): A string representing the dorado basecalling model.
+         pod5_dir (str): A string representing the file path to the experiment directory containing the POD5 files.
+         barcode_kit (str): A string representing the barcoding kit used in the experiment.
+         mod_list (list): A list of modification types to use in the analysis.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): Device to use for basecalling: auto, metal, cpu, or cuda.
+
+     Returns:
+         None
+         Outputs a BAM file holding the modified base calls output by the dorado basecaller.
+     """
+     import subprocess
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+     command += mod_list
+     command += ["--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     print(f'Running: {" ".join(command)}')
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
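For orientation, a minimal usage sketch of these two wrappers (hypothetical paths, model, and kit name; dorado must be installed and on PATH, and the import path assumes the new smftools/informatics/basecalling.py from the file list above):

    from smftools.informatics.basecalling import canoncall, modcall

    # Canonical basecalling; dorado's stdout is redirected into /data/run1/canonical.bam.
    canoncall(
        model_dir="/data/models",         # hypothetical dorado models directory
        model="hac",                      # hypothetical model name
        pod5_dir="/data/run1/pod5",
        barcode_kit="SQK-NBD114-24",      # hypothetical barcoding kit
        bam="/data/run1/canonical",
        bam_suffix=".bam",
    )

    # Modified basecalling adds --modified-bases followed by the requested mod types.
    modcall(
        model_dir="/data/models",
        model="hac",
        pod5_dir="/data/run1/pod5",
        barcode_kit="SQK-NBD114-24",
        mod_list=["5mC_5hmC", "6mA"],     # hypothetical modification list
        bam="/data/run1/modified",
        bam_suffix=".bam",
    )

Note that both wrappers capture dorado's BAM output by redirecting stdout to the output file, so the bam argument is a prefix and bam_suffix supplies the extension.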
smftools/informatics/bed_functions.py
@@ -0,0 +1,366 @@
+ from pathlib import Path
+ import os
+ import subprocess
+ from typing import List, Optional, Union
+ import pysam
+ import pybedtools
+ import pyBigWig
+
+ import numpy as np
+ import pandas as pd
+ import concurrent.futures
+ from concurrent.futures import ProcessPoolExecutor
+
+ import matplotlib.pyplot as plt
+
+ from ..readwrite import make_dirs
+
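One environment note before the first helper: pybedtools shells out to the bedtools binary, so the coverage step in _bed_to_bigwig below fails at runtime if bedtools is not installed. A minimal guard, as a sketch:

    import shutil

    # pybedtools wraps the bedtools CLI; fail fast with a clear error if it is missing.
    if shutil.which("bedtools") is None:
        raise RuntimeError("bedtools executable not found on PATH (required by pybedtools)")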
+ def _bed_to_bigwig(fasta: str, bed: str) -> str:
+     """
+     BED → bedGraph → bigWig
+     Requires:
+         - FASTA must have .fai index present
+     """
+
+     bed = Path(bed)
+     fa = Path(fasta)  # path to .fa
+     parent = bed.parent
+     stem = bed.stem
+     fa_stem = fa.stem
+     fai = parent / f"{fa_stem}.fai"
+
+     bedgraph = parent / f"{stem}.bedgraph"
+     bigwig = parent / f"{stem}.bw"
+
+     # 1) Compute coverage → bedGraph
+     print(f"[pybedtools] generating coverage bedgraph from {bed}")
+     bt = pybedtools.BedTool(str(bed))
+     # bedtools genomecov -bg
+     coverage = bt.genome_coverage(bg=True, genome=str(fai))
+     coverage.saveas(str(bedgraph))
+
+     # 2) Convert bedGraph → BigWig via pyBigWig
+     print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+     # read chrom sizes from the FASTA .fai index
+     chrom_sizes = {}
+     with open(fai) as f:
+         for line in f:
+             fields = line.strip().split("\t")
+             chrom = fields[0]
+             size = int(fields[1])
+             chrom_sizes[chrom] = size
+
+     bw = pyBigWig.open(str(bigwig), "w")
+     bw.addHeader(list(chrom_sizes.items()))
+
+     with open(bedgraph) as f:
+         for line in f:
+             chrom, start, end, coverage = line.strip().split()
+             bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+     bw.close()
+
+     print(f"BigWig written: {bigwig}")
+     return str(bigwig)
+
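Two observations on the function above: it reads chromosome sizes from a .fai index, and it resolves that index as <bed_parent>/<fasta_stem>.fai (e.g., reference.fai next to the BED) rather than the conventional <fasta>.fai next to the reference. A sketch for preparing the index with pysam (the path is hypothetical):

    import pysam

    # Equivalent to `samtools faidx`; writes reference.fa.fai alongside the FASTA.
    pysam.faidx("reference.fa")
    # _bed_to_bigwig looks for "<fasta stem>.fai" in the BED's directory,
    # so copy or symlink the index there if your layout differs.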
+ def _plot_bed_histograms(
+     bed_file,
+     plotting_directory,
+     fasta,
+     *,
+     bins=60,
+     clip_quantiles=(0.0, 0.995),
+     cov_bin_size=1000,            # coverage bin size in bp
+     rows_per_fig=6,               # paginate if many chromosomes
+     include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+     coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
+ ):
+     """
+     Plot per-chromosome QC grids from a BED-like file.
+
+     Expects columns:
+         chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+     For each chromosome:
+       - Column 1: Read length histogram
+       - Column 2: Coverage across the chromosome (binned)
+       - (optional) Column 3: MAPQ histogram
+       - (optional) Column 4: Avg base quality histogram
+
+     The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
+     Saves one PNG per page under `plotting_directory`.
+
+     Parameters
+     ----------
+     bed_file : str
+     plotting_directory : str
+     fasta : str
+         Reference FASTA (used to get chromosome lengths).
+     bins : int
+         Histogram bins for read length / MAPQ / quality.
+     clip_quantiles : (float, float)
+         Clip hist tails for readability (e.g., (0, 0.995)).
+     cov_bin_size : int
+         Bin size (bp) for coverage plot; bigger = faster/coarser.
+     rows_per_fig : int
+         Number of chromosomes per page.
+     include_mapq_quality : bool
+         If True, add MAPQ and avg base quality histograms as extra columns.
+     coordinate_mode : {"one_based","zero_based"}
+         One-based, inclusive (your file) vs BED-standard zero-based, half-open.
+     """
+     os.makedirs(plotting_directory, exist_ok=True)
+
+     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+     print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+     # Load BED-like table
+     cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+     df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+         'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+         'mapq': float, 'avg_q': float
+     })
+
+     # Drop unaligned records (chrom == '*') if present
+     df = df[df['chrom'] != '*'].copy()
+     if df.empty:
+         print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+         return
+
+     # Ensure coordinate mode is consistent; convert to 0-based half-open for bin math internally.
+     # Input is typically one_based inclusive (from the writer in aligned_BAM_to_bed).
+     if coordinate_mode not in {"one_based", "zero_based"}:
+         raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+     if coordinate_mode == "one_based":
+         # convert to 0-based half-open [start0, end0)
+         start0 = df['start'].to_numpy() - 1
+         end0 = df['end'].to_numpy()  # one-based inclusive end equals half-open end; no shift needed
+     else:
+         # already 0-based half-open (assumption)
+         start0 = df['start'].to_numpy()
+         end0 = df['end'].to_numpy()
+
+     # Clip helper for hist tails
+     def _clip_series(s, q=(0.0, 0.995)):
+         if q is None:
+             return s.to_numpy()
+         lo = s.quantile(q[0]) if q[0] is not None else s.min()
+         hi = s.quantile(q[1]) if q[1] is not None else s.max()
+         x = s.to_numpy(dtype=float)
+         return np.clip(x, lo, hi)
+
+     # Load chromosome order/lengths from FASTA
+     with pysam.FastaFile(fasta) as fa:
+         ref_names = list(fa.references)
+         ref_lengths = dict(zip(ref_names, fa.lengths))
+
+     # Keep only chroms present in FASTA and with at least one read
+     chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+     # Order chromosomes by FASTA order
+     chrom_order = [c for c in ref_names if c in chroms]
+
+     if not chrom_order:
+         print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+         return
+
+     # Pagination
+     def _sanitize(name: str) -> str:
+         return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+     cols_per_fig = 4 if include_mapq_quality else 2
+
+     for start_idx in range(0, len(chrom_order), rows_per_fig):
+         chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+         nrows = len(chunk)
+         ncols = cols_per_fig
+
+         fig, axes = plt.subplots(
+             nrows=nrows, ncols=ncols,
+             figsize=(4.0 * ncols, 2.6 * nrows),
+             dpi=160,
+             squeeze=False
+         )
+
+         for r, chrom in enumerate(chunk):
+             chrom_len = ref_lengths[chrom]
+             mask = (df['chrom'].to_numpy() == chrom)
+
+             # Slice per-chrom arrays for speed
+             s0 = start0[mask]
+             e0 = end0[mask]
+             len_arr = df.loc[mask, 'read_len']
+             mapq_arr = df.loc[mask, 'mapq']
+             q_arr = df.loc[mask, 'avg_q']
+
+             # --- Col 1: Read length histogram (clipped) ---
+             ax = axes[r, 0]
+             ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+             if r == 0:
+                 ax.set_title("Read length")
+             ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+             ax.set_xlabel("bp")
+             ax.grid(alpha=0.25)
+
+             # --- Col 2: Coverage (binned over genome) ---
+             ax = axes[r, 1]
+             nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+             # Bin edges in 0-based coords
+             edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+             # Compute per-bin "read count coverage": number of reads overlapping each bin.
+             # Approximate by incrementing all bins touched by the interval.
+             # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
+             cov = np.zeros(nb, dtype=np.int32)
+             # bin indices overlapped by each read (0-based half-open)
+             b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+             b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+             # ensure valid ordering
+             b_lo = np.minimum(b0, b1)
+             b_hi = np.maximum(b0, b1)
+
+             # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
+             for lo, hi in zip(b_lo, b_hi):
+                 cov[lo:hi + 1] += 1
+
+             x_mid = (edges[:-1] + edges[1:]) / 2.0
+             ax.plot(x_mid, cov)
+             if r == 0:
+                 ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+             ax.set_xlim(0, chrom_len)
+             ax.set_xlabel("Position (bp)")
+             ax.set_ylabel("")  # chrom is already shown on col 1
+             ax.grid(alpha=0.25)
+
+             if include_mapq_quality:
+                 # --- Col 3: MAPQ ---
+                 ax = axes[r, 2]
+                 # Clip MAPQ upper tail if needed (usually 60)
+                 ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                 if r == 0:
+                     ax.set_title("MAPQ")
+                 ax.set_xlabel("MAPQ")
+                 ax.grid(alpha=0.25)
+
+                 # --- Col 4: Avg base quality ---
+                 ax = axes[r, 3]
+                 ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                 if r == 0:
+                     ax.set_title("Avg base qual")
+                 ax.set_xlabel("Phred")
+                 ax.grid(alpha=0.25)
+
+         fig.suptitle(
+             f"{bed_basename} — per-chromosome QC "
+             f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+             y=0.995, fontsize=11
+         )
+         fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+         page = start_idx // rows_per_fig + 1
+         out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+         plt.savefig(out_png, bbox_inches="tight")
+         plt.close(fig)
+
+     print("[plot_bed_histograms] Done.")
+
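The coordinate handling above deserves a worked example, since off-by-one errors are the usual failure mode. A one-based inclusive interval [101, 150] and its zero-based half-open counterpart [100, 150) describe the same 50 bp, which is why only the start is shifted:

    # One-based inclusive -> zero-based half-open, as in _plot_bed_histograms.
    start, end = 101, 150          # one-based, inclusive: covers bases 101..150
    start0, end0 = start - 1, end  # zero-based, half-open: [100, 150)
    assert end0 - start0 == 50     # length preserved; the end needs no shift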
+ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+     """
+     Takes an aligned BAM as input and writes a BED-like file of reads as output.
+     Columns are: record name, start position, end position, read length, read name, mapping quality, read quality.
+
+     Parameters:
+         aligned_BAM (Path): Path to an input aligned BAM to extract to a BED file.
+         out_dir (Path): Directory to output files.
+         fasta (str): File path to the reference genome.
+         make_bigwigs (bool): Whether to generate bigwig files.
+         threads (int): Number of threads to use.
+
+     Returns:
+         None
+     """
+     threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+     # Create necessary directories
+     plotting_dir = out_dir / "bed_cov_histograms"
+     bed_dir = out_dir / "beds"
+     make_dirs([plotting_dir, bed_dir])
+
+     bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)
+
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1
+
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")
+
+     def split_bed(bed):
+         """Splits into aligned and unaligned reads (chrom == '*')."""
+         bed = str(bed)
+         aligned = bed.replace(".bed", "_aligned.bed")
+         unaligned = bed.replace(".bed", "_unaligned.bed")
+         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+             for line in infile:
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+         os.remove(bed)
+         return aligned
+
+     print(f"Splitting: {bed_output}")
+     aligned_bed = split_bed(bed_output)
+
+     with ProcessPoolExecutor() as executor:
+         futures = []
+         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+         if make_bigwigs:
+             futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+         concurrent.futures.wait(futures)
+
+     print("Processing completed successfully.")
+
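A hypothetical invocation of the function above; it joins paths with the / operator and reads aligned_BAM.name, so aligned_BAM and out_dir should be pathlib.Path objects rather than plain strings:

    from pathlib import Path

    # Writes results/beds/sample_aligned_bed_aligned.bed plus QC PNGs under
    # results/bed_cov_histograms; bigwig generation is optional.
    aligned_BAM_to_bed(
        Path("sample_aligned.bam"),   # hypothetical aligned, sorted BAM
        Path("results"),
        "reference.fa",               # hypothetical reference FASTA
        make_bigwigs=False,
        threads=4,
    )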
+ def extract_read_lengths_from_bed(file_path):
+     """
+     Load a dict mapping read names to read lengths.
+
+     Parameters:
+         file_path (str): File path to a BED file.
+     Returns:
+         read_dict (dict)
+     """
+     import pandas as pd
+     columns = ['chrom', 'start', 'end', 'length', 'name']
+     df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+     read_dict = {}
+     for _, row in df.iterrows():
+         chrom = row['chrom']
+         start = row['start']
+         end = row['end']
+         name = row['name']
+         length = row['length']
+         read_dict[name] = length
+
+     return read_dict
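Note the column mismatch: extract_read_lengths_from_bed expects a five-column BED (chrom, start, end, length, name), whereas aligned_BAM_to_bed above writes seven columns (adding MAPQ and average base quality), so the two are not directly compatible. A sketch of the expected input (hypothetical contents):

    # reads.bed (tab-separated, five columns):
    # chrV    1    7500    7500    read_001
    lengths = extract_read_lengths_from_bed("reads.bed")
    print(lengths["read_001"])  # -> 7500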
smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py}
@@ -15,19 +15,19 @@ import shutil
  from pathlib import Path
  from typing import Union, Iterable, Optional

- from ... import readwrite
+ from ..readwrite import make_dirs, safe_write_h5ad
  from .binarize_converted_base_identities import binarize_converted_base_identities
- from .find_conversion_sites import find_conversion_sites
- from .count_aligned_reads import count_aligned_reads
- from .extract_base_identities import extract_base_identities
- from .make_dirs import make_dirs
- from .ohe_batching import ohe_batching
+ from .fasta_functions import find_conversion_sites
+ from .bam_functions import count_aligned_reads, extract_base_identities
+ from .ohe import ohe_batching

  if __name__ == "__main__":
      multiprocessing.set_start_method("forkserver", force=True)

- def converted_BAM_to_adata_II(converted_FASTA,
+ def converted_BAM_to_adata(converted_FASTA,
          split_dir,
+         output_dir,
+         input_already_demuxed,
          mapping_threshold,
          experiment_name,
          conversions,
@@ -35,20 +35,24 @@ def converted_BAM_to_adata_II(converted_FASTA,
          device='cpu',
          num_threads=8,
          deaminase_footprinting=False,
-         delete_intermediates=True
+         delete_intermediates=True,
+         double_barcoded_path=None,
          ):
      """
      Converts BAM files into an AnnData object by binarizing modified base identities.

      Parameters:
-         converted_FASTA (str): Path to the converted FASTA reference.
-         split_dir (str): Directory containing converted BAM files.
+         converted_FASTA (Path): Path to the converted FASTA reference.
+         split_dir (Path): Directory containing converted BAM files.
+         output_dir (Path): Directory for output files.
+         input_already_demuxed (bool): Whether the input reads were already demultiplexed.
          mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
          experiment_name (str): Name for the output AnnData object.
          conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
          bam_suffix (str): File suffix for BAM files.
          num_threads (int): Number of parallel processing threads.
          deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+         double_barcoded_path (Path): Path to the dorado demux summary file of double-ended barcodes.

      Returns:
          str: Path to the final AnnData object.
@@ -63,22 +67,25 @@ def converted_BAM_to_adata_II(converted_FASTA,
      print(f"Using device: {device}")

      ## Set Up Directories and File Paths
-     #parent_dir = os.path.dirname(split_dir)
-     h5_dir = os.path.join(split_dir, 'h5ads')
-     tmp_dir = os.path.join(split_dir, 'tmp')
+     h5_dir = output_dir / 'h5ads'
+     tmp_dir = output_dir / 'tmp'
      final_adata = None
-     final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{os.path.basename(split_dir)}.h5ad.gz')
+     final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'

-     if os.path.exists(final_adata_path):
+     if final_adata_path.exists():
          print(f"{final_adata_path} already exists. Using existing AnnData object.")
          return final_adata, final_adata_path

      make_dirs([h5_dir, tmp_dir])

-     ## Get BAM Files ##
-     bam_files = [f for f in os.listdir(split_dir) if f.endswith(bam_suffix) and not f.endswith('.bai') and 'unclassified' not in f]
-     bam_files.sort()
-     bam_path_list = [os.path.join(split_dir, f) for f in bam_files]
+     bam_files = sorted(
+         p for p in split_dir.iterdir()
+         if p.is_file()
+         and p.suffix == ".bam"
+         and "unclassified" not in p.name
+     )
+
+     bam_path_list = [split_dir / f for f in bam_files]
      print(f"Found {len(bam_files)} BAM files: {bam_files}")

      ## Process Conversion Sites
@@ -90,10 +97,12 @@
      ## Process BAMs in Parallel
      final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)

+     final_adata.uns['References'] = {}
      for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
          final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
          final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
          final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+         final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq

      final_adata.obs_names_make_unique()
      cols = final_adata.obs.columns
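The new 'References' entry consolidates the per-chromosome sequences that were previously stored only as flat uns keys. A hedged sketch of downstream access (the chromosome name is hypothetical):

    # The same sequence is now reachable via the flat key and the consolidated mapping.
    seq = final_adata.uns['References']['chrV_FASTA_sequence']  # hypothetical record name
    assert seq == final_adata.uns['chrV_FASTA_sequence']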
@@ -102,10 +111,13 @@
      for col in cols:
          final_adata.obs[col] = final_adata.obs[col].astype('category')

-     ## Save Final AnnData
-     print(f"Saving AnnData to {final_adata_path}")
-     backup_dir = os.path.join(os.path.dirname(final_adata_path), 'adata_accessory_data')
-     readwrite.safe_write_h5ad(final_adata, final_adata_path, compression='gzip', backup=True, backup_dir=backup_dir)
+     if input_already_demuxed:
+         final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+         final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+     else:
+         from .h5ad_functions import add_demux_type_annotation
+         double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+         add_demux_type_annotation(final_adata, double_barcoded_reads)

      ## Delete intermediate h5ad files and temp directories
      if delete_intermediates:
@@ -211,7 +223,7 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, ch
      adata_list = []

      for record in records_to_analyze:
-         sample = os.path.basename(bam).split(sep=".bam")[0]
+         sample = bam.stem
          chromosome = record_FASTA_dict[record][2]
          current_length = record_FASTA_dict[record][4]
          mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
@@ -329,13 +341,13 @@ def timestamp():
  def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
      """Worker function that processes a single BAM and writes the output to an H5AD file."""
      worker_id = current_process().pid  # Get worker process ID
-     sample = os.path.basename(bam).split(sep=".bam")[0]
+     sample = bam.stem

      try:
          print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")

-         h5ad_path = os.path.join(h5_dir, f"{sample}.h5ad")
-         if os.path.exists(h5ad_path):
+         h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+         if h5ad_path.exists():
              print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
              progress_queue.put(sample)
              return
@@ -352,7 +364,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
          adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)

          if adata is not None:
-             adata.write_h5ad(h5ad_path)
+             adata.write_h5ad(str(h5ad_path))
              print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")

          # Free memory
@@ -367,7 +379,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict

  def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
      """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-     os.makedirs(h5_dir, exist_ok=True)  # Ensure h5_dir exists
+     make_dirs(h5_dir)  # Ensure h5_dir exists

      print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")

@@ -403,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
      pool.join()  # Ensure all workers finish

      # Final Concatenation Step
-     h5ad_files = [os.path.join(h5_dir, f) for f in os.listdir(h5_dir) if f.endswith(".h5ad")]
+     h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]

      if not h5ad_files:
          print(f"{timestamp()} No valid H5AD files generated. Exiting.")
409
421
  print(f"{timestamp()} No valid H5AD files generated. Exiting.")