smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/basecalling.py
@@ -0,0 +1,67 @@
+ import subprocess
+ from pathlib import Path
+
+ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado canonical base calling.
+
+     Parameters:
+         model_dir (str): File path to the dorado basecalling model directory.
+         model (str): The dorado basecalling model.
+         pod5_dir (str): File path to the experiment directory containing the POD5 files.
+         barcode_kit (str): The barcoding kit used in the experiment.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): The device to use. 'auto' (default) detects the device; 'metal', 'cpu', or 'cuda' can also be specified.
+
+     Returns:
+         None
+             Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+     """
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     command_string = " ".join(command)
+     print(f"Running {command_string}\n to generate {output}")
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
+
+ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+     """
+     Wrapper function for dorado modified base calling.
+
+     Parameters:
+         model_dir (str): File path to the dorado basecalling model directory.
+         model (str): The dorado basecalling model.
+         pod5_dir (str): File path to the experiment directory containing the POD5 files.
+         barcode_kit (str): The barcoding kit used in the experiment.
+         mod_list (list): A list of modification types to use in the analysis.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): Device to use for basecalling: 'auto', 'metal', 'cpu', or 'cuda'.
+
+     Returns:
+         None
+             Outputs a BAM file holding the modified base calls output by the dorado basecaller.
+     """
+     import subprocess
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+     command += mod_list
+     command += ["--device", device, "--batchsize", "0"]
+     if barcode_both_ends:
+         command.append("--barcode-both-ends")
+     if not trim:
+         command.append("--no-trim")
+     command += [model, pod5_dir]
+     print(f'Running: {" ".join(command)}')
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
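For orientation, a minimal usage sketch of these wrappers (all paths, model, and kit names below are hypothetical; it assumes dorado is on the PATH and that this +67-line hunk is smftools/informatics/basecalling.py, the only new file of that size in the list above):

    from smftools.informatics.basecalling import canoncall

    # Writes /data/run1/calls.bam containing dorado's canonical base calls.
    canoncall(
        model_dir="/models",                          # hypothetical model directory
        model="dna_r10.4.1_e8.2_400bps_hac@v4.2.0",   # example dorado model name
        pod5_dir="/data/run1/pod5",
        barcode_kit="SQK-NBD114-24",                  # example barcoding kit
        bam="/data/run1/calls",
        bam_suffix=".bam",
        device="auto",
    )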
smftools/informatics/bed_functions.py
@@ -0,0 +1,366 @@
+ from pathlib import Path
+ import os
+ import subprocess
+ from typing import List, Optional, Union
+ import pysam
+ import pybedtools
+ import pyBigWig
+
+ import numpy as np
+ import pandas as pd
+ import concurrent.futures
+ from concurrent.futures import ProcessPoolExecutor
+
+ import matplotlib.pyplot as plt
+
+ from ..readwrite import make_dirs
+
+ def _bed_to_bigwig(fasta: str, bed: str) -> str:
+     """
+     BED → bedGraph → bigWig
+     Requires:
+         - FASTA must have a .fai index present
+     """
+
+     bed = Path(bed)
+     fa = Path(fasta)  # path to .fa
+     parent = bed.parent
+     stem = bed.stem
+     fa_stem = fa.stem
+     fai = parent / f"{fa_stem}.fai"
+
+     bedgraph = parent / f"{stem}.bedgraph"
+     bigwig = parent / f"{stem}.bw"
+
+     # 1) Compute coverage → bedGraph
+     print(f"[pybedtools] generating coverage bedgraph from {bed}")
+     bt = pybedtools.BedTool(str(bed))
+     # bedtools genomecov -bg
+     coverage = bt.genome_coverage(bg=True, genome=str(fai))
+     coverage.saveas(str(bedgraph))
+
+     # 2) Convert bedGraph → BigWig via pyBigWig
+     print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+     # read chrom sizes from the FASTA .fai index
+     chrom_sizes = {}
+     with open(fai) as f:
+         for line in f:
+             fields = line.strip().split("\t")
+             chrom = fields[0]
+             size = int(fields[1])
+             chrom_sizes[chrom] = size
+
+     bw = pyBigWig.open(str(bigwig), "w")
+     bw.addHeader(list(chrom_sizes.items()))
+
+     with open(bedgraph) as f:
+         for line in f:
+             chrom, start, end, coverage = line.strip().split()
+             bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+     bw.close()
+
+     print(f"BigWig written: {bigwig}")
+     return str(bigwig)
+
+ def _plot_bed_histograms(
68
+ bed_file,
69
+ plotting_directory,
70
+ fasta,
71
+ *,
72
+ bins=60,
73
+ clip_quantiles=(0.0, 0.995),
74
+ cov_bin_size=1000, # coverage bin size in bp
75
+ rows_per_fig=6, # paginate if many chromosomes
76
+ include_mapq_quality=True, # add MAPQ + avg read quality columns to grid
77
+ coordinate_mode="one_based", # "one_based" (your BED-like) or "zero_based"
78
+ ):
79
+ """
80
+ Plot per-chromosome QC grids from a BED-like file.
81
+
82
+ Expects columns:
83
+ chrom, start, end, read_len, qname, mapq, avg_base_qual
84
+
85
+ For each chromosome:
86
+ - Column 1: Read length histogram
87
+ - Column 2: Coverage across the chromosome (binned)
88
+ - (optional) Column 3: MAPQ histogram
89
+ - (optional) Column 4: Avg base quality histogram
90
+
91
+ The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
92
+ Saves one PNG per page under `plotting_directory`.
93
+
94
+ Parameters
95
+ ----------
96
+ bed_file : str
97
+ plotting_directory : str
98
+ fasta : str
99
+ Reference FASTA (used to get chromosome lengths).
100
+ bins : int
101
+ Histogram bins for read length / MAPQ / quality.
102
+ clip_quantiles : (float, float)
103
+ Clip hist tails for readability (e.g., (0, 0.995)).
104
+ cov_bin_size : int
105
+ Bin size (bp) for coverage plot; bigger = faster/coarser.
106
+ rows_per_fig : int
107
+ Number of chromosomes per page.
108
+ include_mapq_quality : bool
109
+ If True, add MAPQ and avg base quality histograms as extra columns.
110
+ coordinate_mode : {"one_based","zero_based"}
111
+ One-based, inclusive (your file) vs BED-standard zero-based, half-open.
112
+ """
113
+ os.makedirs(plotting_directory, exist_ok=True)
114
+
115
+ bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
116
+ print(f"[plot_bed_histograms] Loading: {bed_file}")
117
+
118
+ # Load BED-like table
119
+ cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
120
+ df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
121
+ 'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
122
+ 'mapq': float, 'avg_q': float
123
+ })
124
+
125
+ # Drop unaligned records (chrom == '*') if present
126
+ df = df[df['chrom'] != '*'].copy()
127
+ if df.empty:
128
+ print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
129
+ return
130
+
131
+ # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
132
+ # Input is typically one_based inclusive (from your writer).
133
+ if coordinate_mode not in {"one_based", "zero_based"}:
134
+ raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
135
+
136
+ if coordinate_mode == "one_based":
137
+ # convert to 0-based half-open [start0, end0)
138
+ start0 = df['start'].to_numpy() - 1
139
+ end0 = df['end'].to_numpy() # inclusive in input -> +1 already handled by not subtracting
140
+ else:
141
+ # already 0-based half-open (assumption)
142
+ start0 = df['start'].to_numpy()
143
+ end0 = df['end'].to_numpy()
144
+
145
+ # Clip helper for hist tails
146
+ def _clip_series(s, q=(0.0, 0.995)):
147
+ if q is None:
148
+ return s.to_numpy()
149
+ lo = s.quantile(q[0]) if q[0] is not None else s.min()
150
+ hi = s.quantile(q[1]) if q[1] is not None else s.max()
151
+ x = s.to_numpy(dtype=float)
152
+ return np.clip(x, lo, hi)
153
+
154
+ # Load chromosome order/lengths from FASTA
155
+ with pysam.FastaFile(fasta) as fa:
156
+ ref_names = list(fa.references)
157
+ ref_lengths = dict(zip(ref_names, fa.lengths))
158
+
159
+ # Keep only chroms present in FASTA and with at least one read
160
+ chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
161
+ # Order chromosomes by FASTA order
162
+ chrom_order = [c for c in ref_names if c in chroms]
163
+
164
+ if not chrom_order:
165
+ print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
166
+ return
167
+
168
+ # Pagination
169
+ def _sanitize(name: str) -> str:
170
+ return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
171
+
172
+ cols_per_fig = 4 if include_mapq_quality else 2
173
+
174
+ for start_idx in range(0, len(chrom_order), rows_per_fig):
175
+ chunk = chrom_order[start_idx:start_idx + rows_per_fig]
176
+ nrows = len(chunk)
177
+ ncols = cols_per_fig
178
+
179
+ fig, axes = plt.subplots(
180
+ nrows=nrows, ncols=ncols,
181
+ figsize=(4.0 * ncols, 2.6 * nrows),
182
+ dpi=160,
183
+ squeeze=False
184
+ )
185
+
186
+ for r, chrom in enumerate(chunk):
187
+ chrom_len = ref_lengths[chrom]
188
+ mask = (df['chrom'].to_numpy() == chrom)
189
+
190
+ # Slice per-chrom arrays for speed
191
+ s0 = start0[mask]
192
+ e0 = end0[mask]
193
+ len_arr = df.loc[mask, 'read_len']
194
+ mapq_arr = df.loc[mask, 'mapq']
195
+ q_arr = df.loc[mask, 'avg_q']
196
+
197
+ # --- Col 1: Read length histogram (clipped) ---
198
+ ax = axes[r, 0]
199
+ ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
200
+ if r == 0:
201
+ ax.set_title("Read length")
202
+ ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
203
+ ax.set_xlabel("bp")
204
+ ax.grid(alpha=0.25)
205
+
206
+ # --- Col 2: Coverage (binned over genome) ---
207
+ ax = axes[r, 1]
208
+ nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
209
+ # Bin edges in 0-based coords
210
+ edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
211
+
212
+ # Compute per-bin "read count coverage": number of reads overlapping each bin.
213
+ # Approximate by incrementing all bins touched by the interval.
214
+ # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
215
+ cov = np.zeros(nb, dtype=np.int32)
216
+ # bin indices overlapped by each read (0-based half-open)
217
+ b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
218
+ b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
219
+ # ensure valid ordering
220
+ b_lo = np.minimum(b0, b1)
221
+ b_hi = np.maximum(b0, b1)
222
+
223
+ # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
224
+ for lo, hi in zip(b_lo, b_hi):
225
+ cov[lo:hi + 1] += 1
226
+
227
+ x_mid = (edges[:-1] + edges[1:]) / 2.0
228
+ ax.plot(x_mid, cov)
229
+ if r == 0:
230
+ ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
231
+ ax.set_xlim(0, chrom_len)
232
+ ax.set_xlabel("Position (bp)")
233
+ ax.set_ylabel("") # already show chrom on col 1
234
+ ax.grid(alpha=0.25)
235
+
236
+ if include_mapq_quality:
237
+ # --- Col 3: MAPQ ---
238
+ ax = axes[r, 2]
239
+ # Clip MAPQ upper tail if needed (usually 60)
240
+ ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
241
+ if r == 0:
242
+ ax.set_title("MAPQ")
243
+ ax.set_xlabel("MAPQ")
244
+ ax.grid(alpha=0.25)
245
+
246
+ # --- Col 4: Avg base quality ---
247
+ ax = axes[r, 3]
248
+ ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
249
+ if r == 0:
250
+ ax.set_title("Avg base qual")
251
+ ax.set_xlabel("Phred")
252
+ ax.grid(alpha=0.25)
253
+
254
+ fig.suptitle(
255
+ f"{bed_basename} — per-chromosome QC "
256
+ f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
257
+ y=0.995, fontsize=11
258
+ )
259
+ fig.tight_layout(rect=[0, 0, 1, 0.98])
260
+
261
+ page = start_idx // rows_per_fig + 1
262
+ out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
263
+ plt.savefig(out_png, bbox_inches="tight")
264
+ plt.close(fig)
265
+
266
+ print("[plot_bed_histograms] Done.")
267
+
+ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+     """
+     Takes an aligned BAM as input and writes a BED-like file of reads as output.
+     Columns are: record name, start position, end position, read length, read name, mapping quality, average read base quality.
+
+     Parameters:
+         aligned_BAM (str): Path to an input aligned BAM to extract to a BED file.
+         out_dir (str): Directory to output files.
+         fasta (str): File path to the reference genome.
+         make_bigwigs (bool): Whether to generate bigwig files.
+         threads (int): Number of threads to use.
+
+     Returns:
+         None
+     """
+     threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+     # Create necessary directories
+     plotting_dir = out_dir / "bed_cov_histograms"
+     bed_dir = out_dir / "beds"
+     make_dirs([plotting_dir, bed_dir])
+
+     bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for a 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)
+
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1
+
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")
+
+     def split_bed(bed):
+         """Splits into aligned and unaligned reads (chrom == '*')."""
+         bed = str(bed)
+         aligned = bed.replace(".bed", "_aligned.bed")
+         unaligned = bed.replace(".bed", "_unaligned.bed")
+         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+             for line in infile:
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+         os.remove(bed)
+         return aligned
+
+     print(f"Splitting: {bed_output}")
+     aligned_bed = split_bed(bed_output)
+
+     with ProcessPoolExecutor() as executor:
+         futures = []
+         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+         if make_bigwigs:
+             futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+         concurrent.futures.wait(futures)
+
+     print("Processing completed successfully.")
+
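A usage sketch for this public entry point (hypothetical paths; aligned_BAM and out_dir are used as Path-like objects in the function body, so pathlib.Path is the safe choice, and the import path is assumed from the file list above):

    from pathlib import Path
    from smftools.informatics.bed_functions import aligned_BAM_to_bed  # assumed import path

    aligned_BAM_to_bed(
        aligned_BAM=Path("/data/run1/aligned/sample.bam"),
        out_dir=Path("/data/run1/qc"),
        fasta="/refs/genome.fa",
        make_bigwigs=False,
        threads=4,
    )
    # Expected outputs: /data/run1/qc/beds/sample_bed_aligned.bed plus QC histogram
    # pages under /data/run1/qc/bed_cov_histograms/.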
+ def extract_read_lengths_from_bed(file_path):
+     """
+     Load a dict mapping read names to read lengths.
+
+     Parameters:
+         file_path (str): File path to a BED file.
+     Returns:
+         read_dict (dict)
+     """
+     columns = ['chrom', 'start', 'end', 'length', 'name']
+     df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+     read_dict = {}
+     for _, row in df.iterrows():
+         read_dict[row['name']] = row['length']
+
+     return read_dict
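And a sketch for this read-length helper (hypothetical path; it expects a five-column, tab-separated file of chrom, start, end, length, name):

    from smftools.informatics.bed_functions import extract_read_lengths_from_bed  # assumed import path

    read_lengths = extract_read_lengths_from_bed("/out/beds/reads.bed")
    longest = max(read_lengths.values())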
smftools/informatics/binarize_converted_base_identities.py
@@ -0,0 +1,172 @@
+ def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read=None, on_missing="nan"):
+     """
+     Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
+
+     Parameters:
+         base_identities (dict): A dictionary returned by extract_base_identities, keyed by read name. Each value is a list of base identities.
+         strand (str): Which strand was converted in the experiment ('top' or 'bottom').
+         modification_type (str): The modification type of interest ('5mC' or '6mA').
+         bam (str): The BAM file path.
+         deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
+         mismatch_trend_per_read (dict): For deaminase footprinting, the conversion type of each read relative to the top-strand reference ('C->T', or 'G->A' if the bottom strand was converted).
+         on_missing (str): How to handle a read with a missing or invalid trend: 'nan' (default) or 'error'.
+
+     Returns:
+         dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
+             If deaminase_footprinting, 1 represents deaminated sites and 0 represents non-deaminated sites.
+     """
+     import numpy as np
+
+     if mismatch_trend_per_read is None:
+         mismatch_trend_per_read = {}
+
+     # Fast path
+     if modification_type == "unconverted" and not deaminase_footprinting:
+         return {k: np.full(len(v), np.nan, dtype=np.float32) for k, v in base_identities.items()}
+
+     out = {}
+
+     if deaminase_footprinting:
+         valid_trends = {"C->T", "G->A"}
+
+         for read_id, bases in base_identities.items():
+             trend_raw = mismatch_trend_per_read.get(read_id, None)
+             if trend_raw is None:
+                 if on_missing == "error":
+                     raise KeyError(f"Missing mismatch trend for read '{read_id}'")
+                 out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                 continue
+
+             trend = trend_raw.replace(" ", "").upper()
+             if trend not in valid_trends:
+                 if on_missing == "error":
+                     raise KeyError(
+                         f"Invalid mismatch trend '{trend_raw}' for read '{read_id}'. "
+                         f"Expected one of {sorted(valid_trends)}"
+                     )
+                 out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
+                 continue
+
+             arr = np.asarray(bases, dtype="<U1")
+             res = np.full(arr.shape, np.nan, dtype=np.float32)
+
+             if trend == "C->T":
+                 # C (unconverted) -> 0, T (converted) -> 1
+                 res[arr == "C"] = 0.0
+                 res[arr == "T"] = 1.0
+             else:  # "G->A"
+                 res[arr == "G"] = 0.0
+                 res[arr == "A"] = 1.0
+
+             out[read_id] = res
+
+         return out
+
+     # Non-deaminase mapping (bisulfite-style for 5mC; the 6mA mapping is protocol dependent)
+     bin_maps = {
+         ("top", "5mC"): {"C": 1.0, "T": 0.0},
+         ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
+         ("top", "6mA"): {"A": 1.0, "G": 0.0},
+         ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
+     }
+     key = (strand, modification_type)
+     if key not in bin_maps:
+         raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+     base_map = bin_maps[key]
+
+     for read_id, bases in base_identities.items():
+         arr = np.asarray(bases, dtype="<U1")
+         res = np.full(arr.shape, np.nan, dtype=np.float32)
+         # mask-assign; unknown characters (N, -, etc.) remain NaN
+         for b, v in base_map.items():
+             res[arr == b] = v
+         out[read_id] = res
+
+     return out
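A minimal input/output sketch for the implementation above (read name and bases are hypothetical; the import path is assumed from the file list):

    import numpy as np
    from smftools.informatics.binarize_converted_base_identities import binarize_converted_base_identities  # assumed import path

    base_identities = {"read_1": ["C", "T", "A", "N"]}
    binarized = binarize_converted_base_identities(
        base_identities, strand="top", modification_type="5mC", bam="sample.bam"
    )
    # {'read_1': array([ 1.,  0., nan, nan], dtype=float32)}
    # For ('top', '5mC'): C (protected from conversion) -> 1, T (converted) -> 0, anything else -> NaN.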