smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
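The move entries above amount to a package reorganization: HMM utilities leave smftools.tools for a new smftools.hmm subpackage, the model/training stack moves from smftools.tools to smftools.machine_learning, and the old informatics/helpers tree is retired under informatics/archived. A hypothetical before/after import sketch — the module paths follow the moves listed above, but the specific symbol names are assumptions, not confirmed exports:

# smftools 0.1.7 (hypothetical imports under the old layout)
# from smftools.tools.train_hmm import train_hmm
# from smftools.tools.models.cnn import ...

# smftools 0.2.3 (the same modules at their new paths)
# from smftools.hmm.train_hmm import train_hmm
# from smftools.machine_learning.models.cnn import ...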
smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py
@@ -0,0 +1,87 @@
+ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+     """
+     Takes an aligned BAM as input and writes a BED file of reads as output.
+     BED columns are: record name, start position, end position, read length, read name, mapping quality, read quality.
+
+     Parameters:
+         aligned_BAM (Path): Path to an input aligned BAM to extract to a BED file.
+         out_dir (Path): Directory to output files.
+         fasta (str): File path to the reference genome.
+         make_bigwigs (bool): Whether to generate bigwig files.
+         threads (int): Number of threads to use.
+
+     Returns:
+         None
+     """
+     import subprocess
+     import os
+     from pathlib import Path
+     import pysam
+     import numpy as np
+     import concurrent.futures
+     from concurrent.futures import ProcessPoolExecutor
+     from .bed_to_bigwig import bed_to_bigwig
+     from ...readwrite import make_dirs
+     from .plot_bed_histograms import plot_bed_histograms
+
+     threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+     # Create necessary directories
+     plotting_dir = out_dir / "bed_cov_histograms"
+     bed_dir = out_dir / "beds"
+     make_dirs([plotting_dir, bed_dir])
+
+     bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)
+
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1
+
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")
+
+     def split_bed(bed):
+         """Splits into aligned and unaligned reads (chrom == '*')."""
+         bed = str(bed)
+         aligned = bed.replace(".bed", "_aligned.bed")
+         unaligned = bed.replace(".bed", "_unaligned.bed")
+         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+             for line in infile:
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
+         os.remove(bed)
+         return aligned
+
+     print(f"Splitting: {bed_output}")
+     aligned_bed = split_bed(bed_output)
+
+     with ProcessPoolExecutor() as executor:
+         futures = []
+         futures.append(executor.submit(plot_bed_histograms, aligned_bed, plotting_dir, fasta))
+         if make_bigwigs:
+             futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
+         concurrent.futures.wait(futures)
+
+     print("Processing completed successfully.")
smftools/informatics/archived/helpers/archived/bam_qc.py
@@ -0,0 +1,213 @@
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Iterable, Optional, Tuple, List
+
+ def bam_qc(
+     bam_files: Iterable[str | Path],
+     bam_qc_dir: str | Path,
+     threads: Optional[int],
+     modality: str,
+     stats: bool = True,
+     flagstats: bool = True,
+     idxstats: bool = True,
+ ) -> None:
+     """
+     QC for BAM/CRAMs: stats, flagstat, idxstats.
+     Prefers pysam; falls back to `samtools` if needed.
+     Runs BAMs in parallel (up to `threads`, default serial).
+     """
+     import subprocess
+     import shutil
+
+     # Try to import pysam once
+     try:
+         import pysam
+         HAVE_PYSAM = True
+     except Exception:
+         HAVE_PYSAM = False
+
+     bam_qc_dir = Path(bam_qc_dir)
+     bam_qc_dir.mkdir(parents=True, exist_ok=True)
+
+     bam_files = [Path(b) for b in bam_files]
+
+     def _has_index(p: Path) -> bool:
+         if p.suffix.lower() == ".bam":
+             bai = p.with_suffix(p.suffix + ".bai")
+             bai_alt = Path(str(p) + ".bai")
+             return bai.exists() or bai_alt.exists()
+         if p.suffix.lower() == ".cram":
+             crai = Path(str(p) + ".crai")
+             return crai.exists()
+         return False
+
+     def _ensure_index(p: Path) -> None:
+         if _has_index(p):
+             return
+         if HAVE_PYSAM:
+             # pysam.index supports both BAM & CRAM
+             pysam.index(str(p))
+         else:
+             cmd = ["samtools", "index", str(p)]
+             subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
+         # outputs + return (file, [(task_name, returncode)])
+         results: List[Tuple[str, int]] = []
+         base = bam.stem  # filename without .bam
+         out_stats = bam_qc_dir / f"{base}_stats.txt"
+         out_flag = bam_qc_dir / f"{base}_flagstat.txt"
+         out_idx = bam_qc_dir / f"{base}_idxstats.txt"
+
+         # Make sure index exists (samtools stats/flagstat don't require one, idxstats does)
+         try:
+             _ensure_index(bam)
+         except Exception as e:
+             # Still attempt stats/flagstat if requested
+             print(f"[warn] Indexing failed for {bam}: {e}")
+
+         # Choose runner per task
+         def run_stats():
+             if not stats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "stats"):
+                 txt = pysam.stats(str(bam))
+                 out_stats.write_text(txt)
+                 results.append(("stats(pysam)", 0))
+             else:
+                 cmd = ["samtools", "stats", str(bam)]
+                 with open(out_stats, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("stats(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         def run_flagstat():
+             if not flagstats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "flagstat"):
+                 txt = pysam.flagstat(str(bam))
+                 out_flag.write_text(txt)
+                 results.append(("flagstat(pysam)", 0))
+             else:
+                 cmd = ["samtools", "flagstat", str(bam)]
+                 with open(out_flag, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("flagstat(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         def run_idxstats():
+             if not idxstats:
+                 return
+             if HAVE_PYSAM and hasattr(pysam, "idxstats"):
+                 txt = pysam.idxstats(str(bam))
+                 out_idx.write_text(txt)
+                 results.append(("idxstats(pysam)", 0))
+             else:
+                 cmd = ["samtools", "idxstats", str(bam)]
+                 with open(out_idx, "w") as fh:
+                     cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
+                 results.append(("idxstats(samtools)", cp.returncode))
+                 if cp.returncode != 0:
+                     raise RuntimeError(cp.stderr.decode(errors="replace"))
+
+         # Sanity: ensure samtools exists if pysam missing
+         if not HAVE_PYSAM:
+             if not shutil.which("samtools"):
+                 raise RuntimeError("Neither pysam nor samtools is available in PATH.")
+
+         # Execute tasks (serial per file; parallelized across files)
+         run_stats()
+         run_flagstat()
+         run_idxstats()
+         return bam, results
+
+     # Parallel across BAMs
+     max_workers = int(threads) if threads and int(threads) > 0 else 1
+     futures = []
+     with ThreadPoolExecutor(max_workers=max_workers) as ex:
+         for b in bam_files:
+             futures.append(ex.submit(_run_one, b))
+
+         for fut in as_completed(futures):
+             try:
+                 bam, res = fut.result()
+                 summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
+                 print(f"[qc] {bam.name}: {summary}")
+             except Exception as e:
+                 print(f"[error] QC failed: {e}")
+
+     # Placeholders to keep your signature stable
+     if modality not in {"conversion", "direct"}:
+         print(f"[warn] Unknown modality '{modality}', continuing.")
+
+     print("QC processing completed.")
+
+ # def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+ #     """
+ #     Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+ #     Parameters:
+ #         - bam_files: List of BAM file paths.
+ #         - bam_qc_dir: Directory to save QC reports.
+ #         - threads: Number of threads to use.
+ #         - modality: 'conversion' or 'direct' (affects processing mode).
+ #         - stats: Run `samtools stats` if True.
+ #         - flagstats: Run `samtools flagstat` if True.
+ #         - idxstats: Run `samtools idxstats` if True.
+ #     """
+ #     import os
+ #     import subprocess
+
+ #     # Ensure the QC output directory exists
+ #     os.makedirs(bam_qc_dir, exist_ok=True)
+
+ #     if threads:
+ #         threads = str(threads)
+ #     else:
+ #         pass
+
+ #     for bam in bam_files:
+ #         bam_name = os.path.basename(bam).replace(".bam", "")  # Extract filename without extension
+
+ #         # Run samtools QC commands based on selected options
+ #         if stats:
+ #             stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+ #             if threads:
+ #                 command = ["samtools", "stats", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "stats", bam]
+ #             print(f"Running: {' '.join(command)} > {stats_out}")
+ #             with open(stats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #         if flagstats:
+ #             flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+ #             if threads:
+ #                 command = ["samtools", "flagstat", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "flagstat", bam]
+ #             print(f"Running: {' '.join(command)} > {flagstats_out}")
+ #             with open(flagstats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #         if idxstats:
+ #             idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+ #             if threads:
+ #                 command = ["samtools", "idxstats", "-@", threads, bam]
+ #             else:
+ #                 command = ["samtools", "idxstats", bam]
+ #             print(f"Running: {' '.join(command)} > {idxstats_out}")
+ #             with open(idxstats_out, "w") as out_file:
+ #                 subprocess.run(command, stdout=out_file)
+
+ #     if modality == 'conversion':
+ #         pass
+ #     elif modality == 'direct':
+ #         pass
+
+ #     print("QC processing completed.")
smftools/informatics/archived/helpers/archived/bed_to_bigwig.py
@@ -0,0 +1,90 @@
+ from pathlib import Path
+ import pybedtools
+ import pyBigWig
+
+ def bed_to_bigwig(fasta: str, bed: str) -> str:
+     """
+     BED → bedGraph → bigWig
+     Requires:
+       - FASTA must have a .fai index present (samtools faidx)
+     """
+
+     bed = Path(bed)
+     fa = Path(fasta)  # path to .fa
+     parent = bed.parent
+     stem = bed.stem
+     # samtools faidx writes the index as <fasta>.fai alongside the FASTA
+     fai = fa.parent / f"{fa.name}.fai"
+
+     bedgraph = parent / f"{stem}.bedgraph"
+     bigwig = parent / f"{stem}.bw"
+
+     # 1) Compute coverage → bedGraph
+     print(f"[pybedtools] generating coverage bedgraph from {bed}")
+     bt = pybedtools.BedTool(str(bed))
+     # bedtools genomecov -bg -g <fai>; the .fai doubles as a chrom-sizes file
+     coverage = bt.genome_coverage(bg=True, g=str(fai))
+     coverage.saveas(str(bedgraph))
+
+     # 2) Convert bedGraph → BigWig via pyBigWig
+     print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+
+     # read chrom sizes from the FASTA .fai index
+     chrom_sizes = {}
+     with open(fai) as f:
+         for line in f:
+             fields = line.strip().split("\t")
+             chrom = fields[0]
+             size = int(fields[1])
+             chrom_sizes[chrom] = size
+
+     bw = pyBigWig.open(str(bigwig), "w")
+     bw.addHeader(list(chrom_sizes.items()))
+
+     with open(bedgraph) as f:
+         for line in f:
+             chrom, start, end, cov = line.strip().split()
+             # pyBigWig's interval form expects list arguments
+             bw.addEntries([chrom], [int(start)], ends=[int(end)], values=[float(cov)])
+
+     bw.close()
+
+     print(f"BigWig written: {bigwig}")
+     return str(bigwig)
+
+ # def bed_to_bigwig(fasta, bed):
+ #     """
+ #     Takes a bed file of reads and makes a bedgraph plus a bigwig.
+
+ #     Parameters:
+ #         fasta (str): File path to the reference genome to align to.
+ #         bed (str): File path to the input bed.
+ #     Returns:
+ #         None
+ #     """
+ #     import os
+ #     import subprocess
+
+ #     bed_basename = os.path.basename(bed)
+ #     parent_dir = os.path.dirname(bed)
+ #     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
+ #     fasta_basename = os.path.basename(fasta)
+ #     fasta_dir = os.path.dirname(fasta)
+ #     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
+ #     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
+ #     chrom_path = os.path.join(fasta_dir, chrom_basename)
+ #     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
+ #     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
+ #     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
+ #     bigwig_output = os.path.join(parent_dir, bigwig_basename)
+
+ #     # Make the bedgraph
+ #     with open(bedgraph_output, 'w') as outfile:
+ #         # Command as a list
+ #         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
+ #         print(f'Making bedgraph from {bed_basename}')
+ #         subprocess.run(command, stdout=outfile)
+
+ #     # Make the bigwig
+ #     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
+ #     print(f'Making bigwig from {bedgraph_basename}')
+ #     subprocess.run(command)
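A usage sketch for bed_to_bigwig, assuming the reference has been indexed first so that genome.fa.fai exists; paths are hypothetical:

import pysam

pysam.faidx("ref/genome.fa")  # creates ref/genome.fa.fai if missing
bw_path = bed_to_bigwig("ref/genome.fa", "results/beds/sample1_bed_aligned.bed")
# produces sample1_bed_aligned.bedgraph and sample1_bed_aligned.bw next to the BED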
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py
@@ -0,0 +1,259 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Dict, List, Any, Tuple, Union, Optional
+ import re
+ from itertools import zip_longest
+
+ import pysam
+ from tqdm import tqdm
+
+
+ def concatenate_fastqs_to_bam(
+     fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
+     output_bam: Union[str, Path],
+     barcode_tag: str = "BC",
+     barcode_map: Optional[Dict[Union[str, Path], str]] = None,
+     add_read_group: bool = True,
+     rg_sample_field: Optional[str] = None,
+     progress: bool = True,
+     auto_pair: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
+
+     Parameters
+     ----------
+     fastq_files : list[Path|str] or list[(Path|str, Path|str)]
+         Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
+     output_bam : Path|str
+         Output BAM path (parent directory will be created).
+     barcode_tag : str
+         SAM tag used to store barcode on each read (default 'BC').
+     barcode_map : dict or None
+         Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
+     add_read_group : bool
+         If True, add @RG header lines (ID = barcode) and set each read's RG tag.
+     rg_sample_field : str or None
+         If set, include SM=<value> in @RG.
+     progress : bool
+         Show tqdm progress bars.
+     auto_pair : bool
+         Auto-pair R1/R2 based on filename patterns if given a flat list.
+
+     Returns
+     -------
+     dict
+         {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
+     """
+
+     # ---------- helpers (Pathlib-only) ----------
+     def _strip_fastq_ext(p: Path) -> str:
+         """Remove common FASTQ multi-suffixes; return stem-like name."""
+         name = p.name
+         lowers = name.lower()
+         for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+             if lowers.endswith(ext):
+                 return name[: -len(ext)]
+         return p.stem  # fallback: remove last suffix only
+
+     def _extract_barcode_from_filename(p: Path) -> str:
+         stem = _strip_fastq_ext(p)
+         if "_" in stem:
+             token = stem.split("_")[-1]
+             if token:
+                 return token
+         return stem
+
+     def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+         # return (prefix, readnum) if matches; else (None, None)
+         patterns = [
+             r"(?i)(.*?)[._-]r?([12])$",         # prefix_R1 / prefix.r2 / prefix-1
+             r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
+         ]
+         for pat in patterns:
+             m = re.match(pat, stem)
+             if m:
+                 return m.group(1), int(m.group(2))
+         return None, None
+
+     def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+         pref_map: Dict[str, Dict[int, Path]] = {}
+         unpaired: List[Path] = []
+         for pth in paths:
+             stem = _strip_fastq_ext(pth)
+             pref, num = _classify_read_token(stem)
+             if pref is None:
+                 unpaired.append(pth)
+             else:
+                 entry = pref_map.setdefault(pref, {})
+                 entry[num] = pth
+         pairs: List[Tuple[Path, Path]] = []
+         leftovers: List[Path] = []
+         for d in pref_map.values():
+             if 1 in d and 2 in d:
+                 pairs.append((d[1], d[2]))
+             else:
+                 leftovers.extend(d.values())
+         leftovers.extend(unpaired)
+         return pairs, leftovers
+
+     def _fastq_iter(p: Path):
+         # pysam.FastxFile handles compressed extensions transparently
+         with pysam.FastxFile(str(p)) as fx:
+             for rec in fx:
+                 yield rec  # rec.name, rec.sequence, rec.quality
+
+     def _make_unaligned_segment(
+         name: str,
+         seq: str,
+         qual: Optional[str],
+         bc: str,
+         read1: bool,
+         read2: bool,
+     ) -> pysam.AlignedSegment:
+         a = pysam.AlignedSegment()
+         a.query_name = name
+         a.query_sequence = seq
+         if qual is not None:
+             a.query_qualities = pysam.qualitystring_to_array(qual)
+         a.is_unmapped = True
+         a.is_paired = read1 or read2
+         a.is_read1 = read1
+         a.is_read2 = read2
+         a.mate_is_unmapped = a.is_paired
+         a.reference_id = -1
+         a.reference_start = -1
+         a.next_reference_id = -1
+         a.next_reference_start = -1
+         a.template_length = 0
+         a.set_tag(barcode_tag, str(bc), value_type="Z")
+         if add_read_group:
+             a.set_tag("RG", str(bc), value_type="Z")
+         return a
+
+     # ---------- normalize inputs to Path ----------
+     def _to_path_pair(x) -> Tuple[Path, Path]:
+         a, b = x
+         return Path(a), Path(b)
+
+     explicit_pairs: List[Tuple[Path, Path]] = []
+     singles: List[Path] = []
+
+     if not isinstance(fastq_files, (list, tuple)):
+         raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+     if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+         explicit_pairs = [_to_path_pair(x) for x in fastq_files]
+     else:
+         flat_paths = [Path(x) for x in fastq_files if x is not None]
+         if auto_pair:
+             explicit_pairs, leftovers = _pair_by_filename(flat_paths)
+             singles = leftovers
+         else:
+             singles = flat_paths
+
+     output_bam = Path(output_bam)
+     output_bam.parent.mkdir(parents=True, exist_ok=True)
+
+     # ---------- barcodes ----------
+     barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
+     per_path_barcode: Dict[Path, str] = {}
+     barcodes_in_order: List[str] = []
+
+     for r1, r2 in explicit_pairs:
+         bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+         per_path_barcode[r1] = bc
+         per_path_barcode[r2] = bc
+         if bc not in barcodes_in_order:
+             barcodes_in_order.append(bc)
+     for pth in singles:
+         bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
+         per_path_barcode[pth] = bc
+         if bc not in barcodes_in_order:
+             barcodes_in_order.append(bc)
+
+     # ---------- BAM header ----------
+     header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
+     if add_read_group:
+         header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+     header.setdefault("PG", []).append(
+         {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
+     )
+
+     # ---------- counters ----------
+     per_file_counts: Dict[Path, int] = {}
+     total_written = 0
+     paired_pairs_written = 0
+     singletons_written = 0
+
+     # ---------- write BAM ----------
+     with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+         # Paired
+         it_pairs = explicit_pairs
+         if progress and it_pairs:
+             it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
+         for r1_path, r2_path in it_pairs:
+             if not (r1_path.exists() and r2_path.exists()):
+                 raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+             bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+
+             it1 = _fastq_iter(r1_path)
+             it2 = _fastq_iter(r2_path)
+
+             for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                 def _clean(n: Optional[str]) -> Optional[str]:
+                     if n is None:
+                         return None
+                     return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
+
+                 name = (
+                     _clean(getattr(rec1, "name", None))
+                     or _clean(getattr(rec2, "name", None))
+                     or getattr(rec1, "name", None)
+                     or getattr(rec2, "name", None)
+                 )
+
+                 if rec1 is not None:
+                     a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                     bam_out.write(a1)
+                     per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
+                     total_written += 1
+                 if rec2 is not None:
+                     a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                     bam_out.write(a2)
+                     per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
+                     total_written += 1
+
+                 if rec1 is not None and rec2 is not None:
+                     paired_pairs_written += 1
+                 else:
+                     if rec1 is not None:
+                         singletons_written += 1
+                     if rec2 is not None:
+                         singletons_written += 1
+
+         # Singles
+         it_singles = singles
+         if progress and it_singles:
+             it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
+         for pth in it_singles:
+             if not pth.exists():
+                 raise FileNotFoundError(pth)
+             bc = per_path_barcode.get(pth, "barcode")
+             for rec in _fastq_iter(pth):
+                 a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                 bam_out.write(a)
+                 per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
+                 total_written += 1
+                 singletons_written += 1
+
+     return {
+         "total_reads": total_written,
+         "per_file": {str(k): v for k, v in per_file_counts.items()},
+         "paired_pairs_written": paired_pairs_written,
+         "singletons_written": singletons_written,
+         "barcodes": barcodes_in_order,
+     }
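A usage sketch with hypothetical files, relying on the auto-pairing described in the docstring (sampleA_R1/_R2 pair up by filename); an explicit barcode_map is passed here because the automatic fallback takes the last underscore-delimited token of the filename, which for a pair would be the read token:

summary = concatenate_fastqs_to_bam(
    fastq_files=["run1/sampleA_R1.fastq.gz", "run1/sampleA_R2.fastq.gz"],  # auto-paired
    output_bam="unaligned/sampleA.bam",
    barcode_map={
        "run1/sampleA_R1.fastq.gz": "bc01",
        "run1/sampleA_R2.fastq.gz": "bc01",
    },
    rg_sample_field="sampleA",  # adds SM=sampleA to the @RG line
)
print(summary["total_reads"], summary["barcodes"])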
smftools/informatics/archived/helpers/archived/count_aligned_reads.py
@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
         record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
 
     """
-    from .. import readwrite
+    from ... import readwrite
     import pysam
     from tqdm import tqdm
     from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
     # Make a dictionary, keyed by the reference_name of the reference chromosome, that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
     record_counts = defaultdict(int)
 
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
        total_reads = bam.mapped + bam.unmapped
        # Iterate over reads to get the total mapped read counts and the reads that map to each reference
        for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
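A hypothetical call reflecting the change above, which wraps bam_file in str() so that Path objects work as input; the return structure is assumed from the docstring fragment, not confirmed by the diff:

from pathlib import Path

# str(bam_file) in the patched line means a Path now works here
record_counts = count_aligned_reads(Path("results/aligned/sample1.bam"))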