smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -15,22 +15,23 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
15
15
  """
16
16
  import subprocess
17
17
  import os
18
+ from pathlib import Path
18
19
  import pysam
19
20
  import numpy as np
20
21
  import concurrent.futures
21
22
  from concurrent.futures import ProcessPoolExecutor
22
23
  from .bed_to_bigwig import bed_to_bigwig
23
- from . import make_dirs
24
+ from ...readwrite import make_dirs
24
25
  from .plot_bed_histograms import plot_bed_histograms
25
26
 
26
27
  threads = threads or os.cpu_count() # Use max available cores if not specified
27
28
 
28
29
  # Create necessary directories
29
- plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
30
- bed_dir = os.path.join(out_dir, "beds")
30
+ plotting_dir = out_dir / "bed_cov_histograms"
31
+ bed_dir = out_dir / "beds"
31
32
  make_dirs([plotting_dir, bed_dir])
32
33
 
33
- bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
34
+ bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
34
35
 
35
36
  print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
36
37
 
@@ -64,6 +65,7 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
64
65
 
65
66
  def split_bed(bed):
66
67
  """Splits into aligned and unaligned reads (chrom == '*')."""
68
+ bed = str(bed)
67
69
  aligned = bed.replace(".bed", "_aligned.bed")
68
70
  unaligned = bed.replace(".bed", "_unaligned.bed")
69
71
  with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import Iterable, Optional, Tuple, List
7
+
8
def bam_qc(
    bam_files: Iterable[str | Path],
    bam_qc_dir: str | Path,
    threads: Optional[int],
    modality: str,
    stats: bool = True,
    flagstats: bool = True,
    idxstats: bool = True,
) -> None:
    """
    Run QC on BAM/CRAM files: ``samtools stats``, ``flagstat`` and ``idxstats``.

    Prefers the pysam API; falls back to the ``samtools`` executable on PATH.
    Files are processed in parallel (up to ``threads`` workers; serial by
    default).

    Parameters
    ----------
    bam_files : iterable of str or Path
        BAM/CRAM files to QC.
    bam_qc_dir : str or Path
        Output directory for the per-file ``*_stats.txt`` / ``*_flagstat.txt``
        / ``*_idxstats.txt`` reports (created if missing).
    threads : int or None
        Number of files processed concurrently; ``None`` or 0 means serial.
    modality : str
        'conversion' or 'direct'; currently informational only — a warning is
        printed for unknown values.
    stats, flagstats, idxstats : bool
        Toggle the corresponding QC step.

    Raises
    ------
    RuntimeError
        If there are files to process but neither pysam nor samtools is
        available.
    """
    import subprocess
    import shutil

    # Try to import pysam once; fall back to the samtools CLI otherwise.
    try:
        import pysam
        have_pysam = True
    except Exception:
        have_pysam = False

    bam_qc_dir = Path(bam_qc_dir)
    bam_qc_dir.mkdir(parents=True, exist_ok=True)

    bam_files = [Path(b) for b in bam_files]

    # Fail fast: verify tooling *before* spawning workers, so a missing
    # backend surfaces as one clear error instead of one failure per file
    # (the original checked inside the worker, after already shelling out
    # to samtools for indexing).
    if bam_files and not have_pysam and not shutil.which("samtools"):
        raise RuntimeError("Neither pysam nor samtools is available in PATH.")

    def _has_index(p: Path) -> bool:
        # samtools writes either '<file>.bam.bai' or sibling '<file>.bai'
        # (and '<file>.cram.crai' for CRAM).
        if p.suffix.lower() == ".bam":
            return Path(str(p) + ".bai").exists() or p.with_suffix(".bai").exists()
        if p.suffix.lower() == ".cram":
            return Path(str(p) + ".crai").exists()
        return False

    def _ensure_index(p: Path) -> None:
        # Create an index if none exists; pysam.index handles BAM and CRAM.
        if _has_index(p):
            return
        if have_pysam:
            pysam.index(str(p))
        else:
            subprocess.run(
                ["samtools", "index", str(p)],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            )

    def _qc_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
        """Run the enabled QC steps for one file; return (file, [(task, rc)])."""
        results: List[Tuple[str, int]] = []
        base = bam.stem  # filename without .bam/.cram

        # idxstats needs an index; stats/flagstat do not, so an indexing
        # failure is only a warning and the other steps still run.
        try:
            _ensure_index(bam)
        except Exception as e:
            print(f"[warn] Indexing failed for {bam}: {e}")

        def _run(tool: str, enabled: bool, out_path: Path) -> None:
            # One generic runner replaces the three near-identical closures.
            if not enabled:
                return
            if have_pysam and hasattr(pysam, tool):
                out_path.write_text(getattr(pysam, tool)(str(bam)))
                results.append((f"{tool}(pysam)", 0))
            else:
                with open(out_path, "w") as fh:
                    cp = subprocess.run(
                        ["samtools", tool, str(bam)],
                        stdout=fh, stderr=subprocess.PIPE,
                    )
                results.append((f"{tool}(samtools)", cp.returncode))
                if cp.returncode != 0:
                    raise RuntimeError(cp.stderr.decode(errors="replace"))

        _run("stats", stats, bam_qc_dir / f"{base}_stats.txt")
        _run("flagstat", flagstats, bam_qc_dir / f"{base}_flagstat.txt")
        _run("idxstats", idxstats, bam_qc_dir / f"{base}_idxstats.txt")
        return bam, results

    # Parallel across files (each file's steps run serially inside a worker).
    max_workers = int(threads) if threads and int(threads) > 0 else 1
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_qc_one, b) for b in bam_files]
        for fut in as_completed(futures):
            try:
                bam, res = fut.result()
                summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
                print(f"[qc] {bam.name}: {summary}")
            except Exception as e:
                # One file failing must not abort QC of the others.
                print(f"[error] QC failed: {e}")

    # Placeholders to keep your signature stable
    if modality not in {"conversion", "direct"}:
        print(f"[warn] Unknown modality '{modality}', continuing.")

    print("QC processing completed.")
@@ -0,0 +1,90 @@
1
+ from pathlib import Path
2
+ import pybedtools
3
+ import pyBigWig
4
+
5
def bed_to_bigwig(fasta: str, bed: str) -> str:
    """
    Convert a BED file into a bigWig coverage track (BED → bedGraph → bigWig).

    Requires the FASTA index (``<fasta>.fai``, as written by ``samtools
    faidx``) to exist next to the reference; it supplies the chromosome sizes.

    Parameters
    ----------
    fasta : str
        Path to the reference FASTA; ``<fasta>.fai`` must sit alongside it.
    bed : str
        Path to the input BED; outputs are written next to it.

    Returns
    -------
    str
        Path of the bigWig file that was written.
    """
    # Local imports keep this optional dependency off the module import path.
    import pybedtools
    import pyBigWig

    bed = Path(bed)
    fa = Path(fasta)
    # BUGFIX: samtools faidx writes the index next to the FASTA as
    # '<name>.fa.fai'; the original looked in the BED's directory and
    # dropped the '.fa' suffix, so the index was never found.
    fai = Path(str(fa) + ".fai")

    bedgraph = bed.parent / f"{bed.stem}.bedgraph"
    bigwig = bed.parent / f"{bed.stem}.bw"

    # 1) Compute coverage → bedGraph (bedtools genomecov -bg). A .fai is
    # accepted as the genome file: only its first two columns (name, size)
    # are read.
    print(f"[pybedtools] generating coverage bedgraph from {bed}")
    bt = pybedtools.BedTool(str(bed))
    bt.genome_coverage(bg=True, genome=str(fai)).saveas(str(bedgraph))

    # 2) Convert bedGraph → BigWig via pyBigWig
    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")

    # read chrom sizes from the FASTA .fai index
    chrom_sizes = {}
    with open(fai) as fh:
        for line in fh:
            fields = line.strip().split("\t")
            chrom_sizes[fields[0]] = int(fields[1])

    bw = pyBigWig.open(str(bigwig), "w")
    try:
        bw.addHeader(list(chrom_sizes.items()))
        with open(bedgraph) as fh:
            for line in fh:
                chrom, start, end, coverage = line.strip().split()
                # pyBigWig's addEntries expects parallel lists, not scalars
                # — NOTE(review): the original passed scalars; confirm
                # against the pyBigWig API docs.
                bw.addEntries(
                    [chrom], [int(start)],
                    ends=[int(end)], values=[float(coverage)],
                )
    finally:
        # Ensure the bigWig handle is closed (and the index flushed) even
        # if a malformed bedGraph line raises mid-loop.
        bw.close()

    print(f"BigWig written: {bigwig}")
    return str(bigwig)
@@ -0,0 +1,259 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Tuple, Union, Optional
5
+ import re
6
+ from itertools import zip_longest
7
+
8
+ import pysam
9
+ from tqdm import tqdm
10
+
11
+
12
def concatenate_fastqs_to_bam(
    fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
    output_bam: Union[str, Path],
    barcode_tag: str = "BC",
    barcode_map: Optional[Dict[Union[str, Path], str]] = None,
    add_read_group: bool = True,
    rg_sample_field: Optional[str] = None,
    progress: bool = True,
    auto_pair: bool = True,
) -> Dict[str, Any]:
    """
    Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.

    Parameters
    ----------
    fastq_files : list[Path|str] or list[(Path|str, Path|str)]
        Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
    output_bam : Path|str
        Output BAM path (parent directory will be created).
    barcode_tag : str
        SAM tag used to store barcode on each read (default 'BC').
    barcode_map : dict or None
        Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
    add_read_group : bool
        If True, add @RG header lines (ID = barcode) and set each read's RG tag.
    rg_sample_field : str or None
        If set, include SM=<value> in @RG.
    progress : bool
        Show tqdm progress bars.
    auto_pair : bool
        Auto-pair R1/R2 based on filename patterns if given a flat list.

    Returns
    -------
    dict
        {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
    """

    # ---------- helpers (Pathlib-only) ----------
    def _strip_fastq_ext(p: Path) -> str:
        """
        Remove common FASTQ multi-suffixes; return stem-like name.
        """
        name = p.name
        lowers = name.lower()
        # Multi-part extensions first, so 'x.fastq.gz' -> 'x', not 'x.fastq'.
        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
            if lowers.endswith(ext):
                return name[: -len(ext)]
        return p.stem  # fallback: remove last suffix only

    def _extract_barcode_from_filename(p: Path) -> str:
        # Barcode = last underscore-delimited token of the stem
        # (e.g. 'sample_BC01.fastq.gz' -> 'BC01'); whole stem if no underscore.
        stem = _strip_fastq_ext(p)
        if "_" in stem:
            token = stem.split("_")[-1]
            if token:
                return token
        return stem

    def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
        # return (prefix, readnum) if matches; else (None, None)
        patterns = [
            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
        ]
        for pat in patterns:
            m = re.match(pat, stem)
            if m:
                return m.group(1), int(m.group(2))
        return None, None

    def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
        # Group files sharing a prefix into {1: R1, 2: R2}; files without a
        # recognizable read token, or missing their mate, come back unpaired.
        pref_map: Dict[str, Dict[int, Path]] = {}
        unpaired: List[Path] = []
        for pth in paths:
            stem = _strip_fastq_ext(pth)
            pref, num = _classify_read_token(stem)
            if pref is None:
                unpaired.append(pth)
            else:
                entry = pref_map.setdefault(pref, {})
                entry[num] = pth
        pairs: List[Tuple[Path, Path]] = []
        leftovers: List[Path] = []
        for d in pref_map.values():
            if 1 in d and 2 in d:
                pairs.append((d[1], d[2]))
            else:
                leftovers.extend(d.values())
        leftovers.extend(unpaired)
        return pairs, leftovers

    def _fastq_iter(p: Path):
        # pysam.FastxFile handles compressed extensions transparently
        with pysam.FastxFile(str(p)) as fx:
            for rec in fx:
                yield rec  # rec.name, rec.sequence, rec.quality

    def _make_unaligned_segment(
        name: str,
        seq: str,
        qual: Optional[str],
        bc: str,
        read1: bool,
        read2: bool,
    ) -> pysam.AlignedSegment:
        # Build an unmapped record: no reference coordinates, mate flagged
        # unmapped for paired reads, barcode stored in `barcode_tag` (and RG).
        a = pysam.AlignedSegment()
        a.query_name = name
        a.query_sequence = seq
        if qual is not None:
            a.query_qualities = pysam.qualitystring_to_array(qual)
        a.is_unmapped = True
        a.is_paired = read1 or read2
        a.is_read1 = read1
        a.is_read2 = read2
        a.mate_is_unmapped = a.is_paired  # single-end reads carry no mate flag
        a.reference_id = -1
        a.reference_start = -1
        a.next_reference_id = -1
        a.next_reference_start = -1
        a.template_length = 0
        a.set_tag(barcode_tag, str(bc), value_type="Z")
        if add_read_group:
            a.set_tag("RG", str(bc), value_type="Z")
        return a

    # ---------- normalize inputs to Path ----------
    def _to_path_pair(x) -> Tuple[Path, Path]:
        a, b = x
        return Path(a), Path(b)

    explicit_pairs: List[Tuple[Path, Path]] = []
    singles: List[Path] = []

    if not isinstance(fastq_files, (list, tuple)):
        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")

    # A list made up *entirely* of 2-tuples is treated as explicit (R1,R2)
    # pairs; otherwise it is a flat list, optionally auto-paired by filename.
    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
        explicit_pairs = [_to_path_pair(x) for x in fastq_files]
    else:
        flat_paths = [Path(x) for x in fastq_files if x is not None]
        if auto_pair:
            explicit_pairs, leftovers = _pair_by_filename(flat_paths)
            singles = leftovers
        else:
            singles = flat_paths

    output_bam = Path(output_bam)
    output_bam.parent.mkdir(parents=True, exist_ok=True)

    # ---------- barcodes ----------
    # Both mates of a pair share one barcode (R1's filename wins when neither
    # is in barcode_map); first-seen order is preserved for the @RG header.
    barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
    per_path_barcode: Dict[Path, str] = {}
    barcodes_in_order: List[str] = []

    for r1, r2 in explicit_pairs:
        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
        per_path_barcode[r1] = bc
        per_path_barcode[r2] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)
    for pth in singles:
        bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
        per_path_barcode[pth] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)

    # ---------- BAM header ----------
    # Unaligned BAM: empty @SQ list; one @RG per barcode when requested.
    header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
    if add_read_group:
        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
    header.setdefault("PG", []).append(
        {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
    )

    # ---------- counters ----------
    per_file_counts: Dict[Path, int] = {}
    total_written = 0
    paired_pairs_written = 0
    singletons_written = 0

    # ---------- write BAM ----------
    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
        # Paired
        it_pairs = explicit_pairs
        if progress and it_pairs:
            it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
        for r1_path, r2_path in it_pairs:
            if not (r1_path.exists() and r2_path.exists()):
                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"

            it1 = _fastq_iter(r1_path)
            it2 = _fastq_iter(r2_path)

            # zip_longest keeps going when one mate file is shorter; the
            # missing side arrives as None and is counted as a singleton.
            for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
                def _clean(n: Optional[str]) -> Optional[str]:
                    # Strip trailing '/1', '/2' or ' 1'/' 2' so both mates
                    # end up with the same query name.
                    if n is None:
                        return None
                    return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)

                name = (
                    _clean(getattr(rec1, "name", None))
                    or _clean(getattr(rec2, "name", None))
                    or getattr(rec1, "name", None)
                    or getattr(rec2, "name", None)
                )

                if rec1 is not None:
                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
                    bam_out.write(a1)
                    per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
                    total_written += 1
                if rec2 is not None:
                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
                    bam_out.write(a2)
                    per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
                    total_written += 1

                if rec1 is not None and rec2 is not None:
                    paired_pairs_written += 1
                else:
                    if rec1 is not None:
                        singletons_written += 1
                    if rec2 is not None:
                        singletons_written += 1

        # Singles
        it_singles = singles
        if progress and it_singles:
            it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
        for pth in it_singles:
            if not pth.exists():
                raise FileNotFoundError(pth)
            bc = per_path_barcode.get(pth, "barcode")
            for rec in _fastq_iter(pth):
                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
                bam_out.write(a)
                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
                total_written += 1
                singletons_written += 1

    return {
        "total_reads": total_written,
        "per_file": {str(k): v for k, v in per_file_counts.items()},
        "paired_pairs_written": paired_pairs_written,
        "singletons_written": singletons_written,
        "barcodes": barcodes_in_order,
    }
@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
14
14
  record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
15
15
 
16
16
  """
17
- from .. import readwrite
17
+ from ... import readwrite
18
18
  import pysam
19
19
  from tqdm import tqdm
20
20
  from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
25
25
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
26
26
  record_counts = defaultdict(int)
27
27
 
28
- with pysam.AlignmentFile(bam_file, "rb") as bam:
28
+ with pysam.AlignmentFile(str(bam_file), "rb") as bam:
29
29
  total_reads = bam.mapped + bam.unmapped
30
30
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
31
31
  for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
18
18
  bam_files (list): List of split BAM file path strings
19
19
  Splits an input BAM file on barcode value and makes a BAM index file.
20
20
  """
21
- from .. import readwrite
21
+ from ...readwrite import make_dirs
22
22
  import os
23
23
  import subprocess
24
24
  import glob
25
- from .make_dirs import make_dirs
26
25
 
27
- input_bam = aligned_sorted_BAM + bam_suffix
26
+ input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
28
27
  command = ["dorado", "demux", "--kit-name", barcode_kit]
29
28
  if barcode_both_ends:
30
29
  command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
34
33
  command += ["-t", str(threads)]
35
34
  else:
36
35
  pass
37
- command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
38
- command.append(input_bam)
36
+ command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
37
+ command.append(str(input_bam))
39
38
  command_string = ' '.join(command)
40
39
  print(f"Running: {command_string}")
41
40
  subprocess.run(command)
42
41
 
43
- # Make a BAM index file for the BAMs in that directory
44
- bam_pattern = '*' + bam_suffix
45
- bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
46
- bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
47
- bam_files.sort()
42
+ bam_files = sorted(
43
+ p for p in split_dir.glob(f"*{bam_suffix}")
44
+ if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
45
+ )
48
46
 
49
47
  if not bam_files:
50
48
  raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
@@ -27,7 +27,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
27
27
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
28
28
 
29
29
  #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
30
- with pysam.AlignmentFile(bam_file, "rb") as bam:
30
+ with pysam.AlignmentFile(str(bam_file), "rb") as bam:
31
31
  total_reads = bam.mapped
32
32
  ref_seq = sequence.upper()
33
33
  for read in bam.fetch(chromosome):