smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,259 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Tuple, Union, Optional
5
+ import re
6
+ from itertools import zip_longest
7
+
8
+ import pysam
9
+ from tqdm import tqdm
10
+
11
+
12
+ def concatenate_fastqs_to_bam(
13
+ fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
14
+ output_bam: Union[str, Path],
15
+ barcode_tag: str = "BC",
16
+ barcode_map: Optional[Dict[Union[str, Path], str]] = None,
17
+ add_read_group: bool = True,
18
+ rg_sample_field: Optional[str] = None,
19
+ progress: bool = True,
20
+ auto_pair: bool = True,
21
+ ) -> Dict[str, Any]:
22
+ """
23
+ Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
24
+
25
+ Parameters
26
+ ----------
27
+ fastq_files : list[Path|str] or list[(Path|str, Path|str)]
28
+ Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
29
+ output_bam : Path|str
30
+ Output BAM path (parent directory will be created).
31
+ barcode_tag : str
32
+ SAM tag used to store barcode on each read (default 'BC').
33
+ barcode_map : dict or None
34
+ Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
35
+ add_read_group : bool
36
+ If True, add @RG header lines (ID = barcode) and set each read's RG tag.
37
+ rg_sample_field : str or None
38
+ If set, include SM=<value> in @RG.
39
+ progress : bool
40
+ Show tqdm progress bars.
41
+ auto_pair : bool
42
+ Auto-pair R1/R2 based on filename patterns if given a flat list.
43
+
44
+ Returns
45
+ -------
46
+ dict
47
+ {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
48
+ """
49
+
50
+ # ---------- helpers (Pathlib-only) ----------
51
+ def _strip_fastq_ext(p: Path) -> str:
52
+ """
53
+ Remove common FASTQ multi-suffixes; return stem-like name.
54
+ """
55
+ name = p.name
56
+ lowers = name.lower()
57
+ for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
58
+ if lowers.endswith(ext):
59
+ return name[: -len(ext)]
60
+ return p.stem # fallback: remove last suffix only
61
+
62
+ def _extract_barcode_from_filename(p: Path) -> str:
63
+ stem = _strip_fastq_ext(p)
64
+ if "_" in stem:
65
+ token = stem.split("_")[-1]
66
+ if token:
67
+ return token
68
+ return stem
69
+
70
+ def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
71
+ # return (prefix, readnum) if matches; else (None, None)
72
+ patterns = [
73
+ r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
74
+ r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
75
+ ]
76
+ for pat in patterns:
77
+ m = re.match(pat, stem)
78
+ if m:
79
+ return m.group(1), int(m.group(2))
80
+ return None, None
81
+
82
+ def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
83
+ pref_map: Dict[str, Dict[int, Path]] = {}
84
+ unpaired: List[Path] = []
85
+ for pth in paths:
86
+ stem = _strip_fastq_ext(pth)
87
+ pref, num = _classify_read_token(stem)
88
+ if pref is None:
89
+ unpaired.append(pth)
90
+ else:
91
+ entry = pref_map.setdefault(pref, {})
92
+ entry[num] = pth
93
+ pairs: List[Tuple[Path, Path]] = []
94
+ leftovers: List[Path] = []
95
+ for d in pref_map.values():
96
+ if 1 in d and 2 in d:
97
+ pairs.append((d[1], d[2]))
98
+ else:
99
+ leftovers.extend(d.values())
100
+ leftovers.extend(unpaired)
101
+ return pairs, leftovers
102
+
103
+ def _fastq_iter(p: Path):
104
+ # pysam.FastxFile handles compressed extensions transparently
105
+ with pysam.FastxFile(str(p)) as fx:
106
+ for rec in fx:
107
+ yield rec # rec.name, rec.sequence, rec.quality
108
+
109
+ def _make_unaligned_segment(
110
+ name: str,
111
+ seq: str,
112
+ qual: Optional[str],
113
+ bc: str,
114
+ read1: bool,
115
+ read2: bool,
116
+ ) -> pysam.AlignedSegment:
117
+ a = pysam.AlignedSegment()
118
+ a.query_name = name
119
+ a.query_sequence = seq
120
+ if qual is not None:
121
+ a.query_qualities = pysam.qualitystring_to_array(qual)
122
+ a.is_unmapped = True
123
+ a.is_paired = read1 or read2
124
+ a.is_read1 = read1
125
+ a.is_read2 = read2
126
+ a.mate_is_unmapped = a.is_paired
127
+ a.reference_id = -1
128
+ a.reference_start = -1
129
+ a.next_reference_id = -1
130
+ a.next_reference_start = -1
131
+ a.template_length = 0
132
+ a.set_tag(barcode_tag, str(bc), value_type="Z")
133
+ if add_read_group:
134
+ a.set_tag("RG", str(bc), value_type="Z")
135
+ return a
136
+
137
+ # ---------- normalize inputs to Path ----------
138
+ def _to_path_pair(x) -> Tuple[Path, Path]:
139
+ a, b = x
140
+ return Path(a), Path(b)
141
+
142
+ explicit_pairs: List[Tuple[Path, Path]] = []
143
+ singles: List[Path] = []
144
+
145
+ if not isinstance(fastq_files, (list, tuple)):
146
+ raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
147
+
148
+ if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
149
+ explicit_pairs = [_to_path_pair(x) for x in fastq_files]
150
+ else:
151
+ flat_paths = [Path(x) for x in fastq_files if x is not None]
152
+ if auto_pair:
153
+ explicit_pairs, leftovers = _pair_by_filename(flat_paths)
154
+ singles = leftovers
155
+ else:
156
+ singles = flat_paths
157
+
158
+ output_bam = Path(output_bam)
159
+ output_bam.parent.mkdir(parents=True, exist_ok=True)
160
+
161
+ # ---------- barcodes ----------
162
+ barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
163
+ per_path_barcode: Dict[Path, str] = {}
164
+ barcodes_in_order: List[str] = []
165
+
166
+ for r1, r2 in explicit_pairs:
167
+ bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
168
+ per_path_barcode[r1] = bc
169
+ per_path_barcode[r2] = bc
170
+ if bc not in barcodes_in_order:
171
+ barcodes_in_order.append(bc)
172
+ for pth in singles:
173
+ bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
174
+ per_path_barcode[pth] = bc
175
+ if bc not in barcodes_in_order:
176
+ barcodes_in_order.append(bc)
177
+
178
+ # ---------- BAM header ----------
179
+ header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
180
+ if add_read_group:
181
+ header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
182
+ header.setdefault("PG", []).append(
183
+ {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
184
+ )
185
+
186
+ # ---------- counters ----------
187
+ per_file_counts: Dict[Path, int] = {}
188
+ total_written = 0
189
+ paired_pairs_written = 0
190
+ singletons_written = 0
191
+
192
+ # ---------- write BAM ----------
193
+ with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
194
+ # Paired
195
+ it_pairs = explicit_pairs
196
+ if progress and it_pairs:
197
+ it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
198
+ for r1_path, r2_path in it_pairs:
199
+ if not (r1_path.exists() and r2_path.exists()):
200
+ raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
201
+ bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
202
+
203
+ it1 = _fastq_iter(r1_path)
204
+ it2 = _fastq_iter(r2_path)
205
+
206
+ for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
207
+ def _clean(n: Optional[str]) -> Optional[str]:
208
+ if n is None:
209
+ return None
210
+ return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
211
+
212
+ name = (
213
+ _clean(getattr(rec1, "name", None))
214
+ or _clean(getattr(rec2, "name", None))
215
+ or getattr(rec1, "name", None)
216
+ or getattr(rec2, "name", None)
217
+ )
218
+
219
+ if rec1 is not None:
220
+ a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
221
+ bam_out.write(a1)
222
+ per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
223
+ total_written += 1
224
+ if rec2 is not None:
225
+ a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
226
+ bam_out.write(a2)
227
+ per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
228
+ total_written += 1
229
+
230
+ if rec1 is not None and rec2 is not None:
231
+ paired_pairs_written += 1
232
+ else:
233
+ if rec1 is not None:
234
+ singletons_written += 1
235
+ if rec2 is not None:
236
+ singletons_written += 1
237
+
238
+ # Singles
239
+ it_singles = singles
240
+ if progress and it_singles:
241
+ it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
242
+ for pth in it_singles:
243
+ if not pth.exists():
244
+ raise FileNotFoundError(pth)
245
+ bc = per_path_barcode.get(pth, "barcode")
246
+ for rec in _fastq_iter(pth):
247
+ a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
248
+ bam_out.write(a)
249
+ per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
250
+ total_written += 1
251
+ singletons_written += 1
252
+
253
+ return {
254
+ "total_reads": total_written,
255
+ "per_file": {str(k): v for k, v in per_file_counts.items()},
256
+ "paired_pairs_written": paired_pairs_written,
257
+ "singletons_written": singletons_written,
258
+ "barcodes": barcodes_in_order,
259
+ }
@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
14
14
  record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
15
15
 
16
16
  """
17
- from .. import readwrite
17
+ from ... import readwrite
18
18
  import pysam
19
19
  from tqdm import tqdm
20
20
  from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
25
25
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
26
26
  record_counts = defaultdict(int)
27
27
 
28
- with pysam.AlignmentFile(bam_file, "rb") as bam:
28
+ with pysam.AlignmentFile(str(bam_file), "rb") as bam:
29
29
  total_reads = bam.mapped + bam.unmapped
30
30
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
31
31
  for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
18
18
  bam_files (list): List of split BAM file path strings
19
19
  Splits an input BAM file on barcode value and makes a BAM index file.
20
20
  """
21
- from .. import readwrite
21
+ from ...readwrite import make_dirs
22
22
  import os
23
23
  import subprocess
24
24
  import glob
25
- from .make_dirs import make_dirs
26
25
 
27
- input_bam = aligned_sorted_BAM + bam_suffix
26
+ input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
28
27
  command = ["dorado", "demux", "--kit-name", barcode_kit]
29
28
  if barcode_both_ends:
30
29
  command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
34
33
  command += ["-t", str(threads)]
35
34
  else:
36
35
  pass
37
- command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
38
- command.append(input_bam)
36
+ command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
37
+ command.append(str(input_bam))
39
38
  command_string = ' '.join(command)
40
39
  print(f"Running: {command_string}")
41
40
  subprocess.run(command)
42
41
 
43
- # Make a BAM index file for the BAMs in that directory
44
- bam_pattern = '*' + bam_suffix
45
- bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
46
- bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
47
- bam_files.sort()
42
+ bam_files = sorted(
43
+ p for p in split_dir.glob(f"*{bam_suffix}")
44
+ if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
45
+ )
48
46
 
49
47
  if not bam_files:
50
48
  raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
@@ -27,7 +27,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
27
27
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
28
28
 
29
29
  #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
30
- with pysam.AlignmentFile(bam_file, "rb") as bam:
30
+ with pysam.AlignmentFile(str(bam_file), "rb") as bam:
31
31
  total_reads = bam.mapped
32
32
  ref_seq = sequence.upper()
33
33
  for read in bam.fetch(chromosome):
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
23
23
  import glob
24
24
  import zipfile
25
25
 
26
- os.chdir(mod_tsv_dir)
27
26
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
28
- bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
27
+ bam_files = glob.glob(split_dir / f"*{bam_suffix}")
28
+ print(f"Running modkit extract for the following bam files: {bam_files}")
29
29
 
30
30
  if threads:
31
31
  threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
35
35
  for input_file in bam_files:
36
36
  print(input_file)
37
37
  # Extract the file basename
38
- file_name = os.path.basename(input_file)
38
+ file_name = input_file.name
39
39
  if skip_unclassified and "unclassified" in file_name:
40
40
  print("Skipping modkit extract on unclassified reads")
41
41
  else:
42
42
  # Construct the output TSV file path
43
- output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
44
- output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
45
- if os.path.exists(f"{output_tsv}.gz"):
46
- print(f"{output_tsv}.gz already exists, skipping modkit extract")
43
+ output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
44
+ output_tsv_gz = output_tsv + '.gz'
45
+ if output_tsv_gz.exists():
46
+ print(f"{output_tsv_gz} already exists, skipping modkit extract")
47
47
  else:
48
48
  print(f"Extracting modification data from {input_file}")
49
49
  if modkit_summary:
50
50
  # Run modkit summary
51
- subprocess.run(["modkit", "summary", input_file])
51
+ subprocess.run(["modkit", "summary", str(input_file)])
52
52
  else:
53
53
  pass
54
54
  # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
61
61
  "--mod-thresholds", f"a:{m6A_threshold}",
62
62
  "--mod-thresholds", f"h:{hm5C_threshold}",
63
63
  "-t", threads,
64
- input_file, output_tsv
64
+ str(input_file), str(output_tsv)
65
65
  ]
66
66
  else:
67
67
  extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
71
71
  "--mod-thresholds", f"m:{m5C_threshold}",
72
72
  "--mod-thresholds", f"a:{m6A_threshold}",
73
73
  "--mod-thresholds", f"h:{hm5C_threshold}",
74
- input_file, output_tsv
74
+ str(input_file), str(output_tsv)
75
75
  ]
76
76
  subprocess.run(extract_command)
77
77
  # Zip the output TSV
78
78
  print(f'zipping {output_tsv}')
79
79
  if threads:
80
- zip_command = ["pigz", "-f", "-p", threads, output_tsv]
80
+ zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
81
81
  else:
82
- zip_command = ["pigz", "-f", output_tsv]
83
- subprocess.run(zip_command, check=True)
82
+ zip_command = ["pigz", "-f", str(output_tsv)]
83
+ subprocess.run(zip_command, check=True)
84
+
85
+ return
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
67
67
  None (Writes the converted FASTA file).
68
68
  """
69
69
  unconverted = modification_types[0]
70
+ input_fasta = str(input_fasta)
71
+ output_fasta = str(output_fasta)
70
72
 
71
73
  # Detect if input is gzipped
72
74
  open_func = gzip.open if input_fasta.endswith('.gz') else open
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
8
8
  fasta (str): Path to the input fasta
9
9
  """
10
10
  import os
11
+ from pathlib import Path
11
12
  import subprocess
12
13
  from .index_fasta import index_fasta
13
14
 
14
15
  # Make a fasta index file if one isn't already available
15
- index_path = f'{fasta}.fai'
16
- if os.path.exists(index_path):
16
+ index_path = fasta / '.fai'
17
+ if index_path.exists():
17
18
  print(f'Using existing fasta index file: {index_path}')
18
19
  else:
19
20
  index_fasta(fasta)
20
21
 
21
- parent_dir = os.path.dirname(fasta)
22
- fasta_basename = os.path.basename(fasta)
23
- chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
24
- chrom_path = os.path.join(parent_dir, chrom_basename)
22
+ parent_dir = fasta.parent
23
+ fasta_basename = fasta.name
24
+ chrom_basename = fasta.stem + '.chrom.sizes'
25
+ chrom_path = parent_dir / chrom_basename
25
26
 
26
27
  # Make a chromosome length file
27
- if os.path.exists(chrom_path):
28
+ if chrom_path.exists():
28
29
  print(f'Using existing chrom length index file: {chrom_path}')
29
30
  else:
30
31
  with open(chrom_path, 'w') as outfile:
31
- command = ["cut", "-f1,2", index_path]
32
+ command = ["cut", "-f1,2", str(index_path)]
32
33
  subprocess.run(command, stdout=outfile)
@@ -0,0 +1,24 @@
1
+ import pysam
2
+ from pathlib import Path
3
+
4
+ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
5
+ """
6
+ Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
7
+
8
+ Returns
9
+ -------
10
+ Path: path to chrom.sizes file (if requested), else .fai
11
+ """
12
+ fasta = Path(fasta)
13
+ pysam.faidx(str(fasta)) # makes fasta.fai
14
+
15
+ if write_chrom_sizes:
16
+ fai = fasta.with_suffix(fasta.suffix + ".fai")
17
+ chrom_sizes = fasta.with_suffix(".chrom.sizes")
18
+ with open(fai) as f_in, open(chrom_sizes, "w") as out:
19
+ for line in f_in:
20
+ chrom, size = line.split()[:2]
21
+ out.write(f"{chrom}\t{size}\n")
22
+ return chrom_sizes
23
+
24
+ return fasta.with_suffix(fasta.suffix + ".fai")
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
13
13
  import os
14
14
  import subprocess
15
15
 
16
- os.chdir(mod_bed_dir)
17
16
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
18
17
  command = [
19
- "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
18
+ "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
20
19
  "--partition-tag", "BC",
21
20
  "--only-tabs",
22
21
  "--filter-threshold", f'{filter_threshold}',
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
16
16
  import subprocess
17
17
 
18
18
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
19
- subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
19
+ subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
20
20
  command = [
21
- "modkit", "summary", aligned_sorted_output,
21
+ "modkit", "summary", str(aligned_sorted_output),
22
22
  "--filter-threshold", f"{filter_threshold}",
23
23
  "--mod-thresholds", f"m:{m5C_threshold}",
24
24
  "--mod-thresholds", f"a:{m6A_threshold}",
@@ -1,24 +1,5 @@
1
1
  # plot_bed_histograms
2
2
 
3
- def plot_bed_histograms(bed_file, plotting_directory, fasta):
4
- """
5
- Plots read length, coverage, mapq, read quality stats for each record.
6
-
7
- Parameters:
8
- bed_file (str): Path to the bed file to derive metrics from.
9
- plot_directory (str): Path to the directory to write out historgrams.
10
- fasta (str): Path to FASTA corresponding to bed
11
-
12
- Returns:
13
- None
14
- """
15
- import pandas as pd
16
- import matplotlib.pyplot as plt
17
- import numpy as np
18
- import os
19
-
20
- # plot_bed_histograms.py
21
-
22
3
  def plot_bed_histograms(
23
4
  bed_file,
24
5
  plotting_directory,
@@ -15,13 +15,14 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
15
15
  Writes out split BAM files.
16
16
  """
17
17
  import pysam
18
+ from pathlib import Path
18
19
  import os
19
20
 
20
- bam_base = os.path.basename(input_bam)
21
- bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
21
+ bam_base = input_bam.name
22
+ bam_base_minus_suffix = input_bam.stem
22
23
 
23
24
  # Open the input BAM file for reading
24
- with pysam.AlignmentFile(input_bam, "rb") as bam:
25
+ with pysam.AlignmentFile(str(input_bam), "rb") as bam:
25
26
  # Create a dictionary to store output BAM files
26
27
  output_files = {}
27
28
  # Iterate over each read in the BAM file
@@ -32,8 +33,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
32
33
  #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
33
34
  # Open the output BAM file corresponding to the barcode
34
35
  if bc_tag not in output_files:
35
- output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
36
- output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
36
+ output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
37
+ output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
37
38
  # Write the read to the corresponding output BAM file
38
39
  output_files[bc_tag].write(read)
39
40
  except KeyError:
@@ -12,21 +12,21 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
12
12
  None
13
13
  Splits an input BAM file on barcode value and makes a BAM index file.
14
14
  """
15
- from .. import readwrite
15
+ from ...readwrite import date_string, make_dirs
16
+ from pathlib import Path
16
17
  import os
17
- import subprocess
18
+ import pysam
18
19
  import glob
19
20
  from .separate_bam_by_bc import separate_bam_by_bc
20
- from .make_dirs import make_dirs
21
21
 
22
22
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
23
- file_prefix = readwrite.date_string()
23
+ file_prefix = date_string()
24
24
  separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
25
25
  # Make a BAM index file for the BAMs in that directory
26
26
  bam_pattern = '*' + bam_suffix
27
- bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
28
- bam_files = [bam for bam in bam_files if '.bai' not in bam]
27
+ bam_files = glob.glob(split_dir / bam_pattern)
28
+ bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
29
29
  for input_file in bam_files:
30
- subprocess.run(["samtools", "index", input_file])
30
+ pysam.index(input_file)
31
31
 
32
32
  return bam_files
@@ -0,0 +1,49 @@
1
+ from pathlib import Path
2
+ from pyfaidx import Fasta
3
+
4
+ def subsample_fasta_from_bed(
5
+ input_FASTA: str | Path,
6
+ input_bed: str | Path,
7
+ output_directory: str | Path,
8
+ output_FASTA: str | Path
9
+ ) -> None:
10
+ """
11
+ Take a genome-wide FASTA file and a BED file containing
12
+ coordinate windows of interest. Outputs a subsampled FASTA.
13
+ """
14
+
15
+ # Normalize everything to Path
16
+ input_FASTA = Path(input_FASTA)
17
+ input_bed = Path(input_bed)
18
+ output_directory = Path(output_directory)
19
+ output_FASTA = Path(output_FASTA)
20
+
21
+ # Ensure output directory exists
22
+ output_directory.mkdir(parents=True, exist_ok=True)
23
+
24
+ output_FASTA_path = output_directory / output_FASTA
25
+
26
+ # Load the FASTA file using pyfaidx
27
+ fasta = Fasta(str(input_FASTA)) # pyfaidx requires string paths
28
+
29
+ # Open BED + output FASTA
30
+ with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
31
+ for line in bed:
32
+ fields = line.strip().split()
33
+ chrom = fields[0]
34
+ start = int(fields[1]) # BED is 0-based
35
+ end = int(fields[2]) # BED is 0-based and end is exclusive
36
+ desc = " ".join(fields[3:]) if len(fields) > 3 else ""
37
+
38
+ if chrom not in fasta:
39
+ print(f"Warning: {chrom} not found in FASTA")
40
+ continue
41
+
42
+ # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
43
+ sequence = fasta[chrom][start:end].seq
44
+
45
+ header = f">{chrom}:{start}-{end}"
46
+ if desc:
47
+ header += f" {desc}"
48
+
49
+ out_fasta.write(f"{header}\n{sequence}\n")