smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,637 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Union, Iterable
4
+
5
+ from .helpers import AdataPaths
6
+
7
def check_executable_exists(cmd: str) -> bool:
    """Report whether *cmd* resolves to an executable via ``shutil.which``.

    Parameters
    ----------
    cmd : str
        Name (or path) of the command-line program to look up.

    Returns
    -------
    bool
        True when ``shutil.which`` finds the command, False otherwise.
    """
    resolved = shutil.which(cmd)
    if resolved is None:
        return False
    return True
10
+
11
def delete_tsvs(
    tsv_dir: Union[str, Path, Iterable[str], None],
    *,
    dry_run: bool = False,
    verbose: bool = True,
):
    """
    Delete intermediate tsv directories.

    Parameters
    ----------
    tsv_dir : str | Path | Iterable[str] | None
        A directory to remove recursively, or an iterable of such directories.
        ``None`` is a no-op.
    dry_run : bool
        If True, print what *would* be removed but do not actually delete.
    verbose : bool
        Print progress / skip notices. Dry-run and error messages are always printed.

    Notes
    -----
    Removal is best-effort: an ``OSError`` raised while deleting a directory is
    reported on stdout and does not propagate.
    """
    from os import PathLike

    if tsv_dir is None:
        return

    # A single str / path-like is one target; any other iterable is a collection of targets.
    if isinstance(tsv_dir, (str, PathLike)):
        targets = [tsv_dir]
    else:
        targets = list(tsv_dir)

    for target in targets:
        td = Path(target)

        if not td.exists():
            if verbose:
                print(f"[skip] tsv_dir not found: {td}")
            continue

        if not td.is_dir():
            if verbose:
                print(f"[skip] tsv_dir is not a directory: {td}")
            continue

        if dry_run:
            print(f"[dry-run] would remove directory tree: {td}")
            continue

        try:
            shutil.rmtree(td)
        except OSError as e:
            print(f"[error] failed to remove tsv_dir {td}: {e}")
        else:
            if verbose:
                print(f"Removed directory tree: {td}")
70
+
71
def _require_executable(cmd: str, install_hint: str) -> None:
    """Raise RuntimeError when *cmd* cannot be found on PATH."""
    if not check_executable_exists(cmd):
        raise RuntimeError(
            f"Error: '{cmd}' is not installed or not in PATH. {install_hint}"
        )


def _list_bams(directory, bam_suffix):
    """Return the sorted regular files in *directory* whose suffix equals *bam_suffix*."""
    return sorted(
        p for p in directory.iterdir()
        if p.is_file() and p.suffix == bam_suffix
    )


def _split_unclassified(bams):
    """Partition BAM paths into (classified, unclassified) using the 'unclassified' name marker."""
    bams = list(bams)
    classified = [p for p in bams if "unclassified" not in p.name]
    unclassified = [p for p in bams if "unclassified" in p.name]
    return classified, unclassified


def _unlink_bam_and_index(bam) -> None:
    """Delete a BAM and its '<name>.bai' index, tolerating files that are already gone."""
    bam = Path(bam)
    bai = bam.parent / (bam.name + '.bai')
    bam.unlink(missing_ok=True)
    bai.unlink(missing_ok=True)


def load_adata_core(cfg, paths: AdataPaths):
    """
    Core load pipeline.

    Assumes:
    - cfg is a fully initialized ExperimentConfig
    - paths is an AdataPaths object describing canonical h5ad stage paths
      (accepted for interface compatibility; this function does not read it)
    - No stage-skipping or early returns based on existing AnnDatas are done here
      (that happens in the wrapper).

    Does:
    - handle input format (fast5 -> pod5, fastq -> unaligned BAM, pod5, bam)
    - basecalling / alignment / demux / BAM QC
    - optional bed + bigwig generation
    - AnnData construction (conversion or direct modality)
    - basic read-level QC annotations
    - write the raw AnnData
    - run MultiQC
    - optional deletion of intermediate BAMs

    Returns
    -------
    raw_adata : anndata.AnnData
        Newly created raw AnnData object.
    raw_adata_path : Path
        Path reported by the AnnData construction step and used for writing
        (expected to correspond to paths.raw -- TODO confirm).
    cfg : ExperimentConfig
        (Same object, possibly with some fields updated, e.g. fasta path.)

    Raises
    ------
    RuntimeError
        If a required executable (dorado, modkit, minimap2) is not on PATH.
    ValueError
        If the input type is neither pod5 nor bam after input conversion, or if
        no usable FASTA path is configured.
    """
    from pathlib import Path

    import numpy as np

    from .helpers import write_gz_h5ad

    from ..readwrite import make_dirs, add_or_update_column_in_csv

    from ..informatics.bam_functions import (
        concatenate_fastqs_to_bam,
        align_and_sort_BAM,
        demux_and_index_BAM,
        split_and_index_BAM,
        bam_qc,
        extract_read_features_from_bam,
    )
    from ..informatics.bed_functions import aligned_BAM_to_bed
    from ..informatics.pod5_functions import fast5_to_pod5
    from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
    from ..informatics.basecalling import modcall, canoncall
    from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
    from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
    from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
    from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
    from ..informatics.run_multiqc import run_multiqc

    dorado_hint = "Install from https://github.com/nanoporetech/dorado"
    modkit_hint = "Install from https://github.com/nanoporetech/modkit"

    ################################### 1) General params and input organization ###################################
    output_directory = Path(cfg.output_directory)
    make_dirs([output_directory])

    # Naming of the demultiplexed output directory
    double_barcoded_path = cfg.split_path / "both_ends_barcoded"
    single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"

    # Direct methylation detection SMF specific parameters
    if cfg.smf_modality == "direct":
        mod_bed_dir = cfg.output_directory / "mod_beds"
        add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
        mod_tsv_dir = cfg.output_directory / "mod_tsvs"
        add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
        mods = [cfg.mod_map[mod] for mod in cfg.mod_list]

        _require_executable("dorado", dorado_hint)
        _require_executable("modkit", modkit_hint)
    else:
        mod_bed_dir = None
        mod_tsv_dir = None
        mods = None

    # demux / aligner executables
    if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
        _require_executable("dorado", dorado_hint)

    if cfg.aligner == "minimap2":
        _require_executable("minimap2", "Install minimap2")

    # Detect the input filetypes
    if cfg.input_type == "fast5":
        # Convert the input directory of fast5 files into a single pod5 file in the output directory.
        output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
        if not output_pod5.exists():
            print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
            fast5_to_pod5(cfg.input_data_path, output_pod5)
        # Point the input at the (new or pre-existing) pod5 file.
        cfg.input_data_path = output_pod5
        cfg.input_type = "pod5"
    elif cfg.input_type == "fastq":
        # Concatenate the fastq input(s) into an unaligned BAM that carries the barcode tag.
        output_bam = cfg.output_directory / 'canonical_basecalls.bam'
        if not output_bam.exists():
            summary = concatenate_fastqs_to_bam(
                cfg.input_files,
                output_bam,
                barcode_tag='BC',
                gzip_suffixes=('.gz', '.gzip'),
                barcode_map=cfg.fastq_barcode_map,
                add_read_group=True,
                rg_sample_field=None,
                progress=False,
                auto_pair=cfg.fastq_auto_pairing)

            print(f"Found the following barcodes: {summary['barcodes']}")

        # Set the input data path to the concatenated BAM.
        cfg.input_data_path = output_bam
        cfg.input_type = "bam"

    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)

    # Determine if the input data needs to be basecalled
    if cfg.input_type == "pod5":
        print(f'Detected pod5 inputs: {cfg.input_files}')
        basecall = True
    elif cfg.input_type in ["bam"]:
        print(f'Detected bam input: {cfg.input_files}')
        basecall = False
    else:
        # Fail here with a clear message instead of hitting an unbound `basecall` below.
        raise ValueError(
            f"Error, can not find input bam or pod5 (input_type={cfg.input_type!r})"
        )

    # Generate the base name of the unaligned bam without the .bam suffix
    if basecall:
        model_basename = Path(cfg.model).name
        model_basename = str(model_basename).replace('.', '_')
        if cfg.smf_modality == 'direct':
            mod_string = "_".join(cfg.mod_list)
            bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
        else:
            bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
    else:
        # NOTE(review): for a BAM input located outside output_directory this path
        # points inside output_directory -- confirm align_and_sort_BAM expects that.
        bam_base = cfg.input_data_path.name
        bam = cfg.output_directory / bam_base

    # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
    unaligned_output = bam.with_suffix(cfg.bam_suffix)
    # Building from the stem keeps aligned outputs in output_directory even when the input bam lives in a separate directory.
    aligned_BAM = cfg.output_directory / (bam.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)

    add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
    add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
    add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
    ########################################################################################################################

    ################################### 2) FASTA Handling ###################################
    try:
        cfg.fasta = Path(cfg.fasta)
    except TypeError as err:
        # Every later step needs the reference, so stop here rather than crash downstream.
        raise ValueError(
            "Need to provide an input FASTA path to proceed with smftools load"
        ) from err

    # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
    # str() allows the option to be given as either a string or a Path.
    if cfg.fasta_regions_of_interest and '.bed' in str(cfg.fasta_regions_of_interest):
        fasta_basename = cfg.fasta.parent / cfg.fasta.stem
        bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
        output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
        subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
        fasta = cfg.output_directory / output_FASTA
    else:
        fasta = cfg.fasta

    # For conversion style SMF, make a converted reference FASTA
    if cfg.smf_modality == 'conversion':
        fasta_basename = fasta.parent / fasta.stem
        converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
        converted_FASTA = cfg.output_directory / converted_FASTA_basename
        if 'converted.fa' in fasta.name:
            print(f'{fasta} is already converted. Using existing converted FASTA.')
            converted_FASTA = fasta
        elif converted_FASTA.exists():
            print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
        else:
            generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
        fasta = converted_FASTA

    add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)

    # Make a FAI and .chrom.names file for the fasta
    get_chromosome_lengths(fasta)
    ########################################################################################################################

    ################################### 3) Basecalling ###################################
    # Basecall using dorado
    if basecall and cfg.sequencer == 'ont':
        try:
            cfg.model_dir = Path(cfg.model_dir)
        except TypeError:
            # Best-effort: existing BAMs may make the model directory unnecessary, so only warn.
            print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
        if aligned_sorted_output.exists():
            print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
        elif unaligned_output.exists():
            print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
        elif cfg.smf_modality != 'direct':
            canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
        else:
            modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
    elif basecall:
        print("Basecalling is currently only supported for ont sequencers and not pacbio.")
    ########################################################################################################################

    ################################### 4) Alignment and sorting #############################################
    # Align the BAM to the reference FASTA and sort the bam on positional coordinates.
    if aligned_sorted_output.exists():
        print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
    else:
        align_and_sort_BAM(fasta, unaligned_output, cfg)
        # Delete the unsorted aligned output (tolerate it not having been left on disk).
        aligned_output.unlink(missing_ok=True)

    if cfg.make_beds:
        # Make beds and provide basic histograms
        bed_dir = cfg.output_directory / 'beds'
        if bed_dir.is_dir():
            print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
        else:
            aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
    ########################################################################################################################

    ################################### 5) Demultiplexing ######################################################################
    # Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
    if cfg.input_already_demuxed:
        if cfg.split_path.is_dir():
            print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
            all_bam_files = _list_bams(cfg.split_path, cfg.bam_suffix)
            bam_files, unclassified_bams = _split_unclassified(all_bam_files)
        else:
            make_dirs([cfg.split_path])
            all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
                                                cfg.split_path,
                                                cfg.bam_suffix)
            bam_files, unclassified_bams = _split_unclassified(all_bam_files)
            bam_files = sorted(bam_files)

        se_bam_files = bam_files
        bam_dir = cfg.split_path

    else:
        if single_barcoded_path.is_dir():
            print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
            all_se_bam_files = _list_bams(single_barcoded_path, cfg.bam_suffix)
        else:
            make_dirs([cfg.split_path, single_barcoded_path])
            all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
                                                   single_barcoded_path,
                                                   cfg.bam_suffix,
                                                   cfg.barcode_kit,
                                                   False,
                                                   cfg.trim,
                                                   cfg.threads)
        se_bam_files, unclassified_se_bams = _split_unclassified(all_se_bam_files)

        if double_barcoded_path.is_dir():
            print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
            all_de_bam_files = _list_bams(double_barcoded_path, cfg.bam_suffix)
        else:
            make_dirs([cfg.split_path, double_barcoded_path])
            all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
                                                   double_barcoded_path,
                                                   cfg.bam_suffix,
                                                   cfg.barcode_kit,
                                                   True,
                                                   cfg.trim,
                                                   cfg.threads)
        de_bam_files, unclassified_de_bams = _split_unclassified(all_de_bam_files)

        bam_files = se_bam_files + de_bam_files
        unclassified_bams = unclassified_se_bams + unclassified_de_bams
        bam_dir = single_barcoded_path

    add_or_update_column_in_csv(cfg.summary_file, "demuxed_bams", [se_bam_files])

    if cfg.make_beds:
        # Make beds and provide basic histograms
        bed_dir = cfg.split_path / 'beds'
        if bed_dir.is_dir():
            print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
        else:
            for demuxed_bam in bam_files:
                aligned_BAM_to_bed(demuxed_bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
    ########################################################################################################################

    ################################### 6) SAMTools based BAM QC ######################################################################
    # Samtools QC metrics on split BAM files
    bam_qc_dir = cfg.split_path / "bam_qc"
    if bam_qc_dir.is_dir():
        print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
    else:
        make_dirs([bam_qc_dir])
        bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
    ########################################################################################################################

    ################################### 7) AnnData loading ######################################################################
    if cfg.smf_modality != 'direct':
        # Take the converted BAM and load it into an adata object.
        deaminase_footprinting = cfg.smf_modality == 'deaminase'
        raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
                                                           bam_dir,
                                                           cfg.output_directory,
                                                           cfg.input_already_demuxed,
                                                           cfg.mapping_threshold,
                                                           cfg.experiment_name,
                                                           cfg.conversion_types,
                                                           cfg.bam_suffix,
                                                           cfg.device,
                                                           cfg.threads,
                                                           deaminase_footprinting,
                                                           delete_intermediates=cfg.delete_intermediate_hdfs,
                                                           double_barcoded_path=double_barcoded_path)
    else:
        if mod_bed_dir.is_dir():
            print(f'{mod_bed_dir} already exists, skipping making modbeds')
        else:
            make_dirs([mod_bed_dir])

            modQC(aligned_sorted_output,
                  cfg.thresholds)  # get QC metrics for mod calls

            make_modbed(aligned_sorted_output,
                        cfg.thresholds,
                        mod_bed_dir)  # Generate bed files of position methylation summaries for every sample

        make_dirs([mod_tsv_dir])

        extract_mods(cfg.thresholds,
                     mod_tsv_dir,
                     bam_dir,
                     cfg.bam_suffix,
                     skip_unclassified=cfg.skip_unclassified,
                     modkit_summary=False,
                     threads=cfg.threads)  # Extract methylations calls for split BAM files into split TSV files

        # Load the modification data from TSVs into an adata object
        raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
                                                            bam_dir,
                                                            cfg.output_directory,
                                                            cfg.input_already_demuxed,
                                                            cfg.mapping_threshold,
                                                            cfg.experiment_name,
                                                            mods,
                                                            cfg.batch_size,
                                                            mod_tsv_dir,
                                                            cfg.delete_batch_hdfs,
                                                            cfg.threads,
                                                            double_barcoded_path)
        if cfg.delete_intermediate_tsvs:
            delete_tsvs(mod_tsv_dir)

    # NOTE(review): applied to both modalities; assumes obs carries a 'Barcode' column -- confirm.
    raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
    raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
    ########################################################################################################################

    ############################################### Add basic read length, read quality, mapping quality stats ###############################################
    add_read_length_and_mapping_qc(raw_adata, se_bam_files,
                                   extract_read_features_from_bam_callable=extract_read_features_from_bam,
                                   bypass=cfg.bypass_add_read_length_and_mapping_qc,
                                   force_redo=cfg.force_redo_add_read_length_and_mapping_qc)

    # Per-read sum over X, ignoring NaN entries.
    raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
    ########################################################################################################################

    ############################################### Save final adata ###############################################
    print(f"Saving AnnData to {raw_adata_path}")
    write_gz_h5ad(raw_adata, raw_adata_path)
    ########################################################################################################################

    ############################################### MultiQC HTML Report ###############################################
    mqc_dir = cfg.split_path / "multiqc"
    if mqc_dir.is_dir():
        print(f'{mqc_dir} already exists, skipping multiqc')
    else:
        run_multiqc(cfg.split_path, mqc_dir)
    ########################################################################################################################

    ############################################### delete intermediate BAM files ###############################################
    if cfg.delete_intermediate_bams:
        # delete aligned and sorted bam (and its index)
        _unlink_bam_and_index(aligned_sorted_output)
        # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
        for demuxed_bam in bam_files:
            _unlink_bam_and_index(demuxed_bam)
        for unclassified_bam in unclassified_bams:
            _unlink_bam_and_index(unclassified_bam)
    ########################################################################################################################

    return raw_adata, raw_adata_path, cfg
547
+
548
def load_adata(config_path: str):
    """
    CLI-facing entry point for the load stage.

    Steps:
    - Build an ExperimentConfig from the config file at ``config_path``
    - Derive the canonical AnnData stage paths and record them, together with
      experiment-level metadata, in the summary CSV
    - Unless ``cfg.force_redo_load_adata`` is truthy, return early when a stage
      output already exists (checked in the order hmm, spatial, pp_dedup, pp, raw)
    - Otherwise run ``load_adata_core`` to build the raw AnnData

    Returns
    -------
    adata : anndata.AnnData | None
        The newly built AnnData, or None when the load was skipped because a
        stage output already exists.
    adata_path : pathlib.Path
        Path to the "current" AnnData that downstream steps should use.
    cfg : ExperimentConfig
        Config object for downstream steps.
    """
    from importlib import resources
    from datetime import datetime
    from pathlib import Path

    import pandas as pd  # used for summary file reading downstream if needed

    from ..readwrite import make_dirs, add_or_update_column_in_csv
    from ..config import LoadExperimentConfig, ExperimentConfig

    from .helpers import get_adata_paths

    today_tag = datetime.today().strftime("%y%m%d")

    # --- 1) Build the experiment config ---
    config_loader = LoadExperimentConfig(config_path)
    packaged_defaults = resources.files("smftools").joinpath("config")
    cfg, _report = ExperimentConfig.from_var_dict(
        config_loader.var_dict,
        date_str=today_tag,
        defaults_dir=packaged_defaults,
    )

    # The base output directory must exist before anything is written.
    make_dirs([cfg.output_directory])

    # --- 2) Derive stage paths and register everything in the summary CSV ---
    paths = get_adata_paths(cfg)

    summary_entries = (
        # experiment-level metadata
        ("experiment_name", cfg.experiment_name),
        ("config_path", config_path),
        ("input_data_path", cfg.input_data_path),
        ("input_files", [cfg.input_files]),
        # AnnData stage paths
        ("load_adata", paths.raw),
        ("pp_adata", paths.pp),
        ("pp_dedup_adata", paths.pp_dedup),
        ("spatial_adata", paths.spatial),
        ("hmm_adata", paths.hmm),
    )
    for column_name, column_value in summary_entries:
        add_or_update_column_in_csv(cfg.summary_file, column_name, column_value)

    # --- 3) Stage skipping: the most downstream existing output wins ---
    force_redo = getattr(cfg, "force_redo_load_adata", False)
    if not force_redo:
        stage_outputs = (
            ("HMM AnnData", paths.hmm),
            ("Spatial AnnData", paths.spatial),
            ("Preprocessed deduplicated AnnData", paths.pp_dedup),
            ("Preprocessed AnnData", paths.pp),
            ("Raw AnnData from smftools load", paths.raw),
        )
        for stage_label, stage_path in stage_outputs:
            if stage_path.exists():
                print(f"{stage_label} already exists: {stage_path}\nSkipping smftools load")
                return None, stage_path, cfg

    # Nothing to reuse (or a redo was forced): run the full load pipeline.
    adata, adata_path, cfg = load_adata_core(cfg, paths)
    return adata, adata_path, cfg