smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,577 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import Union, Iterable
4
+
5
def check_executable_exists(cmd: str) -> bool:
    """
    Report whether a command-line executable can be resolved on PATH.

    Parameters
    ----------
    cmd : str
        Name (or path) of the executable to look up.

    Returns
    -------
    bool
        True when ``shutil.which`` resolves ``cmd``; False otherwise.
    """
    # shutil.which yields the resolved location, or None when nothing matches.
    resolved_location = shutil.which(cmd)
    return resolved_location is not None
8
+
9
def delete_tsvs(
    tsv_dir: Union[str, Path, Iterable[str], None],
    *,
    dry_run: bool = False,
    verbose: bool = True,
):
    """
    Delete intermediate tsv directories.

    Parameters
    ----------
    tsv_dir : str | Path | Iterable[str] | None
        Path to a directory to remove recursively (e.g. a tsv dir created
        earlier), or an iterable of such paths. ``None`` is a no-op.
    dry_run : bool
        If True, print what *would* be removed but do not actually delete.
    verbose : bool
        Print progress / warnings. Dry-run and error messages are always
        printed, regardless of this flag.

    Returns
    -------
    None
        Removal is best-effort: a failure to delete is reported via ``print``
        and never raised to the caller.
    """
    if tsv_dir is None:
        return

    # A single path-like is the common case. Anything ``Path`` rejects with a
    # TypeError (list, tuple, generator, ...) is treated as an iterable of
    # directories, which is what the type annotation promises.
    try:
        targets = [Path(tsv_dir)]
    except TypeError:
        targets = [Path(entry) for entry in tsv_dir]

    for td in targets:
        if not td.exists():
            if verbose:
                print(f"[skip] tsv_dir not found: {td}")
            continue

        if not td.is_dir():
            if verbose:
                print(f"[skip] tsv_dir is not a directory: {td}")
            continue

        if dry_run:
            print(f"[dry-run] would remove directory tree: {td}")
            continue

        try:
            shutil.rmtree(td)
        except OSError as e:
            # Keep going: cleanup of intermediates must not abort the caller.
            print(f"[error] failed to remove tmp dir {td}: {e}")
        else:
            if verbose:
                print(f"Removed directory tree: {td}")
68
+
69
+ def load_adata(config_path):
70
+ """
71
+ High-level function to call for converting raw sequencing data to an adata object.
72
+ Command line accesses this through smftools load <config_path>
73
+ Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
74
+ Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
75
+ Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
76
+
77
+ Parameters:
78
+ config_path (str): A string representing the file path to the experiment configuration csv file.
79
+
80
+ Returns:
81
+ adata, adata_path, se_bam_files, cfg
82
+ """
83
+ from ..readwrite import make_dirs, safe_write_h5ad, add_or_update_column_in_csv
84
+ from ..config import LoadExperimentConfig, ExperimentConfig
85
+ from ..informatics.bam_functions import concatenate_fastqs_to_bam
86
+ from ..informatics.pod5_functions import fast5_to_pod5
87
+ from ..informatics.fasta_functions import subsample_fasta_from_bed
88
+
89
+ import numpy as np
90
+ import pandas as pd
91
+ import anndata as ad
92
+ import scanpy as sc
93
+
94
+ import os
95
+ from importlib import resources
96
+ from pathlib import Path
97
+
98
+ from datetime import datetime
99
+ date_str = datetime.today().strftime("%y%m%d")
100
+
101
+ ################################### 1) General params and input organization ###################################
102
+
103
+ # Load experiment config parameters into global variables
104
+ loader = LoadExperimentConfig(config_path)
105
+ defaults_dir = resources.files("smftools").joinpath("config")
106
+ cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
107
+
108
+ # Make initial output directory
109
+ make_dirs([cfg.output_directory])
110
+
111
+ # Make a csv that contains experiment summary file paths
112
+ add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
113
+ add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
114
+ add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
115
+ add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
116
+
117
+ # Initial h5ad file naming
118
+ h5_dir = cfg.output_directory / 'h5ads'
119
+ raw_adata_path = h5_dir / f'{cfg.experiment_name}.h5ad.gz'
120
+
121
+ # Preprocessed adata path info
122
+ pp_adata_basename = raw_adata_path.name.split(".")[0] + '_preprocessed.h5ad.gz'
123
+ pp_adata_path = raw_adata_path.parent / pp_adata_basename
124
+
125
+ # Preprocessed duplicate removed adata path info
126
+ if cfg.smf_modality == 'direct':
127
+ # For direct SMF, link the duplicate removed version just to the preprocessed version, since there is not a duplicate removal step for direct workflow
128
+ pp_dup_rem_adata_path = pp_adata_path
129
+ else:
130
+ pp_dup_rem_adata_basename = pp_adata_path.name.split(".")[0] + '_duplicates_removed.h5ad.gz'
131
+ pp_dup_rem_adata_path = pp_adata_path.parent / pp_dup_rem_adata_basename
132
+
133
+ # Preprocessed duplicate removed adata with basic analyses appended path info
134
+ spatial_adata_basename = pp_dup_rem_adata_path.name.split(".")[0] + '_spatial.h5ad.gz'
135
+ spatial_adata_path = pp_dup_rem_adata_path.parent / spatial_adata_basename
136
+
137
+ # hmm adata
138
+ hmm_adata_basename = spatial_adata_path.name.split(".")[0] + '_hmm.h5ad.gz'
139
+ hmm_adata_path = spatial_adata_path.parent / hmm_adata_basename
140
+
141
+ add_or_update_column_in_csv(cfg.summary_file, "load_adata", raw_adata_path)
142
+ add_or_update_column_in_csv(cfg.summary_file, "pp_adata", pp_adata_path)
143
+ add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", pp_dup_rem_adata_path)
144
+ add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
145
+ add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
146
+
147
+ if cfg.force_redo_load_adata:
148
+ pass
149
+ elif hmm_adata_path.exists():
150
+ print(f"HMM AnnData already exists: {hmm_adata_path}\n Skipping smftools load")
151
+ return None, hmm_adata_path, cfg
152
+ elif spatial_adata_path.exists():
153
+ print(f"Spatial AnnData already exists: {spatial_adata_path}\n Skipping smftools load")
154
+ return None, spatial_adata_path, cfg
155
+ elif pp_dup_rem_adata_path.exists():
156
+ print(f"Preprocessed deduplicated AnnData already exists: {pp_dup_rem_adata_path}\n Skipping smftools load")
157
+ return None, pp_dup_rem_adata_path, cfg
158
+ elif pp_adata_path.exists():
159
+ print(f"Preprocessed Anndata already exists: {pp_adata_path}\n Skipping smftools load")
160
+ return None, pp_adata_path, cfg
161
+ elif raw_adata_path.exists():
162
+ print(f"Anndata from smftools load already exists: {raw_adata_path}\n Skipping smftools load")
163
+ return None, raw_adata_path, cfg
164
+ else:
165
+ pass
166
+
167
+ # Naming of the demultiplexed output directory
168
+ double_barcoded_path = cfg.split_path / "both_ends_barcoded"
169
+ single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
170
+
171
+ # Direct methylation detection SMF specific parameters
172
+ if cfg.smf_modality == 'direct':
173
+ mod_bed_dir = cfg.output_directory / "mod_beds"
174
+ add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
175
+ mod_tsv_dir = cfg.output_directory / "mod_tsvs"
176
+ add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
177
+ bam_qc_dir = cfg.output_directory / "bam_qc"
178
+ mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
179
+ mods = [mod_map[mod] for mod in cfg.mod_list]
180
+ if not check_executable_exists("dorado"):
181
+ raise RuntimeError(
182
+ "Error: 'dorado' is not installed or not in PATH. "
183
+ "Install from https://github.com/nanoporetech/dorado"
184
+ )
185
+ if not check_executable_exists("modkit"):
186
+ raise RuntimeError(
187
+ "Error: 'modkit' is not installed or not in PATH. "
188
+ "Install from https://github.com/nanoporetech/modkit"
189
+ )
190
+ else:
191
+ pass
192
+
193
+ if not cfg.input_already_demuxed or cfg.aligner == "dorado":
194
+ if not check_executable_exists("dorado"):
195
+ raise RuntimeError(
196
+ "Error: 'dorado' is not installed or not in PATH. "
197
+ "Install from https://github.com/nanoporetech/dorado"
198
+ )
199
+
200
+ if cfg.aligner == "minimap2":
201
+ if not check_executable_exists("minimap2"):
202
+ raise RuntimeError(
203
+ "Error: 'minimap2' is not installed or not in PATH. "
204
+ "Install minimap2"
205
+ )
206
+
207
+ # # Detect the input filetypes
208
+ # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
209
+ if cfg.input_type == "fast5":
210
+ # take the input directory of fast5 files and write out a single pod5 file into the output directory.
211
+ output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
212
+ if output_pod5.exists():
213
+ pass
214
+ else:
215
+ print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
216
+ fast5_to_pod5(cfg.input_data_path, output_pod5)
217
+ # Reassign the pod5_dir variable to point to the new pod5 file.
218
+ cfg.input_data_path = output_pod5
219
+ cfg.input_type == "pod5"
220
+ # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
221
+ elif cfg.input_type == "fastq":
222
+ # Output file for FASTQ concatenation.
223
+ output_bam = cfg.output_directory / 'canonical_basecalls.bam'
224
+ if output_bam.exists():
225
+ pass
226
+ else:
227
+ summary = concatenate_fastqs_to_bam(
228
+ cfg.input_files,
229
+ output_bam,
230
+ barcode_tag='BC',
231
+ gzip_suffixes=('.gz','.gzip'),
232
+ barcode_map=cfg.fastq_barcode_map,
233
+ add_read_group=True,
234
+ rg_sample_field=None,
235
+ progress=False,
236
+ auto_pair=cfg.fastq_auto_pairing)
237
+
238
+ print(f"Found the following barcodes: {summary['barcodes']}")
239
+
240
+ # Set the input data path to the concatenated BAM.
241
+ cfg.input_data_path = output_bam
242
+ cfg.input_type = "bam"
243
+ elif cfg.input_type == "h5ad":
244
+ pass
245
+ else:
246
+ pass
247
+
248
+ add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
249
+
250
+ # Determine if the input data needs to be basecalled
251
+ if cfg.input_type == "pod5":
252
+ print(f'Detected pod5 inputs: {cfg.input_files}')
253
+ basecall = True
254
+ elif cfg.input_type in ["bam"]:
255
+ print(f'Detected bam input: {cfg.input_files}')
256
+ basecall = False
257
+ else:
258
+ print('Error, can not find input bam or pod5')
259
+
260
+ # Generate the base name of the unaligned bam without the .bam suffix
261
+ if basecall:
262
+ model_basename = Path(cfg.model).name
263
+ model_basename = str(model_basename).replace('.', '_')
264
+ if cfg.smf_modality == 'direct':
265
+ mod_string = "_".join(cfg.mod_list)
266
+ bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
267
+ else:
268
+ bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
269
+ else:
270
+ bam_base = cfg.input_data_path.name
271
+ bam = cfg.output_directory / bam_base
272
+
273
+ # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
274
+ unaligned_output = bam.with_suffix(cfg.bam_suffix)
275
+ aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
276
+ aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
277
+ aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
278
+ aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
279
+
280
+ add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
281
+ add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
282
+ add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
283
+ ########################################################################################################################
284
+
285
+ ################################### 2) FASTA Handling ###################################
286
+ from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
287
+
288
+ try:
289
+ cfg.fasta = Path(cfg.fasta)
290
+ except:
291
+ print("Need to provide an input FASTA path to proceed with smftools load")
292
+
293
+ # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
294
+ if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
295
+ fasta_basename = cfg.fasta.parent / cfg.fasta.stem
296
+ bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
297
+ output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
298
+ subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
299
+ fasta = cfg.output_directory / output_FASTA
300
+ else:
301
+ fasta = cfg.fasta
302
+
303
+ # For conversion style SMF, make a converted reference FASTA
304
+ if cfg.smf_modality == 'conversion':
305
+ fasta_basename = fasta.parent / fasta.stem
306
+ converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
307
+ converted_FASTA = cfg.output_directory / converted_FASTA_basename
308
+ if 'converted.fa' in fasta.name:
309
+ print(f'{fasta} is already converted. Using existing converted FASTA.')
310
+ converted_FASTA = fasta
311
+ elif converted_FASTA.exists():
312
+ print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
313
+ else:
314
+ generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
315
+ fasta = converted_FASTA
316
+
317
+ add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
318
+
319
+ # Make a FAI and .chrom.names file for the fasta
320
+ get_chromosome_lengths(fasta)
321
+ ########################################################################################################################
322
+
323
+ ################################### 3) Basecalling ###################################
324
+ from ..informatics.basecalling import modcall, canoncall
325
+ # 1) Basecall using dorado
326
+ if basecall and cfg.sequencer == 'ont':
327
+ try:
328
+ cfg.model_dir = Path(cfg.model_dir)
329
+ except:
330
+ print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
331
+ if aligned_sorted_output.exists():
332
+ print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
333
+ elif unaligned_output.exists():
334
+ print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
335
+ elif cfg.smf_modality != 'direct':
336
+ canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
337
+ else:
338
+ modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
339
+ elif basecall:
340
+ print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
341
+ else:
342
+ pass
343
+ ########################################################################################################################
344
+
345
+ ################################### 4) Alignment and sorting #############################################
346
+ from ..informatics.bam_functions import align_and_sort_BAM
347
+ from ..informatics.bed_functions import aligned_BAM_to_bed
348
+ # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
349
+ if aligned_sorted_output.exists():
350
+ print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
351
+ else:
352
+ align_and_sort_BAM(fasta, unaligned_output, cfg.bam_suffix, cfg.output_directory, cfg.make_bigwigs, cfg.threads, cfg.aligner, cfg.aligner_args)
353
+ # Deleted the unsorted aligned output
354
+ aligned_output.unlink()
355
+
356
+ if cfg.make_beds:
357
+ # Make beds and provide basic histograms
358
+ bed_dir = cfg.output_directory / 'beds'
359
+ if bed_dir.is_dir():
360
+ print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
361
+ else:
362
+ aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
363
+ ########################################################################################################################
364
+
365
+ ################################### 5) Demultiplexing ######################################################################
366
+ from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
367
+ # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
368
+ if cfg.input_already_demuxed:
369
+ if cfg.split_path.is_dir():
370
+ print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
371
+
372
+ all_bam_files = sorted(
373
+ p for p in cfg.split_path.iterdir()
374
+ if p.is_file()
375
+ and p.suffix == cfg.bam_suffix
376
+ )
377
+ unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
378
+ bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
379
+
380
+ else:
381
+ make_dirs([cfg.split_path])
382
+ all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
383
+ cfg.split_path,
384
+ cfg.bam_suffix)
385
+
386
+ unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
387
+ bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
388
+
389
+ se_bam_files = bam_files
390
+ bam_dir = cfg.split_path
391
+
392
+ else:
393
+ if single_barcoded_path.is_dir():
394
+ print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
395
+
396
+ all_se_bam_files = sorted(
397
+ p for p in single_barcoded_path.iterdir()
398
+ if p.is_file()
399
+ and p.suffix == cfg.bam_suffix
400
+ )
401
+ unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
402
+ se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
403
+ else:
404
+ make_dirs([cfg.split_path, single_barcoded_path])
405
+ all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
406
+ single_barcoded_path,
407
+ cfg.bam_suffix,
408
+ cfg.barcode_kit,
409
+ False,
410
+ cfg.trim,
411
+ cfg.threads)
412
+
413
+ unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
414
+ se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
415
+
416
+ if double_barcoded_path.is_dir():
417
+ print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
418
+
419
+ all_de_bam_files = sorted(
420
+ p for p in double_barcoded_path.iterdir()
421
+ if p.is_file()
422
+ and p.suffix == cfg.bam_suffix
423
+ )
424
+ unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
425
+ de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
426
+ else:
427
+ make_dirs([cfg.split_path, double_barcoded_path])
428
+ all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
429
+ double_barcoded_path,
430
+ cfg.bam_suffix,
431
+ cfg.barcode_kit,
432
+ True,
433
+ cfg.trim,
434
+ cfg.threads)
435
+
436
+ unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
437
+ de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
438
+
439
+ bam_files = se_bam_files + de_bam_files
440
+ unclassified_bams = unclassified_se_bams + unclassified_de_bams
441
+ bam_dir = single_barcoded_path
442
+
443
+ add_or_update_column_in_csv(cfg.summary_file, "demuxed_bams", [se_bam_files])
444
+
445
+ if cfg.make_beds:
446
+ # Make beds and provide basic histograms
447
+ bed_dir = cfg.split_path / 'beds'
448
+ if bed_dir.is_dir():
449
+ print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
450
+ else:
451
+ for bam in bam_files:
452
+ aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
453
+ ########################################################################################################################
454
+
455
+ ################################### 6) SAMTools based BAM QC ######################################################################
456
+ from ..informatics.bam_functions import bam_qc
457
+ # 6) Samtools QC metrics on split BAM files
458
+ bam_qc_dir = cfg.split_path / "bam_qc"
459
+ if bam_qc_dir.is_dir():
460
+ print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
461
+ else:
462
+ make_dirs([bam_qc_dir])
463
+ bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
464
+ ########################################################################################################################
465
+
466
+ ################################### 7) AnnData loading ######################################################################
467
+ if cfg.smf_modality != 'direct':
468
+ from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
469
+ # 7) Take the converted BAM and load it into an adata object.
470
+ if cfg.smf_modality == 'deaminase':
471
+ deaminase_footprinting = True
472
+ else:
473
+ deaminase_footprinting = False
474
+ raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
475
+ bam_dir,
476
+ cfg.output_directory,
477
+ cfg.input_already_demuxed,
478
+ cfg.mapping_threshold,
479
+ cfg.experiment_name,
480
+ cfg.conversion_types,
481
+ cfg.bam_suffix,
482
+ cfg.device,
483
+ cfg.threads,
484
+ deaminase_footprinting,
485
+ delete_intermediates=cfg.delete_intermediate_hdfs,
486
+ double_barcoded_path=double_barcoded_path)
487
+ else:
488
+ if mod_bed_dir.is_dir():
489
+ print(f'{mod_bed_dir} already exists, skipping making modbeds')
490
+ else:
491
+ from ..informatics.modkit_functions import modQC, make_modbed
492
+ make_dirs([mod_bed_dir])
493
+
494
+ modQC(aligned_sorted_output,
495
+ cfg.thresholds) # get QC metrics for mod calls
496
+
497
+ make_modbed(aligned_sorted_output,
498
+ cfg.thresholds,
499
+ mod_bed_dir) # Generate bed files of position methylation summaries for every sample
500
+
501
+ from ..informatics.modkit_functions import extract_mods
502
+ make_dirs([mod_tsv_dir])
503
+
504
+ extract_mods(cfg.thresholds,
505
+ mod_tsv_dir,
506
+ bam_dir,
507
+ cfg.bam_suffix,
508
+ skip_unclassified=cfg.skip_unclassified,
509
+ modkit_summary=False,
510
+ threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
511
+
512
+ from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
513
+ # 7) Load the modification data from TSVs into an adata object
514
+ raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
515
+ bam_dir,
516
+ cfg.output_directory,
517
+ cfg.input_already_demuxed,
518
+ cfg.mapping_threshold,
519
+ cfg.experiment_name,
520
+ mods,
521
+ cfg.batch_size,
522
+ mod_tsv_dir,
523
+ cfg.delete_batch_hdfs,
524
+ cfg.threads,
525
+ double_barcoded_path)
526
+ if cfg.delete_intermediate_tsvs:
527
+ delete_tsvs(mod_tsv_dir)
528
+
529
+ raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
530
+ raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
531
+
532
+ ########################################################################################################################
533
+
534
+ ############################################### Add basic read length, read quality, mapping quality stats ###############################################
535
+ from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
536
+ from ..informatics.bam_functions import extract_read_features_from_bam
537
+ add_read_length_and_mapping_qc(raw_adata, se_bam_files,
538
+ extract_read_features_from_bam_callable=extract_read_features_from_bam,
539
+ bypass=cfg.bypass_add_read_length_and_mapping_qc,
540
+ force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
541
+
542
+ raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
543
+ ########################################################################################################################
544
+
545
+ ############################################### Save final adata ###############################################
546
+ print(f"Saving AnnData to {raw_adata_path}")
547
+ safe_write_h5ad(raw_adata, raw_adata_path, compression='gzip', backup=True)
548
+ ########################################################################################################################
549
+
550
+ ############################################### MultiQC HTML Report ###############################################
551
+ from ..informatics.run_multiqc import run_multiqc
552
+ # multiqc ###
553
+ mqc_dir = cfg.split_path / "multiqc"
554
+ if mqc_dir.is_dir():
555
+ print(f'{mqc_dir} already exists, skipping multiqc')
556
+ else:
557
+ run_multiqc(cfg.split_path, mqc_dir)
558
+ ########################################################################################################################
559
+
560
+ ############################################### delete intermediate BAM files ###############################################
561
+ if cfg.delete_intermediate_bams:
562
+ # delete aligned and sorted bam
563
+ aligned_sorted_output.unlink()
564
+ bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
565
+ bai.unlink()
566
+ # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
567
+ for bam in bam_files:
568
+ bai = bam.parent / (bam.name + '.bai')
569
+ bam.unlink()
570
+ bai.unlink()
571
+ for bam in unclassified_bams:
572
+ bai = bam.parent / (bam.name + '.bai')
573
+ bam.unlink()
574
+ bai.unlink()
575
+ ########################################################################################################################
576
+
577
+ return raw_adata, raw_adata_path, cfg