smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
@@ -1,12 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  import shutil
4
5
  from pathlib import Path
5
6
  from typing import Iterable, Union
6
7
 
7
8
  import numpy as np
8
9
 
9
- from smftools.logging_utils import get_logger
10
+ from smftools.constants import LOAD_DIR, LOGGING_DIR
11
+ from smftools.logging_utils import get_logger, setup_logging
10
12
 
11
13
  from .helpers import AdataPaths
12
14
 
@@ -103,63 +105,29 @@ def load_adata(config_path: str):
103
105
  from datetime import datetime
104
106
  from importlib import resources
105
107
 
106
- from ..config import ExperimentConfig, LoadExperimentConfig
107
- from ..readwrite import add_or_update_column_in_csv, make_dirs
108
- from .helpers import get_adata_paths
109
-
110
- date_str = datetime.today().strftime("%y%m%d")
108
+ from ..readwrite import make_dirs
109
+ from .helpers import get_adata_paths, load_experiment_config
111
110
 
112
111
  # -----------------------------
113
112
  # 1) Load config into cfg
114
113
  # -----------------------------
115
- loader = LoadExperimentConfig(config_path)
116
- defaults_dir = resources.files("smftools").joinpath("config")
117
- cfg, report = ExperimentConfig.from_var_dict(
118
- loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
119
- )
114
+ cfg = load_experiment_config(config_path)
120
115
 
121
116
  # Ensure base output dir
122
- make_dirs([cfg.output_directory])
117
+ output_directory = Path(cfg.output_directory)
118
+ make_dirs([output_directory])
123
119
 
124
120
  # -----------------------------
125
121
  # 2) Compute and register paths
126
122
  # -----------------------------
127
123
  paths = get_adata_paths(cfg)
128
124
 
129
- # experiment-level metadata in summary CSV
130
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
131
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
132
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
133
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
134
-
135
- # AnnData stage paths
136
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
137
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
138
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
139
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
140
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
141
-
142
125
  # -----------------------------
143
126
  # 3) Stage skipping logic
144
127
  # -----------------------------
145
128
  if not getattr(cfg, "force_redo_load_adata", False):
146
- if paths.hmm.exists():
147
- logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
148
- return None, paths.hmm, cfg
149
- if paths.spatial.exists():
150
- logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
151
- return None, paths.spatial, cfg
152
- if paths.pp_dedup.exists():
153
- logger.debug(
154
- f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
155
- f"Skipping smftools load"
156
- )
157
- return None, paths.pp_dedup, cfg
158
- if paths.pp.exists():
159
- logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
160
- return None, paths.pp, cfg
161
129
  if paths.raw.exists():
162
- logger.debug(
130
+ logger.info(
163
131
  f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
164
132
  )
165
133
  return None, paths.raw, cfg
@@ -199,6 +167,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
199
167
  cfg : ExperimentConfig
200
168
  (Same object, possibly with some fields updated, e.g. fasta path.)
201
169
  """
170
+ from datetime import datetime
202
171
 
203
172
  from ..informatics.bam_functions import (
204
173
  align_and_sort_BAM,
@@ -206,6 +175,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
206
175
  concatenate_fastqs_to_bam,
207
176
  demux_and_index_BAM,
208
177
  extract_read_features_from_bam,
178
+ extract_read_tags_from_bam,
209
179
  split_and_index_BAM,
210
180
  )
211
181
  from ..informatics.basecalling import canoncall, modcall
@@ -216,7 +186,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
216
186
  get_chromosome_lengths,
217
187
  subsample_fasta_from_bed,
218
188
  )
219
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
189
+ from ..informatics.h5ad_functions import (
190
+ add_read_length_and_mapping_qc,
191
+ add_read_tag_annotations,
192
+ add_secondary_supplementary_alignment_flags,
193
+ )
220
194
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
221
195
  from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
222
196
  from ..informatics.pod5_functions import fast5_to_pod5
@@ -226,8 +200,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
226
200
  from .helpers import write_gz_h5ad
227
201
 
228
202
  ################################### 1) General params and input organization ###################################
203
+ date_str = datetime.today().strftime("%y%m%d")
204
+ now = datetime.now()
205
+ time_str = now.strftime("%H%M%S")
206
+
207
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
208
+
229
209
  output_directory = Path(cfg.output_directory)
230
- make_dirs([output_directory])
210
+ load_directory = output_directory / LOAD_DIR
211
+ logging_directory = load_directory / LOGGING_DIR
212
+
213
+ make_dirs([output_directory, load_directory])
214
+
215
+ if cfg.emit_log_file:
216
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
217
+ make_dirs([logging_directory])
218
+ else:
219
+ log_file = None
220
+
221
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
231
222
 
232
223
  raw_adata_path = paths.raw
233
224
  pp_adata_path = paths.pp
@@ -241,11 +232,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
241
232
 
242
233
  # Direct methylation detection SMF specific parameters
243
234
  if cfg.smf_modality == "direct":
244
- mod_bed_dir = cfg.output_directory / "mod_beds"
245
- add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
246
- mod_tsv_dir = cfg.output_directory / "mod_tsvs"
247
- add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
248
- bam_qc_dir = cfg.output_directory / "bam_qc"
235
+ mod_bed_dir = load_directory / "mod_beds"
236
+ mod_tsv_dir = load_directory / "mod_tsvs"
237
+ bam_qc_dir = load_directory / "bam_qc"
249
238
  mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
250
239
 
251
240
  if not check_executable_exists("dorado"):
@@ -281,7 +270,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
281
270
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
282
271
  if cfg.input_type == "fast5":
283
272
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
284
- output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
273
+ output_pod5 = load_directory / "FAST5s_to_POD5.pod5"
285
274
  if output_pod5.exists():
286
275
  pass
287
276
  else:
@@ -295,7 +284,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
295
284
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
296
285
  elif cfg.input_type == "fastq":
297
286
  # Output file for FASTQ concatenation.
298
- output_bam = cfg.output_directory / "canonical_basecalls.bam"
287
+ output_bam = load_directory / "canonical_basecalls.bam"
299
288
  if output_bam.exists():
300
289
  logger.debug("Output BAM already exists")
301
290
  else:
@@ -323,8 +312,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
323
312
  else:
324
313
  pass
325
314
 
326
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
327
-
328
315
  # Determine if the input data needs to be basecalled
329
316
  if cfg.input_type == "pod5":
330
317
  logger.info(f"Detected pod5 inputs: {cfg.input_files}")
@@ -341,25 +328,24 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
341
328
  model_basename = str(model_basename).replace(".", "_")
342
329
  if cfg.smf_modality == "direct":
343
330
  mod_string = "_".join(cfg.mod_list)
344
- bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
331
+ bam = load_directory / f"{model_basename}_{mod_string}_calls"
345
332
  else:
346
- bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
333
+ bam = load_directory / f"{model_basename}_canonical_basecalls"
347
334
  else:
348
- bam_base = cfg.input_data_path.name
349
- bam = cfg.output_directory / bam_base
335
+ bam_base = cfg.input_data_path.stem
336
+ bam = cfg.input_data_path.parent / bam_base
350
337
 
351
338
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
352
339
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
340
+
353
341
  aligned_BAM = (
354
- cfg.output_directory / (bam.stem + "_aligned")
342
+ load_directory / (bam.stem + "_aligned")
355
343
  ) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
344
+
356
345
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
357
346
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
358
347
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
359
348
 
360
- add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
361
- add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
362
- add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
363
349
  ########################################################################################################################
364
350
 
365
351
  ################################### 2) FASTA Handling ###################################
@@ -373,11 +359,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
373
359
  if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
374
360
  fasta_stem = cfg.fasta.stem
375
361
  bed_stem = Path(cfg.fasta_regions_of_interest).stem
376
- output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
362
+ output_FASTA = load_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
377
363
 
378
364
  logger.info("Subsampling FASTA records using the provided BED file")
379
365
  subsample_fasta_from_bed(
380
- cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
366
+ cfg.fasta, cfg.fasta_regions_of_interest, load_directory, output_FASTA
381
367
  )
382
368
  fasta = output_FASTA
383
369
  else:
@@ -388,7 +374,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
388
374
  if cfg.smf_modality == "conversion":
389
375
  fasta_stem = fasta.stem
390
376
  converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
391
- converted_FASTA = cfg.output_directory / converted_FASTA_basename
377
+ converted_FASTA = load_directory / converted_FASTA_basename
392
378
 
393
379
  if "converted.fa" in fasta.name:
394
380
  logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
@@ -400,8 +386,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
400
386
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
401
387
  fasta = converted_FASTA
402
388
 
403
- add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
404
-
405
389
  # Make a FAI and .chrom.names file for the fasta
406
390
  get_chromosome_lengths(fasta)
407
391
  ########################################################################################################################
@@ -462,13 +446,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
462
446
  logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
463
447
  else:
464
448
  logger.info(f"Aligning and sorting reads")
465
- align_and_sort_BAM(fasta, unaligned_output, cfg)
449
+ align_and_sort_BAM(fasta, unaligned_output, aligned_output, cfg)
466
450
  # Deleted the unsorted aligned output
467
451
  aligned_output.unlink()
468
452
 
469
453
  if cfg.make_beds:
470
454
  # Make beds and provide basic histograms
471
- bed_dir = cfg.output_directory / "beds"
455
+ bed_dir = load_directory / "beds"
472
456
  if bed_dir.is_dir():
473
457
  logger.debug(
474
458
  f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
@@ -477,7 +461,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
477
461
  logger.info("Making bed files from the aligned and sorted BAM file")
478
462
  aligned_BAM_to_bed(
479
463
  aligned_sorted_output,
480
- cfg.output_directory,
464
+ load_directory,
481
465
  fasta,
482
466
  cfg.make_bigwigs,
483
467
  cfg.threads,
@@ -515,6 +499,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
515
499
 
516
500
  se_bam_files = bam_files
517
501
  bam_dir = cfg.split_path
502
+ double_barcoded_path = None
518
503
 
519
504
  else:
520
505
  if single_barcoded_path.is_dir():
@@ -608,7 +593,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
608
593
  ################################### 6) SAMTools based BAM QC ######################################################################
609
594
 
610
595
  # 5) Samtools QC metrics on split BAM files
611
- bam_qc_dir = cfg.split_path / "bam_qc"
596
+ bam_qc_dir = load_directory / "bam_qc"
612
597
  if bam_qc_dir.is_dir():
613
598
  logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
614
599
  else:
@@ -637,7 +622,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
637
622
  raw_adata, raw_adata_path = converted_BAM_to_adata(
638
623
  fasta,
639
624
  bam_dir,
640
- cfg.output_directory,
625
+ load_directory,
641
626
  cfg.input_already_demuxed,
642
627
  cfg.mapping_threshold,
643
628
  cfg.experiment_name,
@@ -694,7 +679,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
694
679
  raw_adata, raw_adata_path = modkit_extract_to_adata(
695
680
  fasta,
696
681
  bam_dir,
697
- cfg.output_directory,
682
+ load_directory,
698
683
  cfg.input_already_demuxed,
699
684
  cfg.mapping_threshold,
700
685
  cfg.experiment_name,
@@ -728,6 +713,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
728
713
  samtools_backend=cfg.samtools_backend,
729
714
  )
730
715
 
716
+ logger.info("Adding BAM tags and BAM flags to adata.obs")
717
+ add_read_tag_annotations(
718
+ raw_adata,
719
+ se_bam_files,
720
+ tag_names=getattr(cfg, "bam_tag_names", ["NM", "MD", "MM", "ML"]),
721
+ include_flags=True,
722
+ include_cigar=True,
723
+ extract_read_tags_from_bam_callable=extract_read_tags_from_bam,
724
+ samtools_backend=cfg.samtools_backend,
725
+ )
726
+
727
+ if getattr(cfg, "annotate_secondary_supplementary", False):
728
+ logger.info("Annotating secondary/supplementary alignments from aligned BAM")
729
+ add_secondary_supplementary_alignment_flags(
730
+ raw_adata,
731
+ aligned_sorted_output,
732
+ samtools_backend=cfg.samtools_backend,
733
+ )
734
+
731
735
  raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
732
736
  ########################################################################################################################
733
737
 
@@ -740,7 +744,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
740
744
  raw_adata,
741
745
  cfg.input_data_path,
742
746
  n_jobs=cfg.threads,
743
- csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
747
+ csv_path=load_directory / "read_to_pod5_origin_mapping.csv",
744
748
  )
745
749
  ########################################################################################################################
746
750
 
@@ -759,12 +763,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
759
763
  ############################################### MultiQC HTML Report ###############################################
760
764
 
761
765
  # multiqc ###
762
- mqc_dir = cfg.split_path / "multiqc"
766
+ mqc_dir = load_directory / "multiqc"
763
767
  if mqc_dir.is_dir():
764
768
  logger.info(f"{mqc_dir} already exists, skipping multiqc")
765
769
  else:
766
770
  logger.info("Running multiqc")
767
- run_multiqc(cfg.split_path, mqc_dir)
771
+ run_multiqc(bam_qc_dir, mqc_dir)
768
772
  ########################################################################################################################
769
773
 
770
774
  ############################################### delete intermediate BAM files ###############################################