smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py CHANGED
@@ -8,6 +8,7 @@ from . import preprocessing as pp
8
8
  from . import tools as tl
9
9
  from . import plotting as pl
10
10
  from . import readwrite, datasets
11
+ from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
11
12
 
12
13
 
13
14
  from importlib.metadata import version
@@ -16,10 +17,13 @@ package_name = "smftools"
16
17
  __version__ = version(package_name)
17
18
 
18
19
  __all__ = [
20
+ "adata_to_df",
19
21
  "inform",
20
22
  "pp",
21
23
  "tl",
22
24
  "pl",
23
25
  "readwrite",
24
- "datasets"
26
+ "datasets",
27
+ "safe_write_h5ad",
28
+ "merge_barcoded_anndatas"
25
29
  ]
smftools/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.1.7"
@@ -1,4 +1,5 @@
1
1
  from . import helpers
2
+ from .basecall_pod5s import basecall_pod5s
2
3
  from .load_adata import load_adata
3
4
  from .subsample_fasta_from_bed import subsample_fasta_from_bed
4
5
  from .subsample_pod5 import subsample_pod5
@@ -6,6 +7,7 @@ from .fast5_to_pod5 import fast5_to_pod5
6
7
 
7
8
 
8
9
  __all__ = [
10
+ "basecall_pod5s",
9
11
  "load_adata",
10
12
  "subsample_fasta_from_bed",
11
13
  "subsample_pod5",
@@ -0,0 +1,29 @@
1
+ import pysam
2
+ import sys
3
+
4
def extract_reads(bam_file_path, num_reads=10):
    """
    Print the query sequences of the first reads in a BAM file.

    Parameters:
        bam_file_path (str): File path to the BAM file to read.
        num_reads (int): Maximum number of reads to print. Zero or a negative value prints nothing.

    Returns:
        None
    """
    from itertools import islice

    # The context manager closes the BAM handle even when iteration or printing raises.
    with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
        # islice stops after num_reads records and yields nothing for num_reads <= 0;
        # the previous manual counter printed one read before checking the limit.
        first_reads = islice(bam_file, max(num_reads, 0))
        for read_number, read in enumerate(first_reads, start=1):
            print(f"Read {read_number}: {read.query_sequence}")
18
+
19
if __name__ == "__main__":
    # Everything after the script name; the first entry is the BAM file path.
    cli_args = sys.argv[1:]

    # Without a BAM path there is nothing to read, so show the usage and stop.
    if not cli_args:
        print("Usage: python extract_reads.py <path_to_bam_file>")
        sys.exit(1)

    # Print the first reads using the function's default read count.
    extract_reads(cli_args[0])
@@ -0,0 +1,80 @@
1
+ # basecall_pod5s
2
+
3
def basecall_pod5s(config_path):
    """
    Basecall from pod5s given a config file.

    Loads the experiment configuration, converts FAST5 input into a single POD5 file when the
    input holds FAST5 but no POD5 data, and then runs modified basecalling when a mod_list is
    configured or canonical basecalling otherwise.

    Parameters:
        config_path (str): File path to the basecall configuration file

    Returns:
        None

    Raises:
        ValueError: If input_data_path, output_directory, or model is missing from the configuration.
        FileNotFoundError: If input_data_path is neither an existing file nor an existing directory.
    """
    # Lazy importing of packages
    from .helpers import LoadExperimentConfig, make_dirs, canoncall, modcall
    from .fast5_to_pod5 import fast5_to_pod5
    import os
    from pathlib import Path

    # File name suffixes used to recognise the raw signal formats.
    pod5_suffixes = ('.pod5', '.p5')
    fast5_suffixes = ('.fast5', '.f5')

    # Default params
    bam_suffix = '.bam' # If different, change from here.

    # Load experiment config parameters
    experiment_config = LoadExperimentConfig(config_path)
    var_dict = experiment_config.var_dict

    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
    default_value = None

    # General config variable init
    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a single POD5/FAST5 file. Necessary.
    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
    # NOTE(review): assumes the model directory is stored under the 'model_dir' key — confirm against the config loader.
    model_dir = var_dict.get('model_dir', default_value) # needed for dorado basecaller
    model = var_dict.get('model', default_value) # needed for dorado basecaller
    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
    device = var_dict.get('device', 'auto')

    # Modified basecalling specific variable init
    mod_list = var_dict.get('mod_list', default_value)

    # Fail early with a clear message instead of a TypeError deep inside path handling.
    required_values = {
        'input_data_path': input_data_path,
        'output_directory': output_directory,
        'model': model,
    }
    missing = [name for name, value in required_values.items() if value is None]
    if missing:
        raise ValueError(f"Missing required config value(s): {', '.join(missing)}")

    # Resolve both paths before changing directory: after os.chdir a relative path
    # would otherwise be interpreted relative to the output directory.
    input_data_path = os.path.abspath(input_data_path)
    output_directory = os.path.abspath(output_directory)

    # Make initial output directory
    make_dirs([output_directory])
    os.chdir(output_directory)

    # Get the input filetype
    input_path = Path(input_data_path)
    if input_path.is_file():
        # Path.suffix takes the final extension, so names containing extra dots are handled.
        input_suffix = input_path.suffix.lower()
        input_is_pod5 = input_suffix in pod5_suffixes
        input_is_fast5 = input_suffix in fast5_suffixes
    elif input_path.is_dir():
        # Classify the directory by the suffixes of the file names it contains.
        input_files = [file_name.lower() for file_name in os.listdir(input_data_path)]
        input_is_pod5 = any(file_name.endswith(pod5_suffixes) for file_name in input_files)
        input_is_fast5 = any(file_name.endswith(fast5_suffixes) for file_name in input_files)
    else:
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
    if input_is_fast5 and not input_is_pod5:
        # take the input fast5 data and write out a single pod5 file into the output directory.
        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
        fast5_to_pod5(input_data_path, output_pod5)
        # Reassign the input path to point to the new pod5 file.
        input_data_path = output_pod5

    model_basename = os.path.basename(model)
    model_basename = model_basename.replace('.', '_')

    # model_dir is passed ahead of model, matching how conversion_smf and direct_smf call these helpers.
    # NOTE(review): confirm against the canoncall/modcall signatures in .helpers.
    if mod_list:
        mod_string = "_".join(mod_list)
        bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
        modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        bam=f"{output_directory}/{model_basename}_canonical_basecalls"
        canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
@@ -1,6 +1,6 @@
1
1
  ## conversion_smf
2
2
 
3
- def conversion_smf(fasta, output_directory, conversion_types, strands, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall):
3
+ def conversion_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
4
4
  """
5
5
  Processes sequencing data from a conversion SMF experiment to an adata object.
6
6
 
@@ -9,7 +9,8 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
9
9
  output_directory (str): A file path to the directory to output all the analyses.
10
10
  conversion_types (list): A list of strings of the conversion types to use in the analysis.
11
11
  strands (list): A list of conversion strands to use in the experiment.
12
- model (str): a string representing the file path to the dorado basecalling model.
12
+ model_dir (str): a string representing the file path to the dorado basecalling model directory.
13
+ model (str): a string representing the dorado basecalling model.
13
14
  input_data_path (str): a string representing the file path to the experiment directory/file containing sequencing data
14
15
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
16
  barcode_kit (str): A string representing the barcoding kit used in the experiment.
@@ -17,12 +18,21 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
17
18
  experiment_name (str): A string to provide an experiment name to the output adata file.
18
19
  bam_suffix (str): A suffix to add to the bam file.
19
20
  basecall (bool): Whether to go through basecalling or not.
21
+ barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
22
+ trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
23
+ device (str): Device to use for basecalling. auto, metal, cpu, cuda
24
+ make_bigwigs (bool): Whether to make bigwigs
25
+ threads (int): cpu threads available for processing.
26
+ input_already_demuxed (bool): Whether the input files were already demultiplexed
20
27
 
21
28
  Returns:
22
- None
29
+ final_adata_path (str): Path to the final adata object
30
+ sorted_output (str): Path to the aligned, sorted BAM
23
31
  """
24
- from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, get_chromosome_lengths, split_and_index_BAM, make_dirs
32
+ from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
25
33
  import os
34
+ import glob
35
+
26
36
  if basecall:
27
37
  model_basename = os.path.basename(model)
28
38
  model_basename = model_basename.replace('.', '_')
@@ -56,7 +66,7 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
56
66
  if os.path.exists(canoncall_output):
57
67
  print(canoncall_output + ' already exists. Using existing basecalled BAM.')
58
68
  else:
59
- canoncall(model, input_data_path, barcode_kit, bam, bam_suffix)
69
+ canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
60
70
  else:
61
71
  canoncall_output = input_data_path
62
72
 
@@ -66,14 +76,57 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
66
76
  if os.path.exists(aligned_output) and os.path.exists(sorted_output):
67
77
  print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
68
78
  else:
69
- align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory)
79
+ align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads)
80
+
81
+ # Make beds and provide basic histograms
82
+ bed_dir = os.path.join(output_directory, 'beds')
83
+ if os.path.isdir(bed_dir):
84
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
85
+ else:
86
+ aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
70
87
 
71
88
  ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
89
+ if barcode_both_ends:
90
+ split_dir = split_dir + '_both_ends_barcoded'
91
+ else:
92
+ split_dir = split_dir + '_at_least_one_end_barcoded'
93
+
72
94
  if os.path.isdir(split_dir):
73
- print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
95
+ print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
96
+ bam_pattern = '*' + bam_suffix
97
+ bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
98
+ bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
99
+ bam_files.sort()
74
100
  else:
75
101
  make_dirs([split_dir])
76
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA)
102
+ if input_already_demuxed:
103
+ bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory) # custom for non-nanopore
104
+ else:
105
+ bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
106
+
107
+ # Make beds and provide basic histograms
108
+ bed_dir = os.path.join(split_dir, 'beds')
109
+ if os.path.isdir(bed_dir):
110
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
111
+ else:
112
+ for bam in bam_files:
113
+ aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
114
+
115
+ # 5) Samtools QC metrics on split BAM files
116
+ bam_qc_dir = f"{split_dir}/bam_qc"
117
+ if os.path.isdir(bam_qc_dir):
118
+ print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
119
+ else:
120
+ make_dirs([bam_qc_dir])
121
+ bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
122
+
123
+ # multiqc ###
124
+ if os.path.isdir(f"{split_dir}/multiqc"):
125
+ print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
126
+ else:
127
+ run_multiqc(split_dir, f"{split_dir}/multiqc")
128
+
129
+ # 6) Take the converted BAM and load it into an adata object.
130
+ final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device)
77
131
 
78
- # 5) Take the converted BAM and load it into an adata object.
79
- converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
132
+ return final_adata, final_adata_path, sorted_output, bam_files
@@ -1,6 +1,6 @@
1
1
  ## direct_smf
2
2
 
3
- def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall):
3
+ def direct_smf(fasta, output_directory, mod_list, model_dir, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads):
4
4
  """
5
5
  Processes sequencing data from a direct methylation detection Nanopore SMF experiment to an AnnData object.
6
6
 
@@ -8,7 +8,8 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
8
8
  fasta (str): File path to the reference genome to align to.
9
9
  output_directory (str): A file path to the directory to output all the analyses.
10
10
  mod_list (list): A list of strings of the modification types to use in the analysis.
11
- model (str): a string representing the file path to the dorado basecalling model.
11
+ model_dir (str): a string representing the file path to the dorado basecalling model directory.
12
+ model (str): a string representing the dorado basecalling model.
12
13
  thresholds (list): A list of floats to pass for call thresholds.
13
14
  input_data_path (str): a string representing the file path to the experiment directory containing the input sequencing files.
14
15
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
@@ -18,11 +19,19 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
18
19
  bam_suffix (str): A suffix to add to the bam file.
19
20
  batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
20
21
  basecall (bool): Whether to basecall
22
+ barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
23
+ trim (bool): Whether to trim barcodes, adapters, and primers from read ends
24
+ device (str): Device to use for basecalling. auto, metal, cpu, cuda
25
+ make_bigwigs (bool): Whether to make bigwigs
26
+ skip_unclassified (bool): Whether to skip unclassified reads when extracting mods and loading anndata
27
+ delete_batch_hdfs (bool): Whether to delete intermediate hdf5 files.
28
+ threads (int): cpu threads available for processing.
21
29
 
22
30
  Returns:
23
- None
31
+ final_adata_path (str): Path to the final adata object
32
+ sorted_output (str): Path to the aligned, sorted BAM
24
33
  """
25
- from .helpers import align_and_sort_BAM, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
34
+ from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc
26
35
  import os
27
36
 
28
37
  if basecall:
@@ -35,8 +44,15 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
35
44
  bam=os.path.join(output_directory, bam_base)
36
45
  aligned_BAM=f"{bam}_aligned"
37
46
  aligned_sorted_BAM=f"{aligned_BAM}_sorted"
38
- mod_bed_dir=f"{output_directory}/split_mod_beds"
39
- mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
47
+
48
+ if barcode_both_ends:
49
+ split_dir = split_dir + '_both_ends_barcoded'
50
+ else:
51
+ split_dir = split_dir + '_at_least_one_end_barcoded'
52
+
53
+ mod_bed_dir=f"{split_dir}/split_mod_beds"
54
+ mod_tsv_dir=f"{split_dir}/split_mod_tsvs"
55
+ bam_qc_dir = f"{split_dir}/bam_qc"
40
56
 
41
57
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
42
58
  mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
@@ -53,7 +69,7 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
53
69
  if os.path.exists(modcall_output):
54
70
  print(modcall_output + ' already exists. Using existing basecalled BAM.')
55
71
  else:
56
- modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix)
72
+ modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
57
73
  else:
58
74
  modcall_output = input_data_path
59
75
 
@@ -63,27 +79,59 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
63
79
  if os.path.exists(aligned_output) and os.path.exists(sorted_output):
64
80
  print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
65
81
  else:
66
- align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory)
82
+ align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory, make_bigwigs, threads)
83
+
84
+ # Make beds and provide basic histograms
85
+ bed_dir = os.path.join(output_directory, 'beds')
86
+ if os.path.isdir(bed_dir):
87
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
88
+ else:
89
+ aligned_BAM_to_bed(aligned_output, output_directory, fasta, make_bigwigs, threads)
67
90
 
68
91
  # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
69
92
  if os.path.isdir(split_dir):
70
- print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
93
+ print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
94
+ bam_files = os.listdir(split_dir)
95
+ bam_files = [os.path.join(split_dir, file) for file in bam_files if '.bam' in file and '.bai' not in file and 'unclassified' not in file]
96
+ bam_files.sort()
71
97
  else:
72
98
  make_dirs([split_dir])
73
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, fasta)
99
+ bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
100
+ # split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA) # deprecated, just use dorado demux
101
+
102
+ # Make beds and provide basic histograms
103
+ bed_dir = os.path.join(split_dir, 'beds')
104
+ if os.path.isdir(bed_dir):
105
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
106
+ else:
107
+ for bam in bam_files:
108
+ aligned_BAM_to_bed(bam, split_dir, fasta, make_bigwigs, threads)
109
+
110
+ # 4) Samtools QC metrics on split BAM files
111
+ if os.path.isdir(bam_qc_dir):
112
+ print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
113
+ else:
114
+ make_dirs([bam_qc_dir])
115
+ bam_qc(bam_files, bam_qc_dir, threads, modality='direct')
74
116
 
75
- # 4) Using nanopore modkit to work with modified BAM files ###
117
+ # 5) Using nanopore modkit to work with modified BAM files ###
76
118
  if os.path.isdir(mod_bed_dir):
77
- print(mod_bed_dir + ' already exists')
119
+ print(mod_bed_dir + ' already exists, skipping making modbeds')
78
120
  else:
79
121
  make_dirs([mod_bed_dir])
80
122
  modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
81
123
  make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
82
- if os.path.isdir(mod_tsv_dir):
83
- print(mod_tsv_dir + ' already exists')
124
+
125
+ # multiqc ###
126
+ if os.path.isdir(f"{split_dir}/multiqc"):
127
+ print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
84
128
  else:
85
- make_dirs([mod_tsv_dir])
86
- extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix) # Extract methylations calls for split BAM files into split TSV files
129
+ run_multiqc(split_dir, f"{split_dir}/multiqc")
130
+
131
+ make_dirs([mod_tsv_dir])
132
+ extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified, threads) # Extract methylations calls for split BAM files into split TSV files
133
+
134
+ #6 Load the modification data from TSVs into an adata object
135
+ final_adata, final_adata_path = modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs, threads)
87
136
 
88
- #5 Load the modification data from TSVs into an adata object
89
- modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir)
137
+ return final_adata, final_adata_path, sorted_output, bam_files
@@ -42,6 +42,7 @@ class LoadExperimentConfig:
42
42
  """
43
43
  def __init__(self, experiment_config):
44
44
  import pandas as pd
45
+ print(f"Loading experiment config from {experiment_config}")
45
46
  # Read the CSV into a pandas DataFrame
46
47
  df = pd.read_csv(experiment_config)
47
48
  # Initialize an empty dictionary to store variables
@@ -1,14 +1,18 @@
1
1
  from .align_and_sort_BAM import align_and_sort_BAM
2
2
  from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ from .bam_qc import bam_qc
3
4
  from .bed_to_bigwig import bed_to_bigwig
4
5
  from .binarize_converted_base_identities import binarize_converted_base_identities
5
6
  from .canoncall import canoncall
6
7
  from .complement_base_list import complement_base_list
7
- from .converted_BAM_to_adata import converted_BAM_to_adata
8
+ from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
8
9
  from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
9
10
  from .count_aligned_reads import count_aligned_reads
11
+ from .demux_and_index_BAM import demux_and_index_BAM
10
12
  from .extract_base_identities import extract_base_identities
11
13
  from .extract_mods import extract_mods
14
+ from .extract_read_features_from_bam import extract_read_features_from_bam
15
+ from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
12
16
  from .extract_readnames_from_BAM import extract_readnames_from_BAM
13
17
  from .find_conversion_sites import find_conversion_sites
14
18
  from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
@@ -23,22 +27,29 @@ from .modkit_extract_to_adata import modkit_extract_to_adata
23
27
  from .modQC import modQC
24
28
  from .one_hot_encode import one_hot_encode
25
29
  from .ohe_batching import ohe_batching
30
+ from .one_hot_decode import one_hot_decode
31
+ from .ohe_layers_decode import ohe_layers_decode
26
32
  from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
33
+ from .run_multiqc import run_multiqc
27
34
  from .separate_bam_by_bc import separate_bam_by_bc
28
35
  from .split_and_index_BAM import split_and_index_BAM
29
36
 
30
37
  __all__ = [
31
38
  "align_and_sort_BAM",
32
39
  "aligned_BAM_to_bed",
40
+ "bam_qc",
33
41
  "bed_to_bigwig",
34
42
  "binarize_converted_base_identities",
35
43
  "canoncall",
36
44
  "complement_base_list",
37
- "converted_BAM_to_adata",
45
+ "converted_BAM_to_adata_II",
38
46
  "concatenate_fastqs_to_bam",
39
47
  "count_aligned_reads",
48
+ "demux_and_index_BAM",
40
49
  "extract_base_identities",
41
50
  "extract_mods",
51
+ "extract_read_features_from_bam",
52
+ "extract_read_lengths_from_bed",
42
53
  "extract_readnames_from_BAM",
43
54
  "find_conversion_sites",
44
55
  "convert_FASTA_record",
@@ -54,7 +65,10 @@ __all__ = [
54
65
  "modQC",
55
66
  "one_hot_encode",
56
67
  "ohe_batching",
68
+ "one_hot_decode",
69
+ "ohe_layers_decode",
57
70
  "plot_read_length_and_coverage_histograms",
71
+ "run_multiqc",
58
72
  "separate_bam_by_bc",
59
73
  "split_and_index_BAM"
60
74
  ]
@@ -1,6 +1,6 @@
1
1
  ## align_and_sort_BAM
2
2
 
3
- def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
3
+ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligned_outputs', make_bigwigs=False, threads=None):
4
4
  """
5
5
  A wrapper for running dorado aligner and samtools functions
6
6
 
@@ -9,6 +9,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
9
9
  input (str): File path to the basecalled file to align. Works for .bam and .fastq files
10
10
  bam_suffix (str): The suffix to use for the BAM file.
11
11
  output_directory (str): A file path to the directory to output all the analyses.
12
+ make_bigwigs (bool): Whether to make bigwigs
13
+ threads (int): Number of additional threads to use
12
14
 
13
15
  Returns:
14
16
  None
@@ -16,9 +18,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
16
18
  """
17
19
  import subprocess
18
20
  import os
19
- from .aligned_BAM_to_bed import aligned_BAM_to_bed
20
- from .extract_readnames_from_BAM import extract_readnames_from_BAM
21
- from .make_dirs import make_dirs
21
+
22
22
  input_basename = os.path.basename(input)
23
23
  input_suffix = '.' + input_basename.split('.')[1]
24
24
 
@@ -28,21 +28,32 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
28
28
  aligned_sorted_BAM=f"{aligned_BAM}_sorted"
29
29
  aligned_output = aligned_BAM + bam_suffix
30
30
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
31
+
32
+ if threads:
33
+ threads = str(threads)
34
+ else:
35
+ pass
31
36
 
32
37
  # Run dorado aligner
33
- subprocess.run(["dorado", "aligner", "--secondary", "no", fasta, input], stdout=open(aligned_output, "w"))
38
+ print(f"Aligning BAM to Reference: {input}")
39
+ if threads:
40
+ alignment_command = ["dorado", "aligner", "-t", threads, '--mm2-opts', "-N 1", fasta, input]
41
+ else:
42
+ alignment_command = ["dorado", "aligner", '--mm2-opts', "-N 1", fasta, input]
43
+ subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
34
44
 
35
45
  # Sort the BAM on positional coordinates
36
- subprocess.run(["samtools", "sort", "-o", aligned_sorted_output, aligned_output])
46
+ print(f"Sorting BAM: {aligned_output}")
47
+ if threads:
48
+ sort_command = ["samtools", "sort", "-@", threads, "-o", aligned_sorted_output, aligned_output]
49
+ else:
50
+ sort_command = ["samtools", "sort", "-o", aligned_sorted_output, aligned_output]
51
+ subprocess.run(sort_command)
37
52
 
38
53
  # Create a BAM index file
39
- subprocess.run(["samtools", "index", aligned_sorted_output])
40
-
41
- # Make a bed file of coordinates for the BAM
42
- plotting_dir = os.path.join(output_directory, 'coverage_and_readlength_histograms')
43
- bed_dir = os.path.join(output_directory, 'read_alignment_coordinates')
44
- make_dirs([plotting_dir, bed_dir])
45
- aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)
46
-
47
- # Make a text file of reads for the BAM
48
- extract_readnames_from_BAM(aligned_sorted_output)
54
+ print(f"Indexing BAM: {aligned_sorted_output}")
55
+ if threads:
56
+ index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
57
+ else:
58
+ index_command = ["samtools", "index", aligned_sorted_output]
59
+ subprocess.run(index_command)