smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. smftools/__init__.py +29 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  5. smftools/datasets/F1_sample_sheet.csv +5 -0
  6. smftools/datasets/__init__.py +9 -0
  7. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  8. smftools/datasets/datasets.py +28 -0
  9. smftools/informatics/__init__.py +16 -0
  10. smftools/informatics/archived/bam_conversion.py +59 -0
  11. smftools/informatics/archived/bam_direct.py +63 -0
  12. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  13. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  14. smftools/informatics/basecall_pod5s.py +80 -0
  15. smftools/informatics/conversion_smf.py +132 -0
  16. smftools/informatics/direct_smf.py +137 -0
  17. smftools/informatics/fast5_to_pod5.py +21 -0
  18. smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
  19. smftools/informatics/helpers/__init__.py +74 -0
  20. smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
  22. smftools/informatics/helpers/archived/informatics.py +260 -0
  23. smftools/informatics/helpers/archived/load_adata.py +516 -0
  24. smftools/informatics/helpers/bam_qc.py +66 -0
  25. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  26. smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
  27. smftools/informatics/helpers/canoncall.py +34 -0
  28. smftools/informatics/helpers/complement_base_list.py +21 -0
  29. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
  30. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  31. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  32. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  33. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  34. smftools/informatics/helpers/extract_base_identities.py +44 -0
  35. smftools/informatics/helpers/extract_mods.py +83 -0
  36. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  37. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  38. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  39. smftools/informatics/helpers/find_conversion_sites.py +50 -0
  40. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  41. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  42. smftools/informatics/helpers/get_native_references.py +28 -0
  43. smftools/informatics/helpers/index_fasta.py +12 -0
  44. smftools/informatics/helpers/make_dirs.py +21 -0
  45. smftools/informatics/helpers/make_modbed.py +27 -0
  46. smftools/informatics/helpers/modQC.py +27 -0
  47. smftools/informatics/helpers/modcall.py +36 -0
  48. smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
  49. smftools/informatics/helpers/ohe_batching.py +76 -0
  50. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  51. smftools/informatics/helpers/one_hot_decode.py +27 -0
  52. smftools/informatics/helpers/one_hot_encode.py +57 -0
  53. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
  54. smftools/informatics/helpers/run_multiqc.py +28 -0
  55. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  56. smftools/informatics/helpers/split_and_index_BAM.py +36 -0
  57. smftools/informatics/load_adata.py +182 -0
  58. smftools/informatics/readwrite.py +106 -0
  59. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  60. smftools/informatics/subsample_pod5.py +104 -0
  61. smftools/plotting/__init__.py +15 -0
  62. smftools/plotting/classifiers.py +355 -0
  63. smftools/plotting/general_plotting.py +205 -0
  64. smftools/plotting/position_stats.py +462 -0
  65. smftools/preprocessing/__init__.py +33 -0
  66. smftools/preprocessing/append_C_context.py +82 -0
  67. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  68. smftools/preprocessing/archives/preprocessing.py +614 -0
  69. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  70. smftools/preprocessing/binarize_on_Youden.py +45 -0
  71. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  72. smftools/preprocessing/calculate_complexity.py +72 -0
  73. smftools/preprocessing/calculate_consensus.py +47 -0
  74. smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
  75. smftools/preprocessing/calculate_coverage.py +42 -0
  76. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  77. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  78. smftools/preprocessing/calculate_position_Youden.py +115 -0
  79. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  80. smftools/preprocessing/clean_NaN.py +46 -0
  81. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  82. smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
  83. smftools/preprocessing/filter_reads_on_length.py +51 -0
  84. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  85. smftools/preprocessing/invert_adata.py +30 -0
  86. smftools/preprocessing/load_sample_sheet.py +38 -0
  87. smftools/preprocessing/make_dirs.py +21 -0
  88. smftools/preprocessing/min_non_diagonal.py +25 -0
  89. smftools/preprocessing/recipes.py +127 -0
  90. smftools/preprocessing/subsample_adata.py +58 -0
  91. smftools/readwrite.py +198 -0
  92. smftools/tools/__init__.py +49 -0
  93. smftools/tools/apply_hmm.py +202 -0
  94. smftools/tools/apply_hmm_batched.py +241 -0
  95. smftools/tools/archived/classify_methylated_features.py +66 -0
  96. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  97. smftools/tools/archived/subset_adata_v1.py +32 -0
  98. smftools/tools/archived/subset_adata_v2.py +46 -0
  99. smftools/tools/calculate_distances.py +18 -0
  100. smftools/tools/calculate_umap.py +62 -0
  101. smftools/tools/call_hmm_peaks.py +105 -0
  102. smftools/tools/classifiers.py +787 -0
  103. smftools/tools/cluster_adata_on_methylation.py +105 -0
  104. smftools/tools/data/__init__.py +2 -0
  105. smftools/tools/data/anndata_data_module.py +90 -0
  106. smftools/tools/data/preprocessing.py +6 -0
  107. smftools/tools/display_hmm.py +18 -0
  108. smftools/tools/evaluation/__init__.py +0 -0
  109. smftools/tools/general_tools.py +69 -0
  110. smftools/tools/hmm_readwrite.py +16 -0
  111. smftools/tools/inference/__init__.py +1 -0
  112. smftools/tools/inference/lightning_inference.py +41 -0
  113. smftools/tools/models/__init__.py +9 -0
  114. smftools/tools/models/base.py +14 -0
  115. smftools/tools/models/cnn.py +34 -0
  116. smftools/tools/models/lightning_base.py +41 -0
  117. smftools/tools/models/mlp.py +17 -0
  118. smftools/tools/models/positional.py +17 -0
  119. smftools/tools/models/rnn.py +16 -0
  120. smftools/tools/models/sklearn_models.py +40 -0
  121. smftools/tools/models/transformer.py +133 -0
  122. smftools/tools/models/wrappers.py +20 -0
  123. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  124. smftools/tools/position_stats.py +239 -0
  125. smftools/tools/read_stats.py +70 -0
  126. smftools/tools/subset_adata.py +28 -0
  127. smftools/tools/train_hmm.py +78 -0
  128. smftools/tools/training/__init__.py +1 -0
  129. smftools/tools/training/train_lightning_model.py +47 -0
  130. smftools/tools/utils/__init__.py +2 -0
  131. smftools/tools/utils/device.py +10 -0
  132. smftools/tools/utils/grl.py +14 -0
  133. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
  134. smftools-0.1.7.dist-info/RECORD +136 -0
  135. smftools-0.1.6.dist-info/RECORD +0 -4
  136. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  137. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,137 @@
1
## direct_smf

def direct_smf(fasta, output_directory, mod_list, model_dir, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads):
    """
    Processes sequencing data from a direct methylation detection Nanopore SMF experiment to an AnnData object.

    Parameters:
        fasta (str): File path to the reference genome to align to.
        output_directory (str): A file path to the directory to output all the analyses.
        mod_list (list): A list of strings of the modification types to use in the analysis.
        model_dir (str): a string representing the file path to the dorado basecalling model directory.
        model (str): a string representing the dorado basecalling model.
        thresholds (list): A list of floats to pass for call thresholds.
        input_data_path (str): a string representing the file path to the experiment directory containing the input sequencing files.
        split_dir (str): A string representing the file path to the directory to split the BAMs into.
        barcode_kit (str): A string representing the barcoding kit used in the experiment.
        mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
        experiment_name (str): A string to provide an experiment name to the output adata file.
        bam_suffix (str): A suffix to add to the bam file.
        batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
        basecall (bool): Whether to basecall.
        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
        device (str): Device to use for basecalling. auto, metal, cpu, cuda.
        make_bigwigs (bool): Whether to make bigwigs.
        skip_unclassified (bool): Whether to skip unclassified reads when extracting mods and loading anndata.
        delete_batch_hdfs (bool): Whether to delete intermediate hdf5 files.
        threads (int): cpu threads available for processing.

    Returns:
        final_adata (AnnData): The loaded final AnnData object.
        final_adata_path (str): Path to the final adata object.
        sorted_output (str): Path to the aligned, sorted BAM.
        bam_files (list): Paths to the demultiplexed, barcode-split BAM files.
    """
    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc
    import os

    # Derive the basecalled BAM path (without suffix). When basecalling, the name
    # encodes the model and modification types; otherwise reuse the input BAM's name.
    if basecall:
        model_basename = os.path.basename(model)
        model_basename = model_basename.replace('.', '_')
        mod_string = "_".join(mod_list)
        bam = f"{output_directory}/{model_basename}_{mod_string}_calls"
    else:
        bam_base = os.path.basename(input_data_path).split('.bam')[0]
        bam = os.path.join(output_directory, bam_base)
    aligned_BAM = f"{bam}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"

    # Record the demux stringency in the split directory name so runs with
    # different barcode requirements do not collide.
    if barcode_both_ends:
        split_dir = split_dir + '_both_ends_barcoded'
    else:
        split_dir = split_dir + '_at_least_one_end_barcoded'

    mod_bed_dir = f"{split_dir}/split_mod_beds"
    mod_tsv_dir = f"{split_dir}/split_mod_tsvs"
    bam_qc_dir = f"{split_dir}/bam_qc"

    aligned_sorted_output = aligned_sorted_BAM + bam_suffix
    # Map input modification names to the names modkit expects.
    mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
    mods = [mod_map[mod] for mod in mod_list]

    # Make a FAI and .chrom.names file for the fasta
    get_chromosome_lengths(fasta)

    os.chdir(output_directory)

    # 1) Basecall using dorado (skipped when a basecalled BAM already exists).
    if basecall:
        modcall_output = bam + bam_suffix
        if os.path.exists(modcall_output):
            print(modcall_output + ' already exists. Using existing basecalled BAM.')
        else:
            modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        modcall_output = input_data_path

    # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
    aligned_output = aligned_BAM + bam_suffix
    sorted_output = aligned_sorted_BAM + bam_suffix
    if os.path.exists(aligned_output) and os.path.exists(sorted_output):
        print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
    else:
        align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory, make_bigwigs, threads)

    # Make beds and provide basic histograms
    bed_dir = os.path.join(output_directory, 'beds')
    if os.path.isdir(bed_dir):
        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
    else:
        aligned_BAM_to_bed(aligned_output, output_directory, fasta, make_bigwigs, threads)

    # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
    if os.path.isdir(split_dir):
        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
        bam_files = os.listdir(split_dir)
        bam_files = [os.path.join(split_dir, file) for file in bam_files if '.bam' in file and '.bai' not in file and 'unclassified' not in file]
        bam_files.sort()
    else:
        make_dirs([split_dir])
        bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
        # split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA) # deprecated, just use dorado demux

    # Make beds and provide basic histograms
    bed_dir = os.path.join(split_dir, 'beds')
    if os.path.isdir(bed_dir):
        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
    else:
        # Loop variable renamed from 'bam' so it does not clobber the basecalled
        # BAM path computed above.
        for split_bam in bam_files:
            aligned_BAM_to_bed(split_bam, split_dir, fasta, make_bigwigs, threads)

    # 4) Samtools QC metrics on split BAM files
    if os.path.isdir(bam_qc_dir):
        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
    else:
        make_dirs([bam_qc_dir])
        bam_qc(bam_files, bam_qc_dir, threads, modality='direct')

    # 5) Using nanopore modkit to work with modified BAM files ###
    if os.path.isdir(mod_bed_dir):
        print(mod_bed_dir + ' already exists, skipping making modbeds')
    else:
        make_dirs([mod_bed_dir])
        modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
        make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample

    # multiqc ###
    if os.path.isdir(f"{split_dir}/multiqc"):
        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
    else:
        run_multiqc(split_dir, f"{split_dir}/multiqc")

    make_dirs([mod_tsv_dir])
    extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified, threads) # Extract methylations calls for split BAM files into split TSV files

    #6 Load the modification data from TSVs into an adata object
    final_adata, final_adata_path = modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs, threads)

    return final_adata, final_adata_path, sorted_output, bam_files
@@ -0,0 +1,21 @@
1
# fast5_to_pod5

def fast5_to_pod5(fast5_dir, output_pod5='FAST5s_to_POD5.pod5'):
    """
    Convert Nanopore FAST5 file(s) to a single POD5 file.

    Parameters:
        fast5_dir (str): Path to a single FAST5 file, or to a directory containing
            FAST5 files to convert into a single POD5 output.
        output_pod5 (str): The name of the output POD5.

    Returns:
        None

    Raises:
        FileNotFoundError: If the input path does not exist, or if it is a
            directory that contains no .fast5 files.
    """
    import subprocess
    from pathlib import Path

    source = Path(fast5_dir)
    if source.is_file():
        subprocess.run(["pod5", "convert", "fast5", str(source), "--output", output_pod5])
    elif source.is_dir():
        # subprocess.run with a list does NOT perform shell globbing, so the
        # previous f".{fast5_dir}*.fast5" pattern was passed to pod5 as a
        # literal (and malformed) path. Expand the glob here instead.
        fast5_files = sorted(str(p) for p in source.glob("*.fast5"))
        if not fast5_files:
            raise FileNotFoundError(f"No .fast5 files found in directory: {fast5_dir}")
        subprocess.run(["pod5", "convert", "fast5", *fast5_files, "--output", output_pod5])
    else:
        raise FileNotFoundError(f"Input path does not exist: {fast5_dir}")
@@ -0,0 +1,75 @@
1
## LoadExperimentConfig

class LoadExperimentConfig:
    """
    Parse an experiment configuration CSV into a dictionary of typed parameters.

    The CSV must have three columns: ``variable``, ``value``, and ``type``.
    Each row is converted according to its declared type:

    - ``list``   -> bracket/paren characters are stripped and the remainder is
      split on commas (e.g. ``"[bottom, top]"`` becomes ``['bottom', 'top']``).
    - ``int``    -> ``int(value)``
    - ``float``  -> ``float(value)``
    - ``bool``   -> True only when the value is (case-insensitively) ``true``.
    - ``string`` -> ``str(value)``

    Empty cells and the literal strings ``'None'``/``''`` are stored as ``None``.
    Unrecognized type labels leave the raw value untouched.

    Parameters:
        experiment_config (str): File path to the experiment configuration CSV.

    Attributes:
        var_dict (dict): Mapping of variable name -> parsed value.
    """
    def __init__(self, experiment_config):
        import pandas as pd

        print(f"Loading experiment config from {experiment_config}")
        config_table = pd.read_csv(experiment_config)
        parsed = {}
        for _, record in config_table.iterrows():
            key = str(record['variable'])
            raw = record['value']
            declared_type = record['type']

            # Missing cells and explicit 'None' markers become Python None.
            if pd.isna(raw) or raw in ['None', '']:
                parsed[key] = None
                continue

            if declared_type == 'list':
                # Turn a "[a, b]"-style string into a real list of strings.
                raw = raw.strip('()[]').replace(', ', ',').split(',')
            elif declared_type == 'int':
                raw = int(raw)
            elif declared_type == 'float':
                raw = float(raw)
            elif declared_type == 'bool':
                raw = raw.lower() == 'true'
            elif declared_type == 'string':
                raw = str(raw)
            parsed[key] = raw

        # Expose the parsed configuration on the instance.
        self.var_dict = parsed
@@ -0,0 +1,74 @@
1
"""Public API of the informatics helpers subpackage.

Re-exports one helper per module (BAM alignment/demux/QC, FASTA conversion,
modkit extraction, one-hot encoding utilities, plotting, etc.) so callers can
do ``from .helpers import <name>``. ``__all__`` mirrors the imports below.
"""
from .align_and_sort_BAM import align_and_sort_BAM
from .aligned_BAM_to_bed import aligned_BAM_to_bed
from .bam_qc import bam_qc
from .bed_to_bigwig import bed_to_bigwig
from .binarize_converted_base_identities import binarize_converted_base_identities
from .canoncall import canoncall
from .complement_base_list import complement_base_list
from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
from .count_aligned_reads import count_aligned_reads
from .demux_and_index_BAM import demux_and_index_BAM
from .extract_base_identities import extract_base_identities
from .extract_mods import extract_mods
from .extract_read_features_from_bam import extract_read_features_from_bam
from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
from .extract_readnames_from_BAM import extract_readnames_from_BAM
from .find_conversion_sites import find_conversion_sites
from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
from .get_chromosome_lengths import get_chromosome_lengths
from .get_native_references import get_native_references
from .index_fasta import index_fasta
from .LoadExperimentConfig import LoadExperimentConfig
from .make_dirs import make_dirs
from .make_modbed import make_modbed
from .modcall import modcall
from .modkit_extract_to_adata import modkit_extract_to_adata
from .modQC import modQC
from .one_hot_encode import one_hot_encode
from .ohe_batching import ohe_batching
from .one_hot_decode import one_hot_decode
from .ohe_layers_decode import ohe_layers_decode
from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
from .run_multiqc import run_multiqc
from .separate_bam_by_bc import separate_bam_by_bc
from .split_and_index_BAM import split_and_index_BAM

__all__ = [
    "align_and_sort_BAM",
    "aligned_BAM_to_bed",
    "bam_qc",
    "bed_to_bigwig",
    "binarize_converted_base_identities",
    "canoncall",
    "complement_base_list",
    "converted_BAM_to_adata_II",
    "concatenate_fastqs_to_bam",
    "count_aligned_reads",
    "demux_and_index_BAM",
    "extract_base_identities",
    "extract_mods",
    "extract_read_features_from_bam",
    "extract_read_lengths_from_bed",
    "extract_readnames_from_BAM",
    "find_conversion_sites",
    "convert_FASTA_record",
    "generate_converted_FASTA",
    "get_chromosome_lengths",
    "get_native_references",
    "index_fasta",
    "LoadExperimentConfig",
    "make_dirs",
    "make_modbed",
    "modcall",
    "modkit_extract_to_adata",
    "modQC",
    "one_hot_encode",
    "ohe_batching",
    "one_hot_decode",
    "ohe_layers_decode",
    "plot_read_length_and_coverage_histograms",
    "run_multiqc",
    "separate_bam_by_bc",
    "split_and_index_BAM"
]
@@ -0,0 +1,59 @@
1
## align_and_sort_BAM

def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligned_outputs', make_bigwigs=False, threads=None):
    """
    A wrapper for running dorado aligner and samtools functions.

    Parameters:
        fasta (str): File path to the reference genome to align to.
        input (str): File path to the basecalled file to align. Works for .bam and .fastq files.
        bam_suffix (str): The suffix to use for the BAM file.
        output_directory (str): A file path to the directory to output all the analyses.
        make_bigwigs (bool): Whether to make bigwigs. Accepted for interface
            compatibility; not used inside this function.
        threads (int): Number of additional threads to use.

    Returns:
        None
        The function writes out: 1) an aligned BAM, 2) an aligned_sorted BAM,
        3) an index file for the aligned_sorted BAM.

    Raises:
        subprocess.CalledProcessError: If dorado or samtools exits non-zero.
    """
    import subprocess
    import os

    input_basename = os.path.basename(input)
    # Output stem = everything before the first '.' in the input basename.
    # partition() tolerates extensionless filenames, where the previous
    # split('.')[1] raised IndexError.
    stem = input_basename.partition('.')[0]
    output_path_minus_suffix = os.path.join(output_directory, stem)

    aligned_BAM = f"{output_path_minus_suffix}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
    aligned_output = aligned_BAM + bam_suffix
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix

    # Subprocess arguments must be strings.
    threads = str(threads) if threads else None

    # Run dorado aligner; '--mm2-opts -N 1' keeps a single best alignment per read.
    print(f"Aligning BAM to Reference: {input}")
    alignment_command = ["dorado", "aligner"]
    if threads:
        alignment_command += ["-t", threads]
    alignment_command += ['--mm2-opts', "-N 1", fasta, input]
    # Context manager closes the output handle even on failure (the previous
    # bare open() leaked the file descriptor); check=True surfaces tool
    # failures instead of silently continuing with a truncated BAM.
    with open(aligned_output, "w") as aligned_handle:
        subprocess.run(alignment_command, stdout=aligned_handle, check=True)

    # Sort the BAM on positional coordinates
    print(f"Sorting BAM: {aligned_output}")
    sort_command = ["samtools", "sort"]
    if threads:
        sort_command += ["-@", threads]
    sort_command += ["-o", aligned_sorted_output, aligned_output]
    subprocess.run(sort_command, check=True)

    # Create a BAM index file
    print(f"Indexing BAM: {aligned_sorted_output}")
    index_command = ["samtools", "index"]
    if threads:
        index_command += ["-@", threads]
    index_command.append(aligned_sorted_output)
    subprocess.run(index_command, check=True)
@@ -0,0 +1,74 @@
1
def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
    """
    Takes an aligned BAM as input and writes a BED file of reads as output.
    Bed columns are: Record name, start position, end position, read length, read name.

    Parameters:
        aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
        out_dir (str): Directory to output files.
        fasta (str): File path to the reference genome.
        make_bigwigs (bool): Whether to generate bigwig files.
        threads (int): Number of threads to use.

    Returns:
        None
    """
    import subprocess
    import os
    import concurrent.futures
    from concurrent.futures import ProcessPoolExecutor
    from .bed_to_bigwig import bed_to_bigwig
    from . import make_dirs
    from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms

    threads = threads or os.cpu_count()  # Use max available cores if not specified

    # Create necessary directories
    plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
    bed_dir = os.path.join(out_dir, "beds")
    make_dirs([plotting_dir, bed_dir])

    bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))

    print(f"Creating BED from BAM: {aligned_BAM} using {threads} threads...")

    # Convert BAM to BED format by piping `samtools view` through awk.
    # awk fields: $3 = reference name, $4 = 1-based leftmost position,
    # $10 = query sequence, $1 = read name.
    # NOTE(review): the end coordinate is derived from the query sequence
    # length, which presumably approximates the alignment end — it does not
    # account for indels or clipping in the CIGAR; confirm this is intended.
    with open(bed_output, "w") as output_file:
        samtools_view = subprocess.Popen(["samtools", "view", "-@", str(threads), aligned_BAM], stdout=subprocess.PIPE)
        awk_process = subprocess.Popen(
            ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'],
            stdin=samtools_view.stdout,
            stdout=output_file
        )

        # Close our copy of the pipe so samtools receives SIGPIPE if awk exits
        # early, then wait for both processes to finish before closing the file.
        samtools_view.stdout.close()
        awk_process.wait()
        samtools_view.wait()

    print(f"BED file created: {bed_output}")

    def split_bed(bed):
        """Splits BED into aligned and unaligned reads.

        Lines whose first column is '*' (unmapped reads have no reference
        name) go to the *_unaligned.bed file; everything else goes to
        *_aligned.bed. The combined input BED is deleted afterwards.
        Returns the path to the aligned BED.
        """
        aligned = bed.replace(".bed", "_aligned.bed")
        unaligned = bed.replace(".bed", "_unaligned.bed")

        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
            for line in infile:
                (unaligned_out if line.startswith("*") else aligned_out).write(line)

        os.remove(bed)
        return aligned

    print(f"Splitting BED: {bed_output}")
    aligned_bed = split_bed(bed_output)

    # Fan out the downstream plotting / bigwig generation; processes (not
    # threads) are used so the matplotlib/CPU-bound work runs in parallel.
    with ProcessPoolExecutor() as executor:  # Use processes instead of threads
        futures = []
        futures.append(executor.submit(plot_read_length_and_coverage_histograms, aligned_bed, plotting_dir))
        if make_bigwigs:
            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))

        # Wait for all tasks to complete
        concurrent.futures.wait(futures)

    print("Processing completed successfully.")