smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,80 @@
1
+ # basecall_pod5s
2
+
3
def basecall_pod5s(config_path):
    """
    Basecall from POD5s (or FAST5s, which are first converted to a POD5) given a config file.

    Parameters:
        config_path (str): File path to the basecall configuration file

    Returns:
        None
        Basecalls are written by the canoncall/modcall helpers to a BAM prefix inside the output directory.

    Raises:
        ValueError: If input_data_path, output_directory, or model is missing from the config.
        FileNotFoundError: If input_data_path is neither an existing file nor an existing directory.
    """
    # Lazy importing of packages
    # NOTE(review): confirm that LoadExperimentConfig is actually exported by the helpers package.
    from .helpers import LoadExperimentConfig, make_dirs, canoncall, modcall
    from .fast5_to_pod5 import fast5_to_pod5
    import os
    from pathlib import Path

    # Default params
    bam_suffix = '.bam' # If different, change from here.
    pod5_suffixes = {'.pod5', '.p5'}
    fast5_suffixes = {'.fast5', '.f5'}

    # Load experiment config parameters
    experiment_config = LoadExperimentConfig(config_path)
    var_dict = experiment_config.var_dict

    # These below variables will point to default_value if they are empty in the experiment config or fully omitted from it.
    default_value = None

    # General config variable init
    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a single POD5/FAST5 file. Necessary.
    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
    model = var_dict.get('model', default_value) # needed for dorado basecaller
    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
    device = var_dict.get('device', 'auto')

    # Modified basecalling specific variable init.
    # The per-modification threshold settings are not consumed by this function, so they are not read here.
    mod_list = var_dict.get('mod_list', default_value)

    # Fail early with a clear message instead of a TypeError deep inside os.path / pathlib.
    required = (('input_data_path', input_data_path), ('output_directory', output_directory), ('model', model))
    missing = [name for name, value in required if not value]
    if missing:
        raise ValueError(f"Missing required config value(s): {', '.join(missing)}")

    # Resolve paths BEFORE changing the working directory; otherwise relative paths
    # would be re-interpreted relative to the output directory after os.chdir.
    input_data_path = os.path.abspath(input_data_path)
    output_directory = os.path.abspath(output_directory)

    # Make initial output directory
    make_dirs([output_directory])
    os.chdir(output_directory)

    # Get the input filetype
    input_path = Path(input_data_path)
    if input_path.is_file():
        # Use the final suffix so names such as "run.1.pod5" are classified correctly.
        input_data_filetype = input_path.suffix.lower()
        input_is_pod5 = input_data_filetype in pod5_suffixes
        input_is_fast5 = input_data_filetype in fast5_suffixes
    elif input_path.is_dir():
        # Collect the suffixes of the files in the input data dir
        found_suffixes = {Path(file_name).suffix.lower() for file_name in os.listdir(input_data_path)}
        input_is_pod5 = bool(found_suffixes & pod5_suffixes)
        input_is_fast5 = bool(found_suffixes & fast5_suffixes)
    else:
        raise FileNotFoundError(f"input_data_path is not an existing file or directory: {input_data_path}")

    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
    if input_is_fast5 and not input_is_pod5:
        # take the fast5 input and write out a single pod5 file into the output directory.
        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
        fast5_to_pod5(input_data_path, output_pod5)
        # Reassign the input path to point to the new pod5 file.
        input_data_path = output_pod5

    # Dots in the model name are replaced so the output prefix has no extension-like parts.
    model_basename = os.path.basename(model).replace('.', '_')

    if mod_list:
        mod_string = "_".join(mod_list)
        bam = f"{output_directory}/{model_basename}_{mod_string}_calls"
        modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        bam = f"{output_directory}/{model_basename}_canonical_basecalls"
        canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
@@ -0,0 +1,24 @@
1
+ # fast5_to_pod5
2
+
3
def fast5_to_pod5(fast5_dir, output_pod5='FAST5s_to_POD5.pod5'):
    """
    Convert Nanopore FAST5 files to a single POD5 file using the `pod5 convert fast5` command.

    Parameters:
        fast5_dir (str | list | tuple): Path to a directory containing FAST5 files, path to a single FAST5 file, or a list/tuple of FAST5 file paths.
        output_pod5 (str): The name of the output POD5.

    Returns:
        None

    Raises:
        FileNotFoundError: If fast5_dir does not exist, or if no FAST5 inputs could be found.
        subprocess.CalledProcessError: If the pod5 conversion command exits with a non-zero status.
    """
    import subprocess
    from pathlib import Path

    if isinstance(fast5_dir, (list, tuple)):
        # Build a fresh list so tuples work too (list + tuple concatenation raises TypeError).
        fast5_inputs = [str(path) for path in fast5_dir]
    elif Path(fast5_dir).is_file():
        fast5_inputs = [str(fast5_dir)]
    elif Path(fast5_dir).is_dir():
        # subprocess does not go through a shell, so a "*.fast5" wildcard would be passed
        # literally to pod5; expand the directory contents here instead.
        fast5_inputs = sorted(str(path) for path in Path(fast5_dir).glob("*.fast5"))
    else:
        raise FileNotFoundError(f"FAST5 input path does not exist: {fast5_dir}")

    if not fast5_inputs:
        raise FileNotFoundError(f"No FAST5 files found in: {fast5_dir}")

    cmd = ["pod5", "convert", "fast5", *fast5_inputs, "--output", str(output_pod5)]
    # check=True surfaces a failed conversion here instead of a confusing downstream error.
    subprocess.run(cmd, check=True)
@@ -0,0 +1,73 @@
1
+ from .align_and_sort_BAM import align_and_sort_BAM
2
+ from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ from .bam_qc import bam_qc
4
+ from .bed_to_bigwig import bed_to_bigwig
5
+ from .binarize_converted_base_identities import binarize_converted_base_identities
6
+ from .canoncall import canoncall
7
+ from .complement_base_list import complement_base_list
8
+ from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
9
+ from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
10
+ from .count_aligned_reads import count_aligned_reads
11
+ from .demux_and_index_BAM import demux_and_index_BAM
12
+ from .discover_input_files import *
13
+ from .extract_base_identities import extract_base_identities
14
+ from .extract_mods import extract_mods
15
+ from .extract_read_features_from_bam import extract_read_features_from_bam
16
+ from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
17
+ from .extract_readnames_from_BAM import extract_readnames_from_BAM
18
+ from .find_conversion_sites import find_conversion_sites
19
+ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
20
+ from .get_chromosome_lengths import get_chromosome_lengths
21
+ from .get_native_references import get_native_references
22
+ from .index_fasta import index_fasta
23
+ from .make_dirs import make_dirs
24
+ from .make_modbed import make_modbed
25
+ from .modcall import modcall
26
+ from .modkit_extract_to_adata import modkit_extract_to_adata
27
+ from .modQC import modQC
28
+ from .one_hot_encode import one_hot_encode
29
+ from .ohe_batching import ohe_batching
30
+ from .one_hot_decode import one_hot_decode
31
+ from .ohe_layers_decode import ohe_layers_decode
32
+ from .plot_bed_histograms import plot_bed_histograms
33
+ from .run_multiqc import run_multiqc
34
+ from .separate_bam_by_bc import separate_bam_by_bc
35
+ from .split_and_index_BAM import split_and_index_BAM
36
+
37
+ __all__ = [
38
+ "align_and_sort_BAM",
39
+ "aligned_BAM_to_bed",
40
+ "bam_qc",
41
+ "bed_to_bigwig",
42
+ "binarize_converted_base_identities",
43
+ "canoncall",
44
+ "complement_base_list",
45
+ "converted_BAM_to_adata_II",
46
+ "concatenate_fastqs_to_bam",
47
+ "count_aligned_reads",
48
+ "demux_and_index_BAM",
49
+ "extract_base_identities",
50
+ "extract_mods",
51
+ "extract_read_features_from_bam",
52
+ "extract_read_lengths_from_bed",
53
+ "extract_readnames_from_BAM",
54
+ "find_conversion_sites",
55
+ "convert_FASTA_record",
56
+ "generate_converted_FASTA",
57
+ "get_chromosome_lengths",
58
+ "get_native_references",
59
+ "index_fasta",
60
+ "make_dirs",
61
+ "make_modbed",
62
+ "modcall",
63
+ "modkit_extract_to_adata",
64
+ "modQC",
65
+ "one_hot_encode",
66
+ "ohe_batching",
67
+ "one_hot_decode",
68
+ "ohe_layers_decode",
69
+ "plot_bed_histograms",
70
+ "run_multiqc",
71
+ "separate_bam_by_bc",
72
+ "split_and_index_BAM"
73
+ ]
@@ -0,0 +1,86 @@
1
+ ## align_and_sort_BAM
2
+
3
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
    """
    A wrapper for running an aligner (minimap2 or dorado aligner) followed by samtools sort and index.

    Parameters:
        fasta (str): File path to the reference genome to align to.
        input (str): File path to the basecalled file to align. Works for .bam and .fastq files
        bam_suffix (str): The suffix to use for the BAM file.
        output_directory (str): A file path to the directory to output all the analyses. It is not created here.
        make_bigwigs (bool): Whether to make bigwigs. Not used by this function.
        threads (int): Number of additional threads to use
        aligner (str): Aligner to use. minimap2 and dorado options
        aligner_args (list): list of optional parameters to use for the alignment. The list is never mutated.

    Returns:
        None
        The function writes out files for: 1) An aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM.
        If the aligner is not recognized, a message is printed and nothing is written.

    Raises:
        subprocess.CalledProcessError: If samtools, minimap2, or dorado exits with a non-zero status.
    """
    import subprocess
    import os
    import tempfile

    if aligner not in ('minimap2', 'dorado'):
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # Copy so the shared default list can never be modified through this call.
    aligner_args = list(aligner_args)

    # Output names are derived from the text before the first dot of the input basename
    # (same naming as before, but this no longer raises on names without a dot).
    input_basename = os.path.basename(input)
    input_stem = input_basename.split('.')[0]

    output_path_minus_suffix = os.path.join(output_directory, input_stem)

    aligned_BAM = f"{output_path_minus_suffix}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
    aligned_output = aligned_BAM + bam_suffix
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix

    thread_count = str(threads) if threads else None

    if aligner == 'minimap2':
        # The intermediate FASTQ is a uniquely named temporary file next to the outputs, so it
        # can never overwrite (and then delete) an input FASTQ that shares the same stem.
        fastq_fd, input_as_fastq = tempfile.mkstemp(prefix=f"{input_stem}_", suffix='.fastq', dir=output_directory)
        try:
            print(f"Converting BAM to FASTQ: {input}")
            with os.fdopen(fastq_fd, "w") as fastq_handle:
                subprocess.run(['samtools', 'fastq', input], stdout=fastq_handle, check=True)

            print(f"Aligning FASTQ to Reference: {input_as_fastq}")
            minimap_command = ['minimap2'] + aligner_args
            if thread_count:
                minimap_command += ['-t', thread_count]
            minimap_command += [fasta, input_as_fastq]
            with open(aligned_output, "w") as aligned_handle:
                subprocess.run(minimap_command, stdout=aligned_handle, check=True)
        finally:
            # Always remove the intermediate FASTQ, even when a step fails.
            if os.path.exists(input_as_fastq):
                os.remove(input_as_fastq)

    else:
        # Run dorado aligner
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if thread_count:
            alignment_command += ["-t", thread_count]
        alignment_command += aligner_args + [fasta, input]
        with open(aligned_output, "w") as aligned_handle:
            subprocess.run(alignment_command, stdout=aligned_handle, check=True)

    # Optional samtools threading arguments shared by sort and index
    samtools_threads = ["-@", thread_count] if thread_count else []

    # Sort the BAM on positional coordinates
    print(f"Sorting BAM: {aligned_output}")
    sort_command = ["samtools", "sort"] + samtools_threads + ["-o", aligned_sorted_output, aligned_output]
    subprocess.run(sort_command, check=True)

    # Create a BAM index file
    print(f"Indexing BAM: {aligned_sorted_output}")
    index_command = ["samtools", "index"] + samtools_threads + [aligned_sorted_output]
    subprocess.run(index_command, check=True)
@@ -0,0 +1,85 @@
1
def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
    """
    Takes an aligned BAM as input and writes a BED-like file of reads as output.
    Columns are: Record name, start position, end position, read length, read name, mapping quality, average base quality.
    The file is then split into aligned and unaligned reads, and the aligned file is plotted (and optionally converted to bigwig).

    Parameters:
        aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
        out_dir (str): Directory to output files.
        fasta (str): File path to the reference genome.
        make_bigwigs (bool): Whether to generate bigwig files.
        threads (int): Upper bound on worker processes for the plotting/bigwig step. Defaults to the CPU count.

    Returns:
        None

    Raises:
        Exception: Any exception raised inside the plotting or bigwig worker is re-raised here.
    """
    import os
    import pysam
    import numpy as np
    from concurrent.futures import ProcessPoolExecutor
    from .bed_to_bigwig import bed_to_bigwig
    from . import make_dirs
    from .plot_bed_histograms import plot_bed_histograms

    threads = threads or os.cpu_count()  # Use max available cores if not specified

    # Create necessary directories
    plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
    bed_dir = os.path.join(out_dir, "beds")
    make_dirs([plotting_dir, bed_dir])

    # Derive the name from the BAM stem rather than a global ".bam" text replacement, so the
    # result always ends in "_bed.bed" regardless of the BAM file's actual suffix.
    bam_stem = os.path.splitext(os.path.basename(aligned_BAM))[0]
    bed_output = os.path.join(bed_dir, f"{bam_stem}_bed.bed")

    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")

    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
        for read in bam.fetch(until_eof=True):
            rl = read.query_length or 0
            if read.is_unmapped:
                # Unmapped reads get a placeholder chromosome so they can be split out afterwards.
                chrom = "*"
                start1 = 1
                mapq = 0
            else:
                chrom = bam.get_reference_name(read.reference_id)
                # pysam reference_start is 0-based → +1 for 1-based SAM-like start
                start1 = int(read.reference_start) + 1
                mapq = int(read.mapping_quality)

            # End position in 1-based inclusive coords.
            # NOTE(review): this is derived from the query length, not the aligned reference span,
            # so reads with indels or clipping may not end exactly here — confirm this is intended.
            end1 = start1 + rl - 1

            qname = read.query_name
            quals = read.query_qualities
            if quals is None or rl == 0:
                avg_q = float("nan")
            else:
                avg_q = float(np.mean(quals))

            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")

    print(f"BED-like file created: {bed_output}")

    def split_bed(bed):
        """Splits into aligned and unaligned reads (chrom == '*'), removes the input, returns the aligned path."""
        # Only the final extension is rewritten, so the two outputs can never collide with the input.
        root, ext = os.path.splitext(bed)
        aligned = f"{root}_aligned{ext}"
        unaligned = f"{root}_unaligned{ext}"
        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
            for line in infile:
                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
        os.remove(bed)
        return aligned

    print(f"Splitting: {bed_output}")
    aligned_bed = split_bed(bed_output)

    tasks = [(plot_bed_histograms, (aligned_bed, plotting_dir, fasta))]
    if make_bigwigs:
        tasks.append((bed_to_bigwig, (fasta, aligned_bed)))

    max_workers = max(1, min(int(threads), len(tasks))) if threads else None
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(func, *args) for func, args in tasks]
        # result() re-raises any worker exception; merely waiting would silently discard it.
        for future in futures:
            future.result()

    print("Processing completed successfully.")
@@ -0,0 +1,260 @@
1
+ ## fasta_module
2
+ from .. import readwrite
3
+ # bioinformatic operations
4
+ from Bio import SeqIO
5
+ from Bio.SeqRecord import SeqRecord
6
+ from Bio.Seq import Seq
7
+ import pysam
8
+
9
+ ######################################################################################################
10
+ ## FASTA functionality
11
+ # General
12
+
13
+ # Conversion specific
14
def modify_sequence_and_id(record, modification_type, strand):
    """
    Input: Takes a FASTA record, modification type ('5mC', '6mA', or 'unconverted'), and strand ('top' or 'bottom') as input
    Output: Returns a new record object of the same class with the uppercased, converted sequence and an id of the form '<id>_<modification_type>_<strand>'
    Raises: ValueError if the modification type is not recognized, or if the strand is not recognized for a 5mC/6mA conversion
    Notes: For 'unconverted' the strand is only used to build the new id; it is not validated.
    """
    # (base to replace, replacement base) for each modification type and strand
    conversions = {
        '5mC': {'top': ('C', 'T'), 'bottom': ('G', 'A')},
        '6mA': {'top': ('A', 'G'), 'bottom': ('T', 'C')},
    }

    if modification_type == 'unconverted':
        new_seq = record.seq.upper()
    elif modification_type in conversions:
        strand_conversions = conversions[modification_type]
        if strand not in strand_conversions:
            # Previously this only printed and then failed later on an unbound variable.
            raise ValueError('need to provide a valid strand string: top or bottom')
        original_base, converted_base = strand_conversions[strand]
        new_seq = record.seq.upper().replace(original_base, converted_base)
    else:
        raise ValueError('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')

    new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
    # Return a new record with modified sequence and ID
    return record.__class__(new_seq, id=new_id, description=record.description)
44
+
45
def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
    """
    Input: An input FASTA path, the modification types of interest, the strands of interest, and an output FASTA path
    Output: Writes one FASTA containing every stranded conversion of every input record
    Notes: Each output record is produced by modify_sequence_and_id. The 'unconverted' type is emitted
    once per record, labelled with the first strand in strands.
    """
    converted_records = []
    with open(output_fasta, 'w') as fasta_out:
        for fasta_record in SeqIO.parse(input_fasta, 'fasta'):
            for mod in modification_types:
                for strand_index, strand_name in enumerate(strands):
                    # Only the first listed strand yields an unconverted copy.
                    if mod == 'unconverted' and strand_index > 0:
                        continue
                    print(f'converting {mod} on the {strand_name} strand of record {fasta_record}')
                    converted_records.append(modify_sequence_and_id(fasta_record, mod, strand_name))
        # Write all converted records in a single pass
        SeqIO.write(converted_records, fasta_out, 'fasta')
67
+
68
def find_coordinates(fasta_file, modification_type):
    """
    A function to find, for every unconverted record contained within a FASTA file, the coordinates of every cytosine (5mC)
    or every adenine (6mA) on the top strand, and of the complementary base (G or T) for the bottom strand.
    Input: A FASTA file and the modification_type of interest ('5mC' or '6mA')
    Returns:
        A dictionary called record_dict, which is keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string
        Coordinates are zero-indexed. For an unrecognized modification_type a message is printed and both coordinate lists are empty.
    """
    # NOTE(review): time_string is called unqualified; confirm it is defined or imported in this module.
    print('{0}: Finding positions of interest in reference FASTA > {1}'.format(time_string(), fasta_file))
    # (top strand base, bottom strand base, description used in the log message) per modification type
    targets = {
        '5mC': ('C', 'G', 'all cytosines'),
        '6mA': ('A', 'T', 'adenines of interest'),
    }
    record_dict = {}
    print('{0}: Opening FASTA file {1}'.format(time_string(), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Only iterate over the unconverted records for the reference
            if 'unconverted' not in record.id:
                continue
            print('{0}: Iterating over record {1} in FASTA file {2}'.format(time_string(), record, fasta_file))
            # Extract the sequence string of the record
            sequence = str(record.seq).upper()
            sequence_length = len(sequence)
            # Fresh lists per record: sharing one pair of lists across records made every
            # record's entry contain the coordinates of all records processed so far.
            top_strand_coordinates = []
            bottom_strand_coordinates = []
            if modification_type in targets:
                top_base, bottom_base, description = targets[modification_type]
                top_strand_coordinates = [i for i, base in enumerate(sequence) if base == top_base]
                bottom_strand_coordinates = [i for i, base in enumerate(sequence) if base == bottom_base]
                print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for {1}'.format(time_string(), description))
            else:
                print('modification_type not found. Please try 5mC or 6mA')
            record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
    return record_dict
114
+
115
+ # Direct methylation specific
116
def get_references(fasta_file):
    """
    Input: A FASTA file
    Returns:
        A dictionary keyed by the record ids contained within the FASTA. Each value is a list containing: 1) sequence length of the record, 2) uppercased sequence of the record
    """
    print('{0}: Opening FASTA file {1}'.format(time_string(), fasta_file))
    references = {}
    # Read-only pass over every record in the FASTA
    with open(fasta_file, "r") as fasta_handle:
        for fasta_record in SeqIO.parse(fasta_handle, "fasta"):
            upper_sequence = str(fasta_record.seq).upper()
            references[fasta_record.id] = [len(upper_sequence), upper_sequence]
    return references
133
+ ######################################################################################################
134
+
135
+ ######################################################################################################
136
+ ## BAM functionality
137
+ # General
138
def separate_bam_by_bc(input_bam, output_prefix):
    """
    Input: Takes a single BAM input. Also takes an output prefix to prepend to each output file name.
    Output: Splits the BAM based on the BC SAM tag value, writing one '<output_prefix>_<barcode>.bam' per barcode,
    where <barcode> is the text following 'barcode' in the tag value. Reads without a BC tag are reported and skipped.
    """
    # Output BAM files keyed by barcode; opened lazily on first use
    output_files = {}
    try:
        # Open the input BAM file for reading
        with pysam.AlignmentFile(input_bam, "rb") as bam:
            # Iterate over each read in the BAM file
            for read in bam:
                # Only the tag lookup is expected to raise KeyError, so keep the try body minimal.
                try:
                    bc_value = read.get_tag("BC", with_value_type=True)[0]
                except KeyError:
                    print(f"BC tag not present for read: {read.query_name}")
                    continue
                # Keep the text after 'barcode' as the barcode identifier
                bc_tag = bc_value.split('barcode')[1]
                # Open the output BAM file corresponding to the barcode
                if bc_tag not in output_files:
                    output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
                # Write the read to the corresponding output BAM file
                output_files[bc_tag].write(read)
    finally:
        # Close all output BAM files, even if reading or writing failed part-way through
        for output_file in output_files.values():
            output_file.close()
162
+
163
def count_aligned_reads(bam_file):
    """
    Tally mapped and unmapped reads in a BAM file.

    Input: A BAM alignment file.
    Output: A 3-tuple of (number of aligned reads, number of unaligned reads, record_counts).
        record_counts is a dictionary keyed by reference name. Each value is a tuple holding the
        integer number of reads mapped to that reference, followed by the proportion of all
        mapped reads that map to that reference.
    """
    print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
    mapped_total = 0
    unmapped_total = 0
    # Raw per-reference tallies; proportions are attached after the full pass
    reads_per_reference = {}
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            if read.is_unmapped:
                unmapped_total += 1
                continue
            mapped_total += 1
            reference = read.reference_name
            reads_per_reference[reference] = reads_per_reference.get(reference, 0) + 1
    # Pair each count with its share of the mapped reads. The dictionary is empty when
    # nothing mapped, so the division never sees a zero denominator.
    record_counts = {
        reference: (n_reads, n_reads / mapped_total)
        for reference, n_reads in reads_per_reference.items()
    }
    return mapped_total, unmapped_total, record_counts
189
+
190
def extract_base_identity_at_coordinates(bam_file, chromosome, positions, max_reference_length):
    """
    Collect the read base observed at each reference position of interest.

    Input: A position sorted BAM file, chromosome name, position coordinate set, and reference
        length used to size the per-read list of base identities.
    Output: A dictionary, keyed by read name, that points to a list of length max_reference_length
        holding the base identity at each reference index.
        If the read does not contain that position (or the position is not in positions),
        the list holds 'N' at that index. Only the first alignment seen for a read name is used.
    """
    positions = set(positions)
    # key (read name) -> value (list of base identities indexed by reference coordinate)
    base_identities = {}
    # Open the position sorted BAM file
    print('{0}: Reading BAM file: {1}'.format(time_string(), bam_file))
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over every read in the bam that comes from the chromosome of interest
        print('{0}: Iterating over reads in bam'.format(time_string()))
        for read in bam.fetch(chromosome):
            # Skip any further alignments carrying an already-seen read name
            if read.query_name in base_identities:
                continue
            # Start from an all-'N' list and fill in the positions the read covers
            read_bases = ['N'] * max_reference_length
            base_identities[read.query_name] = read_bases
            query_sequence = read.query_sequence
            # Each pair is (0-based index into the read, 0-based index into the reference);
            # either side is None where the alignment has a gap on that side.
            for read_position, reference_position in read.get_aligned_pairs():
                # Compare against None explicitly: read index 0 is a valid aligned base and
                # must not be treated as "unmapped" by a truthiness check.
                if read_position is not None and reference_position in positions:
                    read_bases[reference_position] = query_sequence[read_position]
    return base_identities
220
+
221
+ # Conversion SMF specific
222
def binarize_converted_base_identities(base_identities, strand, modification_type):
    """
    Convert per-read base identities into a binary methylation encoding.

    Input: The base identities dictionary returned by extract_base_identity_at_coordinates,
        the strand ('top' or 'bottom') and the modification type ('5mC' or '6mA').
    Output: A binarized format of the dictionary, where 1 represents a methylated site.
        0 represents an unmethylated site. NaN represents a site that does not carry SMF information.
        An empty dictionary is returned for any other strand / modification_type combination.
    """
    # (strand, modification) -> (base read as methylated, base read as unmethylated)
    informative_bases = {
        ('top', '5mC'): ('C', 'T'),
        ('top', '6mA'): ('A', 'G'),
        ('bottom', '5mC'): ('G', 'A'),
        ('bottom', '6mA'): ('T', 'C'),
    }
    # The strand and modification are the same for every read, so resolve them once
    base_pair = informative_bases.get((strand, modification_type))
    if base_pair is None:
        return {}
    methylated_base, unmethylated_base = base_pair
    return {
        read_name: [
            1 if base == methylated_base else 0 if base == unmethylated_base else np.nan
            for base in bases
        ]
        for read_name, bases in base_identities.items()
    }
243
+
244
+ # Direct methylation specific
245
+
246
+ ######################################################################################################
247
+
248
+ ######################################################################################################
249
+ # String encodings
250
def one_hot_encode(sequence):
    """
    One hot encode a nucleotide sequence.

    Input: A sequence string of a read.
    Output: One hot encoding of the sequence string, as an integer matrix with one row per
        base and five columns ordered A, C, G, T, N.
    Raises: KeyError for any character other than A, C, G, T or N.
    """
    column_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    n_bases = len(sequence)
    # Resolve the column of every base up front; unknown characters raise KeyError here
    columns = np.array([column_of_base[base] for base in sequence], dtype=np.intp)
    encoded = np.zeros((n_bases, 5), dtype=int)
    # Set one cell per row: row i gets a 1 in the column of base i
    encoded[np.arange(n_bases), columns] = 1
    return encoded
260
+ ######################################################################################################