smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/align_and_sort_BAM.py
@@ -0,0 +1,48 @@
+ ## align_and_sort_BAM
+
+ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
+     """
+     A wrapper for running the dorado aligner and samtools functions.
+
+     Parameters:
+         fasta (str): File path to the reference genome to align to.
+         input (str): File path to the basecalled file to align. Works for .bam and .fastq files.
+         bam_suffix (str): The suffix to use for the BAM file.
+         output_directory (str): A file path to the directory to output all the analyses.
+
+     Returns:
+         None
+         The function writes out files for: 1) an aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) a BED file for the aligned_sorted BAM, 5) a text file containing read names in the aligned_sorted BAM.
+     """
+     import subprocess
+     import os
+     from .aligned_BAM_to_bed import aligned_BAM_to_bed
+     from .extract_readnames_from_BAM import extract_readnames_from_BAM
+     from .make_dirs import make_dirs
+     input_basename = os.path.basename(input)
+     input_suffix = '.' + input_basename.split('.')[1]
+
+     output_path_minus_suffix = os.path.join(output_directory, input_basename.split(input_suffix)[0])
+
+     aligned_BAM = f"{output_path_minus_suffix}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+     aligned_output = aligned_BAM + bam_suffix
+     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+
+     # Run the dorado aligner
+     subprocess.run(["dorado", "aligner", "--secondary", "no", fasta, input], stdout=open(aligned_output, "w"))
+
+     # Sort the BAM on positional coordinates
+     subprocess.run(["samtools", "sort", "-o", aligned_sorted_output, aligned_output])
+
+     # Create a BAM index file
+     subprocess.run(["samtools", "index", aligned_sorted_output])
+
+     # Make a BED file of coordinates for the BAM
+     plotting_dir = os.path.join(output_directory, 'coverage_and_readlength_histograms')
+     bed_dir = os.path.join(output_directory, 'read_alignment_coordinates')
+     make_dirs([plotting_dir, bed_dir])
+     aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)
+
+     # Make a text file of read names for the BAM
+     extract_readnames_from_BAM(aligned_sorted_output)
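For orientation, a minimal sketch of how this new helper might be invoked; the paths below are hypothetical, and dorado plus samtools are assumed to be on the PATH:

from smftools.informatics.helpers.align_and_sort_BAM import align_and_sort_BAM

# Hypothetical invocation; align_and_sort_BAM shells out to `dorado aligner` and samtools.
align_and_sort_BAM(
    fasta="/refs/genome.fa",            # reference genome to align against
    input="/runs/basecalls.bam",        # basecalled BAM (or FASTQ) to align
    bam_suffix=".bam",                  # suffix appended to the output BAM names
    output_directory="/runs/analysis",  # receives BAMs, BED files, and histogram plots
)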
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -0,0 +1,73 @@
+ # aligned_BAM_to_bed
+
+ def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
+     """
+     Takes an aligned BAM as input and writes a BED file of reads as output.
+     BED columns are: record name, start position, end position, read length, read name.
+
+     Parameters:
+         aligned_BAM (str): Path to an input aligned BAM to extract to a BED file.
+         plotting_dir (str): Path to write out read alignment length and coverage histograms.
+         bed_dir (str): Path to write out read alignment coordinates.
+         fasta (str): File path to the reference genome to align to.
+
+     Returns:
+         None
+     """
+     import subprocess
+     import os
+     from .bed_to_bigwig import bed_to_bigwig
+     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
+
+     bed_output_basename = os.path.basename(aligned_BAM).split('.bam')[0] + '_bed.bed'
+     bed_output = os.path.join(bed_dir, bed_output_basename)
+
+     samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+     with open(bed_output, "w") as output_file:
+         awk_process = subprocess.Popen(["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'], stdin=samtools_view.stdout, stdout=output_file)
+         samtools_view.stdout.close()
+         awk_process.wait()
+         samtools_view.wait()
+
+     def split_bed(bed, delete_input=True):
+         """
+         Reads in a BED file and splits it into two separate BED files based on alignment status.
+
+         Parameters:
+             bed (str): Path to the input BED file.
+             delete_input (bool): Whether to delete the input BED file.
+
+         Returns:
+             aligned (str): Path to the aligned BED file.
+         """
+         unaligned = bed.split('.bed')[0] + '_unaligned.bed'
+         aligned = bed.split('.bed')[0] + '_aligned.bed'
+
+         with open(bed, 'r') as infile, \
+              open(unaligned, 'w') as unaligned_outfile, \
+              open(aligned, 'w') as aligned_outfile:
+
+             for line in infile:
+                 fields = line.strip().split('\t')
+
+                 if fields[0] == '*':
+                     unaligned_outfile.write(line)
+                 else:
+                     aligned_outfile.write(line)
+
+         if delete_input:
+             os.remove(bed)
+
+         return aligned
+
+     aligned_bed = split_bed(bed_output)
+
+     # Write out basic plots of reference coverage and read lengths
+     plot_read_length_and_coverage_histograms(aligned_bed, plotting_dir)
+
+     # Make a bedgraph and bigwig for the aligned reads
+     bed_to_bigwig(fasta, aligned_bed)
+
+
+
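The samtools view | awk pipe above emits one BED row per alignment. As a sketch of the same column mapping in pure pysam (not part of the package; note that awk's $4 is the 1-based SAM POS field, while pysam coordinates are 0-based):

import pysam

# Sketch: reproduce the awk-emitted BED row with pysam instead of a shell pipe.
with pysam.AlignmentFile("aligned_sorted.bam", "rb") as bam, open("reads.bed", "w") as out:
    for aln in bam.fetch(until_eof=True):
        rname = aln.reference_name or "*"  # awk $3 (RNAME; '*' if unmapped)
        pos = aln.reference_start + 1      # awk $4 (POS; convert 0-based to 1-based)
        read_len = aln.query_length        # awk length($10)
        out.write(f"{rname}\t{pos}\t{pos + read_len - 1}\t{read_len - 1}\t{aln.query_name}\n")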
smftools/informatics/helpers/bed_to_bigwig.py
@@ -0,0 +1,39 @@
+ # bed_to_bigwig
+
+ def bed_to_bigwig(fasta, bed):
+     """
+     Takes a BED file of reads and makes a bedgraph plus a bigwig.
+
+     Parameters:
+         fasta (str): File path to the reference genome to align to.
+         bed (str): File path to the input BED.
+
+     Returns:
+         None
+     """
+     import os
+     import subprocess
+
+     bed_basename = os.path.basename(bed)
+     parent_dir = os.path.dirname(bed)
+     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
+     fasta_basename = os.path.basename(fasta)
+     fasta_dir = os.path.dirname(fasta)
+     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
+     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
+     chrom_path = os.path.join(fasta_dir, chrom_basename)
+     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
+     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
+     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
+     bigwig_output = os.path.join(parent_dir, bigwig_basename)
+
+     # Make the bedgraph
+     with open(bedgraph_output, 'w') as outfile:
+         # Command as a list
+         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
+         print(f'Making bedgraph from {bed_basename}')
+         subprocess.run(command, stdout=outfile)
+
+     # Make the bigwig
+     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
+     print(f'Making bigwig from {bedgraph_basename}')
+     subprocess.run(command)
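Note that bed_to_bigwig expects a <reference>.chrom.sizes file to already sit beside the FASTA (the new get_chromosome_lengths helper presumably writes it). If one needed to produce that file by hand, a common approach is to derive it from a samtools .fai index; a sketch with illustrative paths:

import subprocess

fasta = "/refs/genome.fa"  # illustrative path
subprocess.run(["samtools", "faidx", fasta], check=True)  # writes /refs/genome.fa.fai

# The first two .fai columns are exactly what a .chrom.sizes file needs: name<TAB>length.
with open(fasta + ".fai") as fai, open("/refs/genome.chrom.sizes", "w") as out:
    for line in fai:
        name, length = line.split("\t")[:2]
        out.write(f"{name}\t{length}\n")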
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,11 +1,18 @@
  ## binarize_converted_base_identities
- import numpy as np
  # Conversion SMF specific
  def binarize_converted_base_identities(base_identities, strand, modification_type):
      """
-     Input: The base identities dictionary returned by extract_base_identity_at_coordinates.
-     Output: A binarized format of the dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry SMF information.
+     Binarizes conversion SMF data within a sequence string.
+
+     Parameters:
+         base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
+         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
+         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
+
+     Returns:
+         binarized_base_identities (dict): A binarized dictionary, where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site that does not carry methylation information.
      """
+     import numpy as np
      binarized_base_identities = {}
      # Iterate over base identity keys to binarize the base identities
      for key in base_identities.keys():
@@ -20,5 +27,5 @@ def binarize_converted_base_identities(base_identities, strand, modification_type):
          elif modification_type == '6mA':
              binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
      else:
-         pass
+         print(f"{strand} not recognized")
      return binarized_base_identities
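To make the binarization concrete: the visible '6mA' branch maps 'T' to 1, 'C' to 0, and every other base to NaN. Applied to a hypothetical read, that comprehension behaves like this sketch:

import numpy as np

# The '6mA' branch shown above, applied to a hypothetical read:
read = ["T", "C", "A", "G", "T"]
binarized = [1 if x == "T" else 0 if x == "C" else np.nan for x in read]
# -> [1, 0, nan, nan, 1]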
smftools/informatics/helpers/canoncall.py
@@ -1,12 +1,25 @@
  ## canoncall
- import subprocess

  # Conversion SMF specific
  def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
      """
      Wrapper function for dorado canonical base calling.
+
+     Parameters:
+         model (str): A string representing the file path to the dorado basecalling model.
+         pod5_dir (str): A string representing the file path to the experiment directory containing the POD5 files.
+         barcode_kit (str): A string representing the barcoding kit used in the experiment.
+         bam (str): File path to the BAM file to output.
+         bam_suffix (str): The suffix to use for the BAM file.
+
+     Returns:
+         None
+         Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
      """
+     import subprocess
      output = bam + bam_suffix
      command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+     command_string = " ".join(command)
+     print(f"Running {command_string}\nto generate {output}")
      with open(output, "w") as outfile:
          subprocess.run(command, stdout=outfile)
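A hypothetical invocation of canoncall; the model path, POD5 directory, and kit name below are illustrative placeholders, not values from the diff:

from smftools.informatics.helpers.canoncall import canoncall

canoncall(
    model="/models/dna_r10.4.1_e8.2_400bps_sup@v4.2.0",  # dorado basecalling model (illustrative)
    pod5_dir="/runs/pod5",                               # directory of POD5 files
    barcode_kit="SQK-NBD114-24",                         # barcoding kit used in the run
    bam="/runs/canonical_basecalls",                     # output path stem
    bam_suffix=".bam",                                   # forms /runs/canonical_basecalls.bam
)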
smftools/informatics/helpers/complement_base_list.py
@@ -0,0 +1,21 @@
+ # complement_base_list
+
+ def complement_base_list(sequence):
+     """
+     Takes a list of DNA base identities and returns their complement.
+
+     Parameters:
+         sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).
+
+     Returns:
+         complement (list): A list of complementary DNA bases.
+     """
+     complement_mapping = {
+         'A': 'T',
+         'T': 'A',
+         'C': 'G',
+         'G': 'C',
+         'N': 'N'  # Handling ambiguous bases like 'N'
+     }
+
+     return [complement_mapping[base] for base in sequence]
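For example:

complement_base_list(['A', 'C', 'G', 'T', 'N'])  # -> ['T', 'G', 'C', 'A', 'N']

Bases outside the mapping (for example lowercase letters) would raise a KeyError.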
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -0,0 +1,54 @@
+ # concatenate_fastqs_to_bam
+
+ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+     """
+     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+
+     Parameters:
+         fastq_files (list): List of paths to demultiplexed FASTQ files.
+         output_bam (str): Path to the output BAM file.
+         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
+         gzip_suffix (str): Suffix to use for input gzip files (default: '.gz').
+
+     Returns:
+         None
+     """
+     import os
+     import pysam
+     import gzip
+     from Bio import SeqIO
+     from tqdm import tqdm
+
+     n_fastqs = len(fastq_files)
+
+     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
+         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
+             base_name = os.path.basename(fastq_file)
+             if n_fastqs > 1:
+                 if base_name.endswith('.fastq.gz'):
+                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
+                 elif base_name.endswith('.fq.gz'):
+                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
+                 elif base_name.endswith('.fastq'):
+                     barcode = base_name.split('_')[-1].replace('.fastq', '')
+                 elif base_name.endswith('.fq'):
+                     barcode = base_name.split('_')[-1].replace('.fq', '')
+                 else:
+                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
+
+             # Read the FASTQ file (handle gzipped and non-gzipped files)
+             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
+             with open_func(fastq_file, 'rt') as fq_in:
+                 for record in SeqIO.parse(fq_in, 'fastq'):
+                     # Create an unaligned BAM entry for each FASTQ record
+                     aln = pysam.AlignedSegment()
+                     aln.query_name = record.id
+                     aln.query_sequence = str(record.seq)
+                     aln.flag = 4  # Unmapped
+                     aln.query_qualities = record.letter_annotations["phred_quality"]  # Biopython yields integer Phred scores, which query_qualities accepts directly
+                     if n_fastqs > 1:
+                         # Add the barcode to the BC tag
+                         aln.set_tag(barcode_tag, barcode)
+                     # Write to BAM file
+                     bam_out.write(aln)
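A sketch of how this new helper might be called on demultiplexed output; the file names are hypothetical, and the barcode is taken from the last '_'-separated field of each name:

from smftools.informatics.helpers.concatenate_fastqs_to_bam import concatenate_fastqs_to_bam

fastqs = [
    "/runs/demux/sample_barcode01.fastq.gz",  # BC tag becomes 'barcode01'
    "/runs/demux/sample_barcode02.fastq.gz",  # BC tag becomes 'barcode02'
]
concatenate_fastqs_to_bam(fastqs, "/runs/unaligned.bam")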
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -1,147 +1,233 @@
  ## converted_BAM_to_adata
- from .. import readwrite
- from .binarize_converted_base_identities import binarize_converted_base_identities
- from .find_conversion_sites import find_conversion_sites
- from .count_aligned_reads import count_aligned_reads
- from .extract_base_identities import extract_base_identities
- from .one_hot_encode import one_hot_encode
- import pandas as pd
- import numpy as np
- import anndata as ad
- import os

  def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
      """
+     A wrapper function to take converted aligned_sorted_split BAM files and format the data into an anndata object.
+
+     Parameters:
+         converted_FASTA (str): A string representing the file path to the converted FASTA reference.
+         split_dir (str): A string representing the file path to the directory containing the converted aligned_sorted_split BAM files.
+         mapping_threshold (float): A value between 0 and 1 giving the minimal fraction of aligned reads that must map to a reference region. References with values above the threshold are included in the output adata.
+         experiment_name (str): A string to provide an experiment name to the output adata file.
+         conversion_types (list): A list of strings of the conversion types to use in the analysis.
+         bam_suffix (str): The suffix to use for the BAM file.

+     Returns:
+         None
+         Outputs a single gzipped adata object for the experiment.
      """
+     from .. import readwrite
+     from .binarize_converted_base_identities import binarize_converted_base_identities
+     from .find_conversion_sites import find_conversion_sites
+     from .count_aligned_reads import count_aligned_reads
+     from .extract_base_identities import extract_base_identities
+     from .make_dirs import make_dirs
+     from .ohe_batching import ohe_batching
+     import pandas as pd
+     import numpy as np
+     import anndata as ad
+     import os
+     from tqdm import tqdm
+     import gc
+
+     ##########################################################################################
+     ## Get file paths and make necessary directories. ##
      # Get all of the input BAM files
      files = os.listdir(split_dir)
-     # Change directory to the BAM directory
-     os.chdir(split_dir)
+     # Make output dirs
+     parent_dir = os.path.dirname(split_dir)
+     h5_dir = os.path.join(parent_dir, 'h5ads')
+     tmp_dir = os.path.join(parent_dir, 'tmp')
+     make_dirs([h5_dir, tmp_dir])
      # Filter file names that contain the search string in their filename and keep them in a list
      bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
      # Sort file list by names and print the list of file names
      bams.sort()
+     bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
      print(f'Found the following BAMS: {bams}')
      final_adata = None
+     ##########################################################################################
+
+     ##########################################################################################

-     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
+     ## need to fix this section
+     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) unconverted sequence string, 5) unconverted complement sequence
      modification_dict = {}
+     # Init a dict to be keyed by FASTA record that points to the sequence string of the unconverted record
+     record_FASTA_dict = {}
      # While populating the dictionary, also extract the longest sequence record in the input references
      max_reference_length = 0
      for conversion_type in conversion_types:
-         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type)
+         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) unconverted sequence string, 5) unconverted complement sequence
+         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
+         # Get the max reference length
          for record in modification_dict[conversion_type].keys():
              if modification_dict[conversion_type][record][0] > max_reference_length:
                  max_reference_length = modification_dict[conversion_type][record][0]

-     # Iterate over the experiment BAM files
-     for bam_index, bam in enumerate(bams):
-         # Give each bam a sample name
-         sample = bam.split(sep=bam_suffix)[0]
+             mod_type, strand = record.split('_')[-2:]
+
+             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+             unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
+             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+             delta_max_length = max_reference_length - current_reference_length
+             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+             complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
+             record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
+     ##########################################################################################
+
+     ##########################################################################################
+     bam_alignment_stats_dict = {}
+     records_to_analyze = []
+     for bam_index, bam in enumerate(bam_path_list):
+         bam_alignment_stats_dict[bam_index] = {}
          # look at aligned read proportions in the bam
          aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
          percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
-         print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
-         records_to_analyze = []
+         print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
+         bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
          # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
          for record in record_counts:
              print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
              if record_counts[record][1] >= mapping_threshold:
                  records_to_analyze.append(record)
-         print(f'Records to analyze: {records_to_analyze}')
-         # Iterate over records to analyze (ie all conversions detected)
-         record_FASTA_dict = {}
-         for record in records_to_analyze:
-             mod_type, strand = record.split('_')[-2:]
-             if strand == 'top':
-                 strand_index = 1
-             elif strand == 'bottom':
-                 strand_index = 2
+                 bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
+     records_to_analyze = set(records_to_analyze)
+     ##########################################################################################

-             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
-             unconverted_chromosome_name = chromosome + '_unconverted_top'
-             positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
-             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
-             delta_max_length = max_reference_length - current_reference_length
-             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
-             record_FASTA_dict[f'{record}'] = sequence
-             print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
+     ##########################################################################################
+     # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+     # Save the file paths in the bam_record_ohe_files dict.
+     bam_record_ohe_files = {}
+
+     # Iterate over split bams
+     for bam_index, bam in enumerate(bam_path_list):
+         # Iterate over references to process
+         for record in records_to_analyze:
+             unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+             sample = bams[bam_index].split(sep=bam_suffix)[0]
+             chromosome = record_FASTA_dict[unconverted_record_name][2]
+             current_reference_length = record_FASTA_dict[unconverted_record_name][4]
+             mod_type = record_FASTA_dict[unconverted_record_name][6]
+             strand = record_FASTA_dict[unconverted_record_name][7]
+
+             # Extract the base identities of reads aligned to the record
+             fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)

-             # Get a dictionary of positional identities keyed by read id
-             print(f'Extracting base identities of target positions')
-             target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
              # binarize the dictionary of positional identities
-             print(f'Binarizing base identities of target positions')
-             binarized_base_identities = binarize_converted_base_identities(target_base_identities, strand, mod_type)
+             print(f'Binarizing base identities')
+             fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
+             rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
+             merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
              # converts the base identity dictionary to a dataframe.
-             binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
+             binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
              sorted_index = sorted(binarized_base_identities_df.index)
              binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
-             # Get the sequence string of every read
-             print(f'Extracting base identities of all positions in each read')
-             all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
-             # One hot encode the sequence string of the reads
-             print(f'One hot encoding base identities of all positions in each read')
-             one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
-
-             # Initialize empty DataFrames for each base
-             read_names = list(one_hot_reads.keys())
-             sequence_length = one_hot_reads[read_names[0]].shape[0]
-             df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-             df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-
-             # Iterate through the dictionary and populate the DataFrames
-             for read_name, one_hot_array in one_hot_reads.items():
-                 df_A.loc[read_name] = one_hot_array[:, 0]
-                 df_C.loc[read_name] = one_hot_array[:, 1]
-                 df_G.loc[read_name] = one_hot_array[:, 2]
-                 df_T.loc[read_name] = one_hot_array[:, 3]
-                 df_N.loc[read_name] = one_hot_array[:, 4]
-
-             ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}

              # Load an anndata object with the sample data
              X = binarized_base_identities_df.values
              adata = ad.AnnData(X, dtype=X.dtype)
-             adata.obs_names = binarized_base_identities_df.index
-             adata.obs_names = adata.obs_names.astype(str)
-             adata.var_names = binarized_base_identities_df.columns
-             adata.var_names = adata.var_names.astype(str)
-             adata.obs['Sample'] = [sample] * len(adata)
-             adata.obs['Strand'] = [strand] * len(adata)
-             adata.obs['Dataset'] = [mod_type] * len(adata)
-             adata.obs['Reference'] = [record] * len(adata)
-             adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
-
-             for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-                 adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+             if adata.shape[0] > 0:
+                 adata.obs_names = binarized_base_identities_df.index.astype(str)
+                 adata.var_names = binarized_base_identities_df.columns.astype(str)
+                 adata.obs['Sample'] = [sample] * len(adata)
+                 adata.obs['Strand'] = [strand] * len(adata)
+                 adata.obs['Dataset'] = [mod_type] * len(adata)
+                 adata.obs['Reference'] = [record] * len(adata)
+                 adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+
+                 read_mapping_direction = []
+                 for read_id in adata.obs_names:
+                     if read_id in fwd_base_identities.keys():
+                         read_mapping_direction.append('fwd')
+                     elif read_id in rev_base_identities.keys():
+                         read_mapping_direction.append('rev')
+                     else:
+                         read_mapping_direction.append('unk')
+
+                 adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+                 # One hot encode the sequence string of the reads
+                 fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000)
+                 rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev", batch_size=100000)
+                 bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
+                 del fwd_base_identities, rev_base_identities
+
+                 one_hot_reads = {}
+                 n_rows_OHE = 5
+                 for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
+                     tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+                     one_hot_reads.update(tmp_ohe_dict)
+                     del tmp_ohe_dict
+
+                 read_names = list(one_hot_reads.keys())
+                 dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+
+                 sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                 df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                 df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+
+                 for read_name, one_hot_array in one_hot_reads.items():
+                     one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+                     dict_A[read_name] = one_hot_array[0, :]
+                     dict_C[read_name] = one_hot_array[1, :]
+                     dict_G[read_name] = one_hot_array[2, :]
+                     dict_T[read_name] = one_hot_array[3, :]
+                     dict_N[read_name] = one_hot_array[4, :]
+
+                 del one_hot_reads
+                 gc.collect()
+
+                 for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
+                     df_A.iloc[j] = dict_A[read_name]
+                     df_C.iloc[j] = dict_C[read_name]
+                     df_G.iloc[j] = dict_G[read_name]
+                     df_T.iloc[j] = dict_T[read_name]
+                     df_N.iloc[j] = dict_N[read_name]
+
+                 del dict_A, dict_C, dict_G, dict_T, dict_N
+                 gc.collect()
+
+                 ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+                 for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                     adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+                     ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
+
+                 if final_adata:
+                     if adata.shape[0] > 0:
+                         final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+                     else:
+                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+                 else:
+                     if adata.shape[0] > 0:
+                         final_adata = adata
+                     else:
+                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")

-             if final_adata:
-                 final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
              else:
-                 final_adata = adata
+                 print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+
+     # Set obs columns to type 'category'
+     for col in final_adata.obs.columns:
+         final_adata.obs[col] = final_adata.obs[col].astype('category')

-     for record in record_FASTA_dict.keys():
-         chromosome = record.split('_')[0]
-         sequence = record_FASTA_dict[record]
+     for record in records_to_analyze:
+         unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+         sequence = record_FASTA_dict[unconverted_record_name][0]
+         complement = record_FASTA_dict[unconverted_record_name][1]
+         chromosome = record_FASTA_dict[unconverted_record_name][2]
+         final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
+         final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
          final_adata.uns[f'{record}_FASTA_sequence'] = sequence
-         final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
-         record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
-         layer_map, layer_counts = {}, []
-         for i, layer in enumerate(record_subset.layers):
-             layer_map[i] = layer.split('_')[0]
-             layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
-         count_array = np.array(layer_counts)
-         nucleotide_indexes = np.argmax(count_array, axis=0)
-         consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
-         final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list

      ######################################################################################################

      ######################################################################################################
      ## Export the final adata object
-     final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(readwrite.date_string(), experiment_name), compression='gzip')
+     final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
+     final_adata.write_h5ad(final_output, compression='gzip')
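Finally, a hypothetical end-of-pipeline call for the reworked converted_BAM_to_adata; the argument values are illustrative (the record naming above suggests conversion_types leads with 'unconverted'), and the .h5ad.gz output lands in the new h5ads directory beside split_dir:

from smftools.informatics.helpers.converted_BAM_to_adata import converted_BAM_to_adata

converted_BAM_to_adata(
    converted_FASTA="/refs/genome_converted.fa",  # converted FASTA reference
    split_dir="/runs/aligned_sorted_split",       # directory of aligned_sorted_split BAMs
    mapping_threshold=0.05,                       # keep references with >= 5% of mapped reads
    experiment_name="F1_pilot",                   # used in the output file name
    conversion_types=["unconverted", "5mC"],      # illustrative; first entry names the unconverted records
    bam_suffix=".bam",
)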