smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +1 -1
  3. smftools/_version.py +1 -0
  4. smftools/datasets/datasets.py +11 -9
  5. smftools/informatics/__init__.py +8 -7
  6. smftools/informatics/bam_conversion.py +47 -0
  7. smftools/informatics/bam_direct.py +49 -0
  8. smftools/informatics/basecalls_to_adata.py +42 -0
  9. smftools/informatics/fast5_to_pod5.py +19 -0
  10. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  11. smftools/informatics/helpers/__init__.py +4 -4
  12. smftools/informatics/helpers/align_and_sort_BAM.py +52 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +10 -3
  14. smftools/informatics/helpers/canoncall.py +12 -1
  15. smftools/informatics/helpers/converted_BAM_to_adata.py +30 -13
  16. smftools/informatics/helpers/count_aligned_reads.py +12 -5
  17. smftools/informatics/helpers/extract_base_identities.py +13 -6
  18. smftools/informatics/helpers/extract_mods.py +17 -5
  19. smftools/informatics/helpers/find_conversion_sites.py +15 -9
  20. smftools/informatics/helpers/generate_converted_FASTA.py +49 -29
  21. smftools/informatics/helpers/get_native_references.py +10 -7
  22. smftools/informatics/helpers/make_dirs.py +9 -3
  23. smftools/informatics/helpers/make_modbed.py +10 -4
  24. smftools/informatics/helpers/modQC.py +10 -2
  25. smftools/informatics/helpers/modcall.py +13 -1
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +25 -13
  27. smftools/informatics/helpers/one_hot_encode.py +8 -3
  28. smftools/informatics/helpers/separate_bam_by_bc.py +18 -5
  29. smftools/informatics/helpers/split_and_index_BAM.py +18 -10
  30. smftools/informatics/pod5_conversion.py +34 -7
  31. smftools/informatics/pod5_direct.py +31 -5
  32. smftools/informatics/pod5_to_adata.py +31 -8
  33. smftools/informatics/readwrite.py +13 -16
  34. smftools/informatics/subsample_pod5.py +48 -0
  35. smftools/preprocessing/__init__.py +0 -6
  36. smftools/preprocessing/append_C_context.py +15 -8
  37. smftools/preprocessing/binarize_on_Youden.py +8 -4
  38. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  39. smftools/preprocessing/calculate_complexity.py +26 -14
  40. smftools/preprocessing/calculate_converted_read_methylation_stats.py +12 -5
  41. smftools/preprocessing/calculate_coverage.py +13 -7
  42. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  43. smftools/preprocessing/calculate_position_Youden.py +21 -12
  44. smftools/preprocessing/calculate_read_length_stats.py +11 -6
  45. smftools/preprocessing/clean_NaN.py +12 -5
  46. smftools/preprocessing/filter_converted_reads_on_methylation.py +12 -5
  47. smftools/preprocessing/filter_reads_on_length.py +13 -5
  48. smftools/preprocessing/invert_adata.py +9 -5
  49. smftools/preprocessing/mark_duplicates.py +20 -11
  50. smftools/preprocessing/min_non_diagonal.py +9 -4
  51. smftools/preprocessing/remove_duplicates.py +9 -3
  52. smftools/readwrite.py +13 -16
  53. smftools-0.1.1.dist-info/METADATA +88 -0
  54. smftools-0.1.1.dist-info/RECORD +64 -0
  55. smftools/informatics/helpers/align_BAM.py +0 -49
  56. smftools/informatics/helpers/load_experiment_config.py +0 -17
  57. smftools-0.1.0.dist-info/METADATA +0 -75
  58. smftools-0.1.0.dist-info/RECORD +0 -58
  59. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  60. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  61. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  62. {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/WHEEL +0 -0
  63. {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,22 +1,28 @@
1
1
  ## find_conversion_sites
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
- def find_conversion_sites(fasta_file, modification_type):
3
+ def find_conversion_sites(fasta_file, modification_type, conversion_types):
9
4
  """
10
5
  A function to find genomic coordinates in every unconverted record contained within a FASTA file of every cytosine.
11
6
  If searching for adenine conversions, it will find coordinates of all adenines.
12
- Input: A FASTA file and the modification_types of interest
7
+
8
+ Parameters:
9
+ fasta_file (str): A string representing the file path to the unconverted reference FASTA.
10
+ modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
11
+ conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
12
+
13
13
  Returns:
14
- A dictionary called record_dict, which is keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
14
+ record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string
15
15
  """
16
+ from .. import readwrite
17
+ from Bio import SeqIO
18
+ from Bio.SeqRecord import SeqRecord
19
+ from Bio.Seq import Seq
20
+
16
21
  print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
17
22
  # Initialize lists to hold top and bottom strand positional coordinates of interest
18
23
  top_strand_coordinates = []
19
24
  bottom_strand_coordinates = []
25
+ unconverted = conversion_types[0]
20
26
  record_dict = {}
21
27
  print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
22
28
  # Open the FASTA record as read only
@@ -24,7 +30,7 @@ def find_conversion_sites(fasta_file, modification_type):
24
30
  # Iterate over records in the FASTA
25
31
  for record in SeqIO.parse(f, "fasta"):
26
32
  # Only iterate over the unconverted records for the reference
27
- if 'unconverted' in record.id:
33
+ if unconverted in record.id:
28
34
  print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
29
35
  # Extract the sequence string of the record
30
36
  sequence = str(record.seq).upper()
@@ -1,14 +1,16 @@
1
1
  ## generate_converted_FASTA
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
- def convert_FASTA_record(record, modification_type, strand):
3
+ def convert_FASTA_record(record, modification_type, strand, unconverted):
9
4
  """
10
- Input: Takes a FASTA record, modification type, and strand as input
11
- Output: Returns a new seqrecord object with the conversions of interest
5
+ Takes a FASTA record and converts every instance of a base to the converted state.
6
+
7
+ Parameters:
8
+ record (str): The name of the record instance within the FASTA.
9
+ modification_type (str): The modification type to convert for (options are '5mC' and '6mA').
10
+ strand (str): The strand that is being converted in the experiment (options are 'top' and 'bottom').
11
+ Returns:
12
+ new_seq (str): Converted sequence string.
13
+ new_id (str): Record id for the converted sequence string.
12
14
  """
13
15
  if modification_type == '5mC':
14
16
  if strand == 'top':
@@ -18,7 +20,8 @@ def convert_FASTA_record(record, modification_type, strand):
18
20
  # Replace every 'G' with 'A' in the sequence
19
21
  new_seq = record.seq.upper().replace('G', 'A')
20
22
  else:
21
- print('need to provide a valid strand string: top or bottom')
23
+ print('need to provide a valid strand string: top or bottom')
24
+ new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
22
25
  elif modification_type == '6mA':
23
26
  if strand == 'top':
24
27
  # Replace every 'A' with 'G' in the sequence
@@ -28,32 +31,49 @@ def convert_FASTA_record(record, modification_type, strand):
28
31
  new_seq = record.seq.upper().replace('T', 'C')
29
32
  else:
30
33
  print('need to provide a valid strand string: top or bottom')
31
- elif modification_type == 'unconverted':
34
+ new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
35
+ elif modification_type == unconverted:
32
36
  new_seq = record.seq.upper()
37
+ new_id = '{0}_{1}_top'.format(record.id, modification_type)
33
38
  else:
34
- print('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
35
- new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
36
- # Return a new SeqRecord with modified sequence and ID
39
+ print(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')
40
+
41
+ return new_seq, new_id
37
42
 
38
43
  def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
39
44
  """
40
- Input: Takes an input FASTA, modification types of interest, strands of interest, and an output FASTA name
41
- Output: Writes out a new fasta with all stranded conversions
42
- Notes: Uses modify_sequence_and_id function on every record within the FASTA
45
+ Uses the convert_FASTA_record function on every record within the FASTA to write out a converted FASTA.
46
+
47
+ Parameters:
48
+ input_FASTA (str): A string representing the path to the unconverted FASTA file.
49
+ modification_types (list): A list of modification types to use in the experiment.
50
+ strands (list): A list of conversion strands to use in the experiment.
51
+ output_FASTA (str): A string representing the path to the converted FASTA output file.
52
+ Returns:
53
+ None
54
+ Writes out a converted FASTA reference for the experiment.
43
55
  """
56
+ from .. import readwrite
57
+ from Bio import SeqIO
58
+ from Bio.SeqRecord import SeqRecord
59
+ from Bio.Seq import Seq
60
+ modified_records = []
61
+ unconverted = modification_types[0]
62
+ # Iterate over each record in the input FASTA
63
+ for record in SeqIO.parse(input_fasta, 'fasta'):
64
+ record_description = record.description
65
+ # Iterate over each modification type of interest
66
+ for modification_type in modification_types:
67
+ # Iterate over the strands of interest
68
+ for i, strand in enumerate(strands):
69
+ if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
70
+ pass
71
+ else:
72
+ # Add the modified record to the list of modified records
73
+ print(f'converting {modification_type} on the {strand} strand of record {record}')
74
+ new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
75
+ new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
76
+ modified_records.append(new_record)
44
77
  with open(output_fasta, 'w') as output_handle:
45
- modified_records = []
46
- # Iterate over each record in the input FASTA
47
- for record in SeqIO.parse(input_fasta, 'fasta'):
48
- # Iterate over each modification type of interest
49
- for modification_type in modification_types:
50
- # Iterate over the strands of interest
51
- for i, strand in enumerate(strands):
52
- if i > 0 and modification_type == 'unconverted': # This ensures that the unconverted only is added once and takes on the strand that is provided at the 0 index on strands.
53
- pass
54
- else:
55
- # Add the modified record to the list of modified records
56
- print(f'converting {modification_type} on the {strand} strand of record {record}')
57
- modified_records.append(convert_FASTA_record(record, modification_type, strand))
58
78
  # write out the concatenated FASTA file of modified sequences
59
79
  SeqIO.write(modified_records, output_handle, 'fasta')
@@ -1,17 +1,20 @@
1
1
  ## get_native_references
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
3
  # Direct methylation specific
9
4
  def get_native_references(fasta_file):
10
5
  """
11
- Input: A FASTA file
6
+ Makes a dictionary keyed by record id which points to the record length and record sequence.
7
+
8
+ Parameters:
9
+ fasta_file (str): A string representing the path to the FASTA file for the experiment.
10
+
12
11
  Returns:
13
- A dictionary called record_dict, which is keyed by record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) sequence of the record
12
+ None
14
13
  """
14
+ from .. import readwrite
15
+ from Bio import SeqIO
16
+ from Bio.SeqRecord import SeqRecord
17
+ from Bio.Seq import Seq
15
18
  record_dict = {}
16
19
  print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
17
20
  # Open the FASTA record as read only
@@ -1,12 +1,18 @@
1
1
  ## make_dirs
2
- import os
3
2
 
4
3
  # General
5
4
  def make_dirs(directories):
6
5
  """
7
- Input: Takes a list of file paths to make directories for
8
- Output: Makes each directory in the list if the directory doesn't already exist.
6
+ Takes a list of file paths and makes new directories if the directory does not already exist.
7
+
8
+ Parameters:
9
+ directories (list): A list of directories to make
10
+
11
+ Returns:
12
+ None
9
13
  """
14
+ import os
15
+
10
16
  for directory in directories:
11
17
  if not os.path.isdir(directory):
12
18
  os.mkdir(directory)
@@ -1,19 +1,25 @@
1
1
  ## make_modbed
2
- import os
3
- import subprocess
4
2
 
5
3
  # Direct SMF
6
4
  def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
7
5
  """
8
- Generating Barcode position methylation summaries starting from the overall BAM file that was direct output of dorado aligner
6
+ Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
7
+ Parameters:
8
+ aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
9
+
10
+ Returns:
11
+ None
9
12
  """
13
+ import os
14
+ import subprocess
15
+
10
16
  os.chdir(mod_bed_dir)
11
17
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
12
18
  command = [
13
19
  "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
14
20
  "--partition-tag", "BC",
15
21
  "--only-tabs",
16
- "--filter-threshold", filter_threshold,
22
+ "--filter-threshold", f'{filter_threshold}',
17
23
  "--mod-thresholds", f"m:{m5C_threshold}",
18
24
  "--mod-thresholds", f"a:{m6A_threshold}",
19
25
  "--mod-thresholds", f"h:{hm5C_threshold}"
@@ -1,17 +1,25 @@
1
1
  ## modQC
2
- import subprocess
3
2
 
4
3
  # Direct SMF
5
4
  def modQC(aligned_sorted_output, thresholds):
6
5
  """
7
6
  Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
8
7
  It is generally good to look at these parameters on positive and negative controls.
8
+
9
+ Parameters:
10
+ aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligner.
11
+ thresholds (list): A list of floats to pass for call thresholds.
12
+
13
+ Returns:
14
+ None
9
15
  """
16
+ import subprocess
17
+
10
18
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
11
19
  subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
12
20
  command = [
13
21
  "modkit", "summary", aligned_sorted_output,
14
- "--filter-threshold", filter_threshold,
22
+ "--filter-threshold", f"{filter_threshold}",
15
23
  "--mod-thresholds", f"m:{m5C_threshold}",
16
24
  "--mod-thresholds", f"a:{m6A_threshold}",
17
25
  "--mod-thresholds", f"h:{hm5C_threshold}"
@@ -1,11 +1,23 @@
1
1
  ## modcall
2
- import subprocess
3
2
 
4
3
  # Direct methylation specific
5
4
  def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
6
5
  """
7
6
  Wrapper function for dorado modified base calling.
7
+
8
+ Parameters:
9
+ model (str): a string representing the file path to the dorado basecalling model.
10
+ pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
11
+ barcode_kit (str): A string representing the barcoding kit used in the experiment.
12
+ mod_list (list): A list of modification types to use in the analysis.
13
+ bam (str): File path to the BAM file to output.
14
+ bam_suffix (str): The suffix to use for the BAM file.
15
+
16
+ Returns:
17
+ None
18
+ Outputs a BAM file holding the modified base calls output by the dorado basecaller.
8
19
  """
20
+ import subprocess
9
21
  output = bam + bam_suffix
10
22
  command = [
11
23
  "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
@@ -1,20 +1,31 @@
1
1
  ## modkit_extract_to_adata
2
- from .. import readwrite
3
- from .get_native_references import get_native_references
4
- from .count_aligned_reads import count_aligned_reads
5
- from .extract_base_identities import extract_base_identities
6
- from .one_hot_encode import one_hot_encode
7
- import pandas as pd
8
- import anndata as ad
9
- import os
10
- import gc
11
- import math
12
- import numpy as np
13
2
 
14
3
  def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
15
4
  """
16
-
5
+ Takes modkit extract outputs and organizes them into an adata object
6
+
7
+ Parameters:
8
+ fasta (str): File path to the reference genome to align to.
9
+ bam (str): File path to the aligned_sorted non-split modified BAM file
10
+ mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
11
+ experiment_name (str): A string to provide an experiment name to the output adata file.
12
+ mods (list): A list of strings of the modification types to use in the analysis.
13
+ batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
14
+
15
+ Returns:
16
+ None
17
17
  """
18
+ from .. import readwrite
19
+ from .get_native_references import get_native_references
20
+ from .count_aligned_reads import count_aligned_reads
21
+ from .extract_base_identities import extract_base_identities
22
+ from .one_hot_encode import one_hot_encode
23
+ import pandas as pd
24
+ import anndata as ad
25
+ import os
26
+ import gc
27
+ import math
28
+ import numpy as np
18
29
  ###################################################
19
30
  ### Get input tsv file names into a sorted list ###
20
31
  # List all files in the directory
@@ -56,7 +67,8 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
56
67
  delta_max_length = max_reference_length - current_reference_length
57
68
  sequence = reference_dict[record][1] + 'N'*delta_max_length
58
69
  # Get a dictionary of positional base identities keyed by read id
59
- base_identities = extract_base_identities(bam, record, current_reference_length, max_reference_length)
70
+ positions = range(current_reference_length)
71
+ base_identities = extract_base_identities(bam, record, positions, max_reference_length)
60
72
  # One hot encode the sequence string of the reads
61
73
  one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
62
74
  record_seq_dict[record] = (one_hot_reads, sequence)
@@ -1,12 +1,17 @@
1
1
  # one_hot_encode
2
- from .. import readwrite
3
2
 
4
3
  # String encodings
5
4
  def one_hot_encode(sequence):
6
5
  """
7
- Input: A sequence string of a read.
8
- Output: One hot encoding of the sequence string.
6
+ One hot encodes a sequence string.
7
+ Parameters:
8
+ sequence (str): A DNA sequence string.
9
+
10
+ Returns:
11
+ one_hot_matrix (ndarray): A numpy ndarray holding a vstacked one hot encoding of the input sequence string.
9
12
  """
13
+ import numpy as np
14
+
10
15
  mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
11
16
  one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
12
17
  for i, nucleotide in enumerate(sequence):
@@ -1,12 +1,25 @@
1
1
  ## separate_bam_by_bc
2
- import pysam
3
2
 
4
3
  # General
5
- def separate_bam_by_bc(input_bam, output_prefix):
4
+ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
6
5
  """
7
- Input: Takes a single BAM input. Also takes an output prefix to append to the output file.
8
- Output: Splits the BAM based on the BC SAM tag value.
6
+ Separates an input BAM file on the BC SAM tag values.
7
+
8
+ Parameters:
9
+ input_bam (str): File path to the BAM file to split.
10
+ output_prefix (str): A prefix to append to the output BAM.
11
+ bam_suffix (str): A suffix to add to the bam file.
12
+
13
+ Returns:
14
+ None
15
+ Writes out split BAM files.
9
16
  """
17
+ import pysam
18
+ import os
19
+
20
+ bam_base = os.path.basename(input_bam)
21
+ bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
22
+
10
23
  # Open the input BAM file for reading
11
24
  with pysam.AlignmentFile(input_bam, "rb") as bam:
12
25
  # Create a dictionary to store output BAM files
@@ -18,7 +31,7 @@ def separate_bam_by_bc(input_bam, output_prefix):
18
31
  bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
19
32
  # Open the output BAM file corresponding to the barcode
20
33
  if bc_tag not in output_files:
21
- output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
34
+ output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}", "wb", header=bam.header)
22
35
  # Write the read to the corresponding output BAM file
23
36
  output_files[bc_tag].write(read)
24
37
  except KeyError:
@@ -1,21 +1,29 @@
1
1
  ## split_and_index_BAM
2
- from .. import readwrite
3
- import os
4
- import subprocess
5
- import glob
6
- from .separate_bam_by_bc import separate_bam_by_bc
7
2
 
8
3
  def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
9
4
  """
10
- A wrapper function for splitting BAMS and indexing them
5
+ A wrapper function for splitting BAMS and indexing them.
6
+ Parameters:
7
+ aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
8
+ split_dir (str): A string representing the file path to the directory to split the BAMs into.
9
+ bam_suffix (str): A suffix to add to the bam file.
10
+
11
+ Returns:
12
+ None
13
+ Splits an input BAM file on barcode value and makes a BAM index file.
11
14
  """
15
+ from .. import readwrite
16
+ import os
17
+ import subprocess
18
+ import glob
19
+ from .separate_bam_by_bc import separate_bam_by_bc
20
+
12
21
  os.chdir(split_dir)
13
22
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
14
- file_prefix = readwrite.datestring()
15
- separate_bam_by_bc(aligned_sorted_output, file_prefix)
23
+ file_prefix = readwrite.date_string()
24
+ separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix)
16
25
  # Make a BAM index file for the BAMs in that directory
17
26
  bam_pattern = '*' + bam_suffix
18
27
  bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
19
28
  for input_file in bam_files:
20
- subprocess.run(["samtools", "index", input_file])
21
- print(f"Indexed {input_file}")
29
+ subprocess.run(["samtools", "index", input_file])
@@ -1,23 +1,50 @@
1
1
  ## pod5_conversion
2
- from .helpers import align_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
3
- import subprocess
4
2
 
5
3
  def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
6
4
  """
7
- Converts a POD5 file from a nanopore conversion SMF experiment to an adata object
5
+ Converts a POD5 file from a nanopore conversion SMF experiment to an adata object.
6
+
7
+ Parameters:
8
+ fasta (str): File path to the reference genome to align to.
9
+ output_directory (str): A file path to the directory to output all the analyses.
10
+ conversion_types (list): A list of strings of the conversion types to use in the analysis.
11
+ strands (list): A list of conversion strands to use in the experiment.
12
+ model (str): a string representing the file path to the dorado basecalling model.
13
+ pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
+ split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
+ barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
+ mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
17
+ experiment_name (str): A string to provide an experiment name to the output adata file.
18
+ bam_suffix (str): A suffix to add to the bam file.
19
+
20
+ Returns:
21
+ None
8
22
  """
9
- bam=f"{output_directory}/HAC_basecalls"
23
+ from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
24
+ import os
25
+ model_basename = os.path.basename(model)
26
+ model_basename = model_basename.replace('.', '_')
27
+ bam=f"{output_directory}/{model_basename}_canonical_basecalls"
10
28
  aligned_BAM=f"{bam}_aligned"
11
29
  aligned_sorted_BAM=f"{aligned_BAM}_sorted"
30
+
31
+ os.chdir(output_directory)
32
+
12
33
  # 1) Convert FASTA file
13
- converted_FASTA=fasta.split('.fa')[0]+'_converted.fasta'
14
- generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
34
+ fasta_basename = os.path.basename(fasta)
35
+ converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
36
+ converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
37
+ if os.path.exists(converted_FASTA):
38
+ print(converted_FASTA + ' already exists. Using existing converted FASTA.')
39
+ else:
40
+ generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
15
41
 
16
42
  # 2) Basecall from the input POD5 to generate a singular output BAM
17
43
  canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
18
44
 
19
45
  # 3) Align the BAM to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
20
- align_BAM(converted_FASTA, bam, bam_suffix)
46
+ input_BAM = bam + bam_suffix
47
+ align_and_sort_BAM(converted_FASTA, input_BAM, bam_suffix, output_directory)
21
48
 
22
49
  ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
23
50
  split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
@@ -1,24 +1,50 @@
1
1
  ## pod5_direct
2
- from .helpers import align_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM
3
2
 
4
3
  def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
5
4
  """
6
-
5
+ Converts a POD5 file from a nanopore native SMF experiment to an adata object.
6
+
7
+ Parameters:
8
+ fasta (str): File path to the reference genome to align to.
9
+ output_directory (str): A file path to the directory to output all the analyses.
10
+ mod_list (list): A list of strings of the modification types to use in the analysis.
11
+ model (str): a string representing the file path to the dorado basecalling model.
12
+ thresholds (list): A list of floats to pass for call thresholds.
13
+ pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
+ split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
+ barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
+ mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
17
+ experiment_name (str): A string to provide an experiment name to the output adata file.
18
+ bam_suffix (str): A suffix to add to the bam file.
19
+ batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
20
+
21
+ Returns:
22
+ None
7
23
  """
8
- bam=f"{output_directory}/HAC_mod_calls"
24
+ from .helpers import align_and_sort_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
25
+ import os
26
+ model_basename = os.path.basename(model)
27
+ model_basename = model_basename.replace('.', '_')
28
+ mod_string = "_".join(mod_list)
29
+ bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
9
30
  aligned_BAM=f"{bam}_aligned"
10
31
  aligned_sorted_BAM=f"{aligned_BAM}_sorted"
11
32
  mod_bed_dir=f"{output_directory}/split_mod_beds"
12
33
  mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
13
34
 
35
+ make_dirs([mod_bed_dir, mod_tsv_dir])
36
+
14
37
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
15
38
  mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
16
39
  mods = [mod_map[mod] for mod in mod_list]
17
40
 
41
+ os.chdir(output_directory)
42
+
18
43
  # 1) Basecall using dorado
19
44
  modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
20
- # 2) Align the BAM to the converted reference FASTA. Also make an index and a bed file of mapped reads
21
- align_BAM(fasta, bam, bam_suffix)
45
+ # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
46
+ input_BAM = bam + bam_suffix
47
+ align_and_sort_BAM(fasta, input_BAM, bam_suffix, output_directory)
22
48
  # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
23
49
  split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
24
50
  # 4) Using nanopore modkit to work with modified BAM files ###
@@ -1,17 +1,40 @@
1
1
  ## pod5_to_adata
2
- from .helpers import load_experiment_config
3
- from.pod5_direct import pod5_direct
4
- from.pod5_conversion import pod5_conversion
5
2
 
6
- def pod5_to_adata(config_path, ):
3
+ def pod5_to_adata(config_path):
7
4
  """
8
-
5
+ High-level function to call for converting raw sequencing data to an adata object.
6
+
7
+ Parameters:
8
+ config_path (str): A string representing the file path to the experiment configuration csv file.
9
+
10
+ Returns:
11
+ None
9
12
  """
13
+ from .helpers import LoadExperimentConfig, make_dirs
14
+ import os
15
+ bam_suffix = '.bam' # If different, change from here.
16
+ split_dir = 'split_BAMs' # If different, change from here.
17
+ strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
18
+ conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
19
+
10
20
  # Load experiment config parameters into global variables
11
- load_experiment_config(config_path)
21
+ experiment_config = LoadExperimentConfig(config_path)
22
+ var_dict = experiment_config.var_dict
23
+ for key, value in var_dict.items():
24
+ globals()[key] = value
25
+
26
+ conversions += conversion_types
27
+
28
+ split_path = os.path.join(output_directory, split_dir)
29
+ make_dirs([output_directory, split_path])
30
+ os.chdir(output_directory)
31
+
12
32
  if smf_modality == 'conversion':
13
- (fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
33
+ from .pod5_conversion import pod5_conversion
34
+ pod5_conversion(fasta, output_directory, conversions, strands, model, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
14
35
  elif smf_modality == 'direct':
15
- pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
36
+ from .pod5_direct import pod5_direct
37
+ thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
38
+ pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
16
39
  else:
17
40
  print("Error")