smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,32 +1,43 @@
1
1
  ## count_aligned_reads
2
- from .. import readwrite
3
- # bioinformatic operations
4
- import pysam
5
2
 
6
3
  # General
7
4
  def count_aligned_reads(bam_file):
8
5
  """
9
- Input: A BAM alignment file.
10
- Output: The number of aligned/unaligned reads in the BAM file. Also returns a dictionary, keyed by reference id that points to a tuple. The tuple contains an integer number of mapped reads to that reference, followed by the proportion of mapped reads that map to that reference
6
+ Counts the number of aligned reads in a bam file that map to each reference record.
7
+
8
+ Parameters:
9
+ bam_file (str): A string representing the path to an aligned BAM file.
10
+
11
+ Returns:
12
+ aligned_reads_count (int): The total number of reads aligned in the BAM.
13
+ unaligned_reads_count (int): The total number of reads not aligned in the BAM.
14
+ record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
15
+
11
16
  """
17
+ from .. import readwrite
18
+ import pysam
19
+ from tqdm import tqdm
20
+ from collections import defaultdict
21
+
12
22
  print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
13
23
  aligned_reads_count = 0
14
24
  unaligned_reads_count = 0
15
25
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
16
- record_counts = {}
26
+ record_counts = defaultdict(int)
27
+
17
28
  with pysam.AlignmentFile(bam_file, "rb") as bam:
29
+ total_reads = bam.mapped + bam.unmapped
18
30
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
19
- for read in bam:
20
- if read.is_unmapped:
31
+ for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
32
+ if read.is_unmapped:
21
33
  unaligned_reads_count += 1
22
- else:
34
+ else:
23
35
  aligned_reads_count += 1
24
- if read.reference_name in record_counts:
25
- record_counts[read.reference_name] += 1
26
- else:
27
- record_counts[read.reference_name] = 1
36
+ record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
37
+
28
38
  # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
29
39
  for reference in record_counts:
30
40
  proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
31
41
  record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
32
- return aligned_reads_count, unaligned_reads_count, record_counts
42
+
43
+ return aligned_reads_count, unaligned_reads_count, dict(record_counts)
@@ -1,36 +1,57 @@
1
1
  ## extract_base_identities
2
- from .. import readwrite
3
- # bioinformatic operations
4
- import pysam
5
2
 
6
3
  # General
7
4
  def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
8
5
  """
9
- Input: A position sorted BAM file, chromosome number, position coordinate set, and reference length to extract the base identitity from the read.
10
- Output: A dictionary, keyed by read name, that points to a list of Base identities from each read.
11
- If the read does not contain that position, fill the list at that index with a N value.
6
+ Extracts the base identities from every position within the mapped reads that have a reference coordinate
7
+
8
+ Parameters:
9
+ bam_file (str): File path to the input BAM file.
10
+ chromosome (str): A string representing the name of the record within the reference FASTA.
11
+ positions (list): A list of position coordinates within the record to extract.
12
+ max_reference_length (int): The maximum length of a record in the reference set.
13
+
14
+ Returns:
15
+ fwd_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from forward mapped reads. If the read does not contain that position, fill the list at that index with a N value.
16
+ rev_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from reverse mapped reads. If the read does not contain that position, fill the list at that index with a N value.
12
17
  """
18
+ from .. import readwrite
19
+ import pysam
20
+ from tqdm import tqdm
21
+
13
22
  positions = set(positions)
14
23
  # Initialize a base identity dictionary that will hold key-value pairs that are: key (read-name) and value (list of base identities at positions of interest)
15
- base_identities = {}
24
+ fwd_base_identities = {}
25
+ rev_base_identities = {}
16
26
  # Open the postion sorted BAM file
17
27
  print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
18
28
  with pysam.AlignmentFile(bam_file, "rb") as bam:
19
29
  # Iterate over every read in the bam that comes from the chromosome of interest
20
30
  print('{0}: Iterating over reads in bam'.format(readwrite.time_string()))
21
- for read in bam.fetch(chromosome):
22
- if read.query_name in base_identities:
23
- pass
24
- #print('Duplicate read found in BAM for read {}. Skipping duplicate'.format(read.query_name))
25
- else:
26
- # Initialize the read key in the base_identities dictionary by pointing to a N filled list of length reference_length
27
- base_identities[read.query_name] = ['N'] * max_reference_length
28
- # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read start, as well the 0-based index relative to the reference.
29
- for read_position, reference_position in read.get_aligned_pairs():
30
- # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
31
- if reference_position in positions and read_position:
32
- # get the base_identity in the read corresponding to that position
33
- base_identity = read.query_sequence[read_position]
34
- # Add the base identity to array
35
- base_identities[read.query_name][reference_position] = base_identity
36
- return base_identities
31
+ total_reads = bam.mapped
32
+ for read in tqdm(bam.fetch(chromosome), desc='Extracting base identities from reads in BAM', total=total_reads):
33
+ # Only iterate over mapped reads
34
+ if read.is_mapped:
35
+ # Get sequence of read. PySam reports fwd mapped reads as the true read sequence. Pysam reports rev mapped reads as the reverse complement of the read.
36
+ query_sequence = read.query_sequence
37
+ # If the read aligned as a reverse complement, mark that the read is reversed
38
+ if read.is_reverse:
39
+ # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
40
+ rev_base_identities[read.query_name] = ['N'] * max_reference_length
41
+ # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
42
+ for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
43
+ # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
44
+ if reference_position in positions and read_position:
45
+ # get the base_identity in the read corresponding to that position
46
+ rev_base_identities[read.query_name][reference_position] = query_sequence[read_position]
47
+ else:
48
+ # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
49
+ fwd_base_identities[read.query_name] = ['N'] * max_reference_length
50
+ # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
51
+ for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
52
+ # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
53
+ if reference_position in positions and read_position:
54
+ # get the base_identity in the read corresponding to that position
55
+ fwd_base_identities[read.query_name][reference_position] = query_sequence[read_position]
56
+
57
+ return fwd_base_identities, rev_base_identities
@@ -1,13 +1,25 @@
1
1
  ## extract_mods
2
- import os
3
- import subprocess
4
- import glob
5
- import zipfile
6
2
 
7
3
  def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix):
8
4
  """
9
5
  Takes all of the aligned, sorted, split modified BAM files and runs Nanopore Modkit Extract to load the modification data into zipped TSV files
6
+
7
+ Parameters:
8
+ thresholds (list): A list of thresholds to use for marking each basecalled base as passing or failing on canonical and modification call status.
9
+ mod_tsv_dir (str): A string representing the file path to the directory to hold the modkit extract outputs.
10
+ split_dir (str): A string representing the file path to the directory containing the converted aligned_sorted_split BAM files.
11
+ bam_suffix (str): The suffix to use for the BAM file.
12
+
13
+ Returns:
14
+ None
15
+ Runs modkit extract on input aligned_sorted_split modified BAM files to output zipped TSVs containing modification calls.
16
+
10
17
  """
18
+ import os
19
+ import subprocess
20
+ import glob
21
+ import zipfile
22
+
11
23
  os.chdir(mod_tsv_dir)
12
24
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
13
25
  bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
@@ -23,7 +35,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix):
23
35
  # Run modkit extract
24
36
  subprocess.run([
25
37
  "modkit", "extract",
26
- "--filter-threshold", filter_threshold,
38
+ "--filter-threshold", f'{filter_threshold}',
27
39
  "--mod-thresholds", f"m:{m5C_threshold}",
28
40
  "--mod-thresholds", f"a:{m6A_threshold}",
29
41
  "--mod-thresholds", f"h:{hm5C_threshold}",
@@ -0,0 +1,22 @@
1
+ # extract_readnames_from_BAM
2
+
3
+ def extract_readnames_from_BAM(aligned_BAM):
4
+ """
5
+ Takes a BAM and writes out a txt file containing read names from the BAM
6
+
7
+ Parameters:
8
+ aligned_BAM (str): Path to an input aligned_BAM to extract read names from.
9
+
10
+ Returns:
11
+ None
12
+
13
+ """
14
+ import subprocess
15
+ # Make a text file of reads for the BAM
16
+ txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
17
+ samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
18
+ with open(txt_output, "w") as output_file:
19
+ cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
20
+ samtools_view.stdout.close()
21
+ cut_process.wait()
22
+ samtools_view.wait()
@@ -1,33 +1,40 @@
1
1
  ## find_conversion_sites
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
- def find_conversion_sites(fasta_file, modification_type):
3
+ def find_conversion_sites(fasta_file, modification_type, conversion_types):
9
4
  """
10
5
  A function to find genomic coordinates in every unconverted record contained within a FASTA file of every cytosine.
11
6
  If searching for adenine conversions, it will find coordinates of all adenines.
12
- Input: A FASTA file and the modification_types of interest
7
+
8
+ Parameters:
9
+ fasta_file (str): A string representing the file path to the converted reference FASTA.
10
+ modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
11
+ conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
12
+
13
13
  Returns:
14
- A dictionary called record_dict, which is keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
14
+ record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string, 5) Complement sequence
15
15
  """
16
- print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
16
+ from .. import readwrite
17
+ from Bio import SeqIO
18
+ from Bio.SeqRecord import SeqRecord
19
+ from Bio.Seq import Seq
20
+
21
+ #print('{0}: Finding positions of interest in reference FASTA: {1}'.format(readwrite.time_string(), fasta_file))
17
22
  # Initialize lists to hold top and bottom strand positional coordinates of interest
18
23
  top_strand_coordinates = []
19
24
  bottom_strand_coordinates = []
25
+ unconverted = conversion_types[0]
20
26
  record_dict = {}
21
- print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
27
+ #print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
22
28
  # Open the FASTA record as read only
23
29
  with open(fasta_file, "r") as f:
24
30
  # Iterate over records in the FASTA
25
31
  for record in SeqIO.parse(f, "fasta"):
26
32
  # Only iterate over the unconverted records for the reference
27
- if 'unconverted' in record.id:
28
- print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
33
+ if unconverted in record.id:
34
+ #print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
29
35
  # Extract the sequence string of the record
30
36
  sequence = str(record.seq).upper()
37
+ complement = str(record.seq.complement()).upper()
31
38
  sequence_length = len(sequence)
32
39
  if modification_type == '5mC':
33
40
  # Iterate over the sequence string from the record
@@ -36,7 +43,7 @@ def find_conversion_sites(fasta_file, modification_type):
36
43
  top_strand_coordinates.append(i) # 0-indexed coordinate
37
44
  if sequence[i] == 'G':
38
45
  bottom_strand_coordinates.append(i) # 0-indexed coordinate
39
- print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
46
+ #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
40
47
  elif modification_type == '6mA':
41
48
  # Iterate over the sequence string from the record
42
49
  for i in range(0, len(sequence)):
@@ -44,10 +51,11 @@ def find_conversion_sites(fasta_file, modification_type):
44
51
  top_strand_coordinates.append(i) # 0-indexed coordinate
45
52
  if sequence[i] == 'T':
46
53
  bottom_strand_coordinates.append(i) # 0-indexed coordinate
47
- print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
54
+ #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
48
55
  else:
49
- print('modification_type not found. Please try 5mC or 6mA')
50
- record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
56
+ #print('modification_type not found. Please try 5mC or 6mA')
57
+ pass
58
+ record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
51
59
  else:
52
60
  pass
53
61
  return record_dict
@@ -1,14 +1,16 @@
1
1
  ## generate_converted_FASTA
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
- def convert_FASTA_record(record, modification_type, strand):
3
+ def convert_FASTA_record(record, modification_type, strand, unconverted):
9
4
  """
10
- Input: Takes a FASTA record, modification type, and strand as input
11
- Output: Returns a new seqrecord object with the conversions of interest
5
+ Takes a FASTA record and converts every instance of a base to the converted state.
6
+
7
+ Parameters:
8
+ record (str): The name of the record instance within the FASTA.
9
+ modification_type (str): The modification type to convert for (options are '5mC' and '6mA').
10
+ strand (str): The strand that is being converted in the experiment (options are 'top' and 'bottom').
11
+ Returns:
12
+ new_seq (str): Converted sequence string.
13
+ new_id (str): Record id for the converted sequence string.
12
14
  """
13
15
  if modification_type == '5mC':
14
16
  if strand == 'top':
@@ -18,7 +20,8 @@ def convert_FASTA_record(record, modification_type, strand):
18
20
  # Replace every 'G' with 'A' in the sequence
19
21
  new_seq = record.seq.upper().replace('G', 'A')
20
22
  else:
21
- print('need to provide a valid strand string: top or bottom')
23
+ print('need to provide a valid strand string: top or bottom')
24
+ new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
22
25
  elif modification_type == '6mA':
23
26
  if strand == 'top':
24
27
  # Replace every 'A' with 'G' in the sequence
@@ -28,32 +31,68 @@ def convert_FASTA_record(record, modification_type, strand):
28
31
  new_seq = record.seq.upper().replace('T', 'C')
29
32
  else:
30
33
  print('need to provide a valid strand string: top or bottom')
31
- elif modification_type == 'unconverted':
34
+ new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
35
+ elif modification_type == unconverted:
32
36
  new_seq = record.seq.upper()
37
+ new_id = '{0}_{1}_top'.format(record.id, modification_type)
33
38
  else:
34
- print('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
35
- new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
36
- # Return a new SeqRecord with modified sequence and ID
39
+ print(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')
40
+
41
+ return new_seq, new_id
37
42
 
38
43
  def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
39
44
  """
40
- Input: Takes an input FASTA, modification types of interest, strands of interest, and an output FASTA name
41
- Output: Writes out a new fasta with all stranded conversions
42
- Notes: Uses modify_sequence_and_id function on every record within the FASTA
45
+ Uses modify_sequence_and_id function on every record within the FASTA to write out a converted FASTA.
46
+
47
+ Parameters:
48
+ input_FASTA (str): A string representing the path to the unconverted FASTA file.
49
+ modification_types (list): A list of modification types to use in the experiment.
50
+ strands (list): A list of conversion strands to use in the experiment.
51
+ output_FASTA (str): A string representing the path to the converted FASTA output file.
52
+ Returns:
53
+ None
54
+ Writes out a converted FASTA reference for the experiment.
43
55
  """
44
- with open(output_fasta, 'w') as output_handle:
45
- modified_records = []
46
- # Iterate over each record in the input FASTA
56
+ from .. import readwrite
57
+ from Bio import SeqIO
58
+ from Bio.SeqRecord import SeqRecord
59
+ from Bio.Seq import Seq
60
+ import gzip
61
+ modified_records = []
62
+ unconverted = modification_types[0]
63
+ # Iterate over each record in the input FASTA
64
+ if '.gz' in input_fasta:
65
+ with gzip.open(input_fasta, 'rt') as handle:
66
+ for record in SeqIO.parse(handle, 'fasta'):
67
+ record_description = record.description
68
+ # Iterate over each modification type of interest
69
+ for modification_type in modification_types:
70
+ # Iterate over the strands of interest
71
+ for i, strand in enumerate(strands):
72
+ if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
73
+ pass
74
+ else:
75
+ # Add the modified record to the list of modified records
76
+ print(f'converting {modification_type} on the {strand} strand of record {record}')
77
+ new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
78
+ new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
79
+ modified_records.append(new_record)
80
+ else:
47
81
  for record in SeqIO.parse(input_fasta, 'fasta'):
82
+ record_description = record.description
48
83
  # Iterate over each modification type of interest
49
84
  for modification_type in modification_types:
50
85
  # Iterate over the strands of interest
51
86
  for i, strand in enumerate(strands):
52
- if i > 0 and modification_type == 'unconverted': # This ensures that the unconverted only is added once and takes on the strand that is provided at the 0 index on strands.
87
+ if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
53
88
  pass
54
89
  else:
55
90
  # Add the modified record to the list of modified records
56
91
  print(f'converting {modification_type} on the {strand} strand of record {record}')
57
- modified_records.append(convert_FASTA_record(record, modification_type, strand))
92
+ new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
93
+ new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
94
+ modified_records.append(new_record)
95
+
96
+ with open(output_fasta, 'w') as output_handle:
58
97
  # write out the concatenated FASTA file of modified sequences
59
98
  SeqIO.write(modified_records, output_handle, 'fasta')
@@ -0,0 +1,32 @@
1
+ # get_chromosome_lengths
2
+
3
+ def get_chromosome_lengths(fasta):
4
+ """
5
+ Generates a file containing chromosome lengths within an input FASTA.
6
+
7
+ Parameters:
8
+ fasta (str): Path to the input fasta
9
+ """
10
+ import os
11
+ import subprocess
12
+ from .index_fasta import index_fasta
13
+
14
+ # Make a fasta index file if one isn't already available
15
+ index_path = f'{fasta}.fai'
16
+ if os.path.exists(index_path):
17
+ print(f'Using existing fasta index file: {index_path}')
18
+ else:
19
+ index_fasta(fasta)
20
+
21
+ parent_dir = os.path.dirname(fasta)
22
+ fasta_basename = os.path.basename(fasta)
23
+ chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
24
+ chrom_path = os.path.join(parent_dir, chrom_basename)
25
+
26
+ # Make a chromosome length file
27
+ if os.path.exists(chrom_path):
28
+ print(f'Using existing chrom length index file: {chrom_path}')
29
+ else:
30
+ with open(chrom_path, 'w') as outfile:
31
+ command = ["cut", "-f1,2", index_path]
32
+ subprocess.run(command, stdout=outfile)
@@ -1,17 +1,20 @@
1
1
  ## get_native_references
2
- from .. import readwrite
3
- # bioinformatic operations
4
- from Bio import SeqIO
5
- from Bio.SeqRecord import SeqRecord
6
- from Bio.Seq import Seq
7
2
 
8
3
  # Direct methylation specific
9
4
  def get_native_references(fasta_file):
10
5
  """
11
- Input: A FASTA file
6
+ Makes a dictionary keyed by record id which points to the record length and record sequence.
7
+
8
+ Parameters:
9
+ fasta_file (str): A string representing the path to the FASTA file for the experiment.
10
+
12
11
  Returns:
13
- A dictionary called record_dict, which is keyed by record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) sequence of the record
12
+ record_dict (dict): A dictionary keyed by record ids contained within the FASTA, pointing to a list containing: 1) sequence length of the record, 2) sequence of the record.
14
13
  """
14
+ from .. import readwrite
15
+ from Bio import SeqIO
16
+ from Bio.SeqRecord import SeqRecord
17
+ from Bio.Seq import Seq
15
18
  record_dict = {}
16
19
  print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
17
20
  # Open the FASTA record as read only
@@ -0,0 +1,12 @@
1
+ # index_fasta
2
+
3
+ def index_fasta(fasta):
4
+ """
5
+ Generate a FASTA index file for an input fasta.
6
+
7
+ Parameters:
8
+ fasta (str): Path to the input fasta to make an index file for.
9
+ """
10
+ import subprocess
11
+
12
+ subprocess.run(["samtools", "faidx", fasta])
@@ -1,12 +1,18 @@
1
1
  ## make_dirs
2
- import os
3
2
 
4
3
  # General
5
4
  def make_dirs(directories):
6
5
  """
7
- Input: Takes a list of file paths to make directories for
8
- Output: Makes each directory in the list if the directory doesn't already exist.
6
+ Takes a list of file paths and makes new directories if the directory does not already exist.
7
+
8
+ Parameters:
9
+ directories (list): A list of directories to make
10
+
11
+ Returns:
12
+ None
9
13
  """
14
+ import os
15
+
10
16
  for directory in directories:
11
17
  if not os.path.isdir(directory):
12
18
  os.mkdir(directory)
@@ -1,19 +1,25 @@
1
1
  ## make_modbed
2
- import os
3
- import subprocess
4
2
 
5
3
  # Direct SMF
6
4
  def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
7
5
  """
8
- Generating Barcode position methylation summaries starting from the overall BAM file that was direct output of dorado aligner
6
+ Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
7
+ Parameters:
8
+ aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
9
+
10
+ Returns:
11
+ None
9
12
  """
13
+ import os
14
+ import subprocess
15
+
10
16
  os.chdir(mod_bed_dir)
11
17
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
12
18
  command = [
13
19
  "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
14
20
  "--partition-tag", "BC",
15
21
  "--only-tabs",
16
- "--filter-threshold", filter_threshold,
22
+ "--filter-threshold", f'{filter_threshold}',
17
23
  "--mod-thresholds", f"m:{m5C_threshold}",
18
24
  "--mod-thresholds", f"a:{m6A_threshold}",
19
25
  "--mod-thresholds", f"h:{hm5C_threshold}"
@@ -1,17 +1,25 @@
1
1
  ## modQC
2
- import subprocess
3
2
 
4
3
  # Direct SMF
5
4
  def modQC(aligned_sorted_output, thresholds):
6
5
  """
7
6
  Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
8
7
  It is generally good to look at these parameters on positive and negative controls.
8
+
9
+ Parameters:
10
+ aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligner.
11
+ thresholds (list): A list of floats to pass for call thresholds.
12
+
13
+ Returns:
14
+ None
9
15
  """
16
+ import subprocess
17
+
10
18
  filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
11
19
  subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
12
20
  command = [
13
21
  "modkit", "summary", aligned_sorted_output,
14
- "--filter-threshold", filter_threshold,
22
+ "--filter-threshold", f"{filter_threshold}",
15
23
  "--mod-thresholds", f"m:{m5C_threshold}",
16
24
  "--mod-thresholds", f"a:{m6A_threshold}",
17
25
  "--mod-thresholds", f"h:{hm5C_threshold}"
@@ -1,14 +1,28 @@
1
1
  ## modcall
2
- import subprocess
3
2
 
4
3
  # Direct methylation specific
5
4
  def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
6
5
  """
7
6
  Wrapper function for dorado modified base calling.
7
+
8
+ Parameters:
9
+ model (str): a string representing the file path to the dorado basecalling model.
10
+ pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
11
+ barcode_kit (str): A string representing the barcoding kit used in the experiment.
12
+ mod_list (list): A list of modification types to use in the analysis.
13
+ bam (str): File path to the BAM file to output.
14
+ bam_suffix (str): The suffix to use for the BAM file.
15
+
16
+ Returns:
17
+ None
18
+ Outputs a BAM file holding the modified base calls output by the dorado basecaller.
8
19
  """
20
+ import subprocess
9
21
  output = bam + bam_suffix
10
22
  command = [
11
23
  "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
12
- "--modified-bases", ",".join(mod_list)] # Join MOD_LIST elements with commas
24
+ "--modified-bases"]
25
+ command += mod_list
26
+ print(f'Running: {" ".join(command)}')
13
27
  with open(output, "w") as outfile:
14
28
  subprocess.run(command, stdout=outfile)