smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. smftools/_settings.py +3 -2
  2. smftools/_version.py +1 -1
  3. smftools/datasets/F1_sample_sheet.csv +5 -0
  4. smftools/datasets/datasets.py +8 -7
  5. smftools/informatics/__init__.py +7 -5
  6. smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
  7. smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
  8. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  9. smftools/informatics/conversion_smf.py +79 -0
  10. smftools/informatics/direct_smf.py +89 -0
  11. smftools/informatics/fast5_to_pod5.py +8 -6
  12. smftools/informatics/helpers/__init__.py +18 -0
  13. smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
  14. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  15. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  16. smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
  17. smftools/informatics/helpers/canoncall.py +2 -0
  18. smftools/informatics/helpers/complement_base_list.py +21 -0
  19. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  20. smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
  21. smftools/informatics/helpers/count_aligned_reads.py +13 -9
  22. smftools/informatics/helpers/extract_base_identities.py +34 -20
  23. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  24. smftools/informatics/helpers/find_conversion_sites.py +11 -9
  25. smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
  26. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  27. smftools/informatics/helpers/index_fasta.py +12 -0
  28. smftools/informatics/helpers/modcall.py +3 -1
  29. smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
  30. smftools/informatics/helpers/ohe_batching.py +52 -0
  31. smftools/informatics/helpers/one_hot_encode.py +10 -8
  32. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  33. smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
  34. smftools/informatics/helpers/split_and_index_BAM.py +16 -4
  35. smftools/informatics/load_adata.py +127 -0
  36. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  37. smftools/informatics/subsample_pod5.py +69 -13
  38. smftools/preprocessing/__init__.py +6 -1
  39. smftools/preprocessing/append_C_context.py +37 -14
  40. smftools/preprocessing/calculate_complexity.py +2 -2
  41. smftools/preprocessing/calculate_consensus.py +47 -0
  42. smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
  43. smftools/preprocessing/calculate_coverage.py +2 -2
  44. smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
  45. smftools/preprocessing/calculate_read_length_stats.py +56 -2
  46. smftools/preprocessing/clean_NaN.py +2 -2
  47. smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
  48. smftools/preprocessing/filter_reads_on_length.py +4 -2
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/preprocessing/load_sample_sheet.py +24 -0
  51. smftools/preprocessing/make_dirs.py +21 -0
  52. smftools/preprocessing/mark_duplicates.py +34 -19
  53. smftools/preprocessing/recipes.py +125 -0
  54. smftools/preprocessing/remove_duplicates.py +7 -4
  55. smftools/tools/apply_HMM.py +1 -0
  56. smftools/tools/cluster.py +0 -0
  57. smftools/tools/read_HMM.py +1 -0
  58. smftools/tools/subset_adata.py +32 -0
  59. smftools/tools/train_HMM.py +43 -0
  60. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
  61. smftools-0.1.3.dist-info/RECORD +84 -0
  62. smftools/informatics/basecalls_to_adata.py +0 -42
  63. smftools/informatics/pod5_conversion.py +0 -53
  64. smftools/informatics/pod5_direct.py +0 -55
  65. smftools/informatics/pod5_to_adata.py +0 -40
  66. smftools-0.1.1.dist-info/RECORD +0 -64
  67. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  68. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
5
5
  Binarizes conversion SMF data within a sequence string
6
6
 
7
7
  Parameters:
8
- base_identities (dict): A dictionary returned by extract_base_identity_at_coordinates.
8
+ base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
9
9
  strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
10
10
  modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
11
11
 
@@ -27,5 +27,5 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
27
27
  elif modification_type == '6mA':
28
28
  binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
29
29
  else:
30
- pass
30
+ print(f"{strand} not recognized")
31
31
  return binarized_base_identities
@@ -19,5 +19,7 @@ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
19
19
  import subprocess
20
20
  output = bam + bam_suffix
21
21
  command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
22
+ command_string = " ".join(command)
23
+ print(f"Running {command_string}\n to generate {output}")
22
24
  with open(output, "w") as outfile:
23
25
  subprocess.run(command, stdout=outfile)
@@ -0,0 +1,21 @@
1
+ # complement_base_list
2
+
3
+ def complement_base_list(sequence):
4
+ """
5
+ Takes a list of DNA base identities and returns their complement.
6
+
7
+ Parameters:
8
+ sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).
9
+
10
+ Returns:
11
+ complement (list): A list of complementary DNA bases.
12
+ """
13
+ complement_mapping = {
14
+ 'A': 'T',
15
+ 'T': 'A',
16
+ 'C': 'G',
17
+ 'G': 'C',
18
+ 'N': 'N' # Handling ambiguous bases like 'N'
19
+ }
20
+
21
+ return [complement_mapping[base] for base in sequence]
@@ -0,0 +1,54 @@
1
+ # concatenate_fastqs_to_bam
2
+
3
+ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
4
+ """
5
+ Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
6
+
7
+ Parameters:
8
+ fastq_files (list): List of paths to demultiplexed FASTQ files.
9
+ output_bam (str): Path to the output BAM file.
10
+ barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
11
+ gzip_suffix (str): Suffix to use for input gzip files (Default: '.gz')
12
+
13
+ Returns:
14
+ None
15
+ """
16
+ import os
17
+ import pysam
18
+ import gzip
19
+ from Bio import SeqIO
20
+ from tqdm import tqdm
21
+
22
+ n_fastqs = len(fastq_files)
23
+
24
+ with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
25
+ for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
26
+ # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
27
+ base_name = os.path.basename(fastq_file)
28
+ if n_fastqs > 1:
29
+ if base_name.endswith('.fastq.gz'):
30
+ barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
31
+ elif base_name.endswith('.fq.gz'):
32
+ barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
33
+ elif base_name.endswith('.fastq'):
34
+ barcode = base_name.split('_')[-1].replace('.fastq', '')
35
+ elif base_name.endswith('.fq'):
36
+ barcode = base_name.split('_')[-1].replace('.fq', '')
37
+ else:
38
+ raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
39
+
40
+ # Read the FASTQ file (handle gzipped and non-gzipped files)
41
+ open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
42
+ with open_func(fastq_file, 'rt') as fq_in:
43
+ for record in SeqIO.parse(fq_in, 'fastq'):
44
+ # Create an unaligned BAM entry for each FASTQ record
45
+ aln = pysam.AlignedSegment()
46
+ aln.query_name = record.id
47
+ aln.query_sequence = str(record.seq)
48
+ aln.flag = 4 # Unmapped
49
+ aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
50
+ if n_fastqs > 1:
51
+ # Add the barcode to the BC tag
52
+ aln.set_tag(barcode_tag, barcode)
53
+ # Write to BAM file
54
+ bam_out.write(aln)
@@ -21,144 +21,213 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
21
21
  from .find_conversion_sites import find_conversion_sites
22
22
  from .count_aligned_reads import count_aligned_reads
23
23
  from .extract_base_identities import extract_base_identities
24
- from .one_hot_encode import one_hot_encode
24
+ from .make_dirs import make_dirs
25
+ from .ohe_batching import ohe_batching
25
26
  import pandas as pd
26
27
  import numpy as np
27
28
  import anndata as ad
28
29
  import os
29
-
30
+ from tqdm import tqdm
31
+ import gc
32
+
33
+ ##########################################################################################
34
+ ## Get file paths and make necessary directories. ##
30
35
  # Get all of the input BAM files
31
36
  files = os.listdir(split_dir)
32
- # Change directory to the BAM directory
33
- os.chdir(split_dir)
37
+ # Make output dir
38
+ parent_dir = os.path.dirname(split_dir)
39
+ h5_dir = os.path.join(parent_dir, 'h5ads')
40
+ tmp_dir = os.path.join(parent_dir, 'tmp')
41
+ make_dirs([h5_dir, tmp_dir])
34
42
  # Filter file names that contain the search string in their filename and keep them in a list
35
43
  bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
36
44
  # Sort file list by names and print the list of file names
37
45
  bams.sort()
46
+ bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
38
47
  print(f'Found the following BAMS: {bams}')
39
48
  final_adata = None
49
+ ##########################################################################################
50
+
51
+ ##########################################################################################
40
52
 
41
- # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
53
+ ## need to fix this section
54
+ # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) sequence string unconverted , 5) Complement sequence unconverted
42
55
  modification_dict = {}
56
+ # Init a dict to be keyed by FASTA record that points to the sequence string of the unconverted record
57
+ record_FASTA_dict = {}
43
58
  # While populating the dictionary, also extract the longest sequence record in the input references
44
59
  max_reference_length = 0
45
60
  for conversion_type in conversion_types:
61
+ # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string unconverted, 5) Complement sequence unconverted
46
62
  modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
63
+ # Get the max reference length
47
64
  for record in modification_dict[conversion_type].keys():
48
65
  if modification_dict[conversion_type][record][0] > max_reference_length:
49
66
  max_reference_length = modification_dict[conversion_type][record][0]
50
67
 
51
- # Init a dict to be keyed by FASTA record that points to the sequence string of the unconverted record
52
- record_FASTA_dict = {}
68
+ mod_type, strand = record.split('_')[-2:]
53
69
 
54
- # Iterate over the experiment BAM files
55
- for bam_index, bam in enumerate(bams):
56
- # Give each bam a sample name
57
- sample = bam.split(sep=bam_suffix)[0]
70
+ chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
71
+ unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
72
+ current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
73
+ delta_max_length = max_reference_length - current_reference_length
74
+ sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
75
+ complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
76
+ record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
77
+ ##########################################################################################
78
+
79
+ ##########################################################################################
80
+ bam_alignment_stats_dict = {}
81
+ records_to_analyze = []
82
+ for bam_index, bam in enumerate(bam_path_list):
83
+ bam_alignment_stats_dict[bam_index] = {}
58
84
  # look at aligned read proportions in the bam
59
85
  aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
60
86
  percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
61
- print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
62
- records_to_analyze = []
87
+ print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
88
+ bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
63
89
  # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
64
90
  for record in record_counts:
65
91
  print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
66
92
  if record_counts[record][1] >= mapping_threshold:
67
93
  records_to_analyze.append(record)
68
- print(f'Records to analyze: {records_to_analyze}')
69
- # Iterate over records to analyze (ie all conversions detected)
94
+ bam_alignment_stats_dict[bam_index]
95
+ bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
96
+ records_to_analyze = set(records_to_analyze)
97
+ ##########################################################################################
98
+
99
+ ##########################################################################################
100
+ # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
101
+ # Save the file paths in the bam_record_ohe_files dict.
102
+ bam_record_ohe_files = {}
103
+
104
+ # Iterate over split bams
105
+ for bam_index, bam in enumerate(bam_path_list):
106
+ # Iterate over references to process
70
107
  for record in records_to_analyze:
71
- mod_type, strand = record.split('_')[-2:]
72
- if strand == 'top':
73
- strand_index = 1
74
- elif strand == 'bottom':
75
- strand_index = 2
76
-
77
- chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
78
- unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
79
- positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
80
- current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
81
- delta_max_length = max_reference_length - current_reference_length
82
- sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
83
- record_FASTA_dict[f'{record}'] = sequence
84
- print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
108
+ unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
109
+ sample = bams[bam_index].split(sep=bam_suffix)[0]
110
+ chromosome = record_FASTA_dict[unconverted_record_name][2]
111
+ current_reference_length = record_FASTA_dict[unconverted_record_name][4]
112
+ mod_type = record_FASTA_dict[unconverted_record_name][6]
113
+ strand = record_FASTA_dict[unconverted_record_name][7]
114
+
115
+ # Extract the base identities of reads aligned to the record
116
+ fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
85
117
 
86
- # Get a dictionary of positional identities keyed by read id
87
- print(f'Extracting base identities of target positions')
88
- target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
89
118
  # binarize the dictionary of positional identities
90
- print(f'Binarizing base identities of target positions')
91
- binarized_base_identities = binarize_converted_base_identities(target_base_identities, strand, mod_type)
119
+ print(f'Binarizing base identities')
120
+ fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
121
+ rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
122
+ merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
92
123
  # converts the base identity dictionary to a dataframe.
93
- binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
124
+ binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
94
125
  sorted_index = sorted(binarized_base_identities_df.index)
95
126
  binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
96
- # Get the sequence string of every read
97
- print(f'Extracting base identities of all positions in each read')
98
- all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
99
- # One hot encode the sequence string of the reads
100
- print(f'One hot encoding base identities of all positions in each read')
101
- one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
102
-
103
- # Initialize empty DataFrames for each base
104
- read_names = list(one_hot_reads.keys())
105
- sequence_length = one_hot_reads[read_names[0]].shape[0]
106
- df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
107
- df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
108
- df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
109
- df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
110
- df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
111
-
112
- # Iterate through the dictionary and populate the DataFrames
113
- for read_name, one_hot_array in one_hot_reads.items():
114
- df_A.loc[read_name] = one_hot_array[:, 0]
115
- df_C.loc[read_name] = one_hot_array[:, 1]
116
- df_G.loc[read_name] = one_hot_array[:, 2]
117
- df_T.loc[read_name] = one_hot_array[:, 3]
118
- df_N.loc[read_name] = one_hot_array[:, 4]
119
-
120
- ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
121
127
 
122
128
  # Load an anndata object with the sample data
123
129
  X = binarized_base_identities_df.values
124
130
  adata = ad.AnnData(X, dtype=X.dtype)
125
- adata.obs_names = binarized_base_identities_df.index
126
- adata.obs_names = adata.obs_names.astype(str)
127
- adata.var_names = binarized_base_identities_df.columns
128
- adata.var_names = adata.var_names.astype(str)
129
- adata.obs['Sample'] = [sample] * len(adata)
130
- adata.obs['Strand'] = [strand] * len(adata)
131
- adata.obs['Dataset'] = [mod_type] * len(adata)
132
- adata.obs['Reference'] = [record] * len(adata)
133
- adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
134
-
135
- for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
136
- adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
131
+ if adata.shape[0] > 0:
132
+ adata.obs_names = binarized_base_identities_df.index.astype(str)
133
+ adata.var_names = binarized_base_identities_df.columns.astype(str)
134
+ adata.obs['Sample'] = [sample] * len(adata)
135
+ adata.obs['Strand'] = [strand] * len(adata)
136
+ adata.obs['Dataset'] = [mod_type] * len(adata)
137
+ adata.obs['Reference'] = [record] * len(adata)
138
+ adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
139
+
140
+ read_mapping_direction = []
141
+ for read_id in adata.obs_names:
142
+ if read_id in fwd_base_identities.keys():
143
+ read_mapping_direction.append('fwd')
144
+ elif read_id in rev_base_identities.keys():
145
+ read_mapping_direction.append('rev')
146
+ else:
147
+ read_mapping_direction.append('unk')
148
+
149
+ adata.obs['Read_mapping_direction'] = read_mapping_direction
150
+
151
+ # One hot encode the sequence string of the reads
152
+ fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd",batch_size=100000)
153
+ rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev",batch_size=100000)
154
+ bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
155
+ del fwd_base_identities, rev_base_identities
156
+
157
+ one_hot_reads = {}
158
+ n_rows_OHE = 5
159
+ for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
160
+ tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
161
+ one_hot_reads.update(tmp_ohe_dict)
162
+ del tmp_ohe_dict
163
+
164
+ read_names = list(one_hot_reads.keys())
165
+ dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
166
+
167
+ sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
168
+ df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
169
+ df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
170
+ df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
171
+ df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
172
+ df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
173
+
174
+ for read_name, one_hot_array in one_hot_reads.items():
175
+ one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
176
+ dict_A[read_name] = one_hot_array[0, :]
177
+ dict_C[read_name] = one_hot_array[1, :]
178
+ dict_G[read_name] = one_hot_array[2, :]
179
+ dict_T[read_name] = one_hot_array[3, :]
180
+ dict_N[read_name] = one_hot_array[4, :]
181
+
182
+ del one_hot_reads
183
+ gc.collect()
184
+
185
+ for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
186
+ df_A.iloc[j] = dict_A[read_name]
187
+ df_C.iloc[j] = dict_C[read_name]
188
+ df_G.iloc[j] = dict_G[read_name]
189
+ df_T.iloc[j] = dict_T[read_name]
190
+ df_N.iloc[j] = dict_N[read_name]
191
+
192
+ del dict_A, dict_C, dict_G, dict_T, dict_N
193
+ gc.collect()
194
+
195
+ ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
196
+
197
+ for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
198
+ adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
199
+ ohe_df_map[j] = None # Reassign pointer for memory usage purposes
200
+
201
+ if final_adata:
202
+ if adata.shape[0] > 0:
203
+ final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
204
+ else:
205
+ print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
206
+ else:
207
+ if adata.shape[0] > 0:
208
+ final_adata = adata
209
+ else:
210
+ print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
137
211
 
138
- if final_adata:
139
- final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
140
212
  else:
141
- final_adata = adata
142
-
143
- for record in record_FASTA_dict.keys():
144
- chromosome = record.split('_')[0]
145
- sequence = record_FASTA_dict[record]
213
+ print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
214
+
215
+ # Set obs columns to type 'category'
216
+ for col in final_adata.obs.columns:
217
+ final_adata.obs[col] = final_adata.obs[col].astype('category')
218
+
219
+ for record in records_to_analyze:
220
+ unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
221
+ sequence = record_FASTA_dict[unconverted_record_name][0]
222
+ complement = record_FASTA_dict[unconverted_record_name][1]
223
+ chromosome = record_FASTA_dict[unconverted_record_name][2]
224
+ final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
225
+ final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
146
226
  final_adata.uns[f'{record}_FASTA_sequence'] = sequence
147
- final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
148
-
149
- # May need to remove the bottom for conversion SMF
150
- record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
151
- layer_map, layer_counts = {}, []
152
- for i, layer in enumerate(record_subset.layers):
153
- layer_map[i] = layer.split('_')[0]
154
- layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
155
- count_array = np.array(layer_counts)
156
- nucleotide_indexes = np.argmax(count_array, axis=0)
157
- consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
158
- final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
159
227
 
160
228
  ######################################################################################################
161
229
 
162
230
  ######################################################################################################
163
231
  ## Export the final adata object
164
- final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(readwrite.date_string(), experiment_name), compression='gzip')
232
+ final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
233
+ final_adata.write_h5ad(final_output, compression='gzip')
@@ -16,24 +16,28 @@ def count_aligned_reads(bam_file):
16
16
  """
17
17
  from .. import readwrite
18
18
  import pysam
19
+ from tqdm import tqdm
20
+ from collections import defaultdict
21
+
19
22
  print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
20
23
  aligned_reads_count = 0
21
24
  unaligned_reads_count = 0
22
25
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
23
- record_counts = {}
26
+ record_counts = defaultdict(int)
27
+
24
28
  with pysam.AlignmentFile(bam_file, "rb") as bam:
29
+ total_reads = bam.mapped + bam.unmapped
25
30
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
26
- for read in bam:
27
- if read.is_unmapped:
31
+ for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
32
+ if read.is_unmapped:
28
33
  unaligned_reads_count += 1
29
- else:
34
+ else:
30
35
  aligned_reads_count += 1
31
- if read.reference_name in record_counts:
32
- record_counts[read.reference_name] += 1
33
- else:
34
- record_counts[read.reference_name] = 1
36
+ record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
37
+
35
38
  # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
36
39
  for reference in record_counts:
37
40
  proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
38
41
  record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
39
- return aligned_reads_count, unaligned_reads_count, record_counts
42
+
43
+ return aligned_reads_count, unaligned_reads_count, dict(record_counts)
@@ -3,7 +3,7 @@
3
3
  # General
4
4
  def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
5
5
  """
6
- Extracts the base identities from every position within the read that has a reference coordinate
6
+ Extracts the base identities from every position within the mapped reads that have a reference coordinate
7
7
 
8
8
  Parameters:
9
9
  bam (str): File path to the BAM file to align (excluding the file suffix).
@@ -12,32 +12,46 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
12
12
  max_reference_length (int): The maximum length of a record in the reference set.
13
13
 
14
14
  Returns:
15
- base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities. If the read does not contain that position, fill the list at that index with a N value.
16
-
15
+ fwd_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from forward mapped reads. If the read does not contain that position, fill the list at that index with a N value.
16
+ rev_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from reverse mapped reads. If the read does not contain that position, fill the list at that index with a N value.
17
17
  """
18
18
  from .. import readwrite
19
19
  import pysam
20
+ from tqdm import tqdm
21
+
20
22
  positions = set(positions)
21
23
  # Initialize a base identity dictionary that will hold key-value pairs that are: key (read-name) and value (list of base identities at positions of interest)
22
- base_identities = {}
24
+ fwd_base_identities = {}
25
+ rev_base_identities = {}
23
26
  # Open the postion sorted BAM file
24
27
  print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
25
28
  with pysam.AlignmentFile(bam_file, "rb") as bam:
26
29
  # Iterate over every read in the bam that comes from the chromosome of interest
27
30
  print('{0}: Iterating over reads in bam'.format(readwrite.time_string()))
28
- for read in bam.fetch(chromosome):
29
- if read.query_name in base_identities:
30
- pass
31
- #print('Duplicate read found in BAM for read {}. Skipping duplicate'.format(read.query_name))
32
- else:
33
- # Initialize the read key in the base_identities dictionary by pointing to a N filled list of length reference_length
34
- base_identities[read.query_name] = ['N'] * max_reference_length
35
- # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read start, as well the 0-based index relative to the reference.
36
- for read_position, reference_position in read.get_aligned_pairs():
37
- # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
38
- if reference_position in positions and read_position:
39
- # get the base_identity in the read corresponding to that position
40
- base_identity = read.query_sequence[read_position]
41
- # Add the base identity to array
42
- base_identities[read.query_name][reference_position] = base_identity
43
- return base_identities
31
+ total_reads = bam.mapped
32
+ for read in tqdm(bam.fetch(chromosome), desc='Extracting base identities from reads in BAM', total=total_reads):
33
+ # Only iterate over mapped reads
34
+ if read.is_mapped:
35
+ # Get sequence of read. PySam reports fwd mapped reads as the true read sequence. Pysam reports rev mapped reads as the reverse complement of the read.
36
+ query_sequence = read.query_sequence
37
+ # If the read aligned as a reverse complement, mark that the read is reversed
38
+ if read.is_reverse:
39
+ # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
40
+ rev_base_identities[read.query_name] = ['N'] * max_reference_length
41
+ # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
42
+ for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
43
+ # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
44
+ if reference_position in positions and read_position:
45
+ # get the base_identity in the read corresponding to that position
46
+ rev_base_identities[read.query_name][reference_position] = query_sequence[read_position]
47
+ else:
48
+ # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
49
+ fwd_base_identities[read.query_name] = ['N'] * max_reference_length
50
+ # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
51
+ for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
52
+ # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
53
+ if reference_position in positions and read_position:
54
+ # get the base_identity in the read corresponding to that position
55
+ fwd_base_identities[read.query_name][reference_position] = query_sequence[read_position]
56
+
57
+ return fwd_base_identities, rev_base_identities
@@ -0,0 +1,22 @@
1
+ # extract_readnames_from_BAM
2
+
3
def extract_readnames_from_BAM(aligned_BAM):
    """
    Write a text file listing every read name (QNAME) in a BAM.

    Equivalent to ``samtools view BAM | cut -f1 > <BAM>_read_names.txt``,
    but extracts the first tab-delimited SAM field in Python, avoiding the
    extra ``cut`` process and making a samtools failure detectable instead
    of silently producing an empty/partial file.

    Parameters:
        aligned_BAM (str): Path to an input aligned BAM to extract read names from.

    Returns:
        None

    Raises:
        subprocess.CalledProcessError: If ``samtools view`` exits non-zero.
    """
    import subprocess

    # Derive the output path next to the BAM (foo.bam -> foo_read_names.txt),
    # matching the original naming convention exactly.
    txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'

    with open(txt_output, "w") as output_file:
        # Stream SAM records line-by-line so large BAMs are never fully held in memory.
        samtools_view = subprocess.Popen(
            ["samtools", "view", aligned_BAM],
            stdout=subprocess.PIPE,
            text=True,
        )
        for line in samtools_view.stdout:
            # QNAME is the first tab-delimited column of a SAM record.
            output_file.write(line.split('\t', 1)[0] + '\n')
        samtools_view.stdout.close()
        return_code = samtools_view.wait()
        # Propagate failures (missing BAM, truncated file, samtools not found
        # would already raise FileNotFoundError from Popen).
        if return_code != 0:
            raise subprocess.CalledProcessError(return_code, samtools_view.args)
@@ -6,34 +6,35 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
6
6
  If searching for adenine conversions, it will find coordinates of all adenines.
7
7
 
8
8
  Parameters:
9
- fasta_file (str): A string representing the file path to the unconverted reference FASTA.
9
+ fasta_file (str): A string representing the file path to the converted reference FASTA.
10
10
  modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
11
11
  conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
12
12
 
13
13
  Returns:
14
- record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
14
+ record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string, 5) Complement sequence
15
15
  """
16
16
  from .. import readwrite
17
17
  from Bio import SeqIO
18
18
  from Bio.SeqRecord import SeqRecord
19
19
  from Bio.Seq import Seq
20
20
 
21
- print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
21
+ #print('{0}: Finding positions of interest in reference FASTA: {1}'.format(readwrite.time_string(), fasta_file))
22
22
  # Initialize lists to hold top and bottom strand positional coordinates of interest
23
23
  top_strand_coordinates = []
24
24
  bottom_strand_coordinates = []
25
25
  unconverted = conversion_types[0]
26
26
  record_dict = {}
27
- print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
27
+ #print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
28
28
  # Open the FASTA record as read only
29
29
  with open(fasta_file, "r") as f:
30
30
  # Iterate over records in the FASTA
31
31
  for record in SeqIO.parse(f, "fasta"):
32
32
  # Only iterate over the unconverted records for the reference
33
33
  if unconverted in record.id:
34
- print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
34
+ #print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
35
35
  # Extract the sequence string of the record
36
36
  sequence = str(record.seq).upper()
37
+ complement = str(record.seq.complement()).upper()
37
38
  sequence_length = len(sequence)
38
39
  if modification_type == '5mC':
39
40
  # Iterate over the sequence string from the record
@@ -42,7 +43,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
42
43
  top_strand_coordinates.append(i) # 0-indexed coordinate
43
44
  if sequence[i] == 'G':
44
45
  bottom_strand_coordinates.append(i) # 0-indexed coordinate
45
- print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
46
+ #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
46
47
  elif modification_type == '6mA':
47
48
  # Iterate over the sequence string from the record
48
49
  for i in range(0, len(sequence)):
@@ -50,10 +51,11 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
50
51
  top_strand_coordinates.append(i) # 0-indexed coordinate
51
52
  if sequence[i] == 'T':
52
53
  bottom_strand_coordinates.append(i) # 0-indexed coordinate
53
- print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
54
+ #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
54
55
  else:
55
- print('modification_type not found. Please try 5mC or 6mA')
56
- record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
56
+ #print('modification_type not found. Please try 5mC or 6mA')
57
+ pass
58
+ record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
57
59
  else:
58
60
  pass
59
61
  return record_dict