smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,355 +1,518 @@
  ## modkit_extract_to_adata
- from .. import readwrite
- from .get_native_references import get_native_references
- from .count_aligned_reads import count_aligned_reads
- from .extract_base_identities import extract_base_identities
- from .one_hot_encode import one_hot_encode
- import pandas as pd
- import anndata as ad
- import os
- import gc
- import math
- import numpy as np
-
- def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
+
+ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs=False):
  """
-
+ Takes modkit extract outputs and organizes them into an adata object
+
+ Parameters:
+ fasta (str): File path to the reference genome to align to.
+ bam_dir (str): File path to the directory containing the aligned_sorted split modified BAM files
+ mapping_threshold (float): A value between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+ experiment_name (str): A string to provide an experiment name to the output adata file.
+ mods (list): A list of strings of the modification types to use in the analysis.
+ batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
+ mod_tsv_dir (str): String representing the path to the mod TSV directory
+ delete_batch_hdfs (bool): Whether to delete the batch hdfs after writing out the final concatenated hdf. Default is False
+
+ Returns:
+ None
  """
  ###################################################
- ### Get input tsv file names into a sorted list ###
+ # Package imports
+ from .. import readwrite
+ from .get_native_references import get_native_references
+ from .count_aligned_reads import count_aligned_reads
+ from .extract_base_identities import extract_base_identities
+ from .one_hot_encode import one_hot_encode
+ from .ohe_batching import ohe_batching
+ import pandas as pd
+ import anndata as ad
+ import os
+ import gc
+ import math
+ import numpy as np
+ from Bio.Seq import Seq
+ from tqdm import tqdm
+ import h5py
+ from .make_dirs import make_dirs
+ ###################################################
+
+ ################## Get input tsv and bam file names into a sorted list ################
  # List all files in the directory
- files = os.listdir(os.getcwd())
+ tsv_files = os.listdir(mod_tsv_dir)
+ bam_files = os.listdir(bam_dir)
  # get current working directory
- cwd = os.getcwd()
+ parent_dir = os.path.dirname(mod_tsv_dir)
+ # Make output dirs
+ h5_dir = os.path.join(parent_dir, 'h5ads')
+ tmp_dir = os.path.join(parent_dir, 'tmp')
+ make_dirs([h5_dir, tmp_dir])
  # Filter file names that contain the search string in their filename and keep them in a list
- tsvs = [tsv for tsv in files if 'extract.tsv' in tsv]
+ tsvs = [tsv for tsv in tsv_files if 'extract.tsv' in tsv]
+ bams = [bam for bam in bam_files if '.bam' in bam and '.bai' not in bam]
  # Sort file list by names and print the list of file names
  tsvs.sort()
+ tsv_path_list = [os.path.join(mod_tsv_dir, tsv) for tsv in tsvs]
+ bams.sort()
+ bam_path_list = [os.path.join(bam_dir, bam) for bam in bams]
  print(f'{len(tsvs)} sample tsv files found: {tsvs}')
- print(f'sample bam file found: {bam}')
+ print(f'{len(bams)} sample bams found: {bams}')
+ ##########################################################################################
+
+ ######### Get Record names that have over a passed threshold of mapped reads #############
+ # get all records that are above a certain mapping threshold in at least one sample bam
+ records_to_analyze = []
+ for bami, bam in enumerate(bam_path_list):
+ aligned_reads_count, unaligned_reads_count, record_counts_dict = count_aligned_reads(bam)
+ percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
+ print(f'{percent_aligned} percent of reads in {bams[bami]} aligned successfully')
+ # Iterate over references and decide which to use in the analysis based on the mapping_threshold
+ for record in record_counts_dict:
+ print('{0} reads mapped to reference record {1}. This is {2} percent of all mapped reads in {3}'.format(record_counts_dict[record][0], record, record_counts_dict[record][1]*100, bams[bami]))
+ if record_counts_dict[record][1] >= mapping_threshold:
+ records_to_analyze.append(record)
+ records_to_analyze = set(records_to_analyze)
+ print(f'Records to analyze: {records_to_analyze}')
+ ##########################################################################################

+ ########### Determine the maximum record length to analyze in the dataset ################
  # Get all references within the FASTA and indicate the length and identity of the record sequence
  max_reference_length = 0
- reference_dict = get_native_references(fasta)
- for record in reference_dict.keys():
+ reference_dict = get_native_references(fasta) # returns a dict keyed by record name. Points to a tuple of (reference length, reference sequence)
+ # Get the max record length in the dataset.
+ for record in records_to_analyze:
  if reference_dict[record][0] > max_reference_length:
  max_reference_length = reference_dict[record][0]
-
  print(f'{readwrite.time_string()}: Max reference length in dataset: {max_reference_length}')
  batches = math.ceil(len(tsvs) / batch_size) # Number of batches to process
  print('{0}: Processing input tsvs in {1} batches of {2} tsvs '.format(readwrite.time_string(), batches, batch_size))
+ ##########################################################################################

- # look at aligned read proportions in the bam
- aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
- print('{} percent of reads in bam aligned successfully'.format(aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)))
- records_to_analyze = []
- # Iterate over references and decide which to use in the analysis based on the mapping_threshold
- for record in record_counts:
- print('{0} reads mapped to reference record {1}. This is {2} percent of all mapped reads'.format(record_counts[record][0], record, record_counts[record][1]*100))
- if record_counts[record][1] >= mapping_threshold:
- records_to_analyze.append(record)
- print(f'Records to analyze: {records_to_analyze}')
- # Iterate over records to analyze and return a dictionary keyed by the reference name that points to another dictionary keyed by read names that map to that reference. This internal dictionary points to a one-hot encoding of the mapped read
+ ##########################################################################################
+ # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+ # Save the file paths in the bam_record_ohe_files dict.
+ bam_record_ohe_files = {}
+ bam_record_save = os.path.join(tmp_dir, 'tmp_file_dict.h5ad.gz')
+ fwd_mapped_reads = set()
+ rev_mapped_reads = set()
+ # If this step has already been performed, read in the tmp_file_dict
+ if os.path.exists(bam_record_save):
+ bam_record_ohe_files = ad.read_h5ad(bam_record_save).uns
+ print('Found existing OHE reads, using these')
+ else:
+ # Iterate over split bams
+ for bami, bam in enumerate(bam_path_list):
+ # Iterate over references to process
+ for record in records_to_analyze:
+ current_reference_length = reference_dict[record][0]
+ positions = range(current_reference_length)
+ # Extract the base identities of reads aligned to the record
+ fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
+ # Store read names of fwd and rev mapped reads
+ fwd_mapped_reads.update(fwd_base_identities.keys())
+ rev_mapped_reads.update(rev_base_identities.keys())
+ # One hot encode the sequence string of the reads
+ fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bami}_fwd",batch_size=100000)
+ rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bami}_rev",batch_size=100000)
+ bam_record_ohe_files[f'{bami}_{record}'] = fwd_ohe_files + rev_ohe_files
+ del fwd_base_identities, rev_base_identities
+ # Save out the ohe file paths
+ X = np.random.rand(1, 1)
+ tmp_ad = ad.AnnData(X=X, uns=bam_record_ohe_files)
+ tmp_ad.write_h5ad(bam_record_save, compression='gzip')
+ ##########################################################################################
+
+ ##########################################################################################
+ # Iterate over records to analyze and return a dictionary keyed by the reference name that points to a tuple containing the top strand sequence and the complement
  record_seq_dict = {}
  for record in records_to_analyze:
  current_reference_length = reference_dict[record][0]
  delta_max_length = max_reference_length - current_reference_length
  sequence = reference_dict[record][1] + 'N'*delta_max_length
- # Get a dictionary of positional base identities keyed by read id
- base_identities = extract_base_identities(bam, record, current_reference_length, max_reference_length)
- # One hot encode the sequence string of the reads
- one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
- record_seq_dict[record] = (one_hot_reads, sequence)
+ complement = str(Seq(reference_dict[record][1]).complement()).upper() + 'N'*delta_max_length
+ record_seq_dict[record] = (sequence, complement)
+ ##########################################################################################

  ###################################################
+ existing_h5s = os.listdir(h5_dir)
+ existing_h5s = [h5 for h5 in existing_h5s if '.h5ad.gz' in h5]
+ final_hdf = f'{experiment_name}_final_experiment_hdf5.h5ad.gz'
+ final_hdf_already_exists = final_hdf in existing_h5s

- ###################################################
- # Begin iterating over batches
- for batch in range(batches):
- print('{0}: Processing tsvs for batch {1} '.format(readwrite.time_string(), batch))
- # For the final batch, just take the remaining tsv files
- if batch == batches - 1:
- tsv_batch = tsvs
- # For all other batches, take the next batch of tsvs out of the file queue.
- else:
- tsv_batch = tsvs[:batch_size]
- tsvs = tsvs[batch_size:]
- print('{0}: tsvs in batch {1} '.format(readwrite.time_string(), tsv_batch))
- ###################################################
+ if final_hdf_already_exists:
+ print(f'{final_hdf} has already been made. Skipping processing.')
+ else:
+ # Begin iterating over batches
+ for batch in range(batches):
+ print('{0}: Processing tsvs for batch {1} '.format(readwrite.time_string(), batch))
+ # For the final batch, just take the remaining tsv and bam files
+ if batch == batches - 1:
+ tsv_batch = tsv_path_list
+ bam_batch = bam_path_list
+ # For all other batches, take the next batch of tsvs and bams out of the file queue.
+ else:
+ tsv_batch = tsv_path_list[:batch_size]
+ bam_batch = bam_path_list[:batch_size]
+ tsv_path_list = tsv_path_list[batch_size:]
+ bam_path_list = bam_path_list[batch_size:]
+ print('{0}: tsvs in batch {1} '.format(readwrite.time_string(), tsv_batch))

+ batch_already_processed = sum([1 for h5 in existing_h5s if f'_{batch}_' in h5])
  ###################################################
- ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
- # Initialize dictionaries and place them in a list
- dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top = {},{},{},{},{},{},{},{},{}
- dict_list = [dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top]
-
- # Give names to represent each dictionary in the list
- sample_types = ['total', 'm6A', 'm6A_bottom_strand', 'm6A_top_strand', '5mC', '5mC_bottom_strand', '5mC_top_strand', 'combined_bottom_strand', 'combined_top_strand']
-
- # Give indices of dictionaries to skip for analysis and final dictionary saving.
- dict_to_skip = [0, 1, 4]
- combined_dicts = [7, 8]
- A_stranded_dicts = [2, 3]
- C_stranded_dicts = [5, 6]
- dict_to_skip = dict_to_skip + combined_dicts + A_stranded_dicts + C_stranded_dicts
- dict_to_skip = set(dict_to_skip)
-
- # Load the dict_total dictionary with all of the tsv files as dataframes.
- for i, tsv in enumerate(tsv_batch):
- print('{0}: Loading sample tsv {1} into dataframe'.format(readwrite.time_string(), tsv))
- temp_df = pd.read_csv(tsv, sep='\t', header=0)
- for record in records_to_analyze:
- if record not in dict_total.keys():
- dict_total[record] = {}
- # Only keep the reads aligned to the chromosomes of interest
- print('{0}: Filtering sample dataframe to keep chromosome of interest'.format(readwrite.time_string()))
- dict_total[record][i] = temp_df[temp_df['chrom'] == record]
- # Only keep the read positions that fall within the region of interest
- print('{0}: Filtering sample dataframe to keep positions falling within region of interest'.format(readwrite.time_string()))
- current_reference_length = reference_dict[record][0]
- dict_total[record][i] = dict_total[record][i][(current_reference_length > dict_total[record][i]['ref_position']) & (dict_total[record][i]['ref_position']>= 0)]
-
- # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
- for record in dict_total.keys():
- for i in dict_total[record].keys():
- if '6mA' in mods:
- # Remove Adenine stranded dicts from the dicts to skip set
- dict_to_skip.difference_update(A_stranded_dicts)
-
- if record not in dict_a.keys() and record not in dict_a_bottom.keys() and record not in dict_a_top.keys():
- dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
-
- # get a dictionary of dataframes that only contain methylated adenine positions
- dict_a[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'A']
- print('{}: Successfully created a methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
- # Stratify the adenine dictionary into two strand specific dictionaries.
- dict_a_bottom[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '-']
- print('{}: Successfully created a minus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
- dict_a_top[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '+']
- print('{}: Successfully created a plus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
-
- if '5mC' in mods:
- # Remove Cytosine stranded dicts from the dicts to skip set
- dict_to_skip.difference_update(C_stranded_dicts)
-
- if record not in dict_c.keys() and record not in dict_c_bottom.keys() and record not in dict_c_top.keys():
- dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
-
- # get a dictionary of dataframes that only contain methylated cytosine positions
- dict_c[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'C']
- print('{}: Successfully created a methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
- # Stratify the cytosine dictionary into two strand specific dictionaries.
- dict_c_bottom[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '-']
- print('{}: Successfully created a minus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
- dict_c_top[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '+']
- print('{}: Successfully created a plus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
- # In the strand specific dictionaries, only keep positions that are informative for GpC SMF
-
- if '6mA' in mods and '5mC' in mods:
- # Remove combined stranded dicts from the dicts to skip set
- dict_to_skip.difference_update(combined_dicts)
- # Initialize the sample keys for the combined dictionaries
-
- if record not in dict_combined_bottom.keys() and record not in dict_combined_top.keys():
- dict_combined_bottom[record], dict_combined_top[record]= {}, {}
-
- print('{}: Successfully created a minus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
- dict_combined_bottom[record][i] = []
- print('{}: Successfully created a plus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
- dict_combined_top[record][i] = []
-
- # Iterate over the stranded modification dictionaries and replace the dataframes with a dictionary of read names pointing to a list of values from the dataframe
- for i, dict_type in enumerate(dict_list):
- # Only iterate over stranded dictionaries
- if i not in dict_to_skip:
- print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), sample_types[i]))
- for record in dict_type.keys():
- # Get the dictionary for the modification type of interest from the reference mapping of interest
- dict = dict_type[record]
- print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), record))
- # For each sample in a stranded dictionary
- for sample in dict.keys():
- print('{0}: Extracting {1} dictionary from record {2} for sample {3}'.format(readwrite.time_string(), sample_types[i], record, sample))
- # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
- if i == 7:
- # Load the minus strand dictionaries for each sample into temporary variables
- temp_a_dict = dict_list[2][record][sample].copy()
- temp_c_dict = dict_list[5][record][sample].copy()
- dict[sample] = {}
- # Iterate over the reads present in the merge of both dictionaries
- for read in set(temp_a_dict) | set(temp_c_dict):
- # Add the arrays element-wise if the read is present in both dictionaries
- if read in temp_a_dict and read in temp_c_dict:
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
- # If the read is present in only one dictionary, copy its value
- elif read in temp_a_dict:
- dict[sample][read] = temp_a_dict[read]
- else:
- dict[sample][read] = temp_c_dict[read]
- # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
- elif i == 8:
- # Load the plus strand dictionaries for each sample into temporary variables
- temp_a_dict = dict_list[3][record][sample].copy()
- temp_c_dict = dict_list[6][record][sample].copy()
- dict[sample] = {}
- # Iterate over the reads present in the merge of both dictionaries
- for read in set(temp_a_dict) | set(temp_c_dict):
- # Add the arrays element-wise if the read is present in both dictionaries
- if read in temp_a_dict and read in temp_c_dict:
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
- # If the read is present in only one dictionary, copy its value
- elif read in temp_a_dict:
- dict[sample][read] = temp_a_dict[read]
+ if batch_already_processed:
+ print(f'Batch {batch} has already been processed into h5ads. Skipping batch and using existing files')
+ else:
+ ###################################################
+ ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
+ # Initialize dictionaries and place them in a list
+ dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top = {},{},{},{},{},{},{},{},{}
+ dict_list = [dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top]
+ # Give names to represent each dictionary in the list
+ sample_types = ['total', 'm6A', 'm6A_bottom_strand', 'm6A_top_strand', '5mC', '5mC_bottom_strand', '5mC_top_strand', 'combined_bottom_strand', 'combined_top_strand']
+ # Give indices of dictionaries to skip for analysis and final dictionary saving.
+ dict_to_skip = [0, 1, 4]
+ combined_dicts = [7, 8]
+ A_stranded_dicts = [2, 3]
+ C_stranded_dicts = [5, 6]
+ dict_to_skip = dict_to_skip + combined_dicts + A_stranded_dicts + C_stranded_dicts
+ dict_to_skip = set(dict_to_skip)
+
+ # Load the dict_total dictionary with all of the tsv files as dataframes.
+ for sample_index, tsv in tqdm(enumerate(tsv_batch), desc=f'Loading TSVs into dataframes and filtering on chromosome/position for batch {batch}', total=batch_size):
+ #print('{0}: Loading sample tsv {1} into dataframe'.format(readwrite.time_string(), tsv))
+ temp_df = pd.read_csv(tsv, sep='\t', header=0)
+ for record in records_to_analyze:
+ if record not in dict_total.keys():
+ dict_total[record] = {}
+ # Only keep the reads aligned to the chromosomes of interest
+ #print('{0}: Filtering sample dataframe to keep chromosome of interest'.format(readwrite.time_string()))
+ dict_total[record][sample_index] = temp_df[temp_df['chrom'] == record]
+ # Only keep the read positions that fall within the region of interest
+ #print('{0}: Filtering sample dataframe to keep positions falling within region of interest'.format(readwrite.time_string()))
+ current_reference_length = reference_dict[record][0]
+ dict_total[record][sample_index] = dict_total[record][sample_index][(current_reference_length > dict_total[record][sample_index]['ref_position']) & (dict_total[record][sample_index]['ref_position']>= 0)]
+
+ # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
+ for record in dict_total.keys():
+ for sample_index in dict_total[record].keys():
+ if '6mA' in mods:
+ # Remove Adenine stranded dicts from the dicts to skip set
+ dict_to_skip.difference_update(set(A_stranded_dicts))
+
+ if record not in dict_a.keys() and record not in dict_a_bottom.keys() and record not in dict_a_top.keys():
+ dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
+
+ # get a dictionary of dataframes that only contain methylated adenine positions
+ dict_a[record][sample_index] = dict_total[record][sample_index][dict_total[record][sample_index]['modified_primary_base'] == 'A']
+ print('{}: Successfully loaded a methyl-adenine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ # Stratify the adenine dictionary into two strand specific dictionaries.
+ dict_a_bottom[record][sample_index] = dict_a[record][sample_index][dict_a[record][sample_index]['ref_strand'] == '-']
+ print('{}: Successfully loaded a minus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ dict_a_top[record][sample_index] = dict_a[record][sample_index][dict_a[record][sample_index]['ref_strand'] == '+']
+ print('{}: Successfully loaded a plus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+
+ # Reassign pointer for dict_a to None and delete the original value that it pointed to in order to decrease memory usage.
+ dict_a[record][sample_index] = None
+ gc.collect()
+
+ if '5mC' in mods:
+ # Remove Cytosine stranded dicts from the dicts to skip set
+ dict_to_skip.difference_update(set(C_stranded_dicts))
+
+ if record not in dict_c.keys() and record not in dict_c_bottom.keys() and record not in dict_c_top.keys():
+ dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
+
+ # get a dictionary of dataframes that only contain methylated cytosine positions
+ dict_c[record][sample_index] = dict_total[record][sample_index][dict_total[record][sample_index]['modified_primary_base'] == 'C']
+ print('{}: Successfully loaded a methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ # Stratify the cytosine dictionary into two strand specific dictionaries.
+ dict_c_bottom[record][sample_index] = dict_c[record][sample_index][dict_c[record][sample_index]['ref_strand'] == '-']
+ print('{}: Successfully loaded a minus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ dict_c_top[record][sample_index] = dict_c[record][sample_index][dict_c[record][sample_index]['ref_strand'] == '+']
+ print('{}: Successfully loaded a plus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ # In the strand specific dictionaries, only keep positions that are informative for GpC SMF
+
+ # Reassign pointer for dict_c to None and delete the original value that it pointed to in order to decrease memory usage.
+ dict_c[record][sample_index] = None
+ gc.collect()
+
+ if '6mA' in mods and '5mC' in mods:
+ # Remove combined stranded dicts from the dicts to skip set
+ dict_to_skip.difference_update(set(combined_dicts))
+ # Initialize the sample keys for the combined dictionaries
+
+ if record not in dict_combined_bottom.keys() and record not in dict_combined_top.keys():
+ dict_combined_bottom[record], dict_combined_top[record]= {}, {}
+
+ print('{}: Successfully created a minus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ dict_combined_bottom[record][sample_index] = []
+ print('{}: Successfully created a plus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(sample_index))
+ dict_combined_top[record][sample_index] = []
+
+ # Reassign pointer for dict_total to None and delete the original value that it pointed to in order to decrease memory usage.
+ dict_total[record][sample_index] = None
+ gc.collect()
+
+ # Iterate over the stranded modification dictionaries and replace the dataframes with a dictionary of read names pointing to a list of values from the dataframe
+ for dict_index, dict_type in enumerate(dict_list):
+ # Only iterate over stranded dictionaries
+ if dict_index not in dict_to_skip:
+ print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), sample_types[dict_index]))
+ for record in dict_type.keys():
+ # Get the dictionary for the modification type of interest from the reference mapping of interest
+ mod_strand_record_sample_dict = dict_type[record]
+ print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), record))
+ # For each sample in a stranded dictionary
+ n_samples = len(mod_strand_record_sample_dict.keys())
+ for sample in tqdm(mod_strand_record_sample_dict.keys(), desc=f'Extracting {sample_types[dict_index]} dictionary from record {record} for sample', total=n_samples):
+ # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
+ if dict_index == 7:
+ # Load the minus strand dictionaries for each sample into temporary variables
+ temp_a_dict = dict_list[2][record][sample].copy()
+ temp_c_dict = dict_list[5][record][sample].copy()
+ mod_strand_record_sample_dict[sample] = {}
+ # Iterate over the reads present in the merge of both dictionaries
+ for read in set(temp_a_dict) | set(temp_c_dict):
+ # Add the arrays element-wise if the read is present in both dictionaries
+ if read in temp_a_dict and read in temp_c_dict:
+ mod_strand_record_sample_dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
+ # If the read is present in only one dictionary, copy its value
+ elif read in temp_a_dict:
+ mod_strand_record_sample_dict[sample][read] = temp_a_dict[read]
+ elif read in temp_c_dict:
+ mod_strand_record_sample_dict[sample][read] = temp_c_dict[read]
+ del temp_a_dict, temp_c_dict
+ # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
+ elif dict_index == 8:
+ # Load the plus strand dictionaries for each sample into temporary variables
+ temp_a_dict = dict_list[3][record][sample].copy()
+ temp_c_dict = dict_list[6][record][sample].copy()
+ mod_strand_record_sample_dict[sample] = {}
+ # Iterate over the reads present in the merge of both dictionaries
+ for read in set(temp_a_dict) | set(temp_c_dict):
+ # Add the arrays element-wise if the read is present in both dictionaries
+ if read in temp_a_dict and read in temp_c_dict:
+ mod_strand_record_sample_dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
+ # If the read is present in only one dictionary, copy its value
+ elif read in temp_a_dict:
+ mod_strand_record_sample_dict[sample][read] = temp_a_dict[read]
+ elif read in temp_c_dict:
+ mod_strand_record_sample_dict[sample][read] = temp_c_dict[read]
+ del temp_a_dict, temp_c_dict
+ # For all other dictionaries
  else:
- dict[sample][read] = temp_c_dict[read]
- # For all other dictionaries
- else:
- # extract the dataframe from the dictionary into a temporary variable
- temp_df = dict[sample]
- # reassign the dictionary pointer to a nested dictionary.
- dict[sample] = {}
- # # Iterate through rows in the temp DataFrame
- for index, row in temp_df.iterrows():
- read = row['read_id'] # read name
- position = row['ref_position'] # positional coordinate
- probability = row['call_prob'] # Get the probability of the given call
- # if the call_code is modified change methylated value to the probability of methylation
- if (row['call_code'] in ['a', 'h', 'm']):
- methylated = probability
- # If the call code is canonical, change the methylated value to 1 - the probability of canonical
- elif (row['call_code'] in ['-']):
- methylated = 1 - probability
-
- # If the current read is not in the dictionary yet, initalize the dictionary with a nan filled numpy array of proper size.
- if read not in dict[sample]:
- dict[sample][read] = np.full(max_reference_length, np.nan)
+ # use temp_df to point to the dataframe held in mod_strand_record_sample_dict[sample]
+ temp_df = mod_strand_record_sample_dict[sample]
+ # reassign the dictionary pointer to a nested dictionary.
+ mod_strand_record_sample_dict[sample] = {}
+ # # Iterate through rows in the temp DataFrame
+ for index, row in temp_df.iterrows():
+ read = row['read_id'] # read name
+ position = row['ref_position'] # 1-indexed positional coordinate
+ probability = row['call_prob'] # Get the probability of the given call
+ # if the call_code is modified change methylated value to the probability of methylation
+ if (row['call_code'] in ['a', 'h', 'm']):
+ methylated = probability
+ # If the call code is canonical, change the methylated value to 1 - the probability of canonical
+ elif (row['call_code'] in ['-']):
+ methylated = 1 - probability
+
+ # If the current read is not in the dictionary yet, initialize the dictionary with a nan filled numpy array of proper size.
+ if read not in mod_strand_record_sample_dict[sample]:
+ mod_strand_record_sample_dict[sample][read] = np.full(max_reference_length, np.nan)
+
+ # add the positional methylation state to the numpy array
+ mod_strand_record_sample_dict[sample][read][position-1] = methylated
+
+ # Save the sample files in the batch as gzipped hdf5 files
+ os.chdir(h5_dir)
+ print('{0}: Converting batch {1} dictionaries to anndata objects'.format(readwrite.time_string(), batch))
+ for dict_index, dict_type in enumerate(dict_list):
+ if dict_index not in dict_to_skip:
+ # Initialize an hdf5 file for the current modified strand
+ adata = None
+ print('{0}: Converting {1} dictionary to an anndata object'.format(readwrite.time_string(), sample_types[dict_index]))
+ for record in dict_type.keys():
+ # Get the dictionary for the modification type of interest from the reference mapping of interest
+ mod_strand_record_sample_dict = dict_type[record]
+ for sample in mod_strand_record_sample_dict.keys():
+ print('{0}: Converting {1} dictionary for sample {2} to an anndata object'.format(readwrite.time_string(), sample_types[dict_index], sample))
+ sample = int(sample)
+ final_sample_index = sample + (batch * batch_size)
+ print('{0}: Final sample index for sample: {1}'.format(readwrite.time_string(), final_sample_index))
+ print('{0}: Converting {1} dictionary for sample {2} to a dataframe'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ temp_df = pd.DataFrame.from_dict(mod_strand_record_sample_dict[sample], orient='index')
+ mod_strand_record_sample_dict[sample] = None # reassign pointer to facilitate memory usage
+ sorted_index = sorted(temp_df.index)
+ temp_df = temp_df.reindex(sorted_index)
+ X = temp_df.values
+
+ print('{0}: Loading {1} dataframe for sample {2} into a temp anndata object'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ temp_adata = ad.AnnData(X, dtype=X.dtype)
+ if temp_adata.shape[0] > 0:
+ print('{0}: Adding read names and position ids to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ temp_adata.obs_names = temp_df.index
+ temp_adata.obs_names = temp_adata.obs_names.astype(str)
+ temp_adata.var_names = temp_df.columns
+ temp_adata.var_names = temp_adata.var_names.astype(str)
+ print('{0}: Adding {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
+ dataset, strand = sample_types[dict_index].split('_')[:2]
+ temp_adata.obs['Strand'] = [strand] * len(temp_adata)
+ temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)
+ temp_adata.obs['Reference'] = [f'{record}_{dataset}_{strand}'] * len(temp_adata)
+ temp_adata.obs['Reference_chromosome'] = [f'{record}'] * len(temp_adata)
+
+ # Load in the one hot encoded reads from the current sample and record
+ one_hot_reads = {}
+ n_rows_OHE = 5
+ ohe_files = bam_record_ohe_files[f'{final_sample_index}_{record}']
+ print(f'Loading OHEs from {ohe_files}')
+ fwd_mapped_reads = set()
+ rev_mapped_reads = set()
+ for ohe_file in ohe_files:
+ tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+ one_hot_reads.update(tmp_ohe_dict)
+ if '_fwd_' in ohe_file:
+ fwd_mapped_reads.update(tmp_ohe_dict.keys())
+ elif '_rev_' in ohe_file:
+ rev_mapped_reads.update(tmp_ohe_dict.keys())
+ del tmp_ohe_dict
+
+ read_names = list(one_hot_reads.keys())
+
+ read_mapping_direction = []
+ for read_id in temp_adata.obs_names:
+ if read_id in fwd_mapped_reads:
+ read_mapping_direction.append('fwd')
+ elif read_id in rev_mapped_reads:
+ read_mapping_direction.append('rev')
+ else:
+ read_mapping_direction.append('unk')
+
+ temp_adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+ del temp_df
+
+ dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+ sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+ df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+ df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+ df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+ df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+ df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+
+ for read_name, one_hot_array in one_hot_reads.items():
+ one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+ dict_A[read_name] = one_hot_array[0, :]
+ dict_C[read_name] = one_hot_array[1, :]
+ dict_G[read_name] = one_hot_array[2, :]
+ dict_T[read_name] = one_hot_array[3, :]
+ dict_N[read_name] = one_hot_array[4, :]
+
+ del one_hot_reads
+ gc.collect()
+
+ for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
+ df_A.iloc[j] = dict_A[read_name]
+ df_C.iloc[j] = dict_C[read_name]
+ df_G.iloc[j] = dict_G[read_name]
+ df_T.iloc[j] = dict_T[read_name]
+ df_N.iloc[j] = dict_N[read_name]
+
+ del dict_A, dict_C, dict_G, dict_T, dict_N
+ gc.collect()
+
+ ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+ for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+ temp_adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+ ohe_df_map[j] = None # Reassign pointer for memory usage purposes
+
+ # If final adata object already has a sample loaded, concatenate the current sample into the existing adata object
+ if adata:
+ if temp_adata.shape[0] > 0:
+ print('{0}: Concatenating {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ adata = ad.concat([adata, temp_adata], join='outer', index_unique=None)
+ del temp_adata
+ else:
+ print(f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omitting from final adata")
+ else:
+ if temp_adata.shape[0] > 0:
+ print('{0}: Initializing {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
+ adata = temp_adata
+ else:
+ print(f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omitting from final adata")
+
+ gc.collect()
  else:
- pass
- # add the positional methylation state to the numpy array
- dict[sample][read][position-1] = methylated
-
- # Save the sample files in the batch as gzipped hdf5 files
- print('{0}: Converting batch {1} dictionaries to anndata objects'.format(readwrite.time_string(), batch))
- for i, dict_type in enumerate(dict_list):
- if i not in dict_to_skip:
- # Initialize an hdf5 file for the current modified strand
- adata = None
- print('{0}: Converting {1} dictionary to an anndata object'.format(readwrite.time_string(), sample_types[i]))
- for record in dict_type.keys():
- # Get the dictionary for the modification type of interest from the reference mapping of interest
- dict = dict_type[record]
- for sample in dict.keys():
- print('{0}: Converting {1} dictionary for sample {2} to an anndata object'.format(readwrite.time_string(), sample_types[i], sample))
- sample = int(sample)
- final_sample_index = sample + (batch * batch_size)
- print('{0}: Final sample index for sample: {1}'.format(readwrite.time_string(), final_sample_index))
- print('{0}: Converting {1} dictionary for sample {2} to a dataframe'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- temp_df = pd.DataFrame.from_dict(dict[sample], orient='index')
- sorted_index = sorted(temp_df.index)
- temp_df = temp_df.reindex(sorted_index)
- X = temp_df.values
- one_hot_encodings = record_seq_dict[record][0]
- read_names = list(one_hot_encodings.keys())
- sequence_length = one_hot_encodings[read_names[0]].shape[0]
- dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
- # Loop through each read name and its corresponding one-hot array
- print('{0}: Extracting one hot encodings into dictionaries'.format(readwrite.time_string()))
- for read_name, one_hot_array in one_hot_encodings.items():
- dict_A[read_name] = one_hot_array[:, 0]
- dict_C[read_name] = one_hot_array[:, 1]
- dict_G[read_name] = one_hot_array[:, 2]
- dict_T[read_name] = one_hot_array[:, 3]
- dict_N[read_name] = one_hot_array[:, 4]
- # Load dfs with data from the dictionaries
- print('{0}: Loading dataframes from one hot encoded dictionaries'.format(readwrite.time_string()))
- df_A = pd.DataFrame.from_dict(dict_A, orient='index').reindex(sorted_index)
- df_C = pd.DataFrame.from_dict(dict_C, orient='index').reindex(sorted_index)
- df_G = pd.DataFrame.from_dict(dict_G, orient='index').reindex(sorted_index)
- df_T = pd.DataFrame.from_dict(dict_T, orient='index').reindex(sorted_index)
- df_N = pd.DataFrame.from_dict(dict_N, orient='index').reindex(sorted_index)
-
- ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-
- print('{0}: Loading {1} dataframe for sample {2} into a temp anndata object'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- temp_adata = ad.AnnData(X, dtype=X.dtype)
- print('{0}: Adding read names and position ids to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- temp_adata.obs_names = temp_df.index
- temp_adata.obs_names = temp_adata.obs_names.astype(str)
- temp_adata.var_names = temp_df.columns
- temp_adata.var_names = temp_adata.var_names.astype(str)
- print('{0}: Adding final sample id to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
- dataset, strand = sample_types[i].split('_')[:2]
- temp_adata.obs['Strand'] = [strand] * len(temp_adata)
- temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)
- temp_adata.obs['Reference'] = [f'{record}_{dataset}_{strand}'] * len(temp_adata)
- temp_adata.obs['Reference_chromosome'] = [f'{record}'] * len(temp_adata)
-
- for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
- temp_adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
-
- # If final adata object already has a sample loaded, concatenate the current sample into the existing adata object
- if adata:
- print('{0}: Concatenating {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- adata = ad.concat([adata, temp_adata], join='outer', index_unique=None)
- else:
- print('{0}: Initializing {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
- adata = temp_adata
-
- print('{0}: Writing {1} anndata out as a gzipped hdf5 file'.format(readwrite.time_string(), sample_types[i]))
- adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[i]), compression='gzip')
-
- # Delete the batch dictionaries from memory
- del dict_list
- gc.collect()
-
- # Iterate over all of the batched hdf5 files and concatenate them.
- files = os.listdir(os.getcwd())
- # Name the final output file
- final_hdf = '{0}_{1}_final_experiment_hdf5.h5ad.gz'.format(readwrite.date_string(), experiment_name)
- # Filter file names that contain the search string in their filename and keep them in a list
- hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
- # Sort file list by names and print the list of file names
- hdfs.sort()
- print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
- final_adata = None
- for hdf in hdfs:
- print('{0}: Reading in {1} hdf5 file'.format(readwrite.time_string(), hdf))
- temp_adata = ad.read_h5ad(hdf)
- if final_adata:
- print('{0}: Concatenating final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
- final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
- else:
- print('{0}: Initializing final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
- final_adata = temp_adata
- print('{0}: Writing final concatenated hdf5 file'.format(readwrite.time_string()))
+ print(f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omitting from final adata. Skipping sample.")

- for record in records_to_analyze:
- # Add FASTA sequence to the object
- sequence = record_seq_dict[record][1]
- final_adata.uns[f'{record}_FASTA_sequence'] = sequence
- final_adata.var[f'{record}_FASTA_sequence_base'] = list(sequence)
-
- # Add consensus sequence of samples mapped to the record to the object
- record_subset = final_adata[final_adata.obs['Reference_chromosome'] == record].copy()
- layer_map, layer_counts = {}, []
- for i, layer in enumerate(record_subset.layers):
- layer_map[i] = layer.split('_')[0]
- layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
- count_array = np.array(layer_counts)
- nucleotide_indexes = np.argmax(count_array, axis=0)
- consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
- final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
-
- final_adata.write_h5ad(final_hdf, compression='gzip')
-
- # Delete the individual h5ad files and only keep the final concatenated file
- files = os.listdir(os.getcwd())
- hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
- # Iterate over the files and delete them
- for hdf in hdfs_to_delete:
- try:
- os.remove(hdf)
- print(f"Deleted file: {hdf}")
- except OSError as e:
- print(f"Error deleting file {hdf}: {e}")
+ print('{0}: Writing {1} anndata out as a gzipped hdf5 file'.format(readwrite.time_string(), sample_types[dict_index]))
+ adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[dict_index]), compression='gzip')
+
+ # Delete the batch dictionaries from memory
+ del dict_list, adata
+ gc.collect()
+
+ # Iterate over all of the batched hdf5 files and concatenate them.
+ os.chdir(h5_dir)
+ files = os.listdir(h5_dir)
+ # Filter file names that contain the search string in their filename and keep them in a list
+ hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
+ # Sort file list by names and print the list of file names
+ hdfs.sort()
+ print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
+ hdf_paths = [os.path.join(h5_dir, hd5) for hd5 in hdfs]
+ final_adata = None
+ for hdf_index, hdf in enumerate(hdf_paths):
+ print('{0}: Reading in {1} hdf5 file'.format(readwrite.time_string(), hdfs[hdf_index]))
+ temp_adata = ad.read_h5ad(hdf)
+ if final_adata:
+ print('{0}: Concatenating final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdfs[hdf_index]))
+ final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
+ else:
+ print('{0}: Initializing final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdfs[hdf_index]))
+ final_adata = temp_adata
+ del temp_adata
+
+ # Set obs columns to type 'category'
+ for col in final_adata.obs.columns:
+ final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+ for record in records_to_analyze:
+ # Add FASTA sequence to the object
+ sequence = record_seq_dict[record][0]
+ complement = record_seq_dict[record][1]
+ final_adata.var[f'{record}_top_strand_FASTA_base_at_coordinate'] = list(sequence)
+ final_adata.var[f'{record}_bottom_strand_FASTA_base_at_coordinate'] = list(complement)
+ final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+ # Add consensus sequence of samples mapped to the record to the object
+ record_subset = final_adata[final_adata.obs['Reference_chromosome'] == record].copy()
+ for strand in record_subset.obs['Strand'].cat.categories:
+ strand_subset = record_subset[record_subset.obs['Strand'] == strand].copy()
+ for mapping_dir in strand_subset.obs['Read_mapping_direction'].cat.categories:
+ mapping_dir_subset = strand_subset[strand_subset.obs['Read_mapping_direction'] == mapping_dir].copy()
+ layer_map, layer_counts = {}, []
+ for i, layer in enumerate(mapping_dir_subset.layers):
+ layer_map[i] = layer.split('_')[0]
+ layer_counts.append(np.sum(mapping_dir_subset.layers[layer], axis=0))
+ count_array = np.array(layer_counts)
+ nucleotide_indexes = np.argmax(count_array, axis=0)
+ consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+ final_adata.var[f'{record}_{strand}_strand_{mapping_dir}_mapping_dir_consensus_from_all_samples'] = consensus_sequence_list
+
+ final_adata.write_h5ad(os.path.join(h5_dir, final_hdf), compression='gzip')
+
+ # Delete the individual h5ad files and only keep the final concatenated file
+ if delete_batch_hdfs:
+ files = os.listdir(h5_dir)
+ hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
+ hdf_paths_to_delete = [os.path.join(h5_dir, hdf) for hdf in hdfs_to_delete]
+ # Iterate over the files and delete them
+ for hdf in hdf_paths_to_delete:
+ try:
+ os.remove(hdf)
+ print(f"Deleted file: {hdf}")
+ except OSError as e:
+ print(f"Error deleting file {hdf}: {e}")