smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,245 @@
+ ## converted_BAM_to_adata
+
+ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
+     """
+     A wrapper function that takes converted aligned_sorted_split BAM files and formats the data into an AnnData object.
+
+     Parameters:
+         converted_FASTA (str): File path to the converted FASTA reference.
+         split_dir (str): File path to the directory containing the converted aligned_sorted_split BAM files.
+         mapping_threshold (float): A value between 0 and 1 giving the minimum fraction of mapped reads that must align to a reference region. References at or above the threshold are included in the output adata.
+         experiment_name (str): An experiment name to use in the output adata file name.
+         conversion_types (list): A list of strings naming the conversion types to use in the analysis; the first entry is treated as the unconverted type.
+         bam_suffix (str): The suffix of the BAM files to process.
+
+     Returns:
+         final_adata_path (str): File path to the final adata object.
+         Outputs a single gzipped adata object for the experiment.
+     """
+     from .. import readwrite
+     from .binarize_converted_base_identities import binarize_converted_base_identities
+     from .find_conversion_sites import find_conversion_sites
+     from .count_aligned_reads import count_aligned_reads
+     from .extract_base_identities import extract_base_identities
+     from .make_dirs import make_dirs
+     from .ohe_batching import ohe_batching
+     import pandas as pd
+     import numpy as np
+     import anndata as ad
+     import os
+     from tqdm import tqdm
+     import gc
+
+     ##########################################################################################
+     ## Get file paths and make necessary directories. ##
+     # Get all of the input BAM files
+     files = os.listdir(split_dir)
+     # Make the output directory
+     parent_dir = os.path.dirname(split_dir)
+     split_dir_base = os.path.basename(split_dir)
+     h5_dir = os.path.join(parent_dir, 'h5ads')
+     final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{split_dir_base}.h5ad')
+
+     if os.path.exists(f"{final_adata_path}.gz"):
+         print(f'{final_adata_path}.gz already exists, using existing adata object')  # Stop here if the final adata file already exists
+         return final_adata_path
+
+     tmp_dir = os.path.join(parent_dir, 'tmp')
+     make_dirs([h5_dir, tmp_dir])
+     # Keep file names that contain the BAM suffix, excluding BAM index files
+     bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
+     # Sort the file list by name and print the file names
+     bams.sort()
+     bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
+     print(f'Found the following BAMs: {bams}')
+     final_adata = None
+     ##########################################################################################
+
+     ##########################################################################################
+
+     ## need to fix this section
+     # Make a dictionary, keyed by modification type, that points to another dictionary keyed by
+     # unconverted record id. Each entry is a list of: 1) record length, 2) top strand conversion
+     # coordinates, 3) bottom strand conversion coordinates, 4) unconverted sequence string,
+     # 5) unconverted complement sequence
+     modification_dict = {}
+     # Init a dict, keyed by FASTA record, that points to the sequence string of the unconverted record
+     record_FASTA_dict = {}
+     # While populating the dictionary, also extract the longest sequence record in the input references
+     max_reference_length = 0
+     conversions = conversion_types[1:]
+     for conversion_type in conversions:
+         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list,
+         # 3) bottom strand coordinate list, 4) unconverted sequence string, 5) unconverted complement sequence
+         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
+         # Get the max reference length first; the N-padding below needs the final value
+         for record in modification_dict[conversion_type].keys():
+             if modification_dict[conversion_type][record][0] > max_reference_length:
+                 max_reference_length = modification_dict[conversion_type][record][0]
+
+     # Second pass: pad every record's unconverted sequences out to the longest reference with Ns
+     for conversion_type in conversions:
+         for record in modification_dict[conversion_type].keys():
+             mod_type, strand = record.split('_')[-2:]
+             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+             unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
+             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+             delta_max_length = max_reference_length - current_reference_length
+             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+             complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
+             record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
+     ##########################################################################################
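+     # Illustration of the record_FASTA_dict layout built above (hypothetical record name and values):
+     #   record_FASTA_dict['chr1_GpC_top'] == ['ACGT...NNN', 'TGCA...NNN', 'chr1',
+     #       'chr1_unconverted_top', <reference length>, <N-padding length>, 'GpC', 'top']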
+
+     ##########################################################################################
+     bam_alignment_stats_dict = {}
+     records_to_analyze = []
+     for bam_index, bam in enumerate(bam_path_list):
+         bam_alignment_stats_dict[bam_index] = {}
+         # Look at aligned read proportions in the BAM
+         aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
+         percent_aligned = aligned_reads_count*100 / (aligned_reads_count + unaligned_reads_count)
+         print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
+         bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
+         # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
+         for record in record_counts:
+             print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
+             if record_counts[record][1] >= mapping_threshold:
+                 records_to_analyze.append(record)
+                 bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
+     records_to_analyze = set(records_to_analyze)
+     ##########################################################################################
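+     # Note: records_to_analyze pools passing records across all BAMs, so a record that clears
+     # mapping_threshold in any one sample is analyzed for every sample.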
+
+     ##########################################################################################
+     # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+     # Save the file paths in the bam_record_ohe_files dict.
+     bam_record_ohe_files = {}
+
+     # Iterate over split BAMs
+     for bam_index, bam in enumerate(bam_path_list):
+         # Iterate over references to process
+         for record in records_to_analyze:
+             unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+             sample = bams[bam_index].split(sep=bam_suffix)[0]
+             chromosome = record_FASTA_dict[unconverted_record_name][2]
+             current_reference_length = record_FASTA_dict[unconverted_record_name][4]
+             mod_type = record_FASTA_dict[unconverted_record_name][6]
+             strand = record_FASTA_dict[unconverted_record_name][7]
+
+             # Extract the base identities of reads aligned to the record
+             fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
+
+             # Binarize the dictionary of positional identities
+             print('Binarizing base identities')
+             fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
+             rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
+             merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
+             # Convert the base identity dictionary to a dataframe
+             binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
+             sorted_index = sorted(binarized_base_identities_df.index)
+             binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
+
+             # Load an anndata object with the sample data
+             X = binarized_base_identities_df.values
+             adata = ad.AnnData(X, dtype=X.dtype)
+             if adata.shape[0] > 0:
+                 adata.obs_names = binarized_base_identities_df.index.astype(str)
+                 adata.var_names = binarized_base_identities_df.columns.astype(str)
+                 adata.obs['Sample'] = [sample] * len(adata)
+                 adata.obs['Reference'] = [chromosome] * len(adata)
+                 adata.obs['Strand'] = [strand] * len(adata)
+                 adata.obs['Dataset'] = [mod_type] * len(adata)
+                 adata.obs['Reference_dataset_strand'] = [f'{chromosome}_{mod_type}_{strand}'] * len(adata)
+                 adata.obs['Reference_strand'] = [f'{record}'] * len(adata)
+
+                 # Record whether each read mapped in the forward or reverse direction
+                 read_mapping_direction = []
+                 for read_id in adata.obs_names:
+                     if read_id in fwd_base_identities.keys():
+                         read_mapping_direction.append('fwd')
+                     elif read_id in rev_base_identities.keys():
+                         read_mapping_direction.append('rev')
+                     else:
+                         read_mapping_direction.append('unk')
+
+                 adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+                 # One hot encode the sequence string of the reads
+                 fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000)
+                 rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev", batch_size=100000)
+                 bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
+                 del fwd_base_identities, rev_base_identities
+
+                 one_hot_reads = {}
+                 n_rows_OHE = 5
+                 for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
+                     tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+                     one_hot_reads.update(tmp_ohe_dict)
+                     del tmp_ohe_dict
+
+                 read_names = list(one_hot_reads.keys())
+
+                 sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                 df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                 df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                 df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                 df_T = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                 df_N = np.zeros((len(sorted_index), sequence_length), dtype=int)
+
+                 # Process one-hot data into per-base dictionaries
+                 dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+                 for read_name, one_hot_array in one_hot_reads.items():
+                     one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+                     dict_A[read_name] = one_hot_array[0, :]
+                     dict_C[read_name] = one_hot_array[1, :]
+                     dict_G[read_name] = one_hot_array[2, :]
+                     dict_T[read_name] = one_hot_array[3, :]
+                     dict_N[read_name] = one_hot_array[4, :]
+
+                 del one_hot_reads
+                 gc.collect()
+
+                 # Fill the arrays
+                 for j, read_name in tqdm(enumerate(sorted_index), desc='Loading arrays of OHE reads', total=len(sorted_index)):
+                     df_A[j, :] = dict_A[read_name]
+                     df_C[j, :] = dict_C[read_name]
+                     df_G[j, :] = dict_G[read_name]
+                     df_T[j, :] = dict_T[read_name]
+                     df_N[j, :] = dict_N[read_name]
+
+                 del dict_A, dict_C, dict_G, dict_T, dict_N
+                 gc.collect()
+
+                 # Store the results in AnnData layers
+                 ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+                 for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                     adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
+                     ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
+
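+                 # Layout note for the arrays above: each one_hot_reads entry is a flat vector of
+                 # length 5*sequence_length that reshapes to (5, sequence_length) with rows ordered
+                 # A, C, G, T, N, so one_hot_array[1, j] == 1 means position j of the read is a C.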
+                 # Merge this sample/record AnnData into the running final object
+                 if final_adata is None:
+                     final_adata = adata
+                 else:
+                     final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+             else:
+                 print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+
+     # Set obs columns to type 'category'
+     for col in final_adata.obs.columns:
+         final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+     for record in records_to_analyze:
+         unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+         sequence = record_FASTA_dict[unconverted_record_name][0]
+         complement = record_FASTA_dict[unconverted_record_name][1]
+         chromosome = record_FASTA_dict[unconverted_record_name][2]
+         final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
+         final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
+         final_adata.uns[f'{chromosome}_FASTA_sequence'] = sequence
+
+     ######################################################################################################
+
+     ######################################################################################################
+     ## Export the final adata object
+     print('Saving initial draft of final adata')
+     final_adata.write_h5ad(final_adata_path)
+     return final_adata_path
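
For orientation, a minimal usage sketch of the new wrapper follows. The import path matches the module location in the file list above, but the file paths, threshold, and conversion types are hypothetical placeholders for an experiment of your own:

    from smftools.informatics.helpers.converted_BAM_to_adata import converted_BAM_to_adata

    # Hypothetical inputs; the first conversion type is treated as the unconverted reference type
    final_adata_path = converted_BAM_to_adata(
        converted_FASTA='refs/converted_reference.fa',
        split_dir='experiment/aligned_sorted_split',
        mapping_threshold=0.05,  # keep references receiving at least 5% of a sample's mapped reads
        experiment_name='example_run',
        conversion_types=['unconverted', 'GpC'],
        bam_suffix='.bam',
    )
    print(final_adata_path)  # e.g. experiment/h5ads/example_run_aligned_sorted_split.h5ad

The returned h5ad carries one row per read, per-base one-hot layers named 'A_binary_encoding' through 'N_binary_encoding', and obs columns such as Sample, Reference, Strand, and Read_mapping_direction.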