RefgenDetector 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,272 @@
1
+ import os
2
+ import sys
3
+ # Add the parent directory to the Python path
4
+ current_dir = os.path.dirname(os.path.abspath(__file__))
5
+ parent_dir = os.path.dirname(current_dir)
6
+ sys.path.insert(0, parent_dir)
7
+ from refgenDetector.reference_genome_dictionaries import *
8
+ from refgenDetector.exceptions.NoFileException import *
9
+ import argparse
10
+ import csv
11
+ import gzip
12
+ import pysam
13
+ import psutil
14
+ import time
15
+ from rich.console import Console
16
+
17
+ console = Console()
18
+
19
def intersection_targetfile_referencerepo(dict_SN_LN, reference_genome):
    """
    Collect the contig lengths shared by the target file and one reference genome entry.

    Args:
        dict_SN_LN (dict): contig info from the target file (SN: key, LN: value)
        reference_genome (dict): one reference entry; its "ref_gen", "build" and "species" keys are read

    Returns:
        tuple: (set of lengths present in both, the entry's build, the entry's species)
    """
    target_lengths = set(dict_SN_LN.values())
    reference_lengths = reference_genome["ref_gen"].values()
    shared_lengths = target_lengths.intersection(reference_lengths)
    build = reference_genome["build"]
    species = reference_genome["species"]
    return shared_lengths, build, species
34
+
35
def check_if_decoy(matches_info, target_file):
    """
    Decide whether matches to a secondary version are a real inconsistency or a random (decoy) match.

    The version with the most matching lengths is taken as the main one. A match to any other version counts
    as an inconsistency only when its length is greater than the threshold stored in min_values for the main
    version; shorter matches are treated as decoy contigs that match another version by chance.

    Args:
        matches_info (list): list of tuples with 3 positions: set of matching contig lengths, version where
            the contigs match, species of that version.
        target_file (str): path of the target file (only used in the error message)

    Returns:
        bool: True if an inconsistency was found (an error message is printed, one line per offending
        version), False otherwise so comparison() can report the version with most matches.
    """
    # Keep only the versions that matched at least one contig length.
    filtered_entries = [entry for entry in matches_info if entry[0]]
    if not filtered_entries:
        return False

    match = max(filtered_entries, key=lambda entry: len(entry[0]))  # version with most matches
    threshold = min_values[match[1]]

    # build -> contig names that matched it; kept per version so each one is reported under its own build
    # instead of under whichever version happened to be iterated last.
    inconsistencies = {}
    for version in filtered_entries:
        if version == match:
            continue
        long_matches = [ln for ln in version[0] if int(ln) > threshold]
        if not long_matches:
            continue
        contig_names = inconsistencies.setdefault(version[1], [])
        # The per-build dictionary is looked up by name among the module globals; it may be missing.
        ref_dict = globals().get(version[1])
        if isinstance(ref_dict, dict):
            for ln in long_matches:
                contig_names.extend(key for key, value in ref_dict.items() if value == ln)

    if not inconsistencies:
        return False

    console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold] Inconsistency found "
                  f"- file contains contigs from different genome versions[/red]")
    for build, contig_names in inconsistencies.items():
        console.print(f"[red]Contigs {contig_names} belong to {build}, but the rest belongs to"
                      f" {match[1]}[/red].")
    return True
69
+
70
+
71
def comparison(dict_SN_LN, target_file):
    """
    Infer and print the species and reference genome version of a header from its contig lengths.

    First the major release with the most matching contig lengths is selected, then, for GRCh37 and GRCh38,
    a more specific flavor is looked for.

    Args:
        dict_SN_LN (dict): dictionary with the contig (SN: key, LN: value) info from the target file
        target_file (str): path of the target file

    Returns:
        None. The species and reference genome version inferred are printed. An error is printed instead if:
        - The contigs in the target file are not in the database (and are not only mitochondrial contigs).
        - There are contigs belonging to more than one release/species and check_if_decoy() considers the
          secondary match too long to be a random decoy match.
    """
    matches_info = [intersection_targetfile_referencerepo(dict_SN_LN, major_releases[ref])
                    for ref in major_releases]
    matches_with_counts = [(len(matches), build, species) for matches, build, species in matches_info]
    # Major release with the maximum number of matches (first one wins on ties).
    max_match = max(matches_with_counts, key=lambda entry: entry[0])

    # Check all the matches belong to the same release version.
    multiple_matches = [entry for entry in matches_with_counts if entry[0] != 0]
    inconsistency = False
    if len(multiple_matches) > 1:
        first_two_builds = (multiple_matches[0][1], multiple_matches[1][1])
        # These pairs of versions share contig lengths, so matching both is expected. Only the exact pair is
        # exempted; a single hg17/hg18/rhemac build next to an unrelated one is still checked.
        shares_lengths = first_two_builds in (("hg17", "hg18"), ("rhemac3", "rhemac8"))
        if not shares_lengths:
            inconsistency = check_if_decoy(matches_info, target_file)
    if inconsistency:
        return

    if max_match[0] == 0:
        lengths = list(dict_SN_LN.values())
        mito_lengths = set(mit_contigs.values())
        if not lengths or any(ln not in mito_lengths for ln in lengths):
            console.print("[bold][red]Reference genome can't be inferred[/bold] - "
                          "The contigs in the file are not found in refgenDetector database[/red]")
        else:
            # Every contig is a known mitochondrial sequence: report the version each one belongs to.
            for ln in lengths:
                ref_version = next(key for key, value in mit_contigs.items() if value == ln)
                console.print(f"[bold]Species detected:[/bold] Homo sapiens \n"
                              f"[bold]Reference genome version :[/bold] {ref_version}")
                console.print("Note: Only the mitochondrial reference sequence is present. "
                              "Nuclear genome build cannot be determined.")

    elif max_match[1] == "GRCh37":  # check for GRCh37 flavors
        matches_flavors = [intersection_targetfile_referencerepo(dict_SN_LN, flavors_GRCh37[ref])
                           for ref in flavors_GRCh37]
        # Rank flavors by the NUMBER of matching lengths (comparing the sets themselves is a subset test).
        match_flavors = max(matches_flavors, key=lambda entry: len(entry[0]), default=None)
        if match_flavors is not None and match_flavors[0]:  # some flavor matched: print it
            console.print(f"[bold]Species detected:[/bold] {match_flavors[2]} "
                          f"[bold]\nReference genome version :[/bold] {match_flavors[1]}")
        else:  # no flavor inferred: print the major release
            console.print("[bold]Species detected:[/bold] Homo sapiens \n"
                          "[bold]Reference genome version :[/bold] GRCh37")

    elif max_match[1] == "GRCh38":  # check for GRCh38 flavors
        if any("HLA-" in key for key in dict_SN_LN):
            # contig names containing HLA- identify the hs38DH_extra flavor
            console.print("[bold]Species detected:[/bold] Homo sapiens \n[bold]"
                          "Reference genome version :[/bold] hs38DH_extra")
        elif set(dict_SN_LN.values()).intersection(verily_difGRCh38.values()):
            # Verily's unique lengths are present
            console.print("[bold]Species detected:[/bold] Homo sapiens \n[bold]"
                          "Reference genome version :[/bold] GRCh38_no_alt_plus_hs38d1")
        else:  # no GRCh38 flavor inferred: print the major release
            console.print("[bold]Species detected:[/bold] Homo sapiens \n"
                          "[bold]Reference genome version :[/bold] GRCh38")

    else:  # major releases with no considered flavors: report the best match itself
        console.print(f"[bold]Species detected:[/bold] {max_match[2]} "
                      f"\n[bold]Reference genome version:[/bold] {max_match[1]}")
148
+
149
+
150
+
151
+
152
def get_info_bamcram(header_bam_cram, target_file, md5, assembly):
    """
    Second function of the BAM/CRAM module. Reads the SQ (sequence dictionary) records of the header, builds the
    contig name/length dictionary and hands it to comparison(). When requested, the AS and M5 values found in
    the SQ records are printed first.

    Args:
        header_bam_cram: header object; only its .get("SQ", []) records are read
        target_file (str): path of the target file, forwarded to comparison()
        md5: truthy if the user chose -m (print the M5 values)
        assembly: truthy if the user chose -a (print one AS value)

    Returns:
        None. Results are printed.
    """
    sq_records = header_bam_cram.get("SQ", [])

    dict_SN_LN = {}
    for record in sq_records:
        dict_SN_LN[record["SN"]] = record["LN"]

    if assembly:  # the user chose -a
        assembly_names = {record["AS"] for record in sq_records if "AS" in record}
        if assembly_names:
            # only one of the distinct AS values is shown
            console.print(f"[bold]AS field:[/bold] {assembly_names.pop()}")

    if md5:  # the user chose -m
        checksums = {record["M5"] for record in sq_records if "M5" in record}
        if checksums:
            console.print(f"[bold]M5 fields:[/bold]{checksums}")

    comparison(dict_SN_LN, target_file)
179
+
180
+
181
def process_data_bamcram(target_file, md5, assembly):
    """
    First function of the BAM/CRAM module. It opens the BAM or CRAM provided by the user, extracts the header
    and passes it to get_info_bamcram().

    Args:
        target_file (str): path to the file
        md5: forwarded to get_info_bamcram() (-m option)
        assembly: forwarded to get_info_bamcram() (-a option)

    Returns:
        None. If the file cannot be opened an error message is printed and nothing else is done.
    """
    # Silence htslib while opening: https://github.com/pysam-developers/pysam/issues/939
    save = pysam.set_verbosity(0)
    try:
        bam_cram = pysam.AlignmentFile(target_file, "rb")
    except Exception as e:
        console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold][red] {e.__class__}, {e}")
        return  # without a file there is no header to analyse
    finally:
        # restore the previous verbosity whether or not the open succeeded
        pysam.set_verbosity(save)

    try:
        get_info_bamcram(bam_cram.header, target_file, md5, assembly)
    finally:
        bam_cram.close()
201
+
202
def get_info_txt(header_txt, md5, assembly):
    """
    Second function of the txt module. Extracts the @SQ (sequence dictionary) lines of a tab-separated header,
    builds the contig name/length dictionary and hands it to comparison(). When requested (-a / -m), the AS and
    M5 values found in the @SQ lines are printed afterwards.

    Fields are read by their TAG: prefix (SN, LN, AS, M5), so they may appear in any order within a line.

    Args:
        header_txt (io.TextIOWrapper): open text object; its .name is used in messages and results
        md5: truthy if the user chose -m
        assembly: truthy if the user chose -a

    Returns:
        None. Results are printed. If the file cannot be read, or an @SQ line lacks SN/LN or has a
        non-numeric LN, a message is printed and the function returns without calling comparison().
    """
    header_reader = csv.reader(header_txt, delimiter="\t")
    try:
        sq_lines = [line for line in header_reader if "@SQ" in line]
    except (UnicodeDecodeError, csv.Error) as e:
        print(f"File cannot be read ({type(e).__name__}: {e}). It is likely compressed, corrupted or the incorrect -t.")
        return  # nothing was parsed, so there is nothing to compare

    # One {TAG: value} dictionary per @SQ line; only the first ':' separates tag and value.
    sq_records = [dict(field.split(":", 1) for field in line if ":" in field) for line in sq_lines]

    try:
        # the dictionary values must be int due to the structure of the collection of reference dictionaries
        dict_SN_LN = {record["SN"]: int(record["LN"]) for record in sq_records}
    except KeyError as missing:
        print(f"Check the @SQ lines of your header {header_txt.name}: the {missing} field is missing")
        return
    except ValueError:
        print(f"Check the LN field of your header {header_txt.name} only contains numbers")
        return

    comparison(dict_SN_LN, header_txt.name)

    if assembly:  # the user chose -a
        assembly_values = [record["AS"] for record in sq_records if "AS" in record]
        if assembly_values:  # AS is present in the header
            console.print(f"[bold]AS field:[/bold] {assembly_values[0]}")
    if md5:  # the user chose -m
        # each contig is paired with its own checksum
        dict_M5 = {record["SN"]: record["M5"] for record in sq_records if "M5" in record}
        if dict_M5:
            console.print(f"[bold]MD5 fields:[/bold] {dict_M5}")
240
+
241
def process_data_txt(target_file, md5, assembly):
    """
    First function of the txt module. Opens the header file and passes it to get_info_txt().

    The file is first opened as plain text with the default encoding. A UnicodeError triggers a retry with
    iso-8859-1; an OSError triggers a retry through gzip (default encoding first, then iso-8859-1).

    Args:
        target_file (str): path to the file
        md5: forwarded to get_info_txt() (-m option)
        assembly: forwarded to get_info_txt() (-a option)

    Returns:
        None. A message is printed when the path is not a file or an unexpected error occurs.
    """
    try:
        if not os.path.isfile(target_file):
            raise NoFileException()
        with open(target_file, "r") as plain_handle:
            get_info_txt(plain_handle, md5, assembly)
    except UnicodeError:
        # second attempt: same file, latin-1 encoding
        with open(target_file, "r", encoding="iso-8859-1") as latin_handle:
            get_info_txt(latin_handle, md5, assembly)
    except OSError:
        # fall back to reading the file as gzip-compressed text
        try:
            with gzip.open(target_file, "rt") as gz_handle:
                get_info_txt(gz_handle, md5, assembly)
        except UnicodeError:
            with gzip.open(target_file, "rt", encoding="iso-8859-1") as gz_latin_handle:
                get_info_txt(gz_latin_handle, md5, assembly)
    except NoFileException:
        console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold][red] The path provided is not "
                      f"found or you are using the incorrect --type option.")
    except Exception as e:
        print("Unexpected error:\n", e)
@@ -0,0 +1,34 @@
1
+
2
+ """
3
+ Original dictionary:
4
+ chromosomes = {
5
+ "chr1": ["1", "chr1", "CM000663.1", "NC_000001.10", "CM000663.2", "NC_000001.11", "CP068277.2", "NC_060925.1"],
6
+ "chr2": ["2", "chr2", "CM000664.1", "NC_000002.11", "CM000664.2", "NC_000002.12", "CP068276.2", "NC_060926.1"],
7
+ "chr3": ["3", "chr3", "CM000665.1", "NC_000003.11", "CM000665.2", "NC_000003.12", "CP068275.2", "NC_060927.1"],
8
+ "chr4": ["4", "chr4", "CM000666.1", "NC_000004.11", "CM000666.2", "NC_000004.12", "CP068274.2", "NC_060928.1"],
9
+ "chr5": ["5", "chr5", "CM000667.1", "NC_000005.9", "CM000667.2", "NC_000005.10", "CP068273.2", "NC_060929.1"],
10
+ "chr6": ["6", "chr6", "CM000668.1", "NC_000006.11", "CM000668.2", "NC_000006.12", "CP068272.2", "NC_060930.1"],
11
+ "chr7": ["7", "chr7", "CM000669.1", "NC_000007.13", "CM000669.2", "NC_000007.14", "CP068271.2", "NC_060931.1"],
12
+ "chr8": ["8", "chr8", "CM000670.1", "NC_000008.10", "CM000670.2", "NC_000008.11", "CP068270.2", "NC_060932.1"],
13
+ "chr9": ["9", "chr9", "CM000671.1", "NC_000009.11", "CM000671.2", "NC_000009.12", "CP068269.2", "NC_060933.1"],
14
+ "chr10": ["10", "chr10", "CM000672.1", "NC_000010.10", "CM000672.2", "NC_000010.11", "CP068268.2", "NC_060934.1"],
15
+ "chr11": ["11", "chr11", "CM000673.1", "NC_000011.9", "CM000673.2", "NC_000011.10", "CP068267.2", "NC_060935.1"],
16
+ "chr12": ["12", "chr12", "CM000674.1", "NC_000012.11", "CM000674.2", "NC_000012.12", "CP068266.2", "NC_060936.1"],
17
+ "chr13": ["13", "chr13", "CM000675.1", "NC_000013.10", "CM000675.2", "NC_000013.11", "CP068265.2", "NC_060937.1"],
18
+ "chr14": ["14", "chr14", "CM000676.1", "NC_000014.8", "CM000676.2", "NC_000014.9", "CP068264.2", "NC_060938.1"],
19
+ "chr15": ["15", "chr15", "CM000677.1", "NC_000015.9", "CM000677.2", "NC_000015.10", "CP068263.2", "NC_060939.1"],
20
+ "chr16": ["16", "chr16", "CM000678.1", "NC_000016.9", "CM000678.2", "NC_000016.10", "CP068262.2", "NC_060940.1"],
21
+ "chr17": ["17", "chr17", "CM000679.1", "NC_000017.10", "CM000679.2", "NC_000017.11", "CP068261.2", "NC_060941.1"],
22
+ "chr18": ["18", "chr18", "CM000680.1", "NC_000018.9", "CM000680.2", "NC_000018.10", "CP068260.2", "NC_060942.1"],
23
+ "chr19": ["19", "chr19", "CM000681.1", "NC_000019.9", "CM000681.2", "NC_000019.10", "CP068259.2", "NC_060943.1"],
24
+ "chr20": ["20", "chr20", "CM000682.1", "NC_000020.10", "CM000682.2", "NC_000020.11", "CP068258.2", "NC_060944.1"],
25
+ "chr21": ["21", "chr21", "CM000683.1", "NC_000021.8", "CM000683.2", "NC_000021.9", "CP068257.2", "NC_060945.1"],
26
+ "chr22": ["22", "chr22", "CM000684.1", "NC_000022.10", "CM000684.2", "NC_000022.11", "CP068256.2", "NC_060946.1"],
27
+ "chrX": ["X", "chrX", "CM000685.1", "NC_000023.10", "CM000685.2", "NC_000023.11", "CP068255.2", "NC_060947.1"],
28
+ "chrY": ["Y", "chrY", "CM000686.1", "NC_000024.9", "CM000686.2", "NC_000024.10", "CP086569.2", "NC_060948.1"]
29
+ }
30
+
31
+ chromosome_map = {v: k for k, lst in chromosomes.items() for v in lst}
32
+ print(chromosome_map)"""
33
+
34
# Aliases of every chromosome, in the order: bare name, chr-prefixed name, then the six accessions
# (CM*.1, NC_0000*, CM*.2, NC_0000*, CP*, NC_0609*) exactly as they appear in the mapping.
_CHROMOSOME_ALIASES = {
    "chr1": ["1", "chr1", "CM000663.1", "NC_000001.10", "CM000663.2", "NC_000001.11", "CP068277.2", "NC_060925.1"],
    "chr2": ["2", "chr2", "CM000664.1", "NC_000002.11", "CM000664.2", "NC_000002.12", "CP068276.2", "NC_060926.1"],
    "chr3": ["3", "chr3", "CM000665.1", "NC_000003.11", "CM000665.2", "NC_000003.12", "CP068275.2", "NC_060927.1"],
    "chr4": ["4", "chr4", "CM000666.1", "NC_000004.11", "CM000666.2", "NC_000004.12", "CP068274.2", "NC_060928.1"],
    "chr5": ["5", "chr5", "CM000667.1", "NC_000005.9", "CM000667.2", "NC_000005.10", "CP068273.2", "NC_060929.1"],
    "chr6": ["6", "chr6", "CM000668.1", "NC_000006.11", "CM000668.2", "NC_000006.12", "CP068272.2", "NC_060930.1"],
    "chr7": ["7", "chr7", "CM000669.1", "NC_000007.13", "CM000669.2", "NC_000007.14", "CP068271.2", "NC_060931.1"],
    "chr8": ["8", "chr8", "CM000670.1", "NC_000008.10", "CM000670.2", "NC_000008.11", "CP068270.2", "NC_060932.1"],
    "chr9": ["9", "chr9", "CM000671.1", "NC_000009.11", "CM000671.2", "NC_000009.12", "CP068269.2", "NC_060933.1"],
    "chr10": ["10", "chr10", "CM000672.1", "NC_000010.10", "CM000672.2", "NC_000010.11", "CP068268.2", "NC_060934.1"],
    "chr11": ["11", "chr11", "CM000673.1", "NC_000011.9", "CM000673.2", "NC_000011.10", "CP068267.2", "NC_060935.1"],
    "chr12": ["12", "chr12", "CM000674.1", "NC_000012.11", "CM000674.2", "NC_000012.12", "CP068266.2", "NC_060936.1"],
    "chr13": ["13", "chr13", "CM000675.1", "NC_000013.10", "CM000675.2", "NC_000013.11", "CP068265.2", "NC_060937.1"],
    "chr14": ["14", "chr14", "CM000676.1", "NC_000014.8", "CM000676.2", "NC_000014.9", "CP068264.2", "NC_060938.1"],
    "chr15": ["15", "chr15", "CM000677.1", "NC_000015.9", "CM000677.2", "NC_000015.10", "CP068263.2", "NC_060939.1"],
    "chr16": ["16", "chr16", "CM000678.1", "NC_000016.9", "CM000678.2", "NC_000016.10", "CP068262.2", "NC_060940.1"],
    "chr17": ["17", "chr17", "CM000679.1", "NC_000017.10", "CM000679.2", "NC_000017.11", "CP068261.2", "NC_060941.1"],
    "chr18": ["18", "chr18", "CM000680.1", "NC_000018.9", "CM000680.2", "NC_000018.10", "CP068260.2", "NC_060942.1"],
    "chr19": ["19", "chr19", "CM000681.1", "NC_000019.9", "CM000681.2", "NC_000019.10", "CP068259.2", "NC_060943.1"],
    "chr20": ["20", "chr20", "CM000682.1", "NC_000020.10", "CM000682.2", "NC_000020.11", "CP068258.2", "NC_060944.1"],
    "chr21": ["21", "chr21", "CM000683.1", "NC_000021.8", "CM000683.2", "NC_000021.9", "CP068257.2", "NC_060945.1"],
    "chr22": ["22", "chr22", "CM000684.1", "NC_000022.10", "CM000684.2", "NC_000022.11", "CP068256.2", "NC_060946.1"],
    "chrX": ["X", "chrX", "CM000685.1", "NC_000023.10", "CM000685.2", "NC_000023.11", "CP068255.2", "NC_060947.1"],
    "chrY": ["Y", "chrY", "CM000686.1", "NC_000024.9", "CM000686.2", "NC_000024.10", "CP086569.2", "NC_060948.1"],
}

# Lookup from any alias to its chr-prefixed chromosome name.
chromosome_map = {}
for _chromosome, _aliases in _CHROMOSOME_ALIASES.items():
    for _alias in _aliases:
        chromosome_map[_alias] = _chromosome
@@ -0,0 +1,240 @@
1
+ import sys
2
+ import json
3
+ from pathlib import Path
4
+ from reference_genome_dictionaries import major_releases
5
+
6
# Shallow copy of the reference dictionary taken before any custom reference is merged into it.
DEFAULT_MAJOR_RELEASES = dict(major_releases)

# JSON file with the user-added references. The path is relative, so it is resolved against the
# current working directory of the process.
CUSTOM_DB = Path("custom_references.json")

# At import time, merge any previously saved custom references into the shared dictionary.
if CUSTOM_DB.exists():
    with open(CUSTOM_DB) as f:
        custom_refs = json.load(f)
    major_releases.update(custom_refs)
15
+
16
+
17
def load_from_fai(file_path):
    """
    Read a .fai index and return its contigs.

    Blank lines are skipped. Each remaining line is split on tabs; the first column is the contig name and
    the second its length.

    Args:
        file_path (str): path to the .fai file to load

    Returns:
        dict: contig names as keys, contig lengths (int) as values.

    Raises:
        ValueError: if a line has fewer than two columns, a length is not an integer, or no contig is found.
    """
    lengths_by_contig = {}

    with open(file_path) as fai_handle:
        for raw_line in fai_handle:
            stripped = raw_line.strip()
            if not stripped:
                continue

            columns = stripped.split("\t")
            if len(columns) < 2:
                raise ValueError(f"Invalid .fai line: {raw_line}")

            name, length = columns[0], columns[1]
            lengths_by_contig[name] = int(length)

    if len(lengths_by_contig) == 0:
        raise ValueError("Empty .fai file")

    return lengths_by_contig
45
+
46
+
47
def load_custom_db():
    """
    Read the custom reference database from the CUSTOM_DB JSON file.

    Returns:
        dict: the parsed content of the file, or an empty dictionary when the file does not exist.
    """
    if not CUSTOM_DB.exists():
        return {}
    with open(CUSTOM_DB) as db_handle:
        stored_references = json.load(db_handle)
    return stored_references
58
+
59
+
60
def save_custom_db(db):
    """
    Write the custom reference database to the CUSTOM_DB JSON file, replacing its previous content.

    Args:
        db (dict): the custom references to store; it is serialised as JSON with a 4-space indent.
    """
    with open(CUSTOM_DB, "w") as db_handle:
        json.dump(db, db_handle, indent=4)
69
+
70
+
71
def get_all_references():
    """
    Build a new dictionary with the references in major_releases plus the custom ones read from disk.

    Returns:
        dict: reference names as keys and their data as values; on a name clash the custom entry wins.
    """
    custom_references = load_custom_db()
    return {**major_releases, **custom_references}
81
+
82
+
83
def compare_contigs(c1, c2):
    """
    Tell whether two contig dictionaries are equal (same contig names with the same lengths).

    Args:
        c1 (dict): contig dictionary of the new reference (contig name -> length).
        c2 (dict): contig dictionary of an existing reference (contig name -> length).

    Returns:
        bool: the result of comparing both dictionaries with ==.
    """
    identical = c1 == c2
    return identical
91
+
92
+
93
def find_matching_reference(new_contigs, all_refs):
    """
    Look for an existing reference whose contigs are the same as the new ones, to avoid adding duplicates.

    Args:
        new_contigs (dict): contig dictionary of the new reference (contig name -> length).
        all_refs (dict): all references (reference name -> data); the contig dictionary of each one is read
            from its "ref_gen" key.

    Returns:
        str | None: the name of the first matching reference, or None if there is no match.
    """
    matching_names = (
        ref_name
        for ref_name, ref_data in all_refs.items()
        if compare_contigs(new_contigs, ref_data["ref_gen"])
    )
    return next(matching_names, None)
108
+
109
+
110
def add_reference(ref_name, species, fai_file):
    """
    Store a new reference in the custom database, unless its contigs match an existing reference.

    Args:
        ref_name (str): name of the new reference; it is also stored as its "build".
        species (str): species of the new reference.
        fai_file (str): path to the .fai file with the contigs (contig_name \\t contig_length \\t ...).

    Returns:
        None. If a matching reference exists, its name is printed and nothing is saved; otherwise the
        reference is saved and a success message is printed. An existing custom entry with the same name is
        overwritten after a warning.

    Raises:
        FileNotFoundError: if fai_file does not exist.
        ValueError: if fai_file does not have the .fai suffix.
    """
    fai_path = Path(fai_file)
    if not fai_path.exists():
        raise FileNotFoundError(f"File not found: {fai_path}")
    if fai_path.suffix != ".fai":
        raise ValueError("Input must be a .fai file")

    print(f"Loading contigs from: {fai_path}")
    new_contigs = load_from_fai(fai_path)

    # Duplicate detection against default + custom references
    duplicate_of = find_matching_reference(new_contigs, get_all_references())
    if duplicate_of:
        print(f"This reference matches existing reference: {duplicate_of}")
        print("Aborting to avoid duplication.")
        return

    custom_db = load_custom_db()
    if ref_name in custom_db:
        print(f"Warning: '{ref_name}' already exists and will be overwritten")

    entry = {"ref_gen": new_contigs, "build": ref_name, "species": species}
    custom_db[ref_name] = entry
    save_custom_db(custom_db)

    print(f"Reference '{ref_name}' added successfully.")
156
+
157
+
158
def list_references():
    """
    Print every available reference (default and custom) followed by the total count.

    Each line shows the reference name, its species and its origin: "custom" when the name is present in the
    custom database, "default" otherwise.

    Args: None
    Returns: None
    """
    default = DEFAULT_MAJOR_RELEASES
    custom_names = load_custom_db()
    every_reference = get_all_references()

    print("\nAvailable references:\n")

    for ref_name, ref_data in every_reference.items():
        if ref_name in custom_names:
            origin = "custom"
        else:
            origin = "default"
        print(f"- {ref_name} ({ref_data['species']}) [{origin}]")

    print(f"\nTotal: {len(every_reference)} references")
176
+
177
+
178
def remove_reference(ref_name):
    """
    Delete a reference from the custom database and save the database.

    Only names present in the custom database can be removed; for any other name a "not found" message is
    printed and the database is left untouched.

    Args:
        ref_name (str): name of the custom reference to remove.

    Returns:
        None. A message reporting the outcome is printed.
    """
    custom_db = load_custom_db()

    if ref_name in custom_db:
        custom_db.pop(ref_name)
        save_custom_db(custom_db)
        print(f"Removed reference '{ref_name}'")
    else:
        print(f"'{ref_name}' not found in custom references.")
197
+
198
+
199
def main():
    """
    Command-line interface for managing the reference database. It supports three commands:
    - add <name> <species> <fai>: adds a new reference to the custom database from a .fai file.
    - list: lists all available references, their origin and the total number of references.
    - remove <name>: removes a reference from the custom database by its name.

    Exits with status 1 on any usage error: no command, wrong number of arguments, unknown command, or an
    input file rejected by add_reference().
    """
    if len(sys.argv) < 2:
        print(
            "Usage:\n"
            "  add <name> <species> <fai>\n"
            "  list\n"
            "  remove <name>"
        )
        sys.exit(1)

    command = sys.argv[1]

    if command == "add":
        if len(sys.argv) != 5:
            print("Usage: add <name> <species> <genome.fai>")
            sys.exit(1)
        try:
            add_reference(sys.argv[2], sys.argv[3], sys.argv[4])
        except (FileNotFoundError, ValueError) as err:
            # missing file, wrong suffix or malformed .fai: report it without a traceback
            print(f"Error: {err}")
            sys.exit(1)

    elif command == "list":
        list_references()

    elif command == "remove":
        if len(sys.argv) != 3:
            print("Usage: remove <name>")
            sys.exit(1)
        remove_reference(sys.argv[2])

    else:
        print(f"Unknown command: {command}")
        # same non-zero status as the other usage errors, so scripts can detect the failure
        sys.exit(1)


if __name__ == "__main__":
    main()