RefgenDetector 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- refgenDetector/__init__.py +0 -0
- refgenDetector/aligment_files.py +272 -0
- refgenDetector/chromosomes_dict.py +34 -0
- refgenDetector/ref_manager.py +240 -0
- refgenDetector/reference_genome_dictionaries.py +820 -0
- refgenDetector/refgenDetector_main.py +114 -0
- refgenDetector/variant_files.py +363 -0
- refgendetector-3.0.0.dist-info/METADATA +300 -0
- refgendetector-3.0.0.dist-info/RECORD +13 -0
- refgendetector-3.0.0.dist-info/WHEEL +5 -0
- refgendetector-3.0.0.dist-info/entry_points.txt +2 -0
- refgendetector-3.0.0.dist-info/licenses/LICENSE +3 -0
- refgendetector-3.0.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
# Add the parent directory of this package folder to the front of the Python path,
# so that the absolute `refgenDetector.*` imports below can be resolved from it.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
|
|
7
|
+
from refgenDetector.reference_genome_dictionaries import *
|
|
8
|
+
from refgenDetector.exceptions.NoFileException import *
|
|
9
|
+
import argparse
|
|
10
|
+
import csv
|
|
11
|
+
import gzip
|
|
12
|
+
import pysam
|
|
13
|
+
import psutil
|
|
14
|
+
import time
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
|
|
17
|
+
# Shared rich console; the functions of this module print their results and errors through it.
console = Console()
|
|
18
|
+
|
|
19
|
+
def intersection_targetfile_referencerepo(dict_SN_LN, reference_genome):
    """
    Collect the contig lengths shared by the target file and one reference genome entry.

    Args:
        dict_SN_LN (dict): contigs of the target file (SN: key, LN: value)
        reference_genome (dict): one reference entry with the keys "ref_gen" (contig name -> length),
            "build" and "species"

    Returns:
        tuple: (set with the LN values found in both the target file and the reference, build, species)
    """
    target_lengths = set(dict_SN_LN.values())
    reference_lengths = reference_genome["ref_gen"].values()
    shared_lengths = target_lengths.intersection(reference_lengths)
    build = reference_genome["build"]
    species = reference_genome["species"]
    return shared_lengths, build, species
|
|
34
|
+
|
|
35
|
+
def check_if_decoy(matches_info, target_file):
    """
    Decide whether matches against several genome versions are a real inconsistency or just decoys.

    The version with the most matching contig lengths is taken as the main version. A length matching any
    other version counts as an inconsistency only when it is greater than the threshold stored in
    min_values for the main version; shorter lengths are assumed to be decoy contigs that match another
    version by chance.

    Args:
        matches_info (list): list of tuples (matching lengths, build, species), one per reference version
        target_file (str): path of the target file, only used in the error message

    Returns:
        bool: True if an inconsistency was found (an error message naming the offending contigs and the
        versions they belong to is printed), False otherwise so that comparison() can report the version
        with most matches.
    """
    incosistency_found = False
    filtered_entries = [entry for entry in matches_info if entry[0]]  # versions with at least one match
    match = max(filtered_entries, key=lambda entry: len(entry[0]))  # version with most matching LN values
    inconsistency_matches = []
    inconsistent_builds = []  # builds that own the offending contigs, in the order they were found
    for version in filtered_entries:
        if version == match:
            continue
        ref_dict = globals().get(version[1])
        if not isinstance(ref_dict, dict):
            # No module-level dictionary is named after this build: fall back to the contigs registered
            # for the build in major_releases instead of failing on None.
            ref_dict = next((release["ref_gen"] for release in major_releases.values()
                             if release["build"] == version[1]), {})
        for ln in version[0]:
            if int(ln) > min_values[match[1]]:  # long enough to be a chromosome, not a decoy
                inconsistency_matches.extend([key for key, value in ref_dict.items() if value == ln])
                if version[1] not in inconsistent_builds:
                    inconsistent_builds.append(version[1])
                incosistency_found = True
    if incosistency_found:
        console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold] Inconsistency found "
                      f"- file contains contigs from different genome versions[/red]")
        # Report the builds recorded above; the loop variable would only name the last version visited.
        console.print(f"[red]Contigs {inconsistency_matches} belong to {', '.join(inconsistent_builds)}, "
                      f"but the rest belongs to"
                      f" {match[1]}[/red].")
    return incosistency_found
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def comparison(dict_SN_LN, target_file):
    """
    Infer the species and reference genome version of a header and print the result.

    First the major release with most matching contig lengths is selected, then, for GRCh37 and GRCh38,
    a flavor is reported when one can be inferred.

    Args:
        dict_SN_LN (dict): dictionary with the contig (SN: key, LN: value) info from the target file
        target_file (str): path of the target file, passed on to check_if_decoy() for its error message

    Returns:
        None. The species and reference genome version are printed. An error is printed instead if:
            - no contig length is found in the database (and the file is not mitochondrial-only)
            - check_if_decoy() reports contigs belonging to more than one release/species
    """
    matches_info = [intersection_targetfile_referencerepo(dict_SN_LN, major_releases[ref]) for ref in major_releases]
    matches_with_counts = [(len(matches), build, species) for matches, build, species in matches_info]
    max_match = max(matches_with_counts, key=lambda entry: entry[0])  # major release with most matches
    incosistency = False

    # check all the matches belong to the same release version
    multiple_matches = [entry for entry in matches_with_counts if entry[0] != 0]

    if len(multiple_matches) > 1:
        # NOTE(review): these conditions skip the check whenever the first build is hg17 OR the second is
        # hg18 (same for rhemac3/rhemac8), not only for that exact pair - confirm this is intended.
        if multiple_matches[0][1] != "hg17" and multiple_matches[1][1] != "hg18":  # these versions share contig lengths
            if multiple_matches[0][1] != "rhemac3" and multiple_matches[1][1] != "rhemac8":
                incosistency = check_if_decoy(matches_info, target_file)

    if incosistency:
        return  # check_if_decoy() already printed the error

    if max_match[0] == 0:
        lengths = list(dict_SN_LN.values())
        known_mit_lengths = list(mit_contigs.values())
        if not lengths or any(length not in known_mit_lengths for length in lengths):
            console.print(f"[bold][red]Reference genome can't be inferred[/bold] - "
                          "The contigs in the file are not found in refgenDetector database[red]")
        else:
            # every contig is a known mitochondrial sequence: report it once
            ref_version = next(key for key, value in mit_contigs.items() if value == lengths[-1])
            console.print(f"[bold]Species detected:[/bold] Homo sapiens \n[bold]Reference genome version :[/bold] {ref_version}")
            console.print(f"Note: Only the mitochondrial reference sequence is present. Nuclear genome build cannot be determined.")

    elif max_match[1] == "GRCh37":  # check for GRCh37 flavors
        matches_flavors = [
            intersection_targetfile_referencerepo(dict_SN_LN, flavors_GRCh37[ref])
            for ref in flavors_GRCh37
        ]
        # Rank flavors by the NUMBER of matching lengths (sets only compare by subset relation).
        match_flavors = max(matches_flavors, key=lambda entry: len(entry[0]), default=None)
        if match_flavors and match_flavors[0]:  # some flavor-specific contig matched
            console.print(f"[bold]Species detected:[/bold] {match_flavors[2]} "
                          f"[bold]\nReference genome version :[/bold] {match_flavors[1]}")
        else:  # if there wasn't any flavor inferred, the major release is printed
            console.print(f"[bold]Species detected:[/bold] Homo sapiens \n["
                          f"bold]Reference genome version :[/bold] GRCh37")

    elif max_match[1] == "GRCh38":  # checks for GRCh38 flavors
        if any("HLA-" in key for key in dict_SN_LN.keys()):
            # first checks if the contigs contain in their names HLA-
            console.print(f"[bold]Species detected:[/bold] Homo sapiens \n[bold]"
                          f"Reference genome version :[/bold] hs38DH_extra")
        elif set(dict_SN_LN.values()).intersection(verily_difGRCh38.values()):
            # checks if the Verily's unique lengths are present
            console.print(f"[bold]Species detected:[/bold] Homo sapiens \n[bold]"
                          f"Reference genome version :[/bold] GRCh38_no_alt_plus_hs38d1")
        else:  # if no GRCh38 flavor is inferred, the major release is printed
            console.print(f"[bold]Species detected:[/bold] Homo sapiens \n["
                          f"bold]Reference genome version :[/bold] GRCh38")
    else:  # print the major release with most matches (no flavors are considered for it)
        console.print(f"[bold]Species detected:[/bold] {max_match[2]} "
                      f"\n[bold]Reference genome version:[/bold] {max_match[1]}")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def get_info_bamcram(header_bam_cram, target_file, md5, assembly):
    """
    Second function of the BAM/CRAM module. Reads the SQ (sequence dictionary) records of the header, builds the
    contig dictionary and hands it to comparison(). When requested (-a / -m) and present, the AS and M5 values
    are printed first.

    Args:
        header_bam_cram (pysam.libcalignmentfile.AlignmentHeader): header of the BAM/CRAM file
        target_file (str): path of the target file
        md5 (bool): print the M5 values found in the SQ records
        assembly (bool): print one AS value found in the SQ records

    Returns:
        None. The results are printed.
    """
    sq_records = header_bam_cram.get("SQ", [])

    contig_lengths = {}
    for record in sq_records:
        contig_lengths[record["SN"]] = record["LN"]

    if assembly:  # if the user chose -a
        as_values = {record["AS"] for record in sq_records if "AS" in record}
        if as_values:
            console.print(f"[bold]AS field:[/bold] {as_values.pop()}")

    if md5:  # if the user chose -m
        m5_values = {record["M5"] for record in sq_records if "M5" in record}
        if m5_values:
            console.print(f"[bold]M5 fields:[/bold]{m5_values}")

    comparison(contig_lengths, target_file)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def process_data_bamcram(target_file, md5, assembly):
    """
    First function of the BAM/CRAM module. It opens the BAM or CRAM provided by the user, extracts the header
    and passes it to get_info_bamcram().

    Args:
        target_file (str): path to the file
        md5 (bool): forwarded to get_info_bamcram() (-m option)
        assembly (bool): forwarded to get_info_bamcram() (-a option)

    Returns:
        None. If the file cannot be opened an error is printed and the file is skipped.
    """
    save = pysam.set_verbosity(0)  # https://github.com/pysam-developers/pysam/issues/939
    try:
        bam_cram = pysam.AlignmentFile(target_file, "rb")
    except Exception as e:
        console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold][red] {e.__class__}, {e}")
        return  # nothing was opened, so there is no header to analyse
    finally:
        pysam.set_verbosity(save)  # restore the verbosity even when opening failed

    with bam_cram:  # close the file once the header has been processed
        get_info_bamcram(bam_cram.header, target_file, md5, assembly)
|
|
201
|
+
|
|
202
|
+
def get_info_txt(header_txt, md5, assembly):
    """
    Second function of the txt module. Extracts the @SQ (sequence dictionary) lines of the header, builds the
    contig dictionary and hands it to comparison(). When requested (-a / -m) and present, the AS and M5 values
    are printed afterwards.

    Args:
        header_txt (io.TextIOWrapper): open text header
        md5 (bool): print the M5 value of every contig that has one
        assembly (bool): print the first AS value found

    Returns:
        None. The results are printed. If the header cannot be read, or an @SQ line lacks a usable SN/LN
        field, a message is printed and the analysis of this file stops.
    """
    header_reader = csv.reader(header_txt, delimiter="\t")
    try:
        sq_lines = [line for line in header_reader if "@SQ" in line]
    except (UnicodeDecodeError, csv.Error) as e:
        print(f"File cannot be read ({type(e).__name__}: {e}). It is likely compressed, corrupted or the incorrect -t.")
        return

    # Parse every "TAG:value" field by its tag instead of relying on the position of SN and LN in the line.
    records = [
        {field[:2]: field[3:] for field in line if len(field) >= 3 and field[2] == ":"}
        for line in sq_lines
    ]

    try:
        # the dictionary values must be int due to the structure of the collection of reference dictionaries
        dict_SN_LN = {tags["SN"]: int(tags["LN"]) for tags in records}
    except ValueError:
        print(f"Check the LN field of your header {header_txt.name} only contains numbers")
        return
    except KeyError as missing:
        print(f"Check every @SQ line of your header {header_txt.name} contains the {missing.args[0]} field")
        return

    comparison(dict_SN_LN, header_txt.name)

    if assembly:  # if the user chose -a
        as_value = next((tags["AS"] for tags in records if "AS" in tags), None)
        if as_value is not None:  # if AS is present in the header
            console.print(f"[bold]AS field:[/bold] {as_value}")
    if md5:  # if the user chose -m
        # each contig is paired with the M5 value of its own @SQ line
        dict_M5 = {tags["SN"]: tags["M5"] for tags in records if "M5" in tags}
        if dict_M5:
            console.print(f"[bold]MD5 fields:[/bold] {dict_M5}")
|
|
240
|
+
|
|
241
|
+
def process_data_txt(target_file, md5, assembly):
    """
    First function of the txt module. It opens the header and passes it to get_info_txt().

    The file is first opened as plain text with the default encoding. On a UnicodeError it is reopened as
    iso-8859-1; on an OSError it is opened with gzip (again retrying with iso-8859-1 on a UnicodeError).

    Args:
        target_file (str): path to the file
        md5 (bool): forwarded to get_info_txt() (-m option)
        assembly (bool): forwarded to get_info_txt() (-a option)

    Returns:
        None. An error is printed if the path is not a file or an unexpected exception is raised.
    """
    def _analyse(opener, mode, **open_options):
        # Open the header with the given opener and hand the text object to get_info_txt().
        with opener(target_file, mode, **open_options) as header_txt:
            get_info_txt(header_txt, md5, assembly)

    try:
        if not os.path.isfile(target_file):
            raise NoFileException()
        _analyse(open, "r")
    except UnicodeError:
        _analyse(open, "r", encoding="iso-8859-1")
    except OSError:
        try:
            _analyse(gzip.open, "rt")
        except UnicodeError:
            _analyse(gzip.open, "rt", encoding="iso-8859-1")
    except NoFileException:
        console.print(f"[bold]File:[/bold] {target_file} \n[bold][red]Error:[/bold][red] The path provided is not "
                      f"found or you are using the incorrect --type option.")
    except Exception as e:
        print("Unexpected error:\n", e)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
"""
Lookup table that translates the different names of the human chromosomes 1-22, X and Y to the "chrN" notation.

chromosome_map is derived from the alias table below: every alias is a key and its "chrN" name is the value.
"""

# Aliases accepted for each chromosome, in this order: bare name, "chr"-prefixed name and three pairs of
# accession identifiers.
_CHROMOSOME_ALIASES = {
    "chr1": ["1", "chr1", "CM000663.1", "NC_000001.10", "CM000663.2", "NC_000001.11", "CP068277.2", "NC_060925.1"],
    "chr2": ["2", "chr2", "CM000664.1", "NC_000002.11", "CM000664.2", "NC_000002.12", "CP068276.2", "NC_060926.1"],
    "chr3": ["3", "chr3", "CM000665.1", "NC_000003.11", "CM000665.2", "NC_000003.12", "CP068275.2", "NC_060927.1"],
    "chr4": ["4", "chr4", "CM000666.1", "NC_000004.11", "CM000666.2", "NC_000004.12", "CP068274.2", "NC_060928.1"],
    "chr5": ["5", "chr5", "CM000667.1", "NC_000005.9", "CM000667.2", "NC_000005.10", "CP068273.2", "NC_060929.1"],
    "chr6": ["6", "chr6", "CM000668.1", "NC_000006.11", "CM000668.2", "NC_000006.12", "CP068272.2", "NC_060930.1"],
    "chr7": ["7", "chr7", "CM000669.1", "NC_000007.13", "CM000669.2", "NC_000007.14", "CP068271.2", "NC_060931.1"],
    "chr8": ["8", "chr8", "CM000670.1", "NC_000008.10", "CM000670.2", "NC_000008.11", "CP068270.2", "NC_060932.1"],
    "chr9": ["9", "chr9", "CM000671.1", "NC_000009.11", "CM000671.2", "NC_000009.12", "CP068269.2", "NC_060933.1"],
    "chr10": ["10", "chr10", "CM000672.1", "NC_000010.10", "CM000672.2", "NC_000010.11", "CP068268.2", "NC_060934.1"],
    "chr11": ["11", "chr11", "CM000673.1", "NC_000011.9", "CM000673.2", "NC_000011.10", "CP068267.2", "NC_060935.1"],
    "chr12": ["12", "chr12", "CM000674.1", "NC_000012.11", "CM000674.2", "NC_000012.12", "CP068266.2", "NC_060936.1"],
    "chr13": ["13", "chr13", "CM000675.1", "NC_000013.10", "CM000675.2", "NC_000013.11", "CP068265.2", "NC_060937.1"],
    "chr14": ["14", "chr14", "CM000676.1", "NC_000014.8", "CM000676.2", "NC_000014.9", "CP068264.2", "NC_060938.1"],
    "chr15": ["15", "chr15", "CM000677.1", "NC_000015.9", "CM000677.2", "NC_000015.10", "CP068263.2", "NC_060939.1"],
    "chr16": ["16", "chr16", "CM000678.1", "NC_000016.9", "CM000678.2", "NC_000016.10", "CP068262.2", "NC_060940.1"],
    "chr17": ["17", "chr17", "CM000679.1", "NC_000017.10", "CM000679.2", "NC_000017.11", "CP068261.2", "NC_060941.1"],
    "chr18": ["18", "chr18", "CM000680.1", "NC_000018.9", "CM000680.2", "NC_000018.10", "CP068260.2", "NC_060942.1"],
    "chr19": ["19", "chr19", "CM000681.1", "NC_000019.9", "CM000681.2", "NC_000019.10", "CP068259.2", "NC_060943.1"],
    "chr20": ["20", "chr20", "CM000682.1", "NC_000020.10", "CM000682.2", "NC_000020.11", "CP068258.2", "NC_060944.1"],
    "chr21": ["21", "chr21", "CM000683.1", "NC_000021.8", "CM000683.2", "NC_000021.9", "CP068257.2", "NC_060945.1"],
    "chr22": ["22", "chr22", "CM000684.1", "NC_000022.10", "CM000684.2", "NC_000022.11", "CP068256.2", "NC_060946.1"],
    "chrX": ["X", "chrX", "CM000685.1", "NC_000023.10", "CM000685.2", "NC_000023.11", "CP068255.2", "NC_060947.1"],
    "chrY": ["Y", "chrY", "CM000686.1", "NC_000024.9", "CM000686.2", "NC_000024.10", "CP086569.2", "NC_060948.1"],
}

# alias -> "chrN", in the same order as the table above
chromosome_map = {
    alias: chromosome
    for chromosome, aliases in _CHROMOSOME_ALIASES.items()
    for alias in aliases
}
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from reference_genome_dictionaries import major_releases
|
|
5
|
+
|
|
6
|
+
# Copy of the references shipped with the tool, taken before any custom reference is merged in.
DEFAULT_MAJOR_RELEASES = dict(major_releases)

# JSON file holding the user-defined references; the path is relative, so it is looked up in the
# current working directory.
CUSTOM_DB = Path("custom_references.json")

# When a custom database exists, merge its references into major_releases at import time.
if CUSTOM_DB.exists():
    with CUSTOM_DB.open() as f:
        custom_refs = json.load(f)
    major_releases.update(custom_refs)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_from_fai(file_path):
    """
    Read a .fai index and return the length of every contig it lists.

    Blank lines are skipped. Only the first two tab-separated columns (name, length) are used.

    Args:
        file_path (str): path to the .fai file to load

    Returns:
        dict: contig names as keys and their lengths (int) as values.

    Raises:
        ValueError: if a line has fewer than two columns, a length is not an integer or no contig is found.
    """
    lengths_by_contig = {}

    with open(file_path) as fai_handle:
        for raw_line in fai_handle:
            record = raw_line.strip()
            if not record:
                continue

            columns = record.split("\t")
            if len(columns) < 2:
                raise ValueError(f"Invalid .fai line: {raw_line}")

            name, length = columns[:2]
            lengths_by_contig[name] = int(length)

    if not lengths_by_contig:
        raise ValueError("Empty .fai file")

    return lengths_by_contig
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def load_custom_db():
    """
    Read the custom reference database stored in the CUSTOM_DB JSON file.

    Returns:
        dict: the custom references (reference name -> data), or an empty dictionary when the file does not exist.
    """
    if not CUSTOM_DB.exists():
        return {}

    with open(CUSTOM_DB) as db_handle:
        custom_references = json.load(db_handle)
    return custom_references
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def save_custom_db(db):
    """
    Write the custom reference database to the CUSTOM_DB JSON file, replacing its previous content.

    Args:
        db (dict): the custom references to store (reference name -> data).

    Returns:
        None
    """
    with open(CUSTOM_DB, "w") as db_handle:
        json.dump(db, db_handle, indent=4)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_all_references():
    """
    Merge the default references with the custom references currently stored on disk.

    The merge starts from DEFAULT_MAJOR_RELEASES rather than major_releases: major_releases also holds the
    custom references that existed when the module was imported, so a reference removed afterwards would
    still be returned.

    Returns:
        dict: all references (reference name -> data). A custom reference replaces a default one with the same name.
    """
    merged = dict(DEFAULT_MAJOR_RELEASES)
    merged.update(load_custom_db())
    return merged
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def compare_contigs(c1, c2):
    """
    Tell whether two contig dictionaries hold exactly the same contig names and lengths.

    Args:
        c1 (dict): contigs of the new reference (contig name -> length).
        c2 (dict): contigs of an existing reference (contig name -> length).

    Returns:
        bool: True if both dictionaries are equal, False otherwise.
    """
    contigs_are_identical = (c1 == c2)
    return contigs_are_identical
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def find_matching_reference(new_contigs, all_refs):
    """
    Look for an existing reference whose contigs are identical to the new ones, to avoid storing duplicates.

    Args:
        new_contigs (dict): contigs of the new reference (contig name -> length).
        all_refs (dict): all references (reference name -> data); the contigs of each reference are read from
            the "ref_gen" key of its data.

    Returns:
        str | None: the name of the first matching reference, or None if no reference matches.
    """
    matching_names = (
        name
        for name, data in all_refs.items()
        if compare_contigs(new_contigs, data["ref_gen"])
    )
    return next(matching_names, None)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def add_reference(ref_name, species, fai_file):
    """
    Store a new reference in the custom database, using the contigs listed in a .fai file.

    The addition is aborted when the contigs are identical to those of an existing reference. A custom
    reference with the same name is overwritten after printing a warning.

    Args:
        ref_name (str): name of the new reference; it is also stored as its build.
        species (str): species of the new reference.
        fai_file (str): path to the .fai file (contig_name \\t contig_length \\t ...; other columns are ignored).

    Returns:
        None. A message reports whether the reference was added or matched an existing one.

    Raises:
        FileNotFoundError: if fai_file does not exist.
        ValueError: if fai_file does not have the .fai suffix.
    """
    fai_path = Path(fai_file)

    if not fai_path.exists():
        raise FileNotFoundError(f"File not found: {fai_path}")
    if fai_path.suffix != ".fai":
        raise ValueError("Input must be a .fai file")

    print(f"Loading contigs from: {fai_path}")
    contigs = load_from_fai(fai_path)

    # Duplicate detection against default and custom references
    duplicate_of = find_matching_reference(contigs, get_all_references())
    if duplicate_of:
        print(f"This reference matches existing reference: {duplicate_of}")
        print("Aborting to avoid duplication.")
        return

    custom_db = load_custom_db()
    if ref_name in custom_db:
        print(f"Warning: '{ref_name}' already exists and will be overwritten")

    new_entry = {"ref_gen": contigs, "build": ref_name, "species": species}
    custom_db[ref_name] = new_entry
    save_custom_db(custom_db)

    print(f"Reference '{ref_name}' added successfully.")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def list_references():
    """
    Print every available reference with its species and origin, followed by the total number of references.

    A reference is labelled "custom" when its name is in the custom database and "default" otherwise.

    Returns:
        None
    """
    custom_names = load_custom_db()
    references = get_all_references()

    print("\nAvailable references:\n")

    for ref_name, ref_data in references.items():
        if ref_name in custom_names:
            origin = "custom"
        else:
            origin = "default"
        print(f"- {ref_name} ({ref_data['species']}) [{origin}]")

    print(f"\nTotal: {len(references)} references")
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def remove_reference(ref_name):
    """
    Delete a reference from the custom database.

    Only the custom database is searched, so default references cannot be removed with this function.

    Args:
        ref_name (str): name of the custom reference to remove.

    Returns:
        None. A message reports whether the reference was removed or was not found.
    """
    custom_db = load_custom_db()

    if ref_name in custom_db:
        del custom_db[ref_name]
        save_custom_db(custom_db)
        print(f"Removed reference '{ref_name}'")
    else:
        print(f"'{ref_name}' not found in custom references.")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def main():
    """
    Command-line interface for managing the reference database. It supports three commands:
        - add <name> <species> <fai>: adds a new reference to the custom database, loading its contigs from the .fai file.
        - list: lists all available references (default and custom) and their total number.
        - remove <name>: removes a reference from the custom database; default references cannot be removed.

    The command is read from sys.argv. A missing command, an unknown command or a wrong number of arguments
    prints the usage and exits with status 1.
    """
    usage = (
        "Usage:\n"
        " add <name> <species> <fai>\n"
        " list\n"
        " remove <name>"
    )

    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    command = sys.argv[1]

    if command == "add":
        if len(sys.argv) != 5:
            print("Usage: add <name> <species> <genome.fai>")
            sys.exit(1)

        add_reference(sys.argv[2], sys.argv[3], sys.argv[4])

    elif command == "list":
        list_references()

    elif command == "remove":
        if len(sys.argv) != 3:
            print("Usage: remove <name>")
            sys.exit(1)

        remove_reference(sys.argv[2])

    else:
        # An unknown command is a usage error like the ones above: show the usage and fail.
        print(f"Unknown command: {command}")
        print(usage)
        sys.exit(1)


if __name__ == "__main__":
    main()
|