XspecT 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of XspecT might be problematic. Click here for more details.
- xspect/definitions.py +0 -7
- xspect/download_models.py +25 -24
- xspect/fastapi.py +23 -26
- xspect/file_io.py +86 -2
- xspect/main.py +333 -98
- xspect/mlst_feature/mlst_helper.py +5 -7
- xspect/model_management.py +6 -0
- xspect/models/probabilistic_filter_model.py +16 -5
- xspect/models/probabilistic_filter_svm_model.py +33 -18
- xspect/models/probabilistic_single_filter_model.py +8 -1
- xspect/models/result.py +15 -61
- xspect/ncbi.py +265 -0
- xspect/train.py +258 -247
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/METADATA +14 -21
- xspect-0.4.0.dist-info/RECORD +24 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/WHEEL +1 -1
- XspecT-0.2.6.dist-info/RECORD +0 -34
- xspect/pipeline.py +0 -201
- xspect/run.py +0 -38
- xspect/train_filter/__init__.py +0 -0
- xspect/train_filter/create_svm.py +0 -45
- xspect/train_filter/extract_and_concatenate.py +0 -124
- xspect/train_filter/html_scrap.py +0 -114
- xspect/train_filter/ncbi_api/__init__.py +0 -0
- xspect/train_filter/ncbi_api/download_assemblies.py +0 -31
- xspect/train_filter/ncbi_api/ncbi_assembly_metadata.py +0 -110
- xspect/train_filter/ncbi_api/ncbi_children_tree.py +0 -53
- xspect/train_filter/ncbi_api/ncbi_taxon_metadata.py +0 -55
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/entry_points.txt +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {XspecT-0.2.6.dist-info → xspect-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -1,110 +0,0 @@
|
|
|
1
|
-
""" Collects metadata of assemblies from NCBI API """
|
|
2
|
-
|
|
3
|
-
__author__ = "Berger, Phillip"
|
|
4
|
-
|
|
5
|
-
from time import sleep
|
|
6
|
-
|
|
7
|
-
import requests
|
|
8
|
-
|
|
9
|
-
from loguru import logger
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class NCBIAssemblyMetadata:
    """Collect assembly accessions for a set of taxa from the NCBI Datasets API.

    For every taxon in the metadata handed to the constructor, the genome
    endpoint is queried once per request type (reference genomes first, then
    complete genome, chromosome, scaffold and contig level) until ``count``
    accessions have been selected. Taxa without a single qualifying accession
    are left out of the result returned by :meth:`get_all_metadata`.
    """

    _all_metadata: dict
    _count: int
    _ani_gcf: list
    _parameters: dict
    _accessions: list[str]
    _contig_n50: int
    _all_metadata_complete: dict

    def __init__(self, all_metadata: dict, ani_gcf: list, count=8, contig_n50=10000):
        """Query the API for every taxon in ``all_metadata``.

        All requests are made here, so construction blocks for at least two
        seconds per taxon.

        :param all_metadata: Mapping of taxon ID to a metadata dict that has
            at least a ``"sci_name"`` entry. The metadata dicts of taxa with
            results are modified in place: an ``"accessions"`` list is added.
        :param ani_gcf: Accessions that are allowed to be selected.
        :param count: Maximum number of accessions per taxon; also sent to
            the API as ``page_size``.
        :param contig_n50: An assembly is only selected if its contig N50 is
            strictly greater than this value.
        """
        self._all_metadata = all_metadata
        self._count = count
        self._ani_gcf = ani_gcf
        self._contig_n50 = contig_n50

        self._set_parameters()

        tmp_metadata = {}
        for tax_id, curr_metadata in self._all_metadata.items():
            # Pause before each taxon -- presumably to stay below the API
            # rate limit; confirm before shortening.
            sleep(2)
            species_name = curr_metadata["sci_name"]
            logger.info("Collecting metadata of {name}", name=species_name)
            accessions = self._make_request(taxon=tax_id)
            if accessions:
                curr_metadata["accessions"] = accessions
                tmp_metadata[tax_id] = curr_metadata

        self._all_metadata_complete = tmp_metadata

    def _set_parameters(self):
        """Build the query parameters for every request type.

        The insertion order of ``self._parameters`` is the order in which
        :meth:`_make_request` tries the request types.
        """
        params = {
            "filters.reference_only": "false",
            "filters.assembly_source": "refseq",
            "filters.exclude_atypical": "true",
            "page_size": self._count,
            "page_token": "",
        }
        parameters = {"params_ref": {**params, "filters.reference_only": "true"}}
        assembly_levels = (
            ("params_comp_genome", "complete_genome"),
            ("params_chrom", "chromosome"),
            ("params_scaffold", "scaffold"),
            ("params_contig", "contig"),
        )
        for request_type, level in assembly_levels:
            parameters[request_type] = {**params, "filters.assembly_level": level}
        self._parameters = parameters

    def _select_accessions(self, assemblies, accessions):
        """Append the qualifying accessions of one response to ``accessions``.

        An assembly qualifies if its accession is listed in ``self._ani_gcf``,
        its contig N50 exceeds ``self._contig_n50`` and it has not been
        selected before. Selection stops once ``self._count`` accessions have
        been collected.

        :param assemblies: The ``"assemblies"`` list of an API response.
        :param accessions: Accessions selected so far; extended in place so
            that entries added before a ``KeyError`` are kept.
        :raises KeyError: If an assembly entry lacks an expected field.
        """
        for assembly in assemblies:
            curr_assembly = assembly["assembly"]
            curr_accession = curr_assembly["assembly_accession"]
            curr_contig_n50 = curr_assembly["contig_n50"]
            if len(accessions) >= self._count:
                break
            if (
                curr_accession in self._ani_gcf
                and curr_contig_n50 > self._contig_n50
                # The request types overlap, so the same assembly can be
                # returned by more than one query; it must neither be listed
                # twice nor use up a second slot.
                and curr_accession not in accessions
            ):
                accessions.append(curr_accession)

    def _make_request(self, taxon: str):
        """Return up to ``self._count`` distinct accessions for ``taxon``.

        The request types are tried in order and the loop stops as soon as
        enough accessions have been selected. A response without the expected
        keys is logged at debug level and otherwise ignored. Errors raised by
        ``requests`` (timeouts, invalid JSON) are not caught here.

        :param taxon: NCBI taxon ID to query.
        :return: List of selected assembly accessions, possibly empty.
        """
        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/genome/taxon/{taxon}"
        accessions = []
        for request_type, parameters in self._parameters.items():
            raw_response = requests.get(api_url, params=parameters, timeout=5)
            response = raw_response.json()
            if response:
                try:
                    self._select_accessions(response["assemblies"], accessions)
                except KeyError:
                    logger.debug(
                        "While requesting: {type} an error response was given",
                        type=request_type,
                    )
                    logger.debug(str(response))

            if len(accessions) >= self._count:
                break
        return accessions

    def get_all_metadata(self):
        """Returns all metadata of the assemblies."""
        return self._all_metadata_complete
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
"""This class uses the NCBI Datasets API to get the taxonomy tree of a given Taxon.
|
|
2
|
-
|
|
3
|
-
The taxonomy tree consists of only the next children to the parent taxon.
|
|
4
|
-
The children are only of the next lower rank of the parent taxon.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
__author__ = "Berger, Phillip"
|
|
8
|
-
|
|
9
|
-
import sys
|
|
10
|
-
import requests
|
|
11
|
-
|
|
12
|
-
from loguru import logger
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class NCBIChildrenTree:
    """Fetch the direct children of a taxon from the NCBI Datasets API.

    The request is made during construction; afterwards :meth:`parent_id`
    and :meth:`children_ids` return the parsed result.
    """

    _taxon: str
    _response: dict
    _parent_taxon_id: str
    # Annotation only: the list itself is created per instance. A list
    # assigned at class level would be shared by every instance.
    _children_taxon_ids: list[str]

    def __init__(self, taxon: str):
        """Request the children tree for ``taxon``.

        :param taxon: The taxon to look up; inserted into the request URL
            as given.
        """
        self._taxon = taxon
        self._children_taxon_ids = []
        self._request_tree()

    def _request_tree(self):
        """Make the request for the children tree at the NCBI Datasets API."""
        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{self._taxon}/filtered_subtree"
        raw_response = requests.get(api_url, timeout=5)
        self._response = raw_response.json()["edges"]
        self._read_ids_from_edges()

    def _read_ids_from_edges(self):
        """Extract the parent ID and its children IDs from ``self._response``.

        The parent is the first visible child of the edge entry keyed ``"1"``.
        If the parent has no entry of its own in the edges, the response is
        logged and the process exits with a non-zero status.
        """
        # NOTE(review): a missing "1" entry raises KeyError here and is not
        # handled -- confirm whether the API can omit it for unknown taxa.
        self._parent_taxon_id = str(self._response["1"]["visible_children"][0])
        try:
            tmp_children_ids = self._response[self._parent_taxon_id]["visible_children"]
        except KeyError:
            logger.error("KeyError for key: {key}", key=self._parent_taxon_id)
            logger.error("Response: {response}", response=str(self._response))
            logger.error("Aborting")
            # Exit with a failure status; a bare sys.exit() reports success.
            sys.exit(1)
        # Build a fresh list so that results of other instances are never
        # mixed into this one.
        self._children_taxon_ids = [str(child_id) for child_id in tmp_children_ids]

    def parent_id(self):
        """The NCBI taxon ID of the given Taxon at the initialisation.

        :return: The taxon ID.
        """
        return self._parent_taxon_id

    def children_ids(self) -> list[str]:
        """The NCBI taxon IDs of all children of the given Taxon. The children are all of a lower rank than the parent.

        :return: The taxon IDs as a list.
        """
        return self._children_taxon_ids
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
""" This module is used to retrieve metadata from the NCBI taxonomy database. """
|
|
2
|
-
|
|
3
|
-
__author__ = "Berger, Phillip"
|
|
4
|
-
|
|
5
|
-
import requests
|
|
6
|
-
|
|
7
|
-
from loguru import logger
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class NCBITaxonMetadata:
    """Look up taxonomy metadata for a list of taxa via the NCBI Datasets API.

    The request is sent during construction. Taxa whose organism name
    contains ``"Candidatus"`` or ``" sp. "`` are left out of the collected
    metadata.
    """

    _taxon: str
    _response: dict
    _all_metadata: dict

    def __init__(self, taxon: list[str]):
        """Request and collect the metadata.

        :param taxon: The taxa to look up; joined with commas for the request.
        """
        self._taxon = ",".join(taxon)
        self._all_metadata = {}
        self._request_metadata()
        self._collect_all_metadata()

    def _request_metadata(self):
        """Fetch the taxonomy nodes for the joined taxa and store them."""
        taxon_path = str(self._taxon)
        api_url = f"https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/{taxon_path}"
        self._response = requests.get(api_url, timeout=5).json()["taxonomy_nodes"]

    def _collect_all_metadata(self):
        """Fill ``self._all_metadata`` from the stored response, keyed by taxon ID."""
        for node in self._response:
            entry = node["taxonomy"]
            # All fields are read before filtering, so an incomplete entry
            # raises KeyError even if it would be skipped anyway.
            rank = entry["rank"]
            name = entry["organism_name"]
            tax_id = str(entry["tax_id"])
            lineage = entry["lineage"]

            if "Candidatus" in name or " sp. " in name:
                logger.debug("{name} was not used for training", name=name)
                continue

            self._all_metadata[tax_id] = {
                "sci_name": name,
                "tax_id": tax_id,
                "rank": rank,
                "lineage": lineage,
            }

    def get_response(self):
        """Return the taxonomy nodes exactly as received from the API."""
        return self._response

    def get_metadata(self):
        """Return the collected metadata, keyed by taxon ID."""
        return self._all_metadata
|
|
File without changes
|
|
File without changes
|
|
File without changes
|