taxutils 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
taxutils-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Will O'Brien
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: taxutils
3
+ Version: 0.1.0
4
+ Summary: Utilities for working with taxonomic data.
5
+ Author-email: William O'Brien <wob@cs.ucla.edu>
6
+ Maintainer-email: William O'Brien <wob@cs.ucla.edu>
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy
17
+ Requires-Dist: pandas
18
+ Dynamic: license-file
19
+
20
+ # taxutils
21
+
22
+ Utilities for working with taxonomic data from NCBI. These are functions and data that I often need to access across many different projects. This project is meant to version-control routine work that continuously comes up in my projects and may also be useful in your own work.
23
+
24
+ # Contact
25
+ Author: Will O'Brien
26
+ Affiliation: Computer Science Department, UCLA
27
+ Email: wob@cs.ucla.edu
@@ -0,0 +1,8 @@
1
+ # taxutils
2
+
3
+ Utilities for working with taxonomic data from NCBI. These are functions and data that I often need to access across many different projects. This project is meant to version-control routine work that continuously comes up in my projects and may also be useful in your own work.
4
+
5
+ # Contact
6
+ Author: Will O'Brien
7
+ Affiliation: Computer Science Department, UCLA
8
+ Email: wob@cs.ucla.edu
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "taxutils"
7
+ version = "0.1.0"
8
+ description = "Utilities for working with taxonomic data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [
12
+ {name = "William O'Brien", email = "wob@cs.ucla.edu"},
13
+ ]
14
+ maintainers = [
15
+ {name = "William O'Brien", email = "wob@cs.ucla.edu"},
16
+ ]
17
+ dependencies = [
18
+ "numpy",
19
+ "pandas",
20
+ ]
21
+ classifiers = [
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Operating System :: OS Independent",
28
+ ]
29
+
30
+ [tool.setuptools]
31
+ package-dir = {"" = "src"}
32
+ py-modules = ["taxutils", "utils"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: taxutils
3
+ Version: 0.1.0
4
+ Summary: Utilities for working with taxonomic data.
5
+ Author-email: William O'Brien <wob@cs.ucla.edu>
6
+ Maintainer-email: William O'Brien <wob@cs.ucla.edu>
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy
17
+ Requires-Dist: pandas
18
+ Dynamic: license-file
19
+
20
+ # taxutils
21
+
22
+ Utilities for working with taxonomic data from NCBI. These are functions and data that I often need to access across many different projects. This project is meant to version-control routine work that continuously comes up in my projects and may also be useful in your own work.
23
+
24
+ # Contact
25
+ Author: Will O'Brien
26
+ Affiliation: Computer Science Department, UCLA
27
+ Email: wob@cs.ucla.edu
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/taxutils.py
5
+ src/utils.py
6
+ src/taxutils.egg-info/PKG-INFO
7
+ src/taxutils.egg-info/SOURCES.txt
8
+ src/taxutils.egg-info/dependency_links.txt
9
+ src/taxutils.egg-info/requires.txt
10
+ src/taxutils.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ numpy
2
+ pandas
@@ -0,0 +1,2 @@
1
+ taxutils
2
+ utils
@@ -0,0 +1,510 @@
1
+ # taxutils.py
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from collections import defaultdict
6
+ from typing import List
7
+ from dataclasses import dataclass
8
+ import os, json, urllib.request, tarfile, gzip, re
9
+ import sqlite3, subprocess
10
+
11
+ try:
12
+ from .utils import TAXUTILS_GLOBALS, get_logger
13
+ except ImportError:
14
+ from utils import TAXUTILS_GLOBALS, get_logger
15
+
16
+ logger = get_logger(__name__)
17
+
18
@dataclass
class TaxonomicUtils:
    """Bundle of taxonomy tables plus helpers that derive lookups from them.

    Instances are normally produced by ``download_taxonomy``. The derived
    lookups (``a2t``, ``tree``, ``parent``) start as ``None`` and are filled
    in by the corresponding ``load_*`` method.
    """

    # taxid -> name mapping (as produced by build_names).
    names: dict
    # Node table indexable by column name; must provide "taxid" and "parent",
    # and "higher_than_F" for match_library. download_taxonomy passes the
    # pandas DataFrame returned by build_nodes here.
    nodes: dict
    # Taxids of interest; match_library replaces this with a set.
    target_taxids: set
    # accession -> taxid mapping, filled by load_a2t.
    a2t: dict = None
    # parent taxid -> list of child taxids, filled by load_tree.
    tree: defaultdict = None
    # Set to True once match_library has run.
    _is_matched: bool = False
    # child taxid -> parent taxid (root maps to None), filled by load_parent.
    parent: dict = None

    def __repr__(self):
        """Summarise each field by type and size instead of dumping its contents."""
        fields = []
        for f in self.__dataclass_fields__:
            val = getattr(self, f)
            if val is None:
                fields.append(f"{f}=None")
            elif isinstance(val, dict):
                fields.append(f"{f}=dict({len(val)} entries)")
            elif isinstance(val, (set, list)):
                fields.append(f"{f}={type(val).__name__}({len(val)} items)")
            else:
                fields.append(f"{f}={type(val).__name__}")
        body = ",\n ".join(fields)
        return f"TaxonomicUtils(\n {body}\n)"

    def _parent_map(self):
        """Return a fresh child -> parent dict in which the root (taxid 1) maps to None.

        The root is forced to None so that upward walks terminate even when
        the node table lists the root as its own parent.
        """
        parent = dict(zip(self.nodes["taxid"], self.nodes["parent"]))
        parent[1] = None
        return parent

    def load_a2t(self, accessions: List[str], sqlite: bool = True):
        """Populate ``self.a2t`` with the accession -> taxid mapping for ``accessions``."""
        self.a2t = build_a2t(accessions, sqlite=sqlite)

    def load_tree(self):
        """Populate ``self.tree`` with a parent taxid -> [child taxids] mapping."""
        tree = defaultdict(list)
        for child, par in self._parent_map().items():
            if par is not None:
                tree[int(par)].append(int(child))
        self.tree = tree

    def get_subtree(self, taxid):
        """Return ``taxid`` followed by all of its descendants in pre-order.

        Loads ``self.tree`` on first use. Children are visited in the order
        they are stored in the tree. The walk is iterative, so very deep
        lineages cannot hit the interpreter's recursion limit.
        """
        if self.tree is None:
            self.load_tree()
        result = []
        stack = [taxid]
        while stack:
            node = stack.pop()
            result.append(node)
            # Membership test first: indexing a defaultdict would insert the key.
            if node in self.tree:
                stack.extend(reversed(self.tree[node]))
        return result

    def match_library(self, config):
        """Restrict ``target_taxids`` to taxa that are represented in a sequence library.

        Reads ``config.library`` (path handed to ``extract_accession_ids``) and
        ``config.sqlite`` (forwarded to ``load_a2t``). A target taxid is kept
        when it is the taxid of a library accession, or when it is an ancestor
        of one that is reached before the walk meets a node flagged
        ``higher_than_F`` (nodes missing from that column stop the walk too).

        Side effects: sets ``self.a2t``, ``self.tree`` (if unset),
        ``self.target_taxids`` (now a set) and ``self._is_matched``.
        """
        logger.info("Matching library to target taxa.")
        acc_ids = extract_accession_ids(config.library)
        self.load_a2t(acc_ids, sqlite=config.sqlite)
        library_taxids = set()
        for acc_id in acc_ids:
            taxid = self.a2t.get(acc_id)
            if taxid is not None:
                library_taxids.add(taxid)

        if self.tree is None:
            self.load_tree()
        parent = self._parent_map()
        higher_than_F = dict(zip(self.nodes['taxid'], self.nodes['higher_than_F']))

        target_set = set(self.target_taxids)
        matched = target_set.intersection(library_taxids)

        for tid in library_taxids:
            cur = parent.get(tid)
            while cur is not None and not higher_than_F.get(cur, True):
                if cur in target_set:
                    matched.add(cur)
                cur = parent.get(cur)

        self.target_taxids = matched
        self._is_matched = True

    def load_parent(self):
        """Populate ``self.parent`` with a child -> parent mapping (root maps to None).

        Uses the same root convention as ``load_tree`` and ``match_library``,
        so loops that climb until they reach ``None`` terminate at the root.
        """
        self.parent = self._parent_map()
98
+
99
def extract_accession_ids(fasta_path):
    """Extract versioned accession IDs from the header lines of a FASTA file.

    Only lines starting with ``>`` are inspected. Every match of the pattern
    ``[A-Z]{1,2}_?\\d+\\.\\d+`` on those lines is returned, in file order and
    with duplicates preserved.

    The file is read directly in Python rather than through an external
    ``grep`` process, so the function works on platforms without grep and a
    missing or unreadable file raises ``OSError`` (e.g. ``FileNotFoundError``)
    instead of silently producing an empty list.

    Args:
        fasta_path: Path to an uncompressed FASTA file.

    Returns:
        list[str]: accession IDs such as ``"NC_045512.2"``.
    """
    accession_re = re.compile(r'[A-Z]{1,2}_?\d+\.\d+')
    accession_ids = []
    # Accessions are plain ASCII, so undecodable bytes elsewhere in a header
    # can be replaced without affecting the result.
    with open(fasta_path, "r", encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if line.startswith(">"):
                accession_ids.extend(accession_re.findall(line))
    return accession_ids
110
+
111
def download_taxonomy(accessions: List[str]=None, sqlite: bool=True, pathogen_json=None) -> TaxonomicUtils:
    """Fetch (if needed) the NCBI taxonomy dump and build a ``TaxonomicUtils``.

    Steps:
      1. Ensure ``names.dmp`` and ``nodes.dmp`` exist in the save folder,
         downloading and unpacking ``taxdump.tar.gz`` when either is missing.
      2. Ensure the pathogen JSON exists, trying each URL in
         ``TAXUTILS_GLOBALS["pathogen_dict_urls"]`` in order.
      3. Build names, nodes and target taxids (taxid 9606 is always added).
      4. Optionally build the accession -> taxid mapping.

    Args:
        accessions: Accessions to resolve; ``None`` skips building ``a2t``.
        sqlite: Forwarded to ``build_a2t``.
        pathogen_json: Path of the pathogen JSON; defaults to
            ``pathogen_dict.json`` inside the save folder.

    Returns:
        TaxonomicUtils populated with names, nodes, target_taxids and a2t.

    Raises:
        RuntimeError: if the pathogen JSON is missing and no URL could be
            downloaded.
    """
    save_path = TAXUTILS_GLOBALS["save_folder"]
    os.makedirs(save_path, exist_ok=True)

    names_path = os.path.join(save_path, "names.dmp")
    nodes_path = os.path.join(save_path, "nodes.dmp")

    if not (os.path.exists(names_path) and os.path.exists(nodes_path)):
        logger.info(f"Downloading {names_path}, {nodes_path}...")
        tarball_path = os.path.join(save_path, "taxdump.tar.gz")
        url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        try:
            urllib.request.urlretrieve(url, tarball_path)
            with tarfile.open(tarball_path, "r:gz") as tar:
                for member in tar.getmembers():
                    if member.name in ("names.dmp", "nodes.dmp"):
                        tar.extract(member, path=save_path)
        finally:
            # Remove the tarball on success and on failure alike, so a broken
            # download is never left lying around.
            if os.path.exists(tarball_path):
                os.remove(tarball_path)
    else:
        logger.info(f"names.dmp and nodes.dmp exist in {save_path}, skipping download.")

    if pathogen_json is None:
        pathogen_json = os.path.join(save_path, "pathogen_dict.json")
    # NOTE(review): the download fallback below also runs for a caller-supplied
    # path that does not exist yet -- confirm this matches the intended nesting.
    if not os.path.exists(pathogen_json):
        # Download to a side file and rename on success, so an interrupted
        # transfer can never be mistaken for a complete cached file later.
        partial_path = f"{pathogen_json}.part"
        for url in TAXUTILS_GLOBALS["pathogen_dict_urls"]:
            try:
                logger.info(f"Downloading pathogen_dict.json from {url}...")
                urllib.request.urlretrieve(url, partial_path)
                os.replace(partial_path, pathogen_json)
                break
            except Exception as e:
                logger.warning(f"Failed to download from {url}: {e}")
                if os.path.exists(partial_path):
                    os.remove(partial_path)
        else:
            raise RuntimeError("Could not download pathogen_dict.json from any URL.")

    logger.info("Building nodes...")
    names = build_names(names_path)
    nodes = build_nodes(nodes_path, names)
    target_taxids = build_target_taxids(
        nodes, names, pathogen_json=pathogen_json, extra_taxids=(9606,)
    )

    a2t = None
    if accessions is not None:
        a2t = build_a2t(accessions, sqlite=sqlite)
        # NOTE(review): these two entries are keyed by the sentinel taxids and
        # hold labels, unlike the accession -> taxid entries from build_a2t --
        # confirm this is intended.
        a2t[TAXUTILS_GLOBALS["UNCLASSIFIED"]] = "unclassified"
        a2t[TAXUTILS_GLOBALS["UNMAPPED"]] = "unmapped"

    # Replace two names with shorter display labels.
    # NOTE(review): applied unconditionally -- confirm against the original nesting.
    names[2697049] = "SARS-CoV-2"
    names[694009] = "SARS-related-CoV"
    return TaxonomicUtils(names=names, nodes=nodes, target_taxids=target_taxids, a2t=a2t)
159
+
160
+
161
def taxutils(accessions: List[str]=None, sqlite: bool=True, pathogen_json=None) -> TaxonomicUtils:
    """Short entry point: hand every argument to ``download_taxonomy`` and return its result."""
    forwarded = {
        "accessions": accessions,
        "sqlite": sqlite,
        "pathogen_json": pathogen_json,
    }
    return download_taxonomy(**forwarded)


# Second name bound to the same class object as TaxonomicUtils.
TaxonomicData = TaxonomicUtils
166
+
167
+
168
def _ensure_a2t_db(gz_path, a2t_db):
    """Make sure the SQLite accession -> taxid database exists and is indexed.

    If ``a2t_db`` already exists, only the two indexes are (re)created when
    missing. Otherwise the database is built from the gzipped table at
    ``gz_path`` (downloaded first if absent), taking column 1 as the accession
    and column 2 as the taxid of every data row.

    The build writes to a side file that is renamed onto ``a2t_db`` only after
    the final commit, so an interrupted build never leaves a partial database
    that a later call would accept as complete.

    Args:
        gz_path: Path of the gzipped, tab-separated accession2taxid file.
        a2t_db: Path of the SQLite database to create or check.
    """
    if not os.path.exists(gz_path):
        gz_path = download_a2t()

    if os.path.exists(a2t_db):
        # Older databases may predate one of the indexes; creating them with
        # IF NOT EXISTS is a no-op when they are already there.
        conn = sqlite3.connect(a2t_db)
        try:
            conn.execute("CREATE INDEX IF NOT EXISTS idx_accession ON a2t (accession)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_taxid ON a2t (taxid)")
            conn.commit()
        finally:
            conn.close()
        return

    logger.info("Building SQLite db from gz file, this will take a while...")
    building_path = f"{a2t_db}.building"
    leftovers = (building_path, f"{building_path}-journal")
    for stale in leftovers:
        if os.path.exists(stale):
            os.remove(stale)

    conn = sqlite3.connect(building_path)
    try:
        cur = conn.cursor()
        cur.execute("CREATE TABLE a2t (accession TEXT, taxid INTEGER)")
        with gzip.open(gz_path, "rt") as f:
            next(f)  # skip the header row
            batch = []
            for line in f:
                parts = line.strip().split("\t")
                batch.append((parts[1], int(parts[2])))
                if len(batch) == 100_000:
                    cur.executemany("INSERT INTO a2t VALUES (?, ?)", batch)
                    batch.clear()
            if batch:
                cur.executemany("INSERT INTO a2t VALUES (?, ?)", batch)
        # Index after the bulk load so inserts do not pay for index upkeep.
        cur.execute("CREATE INDEX idx_accession ON a2t (accession)")
        cur.execute("CREATE INDEX idx_taxid ON a2t (taxid)")
        conn.commit()
    except BaseException:
        conn.close()
        for stale in leftovers:
            if os.path.exists(stale):
                os.remove(stale)
        raise
    conn.close()
    os.replace(building_path, a2t_db)
    logger.info("SQLite db built.")
200
+
201
def download_a2t():
    """Ensure ``nucl_gb.accession2taxid.gz`` is present in the save folder.

    The file is downloaded from the NCBI FTP mirror when it does not exist
    yet. The transfer goes to a ``.part`` side file that is renamed into place
    only once it has finished, so an interrupted download is never mistaken
    for a complete file on the next call.

    Returns:
        str: path of the gzipped accession2taxid file.
    """
    gz_path = os.path.join(TAXUTILS_GLOBALS["save_folder"], "nucl_gb.accession2taxid.gz")
    a2t_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz"

    if os.path.exists(gz_path):
        logger.info(f"{gz_path} already exists, skipping download.")
        return gz_path

    target_dir = os.path.dirname(gz_path)
    if target_dir:  # an empty dirname means the current directory
        os.makedirs(target_dir, exist_ok=True)

    logger.info(f"Downloading {gz_path}...")
    partial_path = f"{gz_path}.part"
    try:
        urllib.request.urlretrieve(a2t_url, partial_path)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    os.replace(partial_path, gz_path)
    return gz_path
212
+
213
def build_a2t(accessions, sqlite=True):
    """Map accessions to taxids using the NCBI accession2taxid table.

    Args:
        accessions: Iterable of versioned accession strings.
        sqlite: When True (default) look the accessions up in the local
            SQLite database (built on first use). When False, scan the
            gzipped table directly, stopping once every accession was found.

    Returns:
        dict: accession -> taxid (Python int) for every accession that was
        found; accessions without a match are simply absent.
    """
    gz_path = download_a2t()

    if not sqlite:
        accession_set = accessions if isinstance(accessions, set) else set(accessions)
        a2t = {}
        if not accession_set:
            # Nothing to look up: avoid scanning the whole file for no result.
            return a2t
        with gzip.open(gz_path, 'rt') as f:
            header = next(f).strip().split("\t")
            acc_idx = header.index("accession.version")
            taxid_idx = header.index("taxid")
            for line in f:
                parts = line.strip().split("\t")
                if parts[acc_idx] in accession_set:
                    a2t[parts[acc_idx]] = int(parts[taxid_idx])
                    if len(a2t) == len(accession_set):
                        break
        return a2t

    # sqlite path
    a2t_db = os.path.join(TAXUTILS_GLOBALS["save_folder"], "nucl_gb.accession2taxid.db")
    _ensure_a2t_db(gz_path, a2t_db)

    conn = sqlite3.connect(a2t_db)
    try:
        # The query list lives in a connection-private TEMP table, so the
        # shared database file is not modified and concurrent callers cannot
        # overwrite each other's list.
        conn.execute("CREATE TEMP TABLE tmp_accs (accession TEXT)")
        conn.executemany(
            "INSERT INTO temp.tmp_accs VALUES (?)",
            ((acc,) for acc in accessions),
        )
        rows = conn.execute(
            "SELECT t.accession, a.taxid "
            "FROM temp.tmp_accs t JOIN main.a2t a ON t.accession = a.accession"
        ).fetchall()
    finally:
        conn.close()
    return {acc: taxid for acc, taxid in rows}
241
+
242
+
243
def get_accessions_for_taxids(taxids, sqlite=True):
    """Return the set of accessions belonging to the given taxids.

    When sqlite=True (default), queries the local SQLite db for speed.
    When sqlite=False, scans the compressed gz file (keeps the file compressed
    but is much slower).

    Args:
        taxids: Iterable of taxids (anything ``int()`` accepts). It is
            consumed exactly once, so generators are safe to pass.
        sqlite: Selects the lookup strategy described above.

    Returns:
        set[str]: accessions whose taxid is one of ``taxids``.
    """
    gz_path = download_a2t()
    # Materialise once: iterating a generator a second time would yield nothing.
    taxid_list = [int(t) for t in taxids]

    if not sqlite:
        taxid_set = set(taxid_list)
        accessions = set()
        with gzip.open(gz_path, 'rt') as f:
            header = next(f).strip().split("\t")
            acc_idx = header.index("accession.version")
            taxid_idx = header.index("taxid")
            for line in f:
                parts = line.strip().split("\t")
                if int(parts[taxid_idx]) in taxid_set:
                    accessions.add(parts[acc_idx])
        return accessions

    # sqlite path
    a2t_db = os.path.join(TAXUTILS_GLOBALS["save_folder"], "nucl_gb.accession2taxid.db")
    _ensure_a2t_db(gz_path, a2t_db)

    conn = sqlite3.connect(a2t_db)
    try:
        # Connection-private TEMP table: nothing is written to the shared db
        # file and concurrent callers cannot clobber each other's query list.
        conn.execute("CREATE TEMP TABLE tmp_taxids (taxid INTEGER)")
        conn.executemany(
            "INSERT INTO temp.tmp_taxids VALUES (?)",
            ((t,) for t in taxid_list),
        )
        rows = conn.execute(
            "SELECT a.accession "
            "FROM main.a2t a JOIN temp.tmp_taxids q ON a.taxid = q.taxid"
        ).fetchall()
    finally:
        conn.close()
    return {row[0] for row in rows}
276
+
277
def taxonomic_order(present, parent, rank, names):
    """Return the taxids in ``present`` as a list in depth-first taxonomy order.

    The lineage of every present taxid is followed through ``parent`` to
    connect the taxa into a forest. Siblings are ordered by
    (rank string, name string, taxid). Roots 0, 1, 9606, 2 and 10239 come
    first, in that order, followed by any other roots in sibling order.
    Ancestors that are not themselves in ``present`` shape the traversal but
    are not emitted. Present taxids the traversal never reaches are appended
    at the end.

    Args:
        present: set of taxids to order.
        parent: mapping child taxid -> parent taxid (None for no parent).
        rank: mapping taxid -> rank label.
        names: mapping taxid -> name.
    """
    # Every proper ancestor of a present taxid.
    ancestors = set()
    for start in present:
        node = parent.get(start)
        while node is not None and node not in ancestors:
            ancestors.add(node)
            node = parent.get(node)

    universe = present | ancestors

    def sibling_key(taxid):
        return (str(rank.get(taxid, "")), str(names.get(taxid, "")), int(taxid))

    # Child lists restricted to the universe, each in sibling order.
    kids = {node: [] for node in universe}
    for node in universe:
        above = parent.get(node)
        if above in universe:
            kids[above].append(node)
    for siblings in kids.values():
        siblings.sort(key=sibling_key)

    priority = {taxid: pos for pos, taxid in enumerate([0, 1, 9606, 2, 10239])}

    def root_key(taxid):
        if taxid in priority:
            return (False, priority[taxid], sibling_key(taxid))
        return (True, float("inf"), sibling_key(taxid))

    roots = sorted(
        (node for node in universe if parent.get(node) not in universe),
        key=root_key,
    )

    # Pre-order walk with an explicit stack; reversing keeps left-to-right order.
    order, seen = [], set()
    pending = list(reversed(roots))
    while pending:
        node = pending.pop()
        if node in seen:
            continue
        seen.add(node)
        if node in present:
            order.append(node)
        pending.extend(reversed(kids.get(node, [])))

    order.extend(taxid for taxid in present if taxid not in seen)
    return order
325
+
326
def get_parent_tree(taxonomic_data):
    """Derive both lookup directions from ``taxonomic_data.nodes``.

    Returns:
        tuple: ``(parent, tree)`` where ``parent`` maps child taxid -> parent
        taxid with taxid 1 forced to ``None``, and ``tree`` is a
        ``defaultdict(list)`` mapping parent taxid -> list of child taxids.
    """
    table = taxonomic_data.nodes
    parent = {child: above for child, above in zip(table["taxid"], table["parent"])}
    parent[1] = None

    tree = defaultdict(list)
    for child, above in parent.items():
        if above is None:
            continue
        tree[int(above)].append(int(child))
    return parent, tree
337
+
338
def build_names(names_path):
    """Build a taxid -> scientific name dict from an NCBI ``names.dmp`` file.

    Each line is split on ``|`` and the fields are stripped. Only rows whose
    fourth field is exactly ``"scientific name"`` are kept. The two sentinel
    taxids from ``TAXUTILS_GLOBALS`` are added with fixed labels.

    Args:
        names_path: Path to ``names.dmp``.

    Returns:
        dict[int, str]: taxid -> name.
    """
    names = {}
    # Decode explicitly so the result does not depend on the platform's
    # default encoding.
    with open(names_path, encoding="utf-8") as f:
        for line in f:
            parts = [p.strip() for p in line.split("|")]
            if len(parts) >= 4 and parts[3] == "scientific name":
                names[int(parts[0])] = parts[1]
    # Sentinel entries; the key values are defined in utils.py
    # (UNMAPPED is -1 and UNCLASSIFIED is 0 there).
    names[TAXUTILS_GLOBALS["UNMAPPED"]] = "unmapped"
    names[TAXUTILS_GLOBALS["UNCLASSIFIED"]] = "unclassified"
    return names
350
+
351
def get_subtree(taxid, tree):
    """
    Collect ``taxid`` and every taxid below it.

    ``tree`` maps a parent taxid to its children. The result is a list in
    pre-order: the starting taxid first, then each child's subtree in the
    order the children are stored.
    """
    collected = []
    pending = [taxid]
    while pending:
        node = pending.pop()
        collected.append(node)
        # Test membership before indexing so a defaultdict is not grown.
        if node in tree:
            pending.extend(reversed(list(tree[node])))
    return collected
360
+
361
def rank_below(r):
    """Return the rank one step below ``r`` on the fixed rank ladder.

    The ladder runs superkingdom, kingdom, phylum, class, order, family,
    genus, species. ``"species"`` maps to itself; any value that is not on
    the ladder gives ``None``.
    """
    ladder = (
        "superkingdom",
        "kingdom",
        "phylum",
        "class",
        "order",
        "family",
        "genus",
        "species",
    )
    # Pair every rank with its successor; the last rung points at itself.
    next_rank = dict(zip(ladder, ladder[1:]))
    next_rank["species"] = "species"
    return next_rank.get(r)
380
+
381
def correct_rank(taxid, rank_map, parent, seen):
    """Resolve a usable rank for ``taxid`` by climbing past "no rank" nodes.

    Starting at ``taxid``, follow ``parent`` upward until a node has a rank
    other than ``"no rank"`` and return that rank. Returns ``"root"`` when a
    node has no entry in ``rank_map``, is ranked ``"root"``, or was already
    visited. Every visited taxid is added to ``seen`` (the caller's set is
    mutated). A "no rank" node without an entry in ``parent`` raises
    ``KeyError``.
    """
    node = taxid
    while node not in seen:
        seen.add(node)
        label = rank_map.get(node)
        if label is None or label == "root":
            return "root"
        if label != "no rank":
            return label
        node = parent[node]
    return "root"
391
+
392
def build_nodes(nodes_path, names):
    """Load an NCBI ``nodes.dmp`` file into a DataFrame with rank annotations.

    Columns of the result:
      * ``taxid``, ``parent`` - as read from the file.
      * ``rank`` - the file's rank, lower-cased and stripped, then resolved
        through ``correct_rank``.
      * ``rank_code`` - a letter code for the node's own rank when that rank
        is in the code table; otherwise the code of the nearest ancestor
        whose rank is in the table, followed by the number of steps climbed.
        Taxid 0 is "U", taxid 1 is "R", and the value is None when no such
        ancestor is found.
      * ``rank_base`` - first character of ``rank_code``.
      * ``rank_idx`` - position of ``rank_base`` in U,R,D,K,P,C,O,F,G,S.
      * ``higher_than_<L>`` - one boolean column per letter, True where
        ``rank_idx`` is smaller than that letter's position.

    Args:
        nodes_path: Path to ``nodes.dmp``.
        names: Accepted for interface compatibility; not used here.
    """
    major_letters = ['U', 'R', 'D', 'K', 'P', 'C', 'O', 'F', 'G', 'S']
    major_order = {letter: position for position, letter in enumerate(major_letters)}
    major_rank_to_code = {
        "root": "R",
        "acellular root": "R",
        "cellular root": "R",
        "no rank": "NR",
        "clade": "C",
        "subfamily": "F",
        "domain": "D",
        "realm": "D",
        "kingdom": "K",
        "phylum": "P",
        "class": "C",
        "order": "O",
        "family": "F",
        "genus": "G",
        "species": "S",
    }

    nodes = pd.read_csv(
        nodes_path,
        sep="|",
        header=None,
        usecols=[0, 1, 2],
        names=["taxid", "parent", "rank"],
        dtype={"taxid": int, "parent": int, "rank": str},
        engine="python",
    )
    nodes["rank"] = nodes["rank"].str.strip().str.lower()

    # Plain-dict lookups taken before the "rank" column is overwritten below.
    parent = dict(zip(nodes["taxid"], nodes["parent"]))
    rank_map = dict(zip(nodes["taxid"], nodes["rank"]))

    def _derive_code(taxid):
        # Fixed codes for the two special taxids.
        if taxid == 0:
            return "U"
        if taxid == 1:
            return "R"
        node, hops = taxid, 0
        while True:
            base = major_rank_to_code.get(rank_map.get(node, ""))
            if base:
                return base if hops == 0 else f"{base}{hops}"
            above = parent.get(node)
            if above is None or above == node:
                return None
            node, hops = above, hops + 1

    code_cache = {}

    def _rank_code(taxid):
        if taxid not in code_cache:
            code_cache[taxid] = _derive_code(taxid)
        return code_cache[taxid]

    nodes["rank"] = nodes["taxid"].apply(
        lambda taxid: correct_rank(taxid, rank_map, parent, set())
    )
    nodes["rank_code"] = nodes["taxid"].apply(_rank_code)
    nodes["rank_base"] = nodes["rank_code"].str[0]
    nodes["rank_idx"] = nodes["rank_base"].map(major_order)

    for letter in major_letters:
        nodes[f"higher_than_{letter}"] = nodes["rank_idx"] < major_order[letter]

    return nodes
455
+
456
def get_parents(tid, parent_map, higher_than_F):
    """Collect the ancestors of ``tid`` up to the first blocked node.

    Climbs ``parent_map`` from ``tid`` (which is not itself included) and
    gathers each ancestor until the chain ends or an ancestor is flagged in
    ``higher_than_F``. Ancestors missing from ``higher_than_F`` count as
    flagged. The flagged ancestor is not included.

    Returns:
        set: the collected ancestor taxids.
    """
    lineage = set()
    node = parent_map.get(tid)
    while node is not None and not higher_than_F.get(node, True):
        lineage.add(node)
        node = parent_map.get(node)
    return lineage
467
+
468
def build_target_taxids(nodes, names, pathogen_json, extra_taxids=(9606,)):
    """Assemble the ordered list of target taxids around the listed pathogens.

    Reads the ``"pathogens"`` mapping from the JSON file and, for each of its
    values (taxids), adds the taxid's whole subtree plus the ancestors that
    ``get_parents`` returns for it. ``extra_taxids`` are added afterwards and
    the combined set is ordered by ``taxonomic_order``.

    Args:
        nodes: Node table with "taxid", "parent", "rank" and "higher_than_F".
        names: taxid -> name mapping, used for ordering.
        pathogen_json: Path to the pathogen JSON file.
        extra_taxids: Additional taxids to include.

    Returns:
        list: target taxids in taxonomic order.
    """
    with open(pathogen_json) as handle:
        listing = json.load(handle)
    pathogen_taxids = set(map(int, listing["pathogens"].values()))

    taxid_column = nodes["taxid"]
    parent = dict(zip(taxid_column, nodes["parent"]))
    parent[1] = None

    def _has_parent(value):
        # A parent is missing when it is None or a float NaN.
        if value is None:
            return False
        return not (isinstance(value, float) and np.isnan(value))

    tree = defaultdict(list)
    for child, above in parent.items():
        if _has_parent(above):
            tree[int(above)].append(int(child))

    rank = dict(zip(taxid_column, nodes["rank"]))
    higher_than_F = dict(zip(taxid_column, nodes["higher_than_F"]))

    taxids = set()
    for tid in pathogen_taxids:
        taxids.update(
            get_subtree(tid, tree),
            get_parents(tid, parent, higher_than_F),
        )

    if extra_taxids:
        taxids.update(extra_taxids)
    return taxonomic_order(taxids, parent, rank, names)
490
+
491
def get_lca(a, b, parent_dict):
    """Return the lowest common ancestor of taxids ``a`` and ``b``.

    Both lineages are traced to the root through ``parent_dict`` and compared
    from the root downward; the last shared taxid is returned as an ``int``.
    When the lineages share nothing (including when either taxid is falsy,
    e.g. 0), the result is 1.

    Tracing stops at a missing or falsy parent and also as soon as a taxid
    repeats. That makes the function terminate on maps where the root is its
    own parent (as in raw NCBI node tables) instead of looping forever.

    Args:
        a, b: Taxids to compare.
        parent_dict: Mapping child taxid -> parent taxid.
    """
    if a == b:
        return a

    def _root_to_leaf(start):
        # Lineage of ``start`` ordered from the root down to ``start``.
        lineage, visited = [], set()
        cur = start
        while cur and cur not in visited:
            visited.add(cur)
            lineage.append(cur)
            cur = parent_dict.get(cur)
        lineage.reverse()
        return lineage

    path_a = _root_to_leaf(a)
    path_b = _root_to_leaf(b)

    shared = 0
    for node_a, node_b in zip(path_a, path_b):
        if node_a != node_b:
            break
        shared += 1
    return int(path_a[shared - 1]) if shared > 0 else 1
@@ -0,0 +1,20 @@
1
+ # utils.py
2
+
3
+ import os
4
+
5
# Settings shared across the package, collected in one dict.
TAXUTILS_GLOBALS = {
    # Folder for downloaded and derived files. Taken from the TAXUTILS_GLOBALS
    # environment variable when set, otherwise "./taxutils/"; "~" is expanded.
    "save_folder": os.path.expanduser(
        os.environ.get("TAXUTILS_GLOBALS", "./taxutils/")
    ),
    # URLs tried in order when the pathogen dictionary has to be downloaded.
    "pathogen_dict_urls": [
        "https://web.cs.ucla.edu/~wob/projects/trident/targets.json",
    ],
    # Sentinel taxids.
    "UNCLASSIFIED": 0,
    "UNMAPPED": -1,
}
12
+
13
def get_logger(name):
    """Return the logger called ``name`` after requesting INFO-level root logging.

    ``logging.basicConfig`` is invoked on every call with the format
    ``"%(asctime)s | %(levelname)s : %(message)s"`` and level INFO.
    """
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s : %(message)s",
    )
    return logging.getLogger(name)