GenomeUtils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GenomeUtils/Downloaders.py +14 -0
- GenomeUtils/Genome.py +21 -0
- GenomeUtils/__init__.py +18 -0
- GenomeUtils/downloaders/__init__.py +18 -0
- GenomeUtils/downloaders/downloader.py +80 -0
- GenomeUtils/downloaders/genome_downloader.py +81 -0
- GenomeUtils/genome/builder.py +413 -0
- GenomeUtils/genome/chromosome.py +85 -0
- GenomeUtils/genome/exon.py +71 -0
- GenomeUtils/genome/gene.py +72 -0
- GenomeUtils/genome/genome.py +142 -0
- GenomeUtils/genome/genome_element.py +94 -0
- GenomeUtils/genome/locus.py +55 -0
- GenomeUtils/genome/transcript.py +148 -0
- genomeutils-0.1.0.dist-info/METADATA +202 -0
- genomeutils-0.1.0.dist-info/RECORD +20 -0
- genomeutils-0.1.0.dist-info/WHEEL +5 -0
- genomeutils-0.1.0.dist-info/licenses/COPYING +674 -0
- genomeutils-0.1.0.dist-info/licenses/COPYING.LESSER +165 -0
- genomeutils-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/Downloaders.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: This file serves as a convenient entry point for downloader classes.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from .downloaders import Downloader, EnsemblGenomeDownloader
|
|
13
|
+
|
|
14
|
+
__all__ = ["EnsemblGenomeDownloader", "Downloader"]
|
GenomeUtils/Genome.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/Genome.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
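Description: This file serves as a convenient entry point that re-exports the genome classes (Genome, Gene, Transcript, Exon, Chromosome, Locus, GenomeElement) and the GenomeBuilder.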
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from .genome.builder import GenomeBuilder
|
|
13
|
+
from .genome.chromosome import Chromosome
|
|
14
|
+
from .genome.exon import Exon
|
|
15
|
+
from .genome.gene import Gene
|
|
16
|
+
from .genome.genome import Genome
|
|
17
|
+
from .genome.genome_element import GenomeElement
|
|
18
|
+
from .genome.locus import Locus
|
|
19
|
+
from .genome.transcript import Transcript
|
|
20
|
+
|
|
21
|
+
__all__ = ["Genome", "Gene", "Transcript", "Exon", "Chromosome", "Locus", "GenomeElement", "GenomeBuilder"]
|
GenomeUtils/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/__init__.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: This file is the initialization file for the GenomeUtils package.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from . import Downloaders
|
|
13
|
+
from . import Genome
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Genome",
|
|
17
|
+
"Downloaders",
|
|
18
|
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/downloaders/__init__.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: Initialization file for the downloaders package.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from .downloader import Downloader
|
|
13
|
+
from .genome_downloader import EnsemblGenomeDownloader
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Downloader",
|
|
17
|
+
"EnsemblGenomeDownloader",
|
|
18
|
+
]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/downloaders/downloader.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: This file defines the base Downloader class for handling file downloads.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from abc import ABC
|
|
12
|
+
import logging
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
import shutil
|
|
15
|
+
import tempfile
|
|
16
|
+
from typing import Optional, Set
|
|
17
|
+
|
|
18
|
+
import requests
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Downloader(ABC):
    """Base class for all downloaders.

    Owns a download directory (a fresh temporary directory when none is
    given), downloads files into it and remembers which files it created so
    that `cleanup()` can remove them again.
    """

    # (connect, read) timeout in seconds handed to `requests.get`, so that an
    # unresponsive server cannot block a download forever. Subclasses or
    # instances may override it.
    request_timeout = (10, 300)

    def __init__(self, download_dir: Optional[Path] = None):
        """
        Initializes the Downloader.

        Args:
            download_dir: Directory for storing downloaded files (a `Path` or
                          a path string). If None, uses a temporary directory.
        """
        # Remember whether we own the whole directory; cleanup() depends on it.
        self._is_temp_cache = download_dir is None
        self.download_dir = Path(download_dir) if download_dir else Path(tempfile.mkdtemp())
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(self.__class__.__name__)
        # Files written by this instance (used by cleanup() for non-temporary directories).
        self._created_files: Set[Path] = set()

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(download_dir={self.download_dir})"

    def download_file(self, url: str, filename: Optional[str] = None, force: bool = False) -> Path:
        """
        Download a single file from a URL and save it in the download directory.

        Args:
            url: The URL of the file to download.
            filename: The name of the file to be saved in the download directory.
                      If None, the last path segment of the URL (without query
                      parameters) is used.
            force: If True, redownload the file even if it exists. Defaults to False.

        Returns:
            The path to the downloaded (or already present) file.

        Raises:
            ValueError: If no filename is given and none can be derived from the URL.
            requests.HTTPError: If the server answers with an error status.
        """
        if filename is None:
            # Extract filename from URL, removing query parameters
            filename = url.split('/')[-1].split('?')[0]
        if not filename:
            # An empty name would make the destination the download directory
            # itself, which always "exists" and would be returned as a file.
            raise ValueError(f"Cannot determine a filename from URL '{url}'; pass 'filename' explicitly.")
        destination_path = self.download_dir / filename

        if not force and destination_path.exists():
            self.logger.info(f"File '{filename}' already exists in cache. Skipping download.")
            return destination_path

        self.logger.info(f"Downloading '{filename}'...")
        # Write to a side file first and rename on success, so an interrupted
        # download never leaves a truncated file that later looks like a cache hit.
        partial_path = destination_path.with_name(destination_path.name + ".part")
        try:
            with requests.get(url, stream=True, timeout=self.request_timeout) as r:
                r.raise_for_status()
                with open(partial_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            partial_path.replace(destination_path)
        except BaseException:
            if partial_path.exists():
                partial_path.unlink()
            raise
        self._created_files.add(destination_path)
        return destination_path

    def cleanup(self):
        """
        Clean up created files.

        A temporary download directory is removed entirely; in a
        user-supplied directory only the files downloaded by this instance
        are deleted.
        """
        if self._is_temp_cache:
            if self.download_dir.exists():
                shutil.rmtree(self.download_dir)
        else:
            for path in self._created_files:
                if path.exists():
                    path.unlink()
        # Nothing recorded here exists any more.
        self._created_files.clear()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/downloaders/genome_downloader.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: This file defines the EnsemblGenomeDownloader class for downloading genome data from Ensembl.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import gget
|
|
16
|
+
|
|
17
|
+
from .downloader import Downloader
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EnsemblGenomeDownloader(Downloader):
    """
    Downloader for Ensembl genome files.

    The download URLs are looked up with `gget`; the files themselves are
    stored in `genomes_root_dir/ensembl/{assembly_id}/{ensembl_release}`.
    """

    def __init__(self,
                 assembly_id: str,
                 ensembl_release: int,
                 species: str,
                 genomes_root_dir: Path | str = Path('./data/genomes')
                 ):
        """
        Initializes the EnsemblGenomeDownloader.

        Args:
            assembly_id: The identifier for the genome assembly (e.g., 'GRCh38').
            ensembl_release: The release number of the Ensembl database.
            species: The scientific name for the species (e.g., 'homo_sapiens').
            genomes_root_dir: The parent directory to store all downloaded genomes. Defaults to './data/genomes'.
        """
        root = Path(genomes_root_dir)
        self.assembly_id = assembly_id
        self.ensembl_release = ensembl_release
        self.species = species
        self.genomes_root_dir = root
        # The base class creates this directory and uses it as the download target.
        super().__init__(root / 'ensembl' / assembly_id / str(ensembl_release))

    def __repr__(self) -> str:
        fields = ", ".join([
            f"assembly_id={self.assembly_id}",
            f"ensembl_release={self.ensembl_release}",
            f"species={self.species}",
            f"genomes_root_dir={self.genomes_root_dir}",
        ])
        return f"{self.__class__.__name__}({fields})"

    def download(self) -> dict[str, Path]:
        """
        Downloads the genome files whose URLs are retrieved through gget.

        Returns:
            A dictionary mapping a file type to the local Path.
            Keys are `dna`, `cdna`, and `annotation`.
        """
        urls = gget.ref(
            self.species,
            which=["gtf", "cdna", "dna"],
            release=self.ensembl_release,
            ftp=True,
            verbose=False,
        )
        # The result is unpacked in the order the file types were requested.
        gtf_url, cdna_url, dna_url = tuple(urls)

        downloaded = {}
        for key, url in (('dna', dna_url), ('cdna', cdna_url), ('annotation', gtf_url)):
            # Each file keeps the name it has in its URL.
            downloaded[key] = self.download_file(url, Path(url).name)
        return downloaded
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Filename: GenomeUtils/genome/builder.py
|
|
4
|
+
Author: Arash Ayat
|
|
5
|
+
Copyright: 2025, Alexander Schliep
|
|
6
|
+
Version: 0.1.0
|
|
7
|
+
Description: This file contains the GenomeBuilder class for constructing genome objects.
|
|
8
|
+
License: LGPL-3.0-or-later
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import gzip
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import shutil
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Dict, Optional
|
|
19
|
+
|
|
20
|
+
import gffutils
|
|
21
|
+
from Bio import SeqIO
|
|
22
|
+
from Bio.Seq import Seq
|
|
23
|
+
from Bio.SeqRecord import SeqRecord
|
|
24
|
+
from tqdm import tqdm
|
|
25
|
+
|
|
26
|
+
from .chromosome import Chromosome
|
|
27
|
+
from .exon import Exon
|
|
28
|
+
from .gene import Gene
|
|
29
|
+
from .genome import Genome
|
|
30
|
+
from .transcript import Transcript
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _get_default_chromosomes_for_species(species: str) -> set[str]:
    """
    Look up the default main chromosomes of a supported species.

    Args:
        species: Species name; matched case-insensitively by substring
                 (e.g. 'human', 'homo_sapiens', 'mouse', 'mus musculus').

    Returns:
        Set of chromosome identifiers, each present both without and with
        the 'chr' prefix.

    Raises:
        ValueError: If the species is neither human nor mouse.
    """
    # (search terms, number of autosomes); checked in this order.
    known_species = (
        (('human', 'homo sapiens', 'homo_sapiens'), 22),
        (('mouse', 'mice', 'mus musculus', 'mus_musculus'), 19),
    )

    needle = species.lower()
    for terms, autosome_count in known_species:
        if any(term in needle for term in terms):
            # Autosomes 1..N plus X, Y and both mitochondrial spellings.
            bare_ids = {str(number) for number in range(1, autosome_count + 1)}
            bare_ids |= {'X', 'Y', 'M', 'MT'}
            return bare_ids | {f'chr{c}' for c in bare_ids}

    raise ValueError(f"Species '{species}' not recognized. Please use a supported species.")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BuilderStateError(Exception):
    """Raised when a GenomeBuilder step is called in an invalid state (e.g. out of order or twice)."""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _strip_version(seq_id: str) -> str:
    """Return `seq_id` cut off at its first '.' (e.g., 'NC_000001.11' -> 'NC_000001').

    An ID that contains no '.' is returned unchanged.
    """
    base, _dot, _version = seq_id.partition('.')
    return base
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class GenomeBuilder:
    """Constructs a Genome object from various file formats.

    This builder simplifies the process of assembling a complete Genome object
    by handling the parsing and integration of DNA sequences, cDNA sequences,
    and gene annotations from standard bioinformatics files.

    The correct order of operations is:

    1. with_dna_fasta()
    2. with_cdna_fasta()
    3. with_gtf_file()
    4. build()

    Example::

        builder = GenomeBuilder(id="hg38", species="homo_sapiens", name="Human Reference Genome")
        genome = (
            builder.with_dna_fasta(Path("path/to/dna.fa"))
            .with_cdna_fasta(Path("path/to/cdna.fa"))
            .with_gtf_file(Path("path/to/annotations.gtf"))
            .build()
        )
    """

    def __init__(self,
                 id: str,
                 species: str,
                 name: str,
                 main_chromosomes: Optional[list[str]] = None,
                 separate_scaffolds: bool = True,
                 **kwargs):
        """
        Initializes the GenomeBuilder.

        Args:
            id: The ID of the genome.
            species: The species of the genome.
            name: The name of the genome.
            main_chromosomes: A list of chromosome IDs to be considered as the main set.
                              If None, defaults to species-appropriate chromosomes
                              (Human: 1-22,X,Y,M,MT; Mouse: 1-19,X,Y,M,MT).
            separate_scaffolds: If True, separates scaffold chromosomes into a second Genome object.
                                The `build()` method will then return a tuple: (main_genome, scaffold_genome).
            kwargs: Additional attributes for the Genome object.

        Raises:
            ValueError: If `main_chromosomes` is None and the species has no default chromosome set.
        """
        self._genome = Genome(id, species, name, **kwargs)
        self._cdna_records: Dict[str, SeqRecord] = {}
        self._genes_map: Dict[str, Gene] = {}
        self._transcripts_map: Dict[str, Transcript] = {}
        self._chromosome_filter = None
        self._separate_scaffolds = separate_scaffolds
        self._scaffold_genome: Optional[Genome] = None

        if main_chromosomes is None:
            # Use species-dependent default chromosomes
            self._main_chromosomes = _get_default_chromosomes_for_species(species)
        else:
            self._main_chromosomes = set(main_chromosomes)

        # NOTE(review): this configures the root logger from library code; kept
        # because existing users may rely on seeing the INFO output.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(self.__class__.__name__)

        if self._separate_scaffolds:
            self.logger.info("Scaffold separation enabled. `build()` will return (main_genome, scaffold_genome).")
            self._scaffold_genome = Genome(
                id=f"{id}_scaffolds",
                species=species,
                name=f"{name} (Scaffolds)",
                **kwargs
            )

    def set_chromosome_filter(self, chromosomes: list[str]) -> "GenomeBuilder":
        """
        Set a filter to only include specified chromosomes.

        Must be called before `with_dna_fasta()`.

        Args:
            chromosomes: IDs of the chromosomes to keep.

        Returns:
            This builder, to allow method chaining.

        Raises:
            BuilderStateError: If chromosomes have already been loaded.
        """
        if self._genome.chromosomes:
            raise BuilderStateError("Cannot set chromosome filter after with_dna_fasta() has been called.")

        self._chromosome_filter = set(chromosomes)
        self.logger.info(f"Chromosome filter set to: {self._chromosome_filter}")
        return self

    def _extract_if_gzipped(self, path: Path, description: str) -> Path:
        """Return an uncompressed file for `path`, extracting it next to the original if it ends in '.gz'.

        An already existing extracted file is reused. `description` is only used in log messages.
        """
        if not str(path).endswith('.gz'):
            return path

        extracted_path = path.with_suffix('')
        if extracted_path.exists():
            self.logger.info(f"Using existing extracted {description}: {extracted_path}")
            return extracted_path

        self.logger.info(f"Extracting gzipped {description} to: {extracted_path}")
        # Stream in binary chunks (these files can be very large) into a side
        # file and rename on success, so an interrupted extraction is never
        # mistaken for a complete file on the next run.
        partial_path = extracted_path.with_name(extracted_path.name + '.part')
        try:
            with gzip.open(path, 'rb') as gz_in, open(partial_path, 'wb') as f_out:
                shutil.copyfileobj(gz_in, f_out)
            partial_path.replace(extracted_path)
        except BaseException:
            if partial_path.exists():
                partial_path.unlink()
            raise
        return extracted_path

    def with_dna_fasta(self, dna_fasta_path: Path) -> "GenomeBuilder":
        """
        Loads chromosome sequences from a genomic DNA FASTA file.
        This must be the first step in the build process.

        A '.gz' file is extracted next to the original first. Records that are
        not in the main chromosome set go to the scaffold genome when scaffold
        separation is enabled.

        Args:
            dna_fasta_path: Path to the (optionally gzipped) DNA FASTA file.

        Returns:
            This builder, to allow method chaining.

        Raises:
            BuilderStateError: If chromosomes have already been loaded.
        """
        if self._genome.chromosomes:
            raise BuilderStateError("with_dna_fasta() has already been called.")

        dna_file_to_use = self._extract_if_gzipped(Path(dna_fasta_path), "DNA FASTA")

        self.logger.info(f"Loading DNA sequences from {dna_file_to_use}...")

        # The index is handed to every Chromosome; the parse below provides
        # the record IDs and sequence lengths.
        dna_records = SeqIO.index(str(dna_file_to_use), "fasta")

        for record in SeqIO.parse(dna_file_to_use, "fasta"):
            if self._chromosome_filter and record.id not in self._chromosome_filter:
                continue

            # NOTE(review): scaffold chromosomes are also created with the main
            # genome as their `genome` - confirm this is intended.
            chromosome = Chromosome(record.id, dna_records, genome=self._genome, length=len(record.seq))

            if self._separate_scaffolds and record.id not in self._main_chromosomes:
                if self._scaffold_genome:
                    self._scaffold_genome.add_chromosome(chromosome)
            else:
                self._genome.add_chromosome(chromosome)

        self.logger.info(f"Loaded {len(self._genome.chromosomes)} main chromosomes.")
        if self._scaffold_genome:
            self.logger.info(f"Loaded {len(self._scaffold_genome.chromosomes)} scaffold chromosomes.")
        return self

    def with_cdna_fasta(self, cdna_fasta_path: Path) -> "GenomeBuilder":
        """
        Loads transcript sequences from a cDNA FASTA file.

        Records are keyed by their ID with the version suffix removed.

        Args:
            cdna_fasta_path: Path to the (optionally gzipped) cDNA FASTA file.

        Returns:
            This builder, to allow method chaining.

        Raises:
            BuilderStateError: If cDNA records have already been loaded.
        """
        if self._cdna_records:
            raise BuilderStateError("with_cdna_fasta() has already been called.")

        self.logger.info(f"Loading cDNA sequences from {cdna_fasta_path}...")

        open_func = gzip.open if str(cdna_fasta_path).endswith('.gz') else open
        with open_func(cdna_fasta_path, "rt") as handle:
            self._cdna_records = SeqIO.to_dict(SeqIO.parse(handle, "fasta"), key_function=lambda x: _strip_version(x.id))

        self.logger.info(f"Loaded {len(self._cdna_records)} cDNA sequences.")
        return self

    def _create_gtf_db(self, gtf_path: Path, gtf_db_path: Path):
        """Create the gffutils database at `gtf_db_path` from `gtf_path` (extracting a '.gz' file first)."""
        gtf_file_to_use = self._extract_if_gzipped(gtf_path, "GTF file")
        return gffutils.create_db(
            str(gtf_file_to_use),
            dbfn=str(gtf_db_path),
            keep_order=False,
            merge_strategy='error',
            id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
            disable_infer_genes=True,
            disable_infer_transcripts=True
        )

    def with_gtf_file(self, gtf_path: Path) -> "GenomeBuilder":
        """
        Parses a GTF file to build the gene-transcript-exon hierarchy.
        `with_dna_fasta()` and `with_cdna_fasta()` must be called before this method.

        A gffutils database is stored next to the GTF file (same name with the
        last suffix replaced by '.db') and reused on later runs; a database
        that cannot be opened is deleted and rebuilt.

        Args:
            gtf_path: Path to the (optionally gzipped) GTF file.

        Returns:
            This builder, to allow method chaining.

        Raises:
            BuilderStateError: If a prerequisite step is missing or this step already ran.
        """

        if not self._genome.chromosomes:
            raise BuilderStateError("Must call with_dna_fasta() before with_gtf_file().")
        if not self._cdna_records:
            raise BuilderStateError("Must call with_cdna_fasta() before with_gtf_file().")
        if self._genes_map:
            raise BuilderStateError("with_gtf_file() has already been called.")

        self.logger.info(f"Processing annotations from {gtf_path}...")

        gtf_path = Path(gtf_path)
        gtf_db_path = gtf_path.with_suffix('.db')

        db = None
        if gtf_db_path.exists():
            self.logger.info(f"Loading existing gffutils database: {gtf_db_path}")
            try:
                db = gffutils.FeatureDB(str(gtf_db_path))
            except Exception as e:
                # Best effort: an unreadable database is discarded and rebuilt below.
                self.logger.warning(f"Error loading existing gffutils database: {e}. Creating new database.")
                gtf_db_path.unlink()
        else:
            self.logger.info(f"Database not found. Creating new database at: {gtf_db_path}")

        if db is None:
            db = self._create_gtf_db(gtf_path, gtf_db_path)
            self.logger.info(f"GTF database created at: {gtf_db_path}")

        self._create_genes(db)

        self._create_transcripts(db)

        self._create_exons(db)

        self.logger.info(f"Successfully parsed and linked {len(self._genes_map)} genes "
                         f"and {len(self._transcripts_map)} transcripts.")
        return self

    def _find_chromosome(self, seqid: str):
        """Return the chromosome `seqid` from the main or the scaffold genome, or None if neither has it."""
        for genome in (self._genome, self._scaffold_genome):
            if genome is None:
                continue
            try:
                # presumably chromosome_by_id raises ValueError for unknown IDs - verify against Genome
                chromosome = genome.chromosome_by_id(seqid)
            except ValueError:
                continue
            if chromosome:
                return chromosome
        return None

    def _create_genes(self, db: gffutils.FeatureDB):
        """Creates Gene objects from the GTF database.

        Genes on filtered-out or unknown chromosomes are skipped; a gene whose
        attributes cannot be processed is logged and skipped as well.
        """
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'gene'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'gene'"
        total_genes = db.conn.execute(count_query).fetchone()[0]

        for g_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_genes, desc="Creating genes"):
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                continue

            chromosome = self._find_chromosome(seqid)
            if not chromosome:
                self.logger.warning(f"Chromosome '{seqid}' for gene '{g_id}' not found. Skipping gene.")
                continue

            try:
                attributes = json.loads(attributes_json)

                # The inner pop always runs, so a 'gene' attribute is removed
                # even when 'gene_name' is present.
                gene_names = attributes.pop('gene_name', attributes.pop('gene', [g_id]))
                gene_name = gene_names[0]
                attributes['gene_synonyms'] = gene_names[1:]

                # Drop exon/transcript level attributes.
                attributes = {k: v for k, v in attributes.items()
                              if not (k.startswith('exon') or k.startswith('transcript'))}

                gene_id = attributes.pop('gene_id', [g_id])[0]
                attributes = {k.replace('gene_', ''): v for k, v in attributes.items()}
                # Unwrap single-element lists.
                attributes = {k: (v[0] if isinstance(v, list) and len(v) == 1 else v) for k, v in attributes.items()}

                # NOTE(review): genes on scaffold chromosomes also get the main
                # genome as their `genome` - confirm this is intended.
                gene = Gene(id=gene_id, name=gene_name, chr=chromosome.id, start=start,
                            end=end, strand=strand, chromosome=chromosome,
                            genome=self._genome,
                            **attributes)
                chromosome.add_gene(gene)
                self._genes_map[g_id] = gene

            except Exception as e:
                self.logger.warning(f"Error processing gene '{g_id}': {e}. Skipping.")

    def _create_transcripts(self, db: gffutils.FeatureDB):
        """Creates Transcript objects and links them to genes.

        Each transcript takes its sequence from the loaded cDNA records (an
        empty sequence if there is none); the used record is removed.
        """
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'transcript'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'transcript'"
        total_transcripts = db.conn.execute(count_query).fetchone()[0]

        for t_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_transcripts, desc="Creating transcripts"):
            # Same filter as for genes and exons; without it every transcript
            # of a filtered-out chromosome would trigger a "gene not found" warning.
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                continue

            attributes = json.loads(attributes_json)

            gene_id = attributes.pop('gene_id', attributes.pop('gene', [None]))[0]
            transcript_id = attributes.pop('transcript_id', [t_id])[0]

            # Drop exon/gene level attributes.
            attributes = {k: v for k, v in attributes.items()
                          if not (k.startswith('exon') or k.startswith('gene'))}
            attributes = {k.replace('transcript_', ''): v for k, v in attributes.items()}
            # Unwrap single-element lists.
            attributes = {k: (v[0] if isinstance(v, list) and len(v) == 1 else v) for k, v in attributes.items()}
            if gene_id and gene_id in self._genes_map:
                gene = self._genes_map[gene_id]
                cdna_record = self._cdna_records.pop(transcript_id, None)
                sequence = cdna_record.seq if cdna_record is not None else Seq("")
                transcript = Transcript(id=transcript_id, chr=gene.chr, start=start, end=end, strand=strand,
                                        sequence=sequence, gene=gene, genome=self._genome, **attributes)
                gene.add_transcript(transcript)
                self._transcripts_map[t_id] = transcript
            else:
                self.logger.warning(f"Gene '{gene_id}' for transcript '{t_id}' not found. Skipping transcript.")

    def _create_exons(self, db: gffutils.FeatureDB):
        """Creates Exon objects and links them to transcripts."""
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'exon'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'exon'"
        total_exons = db.conn.execute(count_query).fetchone()[0]

        for e_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_exons, desc="Creating exons"):
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                continue

            attributes = json.loads(attributes_json)

            transcript_id = attributes.pop('transcript_id', [None])[0]
            exon_id = attributes.pop('exon_id', [e_id])[0]

            # Drop transcript/gene level attributes.
            attributes = {k: v for k, v in attributes.items()
                          if not (k.startswith('transcript') or k.startswith('gene'))}
            attributes = {k.replace('exon_', ''): v for k, v in attributes.items()}
            # Unwrap single-element lists.
            attributes = {k: (v[0] if isinstance(v, list) and len(v) == 1 else v) for k, v in attributes.items()}
            if transcript_id and transcript_id in self._transcripts_map:
                transcript = self._transcripts_map[transcript_id]
                exon = Exon(id=exon_id, chr=transcript.chr, start=start, end=end, strand=strand, transcript=transcript,
                            genome=self._genome,
                            **attributes)
                transcript.add_exon(exon)
            else:
                self.logger.warning(f"Transcript '{transcript_id}' for exon '{e_id}' not found. Skipping exon.")

    def build(self) -> Genome | tuple[Genome, Genome]:
        """
        Finalizes the Genome object by creating an index for fast lookups.

        The builder's intermediate lookup tables are cleared afterwards, so a
        second call raises BuilderStateError.

        Returns:
            The main genome, or the tuple (main_genome, scaffold_genome) when a
            scaffold genome is present.

        Raises:
            BuilderStateError: If no genes have been loaded via with_gtf_file().
        """
        if not self._genes_map:
            raise BuilderStateError("Cannot build Genome. GTF data is missing. "
                                    "Please call with_gtf_file() before build().")

        self._genome.index()
        if self._scaffold_genome:
            self.logger.info("Indexing scaffold genome for fast lookups...")
            self._scaffold_genome.index()

        self.logger.info("Genome construction complete.")

        self._offload_memory()

        if self._scaffold_genome:
            return self._genome, self._scaffold_genome

        return self._genome

    def _offload_memory(self):
        """Clears large data structures from memory after the build is complete."""
        self._cdna_records.clear()
        self._genes_map.clear()
        self._transcripts_map.clear()
|