GenomeUtils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/Downloaders.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file serves as a convenient entry point for downloader classes.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+
12
+ from .downloaders import Downloader, EnsemblGenomeDownloader
13
+
14
+ __all__ = ["EnsemblGenomeDownloader", "Downloader"]
GenomeUtils/Genome.py ADDED
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/Genome.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file serves as a convenient entry point for the Genome class and its related genome classes.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+
12
+ from .genome.builder import GenomeBuilder
13
+ from .genome.chromosome import Chromosome
14
+ from .genome.exon import Exon
15
+ from .genome.gene import Gene
16
+ from .genome.genome import Genome
17
+ from .genome.genome_element import GenomeElement
18
+ from .genome.locus import Locus
19
+ from .genome.transcript import Transcript
20
+
21
+ __all__ = ["Genome", "Gene", "Transcript", "Exon", "Chromosome", "Locus", "GenomeElement", "GenomeBuilder"]
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/__init__.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file is the initialization file for the GenomeUtils package.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+
12
+ from . import Downloaders
13
+ from . import Genome
14
+
15
+ __all__ = [
16
+ "Genome",
17
+ "Downloaders",
18
+ ]
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/downloaders/__init__.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: Initialization file for the downloaders package.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+
12
+ from .downloader import Downloader
13
+ from .genome_downloader import EnsemblGenomeDownloader
14
+
15
+ __all__ = [
16
+ "Downloader",
17
+ "EnsemblGenomeDownloader",
18
+ ]
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/downloaders/downloader.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file defines the base Downloader class for handling file downloads.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+ from abc import ABC
12
+ import logging
13
+ from pathlib import Path
14
+ import shutil
15
+ import tempfile
16
+ from typing import Optional, Set
17
+
18
+ import requests
19
+
20
+
21
class Downloader(ABC):
    """Base class for all downloaders.

    Downloaded files are stored in ``download_dir``. When no directory is
    given, a temporary directory is created and removed again by `cleanup()`.
    """

    # Timeout (in seconds) handed to ``requests.get`` so that a stalled server
    # cannot block a download forever. Override on a subclass or instance to
    # change it; ``None`` disables the timeout.
    request_timeout = 60

    def __init__(self, download_dir: Optional[Path] = None):
        """
        Initializes the Downloader.

        Args:
            download_dir: Directory for storing downloaded files (a `Path`;
                          plain strings are accepted and converted).
                          If None, uses a temporary directory.
        """
        self._is_temp_cache = download_dir is None
        # Coerce to Path so that callers passing a string do not crash on `.mkdir`.
        self.download_dir = Path(download_dir) if download_dir else Path(tempfile.mkdtemp())
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(self.__class__.__name__)
        # Files written by this instance; only these are removed by cleanup()
        # when the download directory is user-supplied.
        self._created_files: Set[Path] = set()

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(download_dir={self.download_dir})"

    def download_file(self, url: str, filename: Optional[str] = None, force: bool = False) -> Path:
        """
        Download a single file from a URL and save it in the cache directory.

        Args:
            url: The URL of the file to download.
            filename: The name of the file to be saved in the cache directory.
                      If None, the last path segment of the URL (without query
                      parameters) is used.
            force: If True, redownload the file even if it exists. Defaults to False.

        Returns:
            The path to the downloaded file.

        Raises:
            ValueError: If no filename is given and none can be derived from the URL.
            requests.HTTPError: If the server answers with an error status.
        """
        if filename is None:
            # Extract filename from URL, removing query parameters
            filename = url.split('/')[-1].split('?')[0]
        if not filename:
            # An empty name would resolve to the download directory itself,
            # which always "exists" and would be returned as a cached file.
            raise ValueError(
                f"Cannot determine a filename from URL '{url}'. Please pass `filename` explicitly."
            )
        destination_path = self.download_dir / filename

        if not force and destination_path.exists():
            self.logger.info(f"File '{filename}' already exists in cache. Skipping download.")
            return destination_path

        self.logger.info(f"Downloading '{filename}'...")
        # Write to a sibling ".part" file and rename on success, so an
        # interrupted download is never mistaken for a complete cached file.
        partial_path = destination_path.with_name(destination_path.name + ".part")
        try:
            with requests.get(url, stream=True, timeout=self.request_timeout) as r:
                r.raise_for_status()
                with open(partial_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            partial_path.replace(destination_path)
        except BaseException:
            if partial_path.exists():
                partial_path.unlink()
            raise
        self._created_files.add(destination_path)
        return destination_path

    def cleanup(self):
        """
        Clean up created files.

        Removes the whole download directory when it is a temporary one created
        by this instance; otherwise removes only the files this instance downloaded.
        """
        if self._is_temp_cache:
            if self.download_dir.exists():
                shutil.rmtree(self.download_dir)
        else:
            for path in self._created_files:
                if path.exists():
                    path.unlink()
        self._created_files.clear()
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/downloaders/genome_downloader.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file defines the EnsemblGenomeDownloader class for downloading genome files from Ensembl.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from pathlib import Path
14
+
15
+ import gget
16
+
17
+ from .downloader import Downloader
18
+
19
+
20
class EnsemblGenomeDownloader(Downloader):
    """
    Downloader for Ensembl genome data.

    The download URLs are looked up with `gget`; the files are then fetched and
    kept under `genomes_root_dir/ensembl/{assembly_id}/{ensembl_release}`.
    """

    def __init__(self,
                 assembly_id: str,
                 ensembl_release: int,
                 species: str,
                 genomes_root_dir: Path | str = Path('./data/genomes')
                 ):
        """
        Set up the downloader and its per-assembly, per-release directory.

        Args:
            assembly_id: Identifier of the genome assembly (e.g., 'GRCh38').
            ensembl_release: Ensembl release number.
            species: Scientific species name (e.g., 'homo_sapiens').
            genomes_root_dir: Parent directory holding all downloaded genomes.
                              Defaults to './data/genomes'.
        """
        root = Path(genomes_root_dir)
        self.genomes_root_dir = root
        self.species = species
        self.assembly_id = assembly_id
        self.ensembl_release = ensembl_release
        # The base class creates this directory and uses it as the cache.
        super().__init__(root.joinpath('ensembl', assembly_id, str(ensembl_release)))

    def __repr__(self) -> str:
        shown = (
            ("assembly_id", self.assembly_id),
            ("ensembl_release", self.ensembl_release),
            ("species", self.species),
            ("genomes_root_dir", self.genomes_root_dir),
        )
        rendered = ", ".join(f"{label}={value}" for label, value in shown)
        return f"{self.__class__.__name__}({rendered})"

    def download(self) -> dict[str, Path]:
        """
        Fetch the DNA, cDNA and annotation files, resolving their URLs via gget.

        Returns:
            A dictionary mapping a file type to the local Path.
            Keys are `dna`, `cdna`, and `annotation`.
        """
        links = gget.ref(
            self.species,
            which=["gtf", "cdna", "dna"],
            release=self.ensembl_release,
            ftp=True,
            verbose=False,
        )
        # The three links are unpacked in the same order as `which` above.
        gtf_url, cdna_url, dna_url = links

        # Downloads run in this order: dna, cdna, annotation.
        targets = (('dna', dna_url), ('cdna', cdna_url), ('annotation', gtf_url))
        return {kind: self.download_file(link, Path(link).name) for kind, link in targets}
@@ -0,0 +1,413 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Filename: GenomeUtils/genome/builder.py
4
+ Author: Arash Ayat
5
+ Copyright: 2025, Alexander Schliep
6
+ Version: 0.1.0
7
+ Description: This file contains the GenomeBuilder class for constructing genome objects.
8
+ License: LGPL-3.0-or-later
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import gzip
14
+ import json
15
+ import logging
16
+ import shutil
17
+ from pathlib import Path
18
+ from typing import Dict, Optional
19
+
20
+ import gffutils
21
+ from Bio import SeqIO
22
+ from Bio.Seq import Seq
23
+ from Bio.SeqRecord import SeqRecord
24
+ from tqdm import tqdm
25
+
26
+ from .chromosome import Chromosome
27
+ from .exon import Exon
28
+ from .gene import Gene
29
+ from .genome import Genome
30
+ from .transcript import Transcript
31
+
32
+
33
def _get_default_chromosomes_for_species(species: str) -> set[str]:
    """
    Look up the default set of main chromosomes for a supported species.

    Args:
        species: Species name; matched case-insensitively by substring
                 (e.g. 'human', 'homo_sapiens', 'mouse', 'mus musculus').

    Returns:
        Set of chromosome identifiers, each present both without and with
        the 'chr' prefix.

    Raises:
        ValueError: If the species is not one of the recognized ones.
    """
    lowered = species.lower()

    # (accepted aliases, number of autosomes); checked in this order.
    supported = (
        (('human', 'homo sapiens', 'homo_sapiens'), 22),
        (('mouse', 'mice', 'mus musculus', 'mus_musculus'), 19),
    )

    for aliases, autosome_count in supported:
        if any(alias in lowered for alias in aliases):
            break
    else:
        # No fallback species: unknown names are rejected.
        raise ValueError(f"Species '{species}' not recognized. Please use a supported species.")

    # Autosomes 1..N plus the sex and mitochondrial chromosomes.
    bare_names = {str(number) for number in range(1, autosome_count + 1)}
    bare_names.update(('X', 'Y', 'M', 'MT'))

    return bare_names | {f'chr{name}' for name in bare_names}
57
+
58
+
59
class BuilderStateError(Exception):
    """Raised when a GenomeBuilder step is called out of order or more than once."""
62
+
63
+
64
def _strip_version(seq_id: str) -> str:
    """Return the part of a sequence ID before its first '.' (e.g., 'NC_000001.11' -> 'NC_000001').

    IDs without a '.' are returned unchanged.
    """
    base, _dot, _version = seq_id.partition('.')
    return base
68
+
69
+
70
class GenomeBuilder:
    """Constructs a Genome object from various file formats.

    This builder simplifies the process of assembling a complete Genome object
    by handling the parsing and integration of DNA sequences, cDNA sequences,
    and gene annotations from standard bioinformatics files.

    The correct order of operations is:

    1. with_dna_fasta()
    2. with_cdna_fasta()
    3. with_gtf_file()
    4. build()

    Example::

        builder = GenomeBuilder(id="hg38", species="homo_sapiens", name="Human Reference Genome")
        genome = (
            builder.with_dna_fasta(Path("path/to/dna.fa"))
            .with_cdna_fasta(Path("path/to/cdna.fa"))
            .with_gtf_file(Path("path/to/annotations.gtf"))
            .build()
        )
    """

    def __init__(self,
                 id: str,
                 species: str,
                 name: str,
                 main_chromosomes: Optional[list[str]] = None,
                 separate_scaffolds: bool = True,
                 **kwargs):
        """
        Initializes the GenomeBuilder.

        Args:
            id: The ID of the genome.
            species: The species of the genome.
            name: The name of the genome.
            main_chromosomes: A list of chromosome IDs to be considered as the main set.
                              If None, defaults to species-appropriate chromosomes
                              (Human: 1-22,X,Y,M,MT; Mouse: 1-19,X,Y,M,MT).
            separate_scaffolds: If True, separates scaffold chromosomes into a second Genome object.
                                The `build()` method will then return a tuple: (main_genome, scaffold_genome).
            kwargs: Additional attributes for the Genome object.

        Raises:
            ValueError: If `main_chromosomes` is None and the species has no default chromosome set.
        """
        self._genome = Genome(id, species, name, **kwargs)
        self._cdna_records: Dict[str, SeqRecord] = {}
        self._genes_map: Dict[str, Gene] = {}
        self._transcripts_map: Dict[str, Transcript] = {}
        self._chromosome_filter = None
        self._separate_scaffolds = separate_scaffolds
        self._scaffold_genome: Optional[Genome] = None

        if main_chromosomes is None:
            # Use species-dependent default chromosomes
            self._main_chromosomes = _get_default_chromosomes_for_species(species)
        else:
            self._main_chromosomes = set(main_chromosomes)

        # NOTE(review): this configures the root logger as a side effect of
        # constructing a builder; kept so existing callers still see INFO output.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(self.__class__.__name__)

        if self._separate_scaffolds:
            self.logger.info("Scaffold separation enabled. `build()` will return (main_genome, scaffold_genome).")
            self._scaffold_genome = Genome(
                id=f"{id}_scaffolds",
                species=species,
                name=f"{name} (Scaffolds)",
                **kwargs
            )

    def set_chromosome_filter(self, chromosomes: list[str]) -> "GenomeBuilder":
        """
        Set a filter to only include specified chromosomes.

        Must be called before with_dna_fasta().

        Raises:
            BuilderStateError: If chromosomes have already been loaded.
        """
        if self._genome.chromosomes:
            raise BuilderStateError("Cannot set chromosome filter after with_dna_fasta() has been called.")

        self._chromosome_filter = set(chromosomes)
        self.logger.info(f"Chromosome filter set to: {self._chromosome_filter}")
        return self

    def _ensure_decompressed(self, path: Path, description: str) -> Path:
        """Return a plain-text version of `path`, extracting it first when it ends in '.gz'.

        The extracted file is placed next to the archive (same name without the
        '.gz' suffix) and reused on later calls. `description` is only used in
        log messages.
        """
        if not str(path).endswith('.gz'):
            return path

        extracted_path = path.with_suffix('')
        if extracted_path.exists():
            self.logger.info(f"Using existing extracted {description}: {extracted_path}")
            return extracted_path

        self.logger.info(f"Extracting gzipped {description} to: {extracted_path}")
        # Stream into a ".part" file and rename on success, so an interrupted
        # extraction is never picked up later as a complete file.
        partial_path = extracted_path.with_name(extracted_path.name + '.part')
        try:
            with gzip.open(path, 'rt') as gz_in, open(partial_path, 'w') as f_out:
                shutil.copyfileobj(gz_in, f_out)
            partial_path.replace(extracted_path)
        except BaseException:
            if partial_path.exists():
                partial_path.unlink()
            raise
        return extracted_path

    def with_dna_fasta(self, dna_fasta_path: Path) -> "GenomeBuilder":
        """
        Loads chromosome sequences from a genomic DNA FASTA file.
        This must be the first step in the build process.

        A gzipped file ('.gz') is extracted next to the archive before loading.

        Raises:
            BuilderStateError: If chromosomes have already been loaded.
        """
        if self._genome.chromosomes:
            raise BuilderStateError("with_dna_fasta() has already been called.")

        dna_file_to_use = self._ensure_decompressed(Path(dna_fasta_path), "DNA FASTA")

        self.logger.info(f"Loading DNA sequences from {dna_file_to_use}...")

        # The index is handed to every Chromosome; the parse pass below is
        # only used to enumerate the records and measure their lengths.
        dna_records = SeqIO.index(str(dna_file_to_use), "fasta")

        for record in SeqIO.parse(dna_file_to_use, "fasta"):
            if self._chromosome_filter and record.id not in self._chromosome_filter:
                continue

            chromosome = Chromosome(record.id, dna_records, genome=self._genome, length=len(record.seq))

            if self._separate_scaffolds and record.id not in self._main_chromosomes:
                if self._scaffold_genome:
                    self._scaffold_genome.add_chromosome(chromosome)
            else:
                self._genome.add_chromosome(chromosome)

        self.logger.info(f"Loaded {len(self._genome.chromosomes)} main chromosomes.")
        if self._scaffold_genome:
            self.logger.info(f"Loaded {len(self._scaffold_genome.chromosomes)} scaffold chromosomes.")
        return self

    def with_cdna_fasta(self, cdna_fasta_path: Path) -> "GenomeBuilder":
        """
        Loads transcript sequences from a cDNA FASTA file.

        Records are keyed by their ID with the version suffix removed.
        Gzipped files ('.gz') are read directly.

        Raises:
            BuilderStateError: If cDNA records have already been loaded.
        """
        if self._cdna_records:
            raise BuilderStateError("with_cdna_fasta() has already been called.")

        self.logger.info(f"Loading cDNA sequences from {cdna_fasta_path}...")

        open_func = gzip.open if str(cdna_fasta_path).endswith('.gz') else open
        with open_func(cdna_fasta_path, "rt") as handle:
            self._cdna_records = SeqIO.to_dict(SeqIO.parse(handle, "fasta"), key_function=lambda x: _strip_version(x.id))

        self.logger.info(f"Loaded {len(self._cdna_records)} cDNA sequences.")
        return self

    def _create_gtf_database(self, gtf_path: Path, gtf_db_path: Path) -> gffutils.FeatureDB:
        """Creates a gffutils database at `gtf_db_path` from `gtf_path` (extracting '.gz' input first)."""
        gtf_file_to_use = self._ensure_decompressed(gtf_path, "GTF file")

        db = gffutils.create_db(
            str(gtf_file_to_use),
            dbfn=str(gtf_db_path),
            keep_order=False,
            merge_strategy='error',
            id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
            disable_infer_genes=True,
            disable_infer_transcripts=True
        )
        self.logger.info(f"GTF database created at: {gtf_db_path}")
        return db

    def with_gtf_file(self, gtf_path: Path) -> "GenomeBuilder":
        """
        Parses a GTF file to build the gene-transcript-exon hierarchy.
        `with_dna_fasta()` and `with_cdna_fasta()` must be called before this method.

        A gffutils database is stored next to the GTF file (suffix replaced by
        '.db') and reused on later runs; if it cannot be loaded it is recreated.

        Raises:
            BuilderStateError: If a prerequisite step is missing or this step was already run.
        """

        if not self._genome.chromosomes:
            raise BuilderStateError("Must call with_dna_fasta() before with_gtf_file().")
        if not self._cdna_records:
            raise BuilderStateError("Must call with_cdna_fasta() before with_gtf_file().")
        if self._genes_map:
            raise BuilderStateError("with_gtf_file() has already been called.")

        self.logger.info(f"Processing annotations from {gtf_path}...")

        gtf_path = Path(gtf_path)
        gtf_db_path = gtf_path.with_suffix('.db')

        db = None
        if gtf_db_path.exists():
            self.logger.info(f"Loading existing gffutils database: {gtf_db_path}")
            try:
                db = gffutils.FeatureDB(str(gtf_db_path))
            except Exception as e:
                self.logger.warning(f"Error loading existing gffutils database: {e}. Creating new database.")
                gtf_db_path.unlink()
        else:
            self.logger.info(f"Database not found. Creating new database at: {gtf_db_path}")

        if db is None:
            # Both the "missing" and the "unreadable" case go through the same creation path.
            db = self._create_gtf_database(gtf_path, gtf_db_path)

        self._create_genes(db)

        self._create_transcripts(db)

        self._create_exons(db)

        self.logger.info(f"Successfully parsed and linked {len(self._genes_map)} genes "
                         f"and {len(self._transcripts_map)} transcripts.")
        return self

    @staticmethod
    def _clean_attributes(attributes: dict, drop_prefixes: tuple, strip_token: str) -> dict:
        """Normalizes GTF attributes before they are passed on as keyword arguments.

        Keys starting with any of `drop_prefixes` are discarded, `strip_token`
        is removed from the remaining keys, and single-element lists are
        unwrapped to their only element.
        """
        cleaned = {}
        for key, value in attributes.items():
            if key.startswith(drop_prefixes):
                continue
            if isinstance(value, list) and len(value) == 1:
                value = value[0]
            cleaned[key.replace(strip_token, '')] = value
        return cleaned

    def _find_chromosome(self, seqid: str) -> Optional[Chromosome]:
        """Looks up `seqid` in the main genome, then in the scaffold genome; returns None if absent."""
        for genome in (self._genome, self._scaffold_genome):
            if genome is None:
                continue
            try:
                chromosome = genome.chromosome_by_id(seqid)
            except ValueError:
                # Not in this genome; try the next one instead of aborting the build.
                continue
            if chromosome:
                return chromosome
        return None

    def _create_genes(self, db: gffutils.FeatureDB):
        """Creates Gene objects from the GTF database.

        Genes on filtered-out or unknown chromosomes are skipped; a gene that
        fails to process is logged and skipped.
        """
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'gene'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'gene'"
        total_genes = db.conn.execute(count_query).fetchone()[0]

        for g_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_genes, desc="Creating genes"):
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                continue

            chromosome = self._find_chromosome(seqid)
            if not chromosome:
                self.logger.warning(f"Chromosome '{seqid}' for gene '{g_id}' not found. Skipping gene.")
                continue

            try:
                attributes = json.loads(attributes_json)

                # Prefer 'gene_name', then 'gene', then the database id; extra names become synonyms.
                gene_names = attributes.pop('gene_name', attributes.pop('gene', [g_id]))
                gene_name = gene_names[0]
                attributes['gene_synonyms'] = gene_names[1:]

                gene_id = attributes.pop('gene_id', [g_id])[0]
                attributes = self._clean_attributes(attributes, ('exon', 'transcript'), 'gene_')

                gene = Gene(id=gene_id, name=gene_name, chr=chromosome.id, start=start,
                            end=end, strand=strand, chromosome=chromosome,
                            genome=self._genome,
                            **attributes)
                chromosome.add_gene(gene)
                self._genes_map[g_id] = gene

            except Exception as e:
                self.logger.warning(f"Error processing gene '{g_id}': {e}. Skipping.")

    def _create_transcripts(self, db: gffutils.FeatureDB):
        """Creates Transcript objects and links them to genes.

        Transcripts on filtered-out chromosomes are skipped silently, matching
        how genes and exons are handled; transcripts whose gene is unknown are
        logged and skipped.
        """
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'transcript'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'transcript'"
        total_transcripts = db.conn.execute(count_query).fetchone()[0]

        for t_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_transcripts, desc="Creating transcripts"):
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                # Their genes were filtered out as well; no warning needed.
                continue

            attributes = json.loads(attributes_json)

            gene_id = attributes.pop('gene_id', attributes.pop('gene', [None]))[0]
            transcript_id = attributes.pop('transcript_id', [t_id])[0]

            attributes = self._clean_attributes(attributes, ('exon', 'gene'), 'transcript_')

            if gene_id and gene_id in self._genes_map:
                gene = self._genes_map[gene_id]
                # Transcripts without a cDNA record get an empty sequence.
                sequence = self._cdna_records.pop(transcript_id, SeqRecord(Seq(""))).seq
                transcript = Transcript(id=transcript_id, chr=gene.chr, start=start, end=end, strand=strand,
                                        sequence=sequence, gene=gene, genome=self._genome, **attributes)
                gene.add_transcript(transcript)
                self._transcripts_map[t_id] = transcript
            else:
                self.logger.warning(f"Gene '{gene_id}' for transcript '{t_id}' not found. Skipping transcript.")

    def _create_exons(self, db: gffutils.FeatureDB):
        """Creates Exon objects and links them to transcripts.

        Exons on filtered-out chromosomes are skipped silently; exons whose
        transcript is unknown are logged and skipped.
        """
        query = "SELECT id, seqid, start, end, strand, attributes FROM features WHERE featuretype = 'exon'"

        count_query = "SELECT count(*) FROM features WHERE featuretype = 'exon'"
        total_exons = db.conn.execute(count_query).fetchone()[0]

        for e_id, seqid, start, end, strand, attributes_json in tqdm(db.conn.execute(query), total=total_exons, desc="Creating exons"):
            if self._chromosome_filter and seqid not in self._chromosome_filter:
                continue

            attributes = json.loads(attributes_json)

            transcript_id = attributes.pop('transcript_id', [None])[0]
            exon_id = attributes.pop('exon_id', [e_id])[0]

            attributes = self._clean_attributes(attributes, ('transcript', 'gene'), 'exon_')

            if transcript_id and transcript_id in self._transcripts_map:
                transcript = self._transcripts_map[transcript_id]
                exon = Exon(id=exon_id, chr=transcript.chr, start=start, end=end, strand=strand, transcript=transcript,
                            genome=self._genome,
                            **attributes)
                transcript.add_exon(exon)
            else:
                self.logger.warning(f"Transcript '{transcript_id}' for exon '{e_id}' not found. Skipping exon.")

    def build(self) -> Genome | tuple[Genome, Genome]:
        """
        Finalizes the Genome object by creating an index for fast lookups.

        Returns:
            The main Genome, or a tuple (main_genome, scaffold_genome) when a
            scaffold genome exists.

        Raises:
            BuilderStateError: If no genes have been loaded via with_gtf_file().
        """
        if not self._genes_map:
            raise BuilderStateError("Cannot build Genome. GTF data is missing. "
                                    "Please call with_gtf_file() before build().")

        self._genome.index()
        if self._scaffold_genome:
            self.logger.info("Indexing scaffold genome for fast lookups...")
            self._scaffold_genome.index()

        self.logger.info("Genome construction complete.")

        self._offload_memory()

        if self._scaffold_genome:
            return self._genome, self._scaffold_genome

        return self._genome

    def _offload_memory(self):
        """Clears large data structures from memory after the build is complete."""
        self._cdna_records.clear()
        self._genes_map.clear()
        self._transcripts_map.clear()