seqmat 0.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seqmat/__init__.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ SeqMat - Lightning-fast genomic sequence matrix library
3
+
4
+ A comprehensive Python library for genomic sequence analysis with full mutation tracking,
5
+ splicing analysis, and sequence manipulation.
6
+ """
7
+
8
# Package metadata.
#
# The version is resolved from the installed distribution's metadata so that it
# cannot drift from the published wheel (the hard-coded literal used to read
# "0.1.34" inside the 0.1.38 wheel). The literal below is only a fallback for
# environments where no distribution metadata is available, e.g. when running
# straight from a source checkout.
_FALLBACK_VERSION = "0.1.38"
try:
    from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
    from importlib.metadata import version as _distribution_version
except ImportError:  # importlib.metadata is unavailable before Python 3.8
    __version__ = _FALLBACK_VERSION
else:
    try:
        __version__ = _distribution_version("seqmat")
    except _PackageNotFoundError:
        __version__ = _FALLBACK_VERSION
__author__ = "Nicolas Lynn Vila"
__email__ = "nicolasalynn@gmail.com"
11
+
12
+ from .seqmat import SeqMat
13
+ from .gene import Gene
14
+ from .transcript import Transcript
15
+ from .config import get_default_organism, get_data_dir, get_config_dir
16
+ from .utils import (
17
+ setup_genomics_data,
18
+ load_config,
19
+ save_config,
20
+ list_available_organisms,
21
+ list_supported_organisms,
22
+ get_organism_info,
23
+ list_gene_biotypes,
24
+ count_genes,
25
+ get_gene_list,
26
+ data_summary,
27
+ print_data_summary,
28
+ search_genes,
29
+ get_all_genes
30
+ )
31
+
32
# Names re-exported by ``from seqmat import *``; every entry is imported above.
__all__ = [
    # Core classes
    "SeqMat",
    "Gene",
    "Transcript",
    # Configuration helpers
    "get_default_organism",
    "get_data_dir",
    "get_config_dir",
    # Data setup and configuration persistence
    "setup_genomics_data",
    "load_config",
    "save_config",
    # Organism and gene queries
    "list_available_organisms",
    "list_supported_organisms",
    "get_organism_info",
    "list_gene_biotypes",
    "count_genes",
    "get_gene_list",
    "data_summary",
    "print_data_summary",
    "search_genes",
    "get_all_genes",
]
seqmat/cli.py ADDED
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for SeqMat data management"""
3
+
4
+ import argparse
5
+ import sys
6
+ from typing import Optional
7
+
8
+ from .utils import (
9
+ setup_genomics_data,
10
+ print_data_summary,
11
+ list_available_organisms,
12
+ list_supported_organisms,
13
+ list_gene_biotypes,
14
+ count_genes,
15
+ get_gene_list,
16
+ search_genes,
17
+ get_organism_info,
18
+ test_installation
19
+ )
20
+ from .config import get_available_organisms, get_default_organism, get_organism_info as get_organism_config_info, get_data_dir
21
+
22
+
23
def cmd_setup(args):
    """Handle the ``setup`` sub-command.

    Forwards ``args.path``, ``args.organism``, ``args.force`` and
    ``args.pickup`` to ``setup_genomics_data`` and prints a one-line status.
    Any exception raised along the way is reported on stdout and the process
    exits with status 1.
    """
    try:
        options = dict(
            basepath=args.path,
            organism=args.organism,
            force=args.force,
            pickup=args.pickup,
        )
        setup_genomics_data(**options)
        print(f"✅ Successfully set up {args.organism} data in {args.path}")
    except Exception as err:
        # Top-level CLI boundary: report the failure instead of a traceback.
        print(f"❌ Error setting up data: {err}")
        sys.exit(1)
36
+
37
+
38
def cmd_list_organisms(args):
    """Handle the ``organisms`` sub-command.

    Prints one line per supported organism with its display name and whether
    it is already configured. When nothing is configured yet, a hint on how to
    run the setup is appended.

    Args:
        args: Parsed CLI namespace; accepted for dispatcher compatibility and
            not read by this handler.
    """
    print("🌍 Organism Support Status:")
    print("-" * 30)

    supported = list_supported_organisms()
    configured = list_available_organisms()
    configured_lookup = set(configured)

    for org in supported:
        # Resolving the display name is best-effort: fall back to the organism
        # key if the lookup fails. ``Exception`` (rather than a bare
        # ``except``) keeps Ctrl-C and SystemExit working.
        try:
            name = get_organism_config_info(org).get('name', org)
        except Exception:
            name = org

        status = "✅ Configured" if org in configured_lookup else "❌ Not configured"
        print(f"{org}: {name} - {status}")

    if not configured:
        print("\nTo set up data, run:")
        print(" seqmat-setup --path /your/data/path --organism hg38")
63
+
64
+
65
def cmd_summary(args):
    """Handle the ``summary`` sub-command by printing the data summary.

    ``args`` is accepted so the handler matches the dispatcher's calling
    convention; it is not read.
    """
    print_data_summary()
68
+
69
+
70
def cmd_biotypes(args):
    """Handle the ``biotypes`` sub-command.

    Prints every gene biotype known for ``args.organism`` together with its
    gene count. Exits with status 1 when no organism was given or when no
    biotypes are found for it.
    """
    organism = args.organism
    if not organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    biotypes = list_gene_biotypes(organism)
    if not biotypes:
        print(f"❌ No data found for organism '{organism}'")
        print("Available organisms:", ", ".join(list_available_organisms()))
        sys.exit(1)

    print(f"📊 Gene biotypes in {organism}:")
    print("-" * 30)

    # One lookup for all counts; biotypes without an entry are reported as 0.
    genes_per_biotype = count_genes(organism)
    for name in biotypes:
        print(f"{name}: {genes_per_biotype.get(name, 0):,} genes")
92
+
93
+
94
def cmd_count(args):
    """Handle the ``count`` sub-command.

    With ``--biotype`` a single count is printed; otherwise one line per
    biotype (sorted) followed by the grand total. Exits with status 1 when no
    organism was given or when no counts are available.
    """
    if not args.organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    organism, wanted = args.organism, args.biotype
    counts = count_genes(organism, wanted)
    if not counts:
        print(f"❌ No data found for organism '{organism}'")
        sys.exit(1)

    if wanted:
        # Single-biotype report.
        print(f"📊 {organism} {wanted}: {counts.get(wanted, 0):,} genes")
        return

    # Full per-biotype table with a running total.
    separator = "-" * 30
    print(f"📊 Gene counts for {organism}:")
    print(separator)
    grand_total = 0
    for name, n_genes in sorted(counts.items()):
        print(f"{name}: {n_genes:,} genes")
        grand_total += n_genes
    print(separator)
    print(f"Total: {grand_total:,} genes")
118
+
119
+
120
def cmd_list_genes(args):
    """Handle the ``list`` sub-command.

    Prints a numbered list of genes for ``args.organism`` / ``args.biotype``,
    limited to ``args.limit`` entries. When the listing is exactly as long as
    the limit, a trailing note reports the total number of genes. Exits with
    status 1 when either argument is missing or no genes are found.
    """
    if not (args.organism and args.biotype):
        print("❌ Please specify both --organism and --biotype")
        sys.exit(1)

    organism, biotype, limit = args.organism, args.biotype, args.limit
    genes = get_gene_list(organism, biotype, limit=limit)
    if not genes:
        print(f"❌ No genes found for {organism} {biotype}")
        sys.exit(1)

    shown = len(genes)
    print(f"📋 {organism} {biotype} genes ({shown} shown):")
    print("-" * 50)
    for position, gene_name in enumerate(genes, start=1):
        print(f"{position:4d}. {gene_name}")

    # The listing filled the limit, so report how many genes exist overall.
    if limit and shown == limit:
        overall = count_genes(organism, biotype).get(biotype, 0)
        print(f"\n(Showing first {limit} of {overall:,} total genes)")
142
+
143
+
144
def cmd_search(args):
    """Handle the ``search`` sub-command.

    Looks up genes matching ``args.query`` in ``args.organism`` (optionally
    restricted to ``args.biotype``, capped at ``args.limit``) and prints a
    numbered result list. Exits with status 1 when organism or query is
    missing or when nothing matches.
    """
    if not (args.organism and args.query):
        print("❌ Please specify both --organism and --query")
        sys.exit(1)

    hits = search_genes(
        organism=args.organism,
        query=args.query,
        biotype=args.biotype,
        limit=args.limit,
    )
    if not hits:
        print(f"❌ No genes found matching '{args.query}' in {args.organism}")
        sys.exit(1)

    print(f"🔍 Search results for '{args.query}' in {args.organism}:")
    print("-" * 50)

    for rank, hit in enumerate(hits, start=1):
        identifier = hit.get('gene_id', '')
        # The gene ID is shown only when the result carries a non-empty one.
        if identifier:
            detail = f"({identifier}) - {hit['biotype']}"
        else:
            detail = f"({hit['biotype']})"
        print(f"{rank:2d}. {hit['gene_name']} {detail}")

    if len(hits) == args.limit:
        print(f"\n(Showing first {args.limit} results)")
173
+
174
+
175
def cmd_info(args):
    """Handle the ``info`` sub-command.

    Prints gene counts per biotype, the available chromosomes and the
    configured data paths (each flagged with whether it exists on disk) for
    ``args.organism``. Exits with status 1 when no organism was given or when
    the lookup result carries an ``"error"`` entry.
    """
    # Imported once per call; the original re-ran the import on every loop
    # iteration below.
    from pathlib import Path

    if not args.organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    info = get_organism_info(args.organism)

    if "error" in info:
        print(f"❌ {info['error']}")
        sys.exit(1)

    print(f"ℹ️ Detailed information for {args.organism}:")
    print("=" * 40)

    # Data availability
    data_avail = info.get("data_available", {})

    if "gene_counts" in data_avail:
        print("📊 Gene Data:")
        total_genes = 0
        for biotype, count in sorted(data_avail["gene_counts"].items()):
            print(f" {biotype}: {count:,} genes")
            total_genes += count
        print(f" Total: {total_genes:,} genes")
        print()

    if "chromosomes" in data_avail:
        chroms = data_avail["chromosomes"]
        print(f"🧬 Chromosome Data: {len(chroms)} chromosomes")
        print(f" Available: {', '.join(sorted(chroms))}")
        print()

    print("📁 Data Paths:")
    # Tolerate a missing "paths" entry the same way "data_available" is
    # tolerated above, instead of failing with KeyError after partial output.
    for path_name, path_value in info.get("paths", {}).items():
        exists = "✅" if Path(path_value).exists() else "❌"
        print(f" {path_name}: {exists} {path_value}")
213
+
214
+
215
def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Registers the sub-commands setup, organisms, summary, biotypes, count,
    list, search, info and test. When no sub-command is supplied the help text
    is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog="seqmat",
        description="SeqMat genomics data management CLI",
    )
    commands = parser.add_subparsers(dest="command", help="Commands")

    # --- setup -----------------------------------------------------------
    # Defaults and choices are read from the configuration at start-up so the
    # help text reflects the current environment.
    data_root = str(get_data_dir())
    organism_choices = get_available_organisms()
    fallback_organism = get_default_organism()

    setup_cmd = commands.add_parser("setup", help="Set up genomics data")
    setup_cmd.add_argument(
        "--path",
        default=data_root,
        help=f"Base path for data storage (default: {data_root})",
    )
    setup_cmd.add_argument(
        "--organism",
        default=fallback_organism,
        choices=organism_choices,
        help=f"Organism to set up (default: {fallback_organism})",
    )
    setup_cmd.add_argument("--force", action="store_true", help="Force overwrite existing data")
    setup_cmd.add_argument("--pickup", action="store_true", help="Resume interrupted setup, reuse existing downloaded files")
    setup_cmd.set_defaults(func=cmd_setup)

    # --- organisms -------------------------------------------------------
    organisms_cmd = commands.add_parser("organisms", help="List supported/configured organisms")
    organisms_cmd.set_defaults(func=cmd_list_organisms)

    # --- summary ---------------------------------------------------------
    summary_cmd = commands.add_parser("summary", help="Show data summary")
    summary_cmd.set_defaults(func=cmd_summary)

    # --- biotypes --------------------------------------------------------
    biotypes_cmd = commands.add_parser("biotypes", help="List gene biotypes")
    biotypes_cmd.add_argument("--organism", help="Organism to query")
    biotypes_cmd.set_defaults(func=cmd_biotypes)

    # --- count -----------------------------------------------------------
    count_cmd = commands.add_parser("count", help="Count genes")
    count_cmd.add_argument("--organism", help="Organism to query")
    count_cmd.add_argument("--biotype", help="Specific biotype to count")
    count_cmd.set_defaults(func=cmd_count)

    # --- list ------------------------------------------------------------
    list_cmd = commands.add_parser("list", help="List genes")
    list_cmd.add_argument("--organism", help="Organism to query")
    list_cmd.add_argument("--biotype", help="Gene biotype")
    list_cmd.add_argument("--limit", type=int, default=50, help="Maximum genes to show")
    list_cmd.set_defaults(func=cmd_list_genes)

    # --- search ----------------------------------------------------------
    search_cmd = commands.add_parser("search", help="Search genes by name")
    search_cmd.add_argument("--organism", help="Organism to search")
    search_cmd.add_argument("--query", help="Gene name pattern to search")
    search_cmd.add_argument("--biotype", help="Filter by biotype")
    search_cmd.add_argument("--limit", type=int, default=20, help="Maximum results")
    search_cmd.set_defaults(func=cmd_search)

    # --- info ------------------------------------------------------------
    info_cmd = commands.add_parser("info", help="Show organism information")
    info_cmd.add_argument("--organism", help="Organism to query")
    info_cmd.set_defaults(func=cmd_info)

    # --- test ------------------------------------------------------------
    test_cmd = commands.add_parser("test", help="Test SeqMat installation and data setup")
    test_cmd.add_argument("--organism", help="Organism to test (uses default if not specified)")
    test_cmd.add_argument("--quiet", action="store_true", help="Suppress detailed output")
    test_cmd.set_defaults(func=cmd_test)

    namespace = parser.parse_args()

    # No sub-command given: show usage and signal failure to the shell.
    if not namespace.command:
        parser.print_help()
        sys.exit(1)

    # Each sub-parser stored its handler under ``func``.
    namespace.func(namespace)
292
+
293
+
294
def cmd_test(args):
    """Handle the ``test`` sub-command.

    Runs ``test_installation`` for ``args.organism`` (verbose unless
    ``args.quiet`` is set) and exits with status 1 if any test failed,
    otherwise with status 0.
    """
    report = test_installation(args.organism, verbose=not args.quiet)

    # Translate the failure count into the process exit status.
    exit_status = 1 if report['tests_failed'] > 0 else 0
    sys.exit(exit_status)
307
+
308
+
309
+ if __name__ == "__main__":
310
+ main()
seqmat/config.py ADDED
@@ -0,0 +1,127 @@
1
+ """Configuration management for SeqMat"""
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, Any, List, Optional
6
+ from platformdirs import user_config_dir, user_data_dir
7
+
8
# Per-user locations resolved through platformdirs, i.e. following the
# operating system's conventions (presumably the XDG base directories on
# Linux - confirm against the platformdirs documentation).
DEFAULT_DATA_DIR = Path(user_data_dir("seqmat", appauthor=False))
DEFAULT_CONFIG_DIR = Path(user_config_dir("seqmat", appauthor=False))
# JSON file that holds the persisted configuration.
CONFIG_FILE = DEFAULT_CONFIG_DIR.joinpath('config.json')
12
+
13
# Built-in download sources per organism build. An entry with the same key in
# the user configuration is merged over these values by get_organism_info().
DEFAULT_ORGANISM_DATA = {
    "hg38": {
        "name": "Homo sapiens (Human)",
        "urls": {
            "fasta": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz",
            "gtf": "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz",
            "conservation": "https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl",
            "gtex": "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
        },
    },
    "mm39": {
        "name": "Mus musculus (Mouse)",
        "urls": {
            "fasta": "https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz",
            "gtf": "https://ftp.ensembl.org/pub/release-112/gtf/mus_musculus/Mus_musculus.GRCm39.112.gtf.gz",
        },
    },
}

# Settings used when the configuration file is absent or omits a key.
DEFAULT_SETTINGS = {
    "default_organism": "hg38",
    "directory_structure": {
        "chromosomes": "chromosomes",
        "annotations": "annotations",
    },
}
40
+
41
def load_config() -> Dict[str, Any]:
    """Load the configuration stored in ``CONFIG_FILE``.

    Top-level keys found in the file override the entries of
    ``DEFAULT_SETTINGS``; when the file does not exist the defaults alone are
    returned.

    Returns:
        A fresh dictionary that the caller may modify freely.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Function-scope import: ``copy`` is not among this module's imports.
    import copy

    # Deep copy so that mutating a nested value of the result (for example the
    # 'directory_structure' mapping) cannot alter the module-level defaults; a
    # shallow ``.copy()`` would share those nested dictionaries.
    merged_config = copy.deepcopy(DEFAULT_SETTINGS)

    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            merged_config.update(json.load(f))

    return merged_config
51
+
52
def save_config(config: Dict[str, Any]) -> None:
    """Write *config* as indented JSON to ``CONFIG_FILE``.

    The configuration directory is created first if it is missing.

    Args:
        config: JSON-serialisable configuration mapping.
    """
    # parents=True: on a fresh account the parent of the config directory may
    # not exist yet, in which case a plain mkdir() raises FileNotFoundError.
    DEFAULT_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)
57
+
58
def get_default_organism() -> str:
    """Return the configured default organism.

    Falls back to the ``'default_organism'`` entry of ``DEFAULT_SETTINGS``
    when the loaded configuration has no such key.
    """
    fallback = DEFAULT_SETTINGS['default_organism']
    return load_config().get('default_organism', fallback)
62
+
63
def get_available_organisms() -> List[str]:
    """Return the sorted names of all known organisms.

    An organism is known when it has a built-in entry in
    ``DEFAULT_ORGANISM_DATA`` or appears as a top-level key of the loaded
    configuration (the settings keys themselves are not organisms).
    """
    settings_keys = ('default_organism', 'directory_structure')
    names = {key for key in load_config() if key not in settings_keys}
    names.update(DEFAULT_ORGANISM_DATA)
    return sorted(names)
69
+
70
def get_organism_info(organism: str) -> Dict[str, Any]:
    """Return the information (display name, URLs, ...) known for *organism*.

    A dictionary entry for the organism in the loaded configuration takes
    precedence; when built-in defaults exist as well, the configured top-level
    keys are laid over a copy of those defaults. Without a configured entry
    the built-in defaults are returned.

    Args:
        organism: Organism key such as ``'hg38'``.

    Returns:
        A dictionary the caller may modify without affecting
        ``DEFAULT_ORGANISM_DATA``.

    Raises:
        ValueError: If the organism is neither configured nor built in.
    """
    # Function-scope import: ``copy`` is not among this module's imports.
    import copy

    defaults = DEFAULT_ORGANISM_DATA.get(organism)

    # Settings keys (e.g. 'directory_structure') also hold dictionaries in the
    # configuration but are not organisms; get_available_organisms() excludes
    # them, so they must not be reported as organism entries here either.
    configured = None if organism in DEFAULT_SETTINGS else load_config().get(organism)

    if isinstance(configured, dict):
        if defaults is None:
            return configured
        # Deep copy so the nested 'urls' mapping is not shared with the
        # module-level defaults.
        merged = copy.deepcopy(defaults)
        merged.update(configured)
        return merged

    if defaults is not None:
        # Hand out a copy, never the module-level dictionary itself.
        return copy.deepcopy(defaults)

    raise ValueError(f"Organism '{organism}' not configured. Available: {get_available_organisms()}")
86
+
87
def get_organism_config(organism: Optional[str] = None) -> Dict[str, Path]:
    """Return the configured paths of an organism as ``Path`` objects.

    Args:
        organism: Organism key; the configured default organism is used when
            ``None``.

    Returns:
        Mapping of every string-valued entry of the organism's configuration
        to a ``Path``; entries with non-string values are left out.

    Raises:
        ValueError: If the organism has no entry in the loaded configuration
            or its entry is not a dictionary.
    """
    name = get_default_organism() if organism is None else organism

    settings = load_config()
    if name not in settings:
        raise ValueError(f"Organism '{name}' not configured. Run setup_genomics_data() first.")

    entry = settings[name]
    if not isinstance(entry, dict):
        # Strings get a dedicated message that echoes the offending value;
        # every other type is reported by its type.
        if isinstance(entry, str):
            found = f"string: {entry}"
        else:
            found = f"{type(entry)}"
        raise ValueError(f"Invalid configuration for organism '{name}'. "
                         f"Expected dictionary but got {found}")

    paths: Dict[str, Path] = {}
    for key, value in entry.items():
        if isinstance(value, str):
            paths[key] = Path(value)
    return paths
109
+
110
def get_directory_config() -> Dict[str, str]:
    """Return the ``'directory_structure'`` mapping of the loaded configuration.

    Falls back to the mapping in ``DEFAULT_SETTINGS`` when the configuration
    has no such key.
    """
    fallback = DEFAULT_SETTINGS['directory_structure']
    return load_config().get('directory_structure', fallback)
114
+
115
def get_data_dir() -> Path:
    """Return the directory in which genomic data files are stored.

    This is always ``DEFAULT_DATA_DIR``, the per-user data directory that
    follows the operating system's conventions.
    """
    data_dir: Path = DEFAULT_DATA_DIR
    return data_dir
121
+
122
def get_config_dir() -> Path:
    """Return the directory in which configuration files are stored.

    This is always ``DEFAULT_CONFIG_DIR``, the per-user config directory that
    follows the operating system's conventions.
    """
    config_dir: Path = DEFAULT_CONFIG_DIR
    return config_dir
seqmat/gene.py ADDED
@@ -0,0 +1,188 @@
1
+ """Gene class for representing genomic genes with associated transcripts"""
2
+ import copy
3
+ from typing import Any, Dict, List, Tuple, Optional, Iterator, Union
4
+ from collections import Counter
5
+ from pathlib import Path
6
+
7
+ from .config import get_organism_config, get_default_organism
8
+ from .utils import unload_pickle
9
+ from .transcript import Transcript
10
+
11
+
12
class Gene:
    """
    A class representing a Gene, with associated transcripts and metadata.

    Attributes:
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): A dictionary of transcript annotations keyed by transcript ID.
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        chrm (str): The chromosome on which the gene resides.
        rev (bool): Whether the gene is on the reverse strand.
    """

    def __init__(self, gene_name: str, gene_id: str, rev: bool, chrm: str,
                 transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None):
        """
        Initialize a Gene instance.

        Args:
            gene_name: Name of the gene
            gene_id: Unique identifier for the gene
            rev: Whether gene is on reverse strand
            chrm: Chromosome identifier
            transcripts: Dictionary of transcript annotations (empty when omitted)
            organism: Organism reference build (default from config)
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism if organism is not None else get_default_organism()
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """Official string representation of the Gene object."""
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """User-friendly string representation of the Gene object."""
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """Returns the number of transcripts associated with this gene."""
        return len(self.transcripts)

    def __copy__(self) -> 'Gene':
        """Returns a shallow copy: a new Gene sharing this one's attribute values."""
        # Build the clone by hand. Delegating to copy.copy(self) from inside
        # __copy__ dispatches straight back to this method and never ends.
        clone = type(self).__new__(type(self))
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo: Dict[int, Any]) -> 'Gene':
        """Returns a deep copy: a new Gene whose attribute values are deep-copied."""
        # Same recursion hazard as __copy__: copy.deepcopy(self, memo) would
        # call __deepcopy__ again, so copy the attributes individually.
        clone = type(self).__new__(type(self))
        # Register the clone first so reference cycles resolve to it.
        memo[id(self)] = clone
        for attr, value in self.__dict__.items():
            setattr(clone, attr, copy.deepcopy(value, memo))
        return clone

    def __iter__(self) -> Iterator['Transcript']:
        """Allow iteration over the gene's transcripts, yielding Transcript objects."""
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    def __getitem__(self, item: str) -> Optional['Transcript']:
        """Get a transcript by ID; prints a notice and returns None for unknown IDs."""
        if item not in self.transcripts:
            print(f"{item} not an annotated transcript of this gene.")
            return None
        return Transcript(self.transcripts[item], organism=self.organism)

    @classmethod
    def from_file(cls, gene_name: str, organism: Optional[str] = None) -> Optional['Gene']:
        """
        Load gene data from file.

        Every sub-directory of the organism's configured 'MRNA_PATH' is searched
        for files named ``*_<gene_name>.pkl``; the first match in sorted path
        order is loaded.

        Args:
            gene_name: Name of the gene to load
            organism: Organism reference build (default from config)

        Returns:
            Gene object or None if the organism is not configured or no file is found
        """
        if organism is None:
            organism = get_default_organism()
        try:
            config = get_organism_config(organism)
        except ValueError:
            print(f"Organism '{organism}' not configured. Run setup_genomics_data() first.")
            return None

        # Search through all biotype folders in the configured organism MRNA path
        mrna_path = Path(config['MRNA_PATH'])
        gene_files: List[Path] = []

        if mrna_path.exists():
            for biotype_dir in mrna_path.iterdir():
                if biotype_dir.is_dir():
                    gene_files.extend(biotype_dir.glob(f'*_{gene_name}.pkl'))

        if not gene_files:
            print(f"No files available for gene '{gene_name}'.")
            return None

        # Directory listing order is arbitrary; sorting makes the choice
        # reproducible when the gene name matches files in several folders.
        gene_files.sort()

        # NOTE(review): unload_pickle presumably unpickles the file, which can
        # execute arbitrary code - only point the data directory at trusted files.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism
        )

    def splice_sites(self) -> Tuple[Counter, Counter]:
        """
        Aggregates splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): A tuple of two Counters for acceptors and donors.
        """
        acceptors: List[Any] = []
        donors: List[Any] = []

        # Collect acceptor and donor sites from each transcript annotation;
        # annotations lacking either key contribute nothing.
        for transcript in self.transcripts.values():
            acceptors.extend(transcript.get('acceptors', []))
            donors.extend(transcript.get('donors', []))

        return Counter(acceptors), Counter(donors)

    def transcript(self, tid: Optional[str] = None) -> Optional['Transcript']:
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid: Transcript ID. If None, returns primary transcript.

        Returns:
            The Transcript object with the given ID or the primary transcript,
            or None when the ID is unknown or no primary transcript exists.
        """
        if tid is None:
            tid = self.primary_transcript

        if tid is None or tid not in self.transcripts:
            return None

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Returns the primary transcript ID for this gene.

        The first transcript flagged 'primary_transcript' wins; otherwise the
        first transcript whose 'transcript_biotype' is 'protein_coding'. The
        result is cached on the instance after the first lookup.

        Returns:
            The primary transcript ID or None if not available.
        """
        # If already calculated, return it
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        # Try to find a transcript explicitly flagged as primary
        primary_transcripts = [k for k, v in self.transcripts.items()
                               if v.get('primary_transcript')]
        if primary_transcripts:
            self._primary_transcript = primary_transcripts[0]
            return self._primary_transcript

        # Fallback: find a protein-coding transcript
        protein_coding = [k for k, v in self.transcripts.items()
                          if v.get('transcript_biotype') == 'protein_coding']
        if protein_coding:
            self._primary_transcript = protein_coding[0]
            return self._primary_transcript

        # No primary or protein-coding transcript found
        self._primary_transcript = None
        return None