seqmat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seqmat/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ SeqMat - Lightning-fast genomic sequence matrix library
3
+
4
+ A comprehensive Python library for genomic sequence analysis with full mutation tracking,
5
+ splicing analysis, and sequence manipulation.
6
+ """
7
+
8
# Package metadata.
__version__ = "0.1.0"
__author__ = "Nicolas Lynn Vila"
__email__ = "nicolasalynn@gmail.com"

# Core classes re-exported at package level.
from .seqmat import SeqMat
from .gene import Gene
from .transcript import Transcript

# Data-management helpers re-exported at package level.
from .utils import (
    setup_genomics_data,
    load_config,
    save_config,
    list_available_organisms,
    list_supported_organisms,
    get_organism_info,
    list_gene_biotypes,
    count_genes,
    get_gene_list,
    data_summary,
    print_data_summary,
    search_genes,
)

# Names exported by ``from seqmat import *``; mirrors the imports above.
__all__ = [
    # classes
    "SeqMat",
    "Gene",
    "Transcript",
    # data-management helpers
    "setup_genomics_data",
    "load_config",
    "save_config",
    "list_available_organisms",
    "list_supported_organisms",
    "get_organism_info",
    "list_gene_biotypes",
    "count_genes",
    "get_gene_list",
    "data_summary",
    "print_data_summary",
    "search_genes",
]
seqmat/cli.py ADDED
@@ -0,0 +1,280 @@
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for SeqMat data management"""
3
+
4
+ import argparse
5
+ import sys
6
+ from typing import Optional
7
+
8
+ from .utils import (
9
+ setup_genomics_data,
10
+ print_data_summary,
11
+ list_available_organisms,
12
+ list_supported_organisms,
13
+ list_gene_biotypes,
14
+ count_genes,
15
+ get_gene_list,
16
+ search_genes,
17
+ get_organism_info
18
+ )
19
+ from .config import get_available_organisms, get_default_organism, get_organism_info as get_organism_config_info
20
+
21
+
22
def cmd_setup(args):
    """Handle the ``setup`` sub-command.

    Forwards ``args.path``, ``args.organism`` and ``args.force`` to
    ``setup_genomics_data``. Any exception is reported on stdout and the
    process exits with status 1.
    """
    organism = args.organism
    target = args.path
    try:
        setup_genomics_data(basepath=target, organism=organism, force=args.force)
        print(f"✅ Successfully set up {organism} data in {target}")
    except Exception as exc:
        # Top-level CLI boundary: turn any failure into a message + exit code.
        print(f"❌ Error setting up data: {exc}")
        sys.exit(1)
34
+
35
+
36
def cmd_list_organisms(args):
    """Handle the ``organisms`` sub-command.

    Prints one line per supported organism with its display name and whether
    it also appears in the list of configured organisms. When nothing is
    configured, a hint on how to run the setup is printed. ``args`` is unused.
    """
    print("🌍 Organism Support Status:")
    print("-" * 30)

    supported = list_supported_organisms()
    configured = list_available_organisms()
    configured_lookup = set(configured)  # O(1) membership test in the loop below

    # Resolve a display name for every organism; fall back to the key itself
    # when the lookup fails for any reason.
    organism_names = {}
    for org in set(supported + configured):
        try:
            info = get_organism_config_info(org)
            organism_names[org] = info.get('name', org)
        except Exception:
            # Was a bare ``except:``, which would also swallow
            # KeyboardInterrupt / SystemExit.
            organism_names[org] = org

    for org in supported:
        name = organism_names.get(org, org)
        status = "✅ Configured" if org in configured_lookup else "❌ Not configured"
        print(f"{org}: {name} - {status}")

    if not configured:
        print("\nTo set up data, run:")
        print(" seqmat-setup --path /your/data/path --organism hg38")
61
+
62
+
63
def cmd_summary(args):
    """Handle the ``summary`` sub-command by delegating to ``print_data_summary``.

    ``args`` is accepted only so the handler matches the other sub-commands;
    it is not read.
    """
    print_data_summary()
66
+
67
+
68
def cmd_biotypes(args):
    """Handle the ``biotypes`` sub-command.

    Prints each gene biotype known for ``args.organism`` with its gene count.
    Exits with status 1 when no organism is given or no biotypes are found.
    """
    organism = args.organism
    if not organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    biotypes = list_gene_biotypes(organism)
    if not biotypes:
        print(f"❌ No data found for organism '{organism}'")
        print("Available organisms:", ", ".join(list_available_organisms()))
        sys.exit(1)

    print(f"📊 Gene biotypes in {organism}:")
    print("-" * 30)

    # One lookup of all counts, then report per biotype (0 when missing).
    per_biotype = count_genes(organism)
    for biotype in biotypes:
        print(f"{biotype}: {per_biotype.get(biotype, 0):,} genes")
90
+
91
+
92
def cmd_count(args):
    """Handle the ``count`` sub-command.

    With ``args.biotype`` set, prints the count for that biotype only;
    otherwise prints every biotype (sorted by name) followed by the total.
    Exits with status 1 when no organism is given or no counts are returned.
    """
    if not args.organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    organism = args.organism
    wanted = args.biotype
    counts = count_genes(organism, wanted)

    if not counts:
        print(f"❌ No data found for organism '{organism}'")
        sys.exit(1)

    if wanted:
        # Single-biotype report.
        print(f"📊 {organism} {wanted}: {counts.get(wanted, 0):,} genes")
        return

    # Full report: one line per biotype, then the grand total.
    print(f"📊 Gene counts for {organism}:")
    print("-" * 30)
    ordered = sorted(counts.items())
    for name, n_genes in ordered:
        print(f"{name}: {n_genes:,} genes")
    grand_total = sum(n_genes for _, n_genes in ordered)
    print("-" * 30)
    print(f"Total: {grand_total:,} genes")
116
+
117
+
118
def cmd_list_genes(args):
    """Handle the ``list`` sub-command.

    Prints a numbered list of genes for ``args.organism`` / ``args.biotype``,
    capped at ``args.limit``. When the cap is reached, the total gene count
    for the biotype is printed as well. Exits with status 1 when either
    selector is missing or no genes are returned.
    """
    if not (args.organism and args.biotype):
        print("❌ Please specify both --organism and --biotype")
        sys.exit(1)

    organism, biotype, limit = args.organism, args.biotype, args.limit
    genes = get_gene_list(organism, biotype, limit=limit)

    if not genes:
        print(f"❌ No genes found for {organism} {biotype}")
        sys.exit(1)

    shown = len(genes)
    print(f"📋 {organism} {biotype} genes ({shown} shown):")
    print("-" * 50)

    for position, gene in enumerate(genes, start=1):
        print(f"{position:4d}. {gene}")

    # A full page suggests the list was truncated; report the real total.
    if limit and shown == limit:
        total = count_genes(organism, biotype).get(biotype, 0)
        print(f"\n(Showing first {limit} of {total:,} total genes)")
140
+
141
+
142
def cmd_search(args):
    """Handle the ``search`` sub-command.

    Calls ``search_genes`` with ``args.organism``, ``args.query``,
    ``args.biotype`` and ``args.limit`` and prints each hit as
    ``gene_name (biotype)``. Exits with status 1 when organism or query is
    missing, or when nothing matches.
    """
    if not (args.organism and args.query):
        print("❌ Please specify both --organism and --query")
        sys.exit(1)

    organism, query, limit = args.organism, args.query, args.limit
    hits = search_genes(
        organism=organism,
        query=query,
        biotype=args.biotype,
        limit=limit,
    )

    if not hits:
        print(f"❌ No genes found matching '{query}' in {organism}")
        sys.exit(1)

    print(f"🔍 Search results for '{query}' in {organism}:")
    print("-" * 50)

    for rank, hit in enumerate(hits, start=1):
        print(f"{rank:2d}. {hit['gene_name']} ({hit['biotype']})")

    # Exactly ``limit`` hits: tell the user the list may be truncated.
    if len(hits) == limit:
        print(f"\n(Showing first {limit} results)")
167
+
168
+
169
def cmd_info(args):
    """Handle the ``info`` sub-command.

    Prints the details returned by ``get_organism_info`` for
    ``args.organism``: gene counts per biotype, available chromosomes, and
    each data path with an existence marker. Exits with status 1 when no
    organism is given or the returned info contains an ``"error"`` key.
    """
    # Function-scope import (the module header does not import pathlib);
    # previously this statement was re-executed on every loop iteration.
    from pathlib import Path

    if not args.organism:
        print("❌ Please specify an organism with --organism")
        sys.exit(1)

    info = get_organism_info(args.organism)

    if "error" in info:
        print(f"❌ {info['error']}")
        sys.exit(1)

    print(f"ℹ️ Detailed information for {args.organism}:")
    print("=" * 40)

    # Data availability
    data_avail = info.get("data_available", {})

    if "gene_counts" in data_avail:
        print("📊 Gene Data:")
        total_genes = 0
        for biotype, count in sorted(data_avail["gene_counts"].items()):
            print(f" {biotype}: {count:,} genes")
            total_genes += count
        print(f" Total: {total_genes:,} genes")
        print()

    if "chromosomes" in data_avail:
        chroms = data_avail["chromosomes"]
        print(f"🧬 Chromosome Data: {len(chroms)} chromosomes")
        print(f" Available: {', '.join(sorted(chroms))}")
        print()

    print("📁 Data Paths:")
    # ``paths`` is read defensively, like ``data_available`` above, so a
    # result without it prints an empty section instead of raising KeyError.
    for path_name, path_value in info.get("paths", {}).items():
        exists = "✅" if Path(path_value).exists() else "❌"
        print(f" {path_name}: {exists} {path_value}")
207
+
208
+
209
def main():
    """Build the ``seqmat`` argument parser, parse ``sys.argv`` and dispatch.

    Each sub-command stores its handler under ``func``; when no sub-command is
    given the help text is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        prog="seqmat",
        description="SeqMat genomics data management CLI",
    )
    commands = cli.add_subparsers(dest="command", help="Commands")

    def register(name, summary, handler):
        # Create one sub-command and bind the function that implements it.
        sub = commands.add_parser(name, help=summary)
        sub.set_defaults(func=handler)
        return sub

    # setup: organism choices and default are read from the configuration.
    setup = register("setup", "Set up genomics data", cmd_setup)
    setup.add_argument("--path", required=True, help="Base path for data storage")
    organism_choices = get_available_organisms()
    fallback_organism = get_default_organism()
    setup.add_argument(
        "--organism",
        default=fallback_organism,
        choices=organism_choices,
        help=f"Organism to set up (default: {fallback_organism})",
    )
    setup.add_argument("--force", action="store_true", help="Force overwrite existing data")

    # organisms / summary: no options of their own.
    register("organisms", "List supported/configured organisms", cmd_list_organisms)
    register("summary", "Show data summary", cmd_summary)

    # biotypes
    biotypes = register("biotypes", "List gene biotypes", cmd_biotypes)
    biotypes.add_argument("--organism", help="Organism to query")

    # count
    count = register("count", "Count genes", cmd_count)
    count.add_argument("--organism", help="Organism to query")
    count.add_argument("--biotype", help="Specific biotype to count")

    # list
    listing = register("list", "List genes", cmd_list_genes)
    listing.add_argument("--organism", help="Organism to query")
    listing.add_argument("--biotype", help="Gene biotype")
    listing.add_argument("--limit", type=int, default=50, help="Maximum genes to show")

    # search
    search = register("search", "Search genes by name", cmd_search)
    search.add_argument("--organism", help="Organism to search")
    search.add_argument("--query", help="Gene name pattern to search")
    search.add_argument("--biotype", help="Filter by biotype")
    search.add_argument("--limit", type=int, default=20, help="Maximum results")

    # info
    info = register("info", "Show organism information", cmd_info)
    info.add_argument("--organism", help="Organism to query")

    parsed = cli.parse_args()

    if not parsed.command:
        # No sub-command given: show usage and signal failure.
        cli.print_help()
        sys.exit(1)

    parsed.func(parsed)
277
+
278
+
279
+ if __name__ == "__main__":
280
+ main()
seqmat/config.py ADDED
@@ -0,0 +1,100 @@
1
+ """Configuration management for SeqMat"""
2
+ import os
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, Any, List, Optional
6
+
7
# Per-user configuration lives in ~/.seqmat/config.json.
DEFAULT_CONFIG_DIR = Path.home().joinpath(".seqmat")
CONFIG_FILE = DEFAULT_CONFIG_DIR.joinpath("config.json")

# Default organism data sources - can be overridden in config.
# NOTE(review): the hg38 "conservation" entry points at a remote pickle file;
# unpickling downloaded data can execute arbitrary code - confirm the source
# is trusted wherever it is loaded.
DEFAULT_ORGANISM_DATA = {
    "hg38": {
        "name": "Homo sapiens (Human)",
        "urls": {
            "fasta": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz",
            "gtf": "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz",
            "conservation": "https://genome-data-public-access.s3.eu-north-1.amazonaws.com/conservation.pkl",
            "gtex": "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
        },
    },
    "mm39": {
        "name": "Mus musculus (Mouse)",
        "urls": {
            "fasta": "https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz",
            "gtf": "https://ftp.ensembl.org/pub/release-112/gtf/mus_musculus/Mus_musculus.GRCm39.112.gtf.gz",
        },
    },
}

# Settings used when the user's config file does not provide them.
DEFAULT_SETTINGS = {
    "default_organism": "hg38",
    "directory_structure": {
        "chromosomes": "chromosomes",
        "annotations": "annotations",
    },
}
37
+
38
def load_config() -> Dict[str, Any]:
    """Load the user configuration merged over ``DEFAULT_SETTINGS``.

    Returns:
        A new dict that starts as a deep copy of ``DEFAULT_SETTINGS`` and is
        updated with the top-level keys of ``CONFIG_FILE`` when that file
        exists. Top-level keys from the file replace the defaults wholesale.

    Raises:
        json.JSONDecodeError: If the config file exists but is not valid JSON.
    """
    # Function-scope import: the module header does not import ``copy``.
    from copy import deepcopy

    # Deep copy so callers that mutate the result (including the nested
    # ``directory_structure`` dict) can never alter the module-level defaults;
    # the previous shallow ``.copy()`` shared the nested dict.
    merged_config = deepcopy(DEFAULT_SETTINGS)
    if CONFIG_FILE.exists():
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            merged_config.update(json.load(f))
    return merged_config
48
+
49
def save_config(config: Dict[str, Any]) -> None:
    """Write ``config`` as indented JSON to ``CONFIG_FILE``.

    The configuration directory is created first if it does not exist yet.
    """
    DEFAULT_CONFIG_DIR.mkdir(exist_ok=True)
    with CONFIG_FILE.open('w') as handle:
        json.dump(config, handle, indent=2)
54
+
55
def get_default_organism() -> str:
    """Return the ``default_organism`` entry of the loaded configuration.

    Falls back to the value in ``DEFAULT_SETTINGS`` when the key is absent.
    """
    fallback = DEFAULT_SETTINGS['default_organism']
    return load_config().get('default_organism', fallback)
59
+
60
def get_available_organisms() -> List[str]:
    """Return the sorted union of configured and built-in organism keys.

    Every top-level key of the loaded configuration other than the two
    settings keys counts as a configured organism.
    """
    settings_keys = {'default_organism', 'directory_structure'}
    names = {key for key in load_config() if key not in settings_keys}
    names.update(DEFAULT_ORGANISM_DATA)
    return sorted(names)
66
+
67
def get_organism_info(organism: str) -> Dict[str, Any]:
    """Return the information dict (name, URLs, configured values) for an organism.

    Resolution order:
      1. A dict stored under ``organism`` in the loaded configuration, merged
         over the built-in defaults for that organism when they exist
         (configured top-level keys win).
      2. The built-in defaults alone.

    Args:
        organism: Organism key, e.g. ``'hg38'``.

    Returns:
        The organism's information. Built-in defaults are always returned as
        a deep copy so callers cannot mutate ``DEFAULT_ORGANISM_DATA``.

    Raises:
        ValueError: If the organism is neither configured nor built in.
    """
    # Function-scope import: the module header does not import ``copy``.
    from copy import deepcopy

    org_config = load_config().get(organism)
    defaults = DEFAULT_ORGANISM_DATA.get(organism)

    if isinstance(org_config, dict):
        if defaults is None:
            return org_config
        # Deep copy: the previous shallow copy still shared the nested
        # ``urls`` dict with the module-level defaults.
        merged = deepcopy(defaults)
        merged.update(org_config)
        return merged

    if defaults is not None:
        # Previously the module-level dict itself was returned here.
        return deepcopy(defaults)

    raise ValueError(f"Organism '{organism}' not configured. Available: {get_available_organisms()}")
83
+
84
def get_organism_config(organism: Optional[str] = None) -> Dict[str, Path]:
    """Return the configured entries of an organism as ``Path`` objects.

    Args:
        organism: Organism key; the configured default organism when ``None``.

    Returns:
        A dict with one ``Path`` per string-valued entry stored under the
        organism in the loaded configuration; non-string values are left out.

    Raises:
        ValueError: If the organism has no entry in the loaded configuration.
    """
    selected = get_default_organism() if organism is None else organism

    config = load_config()
    if selected not in config:
        raise ValueError(f"Organism '{selected}' not configured. Run setup_genomics_data() first.")

    # Convert string paths to Path objects.
    paths: Dict[str, Path] = {}
    for key, value in config[selected].items():
        if isinstance(value, str):
            paths[key] = Path(value)
    return paths
96
+
97
def get_directory_config() -> Dict[str, str]:
    """Return the ``directory_structure`` entry of the loaded configuration.

    Falls back to the value in ``DEFAULT_SETTINGS`` when the key is absent.
    """
    fallback = DEFAULT_SETTINGS['directory_structure']
    return load_config().get('directory_structure', fallback)
seqmat/gene.py ADDED
@@ -0,0 +1,178 @@
1
+ """Gene class for representing genomic genes with associated transcripts"""
2
+ import copy
3
+ from typing import Any, Dict, List, Tuple, Optional, Iterator, Union
4
+ from collections import Counter
5
+ from pathlib import Path
6
+
7
+ from .config import get_organism_config, get_default_organism
8
+ from .utils import unload_pickle
9
+ from .transcript import Transcript
10
+
11
+
12
class Gene:
    """
    A class representing a Gene, with associated transcripts and metadata.

    Attributes:
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): A dictionary of transcript annotations keyed by transcript ID.
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        chrm (str): The chromosome on which the gene resides.
        rev (bool): Whether the gene is on the reverse strand.
    """

    def __init__(self, gene_name: str, gene_id: str, rev: bool, chrm: str,
                 transcripts: Optional[Dict[str, Any]] = None, organism: Optional[str] = None):
        """
        Initialize a Gene instance.

        Args:
            gene_name: Name of the gene
            gene_id: Unique identifier for the gene
            rev: Whether gene is on reverse strand
            chrm: Chromosome identifier
            transcripts: Dictionary of transcript annotations (empty dict when None)
            organism: Organism reference build (default from config)
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism if organism is not None else get_default_organism()
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """Official string representation of the Gene object."""
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """User-friendly string representation of the Gene object."""
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """Returns the number of transcripts associated with this gene."""
        return len(self.transcripts)

    def __copy__(self):
        """Returns a shallow copy of the Gene object (attributes are shared)."""
        # Build the copy by hand: calling copy.copy(self) here would dispatch
        # straight back to this method and recurse until RecursionError.
        cls = type(self)
        clone = cls.__new__(cls)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        """Returns a deep copy of the Gene object."""
        # Same reasoning as __copy__: copy.deepcopy(self, memo) would re-enter
        # this method. Register the clone in ``memo`` before copying the
        # attributes so reference cycles resolve to it.
        cls = type(self)
        clone = cls.__new__(cls)
        memo[id(self)] = clone
        for attr, value in self.__dict__.items():
            clone.__dict__[attr] = copy.deepcopy(value, memo)
        return clone

    def __iter__(self) -> Iterator["Transcript"]:
        """Allow iteration over the gene's transcripts, yielding Transcript objects."""
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    def __getitem__(self, item: str) -> Optional["Transcript"]:
        """Get a transcript by ID; prints a message and returns None for an unknown ID."""
        if item not in self.transcripts:
            print(f"{item} not an annotated transcript of this gene.")
            return None
        return Transcript(self.transcripts[item], organism=self.organism)

    @classmethod
    def from_file(cls, gene_name: str, organism: Optional[str] = None) -> Optional['Gene']:
        """
        Load gene data from file.

        Args:
            gene_name: Name of the gene to load
            organism: Organism reference build (default from config)

        Returns:
            Gene object, or None if the organism is not configured or no
            matching file exists (a message is printed in both cases)
        """
        if organism is None:
            organism = get_default_organism()
        try:
            config = get_organism_config(organism)
        except ValueError:
            print(f"Organism '{organism}' not configured. Run setup_genomics_data() first.")
            return None

        # Find gene data files in the configured organism MRNA path.
        # Only the 'protein_coding' sub-directory is searched.
        gene_files = list((config['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            print(f"No files available for gene '{gene_name}'.")
            return None

        # Load gene data from the first matching file.
        # NOTE(review): presumably unload_pickle unpickles the file; pickle
        # data must come from a trusted source - confirm for this data path.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism
        )

    def splice_sites(self) -> Tuple[Counter, Counter]:
        """
        Aggregates splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): A tuple of two Counters for acceptors and donors.
        """
        acceptors: List[Any] = []
        donors: List[Any] = []

        # Collect acceptor and donor sites from each transcript
        for annotations in self.transcripts.values():
            acceptors.extend(annotations.get('acceptors', []))
            donors.extend(annotations.get('donors', []))

        return Counter(acceptors), Counter(donors)

    def transcript(self, tid: Optional[str] = None) -> Optional["Transcript"]:
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid: Transcript ID. If None, returns primary transcript.

        Returns:
            The Transcript object with the given ID or the primary transcript;
            None when the ID is unknown or no primary transcript exists.
        """
        if tid is None:
            tid = self.primary_transcript

        if tid is None or tid not in self.transcripts:
            return None

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Returns the primary transcript ID for this gene.

        The first transcript flagged 'primary_transcript' wins; otherwise the
        first transcript whose 'transcript_biotype' is 'protein_coding'. The
        result (including None) is cached on the instance after the first call.

        Returns:
            The primary transcript ID or None if not available.
        """
        # If already calculated, return it
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        # Try to find a transcript explicitly flagged as primary
        chosen = next(
            (tid for tid, ann in self.transcripts.items() if ann.get('primary_transcript')),
            None,
        )

        # Fallback: find a protein-coding transcript
        if chosen is None:
            chosen = next(
                (tid for tid, ann in self.transcripts.items()
                 if ann.get('transcript_biotype') == 'protein_coding'),
                None,
            )

        self._primary_transcript = chosen
        return chosen