magicc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magicc-0.1.0/LICENSE +21 -0
- magicc-0.1.0/MANIFEST.in +4 -0
- magicc-0.1.0/PKG-INFO +115 -0
- magicc-0.1.0/README.md +83 -0
- magicc-0.1.0/magicc/__init__.py +7 -0
- magicc-0.1.0/magicc/__main__.py +7 -0
- magicc-0.1.0/magicc/assembly_stats.py +332 -0
- magicc-0.1.0/magicc/cli.py +587 -0
- magicc-0.1.0/magicc/contamination.py +374 -0
- magicc-0.1.0/magicc/data/normalization_params.json +102137 -0
- magicc-0.1.0/magicc/data/selected_kmers.txt +9249 -0
- magicc-0.1.0/magicc/fragmentation.py +768 -0
- magicc-0.1.0/magicc/kmer_counter.py +281 -0
- magicc-0.1.0/magicc/model.py +733 -0
- magicc-0.1.0/magicc/normalization.py +391 -0
- magicc-0.1.0/magicc/pipeline.py +253 -0
- magicc-0.1.0/magicc/storage.py +295 -0
- magicc-0.1.0/magicc/trainer.py +577 -0
- magicc-0.1.0/magicc.egg-info/PKG-INFO +115 -0
- magicc-0.1.0/magicc.egg-info/SOURCES.txt +24 -0
- magicc-0.1.0/magicc.egg-info/dependency_links.txt +1 -0
- magicc-0.1.0/magicc.egg-info/entry_points.txt +2 -0
- magicc-0.1.0/magicc.egg-info/requires.txt +5 -0
- magicc-0.1.0/magicc.egg-info/top_level.txt +1 -0
- magicc-0.1.0/pyproject.toml +61 -0
- magicc-0.1.0/setup.cfg +4 -0
magicc-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Renmao Tian
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
magicc-0.1.0/MANIFEST.in
ADDED
magicc-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: magicc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MAGICC: Metagenome-Assembled Genome Inference of Completeness and Contamination
|
|
5
|
+
Author: Renmao Tian
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/renmaotian/magicc
|
|
8
|
+
Project-URL: Repository, https://github.com/renmaotian/magicc
|
|
9
|
+
Project-URL: Issues, https://github.com/renmaotian/magicc/issues
|
|
10
|
+
Keywords: metagenomics,genome-quality,MAG,completeness,contamination,deep-learning,bioinformatics
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: numpy>=1.20
|
|
27
|
+
Requires-Dist: numba>=0.53
|
|
28
|
+
Requires-Dist: scipy>=1.7
|
|
29
|
+
Requires-Dist: h5py>=3.0
|
|
30
|
+
Requires-Dist: onnxruntime>=1.10
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# MAGICC
|
|
34
|
+
|
|
35
|
+
**Metagenome-Assembled Genome Inference of Completeness and Contamination**
|
|
36
|
+
|
|
37
|
+
Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install magicc
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Or from source:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
git clone https://github.com/renmaotian/magicc.git
|
|
49
|
+
cd magicc
|
|
50
|
+
pip install -e .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Note**: Git LFS is required to clone the repository (the ONNX model is ~180 MB).
|
|
54
|
+
|
|
55
|
+
### Dependencies
|
|
56
|
+
|
|
57
|
+
- Python >= 3.8
|
|
58
|
+
- numpy >= 1.20
|
|
59
|
+
- numba >= 0.53
|
|
60
|
+
- scipy >= 1.7
|
|
61
|
+
- h5py >= 3.0
|
|
62
|
+
- onnxruntime >= 1.10
|
|
63
|
+
|
|
64
|
+
## Usage
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Predict quality for all FASTA files in a directory (uses all CPUs by default)
|
|
68
|
+
magicc predict --input /path/to/genomes/ --output predictions.tsv
|
|
69
|
+
|
|
70
|
+
# Single genome
|
|
71
|
+
magicc predict --input genome.fasta --output predictions.tsv
|
|
72
|
+
|
|
73
|
+
# Specify threads and file extension
|
|
74
|
+
magicc predict --input /path/to/genomes/ --output predictions.tsv --threads 8 --extension .fa
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Options
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
magicc predict [OPTIONS]
|
|
81
|
+
|
|
82
|
+
Required:
|
|
83
|
+
--input, -i Path to genome FASTA file(s) or directory
|
|
84
|
+
--output, -o Output TSV file path
|
|
85
|
+
|
|
86
|
+
Optional:
|
|
87
|
+
--threads, -t Number of threads (default: 0 = all CPUs)
|
|
88
|
+
--batch-size Batch size for ONNX inference (default: 64)
|
|
89
|
+
--extension, -x Genome file extension filter (default: .fasta)
|
|
90
|
+
--model Path to ONNX model file (auto-downloads if not found)
|
|
91
|
+
--quiet, -q Suppress progress output
|
|
92
|
+
--verbose, -v Verbose debug output
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Output
|
|
96
|
+
|
|
97
|
+
Tab-separated file with three columns:
|
|
98
|
+
|
|
99
|
+
| genome_name | pred_completeness | pred_contamination |
|
|
100
|
+
|-------------|-------------------|--------------------|
|
|
101
|
+
| genome_001 | 95.2341 | 2.1567 |
|
|
102
|
+
| genome_002 | 78.4521 | 15.3421 |
|
|
103
|
+
|
|
104
|
+
- **pred_completeness**: Predicted completeness (%), range [50, 100]
|
|
105
|
+
- **pred_contamination**: Predicted contamination (%), range [0, 100]
|
|
106
|
+
|
|
107
|
+
## Citation
|
|
108
|
+
|
|
109
|
+
If you use MAGICC in your research, please cite:
|
|
110
|
+
|
|
111
|
+
> Tian, R. (2026). MAGICC: Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning. *In preparation*.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
magicc-0.1.0/README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# MAGICC
|
|
2
|
+
|
|
3
|
+
**Metagenome-Assembled Genome Inference of Completeness and Contamination**
|
|
4
|
+
|
|
5
|
+
Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install magicc
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or from source:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone https://github.com/renmaotian/magicc.git
|
|
17
|
+
cd magicc
|
|
18
|
+
pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
**Note**: Git LFS is required to clone the repository (the ONNX model is ~180 MB).
|
|
22
|
+
|
|
23
|
+
### Dependencies
|
|
24
|
+
|
|
25
|
+
- Python >= 3.8
|
|
26
|
+
- numpy >= 1.20
|
|
27
|
+
- numba >= 0.53
|
|
28
|
+
- scipy >= 1.7
|
|
29
|
+
- h5py >= 3.0
|
|
30
|
+
- onnxruntime >= 1.10
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Predict quality for all FASTA files in a directory (uses all CPUs by default)
|
|
36
|
+
magicc predict --input /path/to/genomes/ --output predictions.tsv
|
|
37
|
+
|
|
38
|
+
# Single genome
|
|
39
|
+
magicc predict --input genome.fasta --output predictions.tsv
|
|
40
|
+
|
|
41
|
+
# Specify threads and file extension
|
|
42
|
+
magicc predict --input /path/to/genomes/ --output predictions.tsv --threads 8 --extension .fa
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Options
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
magicc predict [OPTIONS]
|
|
49
|
+
|
|
50
|
+
Required:
|
|
51
|
+
--input, -i Path to genome FASTA file(s) or directory
|
|
52
|
+
--output, -o Output TSV file path
|
|
53
|
+
|
|
54
|
+
Optional:
|
|
55
|
+
--threads, -t Number of threads (default: 0 = all CPUs)
|
|
56
|
+
--batch-size Batch size for ONNX inference (default: 64)
|
|
57
|
+
--extension, -x Genome file extension filter (default: .fasta)
|
|
58
|
+
--model Path to ONNX model file (auto-downloads if not found)
|
|
59
|
+
--quiet, -q Suppress progress output
|
|
60
|
+
--verbose, -v Verbose debug output
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Output
|
|
64
|
+
|
|
65
|
+
Tab-separated file with three columns:
|
|
66
|
+
|
|
67
|
+
| genome_name | pred_completeness | pred_contamination |
|
|
68
|
+
|-------------|-------------------|--------------------|
|
|
69
|
+
| genome_001 | 95.2341 | 2.1567 |
|
|
70
|
+
| genome_002 | 78.4521 | 15.3421 |
|
|
71
|
+
|
|
72
|
+
- **pred_completeness**: Predicted completeness (%), range [50, 100]
|
|
73
|
+
- **pred_contamination**: Predicted contamination (%), range [0, 100]
|
|
74
|
+
|
|
75
|
+
## Citation
|
|
76
|
+
|
|
77
|
+
If you use MAGICC in your research, please cite:
|
|
78
|
+
|
|
79
|
+
> Tian, R. (2026). MAGICC: Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning. *In preparation*.
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Assembly Statistics Module for MAGICC.
|
|
3
|
+
|
|
4
|
+
Computes all 26 assembly statistics features in a single pass:
|
|
5
|
+
- 11 contig length stats: total_length, contig_count, n50, n90, l50, l90,
|
|
6
|
+
largest_contig, smallest_contig, mean_contig, median_contig, contig_length_std
|
|
7
|
+
- 4 base composition: gc_mean, gc_std, gc_iqr, gc_bimodality
|
|
8
|
+
- 4 distributional: gc_outlier_fraction, largest_contig_fraction,
|
|
9
|
+
top10_concentration, n50_mean_ratio
|
|
10
|
+
- 1 k-mer (legacy): log10_total_kmer_count (placeholder, filled during k-mer counting)
|
|
11
|
+
- 6 k-mer summary features (new): total_kmer_sum, unique_kmer_count,
|
|
12
|
+
duplicate_kmer_count, kmer_entropy, unique_kmer_ratio, duplicate_kmer_ratio
|
|
13
|
+
|
|
14
|
+
Uses Numba JIT compilation for performance-critical parts.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import numba as nb
|
|
19
|
+
from typing import List, Dict, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
# Feature names in canonical order. The index of each name in this list
# defines the layout of the 26-element feature vector produced by
# compute_assembly_stats().
FEATURE_NAMES = [
    # 11 contig length stats
    'total_length',
    'contig_count',
    'n50',
    'n90',
    'l50',
    'l90',
    'largest_contig',
    'smallest_contig',
    'mean_contig',
    'median_contig',
    'contig_length_std',
    # 4 base composition
    'gc_mean',
    'gc_std',
    'gc_iqr',
    'gc_bimodality',
    # 4 distributional
    'gc_outlier_fraction',
    'largest_contig_fraction',
    'top10_concentration',
    'n50_mean_ratio',
    # 1 k-mer (legacy)
    'log10_total_kmer_count',
    # 6 new k-mer summary features
    'total_kmer_sum',  # sum of all 9,249 k-mer raw counts
    'unique_kmer_count',  # number of k-mers with count > 0
    'duplicate_kmer_count',  # number of k-mers with count > 1
    'kmer_entropy',  # Shannon entropy of k-mer count distribution
    'unique_kmer_ratio',  # unique_kmer_count / 9249
    'duplicate_kmer_ratio',  # duplicate_kmer_count / unique_kmer_count
]

N_FEATURES = len(FEATURE_NAMES)  # 26

# Fail fast at import time if the feature list is edited inconsistently.
# An explicit check is used instead of `assert`, which is silently stripped
# when Python runs with optimizations enabled (`python -O`).
if N_FEATURES != 26:
    raise RuntimeError(
        f"FEATURE_NAMES must contain exactly 26 entries, found {N_FEATURES}"
    )

# Feature name -> index mapping
FEATURE_INDEX = {name: i for i, name in enumerate(FEATURE_NAMES)}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@nb.njit(cache=True)
def _compute_gc_from_bytes(seq_bytes: np.ndarray) -> float:
    """Return the GC fraction of a DNA sequence given as ASCII bytes.

    Both upper- and lower-case bases are recognized. Ambiguous characters
    (N and anything else) are excluded from numerator and denominator;
    a sequence with no unambiguous bases yields a neutral 0.5.
    """
    gc_count = 0
    at_count = 0
    for base in seq_bytes:
        # ASCII codes: G=71, C=67, g=103, c=99 / A=65, T=84, a=97, t=116
        if base == 71 or base == 67 or base == 103 or base == 99:
            gc_count += 1
        elif base == 65 or base == 84 or base == 97 or base == 116:
            at_count += 1
    denom = gc_count + at_count
    if denom == 0:
        return 0.5
    return gc_count / denom
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@nb.njit(cache=True)
def _compute_nx_lx(sorted_lengths: np.ndarray, total_length: int, fraction: float) -> Tuple:
    """
    Compute the Nx/Lx pair for a coverage fraction of the assembly.

    Parameters
    ----------
    sorted_lengths : np.ndarray
        Contig lengths sorted in descending order (assumed non-empty).
    total_length : int
        Total assembly length.
    fraction : float
        Coverage fraction, e.g. 0.5 for N50/L50 or 0.9 for N90/L90.

    Returns
    -------
    tuple of (nx, lx)
        nx = length of the contig at which the running sum first reaches
        `fraction` of the assembly; lx = number of contigs consumed.
    """
    target = total_length * fraction
    cumulative = 0
    n = len(sorted_lengths)
    idx = 0
    while idx < n:
        cumulative += sorted_lengths[idx]
        if cumulative >= target:
            return sorted_lengths[idx], idx + 1
        idx += 1
    # Fallback: the full assembly did not reach the target (can only occur
    # through floating-point rounding of `target`); report the last contig.
    return sorted_lengths[-1], n
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@nb.njit(cache=True)
def _compute_bimodality(values: np.ndarray) -> float:
    """
    Bimodality coefficient of a sample: (skewness^2 + 1) / kurtosis,
    where kurtosis is the regular (non-excess) kurtosis m4 / m2^2.

    Returns 0.0 when fewer than 4 values are given or the sample is
    (numerically) constant.
    """
    count = len(values)
    if count < 4:
        return 0.0

    # Sample mean
    total = 0.0
    for j in range(count):
        total += values[j]
    avg = total / count

    # Second, third and fourth central moments in one pass
    mom2 = 0.0
    mom3 = 0.0
    mom4 = 0.0
    for j in range(count):
        diff = values[j] - avg
        sq = diff * diff
        mom2 += sq
        mom3 += sq * diff
        mom4 += sq * sq
    mom2 /= count
    mom3 /= count
    mom4 /= count

    # Degenerate (constant) sample: coefficient is undefined
    if mom2 < 1e-20:
        return 0.0

    skew = mom3 / (mom2 ** 1.5)
    kurt = mom4 / (mom2 ** 2)
    if kurt < 1e-20:
        return 0.0

    return (skew * skew + 1.0) / kurt
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def compute_assembly_stats(
    contigs: List[str],
    log10_total_kmer_count: float = 0.0,
    kmer_counts: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute all 26 assembly statistics features.

    Parameters
    ----------
    contigs : list of str
        List of contig sequences.
    log10_total_kmer_count : float
        Pre-computed log10 total k-mer count (filled during k-mer counting step).
    kmer_counts : np.ndarray or None
        Raw k-mer counts array of shape (n_kmer_features,). If provided,
        used to compute the 6 new k-mer summary features.

    Returns
    -------
    np.ndarray
        Array of 26 features in order defined by FEATURE_NAMES.
        All-zero for an empty contig list. If every contig is an empty
        string (total length 0), the length-weighted statistics are left
        at 0.0 instead of dividing by zero.
    """
    features = np.zeros(N_FEATURES, dtype=np.float64)

    n_contigs = len(contigs)
    if n_contigs == 0:
        return features

    # Compute contig lengths and GC content per contig
    lengths = np.empty(n_contigs, dtype=np.int64)
    gc_values = np.empty(n_contigs, dtype=np.float64)
    for i, contig in enumerate(contigs):
        lengths[i] = len(contig)
        if len(contig) > 0:
            seq_bytes = np.frombuffer(contig.encode('ascii'), dtype=np.uint8)
            gc_values[i] = _compute_gc_from_bytes(seq_bytes)
        else:
            gc_values[i] = 0.5  # neutral GC for an empty contig

    # Sort lengths descending for N50/N90 computation
    sorted_lengths = np.sort(lengths)[::-1]
    total_length = int(lengths.sum())

    # k-mer features do not depend on contig lengths; fill them up front so
    # they are populated even for degenerate assemblies.
    features[19] = log10_total_kmer_count  # log10_total_kmer_count (legacy)
    if kmer_counts is not None:
        _fill_kmer_summary_features(features, kmer_counts)
    # else: features[20:26] remain 0.0

    features[1] = n_contigs  # contig_count

    # Guard: all contigs empty. Every length-weighted statistic below would
    # divide by zero (previously a crash / NaN), so leave them at 0.0.
    if total_length == 0:
        return features

    # === 11 Contig length stats ===
    features[0] = total_length  # total_length

    n50, l50 = _compute_nx_lx(sorted_lengths, total_length, 0.5)
    n90, l90 = _compute_nx_lx(sorted_lengths, total_length, 0.9)
    features[2] = n50  # n50
    features[3] = n90  # n90
    features[4] = l50  # l50
    features[5] = l90  # l90
    features[6] = sorted_lengths[0]  # largest_contig
    features[7] = sorted_lengths[-1]  # smallest_contig
    mean_length = total_length / n_contigs
    features[8] = mean_length  # mean_contig
    features[9] = float(np.median(lengths))  # median_contig
    features[10] = float(np.std(lengths))  # contig_length_std

    # === 4 Base composition ===
    # Weight GC by contig length for overall mean
    length_weights = lengths.astype(np.float64) / total_length
    gc_mean = float(np.sum(gc_values * length_weights))
    features[11] = gc_mean  # gc_mean

    # GC std across contigs (weighted)
    gc_var = float(np.sum(length_weights * (gc_values - gc_mean) ** 2))
    gc_std = np.sqrt(gc_var)
    features[12] = gc_std  # gc_std

    # GC IQR (approximated from std for very small samples)
    if n_contigs >= 4:
        q75, q25 = np.percentile(gc_values, [75, 25])
        gc_iqr = q75 - q25
    else:
        gc_iqr = gc_std * 1.35  # Approximate IQR from std for small samples
    features[13] = gc_iqr  # gc_iqr

    # GC bimodality: (skewness^2 + 1) / kurtosis
    features[14] = _compute_bimodality(gc_values)  # gc_bimodality

    # === 4 Distributional ===
    # GC outlier fraction: fraction of total length in contigs with |GC - mean| > 2*std
    if gc_std > 1e-10:
        outlier_mask = np.abs(gc_values - gc_mean) > 2 * gc_std
        features[15] = float(lengths[outlier_mask].sum()) / total_length
    else:
        features[15] = 0.0  # gc_outlier_fraction

    # Largest contig fraction
    features[16] = float(sorted_lengths[0]) / total_length  # largest_contig_fraction

    # Top 10% contig concentration
    n_top10 = max(1, int(np.ceil(n_contigs * 0.1)))
    features[17] = float(sorted_lengths[:n_top10].sum()) / total_length  # top10_concentration

    # N50/mean ratio (mean_length > 0 is guaranteed by the guard above)
    features[18] = n50 / mean_length if mean_length > 0 else 0.0  # n50_mean_ratio

    return features


def _fill_kmer_summary_features(features: np.ndarray, kmer_counts: np.ndarray) -> None:
    """Fill features[20:26] (the 6 k-mer summary features) in place."""
    total_kmer_sum = float(kmer_counts.sum())
    features[20] = total_kmer_sum  # total_kmer_sum

    unique_count = int(np.count_nonzero(kmer_counts))
    features[21] = float(unique_count)  # unique_kmer_count

    duplicate_count = int(np.sum(kmer_counts > 1))
    features[22] = float(duplicate_count)  # duplicate_kmer_count

    # Shannon entropy (bits) of the normalized k-mer count distribution
    if total_kmer_sum > 0:
        probs = kmer_counts.astype(np.float64) / total_kmer_sum
        # Only compute for non-zero entries to avoid log(0)
        nonzero_mask = probs > 0
        features[23] = -float(np.sum(probs[nonzero_mask] * np.log2(probs[nonzero_mask])))
    else:
        features[23] = 0.0  # kmer_entropy

    n_total_kmers = len(kmer_counts)
    features[24] = unique_count / n_total_kmers if n_total_kmers > 0 else 0.0  # unique_kmer_ratio
    features[25] = duplicate_count / unique_count if unique_count > 0 else 0.0  # duplicate_kmer_ratio
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def compute_assembly_stats_batch(
    batch_contigs: List[List[str]],
    log10_kmer_counts: Optional[np.ndarray] = None,
    batch_kmer_counts: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute assembly statistics for a batch of genomes.

    Parameters
    ----------
    batch_contigs : list of list of str
        One contig list per genome.
    log10_kmer_counts : np.ndarray or None
        Per-genome log10 total k-mer counts.
    batch_kmer_counts : np.ndarray or None
        Raw k-mer counts of shape (n_genomes, n_kmer_features). When given,
        enables the 6 k-mer summary features for each genome.

    Returns
    -------
    np.ndarray
        Feature matrix of shape (n_genomes, 26).
    """
    stats = np.zeros((len(batch_contigs), N_FEATURES), dtype=np.float64)

    for idx, contigs in enumerate(batch_contigs):
        log10_count = 0.0 if log10_kmer_counts is None else float(log10_kmer_counts[idx])
        counts = None if batch_kmer_counts is None else batch_kmer_counts[idx]
        stats[idx] = compute_assembly_stats(contigs, log10_count, counts)

    return stats
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def format_stats(features: np.ndarray) -> Dict[str, float]:
    """Map the feature vector to a {feature_name: value} dictionary."""
    named: Dict[str, float] = {}
    for idx, name in enumerate(FEATURE_NAMES):
        named[name] = float(features[idx])
    return named
|