magicc 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magicc-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Renmao Tian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ recursive-include magicc/data *
magicc-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: magicc
3
+ Version: 0.1.0
4
+ Summary: MAGICC: Metagenome-Assembled Genome Inference of Completeness and Contamination
5
+ Author: Renmao Tian
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/renmaotian/magicc
8
+ Project-URL: Repository, https://github.com/renmaotian/magicc
9
+ Project-URL: Issues, https://github.com/renmaotian/magicc/issues
10
+ Keywords: metagenomics,genome-quality,MAG,completeness,contamination,deep-learning,bioinformatics
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: numpy>=1.20
27
+ Requires-Dist: numba>=0.53
28
+ Requires-Dist: scipy>=1.7
29
+ Requires-Dist: h5py>=3.0
30
+ Requires-Dist: onnxruntime>=1.10
31
+ Dynamic: license-file
32
+
33
+ # MAGICC
34
+
35
+ **Metagenome-Assembled Genome Inference of Completeness and Contamination**
36
+
37
+ Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install magicc
43
+ ```
44
+
45
+ Or from source:
46
+
47
+ ```bash
48
+ git clone https://github.com/renmaotian/magicc.git
49
+ cd magicc
50
+ pip install -e .
51
+ ```
52
+
53
+ **Note**: Git LFS is required to clone the repository (the ONNX model is ~180 MB).
54
+
55
+ ### Dependencies
56
+
57
+ - Python >= 3.8
58
+ - numpy >= 1.20
59
+ - numba >= 0.53
60
+ - scipy >= 1.7
61
+ - h5py >= 3.0
62
+ - onnxruntime >= 1.10
63
+
64
+ ## Usage
65
+
66
+ ```bash
67
+ # Predict quality for all FASTA files in a directory (uses all CPUs by default)
68
+ magicc predict --input /path/to/genomes/ --output predictions.tsv
69
+
70
+ # Single genome
71
+ magicc predict --input genome.fasta --output predictions.tsv
72
+
73
+ # Specify threads and file extension
74
+ magicc predict --input /path/to/genomes/ --output predictions.tsv --threads 8 --extension .fa
75
+ ```
76
+
77
+ ### Options
78
+
79
+ ```
80
+ magicc predict [OPTIONS]
81
+
82
+ Required:
83
+ --input, -i Path to genome FASTA file(s) or directory
84
+ --output, -o Output TSV file path
85
+
86
+ Optional:
87
+ --threads, -t Number of threads (default: 0 = all CPUs)
88
+ --batch-size Batch size for ONNX inference (default: 64)
89
+ --extension, -x Genome file extension filter (default: .fasta)
90
+ --model Path to ONNX model file (auto-downloads if not found)
91
+ --quiet, -q Suppress progress output
92
+ --verbose, -v Verbose debug output
93
+ ```
94
+
95
+ ### Output
96
+
97
+ Tab-separated file with three columns:
98
+
99
+ | genome_name | pred_completeness | pred_contamination |
100
+ |-------------|-------------------|--------------------|
101
+ | genome_001 | 95.2341 | 2.1567 |
102
+ | genome_002 | 78.4521 | 15.3421 |
103
+
104
+ - **pred_completeness**: Predicted completeness (%), range [50, 100]
105
+ - **pred_contamination**: Predicted contamination (%), range [0, 100]
106
+
107
+ ## Citation
108
+
109
+ If you use MAGICC in your research, please cite:
110
+
111
+ > Tian, R. (2026). MAGICC: Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning. *In preparation*.
112
+
113
+ ## License
114
+
115
+ MIT License. See [LICENSE](LICENSE) for details.
magicc-0.1.0/README.md ADDED
@@ -0,0 +1,83 @@
1
+ # MAGICC
2
+
3
+ **Metagenome-Assembled Genome Inference of Completeness and Contamination**
4
+
5
+ Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install magicc
11
+ ```
12
+
13
+ Or from source:
14
+
15
+ ```bash
16
+ git clone https://github.com/renmaotian/magicc.git
17
+ cd magicc
18
+ pip install -e .
19
+ ```
20
+
21
+ **Note**: Git LFS is required to clone the repository (the ONNX model is ~180 MB).
22
+
23
+ ### Dependencies
24
+
25
+ - Python >= 3.8
26
+ - numpy >= 1.20
27
+ - numba >= 0.53
28
+ - scipy >= 1.7
29
+ - h5py >= 3.0
30
+ - onnxruntime >= 1.10
31
+
32
+ ## Usage
33
+
34
+ ```bash
35
+ # Predict quality for all FASTA files in a directory (uses all CPUs by default)
36
+ magicc predict --input /path/to/genomes/ --output predictions.tsv
37
+
38
+ # Single genome
39
+ magicc predict --input genome.fasta --output predictions.tsv
40
+
41
+ # Specify threads and file extension
42
+ magicc predict --input /path/to/genomes/ --output predictions.tsv --threads 8 --extension .fa
43
+ ```
44
+
45
+ ### Options
46
+
47
+ ```
48
+ magicc predict [OPTIONS]
49
+
50
+ Required:
51
+ --input, -i Path to genome FASTA file(s) or directory
52
+ --output, -o Output TSV file path
53
+
54
+ Optional:
55
+ --threads, -t Number of threads (default: 0 = all CPUs)
56
+ --batch-size Batch size for ONNX inference (default: 64)
57
+ --extension, -x Genome file extension filter (default: .fasta)
58
+ --model Path to ONNX model file (auto-downloads if not found)
59
+ --quiet, -q Suppress progress output
60
+ --verbose, -v Verbose debug output
61
+ ```
62
+
63
+ ### Output
64
+
65
+ Tab-separated file with three columns:
66
+
67
+ | genome_name | pred_completeness | pred_contamination |
68
+ |-------------|-------------------|--------------------|
69
+ | genome_001 | 95.2341 | 2.1567 |
70
+ | genome_002 | 78.4521 | 15.3421 |
71
+
72
+ - **pred_completeness**: Predicted completeness (%), range [50, 100]
73
+ - **pred_contamination**: Predicted contamination (%), range [0, 100]
74
+
75
+ ## Citation
76
+
77
+ If you use MAGICC in your research, please cite:
78
+
79
+ > Tian, R. (2026). MAGICC: Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning. *In preparation*.
80
+
81
+ ## License
82
+
83
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,7 @@
1
+ """
2
+ MAGICC - Metagenome-Assembled Genome Inference of Completeness and Contamination
3
+
4
+ Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
5
+ """
6
+
7
+ __version__ = "0.1.0"
@@ -0,0 +1,7 @@
1
+ """
2
+ Entry point for `python -m magicc`.
3
+ """
4
+ from magicc.cli import main
5
+
6
+ if __name__ == '__main__':
7
+ main()
@@ -0,0 +1,332 @@
1
+ """
2
+ Assembly Statistics Module for MAGICC.
3
+
4
+ Computes all 26 assembly statistics features in a single pass:
5
+ - 11 contig length stats: total_length, contig_count, n50, n90, l50, l90,
6
+ largest_contig, smallest_contig, mean_contig, median_contig, contig_length_std
7
+ - 4 base composition: gc_mean, gc_std, gc_iqr, gc_bimodality
8
+ - 4 distributional: gc_outlier_fraction, largest_contig_fraction,
9
+ top10_concentration, n50_mean_ratio
10
+ - 1 k-mer (legacy): log10_total_kmer_count (placeholder, filled during k-mer counting)
11
+ - 6 k-mer summary features (new): total_kmer_sum, unique_kmer_count,
12
+ duplicate_kmer_count, kmer_entropy, unique_kmer_ratio, duplicate_kmer_ratio
13
+
14
+ Uses Numba JIT compilation for performance-critical parts.
15
+ """
16
+
17
+ import numpy as np
18
+ import numba as nb
19
+ from typing import List, Dict, Optional, Tuple
20
+
21
+ # Feature names in order
22
+ FEATURE_NAMES = [
23
+ # 11 contig length stats
24
+ 'total_length',
25
+ 'contig_count',
26
+ 'n50',
27
+ 'n90',
28
+ 'l50',
29
+ 'l90',
30
+ 'largest_contig',
31
+ 'smallest_contig',
32
+ 'mean_contig',
33
+ 'median_contig',
34
+ 'contig_length_std',
35
+ # 4 base composition
36
+ 'gc_mean',
37
+ 'gc_std',
38
+ 'gc_iqr',
39
+ 'gc_bimodality',
40
+ # 4 distributional
41
+ 'gc_outlier_fraction',
42
+ 'largest_contig_fraction',
43
+ 'top10_concentration',
44
+ 'n50_mean_ratio',
45
+ # 1 k-mer (legacy)
46
+ 'log10_total_kmer_count',
47
+ # 6 new k-mer summary features
48
+ 'total_kmer_sum', # sum of all 9,249 k-mer raw counts
49
+ 'unique_kmer_count', # number of k-mers with count > 0
50
+ 'duplicate_kmer_count', # number of k-mers with count > 1
51
+ 'kmer_entropy', # Shannon entropy of k-mer count distribution
52
+ 'unique_kmer_ratio', # unique_kmer_count / 9249
53
+ 'duplicate_kmer_ratio', # duplicate_kmer_count / unique_kmer_count
54
+ ]
55
+
56
+ N_FEATURES = len(FEATURE_NAMES) # 26
57
+ assert N_FEATURES == 26
58
+
59
+ # Feature name -> index mapping
60
+ FEATURE_INDEX = {name: i for i, name in enumerate(FEATURE_NAMES)}
61
+
62
+
63
+ @nb.njit(cache=True)
64
+ def _compute_gc_from_bytes(seq_bytes: np.ndarray) -> float:
65
+ """Compute GC content from a byte array of DNA sequence (Numba-accelerated)."""
66
+ gc = 0
67
+ total = 0
68
+ for i in range(len(seq_bytes)):
69
+ b = seq_bytes[i]
70
+ # G=71, C=67, A=65, T=84, g=103, c=99, a=97, t=116
71
+ if b == 71 or b == 67 or b == 103 or b == 99:
72
+ gc += 1
73
+ total += 1
74
+ elif b == 65 or b == 84 or b == 97 or b == 116:
75
+ total += 1
76
+ # N and other chars are ignored
77
+ if total == 0:
78
+ return 0.5
79
+ return gc / total
80
+
81
+
82
+ @nb.njit(cache=True)
83
+ def _compute_nx_lx(sorted_lengths: np.ndarray, total_length: int, fraction: float) -> Tuple:
84
+ """
85
+ Compute Nx and Lx metrics from sorted (descending) contig lengths.
86
+
87
+ Parameters
88
+ ----------
89
+ sorted_lengths : np.ndarray
90
+ Contig lengths sorted in descending order.
91
+ total_length : int
92
+ Total assembly length.
93
+ fraction : float
94
+ Fraction (e.g., 0.5 for N50, 0.9 for N90).
95
+
96
+ Returns
97
+ -------
98
+ tuple of (nx, lx)
99
+ nx = contig length at which fraction of assembly is covered
100
+ lx = number of contigs needed to cover fraction
101
+ """
102
+ threshold = total_length * fraction
103
+ running = 0
104
+ for i in range(len(sorted_lengths)):
105
+ running += sorted_lengths[i]
106
+ if running >= threshold:
107
+ return sorted_lengths[i], i + 1
108
+ return sorted_lengths[-1], len(sorted_lengths)
109
+
110
+
111
+ @nb.njit(cache=True)
112
+ def _compute_bimodality(values: np.ndarray) -> float:
113
+ """
114
+ Compute bimodality coefficient: (skewness^2 + 1) / kurtosis.
115
+
116
+ Uses excess kurtosis + 3 for the denominator.
117
+ Returns 0 if kurtosis is zero or less.
118
+ """
119
+ n = len(values)
120
+ if n < 4:
121
+ return 0.0
122
+
123
+ mean = 0.0
124
+ for i in range(n):
125
+ mean += values[i]
126
+ mean /= n
127
+
128
+ m2 = 0.0
129
+ m3 = 0.0
130
+ m4 = 0.0
131
+ for i in range(n):
132
+ d = values[i] - mean
133
+ d2 = d * d
134
+ m2 += d2
135
+ m3 += d2 * d
136
+ m4 += d2 * d2
137
+ m2 /= n
138
+ m3 /= n
139
+ m4 /= n
140
+
141
+ if m2 < 1e-20:
142
+ return 0.0
143
+
144
+ # Skewness
145
+ skewness = m3 / (m2 ** 1.5)
146
+
147
+ # Kurtosis (excess kurtosis + 3 = regular kurtosis)
148
+ kurtosis = m4 / (m2 ** 2)
149
+
150
+ if kurtosis < 1e-20:
151
+ return 0.0
152
+
153
+ return (skewness * skewness + 1.0) / kurtosis
154
+
155
+
156
+ def compute_assembly_stats(
157
+ contigs: List[str],
158
+ log10_total_kmer_count: float = 0.0,
159
+ kmer_counts: Optional[np.ndarray] = None,
160
+ ) -> np.ndarray:
161
+ """
162
+ Compute all 26 assembly statistics features.
163
+
164
+ Parameters
165
+ ----------
166
+ contigs : list of str
167
+ List of contig sequences.
168
+ log10_total_kmer_count : float
169
+ Pre-computed log10 total k-mer count (filled during k-mer counting step).
170
+ kmer_counts : np.ndarray or None
171
+ Raw k-mer counts array of shape (n_kmer_features,). If provided,
172
+ used to compute the 6 new k-mer summary features.
173
+
174
+ Returns
175
+ -------
176
+ np.ndarray
177
+ Array of 26 features in order defined by FEATURE_NAMES.
178
+ """
179
+ features = np.zeros(N_FEATURES, dtype=np.float64)
180
+
181
+ n_contigs = len(contigs)
182
+ if n_contigs == 0:
183
+ return features
184
+
185
+ # Compute contig lengths and GC content per contig
186
+ lengths = np.empty(n_contigs, dtype=np.int64)
187
+ gc_values = np.empty(n_contigs, dtype=np.float64)
188
+
189
+ for i, contig in enumerate(contigs):
190
+ lengths[i] = len(contig)
191
+ if len(contig) > 0:
192
+ seq_bytes = np.frombuffer(contig.encode('ascii'), dtype=np.uint8)
193
+ gc_values[i] = _compute_gc_from_bytes(seq_bytes)
194
+ else:
195
+ gc_values[i] = 0.5
196
+
197
+ # Sort lengths descending for N50/N90 computation
198
+ sorted_lengths = np.sort(lengths)[::-1]
199
+ total_length = int(lengths.sum())
200
+
201
+ # === 11 Contig length stats ===
202
+ features[0] = total_length # total_length
203
+ features[1] = n_contigs # contig_count
204
+
205
+ n50, l50 = _compute_nx_lx(sorted_lengths, total_length, 0.5)
206
+ n90, l90 = _compute_nx_lx(sorted_lengths, total_length, 0.9)
207
+ features[2] = n50 # n50
208
+ features[3] = n90 # n90
209
+ features[4] = l50 # l50
210
+ features[5] = l90 # l90
211
+ features[6] = sorted_lengths[0] # largest_contig
212
+ features[7] = sorted_lengths[-1] # smallest_contig
213
+ mean_length = total_length / n_contigs
214
+ features[8] = mean_length # mean_contig
215
+ features[9] = float(np.median(lengths)) # median_contig
216
+ features[10] = float(np.std(lengths)) # contig_length_std
217
+
218
+ # === 4 Base composition ===
219
+ # Weight GC by contig length for overall mean
220
+ length_weights = lengths.astype(np.float64) / total_length
221
+ gc_mean = float(np.sum(gc_values * length_weights))
222
+ features[11] = gc_mean # gc_mean
223
+
224
+ # GC std across contigs (weighted)
225
+ gc_var = float(np.sum(length_weights * (gc_values - gc_mean) ** 2))
226
+ gc_std = np.sqrt(gc_var)
227
+ features[12] = gc_std # gc_std
228
+
229
+ # GC IQR
230
+ if n_contigs >= 4:
231
+ q75, q25 = np.percentile(gc_values, [75, 25])
232
+ gc_iqr = q75 - q25
233
+ else:
234
+ gc_iqr = gc_std * 1.35 # Approximate IQR from std for small samples
235
+ features[13] = gc_iqr # gc_iqr
236
+
237
+ # GC bimodality: (skewness^2 + 1) / kurtosis
238
+ features[14] = _compute_bimodality(gc_values) # gc_bimodality
239
+
240
+ # === 4 Distributional ===
241
+ # GC outlier fraction: fraction of total length in contigs with |GC - mean| > 2*std
242
+ if gc_std > 1e-10:
243
+ outlier_mask = np.abs(gc_values - gc_mean) > 2 * gc_std
244
+ gc_outlier_fraction = float(lengths[outlier_mask].sum()) / total_length
245
+ else:
246
+ gc_outlier_fraction = 0.0
247
+ features[15] = gc_outlier_fraction # gc_outlier_fraction
248
+
249
+ # Largest contig fraction
250
+ features[16] = float(sorted_lengths[0]) / total_length # largest_contig_fraction
251
+
252
+ # Top 10% contig concentration
253
+ n_top10 = max(1, int(np.ceil(n_contigs * 0.1)))
254
+ top10_length = float(sorted_lengths[:n_top10].sum())
255
+ features[17] = top10_length / total_length # top10_concentration
256
+
257
+ # N50/mean ratio
258
+ if mean_length > 0:
259
+ features[18] = n50 / mean_length # n50_mean_ratio
260
+ else:
261
+ features[18] = 0.0
262
+
263
+ # === 1 K-mer (legacy) ===
264
+ features[19] = log10_total_kmer_count # log10_total_kmer_count
265
+
266
+ # === 6 New k-mer summary features ===
267
+ if kmer_counts is not None:
268
+ total_kmer_sum = float(kmer_counts.sum())
269
+ features[20] = total_kmer_sum # total_kmer_sum
270
+
271
+ unique_count = int(np.count_nonzero(kmer_counts))
272
+ features[21] = float(unique_count) # unique_kmer_count
273
+
274
+ duplicate_count = int(np.sum(kmer_counts > 1))
275
+ features[22] = float(duplicate_count) # duplicate_kmer_count
276
+
277
+ # Shannon entropy of k-mer count distribution
278
+ if total_kmer_sum > 0:
279
+ probs = kmer_counts.astype(np.float64) / total_kmer_sum
280
+ # Only compute for non-zero entries to avoid log(0)
281
+ nonzero_mask = probs > 0
282
+ entropy = -float(np.sum(probs[nonzero_mask] * np.log2(probs[nonzero_mask])))
283
+ else:
284
+ entropy = 0.0
285
+ features[23] = entropy # kmer_entropy
286
+
287
+ n_total_kmers = len(kmer_counts)
288
+ features[24] = unique_count / n_total_kmers if n_total_kmers > 0 else 0.0 # unique_kmer_ratio
289
+
290
+ features[25] = duplicate_count / unique_count if unique_count > 0 else 0.0 # duplicate_kmer_ratio
291
+ # else: features[20:26] remain 0.0
292
+
293
+ return features
294
+
295
+
296
+ def compute_assembly_stats_batch(
297
+ batch_contigs: List[List[str]],
298
+ log10_kmer_counts: Optional[np.ndarray] = None,
299
+ batch_kmer_counts: Optional[np.ndarray] = None,
300
+ ) -> np.ndarray:
301
+ """
302
+ Compute assembly statistics for a batch of genomes.
303
+
304
+ Parameters
305
+ ----------
306
+ batch_contigs : list of list of str
307
+ List of contig lists, one per genome.
308
+ log10_kmer_counts : np.ndarray or None
309
+ Array of log10 total k-mer counts, one per genome.
310
+ batch_kmer_counts : np.ndarray or None
311
+ Raw k-mer counts, shape (n_genomes, n_kmer_features). If provided,
312
+ used to compute the 6 new k-mer summary features.
313
+
314
+ Returns
315
+ -------
316
+ np.ndarray
317
+ Array of shape (n_genomes, 26).
318
+ """
319
+ n = len(batch_contigs)
320
+ result = np.zeros((n, N_FEATURES), dtype=np.float64)
321
+
322
+ for i in range(n):
323
+ kmer_count = float(log10_kmer_counts[i]) if log10_kmer_counts is not None else 0.0
324
+ kc = batch_kmer_counts[i] if batch_kmer_counts is not None else None
325
+ result[i] = compute_assembly_stats(batch_contigs[i], kmer_count, kc)
326
+
327
+ return result
328
+
329
+
330
+ def format_stats(features: np.ndarray) -> Dict[str, float]:
331
+ """Convert feature array to named dictionary."""
332
+ return {name: float(features[i]) for i, name in enumerate(FEATURE_NAMES)}