magicc 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magicc/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ MAGICC - Metagenome-Assembled Genome Inference of Completeness and Contamination
3
+
4
+ Ultra-fast genome quality assessment using core gene k-mer profiles and deep learning.
5
+ """
6
+
7
+ __version__ = "0.1.0"
magicc/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Entry point for `python -m magicc`.
3
+ """
4
+ from magicc.cli import main
5
+
6
+ if __name__ == '__main__':
7
+ main()
@@ -0,0 +1,332 @@
1
+ """
2
+ Assembly Statistics Module for MAGICC.
3
+
4
+ Computes all 26 assembly statistics features in a single pass:
5
+ - 11 contig length stats: total_length, contig_count, n50, n90, l50, l90,
6
+ largest_contig, smallest_contig, mean_contig, median_contig, contig_length_std
7
+ - 4 base composition: gc_mean, gc_std, gc_iqr, gc_bimodality
8
+ - 4 distributional: gc_outlier_fraction, largest_contig_fraction,
9
+ top10_concentration, n50_mean_ratio
10
+ - 1 k-mer (legacy): log10_total_kmer_count (placeholder, filled during k-mer counting)
11
+ - 6 k-mer summary features (new): total_kmer_sum, unique_kmer_count,
12
+ duplicate_kmer_count, kmer_entropy, unique_kmer_ratio, duplicate_kmer_ratio
13
+
14
+ Uses Numba JIT compilation for performance-critical parts.
15
+ """
16
+
17
+ import numpy as np
18
+ import numba as nb
19
+ from typing import List, Dict, Optional, Tuple
20
+
21
# ---------------------------------------------------------------------------
# Canonical feature ordering (26 features total), grouped by category.
# ---------------------------------------------------------------------------

# 11 contig length statistics.
_LENGTH_FEATURES = [
    'total_length',
    'contig_count',
    'n50',
    'n90',
    'l50',
    'l90',
    'largest_contig',
    'smallest_contig',
    'mean_contig',
    'median_contig',
    'contig_length_std',
]

# 4 base-composition statistics.
_COMPOSITION_FEATURES = [
    'gc_mean',
    'gc_std',
    'gc_iqr',
    'gc_bimodality',
]

# 4 distributional statistics.
_DISTRIBUTIONAL_FEATURES = [
    'gc_outlier_fraction',
    'largest_contig_fraction',
    'top10_concentration',
    'n50_mean_ratio',
]

# 1 legacy k-mer feature (placeholder, filled during k-mer counting).
_LEGACY_KMER_FEATURES = [
    'log10_total_kmer_count',
]

# 6 summary statistics derived from the raw k-mer count vector.
_KMER_SUMMARY_FEATURES = [
    'total_kmer_sum',        # sum of all 9,249 k-mer raw counts
    'unique_kmer_count',     # number of k-mers with count > 0
    'duplicate_kmer_count',  # number of k-mers with count > 1
    'kmer_entropy',          # Shannon entropy of k-mer count distribution
    'unique_kmer_ratio',     # unique_kmer_count / 9249
    'duplicate_kmer_ratio',  # duplicate_kmer_count / unique_kmer_count
]

# Feature names in canonical order: 11 + 4 + 4 + 1 + 6 = 26.
FEATURE_NAMES = (
    _LENGTH_FEATURES
    + _COMPOSITION_FEATURES
    + _DISTRIBUTIONAL_FEATURES
    + _LEGACY_KMER_FEATURES
    + _KMER_SUMMARY_FEATURES
)

N_FEATURES = len(FEATURE_NAMES)  # 26
assert N_FEATURES == 26

# Reverse lookup: feature name -> position in the feature vector.
FEATURE_INDEX = {name: position for position, name in enumerate(FEATURE_NAMES)}
61
+
62
+
63
@nb.njit(cache=True)
def _compute_gc_from_bytes(seq_bytes: np.ndarray) -> float:
    """Return the GC fraction of a DNA sequence given as ASCII byte codes.

    Only unambiguous bases (A/C/G/T, either case) contribute to the
    denominator; N and any other characters are skipped. Returns a neutral
    0.5 when the sequence contains no countable bases.
    """
    gc_count = 0
    acgt_count = 0
    for b in seq_bytes:
        # ASCII codes: G=71, g=103, C=67, c=99, A=65, a=97, T=84, t=116
        if b == 71 or b == 103 or b == 67 or b == 99:
            gc_count += 1
            acgt_count += 1
        elif b == 65 or b == 97 or b == 84 or b == 116:
            acgt_count += 1
    if acgt_count == 0:
        return 0.5
    return gc_count / acgt_count
80
+
81
+
82
@nb.njit(cache=True)
def _compute_nx_lx(sorted_lengths: np.ndarray, total_length: int, fraction: float) -> Tuple:
    """Return the (Nx, Lx) pair for a descending-sorted length array.

    Parameters
    ----------
    sorted_lengths : np.ndarray
        Contig lengths sorted in descending order.
    total_length : int
        Total assembly length.
    fraction : float
        Coverage fraction, e.g. 0.5 for N50/L50 or 0.9 for N90/L90.

    Returns
    -------
    tuple of (nx, lx)
        nx: the length of the contig at which the cumulative sum first
        reaches ``fraction`` of the assembly; lx: how many contigs that took.
        Falls back to the shortest contig and the full count if the
        threshold is never reached.
    """
    target = total_length * fraction
    cumulative = 0
    n = len(sorted_lengths)
    idx = 0
    while idx < n:
        cumulative += sorted_lengths[idx]
        if cumulative >= target:
            return sorted_lengths[idx], idx + 1
        idx += 1
    return sorted_lengths[n - 1], n
109
+
110
+
111
@nb.njit(cache=True)
def _compute_bimodality(values: np.ndarray) -> float:
    """Return the bimodality coefficient (skewness^2 + 1) / kurtosis.

    The denominator is the regular (non-excess) kurtosis m4 / m2^2.
    Returns 0.0 for fewer than 4 samples, for (near-)zero variance, or
    for (near-)zero kurtosis.
    """
    n = len(values)
    if n < 4:
        return 0.0

    # Sample mean (loop form keeps numba happy and accumulation order fixed).
    total = 0.0
    for i in range(n):
        total += values[i]
    mu = total / n

    # Second, third, and fourth central moments in one pass.
    s2 = 0.0
    s3 = 0.0
    s4 = 0.0
    for i in range(n):
        dev = values[i] - mu
        dev_sq = dev * dev
        s2 += dev_sq
        s3 += dev_sq * dev
        s4 += dev_sq * dev_sq
    m2 = s2 / n
    m3 = s3 / n
    m4 = s4 / n

    # Degenerate: essentially constant input.
    if m2 < 1e-20:
        return 0.0

    skewness = m3 / (m2 ** 1.5)
    kurtosis = m4 / (m2 ** 2)

    if kurtosis < 1e-20:
        return 0.0

    return (skewness * skewness + 1.0) / kurtosis
154
+
155
+
156
def compute_assembly_stats(
    contigs: List[str],
    log10_total_kmer_count: float = 0.0,
    kmer_counts: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute all 26 assembly statistics features.

    Parameters
    ----------
    contigs : list of str
        List of contig sequences (ASCII DNA).
    log10_total_kmer_count : float
        Pre-computed log10 total k-mer count (filled during k-mer counting step).
    kmer_counts : np.ndarray or None
        Raw k-mer counts array of shape (n_kmer_features,). If provided,
        used to compute the 6 k-mer summary features.

    Returns
    -------
    np.ndarray
        Array of 26 features in the order defined by FEATURE_NAMES.
        For an empty contig list all features are 0. If every contig is an
        empty string, length-derived ratios are left at 0 (instead of NaN)
        but contig_count and the k-mer features are still populated.
    """
    features = np.zeros(N_FEATURES, dtype=np.float64)

    n_contigs = len(contigs)
    if n_contigs == 0:
        return features

    # Per-contig lengths and GC fractions.
    lengths = np.empty(n_contigs, dtype=np.int64)
    gc_values = np.empty(n_contigs, dtype=np.float64)
    for i, contig in enumerate(contigs):
        lengths[i] = len(contig)
        if len(contig) > 0:
            seq_bytes = np.frombuffer(contig.encode('ascii'), dtype=np.uint8)
            gc_values[i] = _compute_gc_from_bytes(seq_bytes)
        else:
            gc_values[i] = 0.5  # neutral GC for an empty contig

    # Sort lengths descending for N50/N90 computation.
    sorted_lengths = np.sort(lengths)[::-1]
    total_length = int(lengths.sum())

    # Guard: if every contig is empty, the length-weighted ratios below
    # would divide by zero and fill the vector with NaN/inf. Report what is
    # still well-defined and leave the rest at 0.
    if total_length == 0:
        features[1] = n_contigs  # contig_count
        features[19] = log10_total_kmer_count  # log10_total_kmer_count
        _fill_kmer_summary(features, kmer_counts)
        return features

    # === 11 Contig length stats ===
    features[0] = total_length  # total_length
    features[1] = n_contigs  # contig_count

    n50, l50 = _compute_nx_lx(sorted_lengths, total_length, 0.5)
    n90, l90 = _compute_nx_lx(sorted_lengths, total_length, 0.9)
    features[2] = n50  # n50
    features[3] = n90  # n90
    features[4] = l50  # l50
    features[5] = l90  # l90
    features[6] = sorted_lengths[0]  # largest_contig
    features[7] = sorted_lengths[-1]  # smallest_contig
    mean_length = total_length / n_contigs  # > 0 thanks to the guard above
    features[8] = mean_length  # mean_contig
    features[9] = float(np.median(lengths))  # median_contig
    features[10] = float(np.std(lengths))  # contig_length_std

    # === 4 Base composition ===
    # Weight per-contig GC by contig length for the overall mean.
    length_weights = lengths.astype(np.float64) / total_length
    gc_mean = float(np.sum(gc_values * length_weights))
    features[11] = gc_mean  # gc_mean

    # Length-weighted GC standard deviation across contigs.
    gc_var = float(np.sum(length_weights * (gc_values - gc_mean) ** 2))
    gc_std = np.sqrt(gc_var)
    features[12] = gc_std  # gc_std

    # GC IQR; for very few contigs the percentiles are unstable, so
    # approximate the IQR from the std (IQR ~= 1.35 sigma for a normal).
    if n_contigs >= 4:
        q75, q25 = np.percentile(gc_values, [75, 25])
        gc_iqr = q75 - q25
    else:
        gc_iqr = gc_std * 1.35
    features[13] = gc_iqr  # gc_iqr

    # GC bimodality: (skewness^2 + 1) / kurtosis.
    features[14] = _compute_bimodality(gc_values)  # gc_bimodality

    # === 4 Distributional ===
    # Fraction of total length in contigs whose GC deviates > 2 std from mean.
    if gc_std > 1e-10:
        outlier_mask = np.abs(gc_values - gc_mean) > 2 * gc_std
        gc_outlier_fraction = float(lengths[outlier_mask].sum()) / total_length
    else:
        gc_outlier_fraction = 0.0
    features[15] = gc_outlier_fraction  # gc_outlier_fraction

    features[16] = float(sorted_lengths[0]) / total_length  # largest_contig_fraction

    # Fraction of assembly length held by the longest 10% of contigs.
    n_top10 = max(1, int(np.ceil(n_contigs * 0.1)))
    top10_length = float(sorted_lengths[:n_top10].sum())
    features[17] = top10_length / total_length  # top10_concentration

    features[18] = n50 / mean_length  # n50_mean_ratio

    # === 1 K-mer (legacy) ===
    features[19] = log10_total_kmer_count  # log10_total_kmer_count

    # === 6 K-mer summary features ===
    _fill_kmer_summary(features, kmer_counts)

    return features


def _fill_kmer_summary(features: np.ndarray, kmer_counts: Optional[np.ndarray]) -> None:
    """Fill features[20:26] (k-mer summary stats) in place.

    Leaves the slots at 0.0 when *kmer_counts* is None.
    """
    if kmer_counts is None:
        return

    total_kmer_sum = float(kmer_counts.sum())
    features[20] = total_kmer_sum  # total_kmer_sum

    unique_count = int(np.count_nonzero(kmer_counts))
    features[21] = float(unique_count)  # unique_kmer_count

    duplicate_count = int(np.sum(kmer_counts > 1))
    features[22] = float(duplicate_count)  # duplicate_kmer_count

    # Shannon entropy of the k-mer count distribution; restrict to non-zero
    # probabilities to avoid log(0).
    if total_kmer_sum > 0:
        probs = kmer_counts.astype(np.float64) / total_kmer_sum
        nonzero_mask = probs > 0
        entropy = -float(np.sum(probs[nonzero_mask] * np.log2(probs[nonzero_mask])))
    else:
        entropy = 0.0
    features[23] = entropy  # kmer_entropy

    n_total_kmers = len(kmer_counts)
    features[24] = unique_count / n_total_kmers if n_total_kmers > 0 else 0.0  # unique_kmer_ratio
    features[25] = duplicate_count / unique_count if unique_count > 0 else 0.0  # duplicate_kmer_ratio
294
+
295
+
296
def compute_assembly_stats_batch(
    batch_contigs: List[List[str]],
    log10_kmer_counts: Optional[np.ndarray] = None,
    batch_kmer_counts: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute assembly statistics for a batch of genomes.

    Parameters
    ----------
    batch_contigs : list of list of str
        One list of contig sequences per genome.
    log10_kmer_counts : np.ndarray or None
        Per-genome log10 total k-mer counts; 0.0 is used when absent.
    batch_kmer_counts : np.ndarray or None
        Raw k-mer counts, shape (n_genomes, n_kmer_features). When given,
        the 6 k-mer summary features are computed from each row.

    Returns
    -------
    np.ndarray
        Array of shape (n_genomes, 26), one feature row per genome.
    """
    n_genomes = len(batch_contigs)
    result = np.zeros((n_genomes, N_FEATURES), dtype=np.float64)

    for idx, contigs in enumerate(batch_contigs):
        log_kc = 0.0 if log10_kmer_counts is None else float(log10_kmer_counts[idx])
        raw_counts = None if batch_kmer_counts is None else batch_kmer_counts[idx]
        result[idx] = compute_assembly_stats(contigs, log_kc, raw_counts)

    return result
328
+
329
+
330
def format_stats(features: np.ndarray) -> Dict[str, float]:
    """Return the feature vector as a name -> value dictionary."""
    named = {}
    for position, feature_name in enumerate(FEATURE_NAMES):
        named[feature_name] = float(features[position])
    return named