magicc 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magicc/__init__.py +7 -0
- magicc/__main__.py +7 -0
- magicc/assembly_stats.py +332 -0
- magicc/cli.py +587 -0
- magicc/contamination.py +374 -0
- magicc/data/normalization_params.json +102137 -0
- magicc/data/selected_kmers.txt +9249 -0
- magicc/fragmentation.py +768 -0
- magicc/kmer_counter.py +281 -0
- magicc/model.py +733 -0
- magicc/normalization.py +391 -0
- magicc/pipeline.py +253 -0
- magicc/storage.py +295 -0
- magicc/trainer.py +577 -0
- magicc-0.1.0.dist-info/METADATA +115 -0
- magicc-0.1.0.dist-info/RECORD +20 -0
- magicc-0.1.0.dist-info/WHEEL +5 -0
- magicc-0.1.0.dist-info/entry_points.txt +2 -0
- magicc-0.1.0.dist-info/licenses/LICENSE +21 -0
- magicc-0.1.0.dist-info/top_level.txt +1 -0
magicc/__init__.py
ADDED
magicc/__main__.py
ADDED
magicc/assembly_stats.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Assembly Statistics Module for MAGICC.
|
|
3
|
+
|
|
4
|
+
Computes all 26 assembly statistics features in a single pass:
|
|
5
|
+
- 11 contig length stats: total_length, contig_count, n50, n90, l50, l90,
|
|
6
|
+
largest_contig, smallest_contig, mean_contig, median_contig, contig_length_std
|
|
7
|
+
- 4 base composition: gc_mean, gc_std, gc_iqr, gc_bimodality
|
|
8
|
+
- 4 distributional: gc_outlier_fraction, largest_contig_fraction,
|
|
9
|
+
top10_concentration, n50_mean_ratio
|
|
10
|
+
- 1 k-mer (legacy): log10_total_kmer_count (placeholder, filled during k-mer counting)
|
|
11
|
+
- 6 k-mer summary features (new): total_kmer_sum, unique_kmer_count,
|
|
12
|
+
duplicate_kmer_count, kmer_entropy, unique_kmer_ratio, duplicate_kmer_ratio
|
|
13
|
+
|
|
14
|
+
Uses Numba JIT compilation for performance-critical parts.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import numba as nb
|
|
19
|
+
from typing import List, Dict, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
# Canonical ordering of the 26 assembly-statistics features. Downstream code
# indexes feature arrays by position, so this order must never change.
FEATURE_NAMES = [
    # 11 contig length stats
    'total_length',
    'contig_count',
    'n50',
    'n90',
    'l50',
    'l90',
    'largest_contig',
    'smallest_contig',
    'mean_contig',
    'median_contig',
    'contig_length_std',
    # 4 base composition
    'gc_mean',
    'gc_std',
    'gc_iqr',
    'gc_bimodality',
    # 4 distributional
    'gc_outlier_fraction',
    'largest_contig_fraction',
    'top10_concentration',
    'n50_mean_ratio',
    # 1 k-mer (legacy)
    'log10_total_kmer_count',
    # 6 new k-mer summary features
    'total_kmer_sum',        # sum of all 9,249 k-mer raw counts
    'unique_kmer_count',     # number of k-mers with count > 0
    'duplicate_kmer_count',  # number of k-mers with count > 1
    'kmer_entropy',          # Shannon entropy of k-mer count distribution
    'unique_kmer_ratio',     # unique_kmer_count / 9249
    'duplicate_kmer_ratio',  # duplicate_kmer_count / unique_kmer_count
]

N_FEATURES = len(FEATURE_NAMES)  # 26

# Import-time sanity check. A bare `assert` would be silently stripped under
# `python -O`, so raise explicitly to guarantee the invariant always holds.
if N_FEATURES != 26:
    raise RuntimeError(f"FEATURE_NAMES must define exactly 26 features, got {N_FEATURES}")

# Feature name -> positional index mapping
FEATURE_INDEX = {name: i for i, name in enumerate(FEATURE_NAMES)}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@nb.njit(cache=True)
def _compute_gc_from_bytes(seq_bytes: np.ndarray) -> float:
    """Return the GC fraction of a DNA sequence given as ASCII byte codes.

    Ambiguous characters (N, gaps, ...) are excluded from both numerator
    and denominator; an empty or all-ambiguous sequence yields 0.5.
    """
    gc_bases = 0
    acgt_bases = 0
    for idx in range(seq_bytes.shape[0]):
        code = seq_bytes[idx]
        # ASCII codes: G=71, C=67, g=103, c=99 / A=65, T=84, a=97, t=116
        if code == 71 or code == 67 or code == 103 or code == 99:
            gc_bases += 1
            acgt_bases += 1
        elif code == 65 or code == 84 or code == 97 or code == 116:
            acgt_bases += 1
        # every other byte (N and friends) is skipped entirely
    if acgt_bases == 0:
        return 0.5  # neutral value when no unambiguous bases exist
    return gc_bases / acgt_bases
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@nb.njit(cache=True)
def _compute_nx_lx(sorted_lengths: np.ndarray, total_length: int, fraction: float) -> Tuple:
    """
    Return the (Nx, Lx) pair for a given coverage fraction.

    Walks the descending-sorted contig lengths, accumulating until the
    running sum first reaches ``fraction`` of ``total_length``.

    Parameters
    ----------
    sorted_lengths : np.ndarray
        Contig lengths sorted in descending order (assumed non-empty).
    total_length : int
        Total assembly length.
    fraction : float
        Coverage fraction, e.g. 0.5 for N50/L50 or 0.9 for N90/L90.

    Returns
    -------
    tuple of (nx, lx)
        nx = length of the contig at which the threshold is first met;
        lx = number of contigs consumed to reach it.
    """
    target = total_length * fraction
    cumulative = 0
    n = len(sorted_lengths)
    idx = 0
    while idx < n:
        cumulative += sorted_lengths[idx]
        if cumulative >= target:
            return sorted_lengths[idx], idx + 1
        idx += 1
    # Fallback: the threshold was never reached (only possible if fraction > 1)
    return sorted_lengths[-1], n
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@nb.njit(cache=True)
def _compute_bimodality(values: np.ndarray) -> float:
    """
    Sarle's bimodality coefficient: (skewness^2 + 1) / kurtosis.

    The denominator is the regular (non-excess) kurtosis, m4 / m2^2.
    Returns 0.0 for fewer than 4 samples or (near-)constant input.
    """
    n = len(values)
    if n < 4:
        return 0.0

    # First pass: arithmetic mean
    mu = 0.0
    for i in range(n):
        mu += values[i]
    mu /= n

    # Second pass: 2nd, 3rd and 4th central moments
    c2 = 0.0
    c3 = 0.0
    c4 = 0.0
    for i in range(n):
        diff = values[i] - mu
        sq = diff * diff
        c2 += sq
        c3 += sq * diff
        c4 += sq * sq
    c2 /= n
    c3 /= n
    c4 /= n

    # Degenerate (constant) sample: moments vanish, coefficient undefined
    if c2 < 1e-20:
        return 0.0

    skew = c3 / (c2 ** 1.5)
    kurt = c4 / (c2 ** 2)

    if kurt < 1e-20:
        return 0.0

    return (skew * skew + 1.0) / kurt
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def compute_assembly_stats(
    contigs: List[str],
    log10_total_kmer_count: float = 0.0,
    kmer_counts: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute all 26 assembly statistics features.

    Parameters
    ----------
    contigs : list of str
        List of contig sequences.
    log10_total_kmer_count : float
        Pre-computed log10 total k-mer count (filled during k-mer counting step).
    kmer_counts : np.ndarray or None
        Raw k-mer counts array of shape (n_kmer_features,). If provided,
        used to compute the 6 k-mer summary features.

    Returns
    -------
    np.ndarray
        Array of 26 features in the order defined by FEATURE_NAMES.
        All zeros for an empty contig list. When every contig is an empty
        string (total length 0), length-normalized statistics are left at
        0.0 instead of dividing by zero.
    """
    features = np.zeros(N_FEATURES, dtype=np.float64)

    n_contigs = len(contigs)
    if n_contigs == 0:
        return features

    # Per-contig lengths and GC content (single pass over the sequences)
    lengths = np.empty(n_contigs, dtype=np.int64)
    gc_values = np.empty(n_contigs, dtype=np.float64)
    for i, contig in enumerate(contigs):
        lengths[i] = len(contig)
        if len(contig) > 0:
            seq_bytes = np.frombuffer(contig.encode('ascii'), dtype=np.uint8)
            gc_values[i] = _compute_gc_from_bytes(seq_bytes)
        else:
            gc_values[i] = 0.5  # neutral GC for an empty contig

    # Sort lengths descending for N50/N90 computation
    sorted_lengths = np.sort(lengths)[::-1]
    total_length = int(lengths.sum())

    # Features that do not depend on a positive total length
    features[1] = n_contigs               # contig_count
    features[19] = log10_total_kmer_count # log10_total_kmer_count (legacy)
    if kmer_counts is not None:
        _fill_kmer_summary(features, kmer_counts)

    if total_length == 0:
        # Guard: all contigs are empty strings. Every length-normalized
        # statistic below would divide by zero, so leave them at 0.0.
        return features

    # === 11 Contig length stats ===
    features[0] = total_length                 # total_length

    n50, l50 = _compute_nx_lx(sorted_lengths, total_length, 0.5)
    n90, l90 = _compute_nx_lx(sorted_lengths, total_length, 0.9)
    features[2] = n50                          # n50
    features[3] = n90                          # n90
    features[4] = l50                          # l50
    features[5] = l90                          # l90
    features[6] = sorted_lengths[0]            # largest_contig
    features[7] = sorted_lengths[-1]           # smallest_contig
    mean_length = total_length / n_contigs
    features[8] = mean_length                  # mean_contig
    features[9] = float(np.median(lengths))    # median_contig
    features[10] = float(np.std(lengths))      # contig_length_std

    # === 4 Base composition ===
    # GC mean/std are length-weighted so long contigs dominate, matching
    # the assembly-wide base composition.
    length_weights = lengths.astype(np.float64) / total_length
    gc_mean = float(np.sum(gc_values * length_weights))
    features[11] = gc_mean                     # gc_mean

    gc_var = float(np.sum(length_weights * (gc_values - gc_mean) ** 2))
    gc_std = np.sqrt(gc_var)
    features[12] = gc_std                      # gc_std

    # GC IQR (normal-approximation fallback for tiny samples)
    if n_contigs >= 4:
        q75, q25 = np.percentile(gc_values, [75, 25])
        gc_iqr = q75 - q25
    else:
        gc_iqr = gc_std * 1.35
    features[13] = gc_iqr                      # gc_iqr

    # GC bimodality: (skewness^2 + 1) / kurtosis
    features[14] = _compute_bimodality(gc_values)  # gc_bimodality

    # === 4 Distributional ===
    # Fraction of total length in contigs with |GC - mean| > 2*std
    if gc_std > 1e-10:
        outlier_mask = np.abs(gc_values - gc_mean) > 2 * gc_std
        gc_outlier_fraction = float(lengths[outlier_mask].sum()) / total_length
    else:
        gc_outlier_fraction = 0.0
    features[15] = gc_outlier_fraction         # gc_outlier_fraction

    features[16] = float(sorted_lengths[0]) / total_length  # largest_contig_fraction

    # Top 10% contig concentration
    n_top10 = max(1, int(np.ceil(n_contigs * 0.1)))
    features[17] = float(sorted_lengths[:n_top10].sum()) / total_length  # top10_concentration

    # N50/mean ratio (mean_length > 0 is guaranteed here)
    features[18] = n50 / mean_length if mean_length > 0 else 0.0  # n50_mean_ratio

    return features


def _fill_kmer_summary(features: np.ndarray, kmer_counts: np.ndarray) -> None:
    """Fill features[20:26] (the six k-mer summary statistics) in place."""
    total_kmer_sum = float(kmer_counts.sum())
    features[20] = total_kmer_sum              # total_kmer_sum

    unique_count = int(np.count_nonzero(kmer_counts))
    features[21] = float(unique_count)         # unique_kmer_count

    duplicate_count = int(np.sum(kmer_counts > 1))
    features[22] = float(duplicate_count)      # duplicate_kmer_count

    # Shannon entropy (bits) of the normalized k-mer count distribution;
    # restricted to non-zero entries to avoid log(0).
    if total_kmer_sum > 0:
        probs = kmer_counts.astype(np.float64) / total_kmer_sum
        nonzero_mask = probs > 0
        entropy = -float(np.sum(probs[nonzero_mask] * np.log2(probs[nonzero_mask])))
    else:
        entropy = 0.0
    features[23] = entropy                     # kmer_entropy

    n_total_kmers = len(kmer_counts)
    features[24] = unique_count / n_total_kmers if n_total_kmers > 0 else 0.0    # unique_kmer_ratio
    features[25] = duplicate_count / unique_count if unique_count > 0 else 0.0   # duplicate_kmer_ratio
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def compute_assembly_stats_batch(
|
|
297
|
+
batch_contigs: List[List[str]],
|
|
298
|
+
log10_kmer_counts: Optional[np.ndarray] = None,
|
|
299
|
+
batch_kmer_counts: Optional[np.ndarray] = None,
|
|
300
|
+
) -> np.ndarray:
|
|
301
|
+
"""
|
|
302
|
+
Compute assembly statistics for a batch of genomes.
|
|
303
|
+
|
|
304
|
+
Parameters
|
|
305
|
+
----------
|
|
306
|
+
batch_contigs : list of list of str
|
|
307
|
+
List of contig lists, one per genome.
|
|
308
|
+
log10_kmer_counts : np.ndarray or None
|
|
309
|
+
Array of log10 total k-mer counts, one per genome.
|
|
310
|
+
batch_kmer_counts : np.ndarray or None
|
|
311
|
+
Raw k-mer counts, shape (n_genomes, n_kmer_features). If provided,
|
|
312
|
+
used to compute the 6 new k-mer summary features.
|
|
313
|
+
|
|
314
|
+
Returns
|
|
315
|
+
-------
|
|
316
|
+
np.ndarray
|
|
317
|
+
Array of shape (n_genomes, 26).
|
|
318
|
+
"""
|
|
319
|
+
n = len(batch_contigs)
|
|
320
|
+
result = np.zeros((n, N_FEATURES), dtype=np.float64)
|
|
321
|
+
|
|
322
|
+
for i in range(n):
|
|
323
|
+
kmer_count = float(log10_kmer_counts[i]) if log10_kmer_counts is not None else 0.0
|
|
324
|
+
kc = batch_kmer_counts[i] if batch_kmer_counts is not None else None
|
|
325
|
+
result[i] = compute_assembly_stats(batch_contigs[i], kmer_count, kc)
|
|
326
|
+
|
|
327
|
+
return result
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def format_stats(features: np.ndarray) -> Dict[str, float]:
    """Map each feature value to its canonical name from FEATURE_NAMES."""
    named = {}
    for idx, feature_name in enumerate(FEATURE_NAMES):
        named[feature_name] = float(features[idx])
    return named
|