speconsense 0.7.2__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speconsense-0.7.2/speconsense.egg-info → speconsense-0.7.3}/PKG-INFO +60 -11
- {speconsense-0.7.2 → speconsense-0.7.3}/README.md +59 -10
- {speconsense-0.7.2 → speconsense-0.7.3}/pyproject.toml +1 -1
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/__init__.py +1 -1
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/cli.py +18 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/__init__.py +1 -0
- speconsense-0.7.3/speconsense/profiles/compressed.yaml +27 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/cli.py +33 -3
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/fields.py +5 -3
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/io.py +10 -1
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/merging.py +97 -76
- {speconsense-0.7.2 → speconsense-0.7.3/speconsense.egg-info}/PKG-INFO +60 -11
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/SOURCES.txt +2 -0
- speconsense-0.7.3/tests/test_complement_flags.py +180 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_summarize.py +203 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/LICENSE +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/setup.cfg +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/cli.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/__init__.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/__main__.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/clusterer.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/workers.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/msa.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/example.yaml +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/herbarium.yaml +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/largedata.yaml +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/nostalgia.yaml +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/strict.yaml +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/quality_report.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/__init__.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/base.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/config.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/vsearch.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/__init__.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/__main__.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/analysis.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/clustering.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/iupac.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/synth.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/types.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/dependency_links.txt +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/entry_points.txt +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/requires.txt +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/top_level.txt +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_ambiguity_calling.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_augment_input.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_fields.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_haplotype_filtering.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_orientation.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_overlap_merge.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_overlap_merge_integration.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_profiles.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_regression.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_synth.py +0 -0
- {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_variant_phasing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: speconsense
|
|
3
|
-
Version: 0.7.2
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
|
|
5
5
|
Author-email: Josh Walker <joshowalker@yahoo.com>
|
|
6
6
|
License: BSD-3-Clause
|
|
@@ -171,6 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
**Bundled profiles:**
|
|
174
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
174
175
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
175
176
|
- `largedata` — Experimental settings for large input files
|
|
176
177
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -294,12 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
|
|
|
294
295
|
|---------------|-------------|------------|-------------|
|
|
295
296
|
| **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
|
|
296
297
|
| **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
|
|
298
|
+
| **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
|
|
297
299
|
|
|
298
300
|
### Example Directory Structure
|
|
299
301
|
```
|
|
300
302
|
__Summary__/
|
|
301
303
|
├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
|
|
302
304
|
├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
|
|
305
|
+
├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
|
|
303
306
|
├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
|
|
304
307
|
├── summary.fasta # All final consensus sequences (excludes .raw)
|
|
305
308
|
├── summary.txt # Statistics
|
|
@@ -810,6 +813,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
|
|
|
810
813
|
|
|
811
814
|
### Additional Summarize Options
|
|
812
815
|
|
|
816
|
+
**Full Consensus:**
|
|
817
|
+
```bash
|
|
818
|
+
speconsense-summarize --enable-full-consensus
|
|
819
|
+
```
|
|
820
|
+
- Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
|
|
821
|
+
- Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
|
|
822
|
+
- Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
|
|
823
|
+
- Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
|
|
824
|
+
- Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
|
|
825
|
+
- Enabled by default in the `compressed` profile
|
|
826
|
+
- Use `--disable-full-consensus` to override when set by a profile
|
|
827
|
+
|
|
813
828
|
**Quality Filtering:**
|
|
814
829
|
```bash
|
|
815
830
|
speconsense-summarize --min-ric 5
|
|
@@ -1045,7 +1060,8 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1045
1060
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1046
1061
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1047
1062
|
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1048
|
-
6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1063
|
+
6. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
|
|
1064
|
+
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1049
1065
|
|
|
1050
1066
|
**Key architectural features**:
|
|
1051
1067
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1098,17 +1114,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
|
|
|
1098
1114
|
[--min-cluster-ratio MIN_CLUSTER_RATIO]
|
|
1099
1115
|
[--max-sample-size MAX_SAMPLE_SIZE]
|
|
1100
1116
|
[--outlier-identity OUTLIER_IDENTITY]
|
|
1101
|
-
[--disable-position-phasing]
|
|
1117
|
+
[--disable-position-phasing] [--enable-position-phasing]
|
|
1102
1118
|
[--min-variant-frequency MIN_VARIANT_FREQUENCY]
|
|
1103
1119
|
[--min-variant-count MIN_VARIANT_COUNT]
|
|
1104
|
-
[--disable-ambiguity-calling]
|
|
1120
|
+
[--disable-ambiguity-calling] [--enable-ambiguity-calling]
|
|
1105
1121
|
[--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
|
|
1106
1122
|
[--min-ambiguity-count MIN_AMBIGUITY_COUNT]
|
|
1107
|
-
[--disable-cluster-merging]
|
|
1123
|
+
[--disable-cluster-merging] [--enable-cluster-merging]
|
|
1108
1124
|
[--disable-homopolymer-equivalence]
|
|
1125
|
+
[--enable-homopolymer-equivalence]
|
|
1109
1126
|
[--orient-mode {skip,keep-all,filter-failed}]
|
|
1110
1127
|
[--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
|
|
1111
|
-
[--threads N] [--enable-early-filter]
|
|
1128
|
+
[--threads N] [--enable-early-filter]
|
|
1129
|
+
[--disable-early-filter] [--collect-discards]
|
|
1130
|
+
[--no-collect-discards]
|
|
1112
1131
|
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
|
|
1113
1132
|
[--version] [-p NAME] [--list-profiles]
|
|
1114
1133
|
input_file
|
|
@@ -1167,6 +1186,8 @@ Variant Phasing:
|
|
|
1167
1186
|
default). MCL graph clustering already separates most
|
|
1168
1187
|
variants; this second pass analyzes MSA positions to
|
|
1169
1188
|
phase remaining variants.
|
|
1189
|
+
--enable-position-phasing
|
|
1190
|
+
Override --disable-position-phasing or profile setting
|
|
1170
1191
|
--min-variant-frequency MIN_VARIANT_FREQUENCY
|
|
1171
1192
|
Minimum alternative allele frequency to call variant
|
|
1172
1193
|
(default: 0.10 for 10%)
|
|
@@ -1178,6 +1199,9 @@ Ambiguity Calling:
|
|
|
1178
1199
|
--disable-ambiguity-calling
|
|
1179
1200
|
Disable IUPAC ambiguity code calling for unphased
|
|
1180
1201
|
variant positions
|
|
1202
|
+
--enable-ambiguity-calling
|
|
1203
|
+
Override --disable-ambiguity-calling or profile
|
|
1204
|
+
setting
|
|
1181
1205
|
--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
|
|
1182
1206
|
Minimum alternative allele frequency for IUPAC
|
|
1183
1207
|
ambiguity calling (default: 0.10 for 10%)
|
|
@@ -1189,9 +1213,14 @@ Cluster Merging:
|
|
|
1189
1213
|
--disable-cluster-merging
|
|
1190
1214
|
Disable merging of clusters with identical consensus
|
|
1191
1215
|
sequences
|
|
1216
|
+
--enable-cluster-merging
|
|
1217
|
+
Override --disable-cluster-merging or profile setting
|
|
1192
1218
|
--disable-homopolymer-equivalence
|
|
1193
1219
|
Disable homopolymer equivalence in cluster merging
|
|
1194
1220
|
(only merge identical sequences)
|
|
1221
|
+
--enable-homopolymer-equivalence
|
|
1222
|
+
Override --disable-homopolymer-equivalence or profile
|
|
1223
|
+
setting
|
|
1195
1224
|
|
|
1196
1225
|
Orientation:
|
|
1197
1226
|
--orient-mode {skip,keep-all,filter-failed}
|
|
@@ -1213,10 +1242,14 @@ Performance:
|
|
|
1213
1242
|
Enable early filtering to skip small clusters before
|
|
1214
1243
|
variant phasing (improves performance for large
|
|
1215
1244
|
datasets)
|
|
1245
|
+
--disable-early-filter
|
|
1246
|
+
Override --enable-early-filter or profile setting
|
|
1216
1247
|
|
|
1217
1248
|
Debugging:
|
|
1218
1249
|
--collect-discards Write discarded reads (outliers and filtered clusters)
|
|
1219
1250
|
to cluster_debug/{sample}-discards.fastq
|
|
1251
|
+
--no-collect-discards
|
|
1252
|
+
Override --collect-discards or profile setting
|
|
1220
1253
|
--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
|
|
1221
1254
|
```
|
|
1222
1255
|
|
|
@@ -1227,15 +1260,21 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1227
1260
|
[--summary-dir SUMMARY_DIR]
|
|
1228
1261
|
[--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
|
|
1229
1262
|
[--min-len MIN_LEN] [--max-len MAX_LEN]
|
|
1230
|
-
[--group-identity GROUP_IDENTITY]
|
|
1263
|
+
[--group-identity GROUP_IDENTITY]
|
|
1264
|
+
[--disable-merging] [--enable-merging]
|
|
1265
|
+
[--merge-snp | --no-merge-snp]
|
|
1231
1266
|
[--merge-indel-length MERGE_INDEL_LENGTH]
|
|
1232
1267
|
[--merge-position-count MERGE_POSITION_COUNT]
|
|
1233
1268
|
[--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
|
|
1234
1269
|
[--min-merge-overlap MIN_MERGE_OVERLAP]
|
|
1235
1270
|
[--disable-homopolymer-equivalence]
|
|
1271
|
+
[--enable-homopolymer-equivalence]
|
|
1272
|
+
[--merge-effort LEVEL]
|
|
1236
1273
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1237
1274
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1238
1275
|
[--select-strategy {size,diversity}]
|
|
1276
|
+
[--enable-full-consensus]
|
|
1277
|
+
[--disable-full-consensus]
|
|
1239
1278
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
1240
1279
|
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
|
|
1241
1280
|
[--version] [-p NAME] [--list-profiles]
|
|
@@ -1281,10 +1320,7 @@ Grouping:
|
|
|
1281
1320
|
Merging:
|
|
1282
1321
|
--disable-merging Disable all variant merging (skip MSA-based merge
|
|
1283
1322
|
evaluation entirely)
|
|
1284
|
-
--merge-effort LEVEL Merging effort level: fast (8), balanced (10),
|
|
1285
|
-
thorough (12), or numeric 6-14. Higher values allow
|
|
1286
|
-
larger batch sizes for exhaustive subset search.
|
|
1287
|
-
Default: balanced
|
|
1323
|
+
--enable-merging Override --disable-merging or profile setting
|
|
1288
1324
|
--merge-snp, --no-merge-snp
|
|
1289
1325
|
Enable SNP-based merging (default: True, use --no-
|
|
1290
1326
|
merge-snp to disable)
|
|
@@ -1303,6 +1339,13 @@ Merging:
|
|
|
1303
1339
|
--disable-homopolymer-equivalence
|
|
1304
1340
|
Disable homopolymer equivalence in merging (treat AAA
|
|
1305
1341
|
vs AAAA as different)
|
|
1342
|
+
--enable-homopolymer-equivalence
|
|
1343
|
+
Override --disable-homopolymer-equivalence or profile
|
|
1344
|
+
setting
|
|
1345
|
+
--merge-effort LEVEL Merging effort level: fast (8), balanced (10),
|
|
1346
|
+
thorough (12), or numeric 6-14. Higher values allow
|
|
1347
|
+
larger batch sizes for exhaustive subset search.
|
|
1348
|
+
Default: balanced
|
|
1306
1349
|
|
|
1307
1350
|
Selection:
|
|
1308
1351
|
--select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
|
|
@@ -1314,6 +1357,12 @@ Selection:
|
|
|
1314
1357
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1315
1358
|
Variant selection strategy: size or diversity
|
|
1316
1359
|
(default: size)
|
|
1360
|
+
--enable-full-consensus
|
|
1361
|
+
Generate a full consensus per variant group
|
|
1362
|
+
representing all variation from pre-merge variants
|
|
1363
|
+
(gaps never win)
|
|
1364
|
+
--disable-full-consensus
|
|
1365
|
+
Override --enable-full-consensus or profile setting
|
|
1317
1366
|
|
|
1318
1367
|
Performance:
|
|
1319
1368
|
--scale-threshold SCALE_THRESHOLD
|
|
@@ -136,6 +136,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
**Bundled profiles:**
|
|
139
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
139
140
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
140
141
|
- `largedata` — Experimental settings for large input files
|
|
141
142
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -259,12 +260,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
|
|
|
259
260
|
|---------------|-------------|------------|-------------|
|
|
260
261
|
| **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
|
|
261
262
|
| **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
|
|
263
|
+
| **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
|
|
262
264
|
|
|
263
265
|
### Example Directory Structure
|
|
264
266
|
```
|
|
265
267
|
__Summary__/
|
|
266
268
|
├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
|
|
267
269
|
├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
|
|
270
|
+
├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
|
|
268
271
|
├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
|
|
269
272
|
├── summary.fasta # All final consensus sequences (excludes .raw)
|
|
270
273
|
├── summary.txt # Statistics
|
|
@@ -775,6 +778,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
|
|
|
775
778
|
|
|
776
779
|
### Additional Summarize Options
|
|
777
780
|
|
|
781
|
+
**Full Consensus:**
|
|
782
|
+
```bash
|
|
783
|
+
speconsense-summarize --enable-full-consensus
|
|
784
|
+
```
|
|
785
|
+
- Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
|
|
786
|
+
- Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
|
|
787
|
+
- Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
|
|
788
|
+
- Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
|
|
789
|
+
- Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
|
|
790
|
+
- Enabled by default in the `compressed` profile
|
|
791
|
+
- Use `--disable-full-consensus` to override when set by a profile
|
|
792
|
+
|
|
778
793
|
**Quality Filtering:**
|
|
779
794
|
```bash
|
|
780
795
|
speconsense-summarize --min-ric 5
|
|
@@ -1010,7 +1025,8 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1010
1025
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1011
1026
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1012
1027
|
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1013
|
-
6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1028
|
+
6. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
|
|
1029
|
+
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1014
1030
|
|
|
1015
1031
|
**Key architectural features**:
|
|
1016
1032
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1063,17 +1079,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
|
|
|
1063
1079
|
[--min-cluster-ratio MIN_CLUSTER_RATIO]
|
|
1064
1080
|
[--max-sample-size MAX_SAMPLE_SIZE]
|
|
1065
1081
|
[--outlier-identity OUTLIER_IDENTITY]
|
|
1066
|
-
[--disable-position-phasing]
|
|
1082
|
+
[--disable-position-phasing] [--enable-position-phasing]
|
|
1067
1083
|
[--min-variant-frequency MIN_VARIANT_FREQUENCY]
|
|
1068
1084
|
[--min-variant-count MIN_VARIANT_COUNT]
|
|
1069
|
-
[--disable-ambiguity-calling]
|
|
1085
|
+
[--disable-ambiguity-calling] [--enable-ambiguity-calling]
|
|
1070
1086
|
[--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
|
|
1071
1087
|
[--min-ambiguity-count MIN_AMBIGUITY_COUNT]
|
|
1072
|
-
[--disable-cluster-merging]
|
|
1088
|
+
[--disable-cluster-merging] [--enable-cluster-merging]
|
|
1073
1089
|
[--disable-homopolymer-equivalence]
|
|
1090
|
+
[--enable-homopolymer-equivalence]
|
|
1074
1091
|
[--orient-mode {skip,keep-all,filter-failed}]
|
|
1075
1092
|
[--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
|
|
1076
|
-
[--threads N] [--enable-early-filter]
|
|
1093
|
+
[--threads N] [--enable-early-filter]
|
|
1094
|
+
[--disable-early-filter] [--collect-discards]
|
|
1095
|
+
[--no-collect-discards]
|
|
1077
1096
|
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
|
|
1078
1097
|
[--version] [-p NAME] [--list-profiles]
|
|
1079
1098
|
input_file
|
|
@@ -1132,6 +1151,8 @@ Variant Phasing:
|
|
|
1132
1151
|
default). MCL graph clustering already separates most
|
|
1133
1152
|
variants; this second pass analyzes MSA positions to
|
|
1134
1153
|
phase remaining variants.
|
|
1154
|
+
--enable-position-phasing
|
|
1155
|
+
Override --disable-position-phasing or profile setting
|
|
1135
1156
|
--min-variant-frequency MIN_VARIANT_FREQUENCY
|
|
1136
1157
|
Minimum alternative allele frequency to call variant
|
|
1137
1158
|
(default: 0.10 for 10%)
|
|
@@ -1143,6 +1164,9 @@ Ambiguity Calling:
|
|
|
1143
1164
|
--disable-ambiguity-calling
|
|
1144
1165
|
Disable IUPAC ambiguity code calling for unphased
|
|
1145
1166
|
variant positions
|
|
1167
|
+
--enable-ambiguity-calling
|
|
1168
|
+
Override --disable-ambiguity-calling or profile
|
|
1169
|
+
setting
|
|
1146
1170
|
--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
|
|
1147
1171
|
Minimum alternative allele frequency for IUPAC
|
|
1148
1172
|
ambiguity calling (default: 0.10 for 10%)
|
|
@@ -1154,9 +1178,14 @@ Cluster Merging:
|
|
|
1154
1178
|
--disable-cluster-merging
|
|
1155
1179
|
Disable merging of clusters with identical consensus
|
|
1156
1180
|
sequences
|
|
1181
|
+
--enable-cluster-merging
|
|
1182
|
+
Override --disable-cluster-merging or profile setting
|
|
1157
1183
|
--disable-homopolymer-equivalence
|
|
1158
1184
|
Disable homopolymer equivalence in cluster merging
|
|
1159
1185
|
(only merge identical sequences)
|
|
1186
|
+
--enable-homopolymer-equivalence
|
|
1187
|
+
Override --disable-homopolymer-equivalence or profile
|
|
1188
|
+
setting
|
|
1160
1189
|
|
|
1161
1190
|
Orientation:
|
|
1162
1191
|
--orient-mode {skip,keep-all,filter-failed}
|
|
@@ -1178,10 +1207,14 @@ Performance:
|
|
|
1178
1207
|
Enable early filtering to skip small clusters before
|
|
1179
1208
|
variant phasing (improves performance for large
|
|
1180
1209
|
datasets)
|
|
1210
|
+
--disable-early-filter
|
|
1211
|
+
Override --enable-early-filter or profile setting
|
|
1181
1212
|
|
|
1182
1213
|
Debugging:
|
|
1183
1214
|
--collect-discards Write discarded reads (outliers and filtered clusters)
|
|
1184
1215
|
to cluster_debug/{sample}-discards.fastq
|
|
1216
|
+
--no-collect-discards
|
|
1217
|
+
Override --collect-discards or profile setting
|
|
1185
1218
|
--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
|
|
1186
1219
|
```
|
|
1187
1220
|
|
|
@@ -1192,15 +1225,21 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1192
1225
|
[--summary-dir SUMMARY_DIR]
|
|
1193
1226
|
[--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
|
|
1194
1227
|
[--min-len MIN_LEN] [--max-len MAX_LEN]
|
|
1195
|
-
[--group-identity GROUP_IDENTITY]
|
|
1228
|
+
[--group-identity GROUP_IDENTITY]
|
|
1229
|
+
[--disable-merging] [--enable-merging]
|
|
1230
|
+
[--merge-snp | --no-merge-snp]
|
|
1196
1231
|
[--merge-indel-length MERGE_INDEL_LENGTH]
|
|
1197
1232
|
[--merge-position-count MERGE_POSITION_COUNT]
|
|
1198
1233
|
[--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
|
|
1199
1234
|
[--min-merge-overlap MIN_MERGE_OVERLAP]
|
|
1200
1235
|
[--disable-homopolymer-equivalence]
|
|
1236
|
+
[--enable-homopolymer-equivalence]
|
|
1237
|
+
[--merge-effort LEVEL]
|
|
1201
1238
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1202
1239
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1203
1240
|
[--select-strategy {size,diversity}]
|
|
1241
|
+
[--enable-full-consensus]
|
|
1242
|
+
[--disable-full-consensus]
|
|
1204
1243
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
1205
1244
|
[--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
|
|
1206
1245
|
[--version] [-p NAME] [--list-profiles]
|
|
@@ -1246,10 +1285,7 @@ Grouping:
|
|
|
1246
1285
|
Merging:
|
|
1247
1286
|
--disable-merging Disable all variant merging (skip MSA-based merge
|
|
1248
1287
|
evaluation entirely)
|
|
1249
|
-
--merge-effort LEVEL Merging effort level: fast (8), balanced (10),
|
|
1250
|
-
thorough (12), or numeric 6-14. Higher values allow
|
|
1251
|
-
larger batch sizes for exhaustive subset search.
|
|
1252
|
-
Default: balanced
|
|
1288
|
+
--enable-merging Override --disable-merging or profile setting
|
|
1253
1289
|
--merge-snp, --no-merge-snp
|
|
1254
1290
|
Enable SNP-based merging (default: True, use --no-
|
|
1255
1291
|
merge-snp to disable)
|
|
@@ -1268,6 +1304,13 @@ Merging:
|
|
|
1268
1304
|
--disable-homopolymer-equivalence
|
|
1269
1305
|
Disable homopolymer equivalence in merging (treat AAA
|
|
1270
1306
|
vs AAAA as different)
|
|
1307
|
+
--enable-homopolymer-equivalence
|
|
1308
|
+
Override --disable-homopolymer-equivalence or profile
|
|
1309
|
+
setting
|
|
1310
|
+
--merge-effort LEVEL Merging effort level: fast (8), balanced (10),
|
|
1311
|
+
thorough (12), or numeric 6-14. Higher values allow
|
|
1312
|
+
larger batch sizes for exhaustive subset search.
|
|
1313
|
+
Default: balanced
|
|
1271
1314
|
|
|
1272
1315
|
Selection:
|
|
1273
1316
|
--select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
|
|
@@ -1279,6 +1322,12 @@ Selection:
|
|
|
1279
1322
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1280
1323
|
Variant selection strategy: size or diversity
|
|
1281
1324
|
(default: size)
|
|
1325
|
+
--enable-full-consensus
|
|
1326
|
+
Generate a full consensus per variant group
|
|
1327
|
+
representing all variation from pre-merge variants
|
|
1328
|
+
(gaps never win)
|
|
1329
|
+
--disable-full-consensus
|
|
1330
|
+
Override --enable-full-consensus or profile setting
|
|
1282
1331
|
|
|
1283
1332
|
Performance:
|
|
1284
1333
|
--scale-threshold SCALE_THRESHOLD
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "speconsense"
|
|
7
|
-
version = "0.7.2"
|
|
7
|
+
version = "0.7.3"
|
|
8
8
|
description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -66,6 +66,9 @@ def main():
|
|
|
66
66
|
help="Disable position-based variant phasing (enabled by default). "
|
|
67
67
|
"MCL graph clustering already separates most variants; this "
|
|
68
68
|
"second pass analyzes MSA positions to phase remaining variants.")
|
|
69
|
+
phasing_group.add_argument("--enable-position-phasing", action="store_false",
|
|
70
|
+
dest="disable_position_phasing",
|
|
71
|
+
help="Override --disable-position-phasing or profile setting")
|
|
69
72
|
phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
|
|
70
73
|
help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
|
|
71
74
|
phasing_group.add_argument("--min-variant-count", type=int, default=5,
|
|
@@ -75,6 +78,9 @@ def main():
|
|
|
75
78
|
ambiguity_group = parser.add_argument_group("Ambiguity Calling")
|
|
76
79
|
ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
|
|
77
80
|
help="Disable IUPAC ambiguity code calling for unphased variant positions")
|
|
81
|
+
ambiguity_group.add_argument("--enable-ambiguity-calling", action="store_false",
|
|
82
|
+
dest="disable_ambiguity_calling",
|
|
83
|
+
help="Override --disable-ambiguity-calling or profile setting")
|
|
78
84
|
ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
|
|
79
85
|
help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
|
|
80
86
|
ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
|
|
@@ -84,8 +90,14 @@ def main():
|
|
|
84
90
|
merging_group = parser.add_argument_group("Cluster Merging")
|
|
85
91
|
merging_group.add_argument("--disable-cluster-merging", action="store_true",
|
|
86
92
|
help="Disable merging of clusters with identical consensus sequences")
|
|
93
|
+
merging_group.add_argument("--enable-cluster-merging", action="store_false",
|
|
94
|
+
dest="disable_cluster_merging",
|
|
95
|
+
help="Override --disable-cluster-merging or profile setting")
|
|
87
96
|
merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
|
|
88
97
|
help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")
|
|
98
|
+
merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
|
|
99
|
+
dest="disable_homopolymer_equivalence",
|
|
100
|
+
help="Override --disable-homopolymer-equivalence or profile setting")
|
|
89
101
|
|
|
90
102
|
# Orientation group
|
|
91
103
|
orient_group = parser.add_argument_group("Orientation")
|
|
@@ -104,11 +116,17 @@ def main():
|
|
|
104
116
|
"0=auto-detect, default=1 (safe for parallel workflows).")
|
|
105
117
|
perf_group.add_argument("--enable-early-filter", action="store_true",
|
|
106
118
|
help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")
|
|
119
|
+
perf_group.add_argument("--disable-early-filter", action="store_false",
|
|
120
|
+
dest="enable_early_filter",
|
|
121
|
+
help="Override --enable-early-filter or profile setting")
|
|
107
122
|
|
|
108
123
|
# Debugging group
|
|
109
124
|
debug_group = parser.add_argument_group("Debugging")
|
|
110
125
|
debug_group.add_argument("--collect-discards", action="store_true",
|
|
111
126
|
help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
|
|
127
|
+
debug_group.add_argument("--no-collect-discards", action="store_false",
|
|
128
|
+
dest="collect_discards",
|
|
129
|
+
help="Override --collect-discards or profile setting")
|
|
112
130
|
debug_group.add_argument("--log-level", default="INFO",
|
|
113
131
|
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
|
|
114
132
|
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Compress variants into minimal IUPAC consensus sequences
|
|
2
|
+
#
|
|
3
|
+
# Aggressively merges similar variants (including indels) into single
|
|
4
|
+
# IUPAC consensus sequences. Only truly dissimilar sequences remain
|
|
5
|
+
# separate. Uses 20% frequency thresholds throughout.
|
|
6
|
+
#
|
|
7
|
+
# Designed for workflows where reviewers want fewer sequences to
|
|
8
|
+
# examine, with all variation represented via IUPAC ambiguity codes.
|
|
9
|
+
# Partial overlap merging is disabled as a safety measure.
|
|
10
|
+
#
|
|
11
|
+
# Use with:
|
|
12
|
+
# speconsense input.fastq -p compressed
|
|
13
|
+
# speconsense-summarize -p compressed
|
|
14
|
+
|
|
15
|
+
speconsense-version: "0.7.*"
|
|
16
|
+
description: "Compress variants into minimal IUPAC consensus sequences"
|
|
17
|
+
|
|
18
|
+
speconsense:
|
|
19
|
+
min-ambiguity-frequency: 0.20 # 20% threshold for IUPAC ambiguity calling
|
|
20
|
+
min-variant-frequency: 0.20 # 20% threshold for variant phasing
|
|
21
|
+
|
|
22
|
+
speconsense-summarize:
|
|
23
|
+
merge-indel-length: 5 # Merge indels up to 5bp
|
|
24
|
+
merge-position-count: 10 # Allow up to 10 variant positions in a merge
|
|
25
|
+
merge-min-size-ratio: 0.2 # Match 20% calling threshold
|
|
26
|
+
min-merge-overlap: 0 # Disable partial overlap merging
|
|
27
|
+
enable-full-consensus: true # Include full IUPAC consensus per group
|
|
@@ -54,8 +54,8 @@ from .io import (
|
|
|
54
54
|
write_output_files,
|
|
55
55
|
)
|
|
56
56
|
from .clustering import perform_hac_clustering, select_variants
|
|
57
|
-
from .merging import merge_group_with_msa
|
|
58
|
-
from .analysis import MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
|
|
57
|
+
from .merging import merge_group_with_msa, create_full_consensus_from_msa
|
|
58
|
+
from .analysis import run_spoa_msa, MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
# Merge effort configuration
|
|
@@ -132,6 +132,8 @@ def parse_arguments():
|
|
|
132
132
|
merging_group = parser.add_argument_group("Merging")
|
|
133
133
|
merging_group.add_argument("--disable-merging", action="store_true",
|
|
134
134
|
help="Disable all variant merging (skip MSA-based merge evaluation entirely)")
|
|
135
|
+
merging_group.add_argument("--enable-merging", action="store_false", dest="disable_merging",
|
|
136
|
+
help="Override --disable-merging or profile setting")
|
|
135
137
|
merging_group.add_argument("--merge-snp", action=argparse.BooleanOptionalAction, default=True,
|
|
136
138
|
help="Enable SNP-based merging (default: True, use --no-merge-snp to disable)")
|
|
137
139
|
merging_group.add_argument("--merge-indel-length", type=int, default=0,
|
|
@@ -144,6 +146,9 @@ def parse_arguments():
|
|
|
144
146
|
help="Minimum overlap in bp for merging sequences of different lengths (default: 200, 0 to disable)")
|
|
145
147
|
merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
|
|
146
148
|
help="Disable homopolymer equivalence in merging (treat AAA vs AAAA as different)")
|
|
149
|
+
merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
|
|
150
|
+
dest="disable_homopolymer_equivalence",
|
|
151
|
+
help="Override --disable-homopolymer-equivalence or profile setting")
|
|
147
152
|
merging_group.add_argument("--merge-effort", type=str, default="balanced", metavar="LEVEL",
|
|
148
153
|
help="Merging effort level: fast (8), balanced (10), thorough (12), "
|
|
149
154
|
"or numeric 6-14. Higher values allow larger batch sizes for "
|
|
@@ -164,6 +169,12 @@ def parse_arguments():
|
|
|
164
169
|
selection_group.add_argument("--select-strategy", "--variant-selection",
|
|
165
170
|
dest="select_strategy", choices=["size", "diversity"], default="size",
|
|
166
171
|
help="Variant selection strategy: size or diversity (default: size)")
|
|
172
|
+
selection_group.add_argument("--enable-full-consensus", action="store_true",
|
|
173
|
+
help="Generate a full consensus per variant group representing all variation "
|
|
174
|
+
"from pre-merge variants (gaps never win)")
|
|
175
|
+
selection_group.add_argument("--disable-full-consensus", action="store_false",
|
|
176
|
+
dest="enable_full_consensus",
|
|
177
|
+
help="Override --enable-full-consensus or profile setting")
|
|
167
178
|
|
|
168
179
|
# Performance group
|
|
169
180
|
perf_group = parser.add_argument_group("Performance")
|
|
@@ -345,7 +356,7 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
345
356
|
key=lambda x: max(m.size for m in x[1]),
|
|
346
357
|
reverse=True)
|
|
347
358
|
|
|
348
|
-
for group_idx, (
|
|
359
|
+
for group_idx, (group_id, group_members) in enumerate(sorted_groups):
|
|
349
360
|
final_group_name = group_idx + 1
|
|
350
361
|
|
|
351
362
|
# Select variants for this group
|
|
@@ -366,6 +377,24 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
366
377
|
final_consensus.append(renamed_variant)
|
|
367
378
|
group_naming.append((variant.sample_name, new_name))
|
|
368
379
|
|
|
380
|
+
# Generate full consensus from PRE-MERGE variants
|
|
381
|
+
if getattr(args, 'enable_full_consensus', False):
|
|
382
|
+
pre_merge_variants = variant_groups[group_id]
|
|
383
|
+
specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
|
|
384
|
+
full_name = f"{specimen_base}-{group_idx + 1}.full"
|
|
385
|
+
|
|
386
|
+
if len(pre_merge_variants) == 1:
|
|
387
|
+
# Single variant — copy directly
|
|
388
|
+
full_consensus = pre_merge_variants[0]._replace(sample_name=full_name)
|
|
389
|
+
else:
|
|
390
|
+
# MSA on pre-merge variants, full consensus logic
|
|
391
|
+
sequences = [v.sequence for v in pre_merge_variants]
|
|
392
|
+
aligned_seqs = run_spoa_msa(sequences, alignment_mode=1)
|
|
393
|
+
full_consensus = create_full_consensus_from_msa(aligned_seqs, pre_merge_variants)
|
|
394
|
+
full_consensus = full_consensus._replace(sample_name=full_name)
|
|
395
|
+
|
|
396
|
+
final_consensus.append(full_consensus)
|
|
397
|
+
|
|
369
398
|
naming_info[group_idx + 1] = group_naming
|
|
370
399
|
|
|
371
400
|
logging.info(f"Processed {file_name}: {len(final_consensus)} final variants across {len(merged_groups)} groups")
|
|
@@ -421,6 +450,7 @@ def main():
|
|
|
421
450
|
logging.info(f" --select-max-variants: {args.select_max_variants}")
|
|
422
451
|
logging.info(f" --select-max-groups: {args.select_max_groups}")
|
|
423
452
|
logging.info(f" --select-strategy: {args.select_strategy}")
|
|
453
|
+
logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
|
|
424
454
|
logging.info(f" --log-level: {args.log_level}")
|
|
425
455
|
logging.info("")
|
|
426
456
|
logging.info("Processing each specimen file independently to organize variants within specimens")
|
|
@@ -124,8 +124,8 @@ class GroupField(FastaField):
|
|
|
124
124
|
super().__init__('group', 'Variant group number')
|
|
125
125
|
|
|
126
126
|
def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
|
|
127
|
-
# Extract from sample_name (e.g., "...-1.v1"
|
|
128
|
-
match = re.search(r'-(\d+)
|
|
127
|
+
# Extract from sample_name (e.g., "...-1.v1", "...-2.v1.raw1", or "...-1.full")
|
|
128
|
+
match = re.search(r'-(\d+)(?:\.v\d+(?:\.raw\d+)?|\.full)$', consensus.sample_name)
|
|
129
129
|
if match:
|
|
130
130
|
return f"group={match.group(1)}"
|
|
131
131
|
return None
|
|
@@ -136,8 +136,10 @@ class VariantField(FastaField):
|
|
|
136
136
|
super().__init__('variant', 'Variant identifier within group')
|
|
137
137
|
|
|
138
138
|
def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
|
|
139
|
-
# Extract from sample_name (e.g., "...-1.v1" -> "v1"
|
|
139
|
+
# Extract from sample_name (e.g., "...-1.v1" -> "v1", "...-1.v1.raw1" -> "v1", "...-1.full" -> "full")
|
|
140
140
|
match = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
|
|
141
|
+
if not match:
|
|
142
|
+
match = re.search(r'\.(full)$', consensus.sample_name)
|
|
141
143
|
if match:
|
|
142
144
|
return f"variant={match.group(1)}"
|
|
143
145
|
return None
|
|
@@ -358,6 +358,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
|
|
|
358
358
|
# Generate .raw file consensuses for merged variants
|
|
359
359
|
raw_file_consensuses = []
|
|
360
360
|
for consensus in specimen_consensus:
|
|
361
|
+
# Skip .raw generation for .full consensus (synthetic/derived)
|
|
362
|
+
if consensus.sample_name.endswith('.full'):
|
|
363
|
+
continue
|
|
361
364
|
# Only create .raw files if this consensus was actually merged
|
|
362
365
|
if consensus.raw_ric and len(consensus.raw_ric) > 1:
|
|
363
366
|
# Find the original cluster name from naming_info
|
|
@@ -412,6 +415,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
|
|
|
412
415
|
|
|
413
416
|
# Write FASTQ files for each final consensus containing all contributing reads
|
|
414
417
|
for consensus in specimen_consensus:
|
|
418
|
+
# Skip FASTQ for .full consensus (synthetic/derived, no traceable cluster reads)
|
|
419
|
+
if consensus.sample_name.endswith('.full'):
|
|
420
|
+
continue
|
|
415
421
|
write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)
|
|
416
422
|
|
|
417
423
|
# Write .raw files (individual FASTA and FASTQ for pre-merge variants)
|
|
@@ -704,7 +710,10 @@ def write_output_files(final_consensus: List[ConsensusInfo],
|
|
|
704
710
|
multiple_id = specimen_counters[base_name]
|
|
705
711
|
writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
|
|
706
712
|
unique_samples.add(base_name)
|
|
707
|
-
|
|
713
|
+
# Exclude .full from total RiC to avoid double-counting
|
|
714
|
+
# (.full aggregates reads already counted in merged variants)
|
|
715
|
+
if not consensus.sample_name.endswith('.full'):
|
|
716
|
+
total_ric += consensus.ric
|
|
708
717
|
|
|
709
718
|
writer.writerow([])
|
|
710
719
|
writer.writerow(['Total Unique Samples', len(unique_samples)])
|