speconsense 0.7.2__tar.gz → 0.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {speconsense-0.7.2/speconsense.egg-info → speconsense-0.7.3}/PKG-INFO +60 -11
  2. {speconsense-0.7.2 → speconsense-0.7.3}/README.md +59 -10
  3. {speconsense-0.7.2 → speconsense-0.7.3}/pyproject.toml +1 -1
  4. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/__init__.py +1 -1
  5. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/cli.py +18 -0
  6. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/__init__.py +1 -0
  7. speconsense-0.7.3/speconsense/profiles/compressed.yaml +27 -0
  8. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/cli.py +33 -3
  9. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/fields.py +5 -3
  10. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/io.py +10 -1
  11. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/merging.py +97 -76
  12. {speconsense-0.7.2 → speconsense-0.7.3/speconsense.egg-info}/PKG-INFO +60 -11
  13. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/SOURCES.txt +2 -0
  14. speconsense-0.7.3/tests/test_complement_flags.py +180 -0
  15. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_summarize.py +203 -0
  16. {speconsense-0.7.2 → speconsense-0.7.3}/LICENSE +0 -0
  17. {speconsense-0.7.2 → speconsense-0.7.3}/setup.cfg +0 -0
  18. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/cli.py +0 -0
  19. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/__init__.py +0 -0
  20. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/__main__.py +0 -0
  21. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/clusterer.py +0 -0
  22. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/core/workers.py +0 -0
  23. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/msa.py +0 -0
  24. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/example.yaml +0 -0
  25. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/herbarium.yaml +0 -0
  26. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/largedata.yaml +0 -0
  27. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/nostalgia.yaml +0 -0
  28. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/profiles/strict.yaml +0 -0
  29. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/quality_report.py +0 -0
  30. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/__init__.py +0 -0
  31. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/base.py +0 -0
  32. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/config.py +0 -0
  33. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/scalability/vsearch.py +0 -0
  34. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/__init__.py +0 -0
  35. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/__main__.py +0 -0
  36. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/analysis.py +0 -0
  37. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/clustering.py +0 -0
  38. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/summarize/iupac.py +0 -0
  39. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/synth.py +0 -0
  40. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense/types.py +0 -0
  41. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/dependency_links.txt +0 -0
  42. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/entry_points.txt +0 -0
  43. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/requires.txt +0 -0
  44. {speconsense-0.7.2 → speconsense-0.7.3}/speconsense.egg-info/top_level.txt +0 -0
  45. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_ambiguity_calling.py +0 -0
  46. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_augment_input.py +0 -0
  47. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_fields.py +0 -0
  48. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_haplotype_filtering.py +0 -0
  49. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_orientation.py +0 -0
  50. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_overlap_merge.py +0 -0
  51. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_overlap_merge_integration.py +0 -0
  52. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_profiles.py +0 -0
  53. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_regression.py +0 -0
  54. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_synth.py +0 -0
  55. {speconsense-0.7.2 → speconsense-0.7.3}/tests/test_variant_phasing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.2
3
+ Version: 0.7.3
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,6 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
175
  - `herbarium` — High-recall for degraded DNA/type specimens
175
176
  - `largedata` — Experimental settings for large input files
176
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -294,12 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
294
295
  |---------------|-------------|------------|-------------|
295
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
296
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
297
299
 
298
300
  ### Example Directory Structure
299
301
  ```
300
302
  __Summary__/
301
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
302
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
303
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
304
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
305
308
  ├── summary.txt # Statistics
@@ -810,6 +813,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
810
813
 
811
814
  ### Additional Summarize Options
812
815
 
816
+ **Full Consensus:**
817
+ ```bash
818
+ speconsense-summarize --enable-full-consensus
819
+ ```
820
+ - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
821
+ - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
822
+ - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
823
+ - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
824
+ - Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
825
+ - Enabled by default in the `compressed` profile
826
+ - Use `--disable-full-consensus` to override when set by a profile
827
+
813
828
  **Quality Filtering:**
814
829
  ```bash
815
830
  speconsense-summarize --min-ric 5
@@ -1045,7 +1060,8 @@ The complete speconsense-summarize workflow operates in this order:
1045
1060
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1046
1061
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1047
1062
  5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1048
- 6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1063
+ 6. **Full consensus generation** (optional): builds an IUPAC consensus from all pre-merge variants in each group (`--enable-full-consensus`)
1064
+ 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1049
1065
 
1050
1066
  **Key architectural features**:
1051
1067
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1098,17 +1114,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
1098
1114
  [--min-cluster-ratio MIN_CLUSTER_RATIO]
1099
1115
  [--max-sample-size MAX_SAMPLE_SIZE]
1100
1116
  [--outlier-identity OUTLIER_IDENTITY]
1101
- [--disable-position-phasing]
1117
+ [--disable-position-phasing] [--enable-position-phasing]
1102
1118
  [--min-variant-frequency MIN_VARIANT_FREQUENCY]
1103
1119
  [--min-variant-count MIN_VARIANT_COUNT]
1104
- [--disable-ambiguity-calling]
1120
+ [--disable-ambiguity-calling] [--enable-ambiguity-calling]
1105
1121
  [--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
1106
1122
  [--min-ambiguity-count MIN_AMBIGUITY_COUNT]
1107
- [--disable-cluster-merging]
1123
+ [--disable-cluster-merging] [--enable-cluster-merging]
1108
1124
  [--disable-homopolymer-equivalence]
1125
+ [--enable-homopolymer-equivalence]
1109
1126
  [--orient-mode {skip,keep-all,filter-failed}]
1110
1127
  [--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
1111
- [--threads N] [--enable-early-filter] [--collect-discards]
1128
+ [--threads N] [--enable-early-filter]
1129
+ [--disable-early-filter] [--collect-discards]
1130
+ [--no-collect-discards]
1112
1131
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1113
1132
  [--version] [-p NAME] [--list-profiles]
1114
1133
  input_file
@@ -1167,6 +1186,8 @@ Variant Phasing:
1167
1186
  default). MCL graph clustering already separates most
1168
1187
  variants; this second pass analyzes MSA positions to
1169
1188
  phase remaining variants.
1189
+ --enable-position-phasing
1190
+ Override --disable-position-phasing or profile setting
1170
1191
  --min-variant-frequency MIN_VARIANT_FREQUENCY
1171
1192
  Minimum alternative allele frequency to call variant
1172
1193
  (default: 0.10 for 10%)
@@ -1178,6 +1199,9 @@ Ambiguity Calling:
1178
1199
  --disable-ambiguity-calling
1179
1200
  Disable IUPAC ambiguity code calling for unphased
1180
1201
  variant positions
1202
+ --enable-ambiguity-calling
1203
+ Override --disable-ambiguity-calling or profile
1204
+ setting
1181
1205
  --min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
1182
1206
  Minimum alternative allele frequency for IUPAC
1183
1207
  ambiguity calling (default: 0.10 for 10%)
@@ -1189,9 +1213,14 @@ Cluster Merging:
1189
1213
  --disable-cluster-merging
1190
1214
  Disable merging of clusters with identical consensus
1191
1215
  sequences
1216
+ --enable-cluster-merging
1217
+ Override --disable-cluster-merging or profile setting
1192
1218
  --disable-homopolymer-equivalence
1193
1219
  Disable homopolymer equivalence in cluster merging
1194
1220
  (only merge identical sequences)
1221
+ --enable-homopolymer-equivalence
1222
+ Override --disable-homopolymer-equivalence or profile
1223
+ setting
1195
1224
 
1196
1225
  Orientation:
1197
1226
  --orient-mode {skip,keep-all,filter-failed}
@@ -1213,10 +1242,14 @@ Performance:
1213
1242
  Enable early filtering to skip small clusters before
1214
1243
  variant phasing (improves performance for large
1215
1244
  datasets)
1245
+ --disable-early-filter
1246
+ Override --enable-early-filter or profile setting
1216
1247
 
1217
1248
  Debugging:
1218
1249
  --collect-discards Write discarded reads (outliers and filtered clusters)
1219
1250
  to cluster_debug/{sample}-discards.fastq
1251
+ --no-collect-discards
1252
+ Override --collect-discards or profile setting
1220
1253
  --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
1221
1254
  ```
1222
1255
 
@@ -1227,15 +1260,21 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1227
1260
  [--summary-dir SUMMARY_DIR]
1228
1261
  [--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
1229
1262
  [--min-len MIN_LEN] [--max-len MAX_LEN]
1230
- [--group-identity GROUP_IDENTITY] [--merge-snp]
1263
+ [--group-identity GROUP_IDENTITY]
1264
+ [--disable-merging] [--enable-merging]
1265
+ [--merge-snp | --no-merge-snp]
1231
1266
  [--merge-indel-length MERGE_INDEL_LENGTH]
1232
1267
  [--merge-position-count MERGE_POSITION_COUNT]
1233
1268
  [--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
1234
1269
  [--min-merge-overlap MIN_MERGE_OVERLAP]
1235
1270
  [--disable-homopolymer-equivalence]
1271
+ [--enable-homopolymer-equivalence]
1272
+ [--merge-effort LEVEL]
1236
1273
  [--select-max-groups SELECT_MAX_GROUPS]
1237
1274
  [--select-max-variants SELECT_MAX_VARIANTS]
1238
1275
  [--select-strategy {size,diversity}]
1276
+ [--enable-full-consensus]
1277
+ [--disable-full-consensus]
1239
1278
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
1240
1279
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1241
1280
  [--version] [-p NAME] [--list-profiles]
@@ -1281,10 +1320,7 @@ Grouping:
1281
1320
  Merging:
1282
1321
  --disable-merging Disable all variant merging (skip MSA-based merge
1283
1322
  evaluation entirely)
1284
- --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1285
- thorough (12), or numeric 6-14. Higher values allow
1286
- larger batch sizes for exhaustive subset search.
1287
- Default: balanced
1323
+ --enable-merging Override --disable-merging or profile setting
1288
1324
  --merge-snp, --no-merge-snp
1289
1325
  Enable SNP-based merging (default: True, use --no-
1290
1326
  merge-snp to disable)
@@ -1303,6 +1339,13 @@ Merging:
1303
1339
  --disable-homopolymer-equivalence
1304
1340
  Disable homopolymer equivalence in merging (treat AAA
1305
1341
  vs AAAA as different)
1342
+ --enable-homopolymer-equivalence
1343
+ Override --disable-homopolymer-equivalence or profile
1344
+ setting
1345
+ --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1346
+ thorough (12), or numeric 6-14. Higher values allow
1347
+ larger batch sizes for exhaustive subset search.
1348
+ Default: balanced
1306
1349
 
1307
1350
  Selection:
1308
1351
  --select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
@@ -1314,6 +1357,12 @@ Selection:
1314
1357
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1315
1358
  Variant selection strategy: size or diversity
1316
1359
  (default: size)
1360
+ --enable-full-consensus
1361
+ Generate a full consensus per variant group
1362
+ representing all variation from pre-merge variants
1363
+ (gaps never win)
1364
+ --disable-full-consensus
1365
+ Override --enable-full-consensus or profile setting
1317
1366
 
1318
1367
  Performance:
1319
1368
  --scale-threshold SCALE_THRESHOLD
@@ -136,6 +136,7 @@ speconsense input.fastq -p herbarium --min-size 10
136
136
  ```
137
137
 
138
138
  **Bundled profiles:**
139
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
139
140
  - `herbarium` — High-recall for degraded DNA/type specimens
140
141
  - `largedata` — Experimental settings for large input files
141
142
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -259,12 +260,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
259
260
  |---------------|-------------|------------|-------------|
260
261
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
261
262
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
263
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
262
264
 
263
265
  ### Example Directory Structure
264
266
  ```
265
267
  __Summary__/
266
268
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
267
269
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
270
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
268
271
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
269
272
  ├── summary.fasta # All final consensus sequences (excludes .raw)
270
273
  ├── summary.txt # Statistics
@@ -775,6 +778,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
775
778
 
776
779
  ### Additional Summarize Options
777
780
 
781
+ **Full Consensus:**
782
+ ```bash
783
+ speconsense-summarize --enable-full-consensus
784
+ ```
785
+ - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
786
+ - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
787
+ - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
788
+ - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
789
+ - Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
790
+ - Enabled by default in the `compressed` profile
791
+ - Use `--disable-full-consensus` to override when set by a profile
792
+
778
793
  **Quality Filtering:**
779
794
  ```bash
780
795
  speconsense-summarize --min-ric 5
@@ -1010,7 +1025,8 @@ The complete speconsense-summarize workflow operates in this order:
1010
1025
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1011
1026
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1012
1027
  5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1013
- 6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1028
+ 6. **Full consensus generation** (optional): builds an IUPAC consensus from all pre-merge variants in each group (`--enable-full-consensus`)
1029
+ 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1014
1030
 
1015
1031
  **Key architectural features**:
1016
1032
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1063,17 +1079,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
1063
1079
  [--min-cluster-ratio MIN_CLUSTER_RATIO]
1064
1080
  [--max-sample-size MAX_SAMPLE_SIZE]
1065
1081
  [--outlier-identity OUTLIER_IDENTITY]
1066
- [--disable-position-phasing]
1082
+ [--disable-position-phasing] [--enable-position-phasing]
1067
1083
  [--min-variant-frequency MIN_VARIANT_FREQUENCY]
1068
1084
  [--min-variant-count MIN_VARIANT_COUNT]
1069
- [--disable-ambiguity-calling]
1085
+ [--disable-ambiguity-calling] [--enable-ambiguity-calling]
1070
1086
  [--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
1071
1087
  [--min-ambiguity-count MIN_AMBIGUITY_COUNT]
1072
- [--disable-cluster-merging]
1088
+ [--disable-cluster-merging] [--enable-cluster-merging]
1073
1089
  [--disable-homopolymer-equivalence]
1090
+ [--enable-homopolymer-equivalence]
1074
1091
  [--orient-mode {skip,keep-all,filter-failed}]
1075
1092
  [--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
1076
- [--threads N] [--enable-early-filter] [--collect-discards]
1093
+ [--threads N] [--enable-early-filter]
1094
+ [--disable-early-filter] [--collect-discards]
1095
+ [--no-collect-discards]
1077
1096
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1078
1097
  [--version] [-p NAME] [--list-profiles]
1079
1098
  input_file
@@ -1132,6 +1151,8 @@ Variant Phasing:
1132
1151
  default). MCL graph clustering already separates most
1133
1152
  variants; this second pass analyzes MSA positions to
1134
1153
  phase remaining variants.
1154
+ --enable-position-phasing
1155
+ Override --disable-position-phasing or profile setting
1135
1156
  --min-variant-frequency MIN_VARIANT_FREQUENCY
1136
1157
  Minimum alternative allele frequency to call variant
1137
1158
  (default: 0.10 for 10%)
@@ -1143,6 +1164,9 @@ Ambiguity Calling:
1143
1164
  --disable-ambiguity-calling
1144
1165
  Disable IUPAC ambiguity code calling for unphased
1145
1166
  variant positions
1167
+ --enable-ambiguity-calling
1168
+ Override --disable-ambiguity-calling or profile
1169
+ setting
1146
1170
  --min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
1147
1171
  Minimum alternative allele frequency for IUPAC
1148
1172
  ambiguity calling (default: 0.10 for 10%)
@@ -1154,9 +1178,14 @@ Cluster Merging:
1154
1178
  --disable-cluster-merging
1155
1179
  Disable merging of clusters with identical consensus
1156
1180
  sequences
1181
+ --enable-cluster-merging
1182
+ Override --disable-cluster-merging or profile setting
1157
1183
  --disable-homopolymer-equivalence
1158
1184
  Disable homopolymer equivalence in cluster merging
1159
1185
  (only merge identical sequences)
1186
+ --enable-homopolymer-equivalence
1187
+ Override --disable-homopolymer-equivalence or profile
1188
+ setting
1160
1189
 
1161
1190
  Orientation:
1162
1191
  --orient-mode {skip,keep-all,filter-failed}
@@ -1178,10 +1207,14 @@ Performance:
1178
1207
  Enable early filtering to skip small clusters before
1179
1208
  variant phasing (improves performance for large
1180
1209
  datasets)
1210
+ --disable-early-filter
1211
+ Override --enable-early-filter or profile setting
1181
1212
 
1182
1213
  Debugging:
1183
1214
  --collect-discards Write discarded reads (outliers and filtered clusters)
1184
1215
  to cluster_debug/{sample}-discards.fastq
1216
+ --no-collect-discards
1217
+ Override --collect-discards or profile setting
1185
1218
  --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
1186
1219
  ```
1187
1220
 
@@ -1192,15 +1225,21 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1192
1225
  [--summary-dir SUMMARY_DIR]
1193
1226
  [--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
1194
1227
  [--min-len MIN_LEN] [--max-len MAX_LEN]
1195
- [--group-identity GROUP_IDENTITY] [--merge-snp]
1228
+ [--group-identity GROUP_IDENTITY]
1229
+ [--disable-merging] [--enable-merging]
1230
+ [--merge-snp | --no-merge-snp]
1196
1231
  [--merge-indel-length MERGE_INDEL_LENGTH]
1197
1232
  [--merge-position-count MERGE_POSITION_COUNT]
1198
1233
  [--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
1199
1234
  [--min-merge-overlap MIN_MERGE_OVERLAP]
1200
1235
  [--disable-homopolymer-equivalence]
1236
+ [--enable-homopolymer-equivalence]
1237
+ [--merge-effort LEVEL]
1201
1238
  [--select-max-groups SELECT_MAX_GROUPS]
1202
1239
  [--select-max-variants SELECT_MAX_VARIANTS]
1203
1240
  [--select-strategy {size,diversity}]
1241
+ [--enable-full-consensus]
1242
+ [--disable-full-consensus]
1204
1243
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
1205
1244
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1206
1245
  [--version] [-p NAME] [--list-profiles]
@@ -1246,10 +1285,7 @@ Grouping:
1246
1285
  Merging:
1247
1286
  --disable-merging Disable all variant merging (skip MSA-based merge
1248
1287
  evaluation entirely)
1249
- --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1250
- thorough (12), or numeric 6-14. Higher values allow
1251
- larger batch sizes for exhaustive subset search.
1252
- Default: balanced
1288
+ --enable-merging Override --disable-merging or profile setting
1253
1289
  --merge-snp, --no-merge-snp
1254
1290
  Enable SNP-based merging (default: True, use --no-
1255
1291
  merge-snp to disable)
@@ -1268,6 +1304,13 @@ Merging:
1268
1304
  --disable-homopolymer-equivalence
1269
1305
  Disable homopolymer equivalence in merging (treat AAA
1270
1306
  vs AAAA as different)
1307
+ --enable-homopolymer-equivalence
1308
+ Override --disable-homopolymer-equivalence or profile
1309
+ setting
1310
+ --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1311
+ thorough (12), or numeric 6-14. Higher values allow
1312
+ larger batch sizes for exhaustive subset search.
1313
+ Default: balanced
1271
1314
 
1272
1315
  Selection:
1273
1316
  --select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
@@ -1279,6 +1322,12 @@ Selection:
1279
1322
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1280
1323
  Variant selection strategy: size or diversity
1281
1324
  (default: size)
1325
+ --enable-full-consensus
1326
+ Generate a full consensus per variant group
1327
+ representing all variation from pre-merge variants
1328
+ (gaps never win)
1329
+ --disable-full-consensus
1330
+ Override --enable-full-consensus or profile setting
1282
1331
 
1283
1332
  Performance:
1284
1333
  --scale-threshold SCALE_THRESHOLD
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "speconsense"
7
- version = "0.7.2"
7
+ version = "0.7.3"
8
8
  description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.2"
8
+ __version__ = "0.7.3"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
@@ -66,6 +66,9 @@ def main():
66
66
  help="Disable position-based variant phasing (enabled by default). "
67
67
  "MCL graph clustering already separates most variants; this "
68
68
  "second pass analyzes MSA positions to phase remaining variants.")
69
+ phasing_group.add_argument("--enable-position-phasing", action="store_false",
70
+ dest="disable_position_phasing",
71
+ help="Override --disable-position-phasing or profile setting")
69
72
  phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
70
73
  help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
71
74
  phasing_group.add_argument("--min-variant-count", type=int, default=5,
@@ -75,6 +78,9 @@ def main():
75
78
  ambiguity_group = parser.add_argument_group("Ambiguity Calling")
76
79
  ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
77
80
  help="Disable IUPAC ambiguity code calling for unphased variant positions")
81
+ ambiguity_group.add_argument("--enable-ambiguity-calling", action="store_false",
82
+ dest="disable_ambiguity_calling",
83
+ help="Override --disable-ambiguity-calling or profile setting")
78
84
  ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
79
85
  help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
80
86
  ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
@@ -84,8 +90,14 @@ def main():
84
90
  merging_group = parser.add_argument_group("Cluster Merging")
85
91
  merging_group.add_argument("--disable-cluster-merging", action="store_true",
86
92
  help="Disable merging of clusters with identical consensus sequences")
93
+ merging_group.add_argument("--enable-cluster-merging", action="store_false",
94
+ dest="disable_cluster_merging",
95
+ help="Override --disable-cluster-merging or profile setting")
87
96
  merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
88
97
  help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")
98
+ merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
99
+ dest="disable_homopolymer_equivalence",
100
+ help="Override --disable-homopolymer-equivalence or profile setting")
89
101
 
90
102
  # Orientation group
91
103
  orient_group = parser.add_argument_group("Orientation")
@@ -104,11 +116,17 @@ def main():
104
116
  "0=auto-detect, default=1 (safe for parallel workflows).")
105
117
  perf_group.add_argument("--enable-early-filter", action="store_true",
106
118
  help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")
119
+ perf_group.add_argument("--disable-early-filter", action="store_false",
120
+ dest="enable_early_filter",
121
+ help="Override --enable-early-filter or profile setting")
107
122
 
108
123
  # Debugging group
109
124
  debug_group = parser.add_argument_group("Debugging")
110
125
  debug_group.add_argument("--collect-discards", action="store_true",
111
126
  help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
127
+ debug_group.add_argument("--no-collect-discards", action="store_false",
128
+ dest="collect_discards",
129
+ help="Override --collect-discards or profile setting")
112
130
  debug_group.add_argument("--log-level", default="INFO",
113
131
  choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
114
132
 
@@ -103,6 +103,7 @@ VALID_SUMMARIZE_KEYS = {
103
103
  "select-max-groups",
104
104
  "select-max-variants",
105
105
  "select-strategy",
106
+ "enable-full-consensus",
106
107
  # Processing
107
108
  "scale-threshold",
108
109
  "threads",
@@ -0,0 +1,27 @@
1
+ # Compress variants into minimal IUPAC consensus sequences
2
+ #
3
+ # Aggressively merges similar variants (including indels) into single
4
+ # IUPAC consensus sequences. Only truly dissimilar sequences remain
5
+ # separate. Uses 20% frequency thresholds throughout.
6
+ #
7
+ # Designed for workflows where reviewers want fewer sequences to
8
+ # examine, with all variation represented via IUPAC ambiguity codes.
9
+ # Partial overlap merging is disabled as a safety measure.
10
+ #
11
+ # Use with:
12
+ # speconsense input.fastq -p compressed
13
+ # speconsense-summarize -p compressed
14
+
15
+ speconsense-version: "0.7.*"
16
+ description: "Compress variants into minimal IUPAC consensus sequences"
17
+
18
+ speconsense:
19
+ min-ambiguity-frequency: 0.20 # 20% threshold for IUPAC ambiguity calling
20
+ min-variant-frequency: 0.20 # 20% threshold for variant phasing
21
+
22
+ speconsense-summarize:
23
+ merge-indel-length: 5 # Merge indels up to 5bp
24
+ merge-position-count: 10 # Allow up to 10 variant positions in a merge
25
+ merge-min-size-ratio: 0.2 # Match 20% calling threshold
26
+ min-merge-overlap: 0 # Disable partial overlap merging
27
+ enable-full-consensus: true # Include full IUPAC consensus per group
@@ -54,8 +54,8 @@ from .io import (
54
54
  write_output_files,
55
55
  )
56
56
  from .clustering import perform_hac_clustering, select_variants
57
- from .merging import merge_group_with_msa
58
- from .analysis import MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
57
+ from .merging import merge_group_with_msa, create_full_consensus_from_msa
58
+ from .analysis import run_spoa_msa, MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
59
59
 
60
60
 
61
61
  # Merge effort configuration
@@ -132,6 +132,8 @@ def parse_arguments():
132
132
  merging_group = parser.add_argument_group("Merging")
133
133
  merging_group.add_argument("--disable-merging", action="store_true",
134
134
  help="Disable all variant merging (skip MSA-based merge evaluation entirely)")
135
+ merging_group.add_argument("--enable-merging", action="store_false", dest="disable_merging",
136
+ help="Override --disable-merging or profile setting")
135
137
  merging_group.add_argument("--merge-snp", action=argparse.BooleanOptionalAction, default=True,
136
138
  help="Enable SNP-based merging (default: True, use --no-merge-snp to disable)")
137
139
  merging_group.add_argument("--merge-indel-length", type=int, default=0,
@@ -144,6 +146,9 @@ def parse_arguments():
144
146
  help="Minimum overlap in bp for merging sequences of different lengths (default: 200, 0 to disable)")
145
147
  merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
146
148
  help="Disable homopolymer equivalence in merging (treat AAA vs AAAA as different)")
149
+ merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
150
+ dest="disable_homopolymer_equivalence",
151
+ help="Override --disable-homopolymer-equivalence or profile setting")
147
152
  merging_group.add_argument("--merge-effort", type=str, default="balanced", metavar="LEVEL",
148
153
  help="Merging effort level: fast (8), balanced (10), thorough (12), "
149
154
  "or numeric 6-14. Higher values allow larger batch sizes for "
@@ -164,6 +169,12 @@ def parse_arguments():
164
169
  selection_group.add_argument("--select-strategy", "--variant-selection",
165
170
  dest="select_strategy", choices=["size", "diversity"], default="size",
166
171
  help="Variant selection strategy: size or diversity (default: size)")
172
+ selection_group.add_argument("--enable-full-consensus", action="store_true",
173
+ help="Generate a full consensus per variant group representing all variation "
174
+ "from pre-merge variants (gaps never win)")
175
+ selection_group.add_argument("--disable-full-consensus", action="store_false",
176
+ dest="enable_full_consensus",
177
+ help="Override --enable-full-consensus or profile setting")
167
178
 
168
179
  # Performance group
169
180
  perf_group = parser.add_argument_group("Performance")
@@ -345,7 +356,7 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
345
356
  key=lambda x: max(m.size for m in x[1]),
346
357
  reverse=True)
347
358
 
348
- for group_idx, (_, group_members) in enumerate(sorted_groups):
359
+ for group_idx, (group_id, group_members) in enumerate(sorted_groups):
349
360
  final_group_name = group_idx + 1
350
361
 
351
362
  # Select variants for this group
@@ -366,6 +377,24 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
366
377
  final_consensus.append(renamed_variant)
367
378
  group_naming.append((variant.sample_name, new_name))
368
379
 
380
+ # Generate full consensus from PRE-MERGE variants
381
+ if getattr(args, 'enable_full_consensus', False):
382
+ pre_merge_variants = variant_groups[group_id]
383
+ specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
384
+ full_name = f"{specimen_base}-{group_idx + 1}.full"
385
+
386
+ if len(pre_merge_variants) == 1:
387
+ # Single variant — copy directly
388
+ full_consensus = pre_merge_variants[0]._replace(sample_name=full_name)
389
+ else:
390
+ # MSA on pre-merge variants, full consensus logic
391
+ sequences = [v.sequence for v in pre_merge_variants]
392
+ aligned_seqs = run_spoa_msa(sequences, alignment_mode=1)
393
+ full_consensus = create_full_consensus_from_msa(aligned_seqs, pre_merge_variants)
394
+ full_consensus = full_consensus._replace(sample_name=full_name)
395
+
396
+ final_consensus.append(full_consensus)
397
+
369
398
  naming_info[group_idx + 1] = group_naming
370
399
 
371
400
  logging.info(f"Processed {file_name}: {len(final_consensus)} final variants across {len(merged_groups)} groups")
@@ -421,6 +450,7 @@ def main():
421
450
  logging.info(f" --select-max-variants: {args.select_max_variants}")
422
451
  logging.info(f" --select-max-groups: {args.select_max_groups}")
423
452
  logging.info(f" --select-strategy: {args.select_strategy}")
453
+ logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
424
454
  logging.info(f" --log-level: {args.log_level}")
425
455
  logging.info("")
426
456
  logging.info("Processing each specimen file independently to organize variants within specimens")
@@ -124,8 +124,8 @@ class GroupField(FastaField):
124
124
  super().__init__('group', 'Variant group number')
125
125
 
126
126
  def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
127
- # Extract from sample_name (e.g., "...-1.v1" or "...-2.v1.raw1")
128
- match = re.search(r'-(\d+)\.v\d+(?:\.raw\d+)?$', consensus.sample_name)
127
+ # Extract from sample_name (e.g., "...-1.v1", "...-2.v1.raw1", or "...-1.full")
128
+ match = re.search(r'-(\d+)(?:\.v\d+(?:\.raw\d+)?|\.full)$', consensus.sample_name)
129
129
  if match:
130
130
  return f"group={match.group(1)}"
131
131
  return None
@@ -136,8 +136,10 @@ class VariantField(FastaField):
136
136
  super().__init__('variant', 'Variant identifier within group')
137
137
 
138
138
  def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
139
- # Extract from sample_name (e.g., "...-1.v1" -> "v1" or "...-1.v1.raw1" -> "v1")
139
+ # Extract from sample_name (e.g., "...-1.v1" -> "v1", "...-1.v1.raw1" -> "v1", "...-1.full" -> "full")
140
140
  match = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
141
+ if not match:
142
+ match = re.search(r'\.(full)$', consensus.sample_name)
141
143
  if match:
142
144
  return f"variant={match.group(1)}"
143
145
  return None
@@ -358,6 +358,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
358
358
  # Generate .raw file consensuses for merged variants
359
359
  raw_file_consensuses = []
360
360
  for consensus in specimen_consensus:
361
+ # Skip .raw generation for .full consensus (synthetic/derived)
362
+ if consensus.sample_name.endswith('.full'):
363
+ continue
361
364
  # Only create .raw files if this consensus was actually merged
362
365
  if consensus.raw_ric and len(consensus.raw_ric) > 1:
363
366
  # Find the original cluster name from naming_info
@@ -412,6 +415,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
412
415
 
413
416
  # Write FASTQ files for each final consensus containing all contributing reads
414
417
  for consensus in specimen_consensus:
418
+ # Skip FASTQ for .full consensus (synthetic/derived, no traceable cluster reads)
419
+ if consensus.sample_name.endswith('.full'):
420
+ continue
415
421
  write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)
416
422
 
417
423
  # Write .raw files (individual FASTA and FASTQ for pre-merge variants)
@@ -704,7 +710,10 @@ def write_output_files(final_consensus: List[ConsensusInfo],
704
710
  multiple_id = specimen_counters[base_name]
705
711
  writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
706
712
  unique_samples.add(base_name)
707
- total_ric += consensus.ric
713
+ # Exclude .full from total RiC to avoid double-counting
714
+ # (.full aggregates reads already counted in merged variants)
715
+ if not consensus.sample_name.endswith('.full'):
716
+ total_ric += consensus.ric
708
717
 
709
718
  writer.writerow([])
710
719
  writer.writerow(['Total Unique Samples', len(unique_samples)])