speconsense 0.7.3__tar.gz → 0.7.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speconsense-0.7.3/speconsense.egg-info → speconsense-0.7.4}/PKG-INFO +23 -5
- {speconsense-0.7.3 → speconsense-0.7.4}/README.md +22 -4
- {speconsense-0.7.3 → speconsense-0.7.4}/pyproject.toml +1 -1
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/__init__.py +1 -1
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/__init__.py +1 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/compressed.yaml +1 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/example.yaml +1 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/cli.py +27 -0
- {speconsense-0.7.3 → speconsense-0.7.4/speconsense.egg-info}/PKG-INFO +23 -5
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_summarize.py +203 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/LICENSE +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/setup.cfg +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/cli.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/__init__.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/__main__.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/cli.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/clusterer.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/workers.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/msa.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/herbarium.yaml +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/largedata.yaml +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/nostalgia.yaml +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/strict.yaml +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/quality_report.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/__init__.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/base.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/config.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/vsearch.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/__init__.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/__main__.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/analysis.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/clustering.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/fields.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/io.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/iupac.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/merging.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/synth.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/types.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/SOURCES.txt +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/dependency_links.txt +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/entry_points.txt +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/requires.txt +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/top_level.txt +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_ambiguity_calling.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_augment_input.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_complement_flags.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_fields.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_haplotype_filtering.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_orientation.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_overlap_merge.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_overlap_merge_integration.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_profiles.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_regression.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_synth.py +0 -0
- {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_variant_phasing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: speconsense
|
|
3
|
-
Version: 0.7.3
|
|
3
|
+
Version: 0.7.4
|
|
4
4
|
Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
|
|
5
5
|
Author-email: Josh Walker <joshowalker@yahoo.com>
|
|
6
6
|
License: BSD-3-Clause
|
|
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
**Bundled profiles:**
|
|
174
|
-
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
174
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
|
|
175
175
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
176
176
|
- `largedata` — Experimental settings for large input files
|
|
177
177
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
|
|
|
678
678
|
- Output up to select_max_variants per group
|
|
679
679
|
3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
|
|
680
680
|
|
|
681
|
+
**Selection Size Ratio Filtering:**
|
|
682
|
+
```bash
|
|
683
|
+
speconsense-summarize --select-min-size-ratio 0.2
|
|
684
|
+
```
|
|
685
|
+
- Filters out post-merge variants whose size is too small relative to the largest variant in their group
|
|
686
|
+
- Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
|
|
687
|
+
- Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
|
|
688
|
+
- Default is 0 (disabled) — all post-merge variants pass through to selection
|
|
689
|
+
- Applied after merging but before variant selection
|
|
690
|
+
- Useful for suppressing noise variants that survived merging but are too small to be meaningful
|
|
691
|
+
- Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
|
|
692
|
+
|
|
681
693
|
This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
|
|
682
694
|
|
|
683
695
|
### Customizing FASTA Header Fields
|
|
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1059
1071
|
2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
|
|
1060
1072
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1061
1073
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1062
|
-
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1063
|
-
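6. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)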
|
|
1064
|
-
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1074
|
+
5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
|
|
1075
|
+
6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1076
|
+
7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
|
|
1077
|
+
8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1065
1078
|
|
|
1066
1079
|
**Key architectural features**:
|
|
1067
1080
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1273
1286
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1274
1287
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1275
1288
|
[--select-strategy {size,diversity}]
|
|
1289
|
+
[--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
|
|
1276
1290
|
[--enable-full-consensus]
|
|
1277
1291
|
[--disable-full-consensus]
|
|
1278
1292
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
@@ -1357,6 +1371,10 @@ Selection:
|
|
|
1357
1371
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1358
1372
|
Variant selection strategy: size or diversity
|
|
1359
1373
|
(default: size)
|
|
1374
|
+
--select-min-size-ratio SELECT_MIN_SIZE_RATIO
|
|
1375
|
+
Minimum size ratio (variant/largest) to include in
|
|
1376
|
+
output (default: 0 = disabled, e.g. 0.2 for 20%
|
|
1377
|
+
cutoff)
|
|
1360
1378
|
--enable-full-consensus
|
|
1361
1379
|
Generate a full consensus per variant group
|
|
1362
1380
|
representing all variation from pre-merge variants
|
|
@@ -136,7 +136,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
**Bundled profiles:**
|
|
139
|
-
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
139
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
|
|
140
140
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
141
141
|
- `largedata` — Experimental settings for large input files
|
|
142
142
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -643,6 +643,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
|
|
|
643
643
|
- Output up to select_max_variants per group
|
|
644
644
|
3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
|
|
645
645
|
|
|
646
|
+
**Selection Size Ratio Filtering:**
|
|
647
|
+
```bash
|
|
648
|
+
speconsense-summarize --select-min-size-ratio 0.2
|
|
649
|
+
```
|
|
650
|
+
- Filters out post-merge variants whose size is too small relative to the largest variant in their group
|
|
651
|
+
- Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
|
|
652
|
+
- Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
|
|
653
|
+
- Default is 0 (disabled) — all post-merge variants pass through to selection
|
|
654
|
+
- Applied after merging but before variant selection
|
|
655
|
+
- Useful for suppressing noise variants that survived merging but are too small to be meaningful
|
|
656
|
+
- Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
|
|
657
|
+
|
|
646
658
|
This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
|
|
647
659
|
|
|
648
660
|
### Customizing FASTA Header Fields
|
|
@@ -1024,9 +1036,10 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1024
1036
|
2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
|
|
1025
1037
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1026
1038
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1027
|
-
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1028
|
-
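6. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)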
|
|
1029
|
-
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1039
|
+
5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
|
|
1040
|
+
6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1041
|
+
7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
|
|
1042
|
+
8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1030
1043
|
|
|
1031
1044
|
**Key architectural features**:
|
|
1032
1045
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1238,6 +1251,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1238
1251
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1239
1252
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1240
1253
|
[--select-strategy {size,diversity}]
|
|
1254
|
+
[--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
|
|
1241
1255
|
[--enable-full-consensus]
|
|
1242
1256
|
[--disable-full-consensus]
|
|
1243
1257
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
@@ -1322,6 +1336,10 @@ Selection:
|
|
|
1322
1336
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1323
1337
|
Variant selection strategy: size or diversity
|
|
1324
1338
|
(default: size)
|
|
1339
|
+
--select-min-size-ratio SELECT_MIN_SIZE_RATIO
|
|
1340
|
+
Minimum size ratio (variant/largest) to include in
|
|
1341
|
+
output (default: 0 = disabled, e.g. 0.2 for 20%
|
|
1342
|
+
cutoff)
|
|
1325
1343
|
--enable-full-consensus
|
|
1326
1344
|
Generate a full consensus per variant group
|
|
1327
1345
|
representing all variation from pre-merge variants
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "speconsense"
|
|
7
|
-
version = "0.7.3"
|
|
7
|
+
version = "0.7.4"
|
|
8
8
|
description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -23,5 +23,6 @@ speconsense-summarize:
|
|
|
23
23
|
merge-indel-length: 5 # Merge indels up to 5bp
|
|
24
24
|
merge-position-count: 10 # Allow up to 10 variant positions in a merge
|
|
25
25
|
merge-min-size-ratio: 0.2 # Match 20% calling threshold
|
|
26
|
+
select-min-size-ratio: 0.2 # Match 20% calling threshold
|
|
26
27
|
min-merge-overlap: 0 # Disable partial overlap merging
|
|
27
28
|
enable-full-consensus: true # Include full IUPAC consensus per group
|
|
@@ -91,6 +91,7 @@ speconsense-summarize:
|
|
|
91
91
|
# select-max-groups: -1 # Max groups to output (-1 = no limit)
|
|
92
92
|
# select-max-variants: -1 # Max variants per group (-1 = no limit)
|
|
93
93
|
# select-strategy: size # Selection strategy: size or diversity
|
|
94
|
+
# select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
|
|
94
95
|
|
|
95
96
|
# --- Processing ---
|
|
96
97
|
# threads: 0 # Max threads (0 = auto-detect)
|
|
@@ -169,6 +169,9 @@ def parse_arguments():
|
|
|
169
169
|
selection_group.add_argument("--select-strategy", "--variant-selection",
|
|
170
170
|
dest="select_strategy", choices=["size", "diversity"], default="size",
|
|
171
171
|
help="Variant selection strategy: size or diversity (default: size)")
|
|
172
|
+
selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
|
|
173
|
+
help="Minimum size ratio (variant/largest) to include in output "
|
|
174
|
+
"(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
|
|
172
175
|
selection_group.add_argument("--enable-full-consensus", action="store_true",
|
|
173
176
|
help="Generate a full consensus per variant group representing all variation "
|
|
174
177
|
"from pre-merge variants (gaps never win)")
|
|
@@ -359,6 +362,18 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
359
362
|
for group_idx, (group_id, group_members) in enumerate(sorted_groups):
|
|
360
363
|
final_group_name = group_idx + 1
|
|
361
364
|
|
|
365
|
+
# Apply select-min-size-ratio filter
|
|
366
|
+
if args.select_min_size_ratio > 0 and len(group_members) > 1:
|
|
367
|
+
largest_size = max(v.size for v in group_members)
|
|
368
|
+
filtered = [v for v in group_members
|
|
369
|
+
if (v.size / largest_size) >= args.select_min_size_ratio]
|
|
370
|
+
if len(filtered) < len(group_members):
|
|
371
|
+
filtered_count = len(group_members) - len(filtered)
|
|
372
|
+
logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
|
|
373
|
+
f"variants with size ratio < {args.select_min_size_ratio} "
|
|
374
|
+
f"relative to largest (size={largest_size})")
|
|
375
|
+
group_members = filtered
|
|
376
|
+
|
|
362
377
|
# Select variants for this group
|
|
363
378
|
selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
|
|
364
379
|
|
|
@@ -380,6 +395,17 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
380
395
|
# Generate full consensus from PRE-MERGE variants
|
|
381
396
|
if getattr(args, 'enable_full_consensus', False):
|
|
382
397
|
pre_merge_variants = variant_groups[group_id]
|
|
398
|
+
|
|
399
|
+
# Apply size-ratio filter (same as merge pipeline)
|
|
400
|
+
if args.merge_min_size_ratio > 0 and len(pre_merge_variants) > 1:
|
|
401
|
+
largest_size = max(v.size for v in pre_merge_variants)
|
|
402
|
+
filtered = [v for v in pre_merge_variants
|
|
403
|
+
if (v.size / largest_size) >= args.merge_min_size_ratio]
|
|
404
|
+
if len(filtered) < len(pre_merge_variants):
|
|
405
|
+
filtered_count = len(pre_merge_variants) - len(filtered)
|
|
406
|
+
logging.debug(f"Full consensus: filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
|
|
407
|
+
pre_merge_variants = filtered
|
|
408
|
+
|
|
383
409
|
specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
|
|
384
410
|
full_name = f"{specimen_base}-{group_idx + 1}.full"
|
|
385
411
|
|
|
@@ -450,6 +476,7 @@ def main():
|
|
|
450
476
|
logging.info(f" --select-max-variants: {args.select_max_variants}")
|
|
451
477
|
logging.info(f" --select-max-groups: {args.select_max_groups}")
|
|
452
478
|
logging.info(f" --select-strategy: {args.select_strategy}")
|
|
479
|
+
logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
|
|
453
480
|
logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
|
|
454
481
|
logging.info(f" --log-level: {args.log_level}")
|
|
455
482
|
logging.info("")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: speconsense
|
|
3
|
-
Version: 0.7.3
|
|
3
|
+
Version: 0.7.4
|
|
4
4
|
Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
|
|
5
5
|
Author-email: Josh Walker <joshowalker@yahoo.com>
|
|
6
6
|
License: BSD-3-Clause
|
|
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
**Bundled profiles:**
|
|
174
|
-
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
174
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
|
|
175
175
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
176
176
|
- `largedata` — Experimental settings for large input files
|
|
177
177
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
|
|
|
678
678
|
- Output up to select_max_variants per group
|
|
679
679
|
3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
|
|
680
680
|
|
|
681
|
+
**Selection Size Ratio Filtering:**
|
|
682
|
+
```bash
|
|
683
|
+
speconsense-summarize --select-min-size-ratio 0.2
|
|
684
|
+
```
|
|
685
|
+
- Filters out post-merge variants whose size is too small relative to the largest variant in their group
|
|
686
|
+
- Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
|
|
687
|
+
- Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
|
|
688
|
+
- Default is 0 (disabled) — all post-merge variants pass through to selection
|
|
689
|
+
- Applied after merging but before variant selection
|
|
690
|
+
- Useful for suppressing noise variants that survived merging but are too small to be meaningful
|
|
691
|
+
- Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
|
|
692
|
+
|
|
681
693
|
This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
|
|
682
694
|
|
|
683
695
|
### Customizing FASTA Header Fields
|
|
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1059
1071
|
2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
|
|
1060
1072
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1061
1073
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1062
|
-
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1063
|
-
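6. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)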
|
|
1064
|
-
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1074
|
+
5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
|
|
1075
|
+
6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1076
|
+
7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
|
|
1077
|
+
8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1065
1078
|
|
|
1066
1079
|
**Key architectural features**:
|
|
1067
1080
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1273
1286
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1274
1287
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1275
1288
|
[--select-strategy {size,diversity}]
|
|
1289
|
+
[--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
|
|
1276
1290
|
[--enable-full-consensus]
|
|
1277
1291
|
[--disable-full-consensus]
|
|
1278
1292
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
@@ -1357,6 +1371,10 @@ Selection:
|
|
|
1357
1371
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1358
1372
|
Variant selection strategy: size or diversity
|
|
1359
1373
|
(default: size)
|
|
1374
|
+
--select-min-size-ratio SELECT_MIN_SIZE_RATIO
|
|
1375
|
+
Minimum size ratio (variant/largest) to include in
|
|
1376
|
+
output (default: 0 = disabled, e.g. 0.2 for 20%
|
|
1377
|
+
cutoff)
|
|
1360
1378
|
--enable-full-consensus
|
|
1361
1379
|
Generate a full consensus per variant group
|
|
1362
1380
|
representing all variation from pre-merge variants
|
|
@@ -520,6 +520,114 @@ class TestFullConsensus:
|
|
|
520
520
|
assert result.rid == 0.95
|
|
521
521
|
|
|
522
522
|
|
|
523
|
+
def test_full_consensus_filters_small_variants(self):
|
|
524
|
+
"""Integration test: merge_min_size_ratio filters small variants from full consensus."""
|
|
525
|
+
temp_dir = tempfile.mkdtemp()
|
|
526
|
+
source_dir = os.path.join(temp_dir, "clusters")
|
|
527
|
+
summary_dir = os.path.join(temp_dir, "__Summary__")
|
|
528
|
+
os.makedirs(source_dir)
|
|
529
|
+
|
|
530
|
+
try:
|
|
531
|
+
# Two similar sequences (1 SNP at position 12: G vs A)
|
|
532
|
+
# Very different sizes so the small one is filtered by merge_min_size_ratio
|
|
533
|
+
seq_large = "ATCGATCGATCGATCGATCGATCG" # G at position 12
|
|
534
|
+
seq_small = "ATCGATCGATCAATCGATCGATCG" # A at position 12
|
|
535
|
+
|
|
536
|
+
fasta_content = f""">test-c1 size=100 ric=100 primers=test
|
|
537
|
+
{seq_large}
|
|
538
|
+
>test-c2 size=5 ric=5 primers=test
|
|
539
|
+
{seq_small}
|
|
540
|
+
"""
|
|
541
|
+
fasta_file = os.path.join(source_dir, "test-all.fasta")
|
|
542
|
+
with open(fasta_file, 'w') as f:
|
|
543
|
+
f.write(fasta_content)
|
|
544
|
+
|
|
545
|
+
# merge-min-size-ratio 0.1 filters 5/100=0.05 from full consensus
|
|
546
|
+
result = subprocess.run(
|
|
547
|
+
[
|
|
548
|
+
"speconsense-summarize",
|
|
549
|
+
"--source", source_dir,
|
|
550
|
+
"--summary-dir", summary_dir,
|
|
551
|
+
"--min-ric", "3",
|
|
552
|
+
"--enable-full-consensus",
|
|
553
|
+
"--merge-min-size-ratio", "0.1",
|
|
554
|
+
"--disable-merging",
|
|
555
|
+
"--min-merge-overlap", "0",
|
|
556
|
+
],
|
|
557
|
+
capture_output=True,
|
|
558
|
+
text=True
|
|
559
|
+
)
|
|
560
|
+
|
|
561
|
+
assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
|
|
562
|
+
|
|
563
|
+
output_fasta = os.path.join(summary_dir, "summary.fasta")
|
|
564
|
+
output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
|
|
565
|
+
|
|
566
|
+
full_seqs = [s for s in output_sequences if '.full' in s.id]
|
|
567
|
+
assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
|
|
568
|
+
|
|
569
|
+
# Small variant was filtered — .full should be the large variant only (no IUPAC)
|
|
570
|
+
full_seq_str = str(full_seqs[0].seq)
|
|
571
|
+
assert full_seq_str == seq_large, \
|
|
572
|
+
f"Expected large variant sequence, got {full_seq_str}"
|
|
573
|
+
|
|
574
|
+
finally:
|
|
575
|
+
shutil.rmtree(temp_dir)
|
|
576
|
+
|
|
577
|
+
def test_full_consensus_no_filter_when_disabled(self):
|
|
578
|
+
"""Integration test: merge_min_size_ratio=0 preserves all variants in full consensus."""
|
|
579
|
+
temp_dir = tempfile.mkdtemp()
|
|
580
|
+
source_dir = os.path.join(temp_dir, "clusters")
|
|
581
|
+
summary_dir = os.path.join(temp_dir, "__Summary__")
|
|
582
|
+
os.makedirs(source_dir)
|
|
583
|
+
|
|
584
|
+
try:
|
|
585
|
+
# Same sequences as above — 1 SNP at position 12 (G vs A)
|
|
586
|
+
seq_large = "ATCGATCGATCGATCGATCGATCG"
|
|
587
|
+
seq_small = "ATCGATCGATCAATCGATCGATCG"
|
|
588
|
+
|
|
589
|
+
fasta_content = f""">test-c1 size=100 ric=100 primers=test
|
|
590
|
+
{seq_large}
|
|
591
|
+
>test-c2 size=5 ric=5 primers=test
|
|
592
|
+
{seq_small}
|
|
593
|
+
"""
|
|
594
|
+
fasta_file = os.path.join(source_dir, "test-all.fasta")
|
|
595
|
+
with open(fasta_file, 'w') as f:
|
|
596
|
+
f.write(fasta_content)
|
|
597
|
+
|
|
598
|
+
# merge-min-size-ratio 0 disables filtering — both contribute to .full
|
|
599
|
+
result = subprocess.run(
|
|
600
|
+
[
|
|
601
|
+
"speconsense-summarize",
|
|
602
|
+
"--source", source_dir,
|
|
603
|
+
"--summary-dir", summary_dir,
|
|
604
|
+
"--min-ric", "3",
|
|
605
|
+
"--enable-full-consensus",
|
|
606
|
+
"--merge-min-size-ratio", "0",
|
|
607
|
+
"--disable-merging",
|
|
608
|
+
"--min-merge-overlap", "0",
|
|
609
|
+
],
|
|
610
|
+
capture_output=True,
|
|
611
|
+
text=True
|
|
612
|
+
)
|
|
613
|
+
|
|
614
|
+
assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
|
|
615
|
+
|
|
616
|
+
output_fasta = os.path.join(summary_dir, "summary.fasta")
|
|
617
|
+
output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
|
|
618
|
+
|
|
619
|
+
full_seqs = [s for s in output_sequences if '.full' in s.id]
|
|
620
|
+
assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
|
|
621
|
+
|
|
622
|
+
# Both variants contribute — SNP position should be IUPAC R (A/G)
|
|
623
|
+
full_seq_str = str(full_seqs[0].seq)
|
|
624
|
+
assert "R" in full_seq_str, \
|
|
625
|
+
f"Expected IUPAC R (A/G) in full consensus, got {full_seq_str}"
|
|
626
|
+
|
|
627
|
+
finally:
|
|
628
|
+
shutil.rmtree(temp_dir)
|
|
629
|
+
|
|
630
|
+
|
|
523
631
|
class TestFieldRegexFullConsensus:
|
|
524
632
|
"""Tests for GroupField and VariantField regex handling of .full names."""
|
|
525
633
|
|
|
@@ -578,6 +686,101 @@ class TestFieldRegexFullConsensus:
|
|
|
578
686
|
assert field.format_value(cons) == "variant=v1"
|
|
579
687
|
|
|
580
688
|
|
|
689
|
+
class TestSelectMinSizeRatio:
|
|
690
|
+
"""Tests for --select-min-size-ratio filtering."""
|
|
691
|
+
|
|
692
|
+
def test_select_min_size_ratio_filters_small_variants(self):
|
|
693
|
+
"""Integration test: --select-min-size-ratio 0.1 filters out tiny variants."""
|
|
694
|
+
temp_dir = tempfile.mkdtemp()
|
|
695
|
+
source_dir = os.path.join(temp_dir, "clusters")
|
|
696
|
+
summary_dir = os.path.join(temp_dir, "__Summary__")
|
|
697
|
+
os.makedirs(source_dir)
|
|
698
|
+
|
|
699
|
+
try:
|
|
700
|
+
seq1 = "ATCGATCGATCGATCGATCGATCG"
|
|
701
|
+
seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP — different enough to not merge
|
|
702
|
+
|
|
703
|
+
fasta_content = f""">test-c1 size=100 ric=100 primers=test
|
|
704
|
+
{seq1}
|
|
705
|
+
>test-c2 size=3 ric=3 primers=test
|
|
706
|
+
{seq2}
|
|
707
|
+
"""
|
|
708
|
+
fasta_file = os.path.join(source_dir, "test-all.fasta")
|
|
709
|
+
with open(fasta_file, 'w') as f:
|
|
710
|
+
f.write(fasta_content)
|
|
711
|
+
|
|
712
|
+
result = subprocess.run(
|
|
713
|
+
[
|
|
714
|
+
"speconsense-summarize",
|
|
715
|
+
"--source", source_dir,
|
|
716
|
+
"--summary-dir", summary_dir,
|
|
717
|
+
"--min-ric", "3",
|
|
718
|
+
"--select-min-size-ratio", "0.1",
|
|
719
|
+
"--disable-merging",
|
|
720
|
+
],
|
|
721
|
+
capture_output=True,
|
|
722
|
+
text=True
|
|
723
|
+
)
|
|
724
|
+
|
|
725
|
+
assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
|
|
726
|
+
|
|
727
|
+
output_fasta = os.path.join(summary_dir, "summary.fasta")
|
|
728
|
+
output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
|
|
729
|
+
|
|
730
|
+
# Only the large variant should remain (3/100 = 0.03 < 0.1)
|
|
731
|
+
assert len(output_sequences) == 1, \
|
|
732
|
+
f"Expected 1 sequence after filtering, got {len(output_sequences)}"
|
|
733
|
+
assert "size=100" in output_sequences[0].description
|
|
734
|
+
|
|
735
|
+
finally:
|
|
736
|
+
shutil.rmtree(temp_dir)
|
|
737
|
+
|
|
738
|
+
def test_select_min_size_ratio_disabled_preserves_all(self):
|
|
739
|
+
"""Integration test: --select-min-size-ratio 0 preserves all variants."""
|
|
740
|
+
temp_dir = tempfile.mkdtemp()
|
|
741
|
+
source_dir = os.path.join(temp_dir, "clusters")
|
|
742
|
+
summary_dir = os.path.join(temp_dir, "__Summary__")
|
|
743
|
+
os.makedirs(source_dir)
|
|
744
|
+
|
|
745
|
+
try:
|
|
746
|
+
seq1 = "ATCGATCGATCGATCGATCGATCG"
|
|
747
|
+
seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP
|
|
748
|
+
|
|
749
|
+
fasta_content = f""">test-c1 size=100 ric=100 primers=test
|
|
750
|
+
{seq1}
|
|
751
|
+
>test-c2 size=3 ric=3 primers=test
|
|
752
|
+
{seq2}
|
|
753
|
+
"""
|
|
754
|
+
fasta_file = os.path.join(source_dir, "test-all.fasta")
|
|
755
|
+
with open(fasta_file, 'w') as f:
|
|
756
|
+
f.write(fasta_content)
|
|
757
|
+
|
|
758
|
+
result = subprocess.run(
|
|
759
|
+
[
|
|
760
|
+
"speconsense-summarize",
|
|
761
|
+
"--source", source_dir,
|
|
762
|
+
"--summary-dir", summary_dir,
|
|
763
|
+
"--min-ric", "3",
|
|
764
|
+
"--select-min-size-ratio", "0",
|
|
765
|
+
"--disable-merging",
|
|
766
|
+
],
|
|
767
|
+
capture_output=True,
|
|
768
|
+
text=True
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
|
|
772
|
+
|
|
773
|
+
output_fasta = os.path.join(summary_dir, "summary.fasta")
|
|
774
|
+
output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
|
|
775
|
+
|
|
776
|
+
# Both variants should be preserved
|
|
777
|
+
assert len(output_sequences) == 2, \
|
|
778
|
+
f"Expected 2 sequences with ratio=0, got {len(output_sequences)}"
|
|
779
|
+
|
|
780
|
+
finally:
|
|
781
|
+
shutil.rmtree(temp_dir)
|
|
782
|
+
|
|
783
|
+
|
|
581
784
|
class TestFullConsensusIntegration:
|
|
582
785
|
"""Integration test for --enable-full-consensus."""
|
|
583
786
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|