speconsense 0.7.3__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {speconsense-0.7.3/speconsense.egg-info → speconsense-0.7.4}/PKG-INFO +23 -5
  2. {speconsense-0.7.3 → speconsense-0.7.4}/README.md +22 -4
  3. {speconsense-0.7.3 → speconsense-0.7.4}/pyproject.toml +1 -1
  4. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/__init__.py +1 -1
  5. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/__init__.py +1 -0
  6. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/compressed.yaml +1 -0
  7. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/example.yaml +1 -0
  8. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/cli.py +27 -0
  9. {speconsense-0.7.3 → speconsense-0.7.4/speconsense.egg-info}/PKG-INFO +23 -5
  10. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_summarize.py +203 -0
  11. {speconsense-0.7.3 → speconsense-0.7.4}/LICENSE +0 -0
  12. {speconsense-0.7.3 → speconsense-0.7.4}/setup.cfg +0 -0
  13. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/cli.py +0 -0
  14. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/__init__.py +0 -0
  15. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/__main__.py +0 -0
  16. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/cli.py +0 -0
  17. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/clusterer.py +0 -0
  18. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/core/workers.py +0 -0
  19. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/msa.py +0 -0
  20. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/herbarium.yaml +0 -0
  21. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/largedata.yaml +0 -0
  22. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/nostalgia.yaml +0 -0
  23. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/profiles/strict.yaml +0 -0
  24. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/quality_report.py +0 -0
  25. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/__init__.py +0 -0
  26. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/base.py +0 -0
  27. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/config.py +0 -0
  28. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/scalability/vsearch.py +0 -0
  29. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/__init__.py +0 -0
  30. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/__main__.py +0 -0
  31. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/analysis.py +0 -0
  32. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/clustering.py +0 -0
  33. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/fields.py +0 -0
  34. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/io.py +0 -0
  35. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/iupac.py +0 -0
  36. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/summarize/merging.py +0 -0
  37. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/synth.py +0 -0
  38. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense/types.py +0 -0
  39. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/SOURCES.txt +0 -0
  40. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/dependency_links.txt +0 -0
  41. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/entry_points.txt +0 -0
  42. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/requires.txt +0 -0
  43. {speconsense-0.7.3 → speconsense-0.7.4}/speconsense.egg-info/top_level.txt +0 -0
  44. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_ambiguity_calling.py +0 -0
  45. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_augment_input.py +0 -0
  46. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_complement_flags.py +0 -0
  47. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_fields.py +0 -0
  48. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_haplotype_filtering.py +0 -0
  49. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_orientation.py +0 -0
  50. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_overlap_merge.py +0 -0
  51. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_overlap_merge_integration.py +0 -0
  52. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_profiles.py +0 -0
  53. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_regression.py +0 -0
  54. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_synth.py +0 -0
  55. {speconsense-0.7.3 → speconsense-0.7.4}/tests/test_variant_phasing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.3
3
+ Version: 0.7.4
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
175
175
  - `herbarium` — High-recall for degraded DNA/type specimens
176
176
  - `largedata` — Experimental settings for large input files
177
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
678
678
  - Output up to select_max_variants per group
679
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
680
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match that profile's 20% calling threshold
692
+
681
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
682
694
 
683
695
  ### Customizing FASTA Header Fields
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1059
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1060
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1061
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1062
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1063
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1064
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) IUPAC consensus from the pre-merge variants in each group, excluding any below `--merge-min-size-ratio` relative to the group's largest variant (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1065
1078
 
1066
1079
  **Key architectural features**:
1067
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1273
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1274
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1275
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1276
1290
  [--enable-full-consensus]
1277
1291
  [--disable-full-consensus]
1278
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1357,6 +1371,10 @@ Selection:
1357
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1358
1372
  Variant selection strategy: size or diversity
1359
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1360
1378
  --enable-full-consensus
1361
1379
  Generate a full consensus per variant group
1362
1380
  representing all variation from pre-merge variants
@@ -136,7 +136,7 @@ speconsense input.fastq -p herbarium --min-size 10
136
136
  ```
137
137
 
138
138
  **Bundled profiles:**
139
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
139
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
140
140
  - `herbarium` — High-recall for degraded DNA/type specimens
141
141
  - `largedata` — Experimental settings for large input files
142
142
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -643,6 +643,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
643
643
  - Output up to select_max_variants per group
644
644
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
645
645
 
646
+ **Selection Size Ratio Filtering:**
647
+ ```bash
648
+ speconsense-summarize --select-min-size-ratio 0.2
649
+ ```
650
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
651
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
652
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
653
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
654
+ - Applied after merging but before variant selection
655
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
656
+ - Set to 0.2 in the `compressed` profile to match that profile's 20% calling threshold
657
+
646
658
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
647
659
 
648
660
  ### Customizing FASTA Header Fields
@@ -1024,9 +1036,10 @@ The complete speconsense-summarize workflow operates in this order:
1024
1036
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1025
1037
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1026
1038
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1027
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1028
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1029
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1039
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1040
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1041
+ 7. **Full consensus generation** (optional) IUPAC consensus from the pre-merge variants in each group, excluding any below `--merge-min-size-ratio` relative to the group's largest variant (`--enable-full-consensus`)
1042
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1030
1043
 
1031
1044
  **Key architectural features**:
1032
1045
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1238,6 +1251,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1238
1251
  [--select-max-groups SELECT_MAX_GROUPS]
1239
1252
  [--select-max-variants SELECT_MAX_VARIANTS]
1240
1253
  [--select-strategy {size,diversity}]
1254
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1241
1255
  [--enable-full-consensus]
1242
1256
  [--disable-full-consensus]
1243
1257
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1322,6 +1336,10 @@ Selection:
1322
1336
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1323
1337
  Variant selection strategy: size or diversity
1324
1338
  (default: size)
1339
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1340
+ Minimum size ratio (variant/largest) to include in
1341
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1342
+ cutoff)
1325
1343
  --enable-full-consensus
1326
1344
  Generate a full consensus per variant group
1327
1345
  representing all variation from pre-merge variants
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "speconsense"
7
- version = "0.7.3"
7
+ version = "0.7.4"
8
8
  description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.3"
8
+ __version__ = "0.7.4"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
@@ -103,6 +103,7 @@ VALID_SUMMARIZE_KEYS = {
103
103
  "select-max-groups",
104
104
  "select-max-variants",
105
105
  "select-strategy",
106
+ "select-min-size-ratio",
106
107
  "enable-full-consensus",
107
108
  # Processing
108
109
  "scale-threshold",
@@ -23,5 +23,6 @@ speconsense-summarize:
23
23
  merge-indel-length: 5 # Merge indels up to 5bp
24
24
  merge-position-count: 10 # Allow up to 10 variant positions in a merge
25
25
  merge-min-size-ratio: 0.2 # Match 20% calling threshold
26
+ select-min-size-ratio: 0.2 # Match 20% calling threshold
26
27
  min-merge-overlap: 0 # Disable partial overlap merging
27
28
  enable-full-consensus: true # Include full IUPAC consensus per group
@@ -91,6 +91,7 @@ speconsense-summarize:
91
91
  # select-max-groups: -1 # Max groups to output (-1 = no limit)
92
92
  # select-max-variants: -1 # Max variants per group (-1 = no limit)
93
93
  # select-strategy: size # Selection strategy: size or diversity
94
+ # select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
94
95
 
95
96
  # --- Processing ---
96
97
  # threads: 0 # Max threads (0 = auto-detect)
@@ -169,6 +169,9 @@ def parse_arguments():
169
169
  selection_group.add_argument("--select-strategy", "--variant-selection",
170
170
  dest="select_strategy", choices=["size", "diversity"], default="size",
171
171
  help="Variant selection strategy: size or diversity (default: size)")
172
+ selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
173
+ help="Minimum size ratio (variant/largest) to include in output "
174
+ "(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
172
175
  selection_group.add_argument("--enable-full-consensus", action="store_true",
173
176
  help="Generate a full consensus per variant group representing all variation "
174
177
  "from pre-merge variants (gaps never win)")
@@ -359,6 +362,18 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
359
362
  for group_idx, (group_id, group_members) in enumerate(sorted_groups):
360
363
  final_group_name = group_idx + 1
361
364
 
365
+ # Apply select-min-size-ratio filter
366
+ if args.select_min_size_ratio > 0 and len(group_members) > 1:
367
+ largest_size = max(v.size for v in group_members)
368
+ filtered = [v for v in group_members
369
+ if (v.size / largest_size) >= args.select_min_size_ratio]
370
+ if len(filtered) < len(group_members):
371
+ filtered_count = len(group_members) - len(filtered)
372
+ logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
373
+ f"variants with size ratio < {args.select_min_size_ratio} "
374
+ f"relative to largest (size={largest_size})")
375
+ group_members = filtered
376
+
362
377
  # Select variants for this group
363
378
  selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
364
379
 
@@ -380,6 +395,17 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
380
395
  # Generate full consensus from PRE-MERGE variants
381
396
  if getattr(args, 'enable_full_consensus', False):
382
397
  pre_merge_variants = variant_groups[group_id]
398
+
399
+ # Apply size-ratio filter (same as merge pipeline)
400
+ if args.merge_min_size_ratio > 0 and len(pre_merge_variants) > 1:
401
+ largest_size = max(v.size for v in pre_merge_variants)
402
+ filtered = [v for v in pre_merge_variants
403
+ if (v.size / largest_size) >= args.merge_min_size_ratio]
404
+ if len(filtered) < len(pre_merge_variants):
405
+ filtered_count = len(pre_merge_variants) - len(filtered)
406
+ logging.debug(f"Full consensus: filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
407
+ pre_merge_variants = filtered
408
+
383
409
  specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
384
410
  full_name = f"{specimen_base}-{group_idx + 1}.full"
385
411
 
@@ -450,6 +476,7 @@ def main():
450
476
  logging.info(f" --select-max-variants: {args.select_max_variants}")
451
477
  logging.info(f" --select-max-groups: {args.select_max_groups}")
452
478
  logging.info(f" --select-strategy: {args.select_strategy}")
479
+ logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
453
480
  logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
454
481
  logging.info(f" --log-level: {args.log_level}")
455
482
  logging.info("")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.3
3
+ Version: 0.7.4
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
175
175
  - `herbarium` — High-recall for degraded DNA/type specimens
176
176
  - `largedata` — Experimental settings for large input files
177
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
678
678
  - Output up to select_max_variants per group
679
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
680
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match that profile's 20% calling threshold
692
+
681
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
682
694
 
683
695
  ### Customizing FASTA Header Fields
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1059
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1060
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1061
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1062
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1063
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1064
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) IUPAC consensus from the pre-merge variants in each group, excluding any below `--merge-min-size-ratio` relative to the group's largest variant (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1065
1078
 
1066
1079
  **Key architectural features**:
1067
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1273
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1274
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1275
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1276
1290
  [--enable-full-consensus]
1277
1291
  [--disable-full-consensus]
1278
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1357,6 +1371,10 @@ Selection:
1357
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1358
1372
  Variant selection strategy: size or diversity
1359
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1360
1378
  --enable-full-consensus
1361
1379
  Generate a full consensus per variant group
1362
1380
  representing all variation from pre-merge variants
@@ -520,6 +520,114 @@ class TestFullConsensus:
520
520
  assert result.rid == 0.95
521
521
 
522
522
 
523
+ def test_full_consensus_filters_small_variants(self):
524
+ """Integration test: merge_min_size_ratio filters small variants from full consensus."""
525
+ temp_dir = tempfile.mkdtemp()
526
+ source_dir = os.path.join(temp_dir, "clusters")
527
+ summary_dir = os.path.join(temp_dir, "__Summary__")
528
+ os.makedirs(source_dir)
529
+
530
+ try:
531
+ # Two similar sequences (1 SNP at position 12: G vs A)
532
+ # Very different sizes so the small one is filtered by merge_min_size_ratio
533
+ seq_large = "ATCGATCGATCGATCGATCGATCG" # G at position 12
534
+ seq_small = "ATCGATCGATCAATCGATCGATCG" # A at position 12
535
+
536
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
537
+ {seq_large}
538
+ >test-c2 size=5 ric=5 primers=test
539
+ {seq_small}
540
+ """
541
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
542
+ with open(fasta_file, 'w') as f:
543
+ f.write(fasta_content)
544
+
545
+ # merge-min-size-ratio 0.1 filters 5/100=0.05 from full consensus
546
+ result = subprocess.run(
547
+ [
548
+ "speconsense-summarize",
549
+ "--source", source_dir,
550
+ "--summary-dir", summary_dir,
551
+ "--min-ric", "3",
552
+ "--enable-full-consensus",
553
+ "--merge-min-size-ratio", "0.1",
554
+ "--disable-merging",
555
+ "--min-merge-overlap", "0",
556
+ ],
557
+ capture_output=True,
558
+ text=True
559
+ )
560
+
561
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
562
+
563
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
564
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
565
+
566
+ full_seqs = [s for s in output_sequences if '.full' in s.id]
567
+ assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
568
+
569
+ # Small variant was filtered — .full should be the large variant only (no IUPAC)
570
+ full_seq_str = str(full_seqs[0].seq)
571
+ assert full_seq_str == seq_large, \
572
+ f"Expected large variant sequence, got {full_seq_str}"
573
+
574
+ finally:
575
+ shutil.rmtree(temp_dir)
576
+
577
+ def test_full_consensus_no_filter_when_disabled(self):
578
+ """Integration test: merge_min_size_ratio=0 preserves all variants in full consensus."""
579
+ temp_dir = tempfile.mkdtemp()
580
+ source_dir = os.path.join(temp_dir, "clusters")
581
+ summary_dir = os.path.join(temp_dir, "__Summary__")
582
+ os.makedirs(source_dir)
583
+
584
+ try:
585
+ # Same sequences as above — 1 SNP at position 12 (G vs A)
586
+ seq_large = "ATCGATCGATCGATCGATCGATCG"
587
+ seq_small = "ATCGATCGATCAATCGATCGATCG"
588
+
589
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
590
+ {seq_large}
591
+ >test-c2 size=5 ric=5 primers=test
592
+ {seq_small}
593
+ """
594
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
595
+ with open(fasta_file, 'w') as f:
596
+ f.write(fasta_content)
597
+
598
+ # merge-min-size-ratio 0 disables filtering — both contribute to .full
599
+ result = subprocess.run(
600
+ [
601
+ "speconsense-summarize",
602
+ "--source", source_dir,
603
+ "--summary-dir", summary_dir,
604
+ "--min-ric", "3",
605
+ "--enable-full-consensus",
606
+ "--merge-min-size-ratio", "0",
607
+ "--disable-merging",
608
+ "--min-merge-overlap", "0",
609
+ ],
610
+ capture_output=True,
611
+ text=True
612
+ )
613
+
614
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
615
+
616
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
617
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
618
+
619
+ full_seqs = [s for s in output_sequences if '.full' in s.id]
620
+ assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
621
+
622
+ # Both variants contribute — SNP position should be IUPAC R (A/G)
623
+ full_seq_str = str(full_seqs[0].seq)
624
+ assert "R" in full_seq_str, \
625
+ f"Expected IUPAC R (A/G) in full consensus, got {full_seq_str}"
626
+
627
+ finally:
628
+ shutil.rmtree(temp_dir)
629
+
630
+
523
631
  class TestFieldRegexFullConsensus:
524
632
  """Tests for GroupField and VariantField regex handling of .full names."""
525
633
 
@@ -578,6 +686,101 @@ class TestFieldRegexFullConsensus:
578
686
  assert field.format_value(cons) == "variant=v1"
579
687
 
580
688
 
689
+ class TestSelectMinSizeRatio:
690
+ """Tests for --select-min-size-ratio filtering."""
691
+
692
+ def test_select_min_size_ratio_filters_small_variants(self):
693
+ """Integration test: --select-min-size-ratio 0.1 filters out tiny variants."""
694
+ temp_dir = tempfile.mkdtemp()
695
+ source_dir = os.path.join(temp_dir, "clusters")
696
+ summary_dir = os.path.join(temp_dir, "__Summary__")
697
+ os.makedirs(source_dir)
698
+
699
+ try:
700
+ seq1 = "ATCGATCGATCGATCGATCGATCG"
701
+ seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP — different enough to not merge
702
+
703
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
704
+ {seq1}
705
+ >test-c2 size=3 ric=3 primers=test
706
+ {seq2}
707
+ """
708
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
709
+ with open(fasta_file, 'w') as f:
710
+ f.write(fasta_content)
711
+
712
+ result = subprocess.run(
713
+ [
714
+ "speconsense-summarize",
715
+ "--source", source_dir,
716
+ "--summary-dir", summary_dir,
717
+ "--min-ric", "3",
718
+ "--select-min-size-ratio", "0.1",
719
+ "--disable-merging",
720
+ ],
721
+ capture_output=True,
722
+ text=True
723
+ )
724
+
725
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
726
+
727
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
728
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
729
+
730
+ # Only the large variant should remain (3/100 = 0.03 < 0.1)
731
+ assert len(output_sequences) == 1, \
732
+ f"Expected 1 sequence after filtering, got {len(output_sequences)}"
733
+ assert "size=100" in output_sequences[0].description
734
+
735
+ finally:
736
+ shutil.rmtree(temp_dir)
737
+
738
+ def test_select_min_size_ratio_disabled_preserves_all(self):
739
+ """Integration test: --select-min-size-ratio 0 preserves all variants."""
740
+ temp_dir = tempfile.mkdtemp()
741
+ source_dir = os.path.join(temp_dir, "clusters")
742
+ summary_dir = os.path.join(temp_dir, "__Summary__")
743
+ os.makedirs(source_dir)
744
+
745
+ try:
746
+ seq1 = "ATCGATCGATCGATCGATCGATCG"
747
+ seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP
748
+
749
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
750
+ {seq1}
751
+ >test-c2 size=3 ric=3 primers=test
752
+ {seq2}
753
+ """
754
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
755
+ with open(fasta_file, 'w') as f:
756
+ f.write(fasta_content)
757
+
758
+ result = subprocess.run(
759
+ [
760
+ "speconsense-summarize",
761
+ "--source", source_dir,
762
+ "--summary-dir", summary_dir,
763
+ "--min-ric", "3",
764
+ "--select-min-size-ratio", "0",
765
+ "--disable-merging",
766
+ ],
767
+ capture_output=True,
768
+ text=True
769
+ )
770
+
771
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
772
+
773
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
774
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
775
+
776
+ # Both variants should be preserved
777
+ assert len(output_sequences) == 2, \
778
+ f"Expected 2 sequences with ratio=0, got {len(output_sequences)}"
779
+
780
+ finally:
781
+ shutil.rmtree(temp_dir)
782
+
783
+
581
784
  class TestFullConsensusIntegration:
582
785
  """Integration test for --enable-full-consensus."""
583
786
 
File without changes
File without changes