speconsense 0.7.3__tar.gz → 0.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {speconsense-0.7.3/speconsense.egg-info → speconsense-0.7.5}/PKG-INFO +26 -8
  2. {speconsense-0.7.3 → speconsense-0.7.5}/README.md +25 -7
  3. {speconsense-0.7.3 → speconsense-0.7.5}/pyproject.toml +1 -1
  4. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/__init__.py +1 -1
  5. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/__init__.py +1 -0
  6. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/compressed.yaml +1 -0
  7. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/example.yaml +1 -0
  8. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/cli.py +29 -2
  9. {speconsense-0.7.3 → speconsense-0.7.5/speconsense.egg-info}/PKG-INFO +26 -8
  10. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_summarize.py +203 -0
  11. {speconsense-0.7.3 → speconsense-0.7.5}/LICENSE +0 -0
  12. {speconsense-0.7.3 → speconsense-0.7.5}/setup.cfg +0 -0
  13. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/cli.py +0 -0
  14. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/core/__init__.py +0 -0
  15. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/core/__main__.py +0 -0
  16. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/core/cli.py +0 -0
  17. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/core/clusterer.py +0 -0
  18. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/core/workers.py +0 -0
  19. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/msa.py +0 -0
  20. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/herbarium.yaml +0 -0
  21. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/largedata.yaml +0 -0
  22. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/nostalgia.yaml +0 -0
  23. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/profiles/strict.yaml +0 -0
  24. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/quality_report.py +0 -0
  25. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/scalability/__init__.py +0 -0
  26. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/scalability/base.py +0 -0
  27. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/scalability/config.py +0 -0
  28. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/scalability/vsearch.py +0 -0
  29. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/__init__.py +0 -0
  30. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/__main__.py +0 -0
  31. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/analysis.py +0 -0
  32. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/clustering.py +0 -0
  33. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/fields.py +0 -0
  34. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/io.py +0 -0
  35. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/iupac.py +0 -0
  36. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/summarize/merging.py +0 -0
  37. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/synth.py +0 -0
  38. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense/types.py +0 -0
  39. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense.egg-info/SOURCES.txt +0 -0
  40. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense.egg-info/dependency_links.txt +0 -0
  41. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense.egg-info/entry_points.txt +0 -0
  42. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense.egg-info/requires.txt +0 -0
  43. {speconsense-0.7.3 → speconsense-0.7.5}/speconsense.egg-info/top_level.txt +0 -0
  44. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_ambiguity_calling.py +0 -0
  45. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_augment_input.py +0 -0
  46. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_complement_flags.py +0 -0
  47. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_fields.py +0 -0
  48. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_haplotype_filtering.py +0 -0
  49. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_orientation.py +0 -0
  50. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_overlap_merge.py +0 -0
  51. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_overlap_merge_integration.py +0 -0
  52. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_profiles.py +0 -0
  53. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_regression.py +0 -0
  54. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_synth.py +0 -0
  55. {speconsense-0.7.3 → speconsense-0.7.5}/tests/test_variant_phasing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
175
175
  - `herbarium` — High-recall for degraded DNA/type specimens
176
176
  - `largedata` — Experimental settings for large input files
177
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
295
295
  |---------------|-------------|------------|-------------|
296
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
297
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
299
299
 
300
300
  ### Example Directory Structure
301
301
  ```
302
302
  __Summary__/
303
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
304
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
306
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
307
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
308
308
  ├── summary.txt # Statistics
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
678
678
  - Output up to select_max_variants per group
679
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
680
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
692
+
681
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
682
694
 
683
695
  ### Customizing FASTA Header Fields
@@ -817,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
817
829
  ```bash
818
830
  speconsense-summarize --enable-full-consensus
819
831
  ```
820
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
832
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
821
833
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
822
834
  - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
823
835
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1059
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1060
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1061
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1062
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1063
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1064
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1065
1078
 
1066
1079
  **Key architectural features**:
1067
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1273
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1274
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1275
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1276
1290
  [--enable-full-consensus]
1277
1291
  [--disable-full-consensus]
1278
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1357,6 +1371,10 @@ Selection:
1357
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1358
1372
  Variant selection strategy: size or diversity
1359
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1360
1378
  --enable-full-consensus
1361
1379
  Generate a full consensus per variant group
1362
1380
  representing all variation from pre-merge variants
@@ -136,7 +136,7 @@ speconsense input.fastq -p herbarium --min-size 10
136
136
  ```
137
137
 
138
138
  **Bundled profiles:**
139
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
139
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
140
140
  - `herbarium` — High-recall for degraded DNA/type specimens
141
141
  - `largedata` — Experimental settings for large input files
142
142
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -260,14 +260,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
260
260
  |---------------|-------------|------------|-------------|
261
261
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
262
262
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
263
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
263
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
264
264
 
265
265
  ### Example Directory Structure
266
266
  ```
267
267
  __Summary__/
268
268
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
269
269
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
270
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
270
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
271
271
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
272
272
  ├── summary.fasta # All final consensus sequences (excludes .raw)
273
273
  ├── summary.txt # Statistics
@@ -643,6 +643,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
643
643
  - Output up to select_max_variants per group
644
644
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
645
645
 
646
+ **Selection Size Ratio Filtering:**
647
+ ```bash
648
+ speconsense-summarize --select-min-size-ratio 0.2
649
+ ```
650
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
651
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
652
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
653
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
654
+ - Applied after merging but before variant selection
655
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
656
+ - Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
657
+
646
658
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
647
659
 
648
660
  ### Customizing FASTA Header Fields
@@ -782,7 +794,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
782
794
  ```bash
783
795
  speconsense-summarize --enable-full-consensus
784
796
  ```
785
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
797
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
786
798
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
787
799
  - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
788
800
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1024,9 +1036,10 @@ The complete speconsense-summarize workflow operates in this order:
1024
1036
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1025
1037
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1026
1038
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1027
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1028
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1029
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1039
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1040
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1041
+ 7. **Full consensus generation** (optional) IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1042
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1030
1043
 
1031
1044
  **Key architectural features**:
1032
1045
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1238,6 +1251,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1238
1251
  [--select-max-groups SELECT_MAX_GROUPS]
1239
1252
  [--select-max-variants SELECT_MAX_VARIANTS]
1240
1253
  [--select-strategy {size,diversity}]
1254
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1241
1255
  [--enable-full-consensus]
1242
1256
  [--disable-full-consensus]
1243
1257
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1322,6 +1336,10 @@ Selection:
1322
1336
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1323
1337
  Variant selection strategy: size or diversity
1324
1338
  (default: size)
1339
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1340
+ Minimum size ratio (variant/largest) to include in
1341
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1342
+ cutoff)
1325
1343
  --enable-full-consensus
1326
1344
  Generate a full consensus per variant group
1327
1345
  representing all variation from pre-merge variants
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "speconsense"
7
- version = "0.7.3"
7
+ version = "0.7.5"
8
8
  description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.3"
8
+ __version__ = "0.7.5"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
@@ -103,6 +103,7 @@ VALID_SUMMARIZE_KEYS = {
103
103
  "select-max-groups",
104
104
  "select-max-variants",
105
105
  "select-strategy",
106
+ "select-min-size-ratio",
106
107
  "enable-full-consensus",
107
108
  # Processing
108
109
  "scale-threshold",
@@ -23,5 +23,6 @@ speconsense-summarize:
23
23
  merge-indel-length: 5 # Merge indels up to 5bp
24
24
  merge-position-count: 10 # Allow up to 10 variant positions in a merge
25
25
  merge-min-size-ratio: 0.2 # Match 20% calling threshold
26
+ select-min-size-ratio: 0.2 # Match 20% calling threshold
26
27
  min-merge-overlap: 0 # Disable partial overlap merging
27
28
  enable-full-consensus: true # Include full IUPAC consensus per group
@@ -91,6 +91,7 @@ speconsense-summarize:
91
91
  # select-max-groups: -1 # Max groups to output (-1 = no limit)
92
92
  # select-max-variants: -1 # Max variants per group (-1 = no limit)
93
93
  # select-strategy: size # Selection strategy: size or diversity
94
+ # select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
94
95
 
95
96
  # --- Processing ---
96
97
  # threads: 0 # Max threads (0 = auto-detect)
@@ -169,6 +169,9 @@ def parse_arguments():
169
169
  selection_group.add_argument("--select-strategy", "--variant-selection",
170
170
  dest="select_strategy", choices=["size", "diversity"], default="size",
171
171
  help="Variant selection strategy: size or diversity (default: size)")
172
+ selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
173
+ help="Minimum size ratio (variant/largest) to include in output "
174
+ "(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
172
175
  selection_group.add_argument("--enable-full-consensus", action="store_true",
173
176
  help="Generate a full consensus per variant group representing all variation "
174
177
  "from pre-merge variants (gaps never win)")
@@ -359,6 +362,18 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
359
362
  for group_idx, (group_id, group_members) in enumerate(sorted_groups):
360
363
  final_group_name = group_idx + 1
361
364
 
365
+ # Apply select-min-size-ratio filter
366
+ if args.select_min_size_ratio > 0 and len(group_members) > 1:
367
+ largest_size = max(v.size for v in group_members)
368
+ filtered = [v for v in group_members
369
+ if (v.size / largest_size) >= args.select_min_size_ratio]
370
+ if len(filtered) < len(group_members):
371
+ filtered_count = len(group_members) - len(filtered)
372
+ logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
373
+ f"variants with size ratio < {args.select_min_size_ratio} "
374
+ f"relative to largest (size={largest_size})")
375
+ group_members = filtered
376
+
362
377
  # Select variants for this group
363
378
  selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
364
379
 
@@ -377,9 +392,20 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
377
392
  final_consensus.append(renamed_variant)
378
393
  group_naming.append((variant.sample_name, new_name))
379
394
 
380
- # Generate full consensus from PRE-MERGE variants
395
+ # Generate full consensus from PRE-MERGE variants that contributed
396
+ # to surviving post-merge variants (after select-min-size-ratio)
381
397
  if getattr(args, 'enable_full_consensus', False):
382
- pre_merge_variants = variant_groups[group_id]
398
+ # Collect original cluster names from surviving post-merge variants
399
+ surviving_originals = set()
400
+ for v in group_members:
401
+ if v.sample_name in all_merge_traceability:
402
+ surviving_originals.update(all_merge_traceability[v.sample_name])
403
+ else:
404
+ surviving_originals.add(v.sample_name)
405
+
406
+ pre_merge_variants = [v for v in variant_groups[group_id]
407
+ if v.sample_name in surviving_originals]
408
+
383
409
  specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
384
410
  full_name = f"{specimen_base}-{group_idx + 1}.full"
385
411
 
@@ -450,6 +476,7 @@ def main():
450
476
  logging.info(f" --select-max-variants: {args.select_max_variants}")
451
477
  logging.info(f" --select-max-groups: {args.select_max_groups}")
452
478
  logging.info(f" --select-strategy: {args.select_strategy}")
479
+ logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
453
480
  logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
454
481
  logging.info(f" --log-level: {args.log_level}")
455
482
  logging.info("")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
175
175
  - `herbarium` — High-recall for degraded DNA/type specimens
176
176
  - `largedata` — Experimental settings for large input files
177
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
295
295
  |---------------|-------------|------------|-------------|
296
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
297
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
299
299
 
300
300
  ### Example Directory Structure
301
301
  ```
302
302
  __Summary__/
303
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
304
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
306
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
307
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
308
308
  ├── summary.txt # Statistics
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
678
678
  - Output up to select_max_variants per group
679
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
680
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% of the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
692
+
681
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
682
694
 
683
695
  ### Customizing FASTA Header Fields
@@ -817,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
817
829
  ```bash
818
830
  speconsense-summarize --enable-full-consensus
819
831
  ```
820
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
832
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
821
833
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
822
834
  - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
823
835
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1059
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1060
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1061
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1062
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1063
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1064
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1065
1078
 
1066
1079
  **Key architectural features**:
1067
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1273
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1274
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1275
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1276
1290
  [--enable-full-consensus]
1277
1291
  [--disable-full-consensus]
1278
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1357,6 +1371,10 @@ Selection:
1357
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1358
1372
  Variant selection strategy: size or diversity
1359
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1360
1378
  --enable-full-consensus
1361
1379
  Generate a full consensus per variant group
1362
1380
  representing all variation from pre-merge variants
@@ -520,6 +520,114 @@ class TestFullConsensus:
520
520
  assert result.rid == 0.95
521
521
 
522
522
 
523
+ def test_full_consensus_filters_small_variants(self):
524
+ """Integration test: select_min_size_ratio filters small variants from full consensus."""
525
+ temp_dir = tempfile.mkdtemp()
526
+ source_dir = os.path.join(temp_dir, "clusters")
527
+ summary_dir = os.path.join(temp_dir, "__Summary__")
528
+ os.makedirs(source_dir)
529
+
530
+ try:
531
+ # Two similar sequences (1 SNP at position 12: G vs A)
532
+ # Very different sizes so the small one is filtered by select_min_size_ratio
533
+ seq_large = "ATCGATCGATCGATCGATCGATCG" # G at position 12
534
+ seq_small = "ATCGATCGATCAATCGATCGATCG" # A at position 12
535
+
536
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
537
+ {seq_large}
538
+ >test-c2 size=5 ric=5 primers=test
539
+ {seq_small}
540
+ """
541
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
542
+ with open(fasta_file, 'w') as f:
543
+ f.write(fasta_content)
544
+
545
+ # select-min-size-ratio 0.1 filters 5/100=0.05 post-merge variant,
546
+ # so its pre-merge components are excluded from .full consensus
547
+ result = subprocess.run(
548
+ [
549
+ "speconsense-summarize",
550
+ "--source", source_dir,
551
+ "--summary-dir", summary_dir,
552
+ "--min-ric", "3",
553
+ "--enable-full-consensus",
554
+ "--select-min-size-ratio", "0.1",
555
+ "--disable-merging",
556
+ "--min-merge-overlap", "0",
557
+ ],
558
+ capture_output=True,
559
+ text=True
560
+ )
561
+
562
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
563
+
564
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
565
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
566
+
567
+ full_seqs = [s for s in output_sequences if '.full' in s.id]
568
+ assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
569
+
570
+ # Small variant was filtered — .full should be the large variant only (no IUPAC)
571
+ full_seq_str = str(full_seqs[0].seq)
572
+ assert full_seq_str == seq_large, \
573
+ f"Expected large variant sequence, got {full_seq_str}"
574
+
575
+ finally:
576
+ shutil.rmtree(temp_dir)
577
+
578
+ def test_full_consensus_no_filter_when_all_survive(self):
579
+ """Integration test: all post-merge variants surviving means all contribute to .full."""
580
+ temp_dir = tempfile.mkdtemp()
581
+ source_dir = os.path.join(temp_dir, "clusters")
582
+ summary_dir = os.path.join(temp_dir, "__Summary__")
583
+ os.makedirs(source_dir)
584
+
585
+ try:
586
+ # Same sequences as above — 1 SNP at position 12 (G vs A)
587
+ seq_large = "ATCGATCGATCGATCGATCGATCG"
588
+ seq_small = "ATCGATCGATCAATCGATCGATCG"
589
+
590
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
591
+ {seq_large}
592
+ >test-c2 size=5 ric=5 primers=test
593
+ {seq_small}
594
+ """
595
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
596
+ with open(fasta_file, 'w') as f:
597
+ f.write(fasta_content)
598
+
599
+ # No select-min-size-ratio — both variants survive, both contribute to .full
600
+ result = subprocess.run(
601
+ [
602
+ "speconsense-summarize",
603
+ "--source", source_dir,
604
+ "--summary-dir", summary_dir,
605
+ "--min-ric", "3",
606
+ "--enable-full-consensus",
607
+ "--disable-merging",
608
+ "--min-merge-overlap", "0",
609
+ ],
610
+ capture_output=True,
611
+ text=True
612
+ )
613
+
614
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
615
+
616
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
617
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
618
+
619
+ full_seqs = [s for s in output_sequences if '.full' in s.id]
620
+ assert len(full_seqs) == 1, f"Expected 1 .full sequence, got {len(full_seqs)}"
621
+
622
+ # Both variants contribute — SNP position should be IUPAC R (A/G)
623
+ full_seq_str = str(full_seqs[0].seq)
624
+ assert "R" in full_seq_str, \
625
+ f"Expected IUPAC R (A/G) in full consensus, got {full_seq_str}"
626
+
627
+ finally:
628
+ shutil.rmtree(temp_dir)
629
+
630
+
523
631
  class TestFieldRegexFullConsensus:
524
632
  """Tests for GroupField and VariantField regex handling of .full names."""
525
633
 
@@ -578,6 +686,101 @@ class TestFieldRegexFullConsensus:
578
686
  assert field.format_value(cons) == "variant=v1"
579
687
 
580
688
 
689
+ class TestSelectMinSizeRatio:
690
+ """Tests for --select-min-size-ratio filtering."""
691
+
692
+ def test_select_min_size_ratio_filters_small_variants(self):
693
+ """Integration test: --select-min-size-ratio 0.1 filters out tiny variants."""
694
+ temp_dir = tempfile.mkdtemp()
695
+ source_dir = os.path.join(temp_dir, "clusters")
696
+ summary_dir = os.path.join(temp_dir, "__Summary__")
697
+ os.makedirs(source_dir)
698
+
699
+ try:
700
+ seq1 = "ATCGATCGATCGATCGATCGATCG"
701
+ seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP; stays a separate variant because merging is disabled below
702
+
703
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
704
+ {seq1}
705
+ >test-c2 size=3 ric=3 primers=test
706
+ {seq2}
707
+ """
708
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
709
+ with open(fasta_file, 'w') as f:
710
+ f.write(fasta_content)
711
+
712
+ result = subprocess.run(
713
+ [
714
+ "speconsense-summarize",
715
+ "--source", source_dir,
716
+ "--summary-dir", summary_dir,
717
+ "--min-ric", "3",
718
+ "--select-min-size-ratio", "0.1",
719
+ "--disable-merging",
720
+ ],
721
+ capture_output=True,
722
+ text=True
723
+ )
724
+
725
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
726
+
727
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
728
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
729
+
730
+ # Only the large variant should remain (3/100 = 0.03 < 0.1)
731
+ assert len(output_sequences) == 1, \
732
+ f"Expected 1 sequence after filtering, got {len(output_sequences)}"
733
+ assert "size=100" in output_sequences[0].description
734
+
735
+ finally:
736
+ shutil.rmtree(temp_dir)
737
+
738
+ def test_select_min_size_ratio_disabled_preserves_all(self):
739
+ """Integration test: --select-min-size-ratio 0 preserves all variants."""
740
+ temp_dir = tempfile.mkdtemp()
741
+ source_dir = os.path.join(temp_dir, "clusters")
742
+ summary_dir = os.path.join(temp_dir, "__Summary__")
743
+ os.makedirs(source_dir)
744
+
745
+ try:
746
+ seq1 = "ATCGATCGATCGATCGATCGATCG"
747
+ seq2 = "ATCGATCGATCAATCGATCGATCG" # One SNP
748
+
749
+ fasta_content = f""">test-c1 size=100 ric=100 primers=test
750
+ {seq1}
751
+ >test-c2 size=3 ric=3 primers=test
752
+ {seq2}
753
+ """
754
+ fasta_file = os.path.join(source_dir, "test-all.fasta")
755
+ with open(fasta_file, 'w') as f:
756
+ f.write(fasta_content)
757
+
758
+ result = subprocess.run(
759
+ [
760
+ "speconsense-summarize",
761
+ "--source", source_dir,
762
+ "--summary-dir", summary_dir,
763
+ "--min-ric", "3",
764
+ "--select-min-size-ratio", "0",
765
+ "--disable-merging",
766
+ ],
767
+ capture_output=True,
768
+ text=True
769
+ )
770
+
771
+ assert result.returncode == 0, f"speconsense-summarize failed: {result.stderr}"
772
+
773
+ output_fasta = os.path.join(summary_dir, "summary.fasta")
774
+ output_sequences = list(SeqIO.parse(output_fasta, "fasta"))
775
+
776
+ # Both variants should be preserved
777
+ assert len(output_sequences) == 2, \
778
+ f"Expected 2 sequences with ratio=0, got {len(output_sequences)}"
779
+
780
+ finally:
781
+ shutil.rmtree(temp_dir)
782
+
783
+
581
784
  class TestFullConsensusIntegration:
582
785
  """Integration test for --enable-full-consensus."""
583
786
 
File without changes
File without changes