speconsense 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
speconsense/__init__.py CHANGED
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.3"
8
+ __version__ = "0.7.5"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
@@ -103,6 +103,7 @@ VALID_SUMMARIZE_KEYS = {
103
103
  "select-max-groups",
104
104
  "select-max-variants",
105
105
  "select-strategy",
106
+ "select-min-size-ratio",
106
107
  "enable-full-consensus",
107
108
  # Processing
108
109
  "scale-threshold",
@@ -23,5 +23,6 @@ speconsense-summarize:
23
23
  merge-indel-length: 5 # Merge indels up to 5bp
24
24
  merge-position-count: 10 # Allow up to 10 variant positions in a merge
25
25
  merge-min-size-ratio: 0.2 # Match 20% calling threshold
26
+ select-min-size-ratio: 0.2 # Match 20% calling threshold
26
27
  min-merge-overlap: 0 # Disable partial overlap merging
27
28
  enable-full-consensus: true # Include full IUPAC consensus per group
@@ -91,6 +91,7 @@ speconsense-summarize:
91
91
  # select-max-groups: -1 # Max groups to output (-1 = no limit)
92
92
  # select-max-variants: -1 # Max variants per group (-1 = no limit)
93
93
  # select-strategy: size # Selection strategy: size or diversity
94
+ # select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
94
95
 
95
96
  # --- Processing ---
96
97
  # threads: 0 # Max threads (0 = auto-detect)
@@ -169,6 +169,9 @@ def parse_arguments():
169
169
  selection_group.add_argument("--select-strategy", "--variant-selection",
170
170
  dest="select_strategy", choices=["size", "diversity"], default="size",
171
171
  help="Variant selection strategy: size or diversity (default: size)")
172
+ selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
173
+ help="Minimum size ratio (variant/largest) to include in output "
174
+ "(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
172
175
  selection_group.add_argument("--enable-full-consensus", action="store_true",
173
176
  help="Generate a full consensus per variant group representing all variation "
174
177
  "from pre-merge variants (gaps never win)")
@@ -359,6 +362,18 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
359
362
  for group_idx, (group_id, group_members) in enumerate(sorted_groups):
360
363
  final_group_name = group_idx + 1
361
364
 
365
+ # Apply select-min-size-ratio filter
366
+ if args.select_min_size_ratio > 0 and len(group_members) > 1:
367
+ largest_size = max(v.size for v in group_members)
368
+ filtered = [v for v in group_members
369
+ if (v.size / largest_size) >= args.select_min_size_ratio]
370
+ if len(filtered) < len(group_members):
371
+ filtered_count = len(group_members) - len(filtered)
372
+ logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
373
+ f"variants with size ratio < {args.select_min_size_ratio} "
374
+ f"relative to largest (size={largest_size})")
375
+ group_members = filtered
376
+
362
377
  # Select variants for this group
363
378
  selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
364
379
 
@@ -377,9 +392,20 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
377
392
  final_consensus.append(renamed_variant)
378
393
  group_naming.append((variant.sample_name, new_name))
379
394
 
380
- # Generate full consensus from PRE-MERGE variants
395
+ # Generate full consensus from PRE-MERGE variants that contributed
396
+ # to surviving post-merge variants (after select-min-size-ratio)
381
397
  if getattr(args, 'enable_full_consensus', False):
382
- pre_merge_variants = variant_groups[group_id]
398
+ # Collect original cluster names from surviving post-merge variants
399
+ surviving_originals = set()
400
+ for v in group_members:
401
+ if v.sample_name in all_merge_traceability:
402
+ surviving_originals.update(all_merge_traceability[v.sample_name])
403
+ else:
404
+ surviving_originals.add(v.sample_name)
405
+
406
+ pre_merge_variants = [v for v in variant_groups[group_id]
407
+ if v.sample_name in surviving_originals]
408
+
383
409
  specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
384
410
  full_name = f"{specimen_base}-{group_idx + 1}.full"
385
411
 
@@ -450,6 +476,7 @@ def main():
450
476
  logging.info(f" --select-max-variants: {args.select_max_variants}")
451
477
  logging.info(f" --select-max-groups: {args.select_max_groups}")
452
478
  logging.info(f" --select-strategy: {args.select_strategy}")
479
+ logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
453
480
  logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
454
481
  logging.info(f" --log-level: {args.log_level}")
455
482
  logging.info("")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
- - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
175
175
  - `herbarium` — High-recall for degraded DNA/type specimens
176
176
  - `largedata` — Experimental settings for large input files
177
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
295
295
  |---------------|-------------|------------|-------------|
296
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
297
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
299
299
 
300
300
  ### Example Directory Structure
301
301
  ```
302
302
  __Summary__/
303
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
304
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
306
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
307
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
308
308
  ├── summary.txt # Statistics
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
678
678
  - Output up to select_max_variants per group
679
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
680
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
692
+
681
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
682
694
 
683
695
  ### Customizing FASTA Header Fields
@@ -817,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
817
829
  ```bash
818
830
  speconsense-summarize --enable-full-consensus
819
831
  ```
820
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
832
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
821
833
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
822
834
  - Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
823
835
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1059
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1060
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1061
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1062
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1063
- 6. **Full consensus generation** (optional) IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1064
- 7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1065
1078
 
1066
1079
  **Key architectural features**:
1067
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1273
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1274
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1275
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1276
1290
  [--enable-full-consensus]
1277
1291
  [--disable-full-consensus]
1278
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
@@ -1357,6 +1371,10 @@ Selection:
1357
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1358
1372
  Variant selection strategy: size or diversity
1359
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1360
1378
  --enable-full-consensus
1361
1379
  Generate a full consensus per variant group
1362
1380
  representing all variation from pre-merge variants
@@ -1,4 +1,4 @@
1
- speconsense/__init__.py,sha256=kKZUeH8GW0eHErdEOPuDsMEF9qqfUQjy3wntewndiO8,537
1
+ speconsense/__init__.py,sha256=qxithGom21C3MjbApvgOzVSIRFqw4jReMKZqipfY-Kk,537
2
2
  speconsense/cli.py,sha256=Kqb2da0IuazocAz72iqTnw71jI7UaQgxsHfb9CwiolU,85
3
3
  speconsense/msa.py,sha256=t1uDb-Tj5tDnB17QnNZPslpAiLXgAMIlnmMKBbwBKzs,31661
4
4
  speconsense/quality_report.py,sha256=Byrc115T03ybi7mpA0Bw8-gc83nhKPzDY0tyH1IIAMQ,19803
@@ -9,9 +9,9 @@ speconsense/core/__main__.py,sha256=dCfyQkVxxwlP6QqcWw9y5zp5iLzkG-fQsLmFHHEUlbI,
9
9
  speconsense/core/cli.py,sha256=iepQMK0ZUhZvQShVZY_6WaHneR8ZIRKZ_b6NvVwaRwU,17186
10
10
  speconsense/core/clusterer.py,sha256=UFK5Ec0oMQ7l3GsFJOAhTFk7r90eOOdOBXRskm79Fwk,72093
11
11
  speconsense/core/workers.py,sha256=6pUyt-W9KxkillJ6TU1RjRh-_L-zRIwWqzIcBSeiOSc,25811
12
- speconsense/profiles/__init__.py,sha256=SPQ_Xh5QTC-HV0hYyHyJ_oEIWREz74Zmx_vcRTIJwIA,16353
13
- speconsense/profiles/compressed.yaml,sha256=UZx7hgu9SQQ2fvId8B60NlQmpDkrJakC45vWpIfVh_4,1165
14
- speconsense/profiles/example.yaml,sha256=FGCsSAVtZL_m8PHWVLVrxaVCCwQkIk8wR7N7I2ZyxX0,4463
12
+ speconsense/profiles/__init__.py,sha256=5UWj6VyUIXTzQ1kBZ4mJ2olZ_ADMK85rwr7KEmRfZfk,16382
13
+ speconsense/profiles/compressed.yaml,sha256=LKtBm6nj8cpF2xeFcA7vzzNzaXdEo0JknnmcDDmdFj8,1227
14
+ speconsense/profiles/example.yaml,sha256=UGHoVvFiB6iQ-lUU4rwInL6oE1eAd7Fo5qp14vfXJvA,4546
15
15
  speconsense/profiles/herbarium.yaml,sha256=1OyAPvBZmJ0eWHejfTU_NLd1_08F9n5WbeE686mzYGE,1125
16
16
  speconsense/profiles/largedata.yaml,sha256=7qwl5CHA7BiFcznycUoprOX_A-qrsZzV5fBLnA3QmcE,884
17
17
  speconsense/profiles/nostalgia.yaml,sha256=Hy20M88FiCmDvscyIKbwfNSusiHptmBm4pIWPiSFmp0,661
@@ -23,15 +23,15 @@ speconsense/scalability/vsearch.py,sha256=I1IzTeRzEFn9bi8mNbBRvtcHvUBzBFdE7D5yf-
23
23
  speconsense/summarize/__init__.py,sha256=PE6W9hytDxhkw7W6Fz8X3jd92N2VdhuxiQ72Nqm1xC0,3181
24
24
  speconsense/summarize/__main__.py,sha256=_hzLNqNtv4PirL1oMic37GW2QmjWquoznzNtld_3FiQ,117
25
25
  speconsense/summarize/analysis.py,sha256=1MXtKMpX1bgKEtI-JN6BwTQj99qyt1eQLqNg51EgPiE,31560
26
- speconsense/summarize/cli.py,sha256=ts-N8i7fGTkB-qsvR-HX4egaxambHGq43Z_hjSZ8znU,25327
26
+ speconsense/summarize/cli.py,sha256=JDgLQl8zFlbkH-Oa_n68lbUTB-cLBWzl3dphn041ph0,26961
27
27
  speconsense/summarize/clustering.py,sha256=kk-FdFCea8KRocowN_4dt_aoqZNVJMmEu7CVKPfYgK8,28346
28
28
  speconsense/summarize/fields.py,sha256=a6aK9hkPJ-sDRRSqM_7IkyqCki99KSMnsQMV-U7r2zY,8687
29
29
  speconsense/summarize/io.py,sha256=FdHLbcj0NOL3WE1e5OL85DRdJaHpyXPMcmlNg9mG3tM,32732
30
30
  speconsense/summarize/iupac.py,sha256=Y6KqELmnGy4Eya4C_4ldXY8uek0ReuSUgITLI3NW0-A,11042
31
31
  speconsense/summarize/merging.py,sha256=FakBey3qpu7ULPIsc2GDo9WG8jNU1L6q2pgQ2HrOKXk,28454
32
- speconsense-0.7.3.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
33
- speconsense-0.7.3.dist-info/METADATA,sha256=GdljwvAiXAA27Ei6dSE4PXcC1CH6WdKPCVVuYnd9x08,78804
34
- speconsense-0.7.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
35
- speconsense-0.7.3.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
36
- speconsense-0.7.3.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
37
- speconsense-0.7.3.dist-info/RECORD,,
32
+ speconsense-0.7.5.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
33
+ speconsense-0.7.5.dist-info/METADATA,sha256=JiEg26k5JEJUFSR5hw4FHHysL4ei9PPyvVXsGBRBZVc,80041
34
+ speconsense-0.7.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
35
+ speconsense-0.7.5.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
36
+ speconsense-0.7.5.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
37
+ speconsense-0.7.5.dist-info/RECORD,,