speconsense 0.7.3__py3-none-any.whl → 0.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +1 -1
- speconsense/profiles/__init__.py +1 -0
- speconsense/profiles/compressed.yaml +1 -0
- speconsense/profiles/example.yaml +1 -0
- speconsense/summarize/cli.py +29 -2
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/METADATA +26 -8
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/RECORD +11 -11
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/WHEEL +0 -0
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/entry_points.txt +0 -0
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/licenses/LICENSE +0 -0
- {speconsense-0.7.3.dist-info → speconsense-0.7.5.dist-info}/top_level.txt +0 -0
speconsense/__init__.py
CHANGED
speconsense/profiles/__init__.py
CHANGED
|
@@ -23,5 +23,6 @@ speconsense-summarize:
|
|
|
23
23
|
merge-indel-length: 5 # Merge indels up to 5bp
|
|
24
24
|
merge-position-count: 10 # Allow up to 10 variant positions in a merge
|
|
25
25
|
merge-min-size-ratio: 0.2 # Match 20% calling threshold
|
|
26
|
+
select-min-size-ratio: 0.2 # Match 20% calling threshold
|
|
26
27
|
min-merge-overlap: 0 # Disable partial overlap merging
|
|
27
28
|
enable-full-consensus: true # Include full IUPAC consensus per group
|
|
@@ -91,6 +91,7 @@ speconsense-summarize:
|
|
|
91
91
|
# select-max-groups: -1 # Max groups to output (-1 = no limit)
|
|
92
92
|
# select-max-variants: -1 # Max variants per group (-1 = no limit)
|
|
93
93
|
# select-strategy: size # Selection strategy: size or diversity
|
|
94
|
+
# select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
|
|
94
95
|
|
|
95
96
|
# --- Processing ---
|
|
96
97
|
# threads: 0 # Max threads (0 = auto-detect)
|
speconsense/summarize/cli.py
CHANGED
|
@@ -169,6 +169,9 @@ def parse_arguments():
|
|
|
169
169
|
selection_group.add_argument("--select-strategy", "--variant-selection",
|
|
170
170
|
dest="select_strategy", choices=["size", "diversity"], default="size",
|
|
171
171
|
help="Variant selection strategy: size or diversity (default: size)")
|
|
172
|
+
selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
|
|
173
|
+
help="Minimum size ratio (variant/largest) to include in output "
|
|
174
|
+
"(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
|
|
172
175
|
selection_group.add_argument("--enable-full-consensus", action="store_true",
|
|
173
176
|
help="Generate a full consensus per variant group representing all variation "
|
|
174
177
|
"from pre-merge variants (gaps never win)")
|
|
@@ -359,6 +362,18 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
359
362
|
for group_idx, (group_id, group_members) in enumerate(sorted_groups):
|
|
360
363
|
final_group_name = group_idx + 1
|
|
361
364
|
|
|
365
|
+
# Apply select-min-size-ratio filter
|
|
366
|
+
if args.select_min_size_ratio > 0 and len(group_members) > 1:
|
|
367
|
+
largest_size = max(v.size for v in group_members)
|
|
368
|
+
filtered = [v for v in group_members
|
|
369
|
+
if (v.size / largest_size) >= args.select_min_size_ratio]
|
|
370
|
+
if len(filtered) < len(group_members):
|
|
371
|
+
filtered_count = len(group_members) - len(filtered)
|
|
372
|
+
logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
|
|
373
|
+
f"variants with size ratio < {args.select_min_size_ratio} "
|
|
374
|
+
f"relative to largest (size={largest_size})")
|
|
375
|
+
group_members = filtered
|
|
376
|
+
|
|
362
377
|
# Select variants for this group
|
|
363
378
|
selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
|
|
364
379
|
|
|
@@ -377,9 +392,20 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
|
|
|
377
392
|
final_consensus.append(renamed_variant)
|
|
378
393
|
group_naming.append((variant.sample_name, new_name))
|
|
379
394
|
|
|
380
|
-
# Generate full consensus from PRE-MERGE variants
|
|
395
|
+
# Generate full consensus from PRE-MERGE variants that contributed
|
|
396
|
+
# to surviving post-merge variants (after select-min-size-ratio)
|
|
381
397
|
if getattr(args, 'enable_full_consensus', False):
|
|
382
|
-
|
|
398
|
+
# Collect original cluster names from surviving post-merge variants
|
|
399
|
+
surviving_originals = set()
|
|
400
|
+
for v in group_members:
|
|
401
|
+
if v.sample_name in all_merge_traceability:
|
|
402
|
+
surviving_originals.update(all_merge_traceability[v.sample_name])
|
|
403
|
+
else:
|
|
404
|
+
surviving_originals.add(v.sample_name)
|
|
405
|
+
|
|
406
|
+
pre_merge_variants = [v for v in variant_groups[group_id]
|
|
407
|
+
if v.sample_name in surviving_originals]
|
|
408
|
+
|
|
383
409
|
specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
|
|
384
410
|
full_name = f"{specimen_base}-{group_idx + 1}.full"
|
|
385
411
|
|
|
@@ -450,6 +476,7 @@ def main():
|
|
|
450
476
|
logging.info(f" --select-max-variants: {args.select_max_variants}")
|
|
451
477
|
logging.info(f" --select-max-groups: {args.select_max_groups}")
|
|
452
478
|
logging.info(f" --select-strategy: {args.select_strategy}")
|
|
479
|
+
logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
|
|
453
480
|
logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
|
|
454
481
|
logging.info(f" --log-level: {args.log_level}")
|
|
455
482
|
logging.info("")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: speconsense
|
|
3
|
-
Version: 0.7.3
|
|
3
|
+
Version: 0.7.5
|
|
4
4
|
Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
|
|
5
5
|
Author-email: Josh Walker <joshowalker@yahoo.com>
|
|
6
6
|
License: BSD-3-Clause
|
|
@@ -171,7 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
|
|
|
171
171
|
```
|
|
172
172
|
|
|
173
173
|
**Bundled profiles:**
|
|
174
|
-
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus)
|
|
174
|
+
- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
|
|
175
175
|
- `herbarium` — High-recall for degraded DNA/type specimens
|
|
176
176
|
- `largedata` — Experimental settings for large input files
|
|
177
177
|
- `nostalgia` — Simulate older bioinformatics pipelines
|
|
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
|
|
|
295
295
|
|---------------|-------------|------------|-------------|
|
|
296
296
|
| **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
|
|
297
297
|
| **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
|
|
298
|
-
| **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from
|
|
298
|
+
| **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
|
|
299
299
|
|
|
300
300
|
### Example Directory Structure
|
|
301
301
|
```
|
|
302
302
|
__Summary__/
|
|
303
303
|
├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
|
|
304
304
|
├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
|
|
305
|
-
├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (
|
|
305
|
+
├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
|
|
306
306
|
├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
|
|
307
307
|
├── summary.fasta # All final consensus sequences (excludes .raw)
|
|
308
308
|
├── summary.txt # Statistics
|
|
@@ -678,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
|
|
|
678
678
|
- Output up to select_max_variants per group
|
|
679
679
|
3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
|
|
680
680
|
|
|
681
|
+
**Selection Size Ratio Filtering:**
|
|
682
|
+
```bash
|
|
683
|
+
speconsense-summarize --select-min-size-ratio 0.2
|
|
684
|
+
```
|
|
685
|
+
- Filters out post-merge variants whose size is too small relative to the largest variant in their group
|
|
686
|
+
- Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
|
|
687
|
+
- Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
|
|
688
|
+
- Default is 0 (disabled) — all post-merge variants pass through to selection
|
|
689
|
+
- Applied after merging but before variant selection
|
|
690
|
+
- Useful for suppressing noise variants that survived merging but are too small to be meaningful
|
|
691
|
+
- Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
|
|
692
|
+
|
|
681
693
|
This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
|
|
682
694
|
|
|
683
695
|
### Customizing FASTA Header Fields
|
|
@@ -817,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
|
|
|
817
829
|
```bash
|
|
818
830
|
speconsense-summarize --enable-full-consensus
|
|
819
831
|
```
|
|
820
|
-
- Generates a full IUPAC consensus sequence per variant group from
|
|
832
|
+
- Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
|
|
821
833
|
- Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
|
|
822
834
|
- Uses majority voting across all variants in the group; **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
|
|
823
835
|
- Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
|
|
@@ -1059,9 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
|
|
|
1059
1071
|
2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
|
|
1060
1072
|
3. **Group filtering** to limit output groups (`--select-max-groups`)
|
|
1061
1073
|
4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
|
|
1062
|
-
5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1063
|
-
6. **
|
|
1064
|
-
7. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1074
|
+
5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
|
|
1075
|
+
6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
|
|
1076
|
+
7. **Full consensus generation** (optional) — IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
|
|
1077
|
+
8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
|
|
1065
1078
|
|
|
1066
1079
|
**Key architectural features**:
|
|
1067
1080
|
- HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
|
|
@@ -1273,6 +1286,7 @@ usage: speconsense-summarize [-h] [--source SOURCE]
|
|
|
1273
1286
|
[--select-max-groups SELECT_MAX_GROUPS]
|
|
1274
1287
|
[--select-max-variants SELECT_MAX_VARIANTS]
|
|
1275
1288
|
[--select-strategy {size,diversity}]
|
|
1289
|
+
[--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
|
|
1276
1290
|
[--enable-full-consensus]
|
|
1277
1291
|
[--disable-full-consensus]
|
|
1278
1292
|
[--scale-threshold SCALE_THRESHOLD] [--threads N]
|
|
@@ -1357,6 +1371,10 @@ Selection:
|
|
|
1357
1371
|
--select-strategy {size,diversity}, --variant-selection {size,diversity}
|
|
1358
1372
|
Variant selection strategy: size or diversity
|
|
1359
1373
|
(default: size)
|
|
1374
|
+
--select-min-size-ratio SELECT_MIN_SIZE_RATIO
|
|
1375
|
+
Minimum size ratio (variant/largest) to include in
|
|
1376
|
+
output (default: 0 = disabled, e.g. 0.2 for 20%
|
|
1377
|
+
cutoff)
|
|
1360
1378
|
--enable-full-consensus
|
|
1361
1379
|
Generate a full consensus per variant group
|
|
1362
1380
|
representing all variation from pre-merge variants
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
speconsense/__init__.py,sha256=
|
|
1
|
+
speconsense/__init__.py,sha256=qxithGom21C3MjbApvgOzVSIRFqw4jReMKZqipfY-Kk,537
|
|
2
2
|
speconsense/cli.py,sha256=Kqb2da0IuazocAz72iqTnw71jI7UaQgxsHfb9CwiolU,85
|
|
3
3
|
speconsense/msa.py,sha256=t1uDb-Tj5tDnB17QnNZPslpAiLXgAMIlnmMKBbwBKzs,31661
|
|
4
4
|
speconsense/quality_report.py,sha256=Byrc115T03ybi7mpA0Bw8-gc83nhKPzDY0tyH1IIAMQ,19803
|
|
@@ -9,9 +9,9 @@ speconsense/core/__main__.py,sha256=dCfyQkVxxwlP6QqcWw9y5zp5iLzkG-fQsLmFHHEUlbI,
|
|
|
9
9
|
speconsense/core/cli.py,sha256=iepQMK0ZUhZvQShVZY_6WaHneR8ZIRKZ_b6NvVwaRwU,17186
|
|
10
10
|
speconsense/core/clusterer.py,sha256=UFK5Ec0oMQ7l3GsFJOAhTFk7r90eOOdOBXRskm79Fwk,72093
|
|
11
11
|
speconsense/core/workers.py,sha256=6pUyt-W9KxkillJ6TU1RjRh-_L-zRIwWqzIcBSeiOSc,25811
|
|
12
|
-
speconsense/profiles/__init__.py,sha256=
|
|
13
|
-
speconsense/profiles/compressed.yaml,sha256=
|
|
14
|
-
speconsense/profiles/example.yaml,sha256=
|
|
12
|
+
speconsense/profiles/__init__.py,sha256=5UWj6VyUIXTzQ1kBZ4mJ2olZ_ADMK85rwr7KEmRfZfk,16382
|
|
13
|
+
speconsense/profiles/compressed.yaml,sha256=LKtBm6nj8cpF2xeFcA7vzzNzaXdEo0JknnmcDDmdFj8,1227
|
|
14
|
+
speconsense/profiles/example.yaml,sha256=UGHoVvFiB6iQ-lUU4rwInL6oE1eAd7Fo5qp14vfXJvA,4546
|
|
15
15
|
speconsense/profiles/herbarium.yaml,sha256=1OyAPvBZmJ0eWHejfTU_NLd1_08F9n5WbeE686mzYGE,1125
|
|
16
16
|
speconsense/profiles/largedata.yaml,sha256=7qwl5CHA7BiFcznycUoprOX_A-qrsZzV5fBLnA3QmcE,884
|
|
17
17
|
speconsense/profiles/nostalgia.yaml,sha256=Hy20M88FiCmDvscyIKbwfNSusiHptmBm4pIWPiSFmp0,661
|
|
@@ -23,15 +23,15 @@ speconsense/scalability/vsearch.py,sha256=I1IzTeRzEFn9bi8mNbBRvtcHvUBzBFdE7D5yf-
|
|
|
23
23
|
speconsense/summarize/__init__.py,sha256=PE6W9hytDxhkw7W6Fz8X3jd92N2VdhuxiQ72Nqm1xC0,3181
|
|
24
24
|
speconsense/summarize/__main__.py,sha256=_hzLNqNtv4PirL1oMic37GW2QmjWquoznzNtld_3FiQ,117
|
|
25
25
|
speconsense/summarize/analysis.py,sha256=1MXtKMpX1bgKEtI-JN6BwTQj99qyt1eQLqNg51EgPiE,31560
|
|
26
|
-
speconsense/summarize/cli.py,sha256=
|
|
26
|
+
speconsense/summarize/cli.py,sha256=JDgLQl8zFlbkH-Oa_n68lbUTB-cLBWzl3dphn041ph0,26961
|
|
27
27
|
speconsense/summarize/clustering.py,sha256=kk-FdFCea8KRocowN_4dt_aoqZNVJMmEu7CVKPfYgK8,28346
|
|
28
28
|
speconsense/summarize/fields.py,sha256=a6aK9hkPJ-sDRRSqM_7IkyqCki99KSMnsQMV-U7r2zY,8687
|
|
29
29
|
speconsense/summarize/io.py,sha256=FdHLbcj0NOL3WE1e5OL85DRdJaHpyXPMcmlNg9mG3tM,32732
|
|
30
30
|
speconsense/summarize/iupac.py,sha256=Y6KqELmnGy4Eya4C_4ldXY8uek0ReuSUgITLI3NW0-A,11042
|
|
31
31
|
speconsense/summarize/merging.py,sha256=FakBey3qpu7ULPIsc2GDo9WG8jNU1L6q2pgQ2HrOKXk,28454
|
|
32
|
-
speconsense-0.7.
|
|
33
|
-
speconsense-0.7.
|
|
34
|
-
speconsense-0.7.
|
|
35
|
-
speconsense-0.7.
|
|
36
|
-
speconsense-0.7.
|
|
37
|
-
speconsense-0.7.
|
|
32
|
+
speconsense-0.7.5.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
|
|
33
|
+
speconsense-0.7.5.dist-info/METADATA,sha256=JiEg26k5JEJUFSR5hw4FHHysL4ei9PPyvVXsGBRBZVc,80041
|
|
34
|
+
speconsense-0.7.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
35
|
+
speconsense-0.7.5.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
|
|
36
|
+
speconsense-0.7.5.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
|
|
37
|
+
speconsense-0.7.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|