speconsense 0.7.4__tar.gz → 0.7.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {speconsense-0.7.4/speconsense.egg-info → speconsense-0.7.5}/PKG-INFO +5 -5
  2. {speconsense-0.7.4 → speconsense-0.7.5}/README.md +4 -4
  3. {speconsense-0.7.4 → speconsense-0.7.5}/pyproject.toml +1 -1
  4. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/__init__.py +1 -1
  5. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/cli.py +12 -12
  6. {speconsense-0.7.4 → speconsense-0.7.5/speconsense.egg-info}/PKG-INFO +5 -5
  7. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_summarize.py +8 -8
  8. {speconsense-0.7.4 → speconsense-0.7.5}/LICENSE +0 -0
  9. {speconsense-0.7.4 → speconsense-0.7.5}/setup.cfg +0 -0
  10. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/cli.py +0 -0
  11. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/core/__init__.py +0 -0
  12. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/core/__main__.py +0 -0
  13. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/core/cli.py +0 -0
  14. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/core/clusterer.py +0 -0
  15. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/core/workers.py +0 -0
  16. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/msa.py +0 -0
  17. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/__init__.py +0 -0
  18. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/compressed.yaml +0 -0
  19. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/example.yaml +0 -0
  20. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/herbarium.yaml +0 -0
  21. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/largedata.yaml +0 -0
  22. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/nostalgia.yaml +0 -0
  23. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/profiles/strict.yaml +0 -0
  24. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/quality_report.py +0 -0
  25. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/scalability/__init__.py +0 -0
  26. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/scalability/base.py +0 -0
  27. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/scalability/config.py +0 -0
  28. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/scalability/vsearch.py +0 -0
  29. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/__init__.py +0 -0
  30. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/__main__.py +0 -0
  31. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/analysis.py +0 -0
  32. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/clustering.py +0 -0
  33. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/fields.py +0 -0
  34. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/io.py +0 -0
  35. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/iupac.py +0 -0
  36. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/summarize/merging.py +0 -0
  37. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/synth.py +0 -0
  38. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense/types.py +0 -0
  39. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense.egg-info/SOURCES.txt +0 -0
  40. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense.egg-info/dependency_links.txt +0 -0
  41. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense.egg-info/entry_points.txt +0 -0
  42. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense.egg-info/requires.txt +0 -0
  43. {speconsense-0.7.4 → speconsense-0.7.5}/speconsense.egg-info/top_level.txt +0 -0
  44. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_ambiguity_calling.py +0 -0
  45. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_augment_input.py +0 -0
  46. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_complement_flags.py +0 -0
  47. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_fields.py +0 -0
  48. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_haplotype_filtering.py +0 -0
  49. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_orientation.py +0 -0
  50. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_overlap_merge.py +0 -0
  51. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_overlap_merge_integration.py +0 -0
  52. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_profiles.py +0 -0
  53. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_regression.py +0 -0
  54. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_synth.py +0 -0
  55. {speconsense-0.7.4 → speconsense-0.7.5}/tests/test_variant_phasing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.4
3
+ Version: 0.7.5
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
295
295
  |---------------|-------------|------------|-------------|
296
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
297
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
299
299
 
300
300
  ### Example Directory Structure
301
301
  ```
302
302
  __Summary__/
303
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
304
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
306
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
307
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
308
308
  ├── summary.txt # Statistics
@@ -829,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
829
829
  ```bash
830
830
  speconsense-summarize --enable-full-consensus
831
831
  ```
832
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
832
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
833
833
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
834
834
  - Uses majority voting across the contributing pre-merge variants (those that fed surviving post-merge variants in the group); **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
835
835
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1073,7 +1073,7 @@ The complete speconsense-summarize workflow operates in this order:
1073
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1074
1074
  5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
1075
  6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
- 7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1076
+ 7. **Full consensus generation** (optional) — IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1077
1077
  8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1078
1078
 
1079
1079
  **Key architectural features**:
@@ -260,14 +260,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
260
260
  |---------------|-------------|------------|-------------|
261
261
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
262
262
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
263
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
263
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
264
264
 
265
265
  ### Example Directory Structure
266
266
  ```
267
267
  __Summary__/
268
268
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
269
269
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
270
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
270
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
271
271
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
272
272
  ├── summary.fasta # All final consensus sequences (excludes .raw)
273
273
  ├── summary.txt # Statistics
@@ -794,7 +794,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
794
794
  ```bash
795
795
  speconsense-summarize --enable-full-consensus
796
796
  ```
797
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
797
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
798
798
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
799
799
  - Uses majority voting across the contributing pre-merge variants (those that fed surviving post-merge variants in the group); **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
800
800
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1038,7 +1038,7 @@ The complete speconsense-summarize workflow operates in this order:
1038
1038
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1039
1039
  5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1040
1040
  6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1041
- 7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1041
+ 7. **Full consensus generation** (optional) — IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1042
1042
  8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1043
1043
 
1044
1044
  **Key architectural features**:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "speconsense"
7
- version = "0.7.4"
7
+ version = "0.7.5"
8
8
  description = "High-quality clustering and consensus generation for Oxford Nanopore amplicon reads"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.4"
8
+ __version__ = "0.7.5"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
@@ -392,19 +392,19 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
392
392
  final_consensus.append(renamed_variant)
393
393
  group_naming.append((variant.sample_name, new_name))
394
394
 
395
- # Generate full consensus from PRE-MERGE variants
395
+ # Generate full consensus from PRE-MERGE variants that contributed
396
+ # to surviving post-merge variants (after select-min-size-ratio)
396
397
  if getattr(args, 'enable_full_consensus', False):
397
- pre_merge_variants = variant_groups[group_id]
398
-
399
- # Apply size-ratio filter (same as merge pipeline)
400
- if args.merge_min_size_ratio > 0 and len(pre_merge_variants) > 1:
401
- largest_size = max(v.size for v in pre_merge_variants)
402
- filtered = [v for v in pre_merge_variants
403
- if (v.size / largest_size) >= args.merge_min_size_ratio]
404
- if len(filtered) < len(pre_merge_variants):
405
- filtered_count = len(pre_merge_variants) - len(filtered)
406
- logging.debug(f"Full consensus: filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
407
- pre_merge_variants = filtered
398
+ # Collect original cluster names from surviving post-merge variants
399
+ surviving_originals = set()
400
+ for v in group_members:
401
+ if v.sample_name in all_merge_traceability:
402
+ surviving_originals.update(all_merge_traceability[v.sample_name])
403
+ else:
404
+ surviving_originals.add(v.sample_name)
405
+
406
+ pre_merge_variants = [v for v in variant_groups[group_id]
407
+ if v.sample_name in surviving_originals]
408
408
 
409
409
  specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
410
410
  full_name = f"{specimen_base}-{group_idx + 1}.full"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.4
3
+ Version: 0.7.5
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -295,14 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
295
295
  |---------------|-------------|------------|-------------|
296
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
297
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
- | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from pre-merge components of surviving variants |
299
299
 
300
300
  ### Example Directory Structure
301
301
  ```
302
302
  __Summary__/
303
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
304
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
- ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (surviving variants' components)
306
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
307
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
308
308
  ├── summary.txt # Statistics
@@ -829,7 +829,7 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
829
829
  ```bash
830
830
  speconsense-summarize --enable-full-consensus
831
831
  ```
832
- - Generates a full IUPAC consensus sequence per variant group from all pre-merge variants
832
+ - Generates a full IUPAC consensus sequence per variant group from pre-merge variants that contributed to surviving post-merge variants
833
833
  - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
834
834
  - Uses majority voting across the contributing pre-merge variants (those that fed surviving post-merge variants in the group); **gaps never win** — at each alignment column, the most common non-gap base is chosen, with IUPAC codes for ties among bases
835
835
  - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
@@ -1073,7 +1073,7 @@ The complete speconsense-summarize workflow operates in this order:
1073
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1074
1074
  5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
1075
  6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
- 7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1076
+ 7. **Full consensus generation** (optional) — IUPAC consensus from pre-merge components of surviving post-merge variants (`--enable-full-consensus`)
1077
1077
  8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1078
1078
 
1079
1079
  **Key architectural features**:
@@ -521,7 +521,7 @@ class TestFullConsensus:
521
521
 
522
522
 
523
523
  def test_full_consensus_filters_small_variants(self):
524
- """Integration test: merge_min_size_ratio filters small variants from full consensus."""
524
+ """Integration test: select_min_size_ratio filters small variants from full consensus."""
525
525
  temp_dir = tempfile.mkdtemp()
526
526
  source_dir = os.path.join(temp_dir, "clusters")
527
527
  summary_dir = os.path.join(temp_dir, "__Summary__")
@@ -529,7 +529,7 @@ class TestFullConsensus:
529
529
 
530
530
  try:
531
531
  # Two similar sequences (1 SNP at position 12: G vs A)
532
- # Very different sizes so the small one is filtered by merge_min_size_ratio
532
+ # Very different sizes so the small one is filtered by select_min_size_ratio
533
533
  seq_large = "ATCGATCGATCGATCGATCGATCG" # G at position 12
534
534
  seq_small = "ATCGATCGATCAATCGATCGATCG" # A at position 12
535
535
 
@@ -542,7 +542,8 @@ class TestFullConsensus:
542
542
  with open(fasta_file, 'w') as f:
543
543
  f.write(fasta_content)
544
544
 
545
- # merge-min-size-ratio 0.1 filters 5/100=0.05 from full consensus
545
+ # select-min-size-ratio 0.1 filters 5/100=0.05 post-merge variant,
546
+ # so its pre-merge components are excluded from .full consensus
546
547
  result = subprocess.run(
547
548
  [
548
549
  "speconsense-summarize",
@@ -550,7 +551,7 @@ class TestFullConsensus:
550
551
  "--summary-dir", summary_dir,
551
552
  "--min-ric", "3",
552
553
  "--enable-full-consensus",
553
- "--merge-min-size-ratio", "0.1",
554
+ "--select-min-size-ratio", "0.1",
554
555
  "--disable-merging",
555
556
  "--min-merge-overlap", "0",
556
557
  ],
@@ -574,8 +575,8 @@ class TestFullConsensus:
574
575
  finally:
575
576
  shutil.rmtree(temp_dir)
576
577
 
577
- def test_full_consensus_no_filter_when_disabled(self):
578
- """Integration test: merge_min_size_ratio=0 preserves all variants in full consensus."""
578
+ def test_full_consensus_no_filter_when_all_survive(self):
579
+ """Integration test: all post-merge variants surviving means all contribute to .full."""
579
580
  temp_dir = tempfile.mkdtemp()
580
581
  source_dir = os.path.join(temp_dir, "clusters")
581
582
  summary_dir = os.path.join(temp_dir, "__Summary__")
@@ -595,7 +596,7 @@ class TestFullConsensus:
595
596
  with open(fasta_file, 'w') as f:
596
597
  f.write(fasta_content)
597
598
 
598
- # merge-min-size-ratio 0 disables filtering, so both contribute to .full
599
+ # No select-min-size-ratio, so both variants survive and both contribute to .full
599
600
  result = subprocess.run(
600
601
  [
601
602
  "speconsense-summarize",
@@ -603,7 +604,6 @@ class TestFullConsensus:
603
604
  "--summary-dir", summary_dir,
604
605
  "--min-ric", "3",
605
606
  "--enable-full-consensus",
606
- "--merge-min-size-ratio", "0",
607
607
  "--disable-merging",
608
608
  "--min-merge-overlap", "0",
609
609
  ],
File without changes
File without changes