PyPI - speconsense - Versions diffs - 0.7.2__py3-none-any.whl → 0.7.4__py3-none-any.whl - Mend

speconsense-0.7.2-py3-none-any.whl → speconsense-0.7.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

speconsense/__init__.py +1 -1
speconsense/core/cli.py +18 -0
speconsense/profiles/__init__.py +2 -0
speconsense/profiles/compressed.yaml +28 -0
speconsense/profiles/example.yaml +1 -0
speconsense/summarize/cli.py +60 -3
speconsense/summarize/fields.py +5 -3
speconsense/summarize/io.py +10 -1
speconsense/summarize/merging.py +97 -76
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/METADATA +79 -12
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/RECORD +15 -14
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/WHEEL +1 -1
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/entry_points.txt +0 -0
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/licenses/LICENSE +0 -0
{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/top_level.txt +0 -0

speconsense/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
 in the fungal DNA barcoding pipeline.
 """
-__version__ = "0.7.2"
+__version__ = "0.7.4"
 __author__ = "Josh Walker"
 __email__ = "joshowalker@yahoo.com"

speconsense/core/cli.py CHANGED Viewed

@@ -66,6 +66,9 @@ def main():
                                help="Disable position-based variant phasing (enabled by default). "
                                     "MCL graph clustering already separates most variants; this "
                                     "second pass analyzes MSA positions to phase remaining variants.")
+    phasing_group.add_argument("--enable-position-phasing", action="store_false",
+                               dest="disable_position_phasing",
+                               help="Override --disable-position-phasing or profile setting")
     phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
                                help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
     phasing_group.add_argument("--min-variant-count", type=int, default=5,
@@ -75,6 +78,9 @@ def main():
     ambiguity_group = parser.add_argument_group("Ambiguity Calling")
     ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
                                  help="Disable IUPAC ambiguity code calling for unphased variant positions")
+    ambiguity_group.add_argument("--enable-ambiguity-calling", action="store_false",
+                                 dest="disable_ambiguity_calling",
+                                 help="Override --disable-ambiguity-calling or profile setting")
     ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
                                  help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
     ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
@@ -84,8 +90,14 @@ def main():
     merging_group = parser.add_argument_group("Cluster Merging")
     merging_group.add_argument("--disable-cluster-merging", action="store_true",
                                help="Disable merging of clusters with identical consensus sequences")
+    merging_group.add_argument("--enable-cluster-merging", action="store_false",
+                               dest="disable_cluster_merging",
+                               help="Override --disable-cluster-merging or profile setting")
     merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
                                help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")
+    merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
+                               dest="disable_homopolymer_equivalence",
+                               help="Override --disable-homopolymer-equivalence or profile setting")
     # Orientation group
     orient_group = parser.add_argument_group("Orientation")
@@ -104,11 +116,17 @@ def main():
                                  "0=auto-detect, default=1 (safe for parallel workflows).")
     perf_group.add_argument("--enable-early-filter", action="store_true",
                             help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")
+    perf_group.add_argument("--disable-early-filter", action="store_false",
+                            dest="enable_early_filter",
+                            help="Override --enable-early-filter or profile setting")
     # Debugging group
     debug_group = parser.add_argument_group("Debugging")
     debug_group.add_argument("--collect-discards", action="store_true",
                              help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
+    debug_group.add_argument("--no-collect-discards", action="store_false",
+                             dest="collect_discards",
+                             help="Override --collect-discards or profile setting")
     debug_group.add_argument("--log-level", default="INFO",
                              choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])

speconsense/profiles/__init__.py CHANGED Viewed

@@ -103,6 +103,8 @@ VALID_SUMMARIZE_KEYS = {
     "select-max-groups",
     "select-max-variants",
     "select-strategy",
+    "select-min-size-ratio",
+    "enable-full-consensus",
     # Processing
     "scale-threshold",
     "threads",

speconsense/profiles/compressed.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+# Compress variants into minimal IUPAC consensus sequences
+#
+# Aggressively merges similar variants (including indels) into single
+# IUPAC consensus sequences. Only truly dissimilar sequences remain
+# separate. Uses 20% frequency thresholds throughout.
+#
+# Designed for workflows where reviewers want fewer sequences to
+# examine, with all variation represented via IUPAC ambiguity codes.
+# Partial overlap merging is disabled as a safety measure.
+#
+# Use with:
+#   speconsense input.fastq -p compressed
+#   speconsense-summarize -p compressed
+speconsense-version: "0.7.*"
+description: "Compress variants into minimal IUPAC consensus sequences"
+speconsense:
+  min-ambiguity-frequency: 0.20  # 20% threshold for IUPAC ambiguity calling
+  min-variant-frequency: 0.20   # 20% threshold for variant phasing
+speconsense-summarize:
+  merge-indel-length: 5         # Merge indels up to 5bp
+  merge-position-count: 10      # Allow up to 10 variant positions in a merge
+  merge-min-size-ratio: 0.2     # Match 20% calling threshold
+  select-min-size-ratio: 0.2    # Match 20% calling threshold
+  min-merge-overlap: 0          # Disable partial overlap merging
+  enable-full-consensus: true   # Include full IUPAC consensus per group

speconsense/profiles/example.yaml CHANGED Viewed

@@ -91,6 +91,7 @@ speconsense-summarize:
   # select-max-groups: -1         # Max groups to output (-1 = no limit)
   # select-max-variants: -1       # Max variants per group (-1 = no limit)
   # select-strategy: size         # Selection strategy: size or diversity
+  # select-min-size-ratio: 0    # Min size ratio to include variant (0 = disabled)
   # --- Processing ---
   # threads: 0                    # Max threads (0 = auto-detect)

speconsense/summarize/cli.py CHANGED Viewed

@@ -54,8 +54,8 @@ from .io import (
     write_output_files,
 )
 from .clustering import perform_hac_clustering, select_variants
-from .merging import merge_group_with_msa
-from .analysis import MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
+from .merging import merge_group_with_msa, create_full_consensus_from_msa
+from .analysis import run_spoa_msa, MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
 # Merge effort configuration
@@ -132,6 +132,8 @@ def parse_arguments():
     merging_group = parser.add_argument_group("Merging")
     merging_group.add_argument("--disable-merging", action="store_true",
                                help="Disable all variant merging (skip MSA-based merge evaluation entirely)")
+    merging_group.add_argument("--enable-merging", action="store_false", dest="disable_merging",
+                               help="Override --disable-merging or profile setting")
     merging_group.add_argument("--merge-snp", action=argparse.BooleanOptionalAction, default=True,
                                help="Enable SNP-based merging (default: True, use --no-merge-snp to disable)")
     merging_group.add_argument("--merge-indel-length", type=int, default=0,
@@ -144,6 +146,9 @@ def parse_arguments():
                                help="Minimum overlap in bp for merging sequences of different lengths (default: 200, 0 to disable)")
     merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
                                help="Disable homopolymer equivalence in merging (treat AAA vs AAAA as different)")
+    merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
+                               dest="disable_homopolymer_equivalence",
+                               help="Override --disable-homopolymer-equivalence or profile setting")
     merging_group.add_argument("--merge-effort", type=str, default="balanced", metavar="LEVEL",
                                help="Merging effort level: fast (8), balanced (10), thorough (12), "
                                     "or numeric 6-14. Higher values allow larger batch sizes for "
@@ -164,6 +169,15 @@ def parse_arguments():
     selection_group.add_argument("--select-strategy", "--variant-selection",
                                  dest="select_strategy", choices=["size", "diversity"], default="size",
                                  help="Variant selection strategy: size or diversity (default: size)")
+    selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
+                                 help="Minimum size ratio (variant/largest) to include in output "
+                                      "(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
+    selection_group.add_argument("--enable-full-consensus", action="store_true",
+                                 help="Generate a full consensus per variant group representing all variation "
+                                      "from pre-merge variants (gaps never win)")
+    selection_group.add_argument("--disable-full-consensus", action="store_false",
+                                 dest="enable_full_consensus",
+                                 help="Override --enable-full-consensus or profile setting")
     # Performance group
     perf_group = parser.add_argument_group("Performance")
@@ -345,9 +359,21 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
                           key=lambda x: max(m.size for m in x[1]),
                           reverse=True)
-    for group_idx, (_, group_members) in enumerate(sorted_groups):
+    for group_idx, (group_id, group_members) in enumerate(sorted_groups):
         final_group_name = group_idx + 1
+        # Apply select-min-size-ratio filter
+        if args.select_min_size_ratio > 0 and len(group_members) > 1:
+            largest_size = max(v.size for v in group_members)
+            filtered = [v for v in group_members
+                        if (v.size / largest_size) >= args.select_min_size_ratio]
+            if len(filtered) < len(group_members):
+                filtered_count = len(group_members) - len(filtered)
+                logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
+                              f"variants with size ratio < {args.select_min_size_ratio} "
+                              f"relative to largest (size={largest_size})")
+                group_members = filtered
         # Select variants for this group
         selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
@@ -366,6 +392,35 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
             final_consensus.append(renamed_variant)
             group_naming.append((variant.sample_name, new_name))
+        # Generate full consensus from PRE-MERGE variants
+        if getattr(args, 'enable_full_consensus', False):
+            pre_merge_variants = variant_groups[group_id]
+            # Apply size-ratio filter (same as merge pipeline)
+            if args.merge_min_size_ratio > 0 and len(pre_merge_variants) > 1:
+                largest_size = max(v.size for v in pre_merge_variants)
+                filtered = [v for v in pre_merge_variants
+                            if (v.size / largest_size) >= args.merge_min_size_ratio]
+                if len(filtered) < len(pre_merge_variants):
+                    filtered_count = len(pre_merge_variants) - len(filtered)
+                    logging.debug(f"Full consensus: filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
+                    pre_merge_variants = filtered
+            specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
+            full_name = f"{specimen_base}-{group_idx + 1}.full"
+            if len(pre_merge_variants) == 1:
+                # Single variant — copy directly
+                full_consensus = pre_merge_variants[0]._replace(sample_name=full_name)
+            else:
+                # MSA on pre-merge variants, full consensus logic
+                sequences = [v.sequence for v in pre_merge_variants]
+                aligned_seqs = run_spoa_msa(sequences, alignment_mode=1)
+                full_consensus = create_full_consensus_from_msa(aligned_seqs, pre_merge_variants)
+                full_consensus = full_consensus._replace(sample_name=full_name)
+            final_consensus.append(full_consensus)
         naming_info[group_idx + 1] = group_naming
     logging.info(f"Processed {file_name}: {len(final_consensus)} final variants across {len(merged_groups)} groups")
@@ -421,6 +476,8 @@ def main():
     logging.info(f"  --select-max-variants: {args.select_max_variants}")
     logging.info(f"  --select-max-groups: {args.select_max_groups}")
     logging.info(f"  --select-strategy: {args.select_strategy}")
+    logging.info(f"  --select-min-size-ratio: {args.select_min_size_ratio}")
+    logging.info(f"  --enable-full-consensus: {args.enable_full_consensus}")
     logging.info(f"  --log-level: {args.log_level}")
     logging.info("")
     logging.info("Processing each specimen file independently to organize variants within specimens")

speconsense/summarize/fields.py CHANGED Viewed

@@ -124,8 +124,8 @@ class GroupField(FastaField):
         super().__init__('group', 'Variant group number')
     def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
-        # Extract from sample_name (e.g., "...-1.v1" or "...-2.v1.raw1")
-        match = re.search(r'-(\d+)\.v\d+(?:\.raw\d+)?$', consensus.sample_name)
+        # Extract from sample_name (e.g., "...-1.v1", "...-2.v1.raw1", or "...-1.full")
+        match = re.search(r'-(\d+)(?:\.v\d+(?:\.raw\d+)?|\.full)$', consensus.sample_name)
         if match:
             return f"group={match.group(1)}"
         return None
@@ -136,8 +136,10 @@ class VariantField(FastaField):
         super().__init__('variant', 'Variant identifier within group')
     def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
-        # Extract from sample_name (e.g., "...-1.v1" -> "v1" or "...-1.v1.raw1" -> "v1")
+        # Extract from sample_name (e.g., "...-1.v1" -> "v1", "...-1.v1.raw1" -> "v1", "...-1.full" -> "full")
         match = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
+        if not match:
+            match = re.search(r'\.(full)$', consensus.sample_name)
         if match:
             return f"variant={match.group(1)}"
         return None

speconsense/summarize/io.py CHANGED Viewed

@@ -358,6 +358,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
     # Generate .raw file consensuses for merged variants
     raw_file_consensuses = []
     for consensus in specimen_consensus:
+        # Skip .raw generation for .full consensus (synthetic/derived)
+        if consensus.sample_name.endswith('.full'):
+            continue
         # Only create .raw files if this consensus was actually merged
         if consensus.raw_ric and len(consensus.raw_ric) > 1:
             # Find the original cluster name from naming_info
@@ -412,6 +415,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
     # Write FASTQ files for each final consensus containing all contributing reads
     for consensus in specimen_consensus:
+        # Skip FASTQ for .full consensus (synthetic/derived, no traceable cluster reads)
+        if consensus.sample_name.endswith('.full'):
+            continue
         write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)
     # Write .raw files (individual FASTA and FASTQ for pre-merge variants)
@@ -704,7 +710,10 @@ def write_output_files(final_consensus: List[ConsensusInfo],
             multiple_id = specimen_counters[base_name]
             writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
             unique_samples.add(base_name)
-            total_ric += consensus.ric
+            # Exclude .full from total RiC to avoid double-counting
+            # (.full aggregates reads already counted in merged variants)
+            if not consensus.sample_name.endswith('.full'):
+                total_ric += consensus.ric
         writer.writerow([])
         writer.writerow(['Total Unique Samples', len(unique_samples)])

speconsense/summarize/merging.py CHANGED Viewed

@@ -106,6 +106,63 @@ def is_compatible_subset(variant_stats: dict, args, prior_positions: dict = None
     return True
+def _build_merged_consensus_info(
+    consensus_seq: list, snp_count: int, variants: List[ConsensusInfo]
+) -> ConsensusInfo:
+    """Assemble a ConsensusInfo from column-voting results and source variants.
+    Handles joining the consensus sequence, aggregating size/ric totals,
+    flattening raw_ric/raw_len merge history, and selecting metadata
+    from the largest variant.
+    Args:
+        consensus_seq: List of consensus characters from column voting
+        snp_count: Number of ambiguous (multi-base) positions
+        variants: Source ConsensusInfo objects that were merged
+    Returns:
+        ConsensusInfo with merged metadata
+    """
+    consensus_sequence = ''.join(consensus_seq)
+    total_size = sum(v.size for v in variants)
+    total_ric = sum(v.ric for v in variants)
+    # Collect RiC values, preserving any prior merge history
+    raw_ric_values = []
+    for v in variants:
+        if v.raw_ric:
+            raw_ric_values.extend(v.raw_ric)
+        else:
+            raw_ric_values.append(v.ric)
+    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
+    # Collect lengths, preserving any prior merge history
+    raw_len_values = []
+    for v in variants:
+        if v.raw_len:
+            raw_len_values.extend(v.raw_len)
+        else:
+            raw_len_values.append(len(v.sequence))
+    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
+    largest_variant = max(variants, key=lambda v: v.size)
+    return ConsensusInfo(
+        sample_name=largest_variant.sample_name,
+        cluster_id=largest_variant.cluster_id,
+        sequence=consensus_sequence,
+        ric=total_ric,
+        size=total_size,
+        file_path=largest_variant.file_path,
+        snp_count=snp_count if snp_count > 0 else None,
+        primers=largest_variant.primers,
+        raw_ric=raw_ric_values,
+        raw_len=raw_len_values,
+        rid=largest_variant.rid,
+        rid_min=largest_variant.rid_min,
+    )
 def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
     """
     Generate consensus from MSA using size-weighted majority voting.
@@ -160,46 +217,7 @@ def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo])
                 snp_count += 1
         # else: majority wants gap, omit position
-    # Create merged ConsensusInfo
-    consensus_sequence = ''.join(consensus_seq)
-    total_size = sum(v.size for v in variants)
-    total_ric = sum(v.ric for v in variants)
-    # Collect RiC values, preserving any prior merge history
-    raw_ric_values = []
-    for v in variants:
-        if v.raw_ric:
-            raw_ric_values.extend(v.raw_ric)  # Flatten prior merge history
-        else:
-            raw_ric_values.append(v.ric)
-    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
-    # Collect lengths, preserving any prior merge history
-    raw_len_values = []
-    for v in variants:
-        if v.raw_len:
-            raw_len_values.extend(v.raw_len)  # Flatten prior merge history
-        else:
-            raw_len_values.append(len(v.sequence))
-    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
-    # Use name from largest variant
-    largest_variant = max(variants, key=lambda v: v.size)
-    return ConsensusInfo(
-        sample_name=largest_variant.sample_name,
-        cluster_id=largest_variant.cluster_id,
-        sequence=consensus_sequence,
-        ric=total_ric,
-        size=total_size,
-        file_path=largest_variant.file_path,
-        snp_count=snp_count if snp_count > 0 else None,
-        primers=largest_variant.primers,
-        raw_ric=raw_ric_values,
-        raw_len=raw_len_values,
-        rid=largest_variant.rid,  # Preserve identity metrics from largest variant
-        rid_min=largest_variant.rid_min,
-    )
+    return _build_merged_consensus_info(consensus_seq, snp_count, variants)
 def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
@@ -295,46 +313,49 @@ def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[Consens
                     consensus_seq.append(iupac_code)
                     snp_count += 1
-    # Create merged ConsensusInfo
-    consensus_sequence = ''.join(consensus_seq)
-    total_size = sum(v.size for v in variants)
-    total_ric = sum(v.ric for v in variants)
+    return _build_merged_consensus_info(consensus_seq, snp_count, variants)
-    # Collect RiC values, preserving any prior merge history
-    raw_ric_values = []
-    for v in variants:
-        if v.raw_ric:
-            raw_ric_values.extend(v.raw_ric)  # Flatten prior merge history
-        else:
-            raw_ric_values.append(v.ric)
-    raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
-    # Collect lengths, preserving any prior merge history
-    raw_len_values = []
-    for v in variants:
-        if v.raw_len:
-            raw_len_values.extend(v.raw_len)  # Flatten prior merge history
-        else:
-            raw_len_values.append(len(v.sequence))
-    raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
+def create_full_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
+    """
+    Generate full consensus from MSA where any non-gap base means inclusion.
-    # Use name from largest variant
-    largest_variant = max(variants, key=lambda v: v.size)
+    Unlike create_consensus_from_msa where gaps can win by majority vote,
+    the full consensus includes a position if ANY variant has a base there.
+    This captures all variation from all contributing variants.
-    return ConsensusInfo(
-        sample_name=largest_variant.sample_name,
-        cluster_id=largest_variant.cluster_id,
-        sequence=consensus_sequence,
-        ric=total_ric,
-        size=total_size,
-        file_path=largest_variant.file_path,
-        snp_count=snp_count if snp_count > 0 else None,
-        primers=largest_variant.primers,
-        raw_ric=raw_ric_values,
-        raw_len=raw_len_values,
-        rid=largest_variant.rid,
-        rid_min=largest_variant.rid_min,
-    )
+    Args:
+        aligned_seqs: MSA sequences with gaps as '-'
+        variants: Original ConsensusInfo objects (for metadata)
+    Returns:
+        ConsensusInfo with full consensus sequence
+    """
+    consensus_seq = []
+    snp_count = 0
+    alignment_length = len(aligned_seqs[0].seq)
+    for col_idx in range(alignment_length):
+        column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
+        # Collect non-gap bases
+        base_votes = defaultdict(int)
+        for i, base in enumerate(column):
+            upper_base = base.upper()
+            if upper_base != '-':
+                base_votes[upper_base] += variants[i].size
+        # Include position if ANY variant has a base (gaps never win)
+        if base_votes:
+            if len(base_votes) == 1:
+                consensus_seq.append(list(base_votes.keys())[0])
+            else:
+                represented_bases = set(base_votes.keys())
+                iupac_code = merge_bases_to_iupac(represented_bases)
+                consensus_seq.append(iupac_code)
+                snp_count += 1
+    return _build_merged_consensus_info(consensus_seq, snp_count, variants)
 def merge_group_with_msa(variants: List[ConsensusInfo], args) -> Tuple[List[ConsensusInfo], Dict, int, List[OverlapMergeInfo]]:

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: speconsense
-Version: 0.7.2
+Version: 0.7.4
 Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
 Author-email: Josh Walker <joshowalker@yahoo.com>
 License: BSD-3-Clause
@@ -171,6 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
 ```
 **Bundled profiles:**
+- `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
 - `herbarium` — High-recall for degraded DNA/type specimens
 - `largedata` — Experimental settings for large input files
 - `nostalgia` — Simulate older bioinformatics pipelines
@@ -294,12 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
 |---------------|-------------|------------|-------------|
 | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
 | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
+| **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
 ### Example Directory Structure
 ```
 __Summary__/
 ├── sample-1.v1-RiC45.fasta                  # Primary variant (group 1, merged)
 ├── sample-1.v2-RiC23.fasta                  # Additional variant (not merged)
+├── sample-1.full-RiC68.fasta                # Full IUPAC consensus for group 1 (all pre-merge variants)
 ├── sample-2.v1-RiC30.fasta                  # Second organism group, primary variant
 ├── summary.fasta                            # All final consensus sequences (excludes .raw)
 ├── summary.txt                              # Statistics
@@ -675,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
    - Output up to select_max_variants per group
 3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
+**Selection Size Ratio Filtering:**
+```bash
+speconsense-summarize --select-min-size-ratio 0.2
+```
+- Filters out post-merge variants whose size is too small relative to the largest variant in their group
+- Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
+- Example: `--select-min-size-ratio 0.2` means a variant must have at least 20% as many reads as the largest variant in its group
+- Default is 0 (disabled) — all post-merge variants pass through to selection
+- Applied after merging but before variant selection
+- Useful for suppressing noise variants that survived merging but are too small to be meaningful
+- Set to 0.2 in the `compressed` profile to match that profile's 20% calling thresholds
 This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
 ### Customizing FASTA Header Fields
@@ -810,6 +825,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
 ### Additional Summarize Options
+**Full Consensus:**
+```bash
+speconsense-summarize --enable-full-consensus
+```
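+- Generates a full IUPAC consensus sequence per variant group from the group's pre-merge variants (when `--merge-min-size-ratio` is greater than 0, pre-merge variants below that size ratio relative to the largest are excluded first)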
+- Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
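+- Includes every alignment column in which at least one variant has a base; **gaps never win** — when all non-gap bases in a column agree, that base is used, and when they differ, the column is written as the IUPAC ambiguity code for the set of observed bases (no majority vote among bases)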
+- Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
+- Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
+- Enabled by default in the `compressed` profile
+- Use `--disable-full-consensus` to override when set by a profile
 **Quality Filtering:**
 ```bash
 speconsense-summarize --min-ric 5
@@ -1044,8 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
 2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
 3. **Group filtering** to limit output groups (`--select-max-groups`)
 4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
-5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
-6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
+5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
+6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
+7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
+8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
 **Key architectural features**:
 - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1098,17 +1127,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
                    [--min-cluster-ratio MIN_CLUSTER_RATIO]
                    [--max-sample-size MAX_SAMPLE_SIZE]
                    [--outlier-identity OUTLIER_IDENTITY]
-                   [--disable-position-phasing]
+                   [--disable-position-phasing] [--enable-position-phasing]
                    [--min-variant-frequency MIN_VARIANT_FREQUENCY]
                    [--min-variant-count MIN_VARIANT_COUNT]
-                   [--disable-ambiguity-calling]
+                   [--disable-ambiguity-calling] [--enable-ambiguity-calling]
                    [--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
                    [--min-ambiguity-count MIN_AMBIGUITY_COUNT]
-                   [--disable-cluster-merging]
+                   [--disable-cluster-merging] [--enable-cluster-merging]
                    [--disable-homopolymer-equivalence]
+                   [--enable-homopolymer-equivalence]
                    [--orient-mode {skip,keep-all,filter-failed}]
                    [--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
-                   [--threads N] [--enable-early-filter] [--collect-discards]
+                   [--threads N] [--enable-early-filter]
+                   [--disable-early-filter] [--collect-discards]
+                   [--no-collect-discards]
                    [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
                    [--version] [-p NAME] [--list-profiles]
                    input_file
@@ -1167,6 +1199,8 @@ Variant Phasing:
                         default). MCL graph clustering already separates most
                         variants; this second pass analyzes MSA positions to
                         phase remaining variants.
+  --enable-position-phasing
+                        Override --disable-position-phasing or profile setting
   --min-variant-frequency MIN_VARIANT_FREQUENCY
                         Minimum alternative allele frequency to call variant
                         (default: 0.10 for 10%)
@@ -1178,6 +1212,9 @@ Ambiguity Calling:
   --disable-ambiguity-calling
                         Disable IUPAC ambiguity code calling for unphased
                         variant positions
+  --enable-ambiguity-calling
+                        Override --disable-ambiguity-calling or profile
+                        setting
   --min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
                         Minimum alternative allele frequency for IUPAC
                         ambiguity calling (default: 0.10 for 10%)
@@ -1189,9 +1226,14 @@ Cluster Merging:
   --disable-cluster-merging
                         Disable merging of clusters with identical consensus
                         sequences
+  --enable-cluster-merging
+                        Override --disable-cluster-merging or profile setting
   --disable-homopolymer-equivalence
                         Disable homopolymer equivalence in cluster merging
                         (only merge identical sequences)
+  --enable-homopolymer-equivalence
+                        Override --disable-homopolymer-equivalence or profile
+                        setting
 Orientation:
   --orient-mode {skip,keep-all,filter-failed}
@@ -1213,10 +1255,14 @@ Performance:
                         Enable early filtering to skip small clusters before
                         variant phasing (improves performance for large
                         datasets)
+  --disable-early-filter
+                        Override --enable-early-filter or profile setting
 Debugging:
   --collect-discards    Write discarded reads (outliers and filtered clusters)
                         to cluster_debug/{sample}-discards.fastq
+  --no-collect-discards
+                        Override --collect-discards or profile setting
   --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
 ```
@@ -1227,15 +1273,22 @@ usage: speconsense-summarize [-h] [--source SOURCE]
                              [--summary-dir SUMMARY_DIR]
                              [--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
                              [--min-len MIN_LEN] [--max-len MAX_LEN]
-                             [--group-identity GROUP_IDENTITY] [--merge-snp]
+                             [--group-identity GROUP_IDENTITY]
+                             [--disable-merging] [--enable-merging]
+                             [--merge-snp | --no-merge-snp]
                              [--merge-indel-length MERGE_INDEL_LENGTH]
                              [--merge-position-count MERGE_POSITION_COUNT]
                              [--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
                              [--min-merge-overlap MIN_MERGE_OVERLAP]
                              [--disable-homopolymer-equivalence]
+                             [--enable-homopolymer-equivalence]
+                             [--merge-effort LEVEL]
                              [--select-max-groups SELECT_MAX_GROUPS]
                              [--select-max-variants SELECT_MAX_VARIANTS]
                              [--select-strategy {size,diversity}]
+                             [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
+                             [--enable-full-consensus]
+                             [--disable-full-consensus]
                              [--scale-threshold SCALE_THRESHOLD] [--threads N]
                              [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
                              [--version] [-p NAME] [--list-profiles]
@@ -1281,10 +1334,7 @@ Grouping:
 Merging:
   --disable-merging     Disable all variant merging (skip MSA-based merge
                         evaluation entirely)
-  --merge-effort LEVEL  Merging effort level: fast (8), balanced (10),
-                        thorough (12), or numeric 6-14. Higher values allow
-                        larger batch sizes for exhaustive subset search.
-                        Default: balanced
+  --enable-merging      Override --disable-merging or profile setting
   --merge-snp, --no-merge-snp
                         Enable SNP-based merging (default: True, use --no-
                         merge-snp to disable)
@@ -1303,6 +1353,13 @@ Merging:
   --disable-homopolymer-equivalence
                         Disable homopolymer equivalence in merging (treat AAA
                         vs AAAA as different)
+  --enable-homopolymer-equivalence
+                        Override --disable-homopolymer-equivalence or profile
+                        setting
+  --merge-effort LEVEL  Merging effort level: fast (8), balanced (10),
+                        thorough (12), or numeric 6-14. Higher values allow
+                        larger batch sizes for exhaustive subset search.
+                        Default: balanced
 Selection:
   --select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
@@ -1314,6 +1371,16 @@ Selection:
   --select-strategy {size,diversity}, --variant-selection {size,diversity}
                         Variant selection strategy: size or diversity
                         (default: size)
+  --select-min-size-ratio SELECT_MIN_SIZE_RATIO
+                        Minimum size ratio (variant/largest) to include in
+                        output (default: 0 = disabled, e.g. 0.2 for 20%
+                        cutoff)
+  --enable-full-consensus
+                        Generate a full consensus per variant group
+                        representing all variation from pre-merge variants
+                        (gaps never win)
+  --disable-full-consensus
+                        Override --enable-full-consensus or profile setting
 Performance:
   --scale-threshold SCALE_THRESHOLD

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-speconsense/__init__.py,sha256=VbusnaZk_gYhsif5UkANozOnjmCdNMDD_GOMPLfD5hQ,537
+speconsense/__init__.py,sha256=uLSZG2n0xobwuNT2PwZbytUg1DcyOr2aJlsbc52iKs0,537
 speconsense/cli.py,sha256=Kqb2da0IuazocAz72iqTnw71jI7UaQgxsHfb9CwiolU,85
 speconsense/msa.py,sha256=t1uDb-Tj5tDnB17QnNZPslpAiLXgAMIlnmMKBbwBKzs,31661
 speconsense/quality_report.py,sha256=Byrc115T03ybi7mpA0Bw8-gc83nhKPzDY0tyH1IIAMQ,19803
@@ -6,11 +6,12 @@ speconsense/synth.py,sha256=7kbifR9XZDcsB0wxo2PCHD8vLGEkVMTH3SQ724hTFGw,9892
 speconsense/types.py,sha256=_16nMMbfALEW212LDwTCan9u-gjvnS1ZQKpMK3y3zCE,1669
 speconsense/core/__init__.py,sha256=3AWfnmw1FTzzf-BRdGo1vRHjVJq7d-Wugsw50GJQY_0,694
 speconsense/core/__main__.py,sha256=dCfyQkVxxwlP6QqcWw9y5zp5iLzkG-fQsLmFHHEUlbI,112
-speconsense/core/cli.py,sha256=YmPj4CqjoFAyvgqXRZUFEjB7XAyUxho31rLKqBYw-yo,15767
+speconsense/core/cli.py,sha256=iepQMK0ZUhZvQShVZY_6WaHneR8ZIRKZ_b6NvVwaRwU,17186
 speconsense/core/clusterer.py,sha256=UFK5Ec0oMQ7l3GsFJOAhTFk7r90eOOdOBXRskm79Fwk,72093
 speconsense/core/workers.py,sha256=6pUyt-W9KxkillJ6TU1RjRh-_L-zRIwWqzIcBSeiOSc,25811
-speconsense/profiles/__init__.py,sha256=G2peTSx9xdFx0Z2nT71rTRHBwjWDxkFTdtGaQx-rujQ,16324
-speconsense/profiles/example.yaml,sha256=FGCsSAVtZL_m8PHWVLVrxaVCCwQkIk8wR7N7I2ZyxX0,4463
+speconsense/profiles/__init__.py,sha256=5UWj6VyUIXTzQ1kBZ4mJ2olZ_ADMK85rwr7KEmRfZfk,16382
+speconsense/profiles/compressed.yaml,sha256=LKtBm6nj8cpF2xeFcA7vzzNzaXdEo0JknnmcDDmdFj8,1227
+speconsense/profiles/example.yaml,sha256=UGHoVvFiB6iQ-lUU4rwInL6oE1eAd7Fo5qp14vfXJvA,4546
 speconsense/profiles/herbarium.yaml,sha256=1OyAPvBZmJ0eWHejfTU_NLd1_08F9n5WbeE686mzYGE,1125
 speconsense/profiles/largedata.yaml,sha256=7qwl5CHA7BiFcznycUoprOX_A-qrsZzV5fBLnA3QmcE,884
 speconsense/profiles/nostalgia.yaml,sha256=Hy20M88FiCmDvscyIKbwfNSusiHptmBm4pIWPiSFmp0,661
@@ -22,15 +23,15 @@ speconsense/scalability/vsearch.py,sha256=I1IzTeRzEFn9bi8mNbBRvtcHvUBzBFdE7D5yf-
 speconsense/summarize/__init__.py,sha256=PE6W9hytDxhkw7W6Fz8X3jd92N2VdhuxiQ72Nqm1xC0,3181
 speconsense/summarize/__main__.py,sha256=_hzLNqNtv4PirL1oMic37GW2QmjWquoznzNtld_3FiQ,117
 speconsense/summarize/analysis.py,sha256=1MXtKMpX1bgKEtI-JN6BwTQj99qyt1eQLqNg51EgPiE,31560
-speconsense/summarize/cli.py,sha256=Ptvt7D5Rwhg24_L_jqDI4xEb2ybRHoSEPC-c6a0K_yo,23281
+speconsense/summarize/cli.py,sha256=uSeY7__KpdQVXqJcQ0Zpn6ePeyJDVGdml7rZgHFr3W8,27124
 speconsense/summarize/clustering.py,sha256=kk-FdFCea8KRocowN_4dt_aoqZNVJMmEu7CVKPfYgK8,28346
-speconsense/summarize/fields.py,sha256=Mj7aUm4COU4niJeM6KG3R4c2SMD81lQNraDYWVh6CZ8,8550
-speconsense/summarize/io.py,sha256=Ofslc77qUDKB-9uFqdvjIPPZ0d0Tkjc_UsF0jwHWJZo,32220
+speconsense/summarize/fields.py,sha256=a6aK9hkPJ-sDRRSqM_7IkyqCki99KSMnsQMV-U7r2zY,8687
+speconsense/summarize/io.py,sha256=FdHLbcj0NOL3WE1e5OL85DRdJaHpyXPMcmlNg9mG3tM,32732
 speconsense/summarize/iupac.py,sha256=Y6KqELmnGy4Eya4C_4ldXY8uek0ReuSUgITLI3NW0-A,11042
-speconsense/summarize/merging.py,sha256=ptq6tJNplqHTf7U51Ppo2pGQkNF8ZTx-3kILT0Hbqxw,27754
-speconsense-0.7.2.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
-speconsense-0.7.2.dist-info/METADATA,sha256=1AlFCQ4kJ4_UKe3suOft55bF12gun1my0MyGqx2M5wg,75767
-speconsense-0.7.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-speconsense-0.7.2.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
-speconsense-0.7.2.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
-speconsense-0.7.2.dist-info/RECORD,,
+speconsense/summarize/merging.py,sha256=FakBey3qpu7ULPIsc2GDo9WG8jNU1L6q2pgQ2HrOKXk,28454
+speconsense-0.7.4.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
+speconsense-0.7.4.dist-info/METADATA,sha256=2vFyM5rqFEwMPIcsSAH32Dwh_bDA-bOk15D7El6MO7Y,79957
+speconsense-0.7.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+speconsense-0.7.4.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
+speconsense-0.7.4.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
+speconsense-0.7.4.dist-info/RECORD,,

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{speconsense-0.7.2.dist-info → speconsense-0.7.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

speconsense 0.7.2__py3-none-any.whl → 0.7.4__py3-none-any.whl

speconsense 0.7.2__py3-none-any.whl → 0.7.4__py3-none-any.whl