speconsense 0.7.2__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
speconsense/__init__.py CHANGED
@@ -5,7 +5,7 @@ A Python tool for experimental clustering and consensus generation as an alterna
5
5
  in the fungal DNA barcoding pipeline.
6
6
  """
7
7
 
8
- __version__ = "0.7.2"
8
+ __version__ = "0.7.4"
9
9
  __author__ = "Josh Walker"
10
10
  __email__ = "joshowalker@yahoo.com"
11
11
 
speconsense/core/cli.py CHANGED
@@ -66,6 +66,9 @@ def main():
66
66
  help="Disable position-based variant phasing (enabled by default). "
67
67
  "MCL graph clustering already separates most variants; this "
68
68
  "second pass analyzes MSA positions to phase remaining variants.")
69
+ phasing_group.add_argument("--enable-position-phasing", action="store_false",
70
+ dest="disable_position_phasing",
71
+ help="Override --disable-position-phasing or profile setting")
69
72
  phasing_group.add_argument("--min-variant-frequency", type=float, default=0.10,
70
73
  help="Minimum alternative allele frequency to call variant (default: 0.10 for 10%%)")
71
74
  phasing_group.add_argument("--min-variant-count", type=int, default=5,
@@ -75,6 +78,9 @@ def main():
75
78
  ambiguity_group = parser.add_argument_group("Ambiguity Calling")
76
79
  ambiguity_group.add_argument("--disable-ambiguity-calling", action="store_true",
77
80
  help="Disable IUPAC ambiguity code calling for unphased variant positions")
81
+ ambiguity_group.add_argument("--enable-ambiguity-calling", action="store_false",
82
+ dest="disable_ambiguity_calling",
83
+ help="Override --disable-ambiguity-calling or profile setting")
78
84
  ambiguity_group.add_argument("--min-ambiguity-frequency", type=float, default=0.10,
79
85
  help="Minimum alternative allele frequency for IUPAC ambiguity calling (default: 0.10 for 10%%)")
80
86
  ambiguity_group.add_argument("--min-ambiguity-count", type=int, default=3,
@@ -84,8 +90,14 @@ def main():
84
90
  merging_group = parser.add_argument_group("Cluster Merging")
85
91
  merging_group.add_argument("--disable-cluster-merging", action="store_true",
86
92
  help="Disable merging of clusters with identical consensus sequences")
93
+ merging_group.add_argument("--enable-cluster-merging", action="store_false",
94
+ dest="disable_cluster_merging",
95
+ help="Override --disable-cluster-merging or profile setting")
87
96
  merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
88
97
  help="Disable homopolymer equivalence in cluster merging (only merge identical sequences)")
98
+ merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
99
+ dest="disable_homopolymer_equivalence",
100
+ help="Override --disable-homopolymer-equivalence or profile setting")
89
101
 
90
102
  # Orientation group
91
103
  orient_group = parser.add_argument_group("Orientation")
@@ -104,11 +116,17 @@ def main():
104
116
  "0=auto-detect, default=1 (safe for parallel workflows).")
105
117
  perf_group.add_argument("--enable-early-filter", action="store_true",
106
118
  help="Enable early filtering to skip small clusters before variant phasing (improves performance for large datasets)")
119
+ perf_group.add_argument("--disable-early-filter", action="store_false",
120
+ dest="enable_early_filter",
121
+ help="Override --enable-early-filter or profile setting")
107
122
 
108
123
  # Debugging group
109
124
  debug_group = parser.add_argument_group("Debugging")
110
125
  debug_group.add_argument("--collect-discards", action="store_true",
111
126
  help="Write discarded reads (outliers and filtered clusters) to cluster_debug/{sample}-discards.fastq")
127
+ debug_group.add_argument("--no-collect-discards", action="store_false",
128
+ dest="collect_discards",
129
+ help="Override --collect-discards or profile setting")
112
130
  debug_group.add_argument("--log-level", default="INFO",
113
131
  choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
114
132
 
@@ -103,6 +103,8 @@ VALID_SUMMARIZE_KEYS = {
103
103
  "select-max-groups",
104
104
  "select-max-variants",
105
105
  "select-strategy",
106
+ "select-min-size-ratio",
107
+ "enable-full-consensus",
106
108
  # Processing
107
109
  "scale-threshold",
108
110
  "threads",
@@ -0,0 +1,28 @@
1
+ # Compress variants into minimal IUPAC consensus sequences
2
+ #
3
+ # Aggressively merges similar variants (including indels) into single
4
+ # IUPAC consensus sequences. Only truly dissimilar sequences remain
5
+ # separate. Uses 20% frequency thresholds throughout.
6
+ #
7
+ # Designed for workflows where reviewers want fewer sequences to
8
+ # examine, with all variation represented via IUPAC ambiguity codes.
9
+ # Partial overlap merging is disabled as a safety measure.
10
+ #
11
+ # Use with:
12
+ # speconsense input.fastq -p compressed
13
+ # speconsense-summarize -p compressed
14
+
15
+ speconsense-version: "0.7.*"
16
+ description: "Compress variants into minimal IUPAC consensus sequences"
17
+
18
+ speconsense:
19
+ min-ambiguity-frequency: 0.20 # 20% threshold for IUPAC ambiguity calling
20
+ min-variant-frequency: 0.20 # 20% threshold for variant phasing
21
+
22
+ speconsense-summarize:
23
+ merge-indel-length: 5 # Merge indels up to 5bp
24
+ merge-position-count: 10 # Allow up to 10 variant positions in a merge
25
+ merge-min-size-ratio: 0.2 # Match 20% calling threshold
26
+ select-min-size-ratio: 0.2 # Match 20% calling threshold
27
+ min-merge-overlap: 0 # Disable partial overlap merging
28
+ enable-full-consensus: true # Include full IUPAC consensus per group
@@ -91,6 +91,7 @@ speconsense-summarize:
91
91
  # select-max-groups: -1 # Max groups to output (-1 = no limit)
92
92
  # select-max-variants: -1 # Max variants per group (-1 = no limit)
93
93
  # select-strategy: size # Selection strategy: size or diversity
94
+ # select-min-size-ratio: 0 # Min size ratio to include variant (0 = disabled)
94
95
 
95
96
  # --- Processing ---
96
97
  # threads: 0 # Max threads (0 = auto-detect)
@@ -54,8 +54,8 @@ from .io import (
54
54
  write_output_files,
55
55
  )
56
56
  from .clustering import perform_hac_clustering, select_variants
57
- from .merging import merge_group_with_msa
58
- from .analysis import MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
57
+ from .merging import merge_group_with_msa, create_full_consensus_from_msa
58
+ from .analysis import run_spoa_msa, MAX_MSA_MERGE_VARIANTS, MIN_MERGE_BATCH, MAX_MERGE_BATCH
59
59
 
60
60
 
61
61
  # Merge effort configuration
@@ -132,6 +132,8 @@ def parse_arguments():
132
132
  merging_group = parser.add_argument_group("Merging")
133
133
  merging_group.add_argument("--disable-merging", action="store_true",
134
134
  help="Disable all variant merging (skip MSA-based merge evaluation entirely)")
135
+ merging_group.add_argument("--enable-merging", action="store_false", dest="disable_merging",
136
+ help="Override --disable-merging or profile setting")
135
137
  merging_group.add_argument("--merge-snp", action=argparse.BooleanOptionalAction, default=True,
136
138
  help="Enable SNP-based merging (default: True, use --no-merge-snp to disable)")
137
139
  merging_group.add_argument("--merge-indel-length", type=int, default=0,
@@ -144,6 +146,9 @@ def parse_arguments():
144
146
  help="Minimum overlap in bp for merging sequences of different lengths (default: 200, 0 to disable)")
145
147
  merging_group.add_argument("--disable-homopolymer-equivalence", action="store_true",
146
148
  help="Disable homopolymer equivalence in merging (treat AAA vs AAAA as different)")
149
+ merging_group.add_argument("--enable-homopolymer-equivalence", action="store_false",
150
+ dest="disable_homopolymer_equivalence",
151
+ help="Override --disable-homopolymer-equivalence or profile setting")
147
152
  merging_group.add_argument("--merge-effort", type=str, default="balanced", metavar="LEVEL",
148
153
  help="Merging effort level: fast (8), balanced (10), thorough (12), "
149
154
  "or numeric 6-14. Higher values allow larger batch sizes for "
@@ -164,6 +169,15 @@ def parse_arguments():
164
169
  selection_group.add_argument("--select-strategy", "--variant-selection",
165
170
  dest="select_strategy", choices=["size", "diversity"], default="size",
166
171
  help="Variant selection strategy: size or diversity (default: size)")
172
+ selection_group.add_argument("--select-min-size-ratio", type=float, default=0,
173
+ help="Minimum size ratio (variant/largest) to include in output "
174
+ "(default: 0 = disabled, e.g. 0.2 for 20%% cutoff)")
175
+ selection_group.add_argument("--enable-full-consensus", action="store_true",
176
+ help="Generate a full consensus per variant group representing all variation "
177
+ "from pre-merge variants (gaps never win)")
178
+ selection_group.add_argument("--disable-full-consensus", action="store_false",
179
+ dest="enable_full_consensus",
180
+ help="Override --enable-full-consensus or profile setting")
167
181
 
168
182
  # Performance group
169
183
  perf_group = parser.add_argument_group("Performance")
@@ -345,9 +359,21 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
345
359
  key=lambda x: max(m.size for m in x[1]),
346
360
  reverse=True)
347
361
 
348
- for group_idx, (_, group_members) in enumerate(sorted_groups):
362
+ for group_idx, (group_id, group_members) in enumerate(sorted_groups):
349
363
  final_group_name = group_idx + 1
350
364
 
365
+ # Apply select-min-size-ratio filter
366
+ if args.select_min_size_ratio > 0 and len(group_members) > 1:
367
+ largest_size = max(v.size for v in group_members)
368
+ filtered = [v for v in group_members
369
+ if (v.size / largest_size) >= args.select_min_size_ratio]
370
+ if len(filtered) < len(group_members):
371
+ filtered_count = len(group_members) - len(filtered)
372
+ logging.debug(f"Group {group_idx + 1}: filtered out {filtered_count} "
373
+ f"variants with size ratio < {args.select_min_size_ratio} "
374
+ f"relative to largest (size={largest_size})")
375
+ group_members = filtered
376
+
351
377
  # Select variants for this group
352
378
  selected_variants = select_variants(group_members, args.select_max_variants, args.select_strategy, group_number=final_group_name)
353
379
 
@@ -366,6 +392,35 @@ def process_single_specimen(file_consensuses: List[ConsensusInfo],
366
392
  final_consensus.append(renamed_variant)
367
393
  group_naming.append((variant.sample_name, new_name))
368
394
 
395
+ # Generate full consensus from PRE-MERGE variants
396
+ if getattr(args, 'enable_full_consensus', False):
397
+ pre_merge_variants = variant_groups[group_id]
398
+
399
+ # Apply size-ratio filter (same as merge pipeline)
400
+ if args.merge_min_size_ratio > 0 and len(pre_merge_variants) > 1:
401
+ largest_size = max(v.size for v in pre_merge_variants)
402
+ filtered = [v for v in pre_merge_variants
403
+ if (v.size / largest_size) >= args.merge_min_size_ratio]
404
+ if len(filtered) < len(pre_merge_variants):
405
+ filtered_count = len(pre_merge_variants) - len(filtered)
406
+ logging.debug(f"Full consensus: filtered out {filtered_count} variants with size ratio < {args.merge_min_size_ratio} relative to largest (size={largest_size})")
407
+ pre_merge_variants = filtered
408
+
409
+ specimen_base = selected_variants[0].sample_name.rsplit('-c', 1)[0]
410
+ full_name = f"{specimen_base}-{group_idx + 1}.full"
411
+
412
+ if len(pre_merge_variants) == 1:
413
+ # Single variant — copy directly
414
+ full_consensus = pre_merge_variants[0]._replace(sample_name=full_name)
415
+ else:
416
+ # MSA on pre-merge variants, full consensus logic
417
+ sequences = [v.sequence for v in pre_merge_variants]
418
+ aligned_seqs = run_spoa_msa(sequences, alignment_mode=1)
419
+ full_consensus = create_full_consensus_from_msa(aligned_seqs, pre_merge_variants)
420
+ full_consensus = full_consensus._replace(sample_name=full_name)
421
+
422
+ final_consensus.append(full_consensus)
423
+
369
424
  naming_info[group_idx + 1] = group_naming
370
425
 
371
426
  logging.info(f"Processed {file_name}: {len(final_consensus)} final variants across {len(merged_groups)} groups")
@@ -421,6 +476,8 @@ def main():
421
476
  logging.info(f" --select-max-variants: {args.select_max_variants}")
422
477
  logging.info(f" --select-max-groups: {args.select_max_groups}")
423
478
  logging.info(f" --select-strategy: {args.select_strategy}")
479
+ logging.info(f" --select-min-size-ratio: {args.select_min_size_ratio}")
480
+ logging.info(f" --enable-full-consensus: {args.enable_full_consensus}")
424
481
  logging.info(f" --log-level: {args.log_level}")
425
482
  logging.info("")
426
483
  logging.info("Processing each specimen file independently to organize variants within specimens")
@@ -124,8 +124,8 @@ class GroupField(FastaField):
124
124
  super().__init__('group', 'Variant group number')
125
125
 
126
126
  def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
127
- # Extract from sample_name (e.g., "...-1.v1" or "...-2.v1.raw1")
128
- match = re.search(r'-(\d+)\.v\d+(?:\.raw\d+)?$', consensus.sample_name)
127
+ # Extract from sample_name (e.g., "...-1.v1", "...-2.v1.raw1", or "...-1.full")
128
+ match = re.search(r'-(\d+)(?:\.v\d+(?:\.raw\d+)?|\.full)$', consensus.sample_name)
129
129
  if match:
130
130
  return f"group={match.group(1)}"
131
131
  return None
@@ -136,8 +136,10 @@ class VariantField(FastaField):
136
136
  super().__init__('variant', 'Variant identifier within group')
137
137
 
138
138
  def format_value(self, consensus: ConsensusInfo) -> Optional[str]:
139
- # Extract from sample_name (e.g., "...-1.v1" -> "v1" or "...-1.v1.raw1" -> "v1")
139
+ # Extract from sample_name (e.g., "...-1.v1" -> "v1", "...-1.v1.raw1" -> "v1", "...-1.full" -> "full")
140
140
  match = re.search(r'\.(v\d+)(?:\.raw\d+)?$', consensus.sample_name)
141
+ if not match:
142
+ match = re.search(r'\.(full)$', consensus.sample_name)
141
143
  if match:
142
144
  return f"variant={match.group(1)}"
143
145
  return None
@@ -358,6 +358,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
358
358
  # Generate .raw file consensuses for merged variants
359
359
  raw_file_consensuses = []
360
360
  for consensus in specimen_consensus:
361
+ # Skip .raw generation for .full consensus (synthetic/derived)
362
+ if consensus.sample_name.endswith('.full'):
363
+ continue
361
364
  # Only create .raw files if this consensus was actually merged
362
365
  if consensus.raw_ric and len(consensus.raw_ric) > 1:
363
366
  # Find the original cluster name from naming_info
@@ -412,6 +415,9 @@ def write_specimen_data_files(specimen_consensus: List[ConsensusInfo],
412
415
 
413
416
  # Write FASTQ files for each final consensus containing all contributing reads
414
417
  for consensus in specimen_consensus:
418
+ # Skip FASTQ for .full consensus (synthetic/derived, no traceable cluster reads)
419
+ if consensus.sample_name.endswith('.full'):
420
+ continue
415
421
  write_consensus_fastq(consensus, merge_traceability, naming_info, fastq_dir, fastq_lookup, original_consensus_lookup)
416
422
 
417
423
  # Write .raw files (individual FASTA and FASTQ for pre-merge variants)
@@ -704,7 +710,10 @@ def write_output_files(final_consensus: List[ConsensusInfo],
704
710
  multiple_id = specimen_counters[base_name]
705
711
  writer.writerow([consensus.sample_name, len(consensus.sequence), consensus.ric, multiple_id])
706
712
  unique_samples.add(base_name)
707
- total_ric += consensus.ric
713
+ # Exclude .full from total RiC to avoid double-counting
714
+ # (.full aggregates reads already counted in merged variants)
715
+ if not consensus.sample_name.endswith('.full'):
716
+ total_ric += consensus.ric
708
717
 
709
718
  writer.writerow([])
710
719
  writer.writerow(['Total Unique Samples', len(unique_samples)])
@@ -106,6 +106,63 @@ def is_compatible_subset(variant_stats: dict, args, prior_positions: dict = None
106
106
  return True
107
107
 
108
108
 
109
+ def _build_merged_consensus_info(
110
+ consensus_seq: list, snp_count: int, variants: List[ConsensusInfo]
111
+ ) -> ConsensusInfo:
112
+ """Assemble a ConsensusInfo from column-voting results and source variants.
113
+
114
+ Handles joining the consensus sequence, aggregating size/ric totals,
115
+ flattening raw_ric/raw_len merge history, and selecting metadata
116
+ from the largest variant.
117
+
118
+ Args:
119
+ consensus_seq: List of consensus characters from column voting
120
+ snp_count: Number of ambiguous (multi-base) positions
121
+ variants: Source ConsensusInfo objects that were merged
122
+
123
+ Returns:
124
+ ConsensusInfo with merged metadata
125
+ """
126
+ consensus_sequence = ''.join(consensus_seq)
127
+ total_size = sum(v.size for v in variants)
128
+ total_ric = sum(v.ric for v in variants)
129
+
130
+ # Collect RiC values, preserving any prior merge history
131
+ raw_ric_values = []
132
+ for v in variants:
133
+ if v.raw_ric:
134
+ raw_ric_values.extend(v.raw_ric)
135
+ else:
136
+ raw_ric_values.append(v.ric)
137
+ raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
138
+
139
+ # Collect lengths, preserving any prior merge history
140
+ raw_len_values = []
141
+ for v in variants:
142
+ if v.raw_len:
143
+ raw_len_values.extend(v.raw_len)
144
+ else:
145
+ raw_len_values.append(len(v.sequence))
146
+ raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
147
+
148
+ largest_variant = max(variants, key=lambda v: v.size)
149
+
150
+ return ConsensusInfo(
151
+ sample_name=largest_variant.sample_name,
152
+ cluster_id=largest_variant.cluster_id,
153
+ sequence=consensus_sequence,
154
+ ric=total_ric,
155
+ size=total_size,
156
+ file_path=largest_variant.file_path,
157
+ snp_count=snp_count if snp_count > 0 else None,
158
+ primers=largest_variant.primers,
159
+ raw_ric=raw_ric_values,
160
+ raw_len=raw_len_values,
161
+ rid=largest_variant.rid,
162
+ rid_min=largest_variant.rid_min,
163
+ )
164
+
165
+
109
166
  def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
110
167
  """
111
168
  Generate consensus from MSA using size-weighted majority voting.
@@ -160,46 +217,7 @@ def create_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo])
160
217
  snp_count += 1
161
218
  # else: majority wants gap, omit position
162
219
 
163
- # Create merged ConsensusInfo
164
- consensus_sequence = ''.join(consensus_seq)
165
- total_size = sum(v.size for v in variants)
166
- total_ric = sum(v.ric for v in variants)
167
-
168
- # Collect RiC values, preserving any prior merge history
169
- raw_ric_values = []
170
- for v in variants:
171
- if v.raw_ric:
172
- raw_ric_values.extend(v.raw_ric) # Flatten prior merge history
173
- else:
174
- raw_ric_values.append(v.ric)
175
- raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
176
-
177
- # Collect lengths, preserving any prior merge history
178
- raw_len_values = []
179
- for v in variants:
180
- if v.raw_len:
181
- raw_len_values.extend(v.raw_len) # Flatten prior merge history
182
- else:
183
- raw_len_values.append(len(v.sequence))
184
- raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
185
-
186
- # Use name from largest variant
187
- largest_variant = max(variants, key=lambda v: v.size)
188
-
189
- return ConsensusInfo(
190
- sample_name=largest_variant.sample_name,
191
- cluster_id=largest_variant.cluster_id,
192
- sequence=consensus_sequence,
193
- ric=total_ric,
194
- size=total_size,
195
- file_path=largest_variant.file_path,
196
- snp_count=snp_count if snp_count > 0 else None,
197
- primers=largest_variant.primers,
198
- raw_ric=raw_ric_values,
199
- raw_len=raw_len_values,
200
- rid=largest_variant.rid, # Preserve identity metrics from largest variant
201
- rid_min=largest_variant.rid_min,
202
- )
220
+ return _build_merged_consensus_info(consensus_seq, snp_count, variants)
203
221
 
204
222
 
205
223
  def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
@@ -295,46 +313,49 @@ def create_overlap_consensus_from_msa(aligned_seqs: List, variants: List[Consens
295
313
  consensus_seq.append(iupac_code)
296
314
  snp_count += 1
297
315
 
298
- # Create merged ConsensusInfo
299
- consensus_sequence = ''.join(consensus_seq)
300
- total_size = sum(v.size for v in variants)
301
- total_ric = sum(v.ric for v in variants)
316
+ return _build_merged_consensus_info(consensus_seq, snp_count, variants)
302
317
 
303
- # Collect RiC values, preserving any prior merge history
304
- raw_ric_values = []
305
- for v in variants:
306
- if v.raw_ric:
307
- raw_ric_values.extend(v.raw_ric) # Flatten prior merge history
308
- else:
309
- raw_ric_values.append(v.ric)
310
- raw_ric_values = sorted(raw_ric_values, reverse=True) if len(variants) > 1 else None
311
318
 
312
- # Collect lengths, preserving any prior merge history
313
- raw_len_values = []
314
- for v in variants:
315
- if v.raw_len:
316
- raw_len_values.extend(v.raw_len) # Flatten prior merge history
317
- else:
318
- raw_len_values.append(len(v.sequence))
319
- raw_len_values = sorted(raw_len_values, reverse=True) if len(variants) > 1 else None
319
+ def create_full_consensus_from_msa(aligned_seqs: List, variants: List[ConsensusInfo]) -> ConsensusInfo:
320
+ """
321
+ Generate full consensus from MSA where any non-gap base means inclusion.
320
322
 
321
- # Use name from largest variant
322
- largest_variant = max(variants, key=lambda v: v.size)
323
+ Unlike create_consensus_from_msa where gaps can win by majority vote,
324
+ the full consensus includes a position if ANY variant has a base there.
325
+ This captures all variation from all contributing variants.
323
326
 
324
- return ConsensusInfo(
325
- sample_name=largest_variant.sample_name,
326
- cluster_id=largest_variant.cluster_id,
327
- sequence=consensus_sequence,
328
- ric=total_ric,
329
- size=total_size,
330
- file_path=largest_variant.file_path,
331
- snp_count=snp_count if snp_count > 0 else None,
332
- primers=largest_variant.primers,
333
- raw_ric=raw_ric_values,
334
- raw_len=raw_len_values,
335
- rid=largest_variant.rid,
336
- rid_min=largest_variant.rid_min,
337
- )
327
+ Args:
328
+ aligned_seqs: MSA sequences with gaps as '-'
329
+ variants: Original ConsensusInfo objects (for metadata)
330
+
331
+ Returns:
332
+ ConsensusInfo with full consensus sequence
333
+ """
334
+ consensus_seq = []
335
+ snp_count = 0
336
+ alignment_length = len(aligned_seqs[0].seq)
337
+
338
+ for col_idx in range(alignment_length):
339
+ column = [str(seq.seq[col_idx]) for seq in aligned_seqs]
340
+
341
+ # Collect non-gap bases
342
+ base_votes = defaultdict(int)
343
+ for i, base in enumerate(column):
344
+ upper_base = base.upper()
345
+ if upper_base != '-':
346
+ base_votes[upper_base] += variants[i].size
347
+
348
+ # Include position if ANY variant has a base (gaps never win)
349
+ if base_votes:
350
+ if len(base_votes) == 1:
351
+ consensus_seq.append(list(base_votes.keys())[0])
352
+ else:
353
+ represented_bases = set(base_votes.keys())
354
+ iupac_code = merge_bases_to_iupac(represented_bases)
355
+ consensus_seq.append(iupac_code)
356
+ snp_count += 1
357
+
358
+ return _build_merged_consensus_info(consensus_seq, snp_count, variants)
338
359
 
339
360
 
340
361
  def merge_group_with_msa(variants: List[ConsensusInfo], args) -> Tuple[List[ConsensusInfo], Dict, int, List[OverlapMergeInfo]]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: speconsense
3
- Version: 0.7.2
3
+ Version: 0.7.4
4
4
  Summary: High-quality clustering and consensus generation for Oxford Nanopore amplicon reads
5
5
  Author-email: Josh Walker <joshowalker@yahoo.com>
6
6
  License: BSD-3-Clause
@@ -171,6 +171,7 @@ speconsense input.fastq -p herbarium --min-size 10
171
171
  ```
172
172
 
173
173
  **Bundled profiles:**
174
+ - `compressed` — Compress variants into minimal IUPAC consensus sequences (aggressive merging with indels, 20% thresholds, full consensus, 20% selection size ratio)
174
175
  - `herbarium` — High-recall for degraded DNA/type specimens
175
176
  - `largedata` — Experimental settings for large input files
176
177
  - `nostalgia` — Simulate older bioinformatics pipelines
@@ -294,12 +295,14 @@ When using `speconsense-summarize` for post-processing, creates `__Summary__/` d
294
295
  |---------------|-------------|------------|-------------|
295
296
  | **Original** | Source `cluster_debug/` | `-c1`, `-c2`, `-c3` | Preserves speconsense clustering results |
296
297
  | **Summarization** | `__Summary__/`, `FASTQ Files/`, `variants/` | `-1.v1`, `-1.v2`, `-2.v1`, `.raw1` | Post-processing groups and variants |
298
+ | **Full consensus** | `__Summary__/` | `-1.full` | IUPAC consensus from all pre-merge variants in a group |
297
299
 
298
300
  ### Example Directory Structure
299
301
  ```
300
302
  __Summary__/
301
303
  ├── sample-1.v1-RiC45.fasta # Primary variant (group 1, merged)
302
304
  ├── sample-1.v2-RiC23.fasta # Additional variant (not merged)
305
+ ├── sample-1.full-RiC68.fasta # Full IUPAC consensus for group 1 (all pre-merge variants)
303
306
  ├── sample-2.v1-RiC30.fasta # Second organism group, primary variant
304
307
  ├── summary.fasta # All final consensus sequences (excludes .raw)
305
308
  ├── summary.txt # Statistics
@@ -675,6 +678,18 @@ speconsense-summarize --select-strategy diversity --select-max-variants 2
675
678
  - Output up to select_max_variants per group
676
679
  3. Final output contains representatives from all groups, ensuring both biological diversity (between groups) and appropriate sampling within each biological entity (within groups)
677
680
 
681
+ **Selection Size Ratio Filtering:**
682
+ ```bash
683
+ speconsense-summarize --select-min-size-ratio 0.2
684
+ ```
685
+ - Filters out post-merge variants whose size is too small relative to the largest variant in their group
686
+ - Ratio calculated as `variant_size / largest_size` — must be ≥ threshold to keep
687
+ - Example: `--select-min-size-ratio 0.2` means a variant must have ≥20% the reads of the largest variant in its group
688
+ - Default is 0 (disabled) — all post-merge variants pass through to selection
689
+ - Applied after merging but before variant selection
690
+ - Useful for suppressing noise variants that survived merging but are too small to be meaningful
691
+ - Set to 0.2 in the `compressed` profile to match the 20% calling threshold theme
692
+
678
693
  This two-stage process ensures that distinct biological sequences are preserved as separate groups, while providing control over variant complexity within each group.
679
694
 
680
695
  ### Customizing FASTA Header Fields
@@ -810,6 +825,18 @@ For high-throughput workflows (e.g., 100K sequences/year), this prioritization e
810
825
 
811
826
  ### Additional Summarize Options
812
827
 
828
+ **Full Consensus:**
829
+ ```bash
830
+ speconsense-summarize --enable-full-consensus
831
+ ```
832
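+ - Generates a full IUPAC consensus sequence per variant group from the group's pre-merge variants (when `--merge-min-size-ratio` is set, variants below that ratio relative to the group's largest variant are excluded first)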
833
+ - Output named `{specimen}-{group}.full-RiC{reads}.fasta` in the `__Summary__/` directory
834
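+ - Includes every alignment column where at least one variant has a base; **gaps never win** — a column with a single observed base keeps that base, and a column where variants disagree is written as the IUPAC ambiguity code covering all observed bases (no majority vote is applied)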
835
+ - Useful when you want a single representative sequence that captures all variation within a group as IUPAC ambiguity codes
836
+ - Included in `summary.fasta` (but excluded from total RiC to avoid double-counting)
837
+ - Enabled by default in the `compressed` profile
838
+ - Use `--disable-full-consensus` to override when set by a profile
839
+
813
840
  **Quality Filtering:**
814
841
  ```bash
815
842
  speconsense-summarize --min-ric 5
@@ -1044,8 +1071,10 @@ The complete speconsense-summarize workflow operates in this order:
1044
1071
  2. **HAC variant grouping** by sequence identity to separate dissimilar sequences (`--group-identity`); uses single-linkage when overlap merging is enabled
1045
1072
  3. **Group filtering** to limit output groups (`--select-max-groups`)
1046
1073
  4. **Homopolymer-aware MSA-based variant merging** within each group, including **overlap merging** for different-length sequences (`--disable-merging`, `--merge-effort`, `--merge-position-count`, `--merge-indel-length`, `--min-merge-overlap`, `--merge-snp`, `--merge-min-size-ratio`, `--disable-homopolymer-equivalence`)
1047
- 5. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1048
- 6. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1074
+ 5. **Selection size ratio filtering** to remove tiny post-merge variants (`--select-min-size-ratio`)
1075
+ 6. **Variant selection** within each group (`--select-max-variants`, `--select-strategy`)
1076
+ 7. **Full consensus generation** (optional) — IUPAC consensus from all pre-merge variants per group (`--enable-full-consensus`)
1077
+ 8. **Output generation** with customizable header fields (`--fasta-fields`) and full traceability
1049
1078
 
1050
1079
  **Key architectural features**:
1051
1080
  - HAC grouping occurs BEFORE merging to prevent inappropriate merging of dissimilar sequences (e.g., contaminants with primary targets)
@@ -1098,17 +1127,20 @@ usage: speconsense [-h] [-O OUTPUT_DIR] [--primers PRIMERS]
1098
1127
  [--min-cluster-ratio MIN_CLUSTER_RATIO]
1099
1128
  [--max-sample-size MAX_SAMPLE_SIZE]
1100
1129
  [--outlier-identity OUTLIER_IDENTITY]
1101
- [--disable-position-phasing]
1130
+ [--disable-position-phasing] [--enable-position-phasing]
1102
1131
  [--min-variant-frequency MIN_VARIANT_FREQUENCY]
1103
1132
  [--min-variant-count MIN_VARIANT_COUNT]
1104
- [--disable-ambiguity-calling]
1133
+ [--disable-ambiguity-calling] [--enable-ambiguity-calling]
1105
1134
  [--min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY]
1106
1135
  [--min-ambiguity-count MIN_AMBIGUITY_COUNT]
1107
- [--disable-cluster-merging]
1136
+ [--disable-cluster-merging] [--enable-cluster-merging]
1108
1137
  [--disable-homopolymer-equivalence]
1138
+ [--enable-homopolymer-equivalence]
1109
1139
  [--orient-mode {skip,keep-all,filter-failed}]
1110
1140
  [--presample PRESAMPLE] [--scale-threshold SCALE_THRESHOLD]
1111
- [--threads N] [--enable-early-filter] [--collect-discards]
1141
+ [--threads N] [--enable-early-filter]
1142
+ [--disable-early-filter] [--collect-discards]
1143
+ [--no-collect-discards]
1112
1144
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1113
1145
  [--version] [-p NAME] [--list-profiles]
1114
1146
  input_file
@@ -1167,6 +1199,8 @@ Variant Phasing:
1167
1199
  default). MCL graph clustering already separates most
1168
1200
  variants; this second pass analyzes MSA positions to
1169
1201
  phase remaining variants.
1202
+ --enable-position-phasing
1203
+ Override --disable-position-phasing or profile setting
1170
1204
  --min-variant-frequency MIN_VARIANT_FREQUENCY
1171
1205
  Minimum alternative allele frequency to call variant
1172
1206
  (default: 0.10 for 10%)
@@ -1178,6 +1212,9 @@ Ambiguity Calling:
1178
1212
  --disable-ambiguity-calling
1179
1213
  Disable IUPAC ambiguity code calling for unphased
1180
1214
  variant positions
1215
+ --enable-ambiguity-calling
1216
+ Override --disable-ambiguity-calling or profile
1217
+ setting
1181
1218
  --min-ambiguity-frequency MIN_AMBIGUITY_FREQUENCY
1182
1219
  Minimum alternative allele frequency for IUPAC
1183
1220
  ambiguity calling (default: 0.10 for 10%)
@@ -1189,9 +1226,14 @@ Cluster Merging:
1189
1226
  --disable-cluster-merging
1190
1227
  Disable merging of clusters with identical consensus
1191
1228
  sequences
1229
+ --enable-cluster-merging
1230
+ Override --disable-cluster-merging or profile setting
1192
1231
  --disable-homopolymer-equivalence
1193
1232
  Disable homopolymer equivalence in cluster merging
1194
1233
  (only merge identical sequences)
1234
+ --enable-homopolymer-equivalence
1235
+ Override --disable-homopolymer-equivalence or profile
1236
+ setting
1195
1237
 
1196
1238
  Orientation:
1197
1239
  --orient-mode {skip,keep-all,filter-failed}
@@ -1213,10 +1255,14 @@ Performance:
1213
1255
  Enable early filtering to skip small clusters before
1214
1256
  variant phasing (improves performance for large
1215
1257
  datasets)
1258
+ --disable-early-filter
1259
+ Override --enable-early-filter or profile setting
1216
1260
 
1217
1261
  Debugging:
1218
1262
  --collect-discards Write discarded reads (outliers and filtered clusters)
1219
1263
  to cluster_debug/{sample}-discards.fastq
1264
+ --no-collect-discards
1265
+ Override --collect-discards or profile setting
1220
1266
  --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
1221
1267
  ```
1222
1268
 
@@ -1227,15 +1273,22 @@ usage: speconsense-summarize [-h] [--source SOURCE]
1227
1273
  [--summary-dir SUMMARY_DIR]
1228
1274
  [--fasta-fields FASTA_FIELDS] [--min-ric MIN_RIC]
1229
1275
  [--min-len MIN_LEN] [--max-len MAX_LEN]
1230
- [--group-identity GROUP_IDENTITY] [--merge-snp]
1276
+ [--group-identity GROUP_IDENTITY]
1277
+ [--disable-merging] [--enable-merging]
1278
+ [--merge-snp | --no-merge-snp]
1231
1279
  [--merge-indel-length MERGE_INDEL_LENGTH]
1232
1280
  [--merge-position-count MERGE_POSITION_COUNT]
1233
1281
  [--merge-min-size-ratio MERGE_MIN_SIZE_RATIO]
1234
1282
  [--min-merge-overlap MIN_MERGE_OVERLAP]
1235
1283
  [--disable-homopolymer-equivalence]
1284
+ [--enable-homopolymer-equivalence]
1285
+ [--merge-effort LEVEL]
1236
1286
  [--select-max-groups SELECT_MAX_GROUPS]
1237
1287
  [--select-max-variants SELECT_MAX_VARIANTS]
1238
1288
  [--select-strategy {size,diversity}]
1289
+ [--select-min-size-ratio SELECT_MIN_SIZE_RATIO]
1290
+ [--enable-full-consensus]
1291
+ [--disable-full-consensus]
1239
1292
  [--scale-threshold SCALE_THRESHOLD] [--threads N]
1240
1293
  [--log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
1241
1294
  [--version] [-p NAME] [--list-profiles]
@@ -1281,10 +1334,7 @@ Grouping:
1281
1334
  Merging:
1282
1335
  --disable-merging Disable all variant merging (skip MSA-based merge
1283
1336
  evaluation entirely)
1284
- --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1285
- thorough (12), or numeric 6-14. Higher values allow
1286
- larger batch sizes for exhaustive subset search.
1287
- Default: balanced
1337
+ --enable-merging Override --disable-merging or profile setting
1288
1338
  --merge-snp, --no-merge-snp
1289
1339
  Enable SNP-based merging (default: True, use
1290
1340
  --no-merge-snp to disable)
@@ -1303,6 +1353,13 @@ Merging:
1303
1353
  --disable-homopolymer-equivalence
1304
1354
  Disable homopolymer equivalence in merging (treat AAA
1305
1355
  vs AAAA as different)
1356
+ --enable-homopolymer-equivalence
1357
+ Override --disable-homopolymer-equivalence or profile
1358
+ setting
1359
+ --merge-effort LEVEL Merging effort level: fast (8), balanced (10),
1360
+ thorough (12), or numeric 6-14. Higher values allow
1361
+ larger batch sizes for exhaustive subset search.
1362
+ Default: balanced
1306
1363
 
1307
1364
  Selection:
1308
1365
  --select-max-groups SELECT_MAX_GROUPS, --max-groups SELECT_MAX_GROUPS
@@ -1314,6 +1371,16 @@ Selection:
1314
1371
  --select-strategy {size,diversity}, --variant-selection {size,diversity}
1315
1372
  Variant selection strategy: size or diversity
1316
1373
  (default: size)
1374
+ --select-min-size-ratio SELECT_MIN_SIZE_RATIO
1375
+ Minimum size ratio (variant/largest) to include in
1376
+ output (default: 0 = disabled, e.g. 0.2 for 20%
1377
+ cutoff)
1378
+ --enable-full-consensus
1379
+ Generate a full consensus per variant group
1380
+ representing all variation from pre-merge variants
1381
+ (gaps never win)
1382
+ --disable-full-consensus
1383
+ Override --enable-full-consensus or profile setting
1317
1384
 
1318
1385
  Performance:
1319
1386
  --scale-threshold SCALE_THRESHOLD
@@ -1,4 +1,4 @@
1
- speconsense/__init__.py,sha256=VbusnaZk_gYhsif5UkANozOnjmCdNMDD_GOMPLfD5hQ,537
1
+ speconsense/__init__.py,sha256=uLSZG2n0xobwuNT2PwZbytUg1DcyOr2aJlsbc52iKs0,537
2
2
  speconsense/cli.py,sha256=Kqb2da0IuazocAz72iqTnw71jI7UaQgxsHfb9CwiolU,85
3
3
  speconsense/msa.py,sha256=t1uDb-Tj5tDnB17QnNZPslpAiLXgAMIlnmMKBbwBKzs,31661
4
4
  speconsense/quality_report.py,sha256=Byrc115T03ybi7mpA0Bw8-gc83nhKPzDY0tyH1IIAMQ,19803
@@ -6,11 +6,12 @@ speconsense/synth.py,sha256=7kbifR9XZDcsB0wxo2PCHD8vLGEkVMTH3SQ724hTFGw,9892
6
6
  speconsense/types.py,sha256=_16nMMbfALEW212LDwTCan9u-gjvnS1ZQKpMK3y3zCE,1669
7
7
  speconsense/core/__init__.py,sha256=3AWfnmw1FTzzf-BRdGo1vRHjVJq7d-Wugsw50GJQY_0,694
8
8
  speconsense/core/__main__.py,sha256=dCfyQkVxxwlP6QqcWw9y5zp5iLzkG-fQsLmFHHEUlbI,112
9
- speconsense/core/cli.py,sha256=YmPj4CqjoFAyvgqXRZUFEjB7XAyUxho31rLKqBYw-yo,15767
9
+ speconsense/core/cli.py,sha256=iepQMK0ZUhZvQShVZY_6WaHneR8ZIRKZ_b6NvVwaRwU,17186
10
10
  speconsense/core/clusterer.py,sha256=UFK5Ec0oMQ7l3GsFJOAhTFk7r90eOOdOBXRskm79Fwk,72093
11
11
  speconsense/core/workers.py,sha256=6pUyt-W9KxkillJ6TU1RjRh-_L-zRIwWqzIcBSeiOSc,25811
12
- speconsense/profiles/__init__.py,sha256=G2peTSx9xdFx0Z2nT71rTRHBwjWDxkFTdtGaQx-rujQ,16324
13
- speconsense/profiles/example.yaml,sha256=FGCsSAVtZL_m8PHWVLVrxaVCCwQkIk8wR7N7I2ZyxX0,4463
12
+ speconsense/profiles/__init__.py,sha256=5UWj6VyUIXTzQ1kBZ4mJ2olZ_ADMK85rwr7KEmRfZfk,16382
13
+ speconsense/profiles/compressed.yaml,sha256=LKtBm6nj8cpF2xeFcA7vzzNzaXdEo0JknnmcDDmdFj8,1227
14
+ speconsense/profiles/example.yaml,sha256=UGHoVvFiB6iQ-lUU4rwInL6oE1eAd7Fo5qp14vfXJvA,4546
14
15
  speconsense/profiles/herbarium.yaml,sha256=1OyAPvBZmJ0eWHejfTU_NLd1_08F9n5WbeE686mzYGE,1125
15
16
  speconsense/profiles/largedata.yaml,sha256=7qwl5CHA7BiFcznycUoprOX_A-qrsZzV5fBLnA3QmcE,884
16
17
  speconsense/profiles/nostalgia.yaml,sha256=Hy20M88FiCmDvscyIKbwfNSusiHptmBm4pIWPiSFmp0,661
@@ -22,15 +23,15 @@ speconsense/scalability/vsearch.py,sha256=I1IzTeRzEFn9bi8mNbBRvtcHvUBzBFdE7D5yf-
22
23
  speconsense/summarize/__init__.py,sha256=PE6W9hytDxhkw7W6Fz8X3jd92N2VdhuxiQ72Nqm1xC0,3181
23
24
  speconsense/summarize/__main__.py,sha256=_hzLNqNtv4PirL1oMic37GW2QmjWquoznzNtld_3FiQ,117
24
25
  speconsense/summarize/analysis.py,sha256=1MXtKMpX1bgKEtI-JN6BwTQj99qyt1eQLqNg51EgPiE,31560
25
- speconsense/summarize/cli.py,sha256=Ptvt7D5Rwhg24_L_jqDI4xEb2ybRHoSEPC-c6a0K_yo,23281
26
+ speconsense/summarize/cli.py,sha256=uSeY7__KpdQVXqJcQ0Zpn6ePeyJDVGdml7rZgHFr3W8,27124
26
27
  speconsense/summarize/clustering.py,sha256=kk-FdFCea8KRocowN_4dt_aoqZNVJMmEu7CVKPfYgK8,28346
27
- speconsense/summarize/fields.py,sha256=Mj7aUm4COU4niJeM6KG3R4c2SMD81lQNraDYWVh6CZ8,8550
28
- speconsense/summarize/io.py,sha256=Ofslc77qUDKB-9uFqdvjIPPZ0d0Tkjc_UsF0jwHWJZo,32220
28
+ speconsense/summarize/fields.py,sha256=a6aK9hkPJ-sDRRSqM_7IkyqCki99KSMnsQMV-U7r2zY,8687
29
+ speconsense/summarize/io.py,sha256=FdHLbcj0NOL3WE1e5OL85DRdJaHpyXPMcmlNg9mG3tM,32732
29
30
  speconsense/summarize/iupac.py,sha256=Y6KqELmnGy4Eya4C_4ldXY8uek0ReuSUgITLI3NW0-A,11042
30
- speconsense/summarize/merging.py,sha256=ptq6tJNplqHTf7U51Ppo2pGQkNF8ZTx-3kILT0Hbqxw,27754
31
- speconsense-0.7.2.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
32
- speconsense-0.7.2.dist-info/METADATA,sha256=1AlFCQ4kJ4_UKe3suOft55bF12gun1my0MyGqx2M5wg,75767
33
- speconsense-0.7.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
34
- speconsense-0.7.2.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
35
- speconsense-0.7.2.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
36
- speconsense-0.7.2.dist-info/RECORD,,
31
+ speconsense/summarize/merging.py,sha256=FakBey3qpu7ULPIsc2GDo9WG8jNU1L6q2pgQ2HrOKXk,28454
32
+ speconsense-0.7.4.dist-info/licenses/LICENSE,sha256=T_VYPNbu9NSWjdQunfk4jqUGND_kYWe_An18s6N492o,1498
33
+ speconsense-0.7.4.dist-info/METADATA,sha256=2vFyM5rqFEwMPIcsSAH32Dwh_bDA-bOk15D7El6MO7Y,79957
34
+ speconsense-0.7.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
35
+ speconsense-0.7.4.dist-info/entry_points.txt,sha256=C0zFp5EYA8_KCb04uOyb4JNkxNH7bli1eU-XYrSX3BU,147
36
+ speconsense-0.7.4.dist-info/top_level.txt,sha256=nYUJOHrqeX-OOxOYQKvpp7Iv8-Bed18wN1DBwWfJKnQ,12
37
+ speconsense-0.7.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5