speconsense-0.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
+ # High-precision settings for confident results
+ #
+ # Use this profile when:
+ # - Precision is more important than recall
+ # - You need high-confidence sequences for publication
+ # - Working with well-characterized reference databases
+ # - False positives would be more problematic than false negatives
+ #
+ # The settings prioritize quality over completeness, filtering out
+ # low-support variants and limiting output to well-supported sequences.
+
+ speconsense-version: "0.7.*"
+ description: "High-precision settings for confident results"
+
+ speconsense:
+   min-identity: 0.95               # High initial clustering identity
+   outlier-identity: 0.98           # Discard reads with high mismatch
+   min-cluster-ratio: 0.05          # Filter more aggressively (5% minimum)
+   min-size: 10                     # Require strong cluster support
+   min-variant-frequency: 0.25      # Less sensitive to variants (25%)
+   disable-ambiguity-calling: true  # Don't call ambiguities
+   presample: 0                     # Use all reads
+   max-sample-size: 100             # 100 reads sufficient for high quality consensus
+
+ speconsense-summarize:
+   min-ric: 5                       # Higher consensus threshold
+   disable-merging: true            # Skip merging entirely - preserve all variants
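
The profile above is plain YAML, so its settings can be inspected directly before a run. A minimal sketch (not part of the package) that loads the profile with PyYAML and reads the two option groups; the file name is hypothetical:

    import yaml

    # "high-precision.yaml" is a placeholder name for the profile shown above.
    with open("high-precision.yaml") as fh:
        profile = yaml.safe_load(fh)

    clustering_opts = profile["speconsense"]            # e.g. clustering_opts["min-identity"] == 0.95
    summarize_opts = profile["speconsense-summarize"]   # e.g. summarize_opts["min-ric"] == 5
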
@@ -0,0 +1,499 @@
+ """
+ Quality report generation for speconsense-summarize.
+
+ This module handles the generation of quality reports with multiple analysis sections:
+ - Executive Summary
+ - Read Identity Analysis
+ - Positional Identity Analysis
+ - Overlap Merge Analysis
+ - Interpretation Guide
+ """
+
+ import logging
+ import os
+ import re
+ from datetime import datetime
+ from typing import Dict, List, Tuple, TextIO
+
+ from tqdm import tqdm
+
+ # Import shared types
+ from speconsense.types import ConsensusInfo, OverlapMergeInfo
+
+ # Import helper functions from summarize (safe because summarize uses deferred import for this module)
+ from speconsense.summarize import (
+     identify_outliers,
+     analyze_positional_identity_outliers,
+     load_metadata_from_json,
+     write_position_debug_file,
+ )
+
+
+ def write_header_section(f: TextIO, source_folder: str):
+     """Write the report header with timestamp and source info."""
+     f.write("=" * 80 + "\n")
+     f.write("QUALITY REPORT - speconsense-summarize\n")
+     f.write("=" * 80 + "\n")
+     f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+     f.write(f"Source: {source_folder}\n")
+     f.write("=" * 80 + "\n\n")
+
+
+ def write_executive_summary_section(
+     f: TextIO,
+     total_seqs: int,
+     total_merged: int,
+     stats: Dict,
+     n_stat: int,
+     n_pos: int,
+     n_merged: int,
+     total_flagged: int
+ ):
+     """Write the executive summary section with high-level statistics."""
+     f.write("EXECUTIVE SUMMARY\n")
+     f.write("-" * 80 + "\n\n")
+
+     f.write(f"Total sequences: {total_seqs}\n")
+     f.write(f"Merged sequences: {total_merged} ({100*total_merged/total_seqs:.1f}%)\n\n")
+
+     f.write("Global Read Identity Statistics:\n")
+     f.write(f" Mean rid: {stats['mean_rid']:.1%} ± {stats['std_rid']:.1%}\n\n")
+
+     f.write("Sequences Requiring Attention:\n")
+     n_rid_issues = n_stat + n_merged
+     f.write(f" Total flagged: {total_flagged} ({100*total_flagged/total_seqs:.1f}%)\n")
+     f.write(f" - Low read identity: {n_rid_issues} ({n_stat} sequences + {n_merged} merged)\n")
+     f.write(f" - High-error positions: {n_pos}\n\n")
+
+
+ def write_read_identity_section(
+     f: TextIO,
+     outlier_results: Dict,
+     merged_with_issues: List[Tuple[ConsensusInfo, List, float, float]],
+     stats: Dict
+ ):
+     """Write the read identity analysis section."""
+     n_stat = len(outlier_results['statistical_outliers'])
+     n_merged = len(merged_with_issues)
+     n_rid_issues = n_stat + n_merged
+
+     if n_rid_issues == 0:
+         return
+
+     f.write("=" * 80 + "\n")
+     f.write("READ IDENTITY ANALYSIS\n")
+     f.write("=" * 80 + "\n\n")
+
+     f.write("Sequences with mean read identity (rid) below mean - 2×std.\n")
+     f.write(f"Threshold: {stats['stat_threshold_rid']:.1%}\n\n")
+
+     f.write(f"{'Sequence':<50} {'RiC':<6} {'rid':<8}\n")
+     f.write("-" * 64 + "\n")
+
+     # Build combined list for sorting
+     combined_entries = []
+
+     # Add non-merged statistical outliers
+     for cons, rid in outlier_results['statistical_outliers']:
+         is_merged = cons.snp_count is not None and cons.snp_count > 0
+         if not is_merged:
+             combined_entries.append((rid, False, (cons, rid)))
+
+     # Add merged sequences with issues
+     for entry in merged_with_issues:
+         cons, components_info, worst_rid, weighted_avg_rid = entry
+         combined_entries.append((worst_rid, True, entry))
+
+     # Sort by rid ascending
+     combined_entries.sort(key=lambda x: x[0])
+
+     # Display entries
+     for _, is_merged, data in combined_entries:
+         if is_merged:
+             cons, components_info, worst_rid, weighted_avg_rid = data
+             name = cons.sample_name
+             name_with_tag = f"{name} [merged]"
+             name_truncated = name_with_tag[:49] if len(name_with_tag) > 49 else name_with_tag
+             rid_str = f"{weighted_avg_rid:.1%}"
+             f.write(f"{name_truncated:<50} {cons.ric:<6} {rid_str:<8}\n")
+
+             # Component rows (indented)
+             for raw, comp_rid, comp_ric in components_info:
+                 comp_name = raw.sample_name
+                 comp_display = f" └─ {comp_name}"
+                 comp_truncated = comp_display[:49] if len(comp_display) > 49 else comp_display
+                 comp_rid_str = f"{comp_rid:.1%}"
+                 f.write(f"{comp_truncated:<50} {comp_ric:<6} {comp_rid_str:<8}\n")
+         else:
+             cons, rid = data
+             name_truncated = cons.sample_name[:49] if len(cons.sample_name) > 49 else cons.sample_name
+             rid_str = f"{rid:.1%}"
+             f.write(f"{name_truncated:<50} {cons.ric:<6} {rid_str:<8}\n")
+
+     f.write("\n")
+
+
+ def write_positional_identity_section(
+     f: TextIO,
+     sequences_with_pos_outliers: List[Tuple[ConsensusInfo, Dict]],
+     min_variant_frequency: float,
+     min_variant_count: int
+ ):
+     """Write the positional identity analysis section."""
+     if not sequences_with_pos_outliers:
+         return
+
+     f.write("=" * 80 + "\n")
+     f.write("POSITIONAL IDENTITY ANALYSIS\n")
+     f.write("=" * 80 + "\n\n")
+
+     f.write("Sequences with high-error positions (error rate > threshold at specific positions):\n")
+     f.write(f"Threshold: {min_variant_frequency:.1%} (--min-variant-frequency from metadata)\n")
+     f.write(f"Min RiC: {2 * min_variant_count} (2 × --min-variant-count)\n")
+     f.write("Positions above threshold may indicate undetected/unphased variants.\n")
+     f.write("For merged sequences, shows worst component.\n\n")
+
+     # Sort by total nucleotide errors (descending)
+     sorted_pos_outliers = sorted(
+         sequences_with_pos_outliers,
+         key=lambda x: x[1].get('total_nucleotide_errors', 0),
+         reverse=True
+     )
+
+     # Calculate display names and find max length for dynamic column width
+     display_data = []
+     for cons, result in sorted_pos_outliers:
+         if 'component_name' in result:
+             component_suffix = result['component_name'].split('.')[-1] if '.' in result['component_name'] else ''
+             display_name = f"{cons.sample_name} ({component_suffix})"
+             ric_val = result.get('component_ric', cons.ric)
+         else:
+             display_name = cons.sample_name
+             ric_val = cons.ric
+         display_data.append((display_name, ric_val, cons, result))
+
+     # Calculate column width based on longest name (minimum 40, cap at 70)
+     max_name_len = max(len(name) for name, _, _, _ in display_data) if display_data else 40
+     name_col_width = min(max(max_name_len + 2, 40), 70)
+
+     f.write(f"{'Sequence':<{name_col_width}} {'RiC':<6} {'Ambig':<6} {'#Pos':<6} {'MeanErr':<8} {'TotalErr':<10}\n")
+     f.write("-" * (name_col_width + 38) + "\n")
+
+     for display_name, ric_val, cons, result in display_data:
+         mean_err = result.get('mean_outlier_error_rate', 0.0)
+         total_err = result.get('total_nucleotide_errors', 0)
+         num_pos = result['num_outlier_positions']
+         # Count IUPAC ambiguity codes in the consensus sequence (non-ACGT characters)
+         ambig_count = sum(1 for c in cons.sequence if c.upper() not in 'ACGT')
+
+         f.write(f"{display_name:<{name_col_width}} {ric_val:<6} {ambig_count:<6} {num_pos:<6} "
+                 f"{mean_err:<8.1%} {total_err:<10}\n")
+
+     f.write("\n")
+
+
+ def write_overlap_merge_section(
+     f: TextIO,
+     overlap_merges: List[OverlapMergeInfo],
+     min_merge_overlap: int
+ ):
+     """Write the overlap merge analysis section."""
+     # Only include merges that extended beyond full overlap
+     true_overlap_merges = [m for m in overlap_merges if m.prefix_bp > 0 or m.suffix_bp > 0]
+
+     if not true_overlap_merges:
+         return
+
+     f.write("=" * 80 + "\n")
+     f.write("OVERLAP MERGE ANALYSIS\n")
+     f.write("=" * 80 + "\n\n")
+
+     # Group merges by specimen
+     specimen_merges: Dict[str, List[OverlapMergeInfo]] = {}
+     for merge_info in true_overlap_merges:
+         if merge_info.specimen not in specimen_merges:
+             specimen_merges[merge_info.specimen] = []
+         specimen_merges[merge_info.specimen].append(merge_info)
+
+     f.write(f"{len(specimen_merges)} specimen(s) had overlap merges:\n\n")
+
+     # Sort specimens by name
+     for specimen in sorted(specimen_merges.keys()):
+         merges = specimen_merges[specimen]
+         merge_count = len(merges)
+         max_iteration = max(m.iteration for m in merges)
+
+         if max_iteration > 1:
+             f.write(f"{specimen} ({merge_count} merge(s), iterative):\n")
+         else:
+             f.write(f"{specimen} ({merge_count} merge(s)):\n")
+
+         # Sort by iteration
+         for merge_info in sorted(merges, key=lambda m: m.iteration):
+             iter_prefix = f" Round {merge_info.iteration}: " if max_iteration > 1 else " "
+
+             # Format input clusters
+             input_parts = []
+             for cluster, length, ric in zip(
+                 merge_info.input_clusters,
+                 merge_info.input_lengths,
+                 merge_info.input_rics
+             ):
+                 cluster_id = cluster.rsplit('-', 1)[-1] if '-' in cluster else cluster
+                 input_parts.append(f"{cluster_id} ({length}bp, RiC={ric})")
+
+             f.write(f"{iter_prefix}Merged: {' + '.join(input_parts)} -> {merge_info.output_length}bp\n")
+
+             # Calculate overlap as percentage of shorter sequence
+             shorter_len = min(merge_info.input_lengths)
+             overlap_pct = (merge_info.overlap_bp / shorter_len * 100) if shorter_len > 0 else 0
+             f.write(f" Overlap: {merge_info.overlap_bp}bp ({overlap_pct:.0f}% of shorter sequence)\n")
+             f.write(f" Extensions: prefix={merge_info.prefix_bp}bp, suffix={merge_info.suffix_bp}bp\n")
+
+         f.write("\n")
+
+     # Edge case warnings
+     warnings = []
+     for merge_info in true_overlap_merges:
+         # Warn if overlap is within 10% of threshold
+         if merge_info.overlap_bp < min_merge_overlap * 1.1:
+             shorter_len = min(merge_info.input_lengths)
+             if merge_info.overlap_bp < shorter_len:
+                 warnings.append(
+                     f"{merge_info.specimen}: Small overlap relative to threshold "
+                     f"({merge_info.overlap_bp}bp, threshold={min_merge_overlap}bp)"
+                 )
+
+         # Warn if large length ratio (>3:1)
+         max_len = max(merge_info.input_lengths)
+         min_len = min(merge_info.input_lengths)
+         if max_len > min_len * 3:
+             warnings.append(
+                 f"{merge_info.specimen}: Large length ratio "
+                 f"({max_len}bp / {min_len}bp = {max_len/min_len:.1f}x)"
+             )
+
+     if warnings:
+         f.write("Attention:\n")
+         for warning in warnings:
+             f.write(f" * {warning}\n")
+         f.write("\n")
+
+
+ def write_interpretation_guide_section(f: TextIO):
+     """Write the interpretation guide section."""
+     f.write("=" * 80 + "\n")
+     f.write("INTERPRETATION GUIDE\n")
+     f.write("=" * 80 + "\n\n")
+
+     f.write("Read Identity Analysis:\n")
+     f.write("-" * 40 + "\n")
+     f.write(" Threshold: mean - 2×std (statistical outliers)\n")
+     f.write(" RiC: Read-in-Cluster count\n")
+     f.write(" rid: Mean read identity to consensus\n")
+     f.write(" [merged]: Weighted average rid; components shown below\n\n")
+
+     f.write("Positional Identity Analysis:\n")
+     f.write("-" * 40 + "\n")
+     f.write(" Threshold: --min-variant-frequency from metadata\n")
+     f.write(" Min RiC: 2 × --min-variant-count\n")
+     f.write(" Ambig: Count of IUPAC ambiguity codes in consensus\n")
+     f.write(" #Pos: Count of positions exceeding error threshold\n")
+     f.write(" MeanErr: Average error rate at flagged positions\n")
+     f.write(" TotalErr: Sum of errors at flagged positions\n\n")
+
+
+ def write_quality_report(
+     final_consensus: List[ConsensusInfo],
+     all_raw_consensuses: List[Tuple[ConsensusInfo, str]],
+     summary_folder: str,
+     source_folder: str,
+     overlap_merges: List[OverlapMergeInfo] = None,
+     min_merge_overlap: int = 200
+ ):
+     """
+     Write quality report with rid-based dual outlier detection.
+
+     Uses mean read identity (rid) for outlier detection. rid_min is not used
+     because single outlier reads don't significantly impact consensus quality;
+     positional analysis better captures systematic issues.
+
+     Structure:
+     1. Executive Summary - High-level overview with attention flags
+     2. Read Identity Analysis - Dual outlier detection (clustering threshold + statistical)
+     3. Positional Identity Analysis - Sequences with problematic positions
+     4. Overlap Merge Analysis - Details of overlap merges (when applicable)
+     5. Interpretation Guide - Actionable guidance with neutral tone
+
+     Args:
+         final_consensus: List of final consensus sequences
+         all_raw_consensuses: List of (raw_consensus, original_name) tuples
+         summary_folder: Output directory for report
+         source_folder: Source directory containing cluster_debug with MSA files
+         overlap_merges: List of OverlapMergeInfo objects describing overlap merges
+         min_merge_overlap: Threshold used for overlap merging (for edge case warnings)
+     """
+     if overlap_merges is None:
+         overlap_merges = []
+
+     quality_report_path = os.path.join(summary_folder, 'quality_report.txt')
+
+     # Build .raw lookup: map merged sequence names to their .raw components
+     raw_lookup: Dict[str, List[ConsensusInfo]] = {}
+     for raw_cons, original_name in all_raw_consensuses:
+         base_match = re.match(r'(.+?)\.raw\d+$', raw_cons.sample_name)
+         if base_match:
+             base_name = base_match.group(1)
+             if base_name not in raw_lookup:
+                 raw_lookup[base_name] = []
+             raw_lookup[base_name].append(raw_cons)
+
+     # Identify outliers using dual detection
+     outlier_results = identify_outliers(final_consensus, all_raw_consensuses, source_folder)
+
+     # Load min_variant_frequency and min_variant_count from metadata
+     min_variant_frequency = None
+     min_variant_count = None
+
+     for cons in final_consensus:
+         sample_name = cons.sample_name
+         specimen_base = re.sub(r'-\d+\.v\d+$', '', sample_name)
+
+         metadata = load_metadata_from_json(source_folder, specimen_base)
+         if metadata and 'parameters' in metadata:
+             params = metadata['parameters']
+             min_variant_frequency = params.get('min_variant_frequency', 0.2)
+             min_variant_count = params.get('min_variant_count', 5)
+             break
+
+     # Fallback to defaults if not found
+     if min_variant_frequency is None:
+         min_variant_frequency = 0.2
+         logging.warning("Could not load min_variant_frequency from metadata, using default: 0.2")
+     if min_variant_count is None:
+         min_variant_count = 5
+         logging.warning("Could not load min_variant_count from metadata, using default: 5")
+
+     # Analyze positional identity for all sequences
+     sequences_with_pos_outliers: List[Tuple[ConsensusInfo, Dict]] = []
+     sequences_to_analyze = {cons.sample_name: cons for cons in final_consensus}
+
+     logging.info("Analyzing positional identity for quality report...")
+     for cons in tqdm(sequences_to_analyze.values(), desc="Analyzing positional identity", unit="seq"):
+         is_merged = cons.snp_count is not None and cons.snp_count > 0
+
+         if is_merged:
+             raw_components = raw_lookup.get(cons.sample_name, [])
+             worst_result = None
+             worst_outliers = 0
+
+             for raw_cons in raw_components:
+                 result = analyze_positional_identity_outliers(
+                     raw_cons, source_folder, min_variant_frequency, min_variant_count
+                 )
+                 if result:
+                     result['component_name'] = raw_cons.sample_name
+                     result['component_ric'] = raw_cons.ric
+                     if result['num_outlier_positions'] > worst_outliers:
+                         worst_outliers = result['num_outlier_positions']
+                         worst_result = result
+
+             if worst_result and worst_result['num_outlier_positions'] > 0:
+                 sequences_with_pos_outliers.append((cons, worst_result))
+         else:
+             result = analyze_positional_identity_outliers(
+                 cons, source_folder, min_variant_frequency, min_variant_count
+             )
+             if result and result['num_outlier_positions'] > 0:
+                 sequences_with_pos_outliers.append((cons, result))
+
+     sequences_with_pos_outliers.sort(key=lambda x: x[1].get('total_nucleotide_errors', 0), reverse=True)
+
+     # Write detailed position debug file
+     if sequences_with_pos_outliers:
+         write_position_debug_file(sequences_with_pos_outliers, summary_folder, min_variant_frequency)
+
+     # Identify merged sequences with quality issues
+     merged_with_issues: List[Tuple[ConsensusInfo, List, float, float]] = []
+     threshold_rid = outlier_results['global_stats']['stat_threshold_rid']
+
+     for cons in final_consensus:
+         is_merged = cons.snp_count is not None and cons.snp_count > 0
+         if is_merged:
+             raw_components = raw_lookup.get(cons.sample_name, [])
+             if not raw_components:
+                 continue
+
+             components_info = []
+             worst_rid = 1.0
+             total_ric = 0
+             weighted_rid_sum = 0.0
+
+             for raw in raw_components:
+                 rid = raw.rid if raw.rid is not None else 1.0
+                 ric = raw.ric if raw.ric else 0
+                 components_info.append((raw, rid, ric))
+                 if rid < worst_rid:
+                     worst_rid = rid
+                 if ric > 0:
+                     total_ric += ric
+                     weighted_rid_sum += rid * ric
+
+             weighted_avg_rid = weighted_rid_sum / total_ric if total_ric > 0 else 1.0
+             components_info.sort(key=lambda x: x[1])
+
+             if worst_rid < threshold_rid:
+                 merged_with_issues.append((cons, components_info, worst_rid, weighted_avg_rid))
+
+     merged_with_issues.sort(key=lambda x: x[2])
+
+     # Calculate summary statistics
+     total_seqs = len(final_consensus)
+     total_merged = sum(1 for cons in final_consensus if cons.snp_count is not None and cons.snp_count > 0)
+     stats = outlier_results['global_stats']
+     n_stat = len(outlier_results['statistical_outliers'])
+     n_pos = len(sequences_with_pos_outliers)
+     n_merged = len(merged_with_issues)
+     n_rid_issues = n_stat + n_merged
+
+     # Count unique flagged sequences
+     flagged_names = set()
+     for c, _ in outlier_results['statistical_outliers']:
+         flagged_names.add(c.sample_name)
+     for c, _ in sequences_with_pos_outliers:
+         flagged_names.add(c.sample_name)
+     for c, _, _, _ in merged_with_issues:
+         flagged_names.add(c.sample_name)
+     total_flagged = len(flagged_names)
+
+     # Write the report
+     with open(quality_report_path, 'w') as f:
+         write_header_section(f, source_folder)
+
+         write_executive_summary_section(
+             f, total_seqs, total_merged, stats,
+             n_stat, n_pos, n_merged, total_flagged
+         )
+
+         write_read_identity_section(f, outlier_results, merged_with_issues, stats)
+
+         write_positional_identity_section(
+             f, sequences_with_pos_outliers,
+             min_variant_frequency, min_variant_count
+         )
+
+         write_overlap_merge_section(f, overlap_merges, min_merge_overlap)
+
+         write_interpretation_guide_section(f)
+
+     # Log summary
+     logging.info(f"Quality report written to: {quality_report_path}")
+     if n_rid_issues > 0:
+         logging.info(f" {n_rid_issues} sequence(s) flagged for read identity ({n_stat} direct + {n_merged} merged)")
+     else:
+         logging.info(" All sequences show good read identity")
+
+     if n_pos > 0:
+         logging.info(f" {n_pos} sequence(s) with high-error positions")
+     if n_merged > 0:
+         logging.info(f" {n_merged} merged sequence(s) with component quality issues")
@@ -0,0 +1,29 @@
+ """Scalability features for large-scale sequence comparison.
+
+ This module provides abstractions for accelerating O(n^2) pairwise sequence
+ comparison operations using external tools like vsearch.
+
+ Example usage:
+     from speconsense.scalability import (
+         VsearchCandidateFinder,
+         ScalablePairwiseOperation,
+         ScalabilityConfig
+     )
+
+     config = ScalabilityConfig(enabled=True)
+     finder = VsearchCandidateFinder()
+     operation = ScalablePairwiseOperation(finder, scoring_function, config)
+
+     neighbors = operation.compute_top_k_neighbors(sequences, k=20, min_identity=0.8)
+ """
+
+ from .config import ScalabilityConfig
+ from .base import CandidateFinder, ScalablePairwiseOperation
+ from .vsearch import VsearchCandidateFinder
+
+ __all__ = [
+     'ScalabilityConfig',
+     'CandidateFinder',
+     'ScalablePairwiseOperation',
+     'VsearchCandidateFinder',
+ ]
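
The docstring example assumes vsearch is available. Because vsearch is an external tool, a caller might enable the accelerated path only when the binary is actually on PATH; a minimal sketch (the PATH check is added here for illustration and is not part of the package; only the `enabled` flag shown in the example above is assumed):

    import shutil

    from speconsense.scalability import ScalabilityConfig

    # Fall back to plain O(n^2) pairwise comparison when vsearch is not installed.
    config = ScalabilityConfig(enabled=shutil.which("vsearch") is not None)
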