speconsense 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speconsense/__init__.py +16 -0
- speconsense/cli.py +6 -0
- speconsense/core/__init__.py +32 -0
- speconsense/core/__main__.py +6 -0
- speconsense/core/cli.py +308 -0
- speconsense/core/clusterer.py +1565 -0
- speconsense/core/workers.py +696 -0
- speconsense/msa.py +813 -0
- speconsense/profiles/__init__.py +514 -0
- speconsense/profiles/example.yaml +97 -0
- speconsense/profiles/herbarium.yaml +25 -0
- speconsense/profiles/largedata.yaml +19 -0
- speconsense/profiles/nostalgia.yaml +22 -0
- speconsense/profiles/strict.yaml +27 -0
- speconsense/quality_report.py +499 -0
- speconsense/scalability/__init__.py +29 -0
- speconsense/scalability/base.py +461 -0
- speconsense/scalability/config.py +42 -0
- speconsense/scalability/vsearch.py +226 -0
- speconsense/summarize/__init__.py +129 -0
- speconsense/summarize/__main__.py +6 -0
- speconsense/summarize/analysis.py +780 -0
- speconsense/summarize/cli.py +528 -0
- speconsense/summarize/clustering.py +669 -0
- speconsense/summarize/fields.py +262 -0
- speconsense/summarize/io.py +723 -0
- speconsense/summarize/iupac.py +294 -0
- speconsense/summarize/merging.py +606 -0
- speconsense/synth.py +292 -0
- speconsense/types.py +38 -0
- speconsense-0.7.2.dist-info/METADATA +1449 -0
- speconsense-0.7.2.dist-info/RECORD +36 -0
- speconsense-0.7.2.dist-info/WHEEL +5 -0
- speconsense-0.7.2.dist-info/entry_points.txt +4 -0
- speconsense-0.7.2.dist-info/licenses/LICENSE +28 -0
- speconsense-0.7.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
"""Profile system for speconsense parameter presets.
|
|
2
|
+
|
|
3
|
+
Profiles allow users to save and reuse parameter configurations for different
|
|
4
|
+
workflows (e.g., herbarium specimens vs. fresh specimens).
|
|
5
|
+
|
|
6
|
+
Profile resolution order:
|
|
7
|
+
1. User profiles in ~/.config/speconsense/profiles/
|
|
8
|
+
2. Bundled profiles in package
|
|
9
|
+
|
|
10
|
+
Override order: defaults -> profile -> explicit CLI arguments
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Dict, Any, Optional, List, Set
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import yaml
|
|
23
|
+
except ImportError:
|
|
24
|
+
yaml = None # type: ignore
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
from speconsense import __version__
|
|
28
|
+
except ImportError:
|
|
29
|
+
__version__ = "dev"
|
|
30
|
+
|
|
31
|
+
# Use importlib.resources for Python 3.9+, fall back to pkg_resources
|
|
32
|
+
HAS_IMPORTLIB_RESOURCES = False
|
|
33
|
+
HAS_PKG_RESOURCES = False
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from importlib.resources import files as importlib_files
|
|
37
|
+
from importlib.resources import as_file
|
|
38
|
+
HAS_IMPORTLIB_RESOURCES = True
|
|
39
|
+
except ImportError:
|
|
40
|
+
try:
|
|
41
|
+
import pkg_resources
|
|
42
|
+
HAS_PKG_RESOURCES = True
|
|
43
|
+
except ImportError:
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
logger = logging.getLogger(__name__)
|
|
47
|
+
|
|
48
|
+
# XDG-style default config location.
# NOTE(review): this hard-codes ~/.config; the $XDG_CONFIG_HOME environment
# variable is not consulted here.
XDG_CONFIG_HOME = Path.home() / ".config"
# Directory that holds user-defined "<name>.yaml" profiles.
PROFILES_DIR = XDG_CONFIG_HOME / "speconsense" / "profiles"
|
|
51
|
+
|
|
52
|
+
# Valid keys for each tool (for strict validation).
# Keys are written with dashes, exactly as they appear in a profile's YAML;
# apply_profile_to_args() turns them into attribute names by replacing '-'
# with '_'. _validate_profile_keys() rejects any key of the "speconsense"
# section that is not a member of this set.
VALID_SPECONSENSE_KEYS = {
    # Clustering algorithm
    "algorithm",
    "min-identity",
    "inflation",
    "k-nearest-neighbors",
    # Cluster filtering
    "min-size",
    "min-cluster-ratio",
    "outlier-identity",
    # Sampling
    "max-sample-size",
    "presample",
    # Variant calling
    "min-variant-frequency",
    "min-variant-count",
    "disable-position-phasing",
    # Ambiguity calling
    "min-ambiguity-frequency",
    "min-ambiguity-count",
    "disable-ambiguity-calling",
    # Merging
    "disable-cluster-merging",
    "disable-homopolymer-equivalence",
    # Orientation
    "orient-mode",
    # Processing
    "scale-threshold",
    "threads",
    "enable-early-filter",
    "collect-discards",
}
|
|
85
|
+
|
|
86
|
+
# Keys accepted in the "speconsense-summarize" section of a profile.
# _validate_profile_keys() rejects any key of that section that is not a
# member of this set.
VALID_SUMMARIZE_KEYS = {
    # Filtering
    "min-ric",
    "min-len",
    "max-len",
    # Grouping
    "group-identity",
    # Merging
    "disable-merging",
    "merge-effort",
    "merge-snp",
    "merge-indel-length",
    "merge-position-count",
    "merge-min-size-ratio",
    "min-merge-overlap",
    "disable-homopolymer-equivalence",
    # Selection
    "select-max-groups",
    "select-max-variants",
    "select-strategy",
    # Processing
    "scale-threshold",
    "threads",
}
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ProfileError(Exception):
    """Base exception for failures while loading or applying a profile."""
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class ProfileVersionError(ProfileError):
    """Raised when a profile's version pattern does not match this speconsense."""
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class ProfileValidationError(ProfileError):
    """Raised when a profile contains keys that are not recognised."""
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@dataclass
class Profile:
    """A named set of parameter presets for the speconsense tools.

    Attributes:
        name: Profile name (file stem, without the ``.yaml`` extension).
        version: Version pattern the profile targets, e.g. "0.7.*".
        description: Free-text description read from the profile file.
        speconsense: Option values from the ``speconsense`` section.
        speconsense_summarize: Option values from the
            ``speconsense-summarize`` section.
    """
    name: str
    version: str  # e.g., "0.7.*"
    description: str
    speconsense: Dict[str, Any] = field(default_factory=dict)
    speconsense_summarize: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def load(cls, name: str, check_version: bool = True) -> 'Profile':
        """Load profile by name from user dir or bundled.

        A file in the user profile directory takes precedence over a bundled
        profile with the same name.

        Args:
            name: Profile name (without .yaml extension)
            check_version: If True, validate version compatibility

        Returns:
            Loaded Profile instance

        Raises:
            ProfileError: If PyYAML is missing, or the profile is not found,
                unreadable or malformed
            ProfileVersionError: If version incompatible
            ProfileValidationError: If profile contains invalid keys
        """
        if yaml is None:
            raise ProfileError(
                "PyYAML is required for profile support. "
                "Install with: pip install pyyaml"
            )

        # Initialize user profiles directory with example on first use
        ensure_user_profiles_dir()

        # Try user profile first
        user_path = PROFILES_DIR / f"{name}.yaml"
        if user_path.exists():
            return cls._load_from_path(user_path, name, check_version)

        # Fall back to bundled profile
        bundled_path = get_bundled_profile_path(name)
        if bundled_path is not None:
            return cls._load_from_path(bundled_path, name, check_version)

        # Profile not found - provide helpful error
        available = list_profiles()
        if available:
            raise ProfileError(
                f"Profile '{name}' not found. Available profiles: {', '.join(available)}"
            )
        raise ProfileError(
            f"Profile '{name}' not found and no profiles are available. "
            f"Check that profiles are installed in {PROFILES_DIR}"
        )

    @staticmethod
    def _section(data: Dict[str, Any], key: str, name: str) -> Dict[str, Any]:
        """Return the mapping stored under *key*, or {} when it is empty."""
        # A section that holds only comments parses as None; treat as empty.
        section = data.get(key) or {}
        if not isinstance(section, dict):
            # Fail with a profile error instead of an AttributeError later on.
            raise ProfileError(
                f"Section '{key}' of profile '{name}' must be a YAML mapping"
            )
        return section

    @classmethod
    def _load_from_path(cls, path: Path, name: str, check_version: bool) -> 'Profile':
        """Parse, validate and build a profile from one YAML file.

        Args:
            path: Location of the YAML file to read
            name: Profile name used in the result and in error messages
            check_version: If True, reject profiles whose version pattern
                does not match the running speconsense version

        Raises:
            ProfileError: If the file cannot be read, is not valid YAML, or
                does not have the expected mapping structure
            ProfileVersionError: If version incompatible
            ProfileValidationError: If profile contains invalid keys
        """
        try:
            # Binary mode lets PyYAML detect the encoding (UTF-8/UTF-16)
            # instead of depending on the platform's locale encoding.
            with open(path, 'rb') as f:
                data = yaml.safe_load(f)
        except yaml.YAMLError as e:
            raise ProfileError(f"Invalid YAML in profile '{name}': {e}") from e
        except OSError as e:
            raise ProfileError(f"Cannot read profile '{name}': {e}") from e

        if not isinstance(data, dict):
            raise ProfileError(f"Profile '{name}' must be a YAML mapping")

        # Extract fields. An unquoted version such as 0.7 is parsed by YAML
        # as a float, so normalise to text; an empty value means "any".
        raw_version = data.get('speconsense-version', '*')
        version = '*' if raw_version is None else str(raw_version)
        description = data.get('description') or ''
        speconsense = cls._section(data, 'speconsense', name)
        speconsense_summarize = cls._section(data, 'speconsense-summarize', name)

        # Validate version compatibility
        if check_version and not check_version_compatible(version, __version__):
            raise ProfileVersionError(
                f"Profile '{name}' requires speconsense version {version}, "
                f"but you have {__version__}.\n\n"
                f"This profile may use parameters that have changed or been removed.\n"
                f"Please update the profile for your version, or copy the bundled\n"
                f"'{name}' profile which is compatible with your version:\n\n"
                f" cp {PROFILES_DIR}/{name}.yaml {PROFILES_DIR}/{name}.yaml.bak\n"
                f" speconsense --list-profiles # Will show available profiles"
            )

        # Validate keys (strict validation)
        _validate_profile_keys(name, speconsense, speconsense_summarize)

        return cls(
            name=name,
            version=version,
            description=description,
            speconsense=speconsense,
            speconsense_summarize=speconsense_summarize,
        )
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def check_version_compatible(profile_version: str, current_version: str) -> bool:
    """Check if profile version pattern matches current version.

    Supports wildcards:
    - "0.7.*" matches "0.7.0", "0.7.1", etc.
    - "0.7.0" matches only "0.7.0"
    - "0.*" matches any 0.x release
    - "*" matches any version

    Every character other than '*' is matched literally, so versions that
    contain regex metacharacters (e.g. "0.7.2+local") compare correctly.

    Args:
        profile_version: Version pattern from profile (e.g., "0.7.*")
        current_version: Current speconsense version (e.g., "0.7.2")

    Returns:
        True if versions are compatible
    """
    # YAML turns an unquoted value such as 0.7 into a float; compare as text
    # so a non-string pattern yields a result instead of an AttributeError.
    wanted = str(profile_version)
    if wanted == '*':
        return True

    # Escape each literal chunk; every '*' stands for any run of characters.
    pattern = '.*'.join(re.escape(chunk) for chunk in wanted.split('*'))

    # fullmatch (unlike match + '$') does not tolerate a trailing newline.
    return re.fullmatch(pattern, str(current_version)) is not None
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _validate_profile_keys(
    name: str,
    speconsense: Dict[str, Any],
    speconsense_summarize: Dict[str, Any]
) -> None:
    """Validate that profile only contains known keys.

    Args:
        name: Profile name, used in the error message
        speconsense: Values of the profile's ``speconsense`` section
        speconsense_summarize: Values of the profile's
            ``speconsense-summarize`` section

    Raises:
        ProfileValidationError: If either section contains a key that is not
            in the corresponding VALID_*_KEYS set.
    """
    sections = (
        ("speconsense", speconsense, VALID_SPECONSENSE_KEYS),
        ("speconsense-summarize", speconsense_summarize, VALID_SUMMARIZE_KEYS),
    )

    errors = []
    for label, values, allowed in sections:
        # YAML keys need not be strings (e.g. `1:` or `true:`); render them
        # as text so sorting and joining cannot raise TypeError.
        unknown = sorted(str(key) for key in set(values) - allowed)
        if unknown:
            errors.append(f" {label}: {', '.join(unknown)}")

    if errors:
        raise ProfileValidationError(
            f"Profile '{name}' contains unknown keys:\n" + '\n'.join(errors) + "\n\n"
            f"This may indicate a typo or an option that has been removed.\n"
            f"Please check the profile and fix or remove the invalid keys."
        )
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def get_bundled_profile_path(name: str) -> Optional[Path]:
    """Get path to bundled profile using importlib.resources.

    Lookup order: importlib.resources, then pkg_resources, then the directory
    that contains this module.

    Args:
        name: Profile name (without .yaml extension)

    Returns:
        Path to bundled profile file, or None if not found
    """
    import atexit
    from contextlib import ExitStack

    filename = f'{name}.yaml'

    if HAS_IMPORTLIB_RESOURCES:
        # as_file() may hand out a temporary extracted copy (e.g. when the
        # package lives in a zip) that is removed when its context exits.
        # Returning from inside a `with` block would therefore give the
        # caller a dangling path, so the context is kept open until
        # interpreter exit whenever a path is actually returned.
        stack = ExitStack()
        found = None
        try:
            # Python 3.9+ style
            resource = importlib_files('speconsense.profiles').joinpath(filename)
            candidate = stack.enter_context(as_file(resource))
            if candidate.exists():
                found = candidate
        except (TypeError, FileNotFoundError, ModuleNotFoundError):
            pass
        finally:
            if found is None:
                # Nothing is handed out: release any extracted copy now.
                stack.close()
        if found is not None:
            atexit.register(stack.close)
            return found

    if HAS_PKG_RESOURCES:
        try:
            # Fall back to pkg_resources for Python 3.8
            resource_path = pkg_resources.resource_filename(
                'speconsense.profiles', filename
            )
            path = Path(resource_path)
            if path.exists():
                return path
        except (FileNotFoundError, ModuleNotFoundError):
            pass

    # Last resort: check relative to this file (profiles/__init__.py)
    bundled_path = Path(__file__).parent / filename
    if bundled_path.exists():
        return bundled_path

    return None
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def list_bundled_profiles() -> List[str]:
    """Return the sorted names of the profiles shipped inside the package.

    importlib.resources is consulted first; when it is unavailable, fails,
    or yields nothing, the directory containing this module is scanned.
    """
    found = []

    # Preferred source: the package's resource listing.
    if HAS_IMPORTLIB_RESOURCES:
        try:
            package_root = importlib_files('speconsense.profiles')
            for entry in package_root.iterdir():
                entry_text = str(entry)
                if not entry_text.endswith('.yaml'):
                    continue
                found.append(Path(entry_text).stem)
            if found:
                return sorted(found)
        except (TypeError, FileNotFoundError, ModuleNotFoundError):
            pass

    # Otherwise look next to this file.
    here = Path(__file__).parent
    if here.exists():
        found.extend(candidate.stem for candidate in here.glob('*.yaml'))

    return sorted(found)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def list_profiles() -> List[str]:
    """Return the sorted, de-duplicated names of user and bundled profiles.

    Names carry no ``.yaml`` extension. A name present in both places is
    listed once.
    """
    user_names: Set[str] = set()

    # Profiles the user keeps in the per-user profile directory.
    if PROFILES_DIR.exists():
        user_names = {entry.stem for entry in PROFILES_DIR.glob('*.yaml')}

    # Merge with the profiles shipped in the package.
    return sorted(user_names.union(list_bundled_profiles()))
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def ensure_user_profiles_dir() -> Path:
    """Ensure user profiles directory exists with example profile.

    On first use, creates the directory and copies an example profile
    to help users create their own profiles. The example is not copied when
    it already exists or when the directory already holds other ``*.yaml``
    profiles.

    The whole operation is best-effort: failure to create the directory or
    the example is logged at debug level and never raised, so bundled
    profiles remain usable on a read-only home directory.

    The example is written to a temporary file in the same directory and
    then renamed into place, so parallel invocations never observe a
    partially written file.

    Returns:
        Path to user profiles directory
    """
    import tempfile

    try:
        PROFILES_DIR.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # e.g. read-only or missing home directory
        logger.debug(f"Could not create profiles directory {PROFILES_DIR}: {e}")
        return PROFILES_DIR

    example_path = PROFILES_DIR / "example.yaml"

    # Skip if example already exists (common case, avoid extra work)
    if example_path.exists():
        return PROFILES_DIR

    # Skip if user already has other profiles (they don't need the example)
    if any(PROFILES_DIR.glob('*.yaml')):
        return PROFILES_DIR

    bundled_example = get_bundled_profile_path('example')
    if bundled_example is None:
        return PROFILES_DIR

    temp_path = None  # set while a temp file exists that still needs cleanup
    try:
        # Write to temp file in same directory, then atomic rename
        fd, temp_path = tempfile.mkstemp(
            dir=PROFILES_DIR,
            prefix='.example.',
            suffix='.yaml.tmp'
        )
        # fdopen takes ownership of fd and, unlike a single os.write call,
        # guarantees the whole payload is written.
        with os.fdopen(fd, 'wb') as dst, open(bundled_example, 'rb') as src:
            dst.write(src.read())

        # On POSIX: atomic, last writer wins (all have same content, so fine)
        # On Windows: may raise if file exists, which we catch
        try:
            os.rename(temp_path, example_path)
        except OSError:
            # Another process won the race - that's fine
            pass
        else:
            temp_path = None  # moved into place; nothing left to clean up
            logger.info(f"Created example profile at {example_path}")
    except Exception as e:
        # Non-fatal - profile system works without example in user dir
        logger.debug(f"Could not create example profile: {e}")
    finally:
        # Never leave a stray temp file behind, whatever went wrong above.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    return PROFILES_DIR
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def apply_profile_to_args(
    args,
    profile: Profile,
    tool: str,
    explicit_args: Set[str]
) -> None:
    """Copy a profile's values onto *args* unless the CLI set them explicitly.

    Args:
        args: argparse Namespace to modify in place
        profile: Profile whose values are applied
        tool: Either 'speconsense' or 'speconsense-summarize'; selects which
            section of the profile is used
        explicit_args: Attribute names (underscore form) that were given on
            the command line and must not be overwritten

    Raises:
        ValueError: If *tool* is not one of the two known tool names
    """
    # Tool name -> attribute of Profile holding that tool's values.
    section_by_tool = {
        'speconsense': 'speconsense',
        'speconsense-summarize': 'speconsense_summarize',
    }
    if tool not in section_by_tool:
        raise ValueError(f"Unknown tool: {tool}")
    overrides = getattr(profile, section_by_tool[tool])

    for yaml_key, value in overrides.items():
        # YAML keys use dashes; argparse attributes use underscores.
        attr_name = yaml_key.replace('-', '_')

        # Explicit command-line values always win over the profile.
        if attr_name in explicit_args:
            continue

        if not hasattr(args, attr_name):
            # This shouldn't happen if validation passed, but log it
            logger.warning(
                f"Profile '{profile.name}': unknown attribute '{attr_name}'"
            )
            continue

        logger.debug(f"Profile '{profile.name}': setting {attr_name}={value}")
        setattr(args, attr_name, value)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def print_profiles_list(tool: str = 'speconsense') -> None:
    """Write a listing of every available profile to stdout.

    Each entry shows the profile name, whether it comes from the user
    directory or the package, an incompatibility marker when its version
    pattern does not match the running version, and its description and
    version pattern. Profiles that fail to load are listed with the error.

    Args:
        tool: Either 'speconsense' or 'speconsense-summarize'; only used in
            the usage line printed at the end
    """
    if yaml is None:
        print("Profile support requires PyYAML. Install with: pip install pyyaml")
        return

    # Creates the user profile directory (and an example profile) on first use.
    ensure_user_profiles_dir()

    available = list_profiles()
    if not available:
        print("No profiles found.")
        print(f"\nProfiles are stored in: {PROFILES_DIR}")
        return

    print("Available profiles:\n")

    for name in available:
        try:
            # Version check is skipped so incompatible profiles still show up.
            loaded = Profile.load(name, check_version=False)

            # A file in the user directory shadows a bundled profile.
            if (PROFILES_DIR / f"{name}.yaml").exists():
                source = "user"
            else:
                source = "bundled"

            if check_version_compatible(loaded.version, __version__):
                compat_str = ""
            else:
                compat_str = " [INCOMPATIBLE]"

            print(f" {name} ({source}){compat_str}")
            if loaded.description:
                print(f" {loaded.description}")
            print(f" Version: {loaded.version}")
            print()

        except ProfileError as e:
            print(f" {name} [ERROR: {e}]")
            print()

    print(f"Usage: {tool} -p <profile> [other options]")
    print(f"Profile directory: {PROFILES_DIR}")
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Example Speconsense Profile
|
|
2
|
+
#
|
|
3
|
+
# Copy this file to create your own profiles:
|
|
4
|
+
# cp example.yaml my-workflow.yaml
|
|
5
|
+
#
|
|
6
|
+
# Then use it with:
|
|
7
|
+
# speconsense input.fastq -p my-workflow
|
|
8
|
+
# speconsense-summarize -p my-workflow
|
|
9
|
+
#
|
|
10
|
+
# HOW TO USE THIS FILE:
|
|
11
|
+
# - Lines starting with # are comments (ignored)
|
|
12
|
+
# - To enable an option, remove the # at the start of the line
|
|
13
|
+
# - Only include parameters you want to change from defaults
|
|
14
|
+
# - CLI arguments always override profile values
|
|
15
|
+
#
|
|
16
|
+
# Example: To set min-size to 10, change:
|
|
17
|
+
# # min-size: 5
|
|
18
|
+
# to:
|
|
19
|
+
# min-size: 10
|
|
20
|
+
|
|
21
|
+
speconsense-version: "0.7.*"
|
|
22
|
+
description: "My custom workflow"
|
|
23
|
+
|
|
24
|
+
# =============================================================================
|
|
25
|
+
# Parameters for speconsense (clustering and consensus)
|
|
26
|
+
# =============================================================================
|
|
27
|
+
speconsense:
|
|
28
|
+
|
|
29
|
+
# --- Clustering Algorithm ---
|
|
30
|
+
# algorithm: graph # Clustering algorithm: graph (MCL) or greedy
|
|
31
|
+
# min-identity: 0.9 # Similarity threshold for clustering (0.0-1.0)
|
|
32
|
+
# inflation: 2.0 # MCL inflation parameter (higher = more clusters)
|
|
33
|
+
# k-nearest-neighbors: 10 # K-NN graph construction parameter
|
|
34
|
+
|
|
35
|
+
# --- Cluster Filtering ---
|
|
36
|
+
# min-size: 5 # Minimum reads per cluster (0 to disable)
|
|
37
|
+
# min-cluster-ratio: 0.01 # Minimum cluster size as ratio of total reads
|
|
38
|
+
# outlier-identity: 0.85 # Identity threshold for outlier detection
|
|
39
|
+
|
|
40
|
+
# --- Read Sampling ---
|
|
41
|
+
# presample: 1000 # Initial random sampling of input reads
|
|
42
|
+
# max-sample-size: 100 # Max reads used for consensus generation
|
|
43
|
+
|
|
44
|
+
# --- Variant Calling (position-based phasing) ---
|
|
45
|
+
# min-variant-frequency: 0.10 # Min frequency to call a variant (0.0-1.0)
|
|
46
|
+
# min-variant-count: 3 # Min read count to call a variant
|
|
47
|
+
# disable-position-phasing: false # Set true to disable variant detection
|
|
48
|
+
|
|
49
|
+
# --- Ambiguity Calling (IUPAC codes) ---
|
|
50
|
+
# min-ambiguity-frequency: 0.25 # Min frequency for IUPAC ambiguity codes
|
|
51
|
+
# min-ambiguity-count: 3 # Min read count for ambiguity codes
|
|
52
|
+
# disable-ambiguity-calling: false # Set true to disable IUPAC codes
|
|
53
|
+
|
|
54
|
+
# --- Cluster Merging ---
|
|
55
|
+
# disable-cluster-merging: false # Set true to skip merging similar clusters
|
|
56
|
+
# disable-homopolymer-equivalence: false # Set true for strict homopolymer comparison
|
|
57
|
+
|
|
58
|
+
# --- Primer Orientation ---
|
|
59
|
+
# orient-mode: filter-failed # skip, keep-all, or filter-failed
|
|
60
|
+
|
|
61
|
+
# --- Processing ---
|
|
62
|
+
# threads: 0 # Max threads (0 = auto-detect)
|
|
63
|
+
# scale-threshold: 500 # Read count threshold for scaled processing
|
|
64
|
+
# enable-early-filter: false # Enable early filtering optimization
|
|
65
|
+
# collect-discards: false # Write discarded reads to separate file
|
|
66
|
+
|
|
67
|
+
# =============================================================================
|
|
68
|
+
# Parameters for speconsense-summarize (post-processing)
|
|
69
|
+
# =============================================================================
|
|
70
|
+
speconsense-summarize:
|
|
71
|
+
|
|
72
|
+
# --- Input Filtering ---
|
|
73
|
+
# min-ric: 3 # Minimum Reads in Consensus threshold
|
|
74
|
+
# min-len: 0 # Minimum sequence length (0 = disabled)
|
|
75
|
+
# max-len: 0 # Maximum sequence length (0 = disabled)
|
|
76
|
+
|
|
77
|
+
# --- Variant Grouping (HAC clustering) ---
|
|
78
|
+
# group-identity: 0.95 # Identity threshold for grouping variants
|
|
79
|
+
|
|
80
|
+
# --- MSA-based Merging ---
|
|
81
|
+
# disable-merging: false # Set true to skip all merging (fastest)
|
|
82
|
+
# merge-effort: balanced # Effort level: fast, balanced, thorough, or 6-14
|
|
83
|
+
# merge-snp: true # Enable SNP-based merging
|
|
84
|
+
# merge-indel-length: 3 # Max indel length to merge (0 = disabled)
|
|
85
|
+
# merge-position-count: 1 # Max variant positions to merge
|
|
86
|
+
# merge-min-size-ratio: 0.1 # Min size ratio for merge candidate
|
|
87
|
+
# min-merge-overlap: 0.9 # Min alignment overlap for merging
|
|
88
|
+
# disable-homopolymer-equivalence: false # Set true for strict comparison
|
|
89
|
+
|
|
90
|
+
# --- Variant Selection ---
|
|
91
|
+
# select-max-groups: -1 # Max groups to output (-1 = no limit)
|
|
92
|
+
# select-max-variants: -1 # Max variants per group (-1 = no limit)
|
|
93
|
+
# select-strategy: size # Selection strategy: size or diversity
|
|
94
|
+
|
|
95
|
+
# --- Processing ---
|
|
96
|
+
# threads: 0 # Max threads (0 = auto-detect)
|
|
97
|
+
# scale-threshold: 500 # Read count threshold for scaled processing
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# High-recall settings for herbarium specimens with degraded DNA
|
|
2
|
+
#
|
|
3
|
+
# Use this profile when working with:
|
|
4
|
+
# - Herbarium specimens where DNA may be degraded
|
|
5
|
+
# - Type specimens that cannot be resampled
|
|
6
|
+
# - Samples where contamination may dominate the target
|
|
7
|
+
# - Any situation where false negatives are more costly than false positives
|
|
8
|
+
#
|
|
9
|
+
# The settings prioritize keeping all potential biological signal for
|
|
10
|
+
# manual review, at the cost of more output variants.
|
|
11
|
+
|
|
12
|
+
speconsense-version: "0.7.*"
|
|
13
|
+
description: "High-recall settings for herbarium specimens with degraded DNA"
|
|
14
|
+
|
|
15
|
+
speconsense:
|
|
16
|
+
min-identity: 0.85 # Lower threshold for degraded DNA
|
|
17
|
+
# outlier-identity: auto # Auto-calculated as 0.925 from min-identity
|
|
18
|
+
min-cluster-ratio: 0 # Keep all clusters
|
|
19
|
+
min-size: 3 # Lower minimum for degraded samples
|
|
20
|
+
min-variant-frequency: 0.05 # More sensitive variant detection (5%)
|
|
21
|
+
presample: 0 # Use all reads
|
|
22
|
+
max-sample-size: 100 # 100 reads for high-quality consensus
|
|
23
|
+
|
|
24
|
+
speconsense-summarize:
|
|
25
|
+
min-ric: 3 # Accept lower-support consensus
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Speconsense profile for a single large input file
|
|
2
|
+
|
|
3
|
+
speconsense-version: "0.7.*"
|
|
4
|
+
description: "Large Input (experimental)"
|
|
5
|
+
|
|
6
|
+
# =============================================================================
|
|
7
|
+
# Parameters for speconsense (clustering and consensus)
|
|
8
|
+
# =============================================================================
|
|
9
|
+
speconsense:
|
|
10
|
+
min-cluster-ratio: 0 # Do not filter by relative size
|
|
11
|
+
presample: 0 # Process all reads
|
|
12
|
+
threads: 0 # Max threads (0 = auto-detect)
|
|
13
|
+
|
|
14
|
+
# =============================================================================
|
|
15
|
+
# Parameters for speconsense-summarize (post-processing)
|
|
16
|
+
# =============================================================================
|
|
17
|
+
speconsense-summarize:
|
|
18
|
+
merge-effort: fast
|
|
19
|
+
group-identity: 0.95 # Avoid spurious groupings with many sequences
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Speconsense Profile to simulate results of older bioinformatics pipelines.
|
|
2
|
+
# Included for comparison only - not recommended for production usage
|
|
3
|
+
|
|
4
|
+
speconsense-version: "0.7.*"
|
|
5
|
+
description: "Simulate older bioinformatics"
|
|
6
|
+
|
|
7
|
+
# Parameters for speconsense (clustering and consensus)
|
|
8
|
+
speconsense:
|
|
9
|
+
algorithm: greedy
|
|
10
|
+
min-identity: 0.85
|
|
11
|
+
outlier-identity: 0.85
|
|
12
|
+
disable-position-phasing: True
|
|
13
|
+
disable-ambiguity-calling: True
|
|
14
|
+
min-cluster-ratio: 0.2
|
|
15
|
+
min-size: 5
|
|
16
|
+
max-sample-size: 500
|
|
17
|
+
presample: 500
|
|
18
|
+
|
|
19
|
+
# Parameters for speconsense-summarize (post-processing)
|
|
20
|
+
speconsense-summarize:
|
|
21
|
+
min-ric: 5
|
|
22
|
+
disable-merging: true # Skip merging entirely
|