evoseq 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evoseq/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .config import load_config, run_from_config
2
+
3
+ __all__ = ["load_config", "run_from_config"]
evoseq/cli.py ADDED
@@ -0,0 +1,21 @@
1
+ import argparse
2
+
3
+ from .config import run_from_config
4
+
5
+
6
def main(argv=None):
    """Command-line entry point: run EvoSeq from a TOML config file.

    Args:
        argv: Optional argument list handed to ``argparse``; ``None`` makes
            argparse read ``sys.argv``.

    Prints a completion banner, then every output entry whose key ends in
    ``_paths`` as a ``name: path`` listing.
    """
    cli = argparse.ArgumentParser(description="Run EvoSeq from a TOML config.")
    cli.add_argument("config", help="Path to an EvoSeq TOML config file.")
    options = cli.parse_args(argv)

    results = run_from_config(options.config)
    print("EvoSeq run completed.")
    for label, entry in results.items():
        # Only the "<stage>_paths" mappings are reported; other outputs are
        # skipped here.
        if not label.endswith("_paths"):
            continue
        print(f"{label}:")
        for name, location in entry.items():
            print(f" {name}: {location}")


if __name__ == "__main__":
    main()
evoseq/config.py ADDED
@@ -0,0 +1,107 @@
1
+ from pathlib import Path
2
+ import tomllib
3
+
4
+ from .preprocess import preprocess_files, preprocess_folder
5
+ from .scoring import export_perbase_logprobs, score_pairs_file
6
+
7
+
8
+ def _none_if_blank(value):
9
+ return None if value == "" else value
10
+
11
+
12
+ def load_config(path):
13
+ with open(path, "rb") as fh:
14
+ return tomllib.load(fh)
15
+
16
+
17
def run_from_config(path):
    """Run the pipeline stages enabled in a TOML config file.

    The config is read from ``path``. Three optional tables drive the run:

    * ``[preprocess]`` -- runs unless ``enabled`` is set to false. When both
      ``reference_fasta_path`` and ``mutant_fasta_path`` are non-blank,
      ``preprocess_files`` is used; otherwise ``preprocess_folder`` is used
      with the project's ``input_dir`` (falling back to ``base_dir``, then
      ``"."``).
    * ``[scoring]`` -- runs only when ``enabled`` is true. The pair table is
      ``scoring.pairs_path`` or, failing that, the ``pairs`` entry of the
      preprocess outputs.
    * ``[perbase]`` -- runs only when ``enabled`` is true.

    Returns:
        dict: ``preprocess_df``/``preprocess_paths``, ``scoring_df``/
        ``scoring_paths`` and ``perbase_path`` for the stages that ran, plus
        ``config_path`` (``Path(path)``).

    Raises:
        ValueError: scoring is enabled but no pair table path is available.
        KeyError: per-base export is enabled without ``perbase.fasta_path``.
    """
    settings = load_config(path)

    project = settings.get("project", {})
    input_dir = project.get("input_dir", project.get("base_dir", "."))

    pre = settings.get("preprocess", {})
    scoring = settings.get("scoring", {})
    perbase = settings.get("perbase", {})

    results = {}

    if pre.get("enabled", True):
        ref_fasta = _none_if_blank(pre.get("reference_fasta_path"))
        mut_fasta = _none_if_blank(pre.get("mutant_fasta_path"))

        # Keyword arguments common to both preprocessing entry points.
        # "out_dir" is only consulted when "output_dir" is absent.
        shared = {
            "reference_fasta_path": ref_fasta,
            "mutant_fasta_path": mut_fasta,
            "manifest_path": pre.get("manifest_path", "auto"),
            "output_dir": _none_if_blank(
                pre.get("output_dir", pre.get("out_dir"))
            ),
            "strict_manifest": pre.get("strict_manifest", False),
            "progress": pre.get("progress", True),
        }

        if ref_fasta and mut_fasta:
            evo_df, saved = preprocess_files(**shared)
        else:
            evo_df, saved = preprocess_folder(
                input_dir=input_dir,
                dataset_type=pre.get("dataset_type", "auto"),
                window_size=pre.get("window_size"),
                **shared,
            )

        results["preprocess_df"] = evo_df
        results["preprocess_paths"] = saved

    if scoring.get("enabled", False):
        pairs_path = _none_if_blank(scoring.get("pairs_path"))
        if not pairs_path:
            # Fall back to the pair table written by the preprocess stage.
            pairs_path = results.get("preprocess_paths", {}).get("pairs")
        if not pairs_path:
            raise ValueError(
                "scoring.enabled is true, but no pairs_path was provided and "
                "preprocessing did not produce a pair table."
            )

        result_df, paths = score_pairs_file(
            pairs_path=pairs_path,
            output_dir=_none_if_blank(
                scoring.get("output_dir", scoring.get("result_dir"))
            ),
            manifest_path=scoring.get("manifest_path", "auto"),
            model_name=scoring.get("model_name", "evo2_7b"),
            device=scoring.get("device", "cuda:0"),
            local_path=_none_if_blank(scoring.get("local_path")),
            batch_size=scoring.get("batch_size", 8),
            force_reload=scoring.get("force_reload", False),
            require_recommended_gpu=scoring.get("require_recommended_gpu", True),
            progress=scoring.get("progress", True),
        )
        results["scoring_df"] = result_df
        results["scoring_paths"] = paths

    if perbase.get("enabled", False):
        results["perbase_path"] = export_perbase_logprobs(
            fasta_path=perbase["fasta_path"],
            output_path=_none_if_blank(perbase.get("output_path")),
            output_dir=_none_if_blank(perbase.get("output_dir")),
            model_name=perbase.get("model_name", "evo2_7b"),
            device=perbase.get("device", "cuda:0"),
            center=perbase.get("center", 4096),
            half_window=perbase.get("half_window", 320),
            local_path=_none_if_blank(perbase.get("local_path")),
            progress=perbase.get("progress", True),
        )

    results["config_path"] = Path(path)
    return results
evoseq/paths.py ADDED
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+
3
+
4
def common_parent(paths):
    """Return the deepest directory shared by the parents of ``paths``.

    Falsy entries are ignored. Each remaining entry is user-expanded and
    resolved, and its parent directory is taken. With no usable entries the
    current working directory is returned; with one, that entry's parent.
    """
    import os

    parents = []
    for entry in paths:
        if not entry:
            continue
        parents.append(Path(entry).expanduser().resolve().parent)

    if not parents:
        return Path.cwd()
    if len(parents) == 1:
        return parents[0]
    return Path(os.path.commonpath([str(parent) for parent in parents]))
14
+
15
+
16
def default_output_dir(kind, *input_paths, base_dir=None):
    """Build the default output directory path for a pipeline stage.

    Args:
        kind: Stage name; unknown stages get ``evoseq_<kind>_output``.
        *input_paths: Input files whose common parent hosts the directory
            when ``base_dir`` is not given.
        base_dir: Optional explicit parent directory.

    Returns:
        Path: ``<parent>/<stage directory name>``. Nothing is created on disk.
    """
    known = {
        "preprocess": "evoseq_preprocess_output",
        "scoring": "evoseq_scoring_output",
        "perbase": "evoseq_perbase_output",
    }
    folder = known.get(kind, f"evoseq_{kind}_output")
    root = Path(base_dir) if base_dir else common_parent(input_paths)
    return root / folder
28
+
29
+
30
def ensure_output_dir(path, fallback="/content/evoseq_output"):
    """Create ``path`` and confirm it is writable, else use ``fallback``.

    Writability is probed by writing and removing a ``.write_test`` file.
    Any ``OSError`` during creation or probing prints a warning, creates the
    fallback directory and returns that instead.

    Returns:
        Path: the usable output directory.
    """
    target = Path(path)
    try:
        target.mkdir(parents=True, exist_ok=True)
        probe = target / ".write_test"
        probe.write_text("ok")
        probe.unlink(missing_ok=True)
    except OSError as exc:
        substitute = Path(fallback)
        print(f"Warning: cannot use output directory {target} ({exc}).")
        print(f"Using fallback output directory: {substitute}")
        substitute.mkdir(parents=True, exist_ok=True)
        return substitute
    return target
@@ -0,0 +1,13 @@
1
+ from .pipeline import (
2
+ prepare_evo2_input,
3
+ preprocess_files,
4
+ preprocess_folder,
5
+ preprocess_from_base_dir,
6
+ )
7
+
8
+ __all__ = [
9
+ "prepare_evo2_input",
10
+ "preprocess_files",
11
+ "preprocess_folder",
12
+ "preprocess_from_base_dir",
13
+ ]
@@ -0,0 +1,108 @@
1
+ from pathlib import Path
2
+
3
+
4
+ FASTA_SUFFIXES = {".fa", ".fasta", ".fna"}
5
+
6
+
7
+ def _score_candidate(path, kind, dataset_type="auto", window_size=None):
8
+ name = path.name.lower()
9
+ score = 0
10
+
11
+ if kind == "reference":
12
+ score += 5 if "reference" in name else 0
13
+ score += 3 if "_ref" in name or "ref_" in name else 0
14
+ else:
15
+ score += 5 if "mutant" in name else 0
16
+ score += 3 if "_mut" in name or "mut_" in name else 0
17
+
18
+ if dataset_type and dataset_type != "auto":
19
+ aliases = {
20
+ "positive": ["pos", "positive"],
21
+ "negative": ["neg", "negative"],
22
+ }.get(dataset_type, [dataset_type])
23
+ score += 4 if any(alias in name for alias in aliases) else 0
24
+
25
+ if window_size:
26
+ score += 2 if str(window_size) in name else 0
27
+
28
+ score -= 4 if "evo2_all" in name else 0
29
+ score -= 3 if "output" in str(path).lower() else 0
30
+ return score
31
+
32
+
33
def infer_dataset_type(base_dir):
    """Guess the dataset type from the final component of ``base_dir``.

    Returns ``"negative"`` when the lower-cased directory name contains
    ``neg``, ``"positive"`` when it contains ``pos``, otherwise ``"auto"``.
    The negative check runs first.
    """
    folder = Path(base_dir).name.lower()
    # "negative"/"positive" contain "neg"/"pos", so the short markers suffice.
    for marker, label in (("neg", "negative"), ("pos", "positive")):
        if marker in folder:
            return label
    return "auto"
40
+
41
+
42
def discover_manifest(base_dir, manifest_path="auto"):
    """Locate the manifest TSV for a dataset directory.

    Args:
        base_dir: Dataset directory searched when ``manifest_path`` is
            ``"auto"``.
        manifest_path: ``"auto"`` to search, ``None``/``False``/``""`` to
            disable the manifest, or an explicit path.

    Returns:
        Path | None: the explicit path, the discovered manifest, or ``None``
        when disabled or nothing matches ``manifest*.tsv``.

    Discovery prefers ``<base_dir>/data`` over ``<base_dir>``. Within a
    directory the matches are sorted, so the choice does not depend on the
    file system's listing order.
    """
    # A blank string would otherwise become Path(""), i.e. the current
    # directory, which is never a usable manifest.
    if manifest_path in (None, False, ""):
        return None
    if manifest_path != "auto":
        return Path(manifest_path)

    root = Path(base_dir)
    for folder in (root / "data", root):
        matches = sorted(folder.glob("manifest*.tsv"))
        if matches:
            return matches[0]
    return None
52
+
53
+
54
def discover_fasta_pair(
    base_dir,
    reference_fasta_path=None,
    mutant_fasta_path=None,
    dataset_type="auto",
    window_size=None,
):
    """Resolve the reference and mutant FASTA files for a dataset.

    Explicitly supplied paths are used as given. A missing one is discovered
    among the FASTA files (suffix in ``FASTA_SUFFIXES``) directly inside
    ``<base_dir>/data`` and ``<base_dir>``: files whose name contains the
    role marker are ranked with ``_score_candidate`` and the best one wins.

    Args:
        base_dir: Dataset directory to search.
        reference_fasta_path: Optional explicit reference FASTA.
        mutant_fasta_path: Optional explicit mutant FASTA.
        dataset_type: Ranking hint; ``"auto"`` infers it from ``base_dir``.
        window_size: Optional ranking hint matched against file names.

    Returns:
        tuple[Path, Path]: ``(reference, mutant)``.

    Raises:
        FileNotFoundError: either file could not be determined.
    """
    if reference_fasta_path and mutant_fasta_path:
        return Path(reference_fasta_path), Path(mutant_fasta_path)

    base_dir = Path(base_dir)
    fasta_paths = []
    for search_dir in (base_dir / "data", base_dir):
        # Skip anything that is not a directory so iterdir() cannot raise.
        if not search_dir.is_dir():
            continue
        fasta_paths.extend(
            p
            for p in search_dir.iterdir()
            # Directories that merely carry a FASTA suffix are not candidates.
            if p.is_file() and p.suffix.lower() in FASTA_SUFFIXES
        )

    if dataset_type == "auto":
        dataset_type = infer_dataset_type(base_dir)

    def _best(kind, markers):
        """Return the top-ranked file whose name contains a marker, or None."""
        matches = [
            p for p in fasta_paths
            if any(marker in p.name.lower() for marker in markers)
        ]
        # Highest score first; equal scores fall back to the path string so
        # the winner does not depend on directory listing order.
        matches.sort(
            key=lambda p: (
                -_score_candidate(p, kind, dataset_type, window_size),
                str(p),
            )
        )
        return matches[0] if matches else None

    if not reference_fasta_path:
        reference_fasta_path = _best("reference", ("reference", "_ref"))
    if not mutant_fasta_path:
        mutant_fasta_path = _best("mutant", ("mutant", "_mut"))

    if not reference_fasta_path or not mutant_fasta_path:
        available = ", ".join(str(p) for p in sorted(fasta_paths)) or "none"
        raise FileNotFoundError(
            "Could not discover reference/mutant FASTA files. "
            "Pass reference_fasta_path and mutant_fasta_path explicitly. "
            f"Available FASTA files: {available}"
        )

    return Path(reference_fasta_path), Path(mutant_fasta_path)
@@ -0,0 +1,80 @@
1
+ import pandas as pd
2
+
3
+ from .fasta import wrap_sequence
4
+
5
+
6
def sanitize_header_value(value):
    """Render ``value`` as a FASTA-header-safe token.

    Missing values (per ``pd.isna``) become ``"NA"``; otherwise the string
    form is returned with spaces, newlines and tabs replaced by underscores.
    """
    if pd.isna(value):
        return "NA"

    text = str(value)
    for whitespace in (" ", "\n", "\t"):
        text = text.replace(whitespace, "_")
    return text
11
+
12
+
13
def get_row_value(row, name, default="NA"):
    """Return attribute ``name`` of ``row``, or ``default`` when it is absent."""
    return getattr(row, name, default)
15
+
16
+
17
def make_fasta_header(row, allele):
    """Build the ``|``-delimited FASTA header for one allele of a pair row.

    Args:
        row: Record exposing ``id``, ``ref_len`` and ``mut_len``; the
            optional ``gene``, ``variant``, ``hgvs``, ``annotation`` and
            ``variant_type`` attributes default to ``NA`` when absent.
        allele: ``"ref"`` selects ``ref_len``; any other value selects
            ``mut_len``.

    Returns:
        str: ``id|allele=...|gene=...|variant=...|hgvs=...|ann=...|type=...|len=...``
    """
    seq_len = row.ref_len if allele == "ref" else row.mut_len

    # (header label, row attribute) for the optional, sanitized fields.
    optional_fields = (
        ("gene", "gene"),
        ("variant", "variant"),
        ("hgvs", "hgvs"),
        ("ann", "annotation"),
        ("type", "variant_type"),
    )

    # str.join rejects non-string items, so the id is coerced explicitly;
    # numeric ids would otherwise raise TypeError.
    fields = [str(row.id), f"allele={allele}"]
    fields.extend(
        f"{label}={sanitize_header_value(get_row_value(row, attribute))}"
        for label, attribute in optional_fields
    )
    fields.append(f"len={seq_len}")

    return "|".join(fields)
32
+
33
+
34
def write_fasta_from_df(table, path, allele):
    """Write one allele of every row in ``table`` to a FASTA file.

    Args:
        table: DataFrame iterated with ``itertuples``; sequences are read
            from ``ref_seq`` when ``allele`` is ``"ref"``, else ``mut_seq``.
        path: Destination file, overwritten if it exists.
        allele: Allele label, also passed to ``make_fasta_header``.
    """
    sequence_field = "mut_seq"
    if allele == "ref":
        sequence_field = "ref_seq"

    with open(path, "w") as handle:
        for record in table.itertuples(index=False):
            title = make_fasta_header(record, allele)
            residues = getattr(record, sequence_field)

            handle.write(f">{title}\n")
            handle.write(wrap_sequence(residues) + "\n")
44
+
45
+
46
def export_evo2_input(evo_input_df, out_dir):
    """Write the pair table and FASTA files for a prepared input frame.

    Creates ``out_dir`` if needed (it must offer ``mkdir`` and ``/``, i.e.
    behave like a ``pathlib.Path``) and writes:

    * ``evo2_pairs.tsv`` and ``evo2_pair.tsv`` -- the same table, twice
    * ``evo2_reference.fa`` / ``evo2_mutant.fa`` -- one allele each
    * ``evo2_all.fa`` -- reference then mutant record for every row

    Returns:
        dict: output paths; each file is listed under two keys
        (``pairs``/``pair_tsv``, ``reference``/``reference_fasta``,
        ``mutant``/``mutant_fasta``, ``all``/``all_fasta``).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    pairs_table = out_dir / "evo2_pairs.tsv"
    pair_table = out_dir / "evo2_pair.tsv"
    reference_fasta = out_dir / "evo2_reference.fa"
    mutant_fasta = out_dir / "evo2_mutant.fa"
    combined_fasta = out_dir / "evo2_all.fa"

    for table_path in (pairs_table, pair_table):
        evo_input_df.to_csv(table_path, sep="\t", index=False)

    for fasta_path, allele in ((reference_fasta, "ref"), (mutant_fasta, "mut")):
        write_fasta_from_df(evo_input_df, fasta_path, allele)

    with open(combined_fasta, "w") as handle:
        for record in evo_input_df.itertuples(index=False):
            # Both headers are built before anything is written for the row.
            titles = {
                allele: make_fasta_header(record, allele)
                for allele in ("ref", "mut")
            }
            for allele, sequence_field in (("ref", "ref_seq"), ("mut", "mut_seq")):
                handle.write(f">{titles[allele]}\n")
                handle.write(wrap_sequence(getattr(record, sequence_field)) + "\n")

    return {
        "pairs": pairs_table,
        "pair_tsv": pair_table,
        "reference": reference_fasta,
        "reference_fasta": reference_fasta,
        "mutant": mutant_fasta,
        "mutant_fasta": mutant_fasta,
        "all": combined_fasta,
        "all_fasta": combined_fasta,
    }
@@ -0,0 +1,29 @@
1
+ from Bio import SeqIO
2
+
3
+
4
def read_fasta_as_dict(path):
    """Read a FASTA file into a ``{record id: upper-cased sequence}`` dict.

    When an id occurs more than once, the last record wins.
    """
    with open(path) as handle:
        return {
            record.id: str(record.seq).upper()
            for record in SeqIO.parse(handle, "fasta")
        }
10
+
11
+
12
def check_fasta_pair(ref_records, mut_records):
    """Verify that reference and mutant records share exactly the same IDs.

    Returns:
        set: the shared IDs.

    Raises:
        ValueError: some IDs occur on one side only; the message lists up to
            five of them in sorted order. Reference-only IDs are reported
            before mutant-only ones.
    """
    reference_ids = set(ref_records)
    mutant_ids = set(mut_records)

    missing_in_mutant = reference_ids.difference(mutant_ids)
    missing_in_reference = mutant_ids.difference(reference_ids)

    if missing_in_mutant:
        preview = sorted(missing_in_mutant)[:5]
        raise ValueError(f"Reference-only IDs exist: {preview}")
    if missing_in_reference:
        preview = sorted(missing_in_reference)[:5]
        raise ValueError(f"Mutant-only IDs exist: {preview}")

    return reference_ids.intersection(mutant_ids)
26
+
27
+
28
def wrap_sequence(seq, width=80):
    """Break ``seq`` into newline-separated chunks of at most ``width`` chars.

    An empty sequence yields an empty string; no trailing newline is added.
    """
    chunks = []
    for start in range(0, len(seq), width):
        chunks.append(seq[start:start + width])
    return "\n".join(chunks)
@@ -0,0 +1,65 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
def read_manifest(path):
    """Load a tab-separated manifest, dropping pandas' ``Unnamed:`` columns."""
    table = pd.read_csv(path, sep="\t")
    # Header-less columns (e.g. a saved index) are named "Unnamed: N" by pandas.
    placeholder_columns = [
        column for column in table.columns if str(column).startswith("Unnamed:")
    ]
    if not placeholder_columns:
        return table
    return table.drop(columns=placeholder_columns)
11
+
12
+
13
def summarize_manifest(df):
    """Return row/column counts and distinct-value counts for a manifest.

    Distinct counts cover ``record_id``, ``sample``, ``gene``, ``hgvs`` and
    ``spdi``; a column missing from ``df`` is reported as 0.
    """
    tracked = {
        "unique_record_id": "record_id",
        "unique_samples": "sample",
        "unique_genes": "gene",
        "unique_hgvs": "hgvs",
        "unique_spdi": "spdi",
    }

    summary = {"rows": len(df), "columns": df.shape[1]}
    for label, column in tracked.items():
        summary[label] = df[column].nunique() if column in df.columns else 0
    return summary
26
+
27
+
28
def aggregate_manifest(df, record_id_col="record_id"):
    """Collapse a manifest to one row per record id.

    Args:
        df: Manifest DataFrame.
        record_id_col: Column to group by; it is emitted as ``record_id``.

    Returns:
        DataFrame with ``record_id`` and ``n_manifest_rows`` plus, for each
        column present in ``df``:

        * ``gene`` (as ``gene_manifest``), ``hgvs``, ``annotation``,
          ``chrom``, ``pos1``, ``ref``, ``alt``, ``spdi`` -- the group's
          ``first()`` value
        * ``samples_joined`` -- sorted distinct ``sample`` values joined by
          ``;``
        * ``note`` -- sorted distinct notes joined by ``;``, or NaN when the
          group has no note

    Raises:
        ValueError: ``record_id_col`` is not a column of ``df``.
    """
    if record_id_col not in df.columns:
        raise ValueError(f"Manifest is missing required column: {record_id_col}")

    def _joined(series):
        # Distinct non-null values, stringified and sorted.
        return ";".join(sorted({str(item) for item in series.dropna()}))

    def _joined_or_nan(series):
        if series.notna().any():
            return _joined(series)
        return np.nan

    # dropna=False keeps rows whose record id is missing as their own group.
    by_record = df.groupby(record_id_col, dropna=False)
    summary = by_record.size().rename("n_manifest_rows").reset_index()

    if record_id_col != "record_id":
        summary = summary.rename(columns={record_id_col: "record_id"})

    carried = ("gene", "hgvs", "annotation", "chrom", "pos1", "ref", "alt", "spdi")
    for source in carried:
        if source not in df.columns:
            continue
        target = "gene_manifest" if source == "gene" else source
        # Per-group results are attached by position, hence the index reset.
        summary[target] = by_record[source].first().reset_index(drop=True)

    if "sample" in df.columns:
        joined_samples = by_record["sample"].agg(_joined)
        summary["samples_joined"] = joined_samples.reset_index(drop=True)

    if "note" in df.columns:
        joined_notes = by_record["note"].agg(_joined_or_nan)
        summary["note"] = joined_notes.reset_index(drop=True)

    return summary