EntDetect 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EntDetect/Jwalk/GridTools.py +567 -0
- EntDetect/Jwalk/PDBTools.py +532 -0
- EntDetect/Jwalk/SASDTools.py +543 -0
- EntDetect/Jwalk/SurfaceTools.py +150 -0
- EntDetect/Jwalk/__init__.py +19 -0
- EntDetect/Jwalk/naccess.config.txt +255 -0
- EntDetect/__init__.py +10 -0
- EntDetect/_logging.py +71 -0
- EntDetect/change_resolution.py +2361 -0
- EntDetect/clustering.py +2626 -0
- EntDetect/compare_sim2exp.py +1927 -0
- EntDetect/entanglement_features.py +478 -0
- EntDetect/gaussian_entanglement.py +2067 -0
- EntDetect/order_params.py +1048 -0
- EntDetect/resources/__init__.py +11 -0
- EntDetect/resources/__pycache__/__init__.cpython-311.pyc +0 -0
- EntDetect/resources/calc_K.pl +712 -0
- EntDetect/resources/calc_Q.pl +962 -0
- EntDetect/resources/pulchra +0 -0
- EntDetect/resources/shared_files/__init__.py +2 -0
- EntDetect/resources/shared_files/bt_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/karanicolas_dihe_parm.dat +1600 -0
- EntDetect/resources/shared_files/kgs_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/mj_contact_potential.dat +22 -0
- EntDetect/resources/stride +0 -0
- EntDetect/statistics.py +1344 -0
- EntDetect/utilities.py +201 -0
- entdetect-1.2.0.dist-info/METADATA +26 -0
- entdetect-1.2.0.dist-info/RECORD +45 -0
- entdetect-1.2.0.dist-info/WHEEL +5 -0
- entdetect-1.2.0.dist-info/entry_points.txt +11 -0
- entdetect-1.2.0.dist-info/licenses/LICENSE +674 -0
- entdetect-1.2.0.dist-info/top_level.txt +2 -0
- scripts/__init__.py +5 -0
- scripts/convert_cor_psf_to_pdb.py +103 -0
- scripts/run_Foldingpathway.py +162 -0
- scripts/run_MSM.py +152 -0
- scripts/run_OP_on_simulation_traj.py +194 -0
- scripts/run_change_resolution.py +63 -0
- scripts/run_compare_sim2exp.py +215 -0
- scripts/run_montecarlo.py +158 -0
- scripts/run_nativeNCLE.py +179 -0
- scripts/run_nonnative_entanglement_clustering.py +110 -0
- scripts/run_population_modeling.py +117 -0
- scripts/run_workflow4_nativeNCLE_batch.py +412 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from EntDetect.clustering import ClusterNonNativeEntanglements
|
|
2
|
+
from EntDetect._logging import setup_logger
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Cluster non-native entanglement changes across an ensemble of simulation trajectories.
|
|
6
|
+
|
|
7
|
+
Reads per-trajectory entanglement pkl files produced by run_OP_on_simulation_traj.py
|
|
8
|
+
(located in the Combined_GE/ subdirectory of the G/ output folder), groups them into
|
|
9
|
+
non-redundant entanglement-change clusters, and writes representative structures and
|
|
10
|
+
per-frame cluster assignments to --outdir.
|
|
11
|
+
|
|
12
|
+
Examples
|
|
13
|
+
--------
|
|
14
|
+
Basic run:
|
|
15
|
+
python scripts/run_nonnative_entanglement_clustering.py \\
|
|
16
|
+
--outdir $DATASTORE/outputs/workflow2/nonnative_clustering \\
|
|
17
|
+
--trajnum2pklfile_path $DATASTORE/user_input/metadata/trajnum2file.txt \\
|
|
18
|
+
--traj_dir_prefix $DATASTORE/user_input/cg_trajectories
|
|
19
|
+
|
|
20
|
+
Flags
|
|
21
|
+
-----
|
|
22
|
+
--outdir Output directory for clustering results
|
|
23
|
+
--trajnum2pklfile_path CSV file (source of truth) with columns: trajnum, pklfile
|
|
24
|
+
Users control exactly which pkl files to analyze via this file
|
|
25
|
+
--traj_dir_prefix Path prefix to the directory containing trajectory DCD files
|
|
26
|
+
--start_frame First frame index to include, 0-based (default: 0)
|
|
27
|
+
--end_frame Last frame index to include, 0-based (default: all frames)
|
|
28
|
+
--nproc Number of parallel worker threads (default: 1)
|
|
29
|
+
Parallelises both pkl loading (per trajectory) and
|
|
30
|
+
entanglement-keyword clustering (per unique keyword).
|
|
31
|
+
Use the number of available CPU cores for best speed.
|
|
32
|
+
--log_level Logging verbosity: DEBUG, INFO, WARNING, ERROR (default: INFO)
|
|
33
|
+
--logdir Directory for log file (default: same as --outdir)
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def main(argv=None):
    """Cluster non-native entanglement changes from the command line.

    Parses the command-line flags, validates them, configures the script and
    ``ClusterNonNativeEntanglements`` loggers, then builds a
    ``ClusterNonNativeEntanglements`` object and calls its ``cluster`` method
    on the requested frame range.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector to parse. ``None`` lets argparse read ``sys.argv[1:]``.

    Returns
    -------
    int
        ``0`` after a normal run. Invalid arguments end the process through
        ``parser.error`` (``SystemExit`` with status 2).
    """
    import os
    import argparse
    import time
    import logging

    start_time = time.time()

    ###---------------------------------------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Cluster non-native entanglement changes across simulation trajectories.")

    # --- identity / IO ---
    parser.add_argument("--outdir", type=str, required=True, help="Output directory for clustering results")
    parser.add_argument("--trajnum2pklfile_path", type=str, required=True, help="CSV file mapping trajectory numbers to pkl file paths (source of truth for which files to analyze)")
    parser.add_argument("--traj_dir_prefix", type=str, required=True, help="Path prefix to the directory containing trajectory DCD files")

    # --- frame selection ---
    parser.add_argument("--start_frame", type=int, default=0, help="First frame index to include, 0-based (default: 0)")
    parser.add_argument("--end_frame", type=int, default=9999999, help="Last frame index to include, 0-based (default: all frames)")

    # --- parallelism ---
    parser.add_argument("--nproc", type=int, default=1, help="Number of parallel worker threads (default: 1)")

    # --- logging ---
    parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging verbosity (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None, help="Directory for log file (default: same as --outdir)")

    args = parser.parse_args(argv)

    outdir = args.outdir
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    # --- input validation ---
    # Done before any directory or log file is created, so a bad invocation
    # leaves nothing behind on disk.
    if args.nproc < 1:
        parser.error("--nproc must be >= 1")

    if args.end_frame < args.start_frame:
        parser.error(f"--end_frame ({args.end_frame}) must be >= --start_frame ({args.start_frame})")

    if not os.path.isfile(args.trajnum2pklfile_path):
        parser.error(f"--trajnum2pklfile_path does not exist: {args.trajnum2pklfile_path}")

    if not os.path.isdir(args.traj_dir_prefix):
        parser.error(f"--traj_dir_prefix does not exist or is not a directory: {args.traj_dir_prefix}")
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = args.logdir if args.logdir is not None else outdir

    # Both directories must exist before the loggers are configured; the
    # sibling workflow scripts create the log directory the same way.
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(logdir, exist_ok=True)

    logger = setup_logger('run_nonnative_clustering', outdir=logdir, ID='ClusterNonNativeEntanglements', log_level=log_level)
    setup_logger('ClusterNonNativeEntanglements', outdir=logdir, ID='ClusterNonNativeEntanglements', log_level=log_level)
    logger.info(f'args: {args}')
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    clustering_NNents = ClusterNonNativeEntanglements(
        trajnum2pklfile_path=args.trajnum2pklfile_path,
        traj_dir_prefix=args.traj_dir_prefix,
        outdir=outdir,
        log_level=log_level,
        logdir=logdir,
        nproc=args.nproc,
    )
    logger.info(f'ClusterNonNativeEntanglements: {clustering_NNents}')
    clustering_NNents.cluster(start_frame=args.start_frame, end_frame=args.end_frame)
    ###---------------------------------------------------------------------------------------------------------

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from EntDetect.statistics import ProteomeLogisticRegression
|
|
3
|
+
from EntDetect._logging import setup_logger
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Run Workflow 4 proteome-level logistic regression from residue feature tables.
|
|
7
|
+
|
|
8
|
+
This script wraps EntDetect.statistics.ProteomeLogisticRegression and writes
|
|
9
|
+
the final regression table as a pipe-delimited CSV.
|
|
10
|
+
|
|
11
|
+
Example
|
|
12
|
+
-------
|
|
13
|
+
python scripts/run_population_modeling.py \
|
|
14
|
+
--dataframe_files /path/to/residue_dataframes_workflow4.csv \
|
|
15
|
+
--outdir /path/to/workflow4/population_modeling/ \
|
|
16
|
+
--gene_list /path/to/gene_list.txt \
|
|
17
|
+
--tag Ecoli_population \
|
|
18
|
+
--reg_formula "cut_C_Rall ~ AA + region"
|
|
19
|
+
|
|
20
|
+
Expected input schema
|
|
21
|
+
---------------------
|
|
22
|
+
- Either a single combined design matrix file OR one residue table per protein
|
|
23
|
+
- Files are pipe-delimited ("|")
|
|
24
|
+
- Required columns: gene, mapped_resid, AA, region, cut_C_Rall
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def main(argv=None):
    """Run the proteome-level logistic regression from the command line.

    Parses the flags, configures the script and ``ProteomeLogisticRegression``
    loggers, checks that the inputs exist, then drives a
    ``ProteomeLogisticRegression`` object through ``load_data`` and ``run`` and
    writes the returned table to
    ``<outdir>/regression_results_<tag>.csv`` with ``|`` as the separator.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector to parse. ``None`` lets argparse read ``sys.argv[1:]``.

    Returns
    -------
    int
        ``0`` after a normal run. Invalid arguments end the process through
        ``parser.error`` (``SystemExit`` with status 2).
    """
    import os
    import argparse
    import time
    import logging

    t_begin = time.time()

    cli = argparse.ArgumentParser(
        description="Run Workflow 4 proteome-level logistic regression from residue feature tables."
    )

    # --- required IO ---
    required_options = (
        ("--dataframe_files", "Input design matrix path: either a directory of per-protein files or a single combined CSV"),
        ("--outdir", "Output directory for regression results"),
        ("--gene_list", "Path to gene list file (one ID per line)"),
        ("--tag", "Identifier tag for output naming"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    # --- model options ---
    cli.add_argument(
        "--reg_formula",
        type=str,
        default='cut_C_Rall ~ AA + region',
        help="Regression formula (default: 'cut_C_Rall ~ AA + region')",
    )

    # --- logging ---
    cli.add_argument(
        "--log_level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging verbosity (default: INFO)",
    )
    cli.add_argument(
        "--logdir",
        type=str,
        default=None,
        help="Directory for log files (default: same as --outdir)",
    )

    opts = cli.parse_args(argv)

    level = getattr(logging, opts.log_level.upper(), logging.INFO)
    # Logs go next to the results unless a dedicated directory was requested.
    if opts.logdir is None:
        log_directory = opts.outdir
    else:
        log_directory = opts.logdir
    os.makedirs(log_directory, exist_ok=True)

    log = setup_logger('run_population_modeling', outdir=log_directory, ID=opts.tag, log_level=level)
    setup_logger('ProteomeLogisticRegression', outdir=log_directory, ID=opts.tag, log_level=level)
    log.info(f'args: {opts}')

    # --- input validation ---
    # --dataframe_files may be a file or a directory, so only existence is checked.
    if not os.path.exists(opts.dataframe_files):
        cli.error(f"--dataframe_files does not exist: {opts.dataframe_files}")
    if not os.path.isfile(opts.gene_list):
        cli.error(f"--gene_list does not exist or is not a file: {opts.gene_list}")
    os.makedirs(opts.outdir, exist_ok=True)

    # --- build the regression object ---
    regression = ProteomeLogisticRegression(
        dataframe_files=opts.dataframe_files,
        outdir=opts.outdir,
        gene_list=opts.gene_list,
        ID=opts.tag,
        reg_formula=opts.reg_formula,
        log_level=level,
        logdir=log_directory,
    )
    log.info(f'ProteomeLogisticRegression: {regression}')

    # --- step 1: load residue-level data ---
    load_options = {
        "sep": '|',
        "reg_var": ['AA', 'region'],
        "response_var": 'cut_C_Rall',
        "var2binarize": ['cut_C_Rall', 'region'],
        "mask_column": 'mapped_resid',
    }
    regression.load_data(**load_options)

    # --- step 2: run regression ---
    results = regression.run()

    # --- step 3: persist results ---
    results_path = os.path.join(opts.outdir, f"regression_results_{opts.tag}.csv")
    results.to_csv(results_path, index=False, sep='|')
    log.info(f"SAVED: {results_path}")

    log.info(f'NORMAL TERMINATION - {time.time() - t_begin:.1f} seconds')
    return 0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from EntDetect._logging import setup_logger
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Batch Workflow 4 helper for Steps 3 and 4.
|
|
6
|
+
|
|
7
|
+
This script scans a directory of PDB files, filters to structures whose IDs are
|
|
8
|
+
present in a gene list, and runs scripts/run_nativeNCLE.py in parallel for each
|
|
9
|
+
selected structure.
|
|
10
|
+
|
|
11
|
+
Optionally, it can also build Step 4 regression/design-matrix files by combining:
|
|
12
|
+
1) per-residue experimental/structural features from a residueFeatures CSV, and
|
|
13
|
+
2) region labels inferred from NCLE `ent_region` output columns.
|
|
14
|
+
|
|
15
|
+
Compared to run_nativeNCLE.py, this wrapper accepts --pdb_dir and --gene_list
|
|
16
|
+
instead of --struct, then forwards the remaining nativeNCLE options.
|
|
17
|
+
|
|
18
|
+
Example
|
|
19
|
+
-------
|
|
20
|
+
python scripts/run_workflow4_nativeNCLE_batch.py \
|
|
21
|
+
--pdb_dir /scratch/ims86/EntDetect_Datastore/user_input/proteome_structures/AF \
|
|
22
|
+
--gene_list /scratch/ims86/EntDetect_Datastore/user_input/experimental_data/Gene_lists/AF/AF_0.6g_C_Rall_spa50_LiPMScov50_all_genes.txt \
|
|
23
|
+
--outdir /scratch/ims86/EntDetect_Datastore/outputs/workflow4/nativeNCLE_all \
|
|
24
|
+
--organism Ecoli \
|
|
25
|
+
--model AF \
|
|
26
|
+
--contacts heavy \
|
|
27
|
+
--resolution aa \
|
|
28
|
+
--ent_detection_method 3 \
|
|
29
|
+
--nproc 16 \
|
|
30
|
+
--residue_features_file /scratch/ims86/EntDetect_Datastore/user_input/experimental_data/PDB_residue_features/AF/residueFeatures.csv \
|
|
31
|
+
--reg_formula "cut_C_Rall ~ AA + region"
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _read_gene_list(path):
|
|
36
|
+
genes = []
|
|
37
|
+
with open(path, "r", encoding="utf-8") as handle:
|
|
38
|
+
for line in handle:
|
|
39
|
+
item = line.strip()
|
|
40
|
+
if not item:
|
|
41
|
+
continue
|
|
42
|
+
if item.startswith("#"):
|
|
43
|
+
continue
|
|
44
|
+
genes.append(item)
|
|
45
|
+
return set(genes)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parse_formula(reg_formula: str):
|
|
49
|
+
left, right = reg_formula.split("~", 1)
|
|
50
|
+
response_var = left.strip()
|
|
51
|
+
reg_vars = [v.strip() for v in right.split("+") if v.strip()]
|
|
52
|
+
return response_var, reg_vars
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _parse_ent_region_to_set(ent_region_value):
|
|
56
|
+
"""Convert ent_region field to a set of integer residue indices."""
|
|
57
|
+
if ent_region_value is None:
|
|
58
|
+
return set()
|
|
59
|
+
|
|
60
|
+
text = str(ent_region_value).strip()
|
|
61
|
+
if text == "" or text.lower() == "nan":
|
|
62
|
+
return set()
|
|
63
|
+
|
|
64
|
+
region = set()
|
|
65
|
+
for tok in text.split(","):
|
|
66
|
+
tok = tok.strip()
|
|
67
|
+
if tok == "":
|
|
68
|
+
continue
|
|
69
|
+
try:
|
|
70
|
+
region.add(int(tok))
|
|
71
|
+
except ValueError:
|
|
72
|
+
continue
|
|
73
|
+
return region
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _collect_ent_region_map(root_outdir, selected_gene_ids, logger):
|
|
77
|
+
"""Build gene -> set(mapped_resid in entangled regions) from NCLE feature files."""
|
|
78
|
+
import glob
|
|
79
|
+
import os
|
|
80
|
+
import pandas as pd
|
|
81
|
+
|
|
82
|
+
ent_region_map = {}
|
|
83
|
+
missing_genes = []
|
|
84
|
+
|
|
85
|
+
for gene in sorted(selected_gene_ids):
|
|
86
|
+
feat_glob = os.path.join(root_outdir, gene, "Native_clustered_HQ_GE_features", "*_uent_features.csv")
|
|
87
|
+
feat_files = sorted(glob.glob(feat_glob))
|
|
88
|
+
if not feat_files:
|
|
89
|
+
missing_genes.append(gene)
|
|
90
|
+
ent_region_map[gene] = set()
|
|
91
|
+
continue
|
|
92
|
+
|
|
93
|
+
region_set = set()
|
|
94
|
+
for fp in feat_files:
|
|
95
|
+
try:
|
|
96
|
+
df = pd.read_csv(fp, sep="|", usecols=["ent_region"])
|
|
97
|
+
except Exception as exc:
|
|
98
|
+
logger.warning(f"Could not read ent_region from {fp}: {exc}")
|
|
99
|
+
continue
|
|
100
|
+
|
|
101
|
+
for v in df["ent_region"].values:
|
|
102
|
+
region_set.update(_parse_ent_region_to_set(v))
|
|
103
|
+
|
|
104
|
+
ent_region_map[gene] = region_set
|
|
105
|
+
|
|
106
|
+
if missing_genes:
|
|
107
|
+
logger.warning(
|
|
108
|
+
f"No NCLE feature files found for {len(missing_genes)} gene(s). "
|
|
109
|
+
f"Their region labels will default to 0."
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
return ent_region_map
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _build_design_matrices(args, selected_gene_ids, logger):
|
|
116
|
+
import os
|
|
117
|
+
import pandas as pd
|
|
118
|
+
|
|
119
|
+
if args.residue_features_file is None or args.reg_formula is None:
|
|
120
|
+
logger.info("Design-matrix build skipped (provide both --residue_features_file and --reg_formula to enable).")
|
|
121
|
+
return
|
|
122
|
+
|
|
123
|
+
if not os.path.isfile(args.residue_features_file):
|
|
124
|
+
raise FileNotFoundError(f"residue_features_file not found: {args.residue_features_file}")
|
|
125
|
+
|
|
126
|
+
response_var, reg_vars = _parse_formula(args.reg_formula)
|
|
127
|
+
logger.info(f"Building design matrices for formula: {response_var} ~ {' + '.join(reg_vars)}")
|
|
128
|
+
|
|
129
|
+
if "region" not in reg_vars:
|
|
130
|
+
logger.warning("Formula does not include 'region'; NCLE-derived region labels will not be used.")
|
|
131
|
+
|
|
132
|
+
ent_region_map = _collect_ent_region_map(args.outdir, selected_gene_ids, logger)
|
|
133
|
+
|
|
134
|
+
workflow4_root = os.path.dirname(os.path.abspath(args.outdir.rstrip(os.sep)))
|
|
135
|
+
combined_outfile = args.design_matrix_file or os.path.join(workflow4_root, "residue_dataframes_workflow4.csv")
|
|
136
|
+
|
|
137
|
+
req_cols = ["gene", "mapped_resid", "uniprot_length", response_var, *reg_vars]
|
|
138
|
+
# Keep AA for downstream filters even if omitted in formula
|
|
139
|
+
req_cols.extend(["AA"]) if "AA" not in req_cols else None
|
|
140
|
+
# remove region from file-read requirements (it is built from ent_region)
|
|
141
|
+
req_cols = [c for c in req_cols if c != "region"]
|
|
142
|
+
|
|
143
|
+
logger.info(f"Reading residue features from: {args.residue_features_file}")
|
|
144
|
+
df = pd.read_csv(args.residue_features_file, sep="|", low_memory=False)
|
|
145
|
+
|
|
146
|
+
missing_cols = [c for c in req_cols if c not in df.columns]
|
|
147
|
+
if missing_cols:
|
|
148
|
+
raise ValueError(f"Missing required columns in residue_features_file: {missing_cols}")
|
|
149
|
+
|
|
150
|
+
data = df[df["gene"].isin(selected_gene_ids)][req_cols].copy()
|
|
151
|
+
logger.info(f"Rows after gene-list filter: {len(data)}")
|
|
152
|
+
|
|
153
|
+
mapped_resid_num = pd.to_numeric(data["mapped_resid"], errors="coerce")
|
|
154
|
+
data["mapped_resid"] = mapped_resid_num
|
|
155
|
+
|
|
156
|
+
# Build region from NCLE ent_region sets
|
|
157
|
+
region_values = []
|
|
158
|
+
for gene, resid in zip(data["gene"].values, data["mapped_resid"].values):
|
|
159
|
+
if pd.isna(resid):
|
|
160
|
+
region_values.append(0)
|
|
161
|
+
continue
|
|
162
|
+
region_values.append(1 if int(resid) in ent_region_map.get(str(gene), set()) else 0)
|
|
163
|
+
data["region"] = region_values
|
|
164
|
+
|
|
165
|
+
final_cols = ["gene", "mapped_resid", "uniprot_length", *reg_vars, response_var]
|
|
166
|
+
# preserve order and uniqueness
|
|
167
|
+
seen = set()
|
|
168
|
+
final_cols = [c for c in final_cols if not (c in seen or seen.add(c))]
|
|
169
|
+
data = data[final_cols]
|
|
170
|
+
|
|
171
|
+
os.makedirs(os.path.dirname(os.path.abspath(combined_outfile)), exist_ok=True)
|
|
172
|
+
data.to_csv(combined_outfile, sep="|", index=False)
|
|
173
|
+
|
|
174
|
+
logger.info(f"Design matrix build complete: single matrix file written to {combined_outfile}")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _build_native_command(args, native_script, pdb_file, root_outdir, logdir):
|
|
178
|
+
import os
|
|
179
|
+
import sys
|
|
180
|
+
|
|
181
|
+
pdb_name = os.path.basename(pdb_file)
|
|
182
|
+
protein_id = os.path.splitext(pdb_name)[0]
|
|
183
|
+
protein_outdir = os.path.join(root_outdir, protein_id)
|
|
184
|
+
accession = args.Accession if args.Accession is not None else protein_id
|
|
185
|
+
|
|
186
|
+
cmd = [
|
|
187
|
+
sys.executable,
|
|
188
|
+
native_script,
|
|
189
|
+
"--struct", pdb_file,
|
|
190
|
+
"--outdir", protein_outdir,
|
|
191
|
+
"--ID", protein_id,
|
|
192
|
+
"--organism", args.organism,
|
|
193
|
+
"--Accession", accession,
|
|
194
|
+
"--model", args.model,
|
|
195
|
+
"--ent_detection_method", str(args.ent_detection_method),
|
|
196
|
+
"--log_level", args.log_level,
|
|
197
|
+
"--logdir", logdir,
|
|
198
|
+
]
|
|
199
|
+
|
|
200
|
+
if args.chain is not None:
|
|
201
|
+
cmd.extend(["--chain", args.chain])
|
|
202
|
+
|
|
203
|
+
if args.resolution is not None:
|
|
204
|
+
cmd.extend(["--resolution", args.resolution])
|
|
205
|
+
|
|
206
|
+
if args.contacts is not None:
|
|
207
|
+
cmd.extend(["--contacts", args.contacts])
|
|
208
|
+
|
|
209
|
+
if args.cluster_cutoff is not None:
|
|
210
|
+
cmd.extend(["--cluster_cutoff", str(args.cluster_cutoff)])
|
|
211
|
+
|
|
212
|
+
if args.cg:
|
|
213
|
+
cmd.append("--cg")
|
|
214
|
+
|
|
215
|
+
if args.Calpha:
|
|
216
|
+
cmd.append("--Calpha")
|
|
217
|
+
|
|
218
|
+
return protein_id, cmd
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _run_one(job):
|
|
222
|
+
import subprocess
|
|
223
|
+
|
|
224
|
+
protein_id, cmd = job
|
|
225
|
+
proc = subprocess.run(cmd, capture_output=True, text=True)
|
|
226
|
+
return protein_id, proc.returncode, proc.stdout, proc.stderr
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def main(argv=None):
    """Batch-run run_nativeNCLE.py over the PDB files that match a gene list.

    Parses the flags, selects the ``*.pdb`` files in ``--pdb_dir`` whose stem
    is in the gene list (or, with ``--allow_prefix_match``, starts with a gene
    ID followed by ``_`` or ``-``), runs one run_nativeNCLE.py subprocess per
    selected file on a thread pool of ``--nproc`` workers, and, when every
    job succeeded, calls ``_build_design_matrices``.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector to parse. ``None`` lets argparse read ``sys.argv[1:]``.

    Returns
    -------
    int
        ``0`` after a successful run or a dry run; ``1`` when at least one job
        failed, in which case the captured output of the failed jobs is written
        to ``workflow4_nativeNCLE_batch_failures.log`` in ``--outdir`` and the
        design-matrix step is skipped. Invalid arguments end the process
        through ``parser.error`` (``SystemExit`` with status 2).
    """
    import argparse
    import glob
    import logging
    import os
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    start_time = time.time()

    parser = argparse.ArgumentParser(
        description=(
            "Batch-run scripts/run_nativeNCLE.py over a PDB directory filtered by a gene list."
        )
    )

    # --- required batch inputs ---
    parser.add_argument("--pdb_dir", type=str, required=True,
                        help="Directory containing input .pdb files")
    parser.add_argument("--gene_list", type=str, required=True,
                        help="Gene/accession list file (one ID per line)")
    parser.add_argument("--outdir", type=str, required=True,
                        help="Root output directory; each protein writes to outdir/<ID>")

    # --- parallelism / matching behavior ---
    parser.add_argument("--nproc", type=int, default=8,
                        help="Number of parallel nativeNCLE jobs (default: 8)")
    parser.add_argument("--allow_prefix_match", action="store_true",
                        help=(
                            "Allow gene IDs to match as prefix of PDB stem (useful for "
                            "filenames containing structure suffixes)."
                        ))
    parser.add_argument("--dry_run", action="store_true",
                        help="Print selected proteins and exit without running jobs")

    # --- forwarded run_nativeNCLE options (minus --struct) ---
    parser.add_argument("--chain", type=str, default=None,
                        help="Chain identifier (optional)")
    parser.add_argument("--organism", type=str, default="Ecoli",
                        help="Organism for clustering: Ecoli | Human | Yeast")
    parser.add_argument("--Accession", type=str, default=None,
                        help="Accession value passed to run_nativeNCLE. If omitted, uses each protein ID from the PDB stem.")
    parser.add_argument("--cg", action="store_true",
                        help="Pass --cg to run_nativeNCLE (legacy flag)")
    parser.add_argument("--Calpha", "--calpha", action="store_true", dest="Calpha",
                        help="Pass --Calpha to run_nativeNCLE (legacy flag)")
    parser.add_argument("--resolution", type=str, choices=["aa", "cg"], default=None,
                        help="Resolution forwarded to run_nativeNCLE")
    parser.add_argument("--contacts", type=str, choices=["heavy", "calpha"], default=None,
                        help="Contact type forwarded to run_nativeNCLE")
    parser.add_argument("--cluster_cutoff", type=float, default=None,
                        help="Cluster cutoff forwarded to run_nativeNCLE")
    parser.add_argument("--model", type=str, default="AF",
                        help="Model type for HQ selection: EXP | AF")
    parser.add_argument("--ent_detection_method", type=int, default=3,
                        help="Entanglement detection method passed to run_nativeNCLE")

    # --- optional Step 4 design-matrix build ---
    parser.add_argument("--residue_features_file", type=str, default=None,
                        help="Path to residue features CSV (e.g., .../PDB_residue_features/AF/residueFeatures.csv)")
    parser.add_argument("--reg_formula", type=str, default=None,
                        help="Regression formula for design matrix (e.g., 'cut_C_Rall ~ AA + region')")
    parser.add_argument("--design_matrix_file", type=str, default=None,
                        help="Output path for combined design matrix CSV (default: sibling of --outdir/residue_dataframes_workflow4.csv)")

    # --- logging ---
    parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging verbosity (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None,
                        help="Directory for run log files (default: <outdir>/logs)")

    args = parser.parse_args(argv)

    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = args.logdir if args.logdir is not None else os.path.join(args.outdir, "logs")
    os.makedirs(logdir, exist_ok=True)

    logger = setup_logger("run_workflow4_nativeNCLE_batch", outdir=logdir, ID="workflow4_batch", log_level=log_level)
    logger.info(f"args: {args}")

    # --- validation ---
    if not os.path.isdir(args.pdb_dir):
        parser.error(f"--pdb_dir does not exist or is not a directory: {args.pdb_dir}")
    if not os.path.isfile(args.gene_list):
        parser.error(f"--gene_list does not exist or is not a file: {args.gene_list}")
    if args.nproc < 1:
        parser.error("--nproc must be >= 1")

    os.makedirs(args.outdir, exist_ok=True)

    # Resolve native runner path relative to this script.
    native_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run_nativeNCLE.py")
    if not os.path.isfile(native_script):
        parser.error(f"Could not locate run_nativeNCLE.py at expected path: {native_script}")

    gene_set = _read_gene_list(args.gene_list)
    if not gene_set:
        parser.error(f"--gene_list appears empty: {args.gene_list}")

    pdb_files = sorted(glob.glob(os.path.join(args.pdb_dir, "*.pdb")))
    if not pdb_files:
        parser.error(f"No .pdb files found in --pdb_dir: {args.pdb_dir}")

    # Accept a stem when a gene ID is an exact prefix token of it, i.e. the ID
    # followed by "_" or "-". The prefixes are built once, and str.startswith
    # takes a tuple, so each stem is tested with a single call. The tuple is
    # empty (never matches) unless prefix matching was requested.
    if args.allow_prefix_match:
        prefix_tokens = tuple(gene + sep for gene in gene_set for sep in ("_", "-"))
    else:
        prefix_tokens = ()

    selected = []
    for pdb_file in pdb_files:
        stem = os.path.splitext(os.path.basename(pdb_file))[0]
        if stem in gene_set or stem.startswith(prefix_tokens):
            selected.append(pdb_file)
    skipped = len(pdb_files) - len(selected)

    logger.info(f"Found {len(pdb_files)} pdb files")
    logger.info(f"Matched {len(selected)} structures against gene list; skipped {skipped}")

    if not selected:
        parser.error("No PDBs matched the provided gene list. Check naming conventions or use --allow_prefix_match.")

    jobs = [_build_native_command(args, native_script, pdb_file, args.outdir, logdir) for pdb_file in selected]

    if args.dry_run:
        logger.info("Dry-run mode enabled. Selected proteins:")
        for protein_id, _ in jobs:
            logger.info(f"  {protein_id}")
        logger.info(f"NORMAL TERMINATION - {time.time() - start_time:.1f} seconds")
        return 0

    logger.info(f"Launching {len(jobs)} nativeNCLE jobs with nproc={args.nproc}")

    failures = []
    completed = 0
    # Threads are enough here: each worker only waits on a child process.
    with ThreadPoolExecutor(max_workers=args.nproc) as executor:
        future_map = {executor.submit(_run_one, job): job[0] for job in jobs}
        for future in as_completed(future_map):
            protein_id = future_map[future]
            # Count every finished job, including those whose runner raised,
            # so the progress figures always reach len(jobs).
            completed += 1
            try:
                _, code, stdout, stderr = future.result()
            except Exception as exc:
                failures.append((protein_id, -1, "", f"internal runner error: {exc}"))
                logger.error(f"FAILED: {protein_id} (internal error) ({completed}/{len(jobs)})")
                continue

            if code == 0:
                logger.info(f"DONE: {protein_id} ({completed}/{len(jobs)})")
            else:
                failures.append((protein_id, code, stdout, stderr))
                logger.error(f"FAILED: {protein_id} exit_code={code} ({completed}/{len(jobs)})")

    if failures:
        fail_log = os.path.join(args.outdir, "workflow4_nativeNCLE_batch_failures.log")
        with open(fail_log, "w", encoding="utf-8") as handle:
            for protein_id, code, stdout, stderr in failures:
                handle.write(f"#{'='*78}\n")
                handle.write(f"protein: {protein_id}\n")
                handle.write(f"exit_code: {code}\n")
                handle.write("--- stdout ---\n")
                handle.write(stdout or "")
                handle.write("\n--- stderr ---\n")
                handle.write(stderr or "")
                handle.write("\n")
        logger.error(f"{len(failures)} jobs failed. See: {fail_log}")
        logger.info(f"NORMAL TERMINATION WITH FAILURES - {time.time() - start_time:.1f} seconds")
        return 1

    logger.info(f"All {len(jobs)} jobs completed successfully")

    # Optional Step 4: design matrix build
    # NOTE(review): these IDs are PDB stems. With --allow_prefix_match a stem
    # can differ from the gene ID it matched, and the design-matrix step
    # filters the residue features by these values -- confirm they agree.
    selected_gene_ids = [job[0] for job in jobs]
    _build_design_matrices(args, selected_gene_ids, logger)

    logger.info(f"NORMAL TERMINATION - {time.time() - start_time:.1f} seconds")
    return 0
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|