EntDetect 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. EntDetect/Jwalk/GridTools.py +567 -0
  2. EntDetect/Jwalk/PDBTools.py +532 -0
  3. EntDetect/Jwalk/SASDTools.py +543 -0
  4. EntDetect/Jwalk/SurfaceTools.py +150 -0
  5. EntDetect/Jwalk/__init__.py +19 -0
  6. EntDetect/Jwalk/naccess.config.txt +255 -0
  7. EntDetect/__init__.py +10 -0
  8. EntDetect/_logging.py +71 -0
  9. EntDetect/change_resolution.py +2361 -0
  10. EntDetect/clustering.py +2626 -0
  11. EntDetect/compare_sim2exp.py +1927 -0
  12. EntDetect/entanglement_features.py +478 -0
  13. EntDetect/gaussian_entanglement.py +2067 -0
  14. EntDetect/order_params.py +1048 -0
  15. EntDetect/resources/__init__.py +11 -0
  16. EntDetect/resources/__pycache__/__init__.cpython-311.pyc +0 -0
  17. EntDetect/resources/calc_K.pl +712 -0
  18. EntDetect/resources/calc_Q.pl +962 -0
  19. EntDetect/resources/pulchra +0 -0
  20. EntDetect/resources/shared_files/__init__.py +2 -0
  21. EntDetect/resources/shared_files/bt_contact_potential.dat +22 -0
  22. EntDetect/resources/shared_files/karanicolas_dihe_parm.dat +1600 -0
  23. EntDetect/resources/shared_files/kgs_contact_potential.dat +22 -0
  24. EntDetect/resources/shared_files/mj_contact_potential.dat +22 -0
  25. EntDetect/resources/stride +0 -0
  26. EntDetect/statistics.py +1344 -0
  27. EntDetect/utilities.py +201 -0
  28. entdetect-1.2.0.dist-info/METADATA +26 -0
  29. entdetect-1.2.0.dist-info/RECORD +45 -0
  30. entdetect-1.2.0.dist-info/WHEEL +5 -0
  31. entdetect-1.2.0.dist-info/entry_points.txt +11 -0
  32. entdetect-1.2.0.dist-info/licenses/LICENSE +674 -0
  33. entdetect-1.2.0.dist-info/top_level.txt +2 -0
  34. scripts/__init__.py +5 -0
  35. scripts/convert_cor_psf_to_pdb.py +103 -0
  36. scripts/run_Foldingpathway.py +162 -0
  37. scripts/run_MSM.py +152 -0
  38. scripts/run_OP_on_simulation_traj.py +194 -0
  39. scripts/run_change_resolution.py +63 -0
  40. scripts/run_compare_sim2exp.py +215 -0
  41. scripts/run_montecarlo.py +158 -0
  42. scripts/run_nativeNCLE.py +179 -0
  43. scripts/run_nonnative_entanglement_clustering.py +110 -0
  44. scripts/run_population_modeling.py +117 -0
  45. scripts/run_workflow4_nativeNCLE_batch.py +412 -0
@@ -0,0 +1,110 @@
1
+ from EntDetect.clustering import ClusterNonNativeEntanglements
2
+ from EntDetect._logging import setup_logger
3
+
4
+ """
5
+ Cluster non-native entanglement changes across an ensemble of simulation trajectories.
6
+
7
+ Reads per-trajectory entanglement pkl files produced by run_OP_on_simulation_traj.py
8
+ (located in the Combined_GE/ subdirectory of the G/ output folder), groups them into
9
+ non-redundant entanglement-change clusters, and writes representative structures and
10
+ per-frame cluster assignments to --outdir.
11
+
12
+ Examples
13
+ --------
14
+ Basic run:
15
+ python scripts/run_nonnative_entanglement_clustering.py \\
16
+ --outdir $DATASTORE/outputs/workflow2/nonnative_clustering \\
17
+ --trajnum2pklfile_path $DATASTORE/user_input/metadata/trajnum2file.txt \\
18
+ --traj_dir_prefix $DATASTORE/user_input/cg_trajectories
19
+
20
+ Flags
21
+ -----
22
+ --outdir Output directory for clustering results
23
+ --trajnum2pklfile_path CSV file (source of truth) with columns: trajnum, pklfile
24
+ Users control exactly which pkl files to analyze via this file
25
+ --traj_dir_prefix Path prefix to the directory containing trajectory DCD files
26
+ --start_frame First frame index to include, 0-based (default: 0)
27
+ --end_frame Last frame index to include, 0-based (default: all frames)
28
+ --nproc Number of parallel worker threads (default: 1)
29
+ Parallelises both pkl loading (per trajectory) and
30
+ entanglement-keyword clustering (per unique keyword).
31
+ Use the number of available CPU cores for best speed.
32
+ --log_level Logging verbosity: DEBUG, INFO, WARNING, ERROR (default: INFO)
33
+ --logdir Directory for log file (default: same as --outdir)
34
+ """
35
+
36
+
37
def main(argv=None):
    """Parse CLI arguments and run non-native entanglement clustering.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector handed to ``argparse``; ``None`` lets argparse read
        the process command line.

    Returns
    -------
    int
        0 once ``ClusterNonNativeEntanglements.cluster`` has returned.

    Notes
    -----
    Missing or invalid inputs terminate the run through ``parser.error``
    (which raises ``SystemExit``).
    """

    ###---------------------------------------------------------------------------------------------------------
    import os
    import argparse
    import time
    import logging
    start_time = time.time()
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Cluster non-native entanglement changes across simulation trajectories.")

    # --- identity / IO ---
    parser.add_argument("--outdir", type=str, required=True, help="Output directory for clustering results")
    parser.add_argument("--trajnum2pklfile_path", type=str, required=True, help="CSV file mapping trajectory numbers to pkl file paths (source of truth for which files to analyze)")
    parser.add_argument("--traj_dir_prefix", type=str, required=True, help="Path prefix to the directory containing trajectory DCD files")

    # --- frame selection ---
    parser.add_argument("--start_frame", type=int, default=0, help="First frame index to include, 0-based (default: 0)")
    parser.add_argument("--end_frame", type=int, default=9999999, help="Last frame index to include, 0-based (default: all frames)")

    # --- parallelism ---
    parser.add_argument("--nproc", type=int, default=1, help="Number of parallel worker threads (default: 1)")

    # --- logging ---
    parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="Logging verbosity (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None, help="Directory for log file (default: same as --outdir)")

    args = parser.parse_args(argv)

    outdir = args.outdir
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = args.logdir if args.logdir is not None else outdir
    # Make sure the log directory exists before loggers are pointed at it,
    # matching what the sibling run_* scripts do before calling setup_logger.
    os.makedirs(logdir, exist_ok=True)

    logger = setup_logger('run_nonnative_clustering', outdir=logdir, ID='ClusterNonNativeEntanglements', log_level=log_level)
    # Second call configures the logger name used for the clustering class
    # (presumably the name the library logs under -- confirm in EntDetect.clustering).
    setup_logger('ClusterNonNativeEntanglements', outdir=logdir, ID='ClusterNonNativeEntanglements', log_level=log_level)
    logger.info(f'args: {args}')
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    # --- input validation ---
    if not os.path.isfile(args.trajnum2pklfile_path):
        parser.error(f"--trajnum2pklfile_path does not exist: {args.trajnum2pklfile_path}")

    if not os.path.isdir(args.traj_dir_prefix):
        parser.error(f"--traj_dir_prefix does not exist or is not a directory: {args.traj_dir_prefix}")

    os.makedirs(outdir, exist_ok=True)
    ###---------------------------------------------------------------------------------------------------------

    ###---------------------------------------------------------------------------------------------------------
    clustering_NNents = ClusterNonNativeEntanglements(
        trajnum2pklfile_path=args.trajnum2pklfile_path,
        traj_dir_prefix=args.traj_dir_prefix,
        outdir=outdir,
        log_level=log_level,
        logdir=logdir,
        nproc=args.nproc,
    )
    logger.info(f'ClusterNonNativeEntanglements: {clustering_NNents}')
    clustering_NNents.cluster(start_frame=args.start_frame, end_frame=args.end_frame)
    ###---------------------------------------------------------------------------------------------------------

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0
107
+
108
+
109
# Script entry point: exit the process with the integer status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python3
2
+ from EntDetect.statistics import ProteomeLogisticRegression
3
+ from EntDetect._logging import setup_logger
4
+
5
+ """
6
+ Run Workflow 4 proteome-level logistic regression from residue feature tables.
7
+
8
+ This script wraps EntDetect.statistics.ProteomeLogisticRegression and writes
9
+ the final regression table as a pipe-delimited CSV.
10
+
11
+ Example
12
+ -------
13
+ python scripts/run_population_modeling.py \
14
+ --dataframe_files /path/to/residue_dataframes_workflow4.csv \
15
+ --outdir /path/to/workflow4/population_modeling/ \
16
+ --gene_list /path/to/gene_list.txt \
17
+ --tag Ecoli_population \
18
+ --reg_formula "cut_C_Rall ~ AA + region"
19
+
20
+ Expected input schema
21
+ ---------------------
22
+ - Either a single combined design matrix file OR one residue table per protein
23
+ - Files are pipe-delimited ("|")
24
+ - Required columns: gene, mapped_resid, AA, region, cut_C_Rall
25
+ """
26
+
27
def main(argv=None):
    """Run the proteome-level logistic regression and save its result table.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector for ``argparse``; ``None`` means the process command line.

    Returns
    -------
    int
        0 after the regression table has been written to
        ``<outdir>/regression_results_<tag>.csv`` (pipe-delimited).

    Invalid inputs abort through ``parser.error`` (``SystemExit``).
    """
    import argparse
    import logging
    import os
    import time

    t_begin = time.time()

    cli = argparse.ArgumentParser(
        description="Run Workflow 4 proteome-level logistic regression from residue feature tables."
    )

    # --- required IO ---
    cli.add_argument(
        "--dataframe_files", type=str, required=True,
        help="Input design matrix path: either a directory of per-protein files or a single combined CSV",
    )
    cli.add_argument(
        "--outdir", type=str, required=True,
        help="Output directory for regression results",
    )
    cli.add_argument(
        "--gene_list", type=str, required=True,
        help="Path to gene list file (one ID per line)",
    )
    cli.add_argument(
        "--tag", type=str, required=True,
        help="Identifier tag for output naming",
    )

    # --- model options ---
    cli.add_argument(
        "--reg_formula", type=str, default='cut_C_Rall ~ AA + region',
        help="Regression formula (default: 'cut_C_Rall ~ AA + region')",
    )

    # --- logging ---
    cli.add_argument(
        "--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging verbosity (default: INFO)",
    )
    cli.add_argument(
        "--logdir", type=str, default=None,
        help="Directory for log files (default: same as --outdir)",
    )

    args = cli.parse_args(argv)

    # Logs go next to the results unless a dedicated directory was requested.
    level = getattr(logging, args.log_level.upper(), logging.INFO)
    log_directory = args.outdir if args.logdir is None else args.logdir
    os.makedirs(log_directory, exist_ok=True)

    logger = setup_logger('run_population_modeling', outdir=log_directory, ID=args.tag, log_level=level)
    setup_logger('ProteomeLogisticRegression', outdir=log_directory, ID=args.tag, log_level=level)
    logger.info(f'args: {args}')

    # --- input validation ---
    if not os.path.exists(args.dataframe_files):
        cli.error(f"--dataframe_files does not exist: {args.dataframe_files}")
    if not os.path.isfile(args.gene_list):
        cli.error(f"--gene_list does not exist or is not a file: {args.gene_list}")
    os.makedirs(args.outdir, exist_ok=True)

    # Set up the regression driver with everything taken from the CLI.
    regression = ProteomeLogisticRegression(
        dataframe_files=args.dataframe_files,
        outdir=args.outdir,
        gene_list=args.gene_list,
        ID=args.tag,
        reg_formula=args.reg_formula,
        log_level=level,
        logdir=log_directory,
    )
    logger.info(f'ProteomeLogisticRegression: {regression}')

    # --- step 1: load residue-level data ---
    # NOTE(review): these variable names are fixed here and do not follow
    # --reg_formula; confirm that a non-default formula is still handled
    # correctly by load_data()/run().
    regression.load_data(
        sep='|',
        reg_var=['AA', 'region'],
        response_var='cut_C_Rall',
        var2binarize=['cut_C_Rall', 'region'],
        mask_column='mapped_resid',
    )

    # --- step 2: run regression ---
    results = regression.run()

    # --- step 3: persist results ---
    results_path = os.path.join(args.outdir, f"regression_results_{args.tag}.csv")
    results.to_csv(results_path, index=False, sep='|')
    logger.info(f"SAVED: {results_path}")

    logger.info(f'NORMAL TERMINATION - {time.time() - t_begin:.1f} seconds')
    return 0
114
+
115
+
116
# Script entry point: exit the process with the integer status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,412 @@
1
+ #!/usr/bin/env python3
2
+ from EntDetect._logging import setup_logger
3
+
4
+ """
5
+ Batch Workflow 4 helper for Steps 3 and 4.
6
+
7
+ This script scans a directory of PDB files, filters to structures whose IDs are
8
+ present in a gene list, and runs scripts/run_nativeNCLE.py in parallel for each
9
+ selected structure.
10
+
11
+ Optionally, it can also build Step 4 regression/design-matrix files by combining:
12
+ 1) per-residue experimental/structural features from a residueFeatures CSV, and
13
+ 2) region labels inferred from NCLE `ent_region` output columns.
14
+
15
+ Compared to run_nativeNCLE.py, this wrapper accepts --pdb_dir and --gene_list
16
+ instead of --struct, then forwards the remaining nativeNCLE options.
17
+
18
+ Example
19
+ -------
20
+ python scripts/run_workflow4_nativeNCLE_batch.py \
21
+ --pdb_dir /scratch/ims86/EntDetect_Datastore/user_input/proteome_structures/AF \
22
+ --gene_list /scratch/ims86/EntDetect_Datastore/user_input/experimental_data/Gene_lists/AF/AF_0.6g_C_Rall_spa50_LiPMScov50_all_genes.txt \
23
+ --outdir /scratch/ims86/EntDetect_Datastore/outputs/workflow4/nativeNCLE_all \
24
+ --organism Ecoli \
25
+ --model AF \
26
+ --contacts heavy \
27
+ --resolution aa \
28
+ --ent_detection_method 3 \
29
+ --nproc 16 \
30
+ --residue_features_file /scratch/ims86/EntDetect_Datastore/user_input/experimental_data/PDB_residue_features/AF/residueFeatures.csv \
31
+ --reg_formula "cut_C_Rall ~ AA + region"
32
+ """
33
+
34
+
35
def _read_gene_list(path):
    """Return the set of IDs listed in *path*, one per line.

    Lines are stripped of surrounding whitespace; blank lines and lines whose
    first non-blank character is ``#`` are ignored. The file is read as UTF-8.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = (raw.strip() for raw in handle)
        return {entry for entry in entries if entry and not entry.startswith("#")}
46
+
47
+
48
def _parse_formula(reg_formula: str):
    """Split a ``response ~ term + term`` formula into its parts.

    Parameters
    ----------
    reg_formula : str
        Formula text; everything left of the first ``~`` is the response
        variable, the right-hand side is split on ``+``.

    Returns
    -------
    tuple
        ``(response_var, reg_vars)`` where ``reg_vars`` is the list of
        non-empty, whitespace-stripped right-hand-side terms.

    Raises
    ------
    ValueError
        If the formula has no ``~`` or nothing on its left-hand side.
    """
    left, tilde, right = reg_formula.partition("~")
    if not tilde:
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            f"Invalid regression formula (expected 'response ~ var1 + var2', missing '~'): {reg_formula!r}"
        )

    response_var = left.strip()
    if not response_var:
        raise ValueError(
            f"Invalid regression formula (empty response variable before '~'): {reg_formula!r}"
        )

    reg_vars = [term.strip() for term in right.split("+") if term.strip()]
    return response_var, reg_vars
53
+
54
+
55
def _parse_ent_region_to_set(ent_region_value):
    """Turn a comma-separated ``ent_region`` cell into a set of ints.

    ``None``, an empty/blank string and the text ``nan`` (any case) give an
    empty set. Tokens that ``int()`` cannot parse are dropped silently.
    """
    if ent_region_value is None:
        return set()

    raw = str(ent_region_value).strip()
    if not raw or raw.lower() == "nan":
        return set()

    residues = set()
    for piece in (chunk.strip() for chunk in raw.split(",")):
        if piece:
            try:
                residues.add(int(piece))
            except ValueError:
                # Non-integer token: ignored on purpose.
                pass
    return residues
74
+
75
+
76
def _collect_ent_region_map(root_outdir, selected_gene_ids, logger):
    """Map every selected ID to the union of its ``ent_region`` residues.

    For each ID (processed in sorted order) the files matching
    ``<root_outdir>/<ID>/Native_clustered_HQ_GE_features/*_uent_features.csv``
    are read as pipe-delimited tables and the ``ent_region`` column is parsed
    with ``_parse_ent_region_to_set``. IDs without any such file get an empty
    set and are counted in a single summary warning; files that cannot be
    read are logged as warnings and skipped.

    Returns a dict ``{ID: set of residue ints}``.
    """
    import glob
    import os
    import pandas as pd

    ent_region_map = {}
    genes_without_features = []

    for gene in sorted(selected_gene_ids):
        pattern = os.path.join(root_outdir, gene, "Native_clustered_HQ_GE_features", "*_uent_features.csv")
        feature_paths = sorted(glob.glob(pattern))
        if not feature_paths:
            genes_without_features.append(gene)

        residues = set()
        for feature_path in feature_paths:
            try:
                table = pd.read_csv(feature_path, sep="|", usecols=["ent_region"])
            except Exception as exc:
                # Best effort: one unreadable file must not abort the whole batch.
                logger.warning(f"Could not read ent_region from {feature_path}: {exc}")
                continue

            for cell in table["ent_region"].values:
                residues |= _parse_ent_region_to_set(cell)

        ent_region_map[gene] = residues

    if genes_without_features:
        logger.warning(
            f"No NCLE feature files found for {len(genes_without_features)} gene(s). "
            f"Their region labels will default to 0."
        )

    return ent_region_map
113
+
114
+
115
def _build_design_matrices(args, selected_gene_ids, logger):
    """Write the combined Step 4 design matrix as one pipe-delimited file.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``residue_features_file``, ``reg_formula``, ``outdir`` and
        ``design_matrix_file``.
    selected_gene_ids : iterable of str
        IDs used both to filter the ``gene`` column of the residue features
        table and to locate NCLE feature files under ``args.outdir``.
    logger : logging.Logger
        Receives progress and warning messages.

    Returns
    -------
    None
        The function returns early (after an info message) when either
        ``--residue_features_file`` or ``--reg_formula`` was not given.

    Raises
    ------
    FileNotFoundError
        If ``args.residue_features_file`` is not an existing file.
    ValueError
        If required columns are missing from the residue features table.
    """
    import os
    import pandas as pd

    if args.residue_features_file is None or args.reg_formula is None:
        logger.info("Design-matrix build skipped (provide both --residue_features_file and --reg_formula to enable).")
        return

    if not os.path.isfile(args.residue_features_file):
        raise FileNotFoundError(f"residue_features_file not found: {args.residue_features_file}")

    response_var, reg_vars = _parse_formula(args.reg_formula)
    logger.info(f"Building design matrices for formula: {response_var} ~ {' + '.join(reg_vars)}")

    if "region" not in reg_vars:
        logger.warning("Formula does not include 'region'; NCLE-derived region labels will not be used.")

    ent_region_map = _collect_ent_region_map(args.outdir, selected_gene_ids, logger)

    # Default output lives in the parent directory of --outdir.
    workflow4_root = os.path.dirname(os.path.abspath(args.outdir.rstrip(os.sep)))
    combined_outfile = args.design_matrix_file or os.path.join(workflow4_root, "residue_dataframes_workflow4.csv")

    # Columns that must exist in the input table. AA is always required, even
    # when the formula omits it. dict.fromkeys() removes repeats while keeping
    # order, so a formula that names a term twice cannot select duplicate-
    # labelled columns. 'region' is excluded because it is built below from
    # the NCLE ent_region sets, not read from the file.
    req_cols = [
        col
        for col in dict.fromkeys(
            ["gene", "mapped_resid", "uniprot_length", response_var, *reg_vars, "AA"]
        )
        if col != "region"
    ]

    logger.info(f"Reading residue features from: {args.residue_features_file}")
    df = pd.read_csv(args.residue_features_file, sep="|", low_memory=False)

    missing_cols = [c for c in req_cols if c not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in residue_features_file: {missing_cols}")

    data = df[df["gene"].isin(selected_gene_ids)][req_cols].copy()
    logger.info(f"Rows after gene-list filter: {len(data)}")

    # Unparseable residue numbers become NaN and are labelled region 0 below.
    data["mapped_resid"] = pd.to_numeric(data["mapped_resid"], errors="coerce")

    # Build region from NCLE ent_region sets: 1 when the residue lies in any
    # entangled region recorded for its gene, otherwise 0.
    no_region = set()
    data["region"] = [
        0 if pd.isna(resid) else int(int(resid) in ent_region_map.get(str(gene), no_region))
        for gene, resid in zip(data["gene"].values, data["mapped_resid"].values)
    ]

    # Output columns in a fixed order, without repeats.
    # NOTE(review): AA is required on input but only written out when the
    # formula names it -- confirm this is the intended behaviour.
    final_cols = list(dict.fromkeys(["gene", "mapped_resid", "uniprot_length", *reg_vars, response_var]))
    data = data[final_cols]

    os.makedirs(os.path.dirname(os.path.abspath(combined_outfile)), exist_ok=True)
    data.to_csv(combined_outfile, sep="|", index=False)

    logger.info(f"Design matrix build complete: single matrix file written to {combined_outfile}")
175
+
176
+
177
def _build_native_command(args, native_script, pdb_file, root_outdir, logdir):
    """Assemble the run_nativeNCLE.py command line for one PDB file.

    The protein ID is the PDB file name without its extension; it names the
    per-protein output directory under *root_outdir* and stands in for the
    accession when ``args.Accession`` is ``None``. Optional settings are only
    forwarded when they were set on *args*.

    Returns ``(protein_id, cmd)`` where ``cmd`` is an argument list starting
    with the current Python interpreter.
    """
    import os
    import sys

    protein_id, _ext = os.path.splitext(os.path.basename(pdb_file))
    accession = protein_id if args.Accession is None else args.Accession

    cmd = [sys.executable, native_script]
    cmd += ["--struct", pdb_file]
    cmd += ["--outdir", os.path.join(root_outdir, protein_id)]
    cmd += ["--ID", protein_id]
    cmd += ["--organism", args.organism]
    cmd += ["--Accession", accession]
    cmd += ["--model", args.model]
    cmd += ["--ent_detection_method", str(args.ent_detection_method)]
    cmd += ["--log_level", args.log_level]
    cmd += ["--logdir", logdir]

    # Valued options are forwarded only when explicitly provided.
    for flag, value in (
        ("--chain", args.chain),
        ("--resolution", args.resolution),
        ("--contacts", args.contacts),
    ):
        if value is not None:
            cmd += [flag, value]

    if args.cluster_cutoff is not None:
        cmd += ["--cluster_cutoff", str(args.cluster_cutoff)]

    # Boolean legacy switches.
    for flag, enabled in (("--cg", args.cg), ("--Calpha", args.Calpha)):
        if enabled:
            cmd.append(flag)

    return protein_id, cmd
219
+
220
+
221
def _run_one(job):
    """Execute one ``(protein_id, cmd)`` job as a subprocess.

    Output is captured as text rather than streamed. Returns
    ``(protein_id, returncode, stdout, stderr)``; a non-zero exit status is
    reported through the return code, not raised.
    """
    import subprocess

    label, argv = job
    finished = subprocess.run(argv, capture_output=True, text=True)
    return (label, finished.returncode, finished.stdout, finished.stderr)
227
+
228
+
229
def main(argv=None):
    """Batch-run run_nativeNCLE.py over the PDB files selected by a gene list.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector for ``argparse``; ``None`` means the process command line.

    Returns
    -------
    int
        0 on success (including ``--dry_run``), 1 when at least one job
        failed. In the failure case the captured output of every failed job
        is written to ``<outdir>/workflow4_nativeNCLE_batch_failures.log``
        and the design-matrix step is not run.

    Invalid inputs abort through ``parser.error`` (``SystemExit``).
    """
    import argparse
    import glob
    import logging
    import os
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    t_begin = time.time()

    cli = argparse.ArgumentParser(
        description=(
            "Batch-run scripts/run_nativeNCLE.py over a PDB directory filtered by a gene list."
        )
    )

    # --- required batch inputs ---
    cli.add_argument(
        "--pdb_dir", type=str, required=True,
        help="Directory containing input .pdb files",
    )
    cli.add_argument(
        "--gene_list", type=str, required=True,
        help="Gene/accession list file (one ID per line)",
    )
    cli.add_argument(
        "--outdir", type=str, required=True,
        help="Root output directory; each protein writes to outdir/<ID>",
    )

    # --- parallelism / matching behavior ---
    cli.add_argument(
        "--nproc", type=int, default=8,
        help="Number of parallel nativeNCLE jobs (default: 8)",
    )
    cli.add_argument(
        "--allow_prefix_match", action="store_true",
        help=(
            "Allow gene IDs to match as prefix of PDB stem (useful for "
            "filenames containing structure suffixes)."
        ),
    )
    cli.add_argument(
        "--dry_run", action="store_true",
        help="Print selected proteins and exit without running jobs",
    )

    # --- forwarded run_nativeNCLE options (minus --struct) ---
    cli.add_argument(
        "--chain", type=str, default=None,
        help="Chain identifier (optional)",
    )
    cli.add_argument(
        "--organism", type=str, default="Ecoli",
        help="Organism for clustering: Ecoli | Human | Yeast",
    )
    cli.add_argument(
        "--Accession", type=str, default=None,
        help="Accession value passed to run_nativeNCLE. If omitted, uses each protein ID from the PDB stem.",
    )
    cli.add_argument(
        "--cg", action="store_true",
        help="Pass --cg to run_nativeNCLE (legacy flag)",
    )
    cli.add_argument(
        "--Calpha", "--calpha", action="store_true", dest="Calpha",
        help="Pass --Calpha to run_nativeNCLE (legacy flag)",
    )
    cli.add_argument(
        "--resolution", type=str, choices=["aa", "cg"], default=None,
        help="Resolution forwarded to run_nativeNCLE",
    )
    cli.add_argument(
        "--contacts", type=str, choices=["heavy", "calpha"], default=None,
        help="Contact type forwarded to run_nativeNCLE",
    )
    cli.add_argument(
        "--cluster_cutoff", type=float, default=None,
        help="Cluster cutoff forwarded to run_nativeNCLE",
    )
    cli.add_argument(
        "--model", type=str, default="AF",
        help="Model type for HQ selection: EXP | AF",
    )
    cli.add_argument(
        "--ent_detection_method", type=int, default=3,
        help="Entanglement detection method passed to run_nativeNCLE",
    )

    # --- optional Step 4 design-matrix build ---
    cli.add_argument(
        "--residue_features_file", type=str, default=None,
        help="Path to residue features CSV (e.g., .../PDB_residue_features/AF/residueFeatures.csv)",
    )
    cli.add_argument(
        "--reg_formula", type=str, default=None,
        help="Regression formula for design matrix (e.g., 'cut_C_Rall ~ AA + region')",
    )
    cli.add_argument(
        "--design_matrix_file", type=str, default=None,
        help="Output path for combined design matrix CSV (default: sibling of --outdir/residue_dataframes_workflow4.csv)",
    )

    # --- logging ---
    cli.add_argument(
        "--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging verbosity (default: INFO)",
    )
    cli.add_argument(
        "--logdir", type=str, default=None,
        help="Directory for run log files (default: <outdir>/logs)",
    )

    args = cli.parse_args(argv)

    level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = os.path.join(args.outdir, "logs") if args.logdir is None else args.logdir
    os.makedirs(logdir, exist_ok=True)

    logger = setup_logger("run_workflow4_nativeNCLE_batch", outdir=logdir, ID="workflow4_batch", log_level=level)
    logger.info(f"args: {args}")

    # --- validation ---
    if not os.path.isdir(args.pdb_dir):
        cli.error(f"--pdb_dir does not exist or is not a directory: {args.pdb_dir}")
    if not os.path.isfile(args.gene_list):
        cli.error(f"--gene_list does not exist or is not a file: {args.gene_list}")
    if args.nproc < 1:
        cli.error("--nproc must be >= 1")

    os.makedirs(args.outdir, exist_ok=True)

    # The per-protein runner is expected next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    native_script = os.path.join(script_dir, "run_nativeNCLE.py")
    if not os.path.isfile(native_script):
        cli.error(f"Could not locate run_nativeNCLE.py at expected path: {native_script}")

    gene_set = _read_gene_list(args.gene_list)
    if not gene_set:
        cli.error(f"--gene_list appears empty: {args.gene_list}")

    pdb_files = sorted(glob.glob(os.path.join(args.pdb_dir, "*.pdb")))
    if not pdb_files:
        cli.error(f"No .pdb files found in --pdb_dir: {args.pdb_dir}")

    # Keep a PDB when its stem is a listed ID, or (opt-in) when a listed ID
    # followed by "_" or "-" is a prefix of the stem.
    selected = []
    skipped = 0
    for pdb_path in pdb_files:
        stem = os.path.splitext(os.path.basename(pdb_path))[0]
        matched = stem in gene_set or (
            args.allow_prefix_match
            and any(stem.startswith((gene + "_", gene + "-")) for gene in gene_set)
        )
        if matched:
            selected.append(pdb_path)
        else:
            skipped += 1

    logger.info(f"Found {len(pdb_files)} pdb files")
    logger.info(f"Matched {len(selected)} structures against gene list; skipped {skipped}")

    if not selected:
        cli.error("No PDBs matched the provided gene list. Check naming conventions or use --allow_prefix_match.")

    jobs = [
        _build_native_command(args, native_script, pdb_path, args.outdir, logdir)
        for pdb_path in selected
    ]
    total = len(jobs)

    if args.dry_run:
        logger.info("Dry-run mode enabled. Selected proteins:")
        for protein_id, _cmd in jobs:
            logger.info(f"  {protein_id}")
        logger.info(f"NORMAL TERMINATION - {time.time() - t_begin:.1f} seconds")
        return 0

    logger.info(f"Launching {total} nativeNCLE jobs with nproc={args.nproc}")

    # Each worker thread only waits on a child process, so threads are enough
    # to run the jobs side by side.
    failures = []
    completed = 0
    with ThreadPoolExecutor(max_workers=args.nproc) as pool:
        pending = {pool.submit(_run_one, job): job[0] for job in jobs}
        for done in as_completed(pending):
            protein_id = pending[done]
            try:
                _, code, stdout, stderr = done.result()
            except Exception as exc:
                failures.append((protein_id, -1, "", f"internal runner error: {exc}"))
                logger.error(f"FAILED: {protein_id} (internal error)")
                continue

            completed += 1
            if code != 0:
                failures.append((protein_id, code, stdout, stderr))
                logger.error(f"FAILED: {protein_id} exit_code={code} ({completed}/{total})")
            else:
                logger.info(f"DONE: {protein_id} ({completed}/{total})")

    if failures:
        fail_log = os.path.join(args.outdir, "workflow4_nativeNCLE_batch_failures.log")
        with open(fail_log, "w", encoding="utf-8") as handle:
            for protein_id, code, stdout, stderr in failures:
                handle.writelines([
                    f"#{'='*78}\n",
                    f"protein: {protein_id}\n",
                    f"exit_code: {code}\n",
                    "--- stdout ---\n",
                    stdout or "",
                    "\n--- stderr ---\n",
                    stderr or "",
                    "\n",
                ])
        logger.error(f"{len(failures)} jobs failed. See: {fail_log}")
        logger.info(f"NORMAL TERMINATION WITH FAILURES - {time.time() - t_begin:.1f} seconds")
        return 1

    logger.info(f"All {total} jobs completed successfully")

    # Optional Step 4: design matrix build
    # NOTE(review): these IDs are PDB stems. With --allow_prefix_match a stem
    # can differ from the gene-list ID it matched -- confirm the residue
    # features "gene" column uses the same naming.
    selected_gene_ids = [protein_id for protein_id, _cmd in jobs]
    _build_design_matrices(args, selected_gene_ids, logger)

    logger.info(f"NORMAL TERMINATION - {time.time() - t_begin:.1f} seconds")
    return 0
409
+
410
+
411
# Script entry point: exit the process with the integer status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())