EntDetect 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EntDetect/Jwalk/GridTools.py +567 -0
- EntDetect/Jwalk/PDBTools.py +532 -0
- EntDetect/Jwalk/SASDTools.py +543 -0
- EntDetect/Jwalk/SurfaceTools.py +150 -0
- EntDetect/Jwalk/__init__.py +19 -0
- EntDetect/Jwalk/naccess.config.txt +255 -0
- EntDetect/__init__.py +10 -0
- EntDetect/_logging.py +71 -0
- EntDetect/change_resolution.py +2361 -0
- EntDetect/clustering.py +2626 -0
- EntDetect/compare_sim2exp.py +1927 -0
- EntDetect/entanglement_features.py +478 -0
- EntDetect/gaussian_entanglement.py +2067 -0
- EntDetect/order_params.py +1048 -0
- EntDetect/resources/__init__.py +11 -0
- EntDetect/resources/__pycache__/__init__.cpython-311.pyc +0 -0
- EntDetect/resources/calc_K.pl +712 -0
- EntDetect/resources/calc_Q.pl +962 -0
- EntDetect/resources/pulchra +0 -0
- EntDetect/resources/shared_files/__init__.py +2 -0
- EntDetect/resources/shared_files/bt_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/karanicolas_dihe_parm.dat +1600 -0
- EntDetect/resources/shared_files/kgs_contact_potential.dat +22 -0
- EntDetect/resources/shared_files/mj_contact_potential.dat +22 -0
- EntDetect/resources/stride +0 -0
- EntDetect/statistics.py +1344 -0
- EntDetect/utilities.py +201 -0
- entdetect-1.2.0.dist-info/METADATA +26 -0
- entdetect-1.2.0.dist-info/RECORD +45 -0
- entdetect-1.2.0.dist-info/WHEEL +5 -0
- entdetect-1.2.0.dist-info/entry_points.txt +11 -0
- entdetect-1.2.0.dist-info/licenses/LICENSE +674 -0
- entdetect-1.2.0.dist-info/top_level.txt +2 -0
- scripts/__init__.py +5 -0
- scripts/convert_cor_psf_to_pdb.py +103 -0
- scripts/run_Foldingpathway.py +162 -0
- scripts/run_MSM.py +152 -0
- scripts/run_OP_on_simulation_traj.py +194 -0
- scripts/run_change_resolution.py +63 -0
- scripts/run_compare_sim2exp.py +215 -0
- scripts/run_montecarlo.py +158 -0
- scripts/run_nativeNCLE.py +179 -0
- scripts/run_nonnative_entanglement_clustering.py +110 -0
- scripts/run_population_modeling.py +117 -0
- scripts/run_workflow4_nativeNCLE_batch.py +412 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
python scripts/run_change_resolution.py
|
|
3
|
+
--outdir TestingGrounds/CoarseGraining/
|
|
4
|
+
--pdbfile /path/to/structure.pdb
|
|
5
|
+
--nscal 2
|
|
6
|
+
--domain_file /path/to/domain_def.dat
|
|
7
|
+
--ID test_protein
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
def main(argv=None):
    """Coarse-grain an all-atom structure and back-map the result.

    Workflow:
      1. Parse and validate the command line.
      2. Build the coarse-grained model with ``CoarseGrain.run()``.
      3. Convert the generated prm/top files with ``parse_cg_prm``.
      4. Back-map the coarse-grained coordinates onto the original
         all-atom structure with ``BackMapping.backmap``.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        int: 0 on normal termination.

    Raises:
        SystemExit: With code 2 (via argparse) when a required argument is
            missing or ``--pdbfile`` / ``--domain_file`` is not an existing
            file.
    """
    import argparse
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--outdir", type=str, required=True, help="output directory for results")
    parser.add_argument("--pdbfile", type=str, required=True, help="Path to the all-atom PDB file")
    parser.add_argument("--nscal", type=int, default=2, help="Coarse graining scale factor")
    parser.add_argument("--domain_file", type=str, required=True, help="Path to the domain definition file")
    parser.add_argument("--ID", type=str, required=True, help="Tag for the coarse graining process")
    args = parser.parse_args(argv)
    print(args)
    outdir = args.outdir
    pdbfile = args.pdbfile
    nscal = args.nscal
    domain_file = args.domain_file
    ID = args.ID

    # Fail fast (argparse exit code 2) on bad input paths instead of
    # surfacing the problem from deep inside the coarse-graining pipeline.
    for flag, path in (("--pdbfile", pdbfile), ("--domain_file", domain_file)):
        if not os.path.isfile(path):
            parser.error(f"{flag} does not exist or is not a file: {path}")

    # Only touch the filesystem once the inputs are known to be usable.
    os.makedirs(outdir, exist_ok=True)

    # Import here so `run_change_resolution -h` works without OpenMM installed.
    from EntDetect.change_resolution import CoarseGrain, BackMapping

    ## Coarse grain the all-atom structure
    CoarseGrainer = CoarseGrain(outdir=outdir,
                                ID=ID,
                                pdbfile=pdbfile,
                                nscal=nscal,
                                domain_file=domain_file)
    print(CoarseGrainer)

    # run the coarse graining for the all-atom pdbfile
    CGfiles = CoarseGrainer.run()
    print(CGfiles)

    # parse the prm and top file to make a OpenMM compatible .xml force field file
    CoarseGrainer.parse_cg_prm(prmfile=CGfiles['prm'], topfile=CGfiles['top'])

    ## Backmap the coarse grained structure to all-atom
    backMapper = BackMapping(outdir=outdir)
    print(f'BackMapper: {backMapper}')

    # backmap (using original pdbfile as aa_pdb reference)
    backMapper.backmap(cg_pdb=CGfiles['cor'], aa_pdb=pdbfile, TAG=ID)

    print(f'NORMAL TERMINATION - {time.time() - start_time} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from EntDetect.gaussian_entanglement import GaussianEntanglement
|
|
2
|
+
from EntDetect.clustering import ClusterNativeEntanglements, MSMNonNativeEntanglementClustering
|
|
3
|
+
from EntDetect.order_params import CalculateOP, CollectOP
|
|
4
|
+
from EntDetect.compare_sim2exp import MassSpec
|
|
5
|
+
|
|
6
|
+
"""
|
|
7
|
+
Collect per-trajectory SASA/XP outputs (optional) and run the LiP-MS / XL-MS
|
|
8
|
+
consistency test.
|
|
9
|
+
|
|
10
|
+
Two usage modes:
|
|
11
|
+
|
|
12
|
+
1. Collect + run (provide --sasa_dir and --xp_dir):
|
|
13
|
+
python scripts/run_compare_sim2exp.py
|
|
14
|
+
--sasa_dir /path/to/OP_AA/SASA
|
|
15
|
+
--xp_dir /path/to/OP_AA/XP
|
|
16
|
+
--n_traj 1000
|
|
17
|
+
--n_frames 335
|
|
18
|
+
--msm_data_file ... (remaining args as below)
|
|
19
|
+
|
|
20
|
+
2. Skip collection (provide --sasa_data_file and --dist_data_file directly):
|
|
21
|
+
python scripts/run_compare_sim2exp.py
|
|
22
|
+
--sasa_data_file /path/to/SASA.npy
|
|
23
|
+
--dist_data_file /path/to/Jwalk.npy
|
|
24
|
+
--msm_data_file ...
|
|
25
|
+
|
|
26
|
+
Full example (mode 1):
|
|
27
|
+
python scripts/run_compare_sim2exp.py
|
|
28
|
+
--sasa_dir /path/to/OP_AA/SASA
|
|
29
|
+
--xp_dir /path/to/OP_AA/XP
|
|
30
|
+
--n_traj 1000
|
|
31
|
+
--n_frames 335
|
|
32
|
+
--msm_data_file /path/to/msm_data.csv
|
|
33
|
+
--meta_dist_file /path/to/meta_dist.npy
|
|
34
|
+
--LiPMS_exp_file /path/to/LiPMS_exp.xlsx
|
|
35
|
+
--XLMS_exp_file /path/to/XLMS_exp.xlsx
|
|
36
|
+
--cluster_data_file /path/to/cluster_data.npz
|
|
37
|
+
--OPpath /path/to/OP_AA/
|
|
38
|
+
--AAdcd_dir /path/to/aa_trajectories/
|
|
39
|
+
--native_AA_pdb /path/to/native.pdb
|
|
40
|
+
--state_idx_list 4 6 8
|
|
41
|
+
--prot_len 387
|
|
42
|
+
--last_num_frames 335
|
|
43
|
+
--rm_traj_list 65 75 155
|
|
44
|
+
--native_state_idx 9
|
|
45
|
+
--outdir /path/to/outdir/
|
|
46
|
+
--ID 1ZMR
|
|
47
|
+
--start 6600
|
|
48
|
+
--end -1
|
|
49
|
+
--stride 1
|
|
50
|
+
--num_perm 1000
|
|
51
|
+
--n_boot 100
|
|
52
|
+
--lag_frame 20
|
|
53
|
+
--nproc 10
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def main(argv=None):
    """Optionally collect per-trajectory SASA/XP output, then run the
    LiP-MS / XL-MS consistency test and select representative structures.

    Two input modes are supported:

    * collect mode: ``--sasa_dir`` and ``--xp_dir`` (plus ``--n_traj`` and
      ``--n_frames``) are given; ``CollectOP`` builds the SASA array and,
      only with ``--collect_jwalk_npy``, the Jwalk array. Otherwise
      ``dist_data_file`` is passed to ``MassSpec`` as ``None``.
    * direct mode: ``--sasa_data_file`` and ``--dist_data_file`` point at
      pre-built arrays.

    When both modes are satisfied, collect mode wins and the directly
    supplied array paths are ignored (a notice is printed).

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        int: 0 on normal termination.

    Raises:
        SystemExit: With code 2 (via argparse) when required arguments are
            missing or neither input mode is fully specified.
    """
    import argparse
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--msm_data_file", type=str, required=True, help="Path to MSM mapping file")
    parser.add_argument("--meta_dist_file", type=str, required=True, help="Path to meta-distance file")
    parser.add_argument("--LiPMS_exp_file", type=str, required=True, help="Path to LiP-MS experimental data file")
    parser.add_argument("--XLMS_exp_file", type=str, required=True, help="Path to XL-MS experimental data file")
    parser.add_argument("--cluster_data_file", type=str, required=True, help="Path to clustering data file")
    parser.add_argument("--OPpath", type=str, required=True, help="Path to order parameters directory")
    parser.add_argument("--AAdcd_dir", type=str, required=True, help="Path to all-atom DCD files directory")
    parser.add_argument("--native_AA_pdb", type=str, required=True, help="Path to native all-atom PDB file")
    parser.add_argument("--state_idx_list", type=int, nargs='+', required=True, help="List of state indices to analyze")
    parser.add_argument("--prot_len", type=int, required=True, help="Length of the protein")
    parser.add_argument("--last_num_frames", type=int, required=True, help="Number of last frames to consider")
    parser.add_argument("--rm_traj_list", type=int, nargs='+', required=True, help="List of trajectory indices to remove")
    parser.add_argument("--native_state_idx", type=int, required=True, help="Index of the native state")
    parser.add_argument("--outdir", type=str, required=True, help="Output directory for results")
    parser.add_argument("--ID", type=str, required=True, help="An ID for the analysis")
    parser.add_argument("--start", type=int, required=True, help="Start frame index")
    parser.add_argument("--end", type=int, required=True, help="End frame index")
    parser.add_argument("--stride", type=int, required=True, help="Stride for frame selection")
    parser.add_argument("--verbose", action='store_true', help="Enable verbose output")
    parser.add_argument("--num_perm", type=int, required=True, help="Number of permutations for statistical tests")
    parser.add_argument("--n_boot", type=int, required=True, help="Number of bootstrap samples")
    parser.add_argument("--lag_frame", type=int, required=True, help="Lag time in frames")
    parser.add_argument("--nproc", type=int, required=True, help="Number of processes for parallel computation")
    # --- CollectOP arguments (optional: collect from per-traj files) ---
    parser.add_argument("--sasa_dir", type=str, default=None,
                        help="Directory of per-traj {ID}_Traj{N}.SASA files. "
                             "If provided together with --xp_dir, CollectOP is run "
                             "before MassSpec and --sasa_data_file / --dist_data_file "
                             "are set automatically.")
    parser.add_argument("--xp_dir", type=str, default=None,
                        help="Directory of per-traj {ID}_Traj{N}.XP files (used with --sasa_dir).")
    parser.add_argument("--n_traj", type=int, default=None,
                        help="Total number of trajectories for CollectOP (required with --sasa_dir).")
    parser.add_argument("--n_frames", type=int, default=None,
                        help="Frames per trajectory stored in each file (required with --sasa_dir).")
    parser.add_argument("--collect_jwalk_npy", action='store_true',
                        help="Also build Jwalk.npy with CollectOP (legacy path). "
                             "By default, XL-MS scoring streams directly from XP files to reduce memory use.")
    # --- Direct array paths (used when skipping collection) ---
    parser.add_argument("--sasa_data_file", type=str, default=None,
                        help="Path to pre-built SASA.npy. Required if --sasa_dir is not provided.")
    parser.add_argument("--dist_data_file", type=str, default=None,
                        help="Path to pre-built Jwalk.npy. Required if --xp_dir is not provided.")
    # --- Representative-structure selection ---
    # These were hard-coded in the select_rep_structs call; the defaults
    # keep the previous values so existing command lines behave the same.
    parser.add_argument("--rep_total_frames", type=int, default=335,
                        help="total_traj_num_frames passed to select_rep_structs (default: 335)")
    parser.add_argument("--rep_last_frames", type=int, default=67,
                        help="last_num_frames passed to select_rep_structs (default: 67)")
    args = parser.parse_args(argv)
    print(args)

    outdir = args.outdir
    ID = args.ID
    prot_len = args.prot_len
    sasa_data_file = args.sasa_data_file
    dist_data_file = args.dist_data_file

    # ── validate input mode ────────────────────────────────────────────────
    collect_mode = args.sasa_dir is not None and args.xp_dir is not None
    direct_mode = sasa_data_file is not None and dist_data_file is not None

    if not collect_mode and not direct_mode:
        parser.error(
            "Provide either (--sasa_dir + --xp_dir + --n_traj + --n_frames) "
            "to collect from per-trajectory files, or "
            "(--sasa_data_file + --dist_data_file) to use pre-built arrays."
        )
    if collect_mode and (args.n_traj is None or args.n_frames is None):
        parser.error("--n_traj and --n_frames are required when using --sasa_dir / --xp_dir")

    # Create the output directory for both modes, and only after all
    # argument validation has passed.
    os.makedirs(outdir, exist_ok=True)

    # ── Step 1: collect per-trajectory outputs if requested ───────────────
    if collect_mode:
        if sasa_data_file is not None or dist_data_file is not None:
            # Collect mode takes precedence; make the override visible
            # instead of dropping the user's paths silently.
            print('NOTE: --sasa_dir/--xp_dir given; ignoring --sasa_data_file/--dist_data_file')

        collector = CollectOP(
            sasa_dir=args.sasa_dir,
            xp_dir=args.xp_dir,
            outdir=outdir,
            ID=ID,
            n_traj=args.n_traj,
            n_frames=args.n_frames,
            prot_len=prot_len,
        )
        sasa_data_file = collector.collect_SASA()
        # Without --collect_jwalk_npy no Jwalk array is built and MassSpec
        # receives dist_data_file=None together with xp_dir.
        dist_data_file = collector.collect_Jwalk() if args.collect_jwalk_npy else None
        print(f'CollectOP SASA: {sasa_data_file}')
        if dist_data_file is not None:
            print(f'CollectOP Jwalk: {dist_data_file}')
        else:
            print('CollectOP Jwalk: skipped (streaming XP mode enabled)')

    # ── Step 2: run the consistency test ──────────────────────────────────
    MS = MassSpec(msm_data_file=args.msm_data_file,
                  meta_dist_file=args.meta_dist_file,
                  LiPMS_exp_file=args.LiPMS_exp_file,
                  sasa_data_file=sasa_data_file,
                  XLMS_exp_file=args.XLMS_exp_file,
                  dist_data_file=dist_data_file,
                  cluster_data_file=args.cluster_data_file,
                  OPpath=args.OPpath,
                  AAdcd_dir=args.AAdcd_dir,
                  native_AA_pdb=args.native_AA_pdb,
                  xp_dir=args.xp_dir,
                  state_idx_list=args.state_idx_list,
                  prot_len=prot_len,
                  last_num_frames=args.last_num_frames,
                  rm_traj_list=args.rm_traj_list,
                  native_state_idx=args.native_state_idx,
                  outdir=outdir,
                  ID=ID,
                  start=args.start,
                  end=args.end,
                  stride=args.stride,
                  verbose=args.verbose,
                  num_perm=args.num_perm,
                  n_boot=args.n_boot,
                  lag_frame=args.lag_frame,
                  nproc=args.nproc)

    # run the consistency test
    consist_data_file, consist_result_file = MS.LiP_XL_MS_ConsistencyTest()
    print(f'consist_data_file: {consist_data_file}')
    print(f'consist_result_file: {consist_result_file}')

    # select the representative structures from the consistency test
    MS.select_rep_structs(consist_data_file, consist_result_file,
                          total_traj_num_frames=args.rep_total_frames,
                          last_num_frames=args.rep_last_frames)

    print(f'NORMAL TERMINATION - {time.time() - start_time} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from EntDetect.statistics import MonteCarlo
|
|
3
|
+
from EntDetect._logging import setup_logger
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
Run Workflow 4 Monte Carlo subpopulation selection.
|
|
7
|
+
|
|
8
|
+
This script wraps EntDetect.statistics.MonteCarlo and optimizes population
|
|
9
|
+
partitions using a logistic-regression objective and penalty terms.
|
|
10
|
+
|
|
11
|
+
Example
|
|
12
|
+
-------
|
|
13
|
+
python scripts/run_montecarlo.py \
|
|
14
|
+
--dataframe_files /path/to/residue_dataframes_workflow4.csv \
|
|
15
|
+
--outpath /path/to/workflow4/monte_carlo/ \
|
|
16
|
+
--gene_list /path/to/gene_list.txt \
|
|
17
|
+
--tag Ecoli_population_mc \
|
|
18
|
+
--steps 100000 \
|
|
19
|
+
--n_groups 4 \
|
|
20
|
+
--C1 1.0 \
|
|
21
|
+
--C2 2.5 \
|
|
22
|
+
--beta 0.05
|
|
23
|
+
|
|
24
|
+
Expected input schema
|
|
25
|
+
---------------------
|
|
26
|
+
- Either a single combined design matrix file OR one residue table per protein
|
|
27
|
+
- Files are pipe-delimited ("|")
|
|
28
|
+
- Required columns: gene, mapped_resid, uniprot_length, AA, region, cut_C_Rall
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def main(argv=None):
    """Run the Workflow 4 Monte Carlo subpopulation selection.

    Parses and validates the command line, configures the loggers, builds a
    ``MonteCarlo`` object, loads the pipe-delimited residue-level data and
    runs the simulation.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        int: 0 on normal termination.

    Raises:
        SystemExit: With code 2 (via argparse) when arguments are missing,
            an input path does not exist, or ``--n_groups`` / ``--steps``
            is smaller than 1.
    """
    import argparse
    import logging
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(
        description="Run Workflow 4 Monte Carlo subpopulation selection."
    )

    # --- required IO ---
    parser.add_argument("--dataframe_files", type=str, required=True,
                        help="Input design matrix path: either a directory of per-protein files or a single combined CSV")
    parser.add_argument("--outpath", type=str, required=True,
                        help="Output directory for Monte Carlo results")
    parser.add_argument("--gene_list", type=str, required=True,
                        help="Path to gene list file (one ID per line)")
    parser.add_argument("--tag", type=str, required=True,
                        help="Identifier tag for output naming")

    # --- model options ---
    parser.add_argument("--reg_formula", type=str, default='cut_C_Rall ~ region + AA',
                        help="Regression formula used by state scoring")
    parser.add_argument("--response_var", type=str, default='cut_C_Rall',
                        help="Response variable in regression")
    parser.add_argument("--test_var", type=str, default='region',
                        help="Primary test variable")
    parser.add_argument("--random", action='store_true',
                        help="Use random sampling mode")
    parser.add_argument("--n_groups", type=int, default=4,
                        help="Number of groups (default: 4)")
    parser.add_argument("--steps", type=int, default=100000,
                        help="Number of Monte Carlo steps (default: 100000)")
    parser.add_argument("--C1", type=float, default=1.0,
                        help="Monte Carlo objective weight C1")
    parser.add_argument("--C2", type=float, default=2.5,
                        help="Monte Carlo objective weight C2")
    parser.add_argument("--beta", type=float, default=0.05,
                        help="Inverse temperature/annealing parameter beta")
    parser.add_argument("--linearT", action='store_true',
                        help="Use linear temperature schedule")

    # --- logging ---
    parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging verbosity (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None,
                        help="Directory for log files (default: same as --outpath)")

    args = parser.parse_args(argv)

    dataframe_files = args.dataframe_files
    outdir = args.outpath
    gene_list = args.gene_list
    tag = args.tag
    response_var = args.response_var
    n_groups = args.n_groups
    steps = args.steps
    # Local is deliberately not called `random` so it cannot be confused
    # with the standard-library module of that name.
    use_random = args.random

    # --- input validation ---
    # Done before any directory or log file is created so a bad command
    # line leaves no artefacts behind.
    if not os.path.exists(dataframe_files):
        parser.error(f"--dataframe_files does not exist: {dataframe_files}")
    if not os.path.isfile(gene_list):
        parser.error(f"--gene_list does not exist or is not a file: {gene_list}")
    if n_groups < 1:
        parser.error("--n_groups must be >= 1")
    if steps < 1:
        parser.error("--steps must be >= 1")

    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = args.logdir if args.logdir is not None else outdir
    os.makedirs(logdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)

    logger = setup_logger('run_montecarlo', outdir=logdir, ID=tag, log_level=log_level)
    setup_logger('MonteCarlo', outdir=logdir, ID=tag, log_level=log_level)
    logger.info(f'args: {args}')

    # --- step 1: initialize Monte Carlo object ---
    MC = MonteCarlo(
        dataframe_files=dataframe_files,
        outdir=outdir,
        gene_list=gene_list,
        ID=tag,
        reg_formula=args.reg_formula,
        response_var=response_var,
        test_var=args.test_var,
        random=use_random,
        n_groups=n_groups,
        steps=steps,
        C1=args.C1,
        C2=args.C2,
        beta=args.beta,
        linearT=args.linearT,
        log_level=log_level,
        logdir=logdir,
    )
    logger.info(f'MonteCarlo: {MC}')

    # --- step 2: load residue-level data ---
    # The response column follows --response_var (default 'cut_C_Rall')
    # rather than a hard-coded name, so the loader agrees with the
    # MonteCarlo object built above.
    # NOTE(review): reg_var and the 'region' entry of var2binarize are still
    # fixed; confirm whether they should track --reg_formula / --test_var.
    MC.load_data(
        sep='|',
        reg_var=['AA', 'region'],
        response_var=response_var,
        var2binarize=[response_var, 'region'],
        mask_column='mapped_resid',
        ID_column='gene',
        Length_column='uniprot_length',
    )

    # --- step 3: run simulation ---
    MC.run(encoded_df=MC.data, ID_column='gene')

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from EntDetect.gaussian_entanglement import GaussianEntanglement
|
|
3
|
+
from EntDetect.clustering import ClusterNativeEntanglements
|
|
4
|
+
from EntDetect.entanglement_features import FeatureGen
|
|
5
|
+
from EntDetect._logging import setup_logger
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
Script to calculate native Gaussian entanglements in a given structure (PDB or COR file),
|
|
9
|
+
filter for high-quality entanglements, cluster them, and generate entanglement features.
|
|
10
|
+
|
|
11
|
+
Usage example (1ZMR / ecPGK):
|
|
12
|
+
python scripts/run_nativeNCLE.py \\
|
|
13
|
+
--struct /scratch/ims86/EntDetect_Datastore/user_input/reference_structures/1zmr_model_clean.pdb \\
|
|
14
|
+
--outdir /scratch/ims86/EntDetect_Datastore/outputs/workflow1 \\
|
|
15
|
+
--ID 1ZMR \\
|
|
16
|
+
--chain A \\
|
|
17
|
+
--organism Ecoli \\
|
|
18
|
+
--Accession P00558 \\
|
|
19
|
+
--model EXP
|
|
20
|
+
|
|
21
|
+
Arguments:
|
|
22
|
+
--struct Path to input PDB (or COR) structure file [required]
|
|
23
|
+
--outdir Root output directory; sub-dirs are created automatically [required]
|
|
24
|
+
--ID Identifier for the analysis (default: structure basename)
|
|
25
|
+
--chain Chain ID to process; omit to process all chains
|
|
26
|
+
--organism Reference proteome for clustering: Ecoli | Human | Yeast (default: Ecoli)
|
|
27
|
+
--Accession UniProt accession used in feature-file naming (default: P00558)
|
|
28
|
+
--model Structure type for HQ filtering: EXP | AF (default: EXP)
|
|
29
|
+
--resolution Structure resolution: aa | cg (overrides --cg flag)
|
|
30
|
+
--contacts Contact definition: heavy | calpha
|
|
31
|
+
--cg Flag: input is a coarse-grained C-alpha model (legacy; prefer --resolution cg)
|
|
32
|
+
--Calpha Flag: use C-alpha contacts (legacy; prefer --contacts calpha)
|
|
33
|
+
--cluster_cutoff Clustering distance cutoff in Å; if omitted, uses the
|
|
34
|
+
organism-specific default (Ecoli: 57, Human: 52, Yeast: 49)
|
|
35
|
+
--ent_detection_method
|
|
36
|
+
Entanglement detection criterion:
|
|
37
|
+
1 = any nonzero GLN for either termini
|
|
38
|
+
2 = any nonzero TLN for either termini (class default)
|
|
39
|
+
3 = both GLN and TLN nonzero for same termini (recommended; script default)
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main(argv=None):
    """Detect, filter, cluster and featurize native entanglements.

    For every selected chain of the input structure the script:
      1. calculates native Gaussian entanglements,
      2. selects the high-quality entanglements,
      3. clusters them, and
      4. generates entanglement features for the clustered set.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        int: 0 on normal termination.

    Raises:
        SystemExit: With code 2 (via argparse) when arguments are missing or
            invalid, ``--ent_detection_method`` is not 1, 2 or 3, or
            ``--struct`` is not an existing file.
    """
    import argparse
    import logging
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--struct", type=str, required=True, help="Path to PDB structure file")
    parser.add_argument("--outdir", type=str, required=True, help="output directory for results")
    parser.add_argument("--ID", type=str, required=False, help="An id for the analysis (defaults to structure basename)")
    parser.add_argument("--chain", type=str, required=False, help="Chain identifier (optional, processes all chains if not specified)", default=None)
    parser.add_argument("--organism", type=str, required=False, help="Organism name for clustering: {Ecoli, Human, Yeast}", default='Ecoli')
    parser.add_argument("--Accession", type=str, required=False, help="UniProt Accession for the protein", default='P00558')
    parser.add_argument("--cg", action='store_true', help="Indicate structure is coarse-grained (C-alpha only) model")
    parser.add_argument(
        "--Calpha",
        "--calpha",
        action='store_true',
        dest="Calpha",
        help="Use C-alpha atoms for contact definition (legacy flag; prefer --contacts calpha)",
    )
    parser.add_argument(
        "--resolution",
        type=str,
        choices=["aa", "cg"],
        default=None,
        help="Structure resolution: 'aa' (all-atom) or 'cg' (C-alpha coarse-grained). If set, this overrides --cg.",
    )
    parser.add_argument(
        "--contacts",
        type=str,
        choices=["heavy", "calpha"],
        default=None,
        help="Contact definition to use: 'heavy' (all-atom) or 'calpha'. If omitted, defaults to heavy for aa and calpha for cg.",
    )
    parser.add_argument("--cluster_cutoff", type=float, required=False,
                        help="Clustering distance cutoff in Å. If omitted, uses the organism-specific default (Ecoli: 57, Human: 52, Yeast: 49).",
                        default=None)
    parser.add_argument("--model", type=str, required=False, help="Model type for high-quality selection: {EXP, AF}", default='EXP')
    # The help text used to label method 2 as the default although the
    # script default is 3; `choices` rejects undocumented method numbers at
    # parse time.
    parser.add_argument("--ent_detection_method", type=int, required=False, choices=[1, 2, 3],
                        help="ENT detection method: 1=any GLN, 2=any TLN, 3=both GLN and TLN same termini (default)", default=3)
    parser.add_argument("--log_level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging verbosity level (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None,
                        help="Directory for log file. Defaults to --outdir if not specified.")
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)

    struct = args.struct
    outdir = args.outdir

    # Fail fast before any directory or log file is created.
    if not os.path.isfile(struct):
        parser.error(f"--struct does not exist or is not a file: {struct}")

    os.makedirs(outdir, exist_ok=True)
    ID = args.ID if args.ID is not None else os.path.splitext(os.path.basename(struct))[0]
    logdir = args.logdir if args.logdir is not None else outdir
    # Pre-configure all EntDetect loggers for this run so they share one log file
    logger = setup_logger('run_nativeNCLE', outdir=logdir, ID=ID, log_level=log_level)
    for _cls in ['GaussianEntanglement', 'ClusterNativeEntanglements', 'FeatureGen']:
        setup_logger(_cls, outdir=logdir, ID=ID, log_level=log_level)
    logger.info(f'args: {args}')
    chain = args.chain
    organism = args.organism
    cluster_cutoff = args.cluster_cutoff
    model = args.model

    # Derive effective resolution/contact settings while keeping legacy flags working.
    # If neither --resolution nor --contacts are provided, behavior matches historical defaults:
    #   - all-atom (CG=False)
    #   - heavy-atom contacts (Calpha=False)
    if args.resolution is None:
        CG = bool(args.cg)
    else:
        CG = args.resolution == "cg"

    if args.contacts is not None:
        Calpha = args.contacts == "calpha"
    elif args.resolution is None:
        # Legacy path: only the --Calpha flag decides the contact type.
        Calpha = bool(args.Calpha)
    else:
        # --resolution given without --contacts: contacts follow resolution
        # and the legacy --Calpha flag is not consulted.
        Calpha = CG

    # Set up Gaussian Entanglement and Clustering objects
    ge = GaussianEntanglement(g_threshold=0.6, density=1.0, Calpha=Calpha, CG=CG, ent_detection_method=args.ent_detection_method, log_level=log_level, logdir=logdir)
    clustering = ClusterNativeEntanglements(organism=organism, cut_off=cluster_cutoff, log_level=log_level, logdir=logdir)

    # Determine which chains to process
    if chain is not None:
        chains_to_process = [chain]
    else:
        # Get all chains from the structure
        import MDAnalysis as mda
        u = mda.Universe(struct)
        # NOTE(review): atoms with an empty segid but a chainID are labelled
        # 'A' rather than by their chainID — confirm this is intended.
        chains_to_process = sorted({atom.segid if atom.segid else 'A' for atom in u.atoms if atom.segid or atom.chainID})
        if not chains_to_process or chains_to_process == ['']:
            # Fallback: use mdtraj to get chains
            import mdtraj as md
            traj = md.load(struct)
            chains_to_process = sorted(set([c.chain_id for c in traj.topology.chains]))
    logger.info(f'Processing chains: {chains_to_process}')

    # All chains use the same Native_GE directory
    ge_outdir = os.path.join(outdir, 'Native_GE')
    os.makedirs(ge_outdir, exist_ok=True)

    # Process each chain separately for all steps
    for chain_id in chains_to_process:
        logger.info(f"{'='*80}\nProcessing chain {chain_id}\n{'='*80}")

        # Use chain suffix for file naming when processing multiple chains
        if len(chains_to_process) > 1:
            hq_id = f"{ID}_{chain_id}"
        else:
            hq_id = ID

        # Calculate native entanglements for this chain
        NativeEnt = ge.calculate_native_entanglements(struct, outdir=ge_outdir, ID=hq_id, chain=chain_id)
        logger.info(f'Native entanglements saved to {NativeEnt["outfile"]}')

        # Optional steps: select high-quality entanglements
        HQNativeEnt = ge.select_high_quality_entanglements(NativeEnt['outfile'], struct, outdir=os.path.join(outdir, "Native_HQ_GE"), ID=hq_id, model=model, chain=chain_id)
        logger.info(f'High-quality native entanglements saved to {HQNativeEnt["outfile"]}')

        # Cluster the native entanglements to remove degeneracies
        nativeClusteredEnt = clustering.Cluster_NativeEntanglements(HQNativeEnt['outfile'], outdir=os.path.join(outdir, "Native_clustered_HQ_GE"), outfile=f"{hq_id}.csv", chain=chain_id)
        logger.info(f'Clustered native entanglements saved to {nativeClusteredEnt["outfile"]}')

        # Generate entanglement features for clustered native entanglements
        FGen = FeatureGen(struct, outdir=os.path.join(outdir, "Native_clustered_HQ_GE_features"), cluster_file=nativeClusteredEnt['outfile'], log_level=log_level, logdir=logdir)
        EntFeatures = FGen.get_uent_features(gene=args.Accession, chain=chain_id, pdbid=ID)
        logger.info(f'Entanglement features saved to {EntFeatures["outfile"]}')

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|