EntDetect 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. EntDetect/Jwalk/GridTools.py +567 -0
  2. EntDetect/Jwalk/PDBTools.py +532 -0
  3. EntDetect/Jwalk/SASDTools.py +543 -0
  4. EntDetect/Jwalk/SurfaceTools.py +150 -0
  5. EntDetect/Jwalk/__init__.py +19 -0
  6. EntDetect/Jwalk/naccess.config.txt +255 -0
  7. EntDetect/__init__.py +10 -0
  8. EntDetect/_logging.py +71 -0
  9. EntDetect/change_resolution.py +2361 -0
  10. EntDetect/clustering.py +2626 -0
  11. EntDetect/compare_sim2exp.py +1927 -0
  12. EntDetect/entanglement_features.py +478 -0
  13. EntDetect/gaussian_entanglement.py +2067 -0
  14. EntDetect/order_params.py +1048 -0
  15. EntDetect/resources/__init__.py +11 -0
  16. EntDetect/resources/__pycache__/__init__.cpython-311.pyc +0 -0
  17. EntDetect/resources/calc_K.pl +712 -0
  18. EntDetect/resources/calc_Q.pl +962 -0
  19. EntDetect/resources/pulchra +0 -0
  20. EntDetect/resources/shared_files/__init__.py +2 -0
  21. EntDetect/resources/shared_files/bt_contact_potential.dat +22 -0
  22. EntDetect/resources/shared_files/karanicolas_dihe_parm.dat +1600 -0
  23. EntDetect/resources/shared_files/kgs_contact_potential.dat +22 -0
  24. EntDetect/resources/shared_files/mj_contact_potential.dat +22 -0
  25. EntDetect/resources/stride +0 -0
  26. EntDetect/statistics.py +1344 -0
  27. EntDetect/utilities.py +201 -0
  28. entdetect-1.2.0.dist-info/METADATA +26 -0
  29. entdetect-1.2.0.dist-info/RECORD +45 -0
  30. entdetect-1.2.0.dist-info/WHEEL +5 -0
  31. entdetect-1.2.0.dist-info/entry_points.txt +11 -0
  32. entdetect-1.2.0.dist-info/licenses/LICENSE +674 -0
  33. entdetect-1.2.0.dist-info/top_level.txt +2 -0
  34. scripts/__init__.py +5 -0
  35. scripts/convert_cor_psf_to_pdb.py +103 -0
  36. scripts/run_Foldingpathway.py +162 -0
  37. scripts/run_MSM.py +152 -0
  38. scripts/run_OP_on_simulation_traj.py +194 -0
  39. scripts/run_change_resolution.py +63 -0
  40. scripts/run_compare_sim2exp.py +215 -0
  41. scripts/run_montecarlo.py +158 -0
  42. scripts/run_nativeNCLE.py +179 -0
  43. scripts/run_nonnative_entanglement_clustering.py +110 -0
  44. scripts/run_population_modeling.py +117 -0
  45. scripts/run_workflow4_nativeNCLE_batch.py +412 -0
@@ -0,0 +1,63 @@
1
+ """
2
+ python scripts/run_change_resolution.py
3
+ --outdir TestingGrounds/CoarseGraining/
4
+ --pdbfile /path/to/structure.pdb
5
+ --nscal 2
6
+ --domain_file /path/to/domain_def.dat
7
+ --ID test_protein
8
+ """
9
+
10
def main(argv=None):
    """Coarse-grain an all-atom structure and back-map the result to all-atom.

    The command line is parsed first; ``EntDetect.change_resolution`` is only
    imported afterwards so that ``-h`` works without its heavy dependencies.
    ``CoarseGrain.run()`` is expected to return a mapping with at least the
    keys ``'prm'``, ``'top'`` and ``'cor'``, which are consumed below.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        0 on normal termination.

    Raises:
        SystemExit: Raised by argparse for ``-h``, for malformed arguments, or
            when ``--pdbfile`` / ``--domain_file`` is not an existing file.
    """
    import argparse
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--outdir", type=str, required=True, help="output directory for results")
    parser.add_argument("--pdbfile", type=str, required=True, help="Path to the all-atom PDB file")
    parser.add_argument("--nscal", type=int, default=2, help="Coarse graining scale factor")
    parser.add_argument("--domain_file", type=str, required=True, help="Path to the domain definition file")
    parser.add_argument("--ID", type=str, required=True, help="Tag for the coarse graining process")
    args = parser.parse_args(argv)
    print(args)

    # Fail fast with a clear usage error instead of a traceback from deep
    # inside the coarse-graining run when an input path is wrong.
    for flag, path in (("--pdbfile", args.pdbfile), ("--domain_file", args.domain_file)):
        if not os.path.isfile(path):
            parser.error(f"{flag} does not exist or is not a file: {path}")

    outdir = args.outdir
    pdbfile = args.pdbfile
    ID = args.ID

    # Import here so `run_change_resolution -h` works without OpenMM installed.
    from EntDetect.change_resolution import CoarseGrain, BackMapping

    ## Coarse grain the all-atom structure
    CoarseGrainer = CoarseGrain(outdir=outdir,
                                ID=ID,
                                pdbfile=pdbfile,
                                nscal=args.nscal,
                                domain_file=args.domain_file)
    print(CoarseGrainer)

    # run the coarse graining for the all-atom pdbfile
    CGfiles = CoarseGrainer.run()
    print(CGfiles)

    # parse the prm and top file to make a OpenMM compatible .xml force field file
    CoarseGrainer.parse_cg_prm(prmfile=CGfiles['prm'], topfile=CGfiles['top'])

    ## Backmap the coarse grained structure to all-atom
    backMapper = BackMapping(outdir=outdir)
    print(f'BackMapper: {backMapper}')

    # backmap (using original pdbfile as aa_pdb reference)
    backMapper.backmap(cg_pdb=CGfiles['cor'], aa_pdb=pdbfile, TAG=ID)

    print(f'NORMAL TERMINATION - {time.time() - start_time} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,215 @@
1
+ from EntDetect.gaussian_entanglement import GaussianEntanglement
2
+ from EntDetect.clustering import ClusterNativeEntanglements, MSMNonNativeEntanglementClustering
3
+ from EntDetect.order_params import CalculateOP, CollectOP
4
+ from EntDetect.compare_sim2exp import MassSpec
5
+
6
+ """
7
+ Collect per-trajectory SASA/XP outputs (optional) and run the LiP-MS / XL-MS
8
+ consistency test.
9
+
10
+ Two usage modes:
11
+
12
+ 1. Collect + run (provide --sasa_dir and --xp_dir):
13
+ python scripts/run_compare_sim2exp.py
14
+ --sasa_dir /path/to/OP_AA/SASA
15
+ --xp_dir /path/to/OP_AA/XP
16
+ --n_traj 1000
17
+ --n_frames 335
18
+ --msm_data_file ... (remaining args as below)
19
+
20
+ 2. Skip collection (provide --sasa_data_file and --dist_data_file directly):
21
+ python scripts/run_compare_sim2exp.py
22
+ --sasa_data_file /path/to/SASA.npy
23
+ --dist_data_file /path/to/Jwalk.npy
24
+ --msm_data_file ...
25
+
26
+ Full example (mode 1):
27
+ python scripts/run_compare_sim2exp.py
28
+ --sasa_dir /path/to/OP_AA/SASA
29
+ --xp_dir /path/to/OP_AA/XP
30
+ --n_traj 1000
31
+ --n_frames 335
32
+ --msm_data_file /path/to/msm_data.csv
33
+ --meta_dist_file /path/to/meta_dist.npy
34
+ --LiPMS_exp_file /path/to/LiPMS_exp.xlsx
35
+ --XLMS_exp_file /path/to/XLMS_exp.xlsx
36
+ --cluster_data_file /path/to/cluster_data.npz
37
+ --OPpath /path/to/OP_AA/
38
+ --AAdcd_dir /path/to/aa_trajectories/
39
+ --native_AA_pdb /path/to/native.pdb
40
+ --state_idx_list 4 6 8
41
+ --prot_len 387
42
+ --last_num_frames 335
43
+ --rm_traj_list 65 75 155
44
+ --native_state_idx 9
45
+ --outdir /path/to/outdir/
46
+ --ID 1ZMR
47
+ --start 6600
48
+ --end -1
49
+ --stride 1
50
+ --num_perm 1000
51
+ --n_boot 100
52
+ --lag_frame 20
53
+ --nproc 10
54
+ """
55
+
56
def main(argv=None):
    """Optionally collect per-trajectory SASA/XP outputs, then run the LiP-MS / XL-MS consistency test.

    Two input modes are accepted:

    * collect mode: ``--sasa_dir`` and ``--xp_dir`` (plus ``--n_traj`` and
      ``--n_frames``); ``CollectOP`` builds the SASA array, and the Jwalk
      array too when ``--collect_jwalk_npy`` is set.
    * direct mode: ``--sasa_data_file`` and ``--dist_data_file`` point at
      pre-built arrays.

    When both pairs are supplied, collection runs and overrides the direct
    file paths (unchanged from the original behaviour).

    After the consistency test, representative structures are selected. The
    frame counts used for that selection used to be hard-coded (335 / 67);
    they are now the defaults of ``--rep_total_traj_num_frames`` and
    ``--rep_last_num_frames`` so other datasets can override them.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        0 on normal termination.

    Raises:
        SystemExit: Raised by argparse for ``-h``, malformed arguments, or an
            incomplete input-mode specification.
    """
    import argparse
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--msm_data_file", type=str, required=True, help="Path to MSM mapping file")
    parser.add_argument("--meta_dist_file", type=str, required=True, help="Path to meta-distance file")
    parser.add_argument("--LiPMS_exp_file", type=str, required=True, help="Path to LiP-MS experimental data file")
    parser.add_argument("--XLMS_exp_file", type=str, required=True, help="Path to XL-MS experimental data file")
    parser.add_argument("--cluster_data_file", type=str, required=True, help="Path to clustering data file")
    parser.add_argument("--OPpath", type=str, required=True, help="Path to order parameters directory")
    parser.add_argument("--AAdcd_dir", type=str, required=True, help="Path to all-atom DCD files directory")
    parser.add_argument("--native_AA_pdb", type=str, required=True, help="Path to native all-atom PDB file")
    parser.add_argument("--state_idx_list", type=int, nargs='+', required=True, help="List of state indices to analyze")
    parser.add_argument("--prot_len", type=int, required=True, help="Length of the protein")
    parser.add_argument("--last_num_frames", type=int, required=True, help="Number of last frames to consider")
    parser.add_argument("--rm_traj_list", type=int, nargs='+', required=True, help="List of trajectory indices to remove")
    parser.add_argument("--native_state_idx", type=int, required=True, help="Index of the native state")
    parser.add_argument("--outdir", type=str, required=True, help="Output directory for results")
    parser.add_argument("--ID", type=str, required=True, help="An ID for the analysis")
    parser.add_argument("--start", type=int, required=True, help="Start frame index")
    parser.add_argument("--end", type=int, required=True, help="End frame index")
    parser.add_argument("--stride", type=int, required=True, help="Stride for frame selection")
    parser.add_argument("--verbose", action='store_true', help="Enable verbose output")
    parser.add_argument("--num_perm", type=int, required=True, help="Number of permutations for statistical tests")
    parser.add_argument("--n_boot", type=int, required=True, help="Number of bootstrap samples")
    parser.add_argument("--lag_frame", type=int, required=True, help="Lag time in frames")
    parser.add_argument("--nproc", type=int, required=True, help="Number of processes for parallel computation")
    # --- CollectOP arguments (optional: collect from per-traj files) ---
    parser.add_argument("--sasa_dir", type=str, default=None,
                        help="Directory of per-traj {ID}_Traj{N}.SASA files. "
                             "If provided together with --xp_dir, CollectOP is run "
                             "before MassSpec and --sasa_data_file / --dist_data_file "
                             "are set automatically.")
    parser.add_argument("--xp_dir", type=str, default=None,
                        help="Directory of per-traj {ID}_Traj{N}.XP files (used with --sasa_dir).")
    parser.add_argument("--n_traj", type=int, default=None,
                        help="Total number of trajectories for CollectOP (required with --sasa_dir).")
    parser.add_argument("--n_frames", type=int, default=None,
                        help="Frames per trajectory stored in each file (required with --sasa_dir).")
    parser.add_argument("--collect_jwalk_npy", action='store_true',
                        help="Also build Jwalk.npy with CollectOP (legacy path). "
                             "By default, XL-MS scoring streams directly from XP files to reduce memory use.")
    # --- Direct array paths (used when skipping collection) ---
    parser.add_argument("--sasa_data_file", type=str, default=None,
                        help="Path to pre-built SASA.npy. Required if --sasa_dir is not provided.")
    parser.add_argument("--dist_data_file", type=str, default=None,
                        help="Path to pre-built Jwalk.npy. Required if --xp_dir is not provided.")
    # --- Representative-structure selection (previously hard-coded) ---
    parser.add_argument("--rep_total_traj_num_frames", type=int, default=335,
                        help="total_traj_num_frames passed to select_rep_structs (default: 335)")
    parser.add_argument("--rep_last_num_frames", type=int, default=67,
                        help="last_num_frames passed to select_rep_structs (default: 67)")
    args = parser.parse_args(argv)
    print(args)

    sasa_data_file = args.sasa_data_file
    dist_data_file = args.dist_data_file
    outdir = args.outdir

    # ── validate input mode ────────────────────────────────────────────────
    collect_mode = args.sasa_dir is not None and args.xp_dir is not None
    direct_mode = sasa_data_file is not None and dist_data_file is not None

    if not (collect_mode or direct_mode):
        parser.error(
            "Provide either (--sasa_dir + --xp_dir + --n_traj + --n_frames) "
            "to collect from per-trajectory files, or "
            "(--sasa_data_file + --dist_data_file) to use pre-built arrays."
        )
    if collect_mode and (args.n_traj is None or args.n_frames is None):
        parser.error("--n_traj and --n_frames are required when using --sasa_dir / --xp_dir")

    # Create the output directory for both modes (the original only did so
    # when collecting); done after validation so bad input leaves no trace.
    os.makedirs(outdir, exist_ok=True)

    # ── Step 1: collect per-trajectory outputs if requested ───────────────
    if collect_mode:
        collector = CollectOP(
            sasa_dir=args.sasa_dir,
            xp_dir=args.xp_dir,
            outdir=outdir,
            ID=args.ID,
            n_traj=args.n_traj,
            n_frames=args.n_frames,
            prot_len=args.prot_len,
        )
        sasa_data_file = collector.collect_SASA()
        # Without --collect_jwalk_npy no distance array is built; MassSpec is
        # then handed dist_data_file=None together with xp_dir.
        dist_data_file = collector.collect_Jwalk() if args.collect_jwalk_npy else None
        print(f'CollectOP SASA: {sasa_data_file}')
        if dist_data_file is not None:
            print(f'CollectOP Jwalk: {dist_data_file}')
        else:
            print('CollectOP Jwalk: skipped (streaming XP mode enabled)')

    # ── Step 2: run the consistency test ──────────────────────────────────
    MS = MassSpec(msm_data_file=args.msm_data_file,
                  meta_dist_file=args.meta_dist_file,
                  LiPMS_exp_file=args.LiPMS_exp_file,
                  sasa_data_file=sasa_data_file,
                  XLMS_exp_file=args.XLMS_exp_file,
                  dist_data_file=dist_data_file,
                  cluster_data_file=args.cluster_data_file,
                  OPpath=args.OPpath,
                  AAdcd_dir=args.AAdcd_dir,
                  native_AA_pdb=args.native_AA_pdb,
                  xp_dir=args.xp_dir,
                  state_idx_list=args.state_idx_list,
                  prot_len=args.prot_len,
                  last_num_frames=args.last_num_frames,
                  rm_traj_list=args.rm_traj_list,
                  native_state_idx=args.native_state_idx,
                  outdir=outdir,
                  ID=args.ID,
                  start=args.start,
                  end=args.end,
                  stride=args.stride,
                  verbose=args.verbose,
                  num_perm=args.num_perm,
                  n_boot=args.n_boot,
                  lag_frame=args.lag_frame,
                  nproc=args.nproc)

    # run the consistency test
    consist_data_file, consist_result_file = MS.LiP_XL_MS_ConsistencyTest()
    print(f'consist_data_file: {consist_data_file}')
    print(f'consist_result_file: {consist_result_file}')

    # select the representative structures from the consistency test
    MS.select_rep_structs(consist_data_file, consist_result_file,
                          total_traj_num_frames=args.rep_total_traj_num_frames,
                          last_num_frames=args.rep_last_num_frames)

    print(f'NORMAL TERMINATION - {time.time() - start_time} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,158 @@
1
+ #!/usr/bin/env python3
2
+ from EntDetect.statistics import MonteCarlo
3
+ from EntDetect._logging import setup_logger
4
+
5
+ """
6
+ Run Workflow 4 Monte Carlo subpopulation selection.
7
+
8
+ This script wraps EntDetect.statistics.MonteCarlo and optimizes population
9
+ partitions using a logistic-regression objective and penalty terms.
10
+
11
+ Example
12
+ -------
13
+ python scripts/run_montecarlo.py \
14
+ --dataframe_files /path/to/residue_dataframes_workflow4.csv \
15
+ --outpath /path/to/workflow4/monte_carlo/ \
16
+ --gene_list /path/to/gene_list.txt \
17
+ --tag Ecoli_population_mc \
18
+ --steps 100000 \
19
+ --n_groups 4 \
20
+ --C1 1.0 \
21
+ --C2 2.5 \
22
+ --beta 0.05
23
+
24
+ Expected input schema
25
+ ---------------------
26
+ - Either a single combined design matrix file OR one residue table per protein
27
+ - Files are pipe-delimited ("|")
28
+ - Required columns: gene, mapped_resid, uniprot_length, AA, region, cut_C_Rall
29
+ """
30
+
31
def main(argv=None):
    """Run the Workflow 4 Monte Carlo subpopulation selection from the command line.

    Parses the arguments, validates the inputs, configures the
    ``run_montecarlo`` and ``MonteCarlo`` loggers, then builds a
    ``MonteCarlo`` object, loads the pipe-delimited residue data and runs the
    simulation.

    Input validation happens before any directory or log file is created, so
    an invalid invocation leaves nothing behind on disk. ``--response_var`` is
    passed on to ``load_data`` (it was previously hard-coded there to
    ``'cut_C_Rall'``, which is still the default).

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        0 on normal termination.

    Raises:
        SystemExit: Raised by argparse for ``-h``, malformed arguments, or
            failed input validation.
    """
    import argparse
    import logging
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(
        description="Run Workflow 4 Monte Carlo subpopulation selection."
    )

    # --- required IO ---
    parser.add_argument("--dataframe_files", type=str, required=True,
                        help="Input design matrix path: either a directory of per-protein files or a single combined CSV")
    parser.add_argument("--outpath", type=str, required=True,
                        help="Output directory for Monte Carlo results")
    parser.add_argument("--gene_list", type=str, required=True,
                        help="Path to gene list file (one ID per line)")
    parser.add_argument("--tag", type=str, required=True,
                        help="Identifier tag for output naming")

    # --- model options ---
    parser.add_argument("--reg_formula", type=str, default='cut_C_Rall ~ region + AA',
                        help="Regression formula used by state scoring")
    parser.add_argument("--response_var", type=str, default='cut_C_Rall',
                        help="Response variable in regression")
    parser.add_argument("--test_var", type=str, default='region',
                        help="Primary test variable")
    parser.add_argument("--random", action='store_true',
                        help="Use random sampling mode")
    parser.add_argument("--n_groups", type=int, default=4,
                        help="Number of groups (default: 4)")
    parser.add_argument("--steps", type=int, default=100000,
                        help="Number of Monte Carlo steps (default: 100000)")
    parser.add_argument("--C1", type=float, default=1.0,
                        help="Monte Carlo objective weight C1")
    parser.add_argument("--C2", type=float, default=2.5,
                        help="Monte Carlo objective weight C2")
    parser.add_argument("--beta", type=float, default=0.05,
                        help="Inverse temperature/annealing parameter beta")
    parser.add_argument("--linearT", action='store_true',
                        help="Use linear temperature schedule")

    # --- logging ---
    parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging verbosity (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None,
                        help="Directory for log files (default: same as --outpath)")

    args = parser.parse_args(argv)

    dataframe_files = args.dataframe_files
    outdir = args.outpath
    gene_list = args.gene_list
    tag = args.tag
    response_var = args.response_var

    # --- input validation (before any filesystem side effects) ---
    if not os.path.exists(dataframe_files):
        parser.error(f"--dataframe_files does not exist: {dataframe_files}")
    if not os.path.isfile(gene_list):
        parser.error(f"--gene_list does not exist or is not a file: {gene_list}")
    if args.n_groups < 1:
        parser.error("--n_groups must be >= 1")
    if args.steps < 1:
        parser.error("--steps must be >= 1")

    log_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logdir = args.logdir if args.logdir is not None else outdir
    os.makedirs(logdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)

    logger = setup_logger('run_montecarlo', outdir=logdir, ID=tag, log_level=log_level)
    # Also configure the logger named 'MonteCarlo' with the same settings.
    setup_logger('MonteCarlo', outdir=logdir, ID=tag, log_level=log_level)
    logger.info(f'args: {args}')

    # --- step 1: initialize Monte Carlo object ---
    MC = MonteCarlo(
        dataframe_files=dataframe_files,
        outdir=outdir,
        gene_list=gene_list,
        ID=tag,
        reg_formula=args.reg_formula,
        response_var=response_var,
        test_var=args.test_var,
        random=args.random,
        n_groups=args.n_groups,
        steps=args.steps,
        C1=args.C1,
        C2=args.C2,
        beta=args.beta,
        linearT=args.linearT,
        log_level=log_level,
        logdir=logdir,
    )
    logger.info(f'MonteCarlo: {MC}')

    # --- step 2: load residue-level data ---
    # NOTE(review): reg_var and the 'region' entry of var2binarize remain
    # hard-coded; they match the default --reg_formula / --test_var only.
    MC.load_data(
        sep='|',
        reg_var=['AA', 'region'],
        response_var=response_var,
        var2binarize=[response_var, 'region'],
        mask_column='mapped_resid',
        ID_column='gene',
        Length_column='uniprot_length',
    )

    # --- step 3: run simulation ---
    MC.run(encoded_df=MC.data, ID_column='gene')

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,179 @@
1
+ #!/usr/bin/env python3
2
+ from EntDetect.gaussian_entanglement import GaussianEntanglement
3
+ from EntDetect.clustering import ClusterNativeEntanglements
4
+ from EntDetect.entanglement_features import FeatureGen
5
+ from EntDetect._logging import setup_logger
6
+
7
+ """
8
+ Script to calculate native Gaussian entanglements in a given structure (PDB or COR file),
9
+ filter for high-quality entanglements, cluster them, and generate entanglement features.
10
+
11
+ Usage example (1ZMR / ecPGK):
12
+ python scripts/run_nativeNCLE.py \\
13
+ --struct /scratch/ims86/EntDetect_Datastore/user_input/reference_structures/1zmr_model_clean.pdb \\
14
+ --outdir /scratch/ims86/EntDetect_Datastore/outputs/workflow1 \\
15
+ --ID 1ZMR \\
16
+ --chain A \\
17
+ --organism Ecoli \\
18
+ --Accession P00558 \\
19
+ --model EXP
20
+
21
+ Arguments:
22
+ --struct Path to input PDB (or COR) structure file [required]
23
+ --outdir Root output directory; sub-dirs are created automatically [required]
24
+ --ID Identifier for the analysis (default: structure basename)
25
+ --chain Chain ID to process; omit to process all chains
26
+ --organism Reference proteome for clustering: Ecoli | Human | Yeast (default: Ecoli)
27
+ --Accession UniProt accession used in feature-file naming (default: P00558)
28
+ --model Structure type for HQ filtering: EXP | AF (default: EXP)
29
+ --resolution Structure resolution: aa | cg (overrides --cg flag)
30
+ --contacts Contact definition: heavy | calpha
31
+ --cg Flag: input is a coarse-grained C-alpha model (legacy; prefer --resolution cg)
32
+ --Calpha Flag: use C-alpha contacts (legacy; prefer --contacts calpha)
33
+ --cluster_cutoff Clustering distance cutoff in Å; if omitted, uses the
34
+ organism-specific default (Ecoli: 57, Human: 52, Yeast: 49)
35
+ --ent_detection_method
36
+ Entanglement detection criterion:
37
+ 1 = any nonzero GLN for either termini
38
+ 2 = any nonzero TLN for either termini (class default)
39
+ 3 = both GLN and TLN nonzero for same termini (recommended; script default)
40
+ """
41
+
42
+
43
def main(argv=None):
    """Compute, filter, cluster and featurize native entanglements for a structure.

    For every chain to process, the pipeline calls, in order:
    ``calculate_native_entanglements`` (output under ``Native_GE``),
    ``select_high_quality_entanglements`` (``Native_HQ_GE``),
    ``Cluster_NativeEntanglements`` (``Native_clustered_HQ_GE``) and
    ``FeatureGen.get_uent_features`` (``Native_clustered_HQ_GE_features``),
    all below ``--outdir``.

    When ``--chain`` is omitted, chain identifiers are discovered from the
    structure with MDAnalysis, falling back to mdtraj if none are found.

    Args:
        argv: Optional list of command-line tokens. ``None`` lets argparse
            read ``sys.argv[1:]``.

    Returns:
        0 on normal termination.

    Raises:
        SystemExit: Raised by argparse for ``-h`` or malformed arguments.
    """
    import argparse
    import logging
    import os
    import time

    start_time = time.time()

    parser = argparse.ArgumentParser(description="Process user specified arguments")
    parser.add_argument("--struct", type=str, required=True, help="Path to PDB structure file")
    parser.add_argument("--outdir", type=str, required=True, help="output directory for results")
    parser.add_argument("--ID", type=str, required=False, help="An id for the analysis (defaults to structure basename)")
    parser.add_argument("--chain", type=str, required=False, help="Chain identifier (optional, processes all chains if not specified)", default=None)
    parser.add_argument("--organism", type=str, required=False, help="Organism name for clustering: {Ecoli, Human, Yeast}", default='Ecoli')
    parser.add_argument("--Accession", type=str, required=False, help="UniProt Accession for the protein", default='P00558')
    parser.add_argument("--cg", action='store_true', help="Indicate structure is coarse-grained (C-alpha only) model")
    parser.add_argument(
        "--Calpha",
        "--calpha",
        action='store_true',
        dest="Calpha",
        help="Use C-alpha atoms for contact definition (legacy flag; prefer --contacts calpha)",
    )
    parser.add_argument(
        "--resolution",
        type=str,
        choices=["aa", "cg"],
        default=None,
        help="Structure resolution: 'aa' (all-atom) or 'cg' (C-alpha coarse-grained). If set, this overrides --cg.",
    )
    parser.add_argument(
        "--contacts",
        type=str,
        choices=["heavy", "calpha"],
        default=None,
        help="Contact definition to use: 'heavy' (all-atom) or 'calpha'. If omitted, defaults to heavy for aa and calpha for cg.",
    )
    parser.add_argument("--cluster_cutoff", type=float, required=False,
                        help="Clustering distance cutoff in Å. If omitted, uses the organism-specific default (Ecoli: 57, Human: 52, Yeast: 49).",
                        default=None)
    parser.add_argument("--model", type=str, required=False, help="Model type for high-quality selection: {EXP, AF}", default='EXP')
    # The help text used to label method 2 as "(default)" although this
    # script's default is 3; it now distinguishes class and script defaults.
    parser.add_argument("--ent_detection_method", type=int, required=False,
                        help="ENT detection method: 1=any GLN, 2=any TLN (class default), 3=both GLN and TLN same termini (script default)",
                        default=3)
    parser.add_argument("--log_level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging verbosity level (default: INFO)")
    parser.add_argument("--logdir", type=str, default=None,
                        help="Directory for log file. Defaults to --outdir if not specified.")
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.log_level.upper(), logging.INFO)

    struct = args.struct
    outdir = args.outdir
    os.makedirs(outdir, exist_ok=True)
    ID = args.ID if args.ID is not None else os.path.splitext(os.path.basename(struct))[0]
    logdir = args.logdir if args.logdir is not None else outdir
    # Pre-configure all EntDetect loggers for this run so they share one log file
    logger = setup_logger('run_nativeNCLE', outdir=logdir, ID=ID, log_level=log_level)
    for _cls in ['GaussianEntanglement', 'ClusterNativeEntanglements', 'FeatureGen']:
        setup_logger(_cls, outdir=logdir, ID=ID, log_level=log_level)
    logger.info(f'args: {args}')
    chain = args.chain
    model = args.model

    # Derive effective resolution/contact settings while keeping legacy flags working.
    # If neither --resolution nor --contacts are provided, behavior matches historical defaults:
    #   - all-atom (CG=False)
    #   - heavy-atom contacts (Calpha=False)
    if args.resolution is None:
        CG = bool(args.cg)
    else:
        CG = args.resolution == "cg"

    if args.contacts is not None:
        Calpha = args.contacts == "calpha"
    elif args.resolution is None:
        # Legacy path: only the explicit --Calpha flag decides.
        Calpha = bool(args.Calpha)
    else:
        # --resolution given without --contacts: calpha for cg, heavy for aa.
        Calpha = CG

    # Set up Gaussian Entanglement and Clustering objects
    ge = GaussianEntanglement(g_threshold=0.6, density=1.0, Calpha=Calpha, CG=CG,
                              ent_detection_method=args.ent_detection_method,
                              log_level=log_level, logdir=logdir)
    clustering = ClusterNativeEntanglements(organism=args.organism, cut_off=args.cluster_cutoff,
                                            log_level=log_level, logdir=logdir)

    # Determine which chains to process
    if chain is not None:
        chains_to_process = [chain]
    else:
        # Get all chains from the structure
        import MDAnalysis as mda
        u = mda.Universe(struct)
        # NOTE(review): an atom with an empty segid but a non-empty chainID is
        # mapped to 'A' rather than to its chainID - confirm this is intended.
        chains_to_process = sorted({
            atom.segid if atom.segid else 'A'
            for atom in u.atoms
            if atom.segid or atom.chainID
        })
        if not chains_to_process or chains_to_process == ['']:
            # Fallback: use mdtraj to get chains
            import mdtraj as md
            traj = md.load(struct)
            chains_to_process = sorted({c.chain_id for c in traj.topology.chains})
    logger.info(f'Processing chains: {chains_to_process}')

    # All chains use the same Native_GE directory, so create it once up front.
    ge_outdir = os.path.join(outdir, 'Native_GE')
    os.makedirs(ge_outdir, exist_ok=True)
    multiple_chains = len(chains_to_process) > 1

    # Process each chain separately for all steps
    for chain_id in chains_to_process:
        logger.info(f"{'='*80}\nProcessing chain {chain_id}\n{'='*80}")

        # Use chain suffix for file naming when processing multiple chains
        hq_id = f"{ID}_{chain_id}" if multiple_chains else ID

        # Calculate native entanglements for this chain
        NativeEnt = ge.calculate_native_entanglements(struct, outdir=ge_outdir, ID=hq_id, chain=chain_id)
        logger.info(f'Native entanglements saved to {NativeEnt["outfile"]}')

        # Optional steps: select high-quality entanglements
        HQNativeEnt = ge.select_high_quality_entanglements(
            NativeEnt['outfile'], struct,
            outdir=os.path.join(outdir, "Native_HQ_GE"),
            ID=hq_id, model=model, chain=chain_id)
        logger.info(f'High-quality native entanglements saved to {HQNativeEnt["outfile"]}')

        # Cluster the native entanglements to remove degeneracies
        nativeClusteredEnt = clustering.Cluster_NativeEntanglements(
            HQNativeEnt['outfile'],
            outdir=os.path.join(outdir, "Native_clustered_HQ_GE"),
            outfile=f"{hq_id}.csv", chain=chain_id)
        logger.info(f'Clustered native entanglements saved to {nativeClusteredEnt["outfile"]}')

        # Generate entanglement features for clustered native entanglements
        FGen = FeatureGen(struct,
                          outdir=os.path.join(outdir, "Native_clustered_HQ_GE_features"),
                          cluster_file=nativeClusteredEnt['outfile'],
                          log_level=log_level, logdir=logdir)
        EntFeatures = FGen.get_uent_features(gene=args.Accession, chain=chain_id, pdbid=ID)
        logger.info(f'Entanglement features saved to {EntFeatures["outfile"]}')

    logger.info(f'NORMAL TERMINATION - {time.time() - start_time:.1f} seconds')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())