mlmm-toolkit 0.2.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hessian_ff/__init__.py +50 -0
- hessian_ff/analytical_hessian.py +609 -0
- hessian_ff/constants.py +46 -0
- hessian_ff/forcefield.py +339 -0
- hessian_ff/loaders.py +608 -0
- hessian_ff/native/Makefile +8 -0
- hessian_ff/native/__init__.py +28 -0
- hessian_ff/native/analytical_hessian.py +88 -0
- hessian_ff/native/analytical_hessian_ext.cpp +258 -0
- hessian_ff/native/bonded.py +82 -0
- hessian_ff/native/bonded_ext.cpp +640 -0
- hessian_ff/native/loader.py +349 -0
- hessian_ff/native/nonbonded.py +118 -0
- hessian_ff/native/nonbonded_ext.cpp +1150 -0
- hessian_ff/prmtop_parmed.py +23 -0
- hessian_ff/system.py +107 -0
- hessian_ff/terms/__init__.py +14 -0
- hessian_ff/terms/angle.py +73 -0
- hessian_ff/terms/bond.py +44 -0
- hessian_ff/terms/cmap.py +406 -0
- hessian_ff/terms/dihedral.py +141 -0
- hessian_ff/terms/nonbonded.py +209 -0
- hessian_ff/tests/__init__.py +0 -0
- hessian_ff/tests/conftest.py +75 -0
- hessian_ff/tests/data/small/complex.parm7 +1346 -0
- hessian_ff/tests/data/small/complex.pdb +125 -0
- hessian_ff/tests/data/small/complex.rst7 +63 -0
- hessian_ff/tests/test_coords_input.py +44 -0
- hessian_ff/tests/test_energy_force.py +49 -0
- hessian_ff/tests/test_hessian.py +137 -0
- hessian_ff/tests/test_smoke.py +18 -0
- hessian_ff/tests/test_validation.py +40 -0
- hessian_ff/workflows.py +889 -0
- mlmm/__init__.py +36 -0
- mlmm/__main__.py +7 -0
- mlmm/_version.py +34 -0
- mlmm/add_elem_info.py +374 -0
- mlmm/advanced_help.py +91 -0
- mlmm/align_freeze_atoms.py +601 -0
- mlmm/all.py +3535 -0
- mlmm/bond_changes.py +231 -0
- mlmm/bool_compat.py +223 -0
- mlmm/cli.py +574 -0
- mlmm/cli_utils.py +166 -0
- mlmm/default_group.py +337 -0
- mlmm/defaults.py +467 -0
- mlmm/define_layer.py +526 -0
- mlmm/dft.py +1041 -0
- mlmm/energy_diagram.py +253 -0
- mlmm/extract.py +2213 -0
- mlmm/fix_altloc.py +464 -0
- mlmm/freq.py +1406 -0
- mlmm/harmonic_constraints.py +140 -0
- mlmm/hessian_cache.py +44 -0
- mlmm/hessian_calc.py +174 -0
- mlmm/irc.py +638 -0
- mlmm/mlmm_calc.py +2262 -0
- mlmm/mm_parm.py +945 -0
- mlmm/oniom_export.py +1983 -0
- mlmm/oniom_import.py +457 -0
- mlmm/opt.py +1742 -0
- mlmm/path_opt.py +1353 -0
- mlmm/path_search.py +2299 -0
- mlmm/preflight.py +88 -0
- mlmm/py.typed +1 -0
- mlmm/pysis_runner.py +45 -0
- mlmm/scan.py +1047 -0
- mlmm/scan2d.py +1226 -0
- mlmm/scan3d.py +1265 -0
- mlmm/scan_common.py +184 -0
- mlmm/summary_log.py +736 -0
- mlmm/trj2fig.py +448 -0
- mlmm/tsopt.py +2871 -0
- mlmm/utils.py +2309 -0
- mlmm/xtb_embedcharge_correction.py +475 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/METADATA +1159 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/RECORD +372 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/WHEEL +5 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/entry_points.txt +2 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/licenses/LICENSE +674 -0
- mlmm_toolkit-0.2.2.dev0.dist-info/top_level.txt +4 -0
- pysisyphus/Geometry.py +1667 -0
- pysisyphus/LICENSE +674 -0
- pysisyphus/TableFormatter.py +63 -0
- pysisyphus/TablePrinter.py +74 -0
- pysisyphus/__init__.py +12 -0
- pysisyphus/calculators/AFIR.py +452 -0
- pysisyphus/calculators/AnaPot.py +20 -0
- pysisyphus/calculators/AnaPot2.py +48 -0
- pysisyphus/calculators/AnaPot3.py +12 -0
- pysisyphus/calculators/AnaPot4.py +20 -0
- pysisyphus/calculators/AnaPotBase.py +337 -0
- pysisyphus/calculators/AnaPotCBM.py +25 -0
- pysisyphus/calculators/AtomAtomTransTorque.py +154 -0
- pysisyphus/calculators/CFOUR.py +250 -0
- pysisyphus/calculators/Calculator.py +844 -0
- pysisyphus/calculators/CerjanMiller.py +24 -0
- pysisyphus/calculators/Composite.py +123 -0
- pysisyphus/calculators/ConicalIntersection.py +171 -0
- pysisyphus/calculators/DFTBp.py +430 -0
- pysisyphus/calculators/DFTD3.py +66 -0
- pysisyphus/calculators/DFTD4.py +84 -0
- pysisyphus/calculators/Dalton.py +61 -0
- pysisyphus/calculators/Dimer.py +681 -0
- pysisyphus/calculators/Dummy.py +20 -0
- pysisyphus/calculators/EGO.py +76 -0
- pysisyphus/calculators/EnergyMin.py +224 -0
- pysisyphus/calculators/ExternalPotential.py +264 -0
- pysisyphus/calculators/FakeASE.py +35 -0
- pysisyphus/calculators/FourWellAnaPot.py +28 -0
- pysisyphus/calculators/FreeEndNEBPot.py +39 -0
- pysisyphus/calculators/Gaussian09.py +18 -0
- pysisyphus/calculators/Gaussian16.py +726 -0
- pysisyphus/calculators/HardSphere.py +159 -0
- pysisyphus/calculators/IDPPCalculator.py +49 -0
- pysisyphus/calculators/IPIClient.py +133 -0
- pysisyphus/calculators/IPIServer.py +234 -0
- pysisyphus/calculators/LEPSBase.py +24 -0
- pysisyphus/calculators/LEPSExpr.py +139 -0
- pysisyphus/calculators/LennardJones.py +80 -0
- pysisyphus/calculators/MOPAC.py +219 -0
- pysisyphus/calculators/MullerBrownSympyPot.py +51 -0
- pysisyphus/calculators/MultiCalc.py +85 -0
- pysisyphus/calculators/NFK.py +45 -0
- pysisyphus/calculators/OBabel.py +87 -0
- pysisyphus/calculators/ONIOMv2.py +1129 -0
- pysisyphus/calculators/ORCA.py +893 -0
- pysisyphus/calculators/ORCA5.py +6 -0
- pysisyphus/calculators/OpenMM.py +88 -0
- pysisyphus/calculators/OpenMolcas.py +281 -0
- pysisyphus/calculators/OverlapCalculator.py +908 -0
- pysisyphus/calculators/Psi4.py +218 -0
- pysisyphus/calculators/PyPsi4.py +37 -0
- pysisyphus/calculators/PySCF.py +341 -0
- pysisyphus/calculators/PyXTB.py +73 -0
- pysisyphus/calculators/QCEngine.py +106 -0
- pysisyphus/calculators/Rastrigin.py +22 -0
- pysisyphus/calculators/Remote.py +76 -0
- pysisyphus/calculators/Rosenbrock.py +15 -0
- pysisyphus/calculators/SocketCalc.py +97 -0
- pysisyphus/calculators/TIP3P.py +111 -0
- pysisyphus/calculators/TransTorque.py +161 -0
- pysisyphus/calculators/Turbomole.py +965 -0
- pysisyphus/calculators/VRIPot.py +37 -0
- pysisyphus/calculators/WFOWrapper.py +333 -0
- pysisyphus/calculators/WFOWrapper2.py +341 -0
- pysisyphus/calculators/XTB.py +418 -0
- pysisyphus/calculators/__init__.py +81 -0
- pysisyphus/calculators/cosmo_data.py +139 -0
- pysisyphus/calculators/parser.py +150 -0
- pysisyphus/color.py +19 -0
- pysisyphus/config.py +133 -0
- pysisyphus/constants.py +65 -0
- pysisyphus/cos/AdaptiveNEB.py +230 -0
- pysisyphus/cos/ChainOfStates.py +725 -0
- pysisyphus/cos/FreeEndNEB.py +25 -0
- pysisyphus/cos/FreezingString.py +103 -0
- pysisyphus/cos/GrowingChainOfStates.py +71 -0
- pysisyphus/cos/GrowingNT.py +309 -0
- pysisyphus/cos/GrowingString.py +508 -0
- pysisyphus/cos/NEB.py +189 -0
- pysisyphus/cos/SimpleZTS.py +64 -0
- pysisyphus/cos/__init__.py +22 -0
- pysisyphus/cos/stiffness.py +199 -0
- pysisyphus/drivers/__init__.py +17 -0
- pysisyphus/drivers/afir.py +855 -0
- pysisyphus/drivers/barriers.py +271 -0
- pysisyphus/drivers/birkholz.py +138 -0
- pysisyphus/drivers/cluster.py +318 -0
- pysisyphus/drivers/diabatization.py +133 -0
- pysisyphus/drivers/merge.py +368 -0
- pysisyphus/drivers/merge_mol2.py +322 -0
- pysisyphus/drivers/opt.py +375 -0
- pysisyphus/drivers/perf.py +91 -0
- pysisyphus/drivers/pka.py +52 -0
- pysisyphus/drivers/precon_pos_rot.py +669 -0
- pysisyphus/drivers/rates.py +480 -0
- pysisyphus/drivers/replace.py +219 -0
- pysisyphus/drivers/scan.py +212 -0
- pysisyphus/drivers/spectrum.py +166 -0
- pysisyphus/drivers/thermo.py +31 -0
- pysisyphus/dynamics/Gaussian.py +103 -0
- pysisyphus/dynamics/__init__.py +20 -0
- pysisyphus/dynamics/colvars.py +136 -0
- pysisyphus/dynamics/driver.py +297 -0
- pysisyphus/dynamics/helpers.py +256 -0
- pysisyphus/dynamics/lincs.py +105 -0
- pysisyphus/dynamics/mdp.py +364 -0
- pysisyphus/dynamics/rattle.py +121 -0
- pysisyphus/dynamics/thermostats.py +128 -0
- pysisyphus/dynamics/wigner.py +266 -0
- pysisyphus/elem_data.py +3473 -0
- pysisyphus/exceptions.py +2 -0
- pysisyphus/filtertrj.py +69 -0
- pysisyphus/helpers.py +623 -0
- pysisyphus/helpers_pure.py +649 -0
- pysisyphus/init_logging.py +50 -0
- pysisyphus/intcoords/Bend.py +69 -0
- pysisyphus/intcoords/Bend2.py +25 -0
- pysisyphus/intcoords/BondedFragment.py +32 -0
- pysisyphus/intcoords/Cartesian.py +41 -0
- pysisyphus/intcoords/CartesianCoords.py +140 -0
- pysisyphus/intcoords/Coords.py +56 -0
- pysisyphus/intcoords/DLC.py +197 -0
- pysisyphus/intcoords/DistanceFunction.py +34 -0
- pysisyphus/intcoords/DummyImproper.py +70 -0
- pysisyphus/intcoords/DummyTorsion.py +72 -0
- pysisyphus/intcoords/LinearBend.py +105 -0
- pysisyphus/intcoords/LinearDisplacement.py +80 -0
- pysisyphus/intcoords/OutOfPlane.py +59 -0
- pysisyphus/intcoords/PrimTypes.py +286 -0
- pysisyphus/intcoords/Primitive.py +137 -0
- pysisyphus/intcoords/RedundantCoords.py +659 -0
- pysisyphus/intcoords/RobustTorsion.py +59 -0
- pysisyphus/intcoords/Rotation.py +147 -0
- pysisyphus/intcoords/Stretch.py +31 -0
- pysisyphus/intcoords/Torsion.py +101 -0
- pysisyphus/intcoords/Torsion2.py +25 -0
- pysisyphus/intcoords/Translation.py +45 -0
- pysisyphus/intcoords/__init__.py +61 -0
- pysisyphus/intcoords/augment_bonds.py +126 -0
- pysisyphus/intcoords/derivatives.py +10512 -0
- pysisyphus/intcoords/eval.py +80 -0
- pysisyphus/intcoords/exceptions.py +37 -0
- pysisyphus/intcoords/findiffs.py +48 -0
- pysisyphus/intcoords/generate_derivatives.py +414 -0
- pysisyphus/intcoords/helpers.py +235 -0
- pysisyphus/intcoords/logging_conf.py +10 -0
- pysisyphus/intcoords/mp_derivatives.py +10836 -0
- pysisyphus/intcoords/setup.py +962 -0
- pysisyphus/intcoords/setup_fast.py +176 -0
- pysisyphus/intcoords/update.py +272 -0
- pysisyphus/intcoords/valid.py +89 -0
- pysisyphus/interpolate/Geodesic.py +93 -0
- pysisyphus/interpolate/IDPP.py +55 -0
- pysisyphus/interpolate/Interpolator.py +116 -0
- pysisyphus/interpolate/LST.py +70 -0
- pysisyphus/interpolate/Redund.py +152 -0
- pysisyphus/interpolate/__init__.py +9 -0
- pysisyphus/interpolate/helpers.py +34 -0
- pysisyphus/io/__init__.py +22 -0
- pysisyphus/io/aomix.py +178 -0
- pysisyphus/io/cjson.py +24 -0
- pysisyphus/io/crd.py +101 -0
- pysisyphus/io/cube.py +220 -0
- pysisyphus/io/fchk.py +184 -0
- pysisyphus/io/hdf5.py +49 -0
- pysisyphus/io/hessian.py +72 -0
- pysisyphus/io/mol2.py +146 -0
- pysisyphus/io/molden.py +293 -0
- pysisyphus/io/orca.py +189 -0
- pysisyphus/io/pdb.py +269 -0
- pysisyphus/io/psf.py +79 -0
- pysisyphus/io/pubchem.py +31 -0
- pysisyphus/io/qcschema.py +34 -0
- pysisyphus/io/sdf.py +29 -0
- pysisyphus/io/xyz.py +61 -0
- pysisyphus/io/zmat.py +175 -0
- pysisyphus/irc/DWI.py +108 -0
- pysisyphus/irc/DampedVelocityVerlet.py +134 -0
- pysisyphus/irc/Euler.py +22 -0
- pysisyphus/irc/EulerPC.py +345 -0
- pysisyphus/irc/GonzalezSchlegel.py +187 -0
- pysisyphus/irc/IMKMod.py +164 -0
- pysisyphus/irc/IRC.py +878 -0
- pysisyphus/irc/IRCDummy.py +10 -0
- pysisyphus/irc/Instanton.py +307 -0
- pysisyphus/irc/LQA.py +53 -0
- pysisyphus/irc/ModeKill.py +136 -0
- pysisyphus/irc/ParamPlot.py +53 -0
- pysisyphus/irc/RK4.py +36 -0
- pysisyphus/irc/__init__.py +31 -0
- pysisyphus/irc/initial_displ.py +219 -0
- pysisyphus/linalg.py +411 -0
- pysisyphus/line_searches/Backtracking.py +88 -0
- pysisyphus/line_searches/HagerZhang.py +184 -0
- pysisyphus/line_searches/LineSearch.py +232 -0
- pysisyphus/line_searches/StrongWolfe.py +108 -0
- pysisyphus/line_searches/__init__.py +9 -0
- pysisyphus/line_searches/interpol.py +15 -0
- pysisyphus/modefollow/NormalMode.py +40 -0
- pysisyphus/modefollow/__init__.py +10 -0
- pysisyphus/modefollow/davidson.py +199 -0
- pysisyphus/modefollow/lanczos.py +95 -0
- pysisyphus/optimizers/BFGS.py +99 -0
- pysisyphus/optimizers/BacktrackingOptimizer.py +113 -0
- pysisyphus/optimizers/ConjugateGradient.py +98 -0
- pysisyphus/optimizers/CubicNewton.py +75 -0
- pysisyphus/optimizers/FIRE.py +113 -0
- pysisyphus/optimizers/HessianOptimizer.py +1176 -0
- pysisyphus/optimizers/LBFGS.py +228 -0
- pysisyphus/optimizers/LayerOpt.py +411 -0
- pysisyphus/optimizers/MicroOptimizer.py +169 -0
- pysisyphus/optimizers/NCOptimizer.py +90 -0
- pysisyphus/optimizers/Optimizer.py +1084 -0
- pysisyphus/optimizers/PreconLBFGS.py +260 -0
- pysisyphus/optimizers/PreconSteepestDescent.py +7 -0
- pysisyphus/optimizers/QuickMin.py +74 -0
- pysisyphus/optimizers/RFOptimizer.py +181 -0
- pysisyphus/optimizers/RSA.py +99 -0
- pysisyphus/optimizers/StabilizedQNMethod.py +248 -0
- pysisyphus/optimizers/SteepestDescent.py +23 -0
- pysisyphus/optimizers/StringOptimizer.py +173 -0
- pysisyphus/optimizers/__init__.py +41 -0
- pysisyphus/optimizers/closures.py +301 -0
- pysisyphus/optimizers/cls_map.py +58 -0
- pysisyphus/optimizers/exceptions.py +6 -0
- pysisyphus/optimizers/gdiis.py +280 -0
- pysisyphus/optimizers/guess_hessians.py +311 -0
- pysisyphus/optimizers/hessian_updates.py +355 -0
- pysisyphus/optimizers/poly_fit.py +285 -0
- pysisyphus/optimizers/precon.py +153 -0
- pysisyphus/optimizers/restrict_step.py +24 -0
- pysisyphus/pack.py +172 -0
- pysisyphus/peakdetect.py +948 -0
- pysisyphus/plot.py +1031 -0
- pysisyphus/run.py +2106 -0
- pysisyphus/socket_helper.py +74 -0
- pysisyphus/stocastic/FragmentKick.py +132 -0
- pysisyphus/stocastic/Kick.py +81 -0
- pysisyphus/stocastic/Pipeline.py +303 -0
- pysisyphus/stocastic/__init__.py +21 -0
- pysisyphus/stocastic/align.py +127 -0
- pysisyphus/testing.py +96 -0
- pysisyphus/thermo.py +156 -0
- pysisyphus/trj.py +824 -0
- pysisyphus/tsoptimizers/RSIRFOptimizer.py +56 -0
- pysisyphus/tsoptimizers/RSPRFOptimizer.py +182 -0
- pysisyphus/tsoptimizers/TRIM.py +59 -0
- pysisyphus/tsoptimizers/TSHessianOptimizer.py +463 -0
- pysisyphus/tsoptimizers/__init__.py +23 -0
- pysisyphus/wavefunction/Basis.py +239 -0
- pysisyphus/wavefunction/DIIS.py +76 -0
- pysisyphus/wavefunction/__init__.py +25 -0
- pysisyphus/wavefunction/build_ext.py +42 -0
- pysisyphus/wavefunction/cart2sph.py +190 -0
- pysisyphus/wavefunction/diabatization.py +304 -0
- pysisyphus/wavefunction/excited_states.py +435 -0
- pysisyphus/wavefunction/gen_ints.py +1811 -0
- pysisyphus/wavefunction/helpers.py +104 -0
- pysisyphus/wavefunction/ints/__init__.py +0 -0
- pysisyphus/wavefunction/ints/boys.py +193 -0
- pysisyphus/wavefunction/ints/boys_table_N_64_xasym_27.1_step_0.01.npy +0 -0
- pysisyphus/wavefunction/ints/cart_gto3d.py +176 -0
- pysisyphus/wavefunction/ints/coulomb3d.py +25928 -0
- pysisyphus/wavefunction/ints/diag_quadrupole3d.py +10036 -0
- pysisyphus/wavefunction/ints/dipole3d.py +8762 -0
- pysisyphus/wavefunction/ints/int2c2e3d.py +7198 -0
- pysisyphus/wavefunction/ints/int3c2e3d_sph.py +65040 -0
- pysisyphus/wavefunction/ints/kinetic3d.py +8240 -0
- pysisyphus/wavefunction/ints/ovlp3d.py +3777 -0
- pysisyphus/wavefunction/ints/quadrupole3d.py +15054 -0
- pysisyphus/wavefunction/ints/self_ovlp3d.py +198 -0
- pysisyphus/wavefunction/localization.py +458 -0
- pysisyphus/wavefunction/multipole.py +159 -0
- pysisyphus/wavefunction/normalization.py +36 -0
- pysisyphus/wavefunction/pop_analysis.py +134 -0
- pysisyphus/wavefunction/shells.py +1171 -0
- pysisyphus/wavefunction/wavefunction.py +504 -0
- pysisyphus/wrapper/__init__.py +11 -0
- pysisyphus/wrapper/exceptions.py +2 -0
- pysisyphus/wrapper/jmol.py +120 -0
- pysisyphus/wrapper/mwfn.py +169 -0
- pysisyphus/wrapper/packmol.py +71 -0
- pysisyphus/xyzloader.py +168 -0
- pysisyphus/yaml_mods.py +45 -0
- thermoanalysis/LICENSE +674 -0
- thermoanalysis/QCData.py +244 -0
- thermoanalysis/__init__.py +0 -0
- thermoanalysis/config.py +3 -0
- thermoanalysis/constants.py +20 -0
- thermoanalysis/thermo.py +1011 -0
mlmm/extract.py
ADDED
|
@@ -0,0 +1,2213 @@
|
|
|
1
|
+
# mlmm/extract.py
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
extract — Automated binding‑pocket (active‑site) extractor
|
|
5
|
+
====================================================================
|
|
6
|
+
|
|
7
|
+
Usage (CLI)
|
|
8
|
+
-----------
|
|
9
|
+
mlmm extract -i INPUT.pdb [INPUT2.pdb ...] -c <substrate_spec> \
|
|
10
|
+
[-o OUTPUT.pdb ...] [-r <Å>] [--radius-het2het <Å>] \
|
|
11
|
+
[--include-H2O/--no-include-H2O] [--exclude-backbone/--no-exclude-backbone] \
|
|
12
|
+
[--add-linkH/--no-add-linkH] [--selected-resn "CHAIN:RES" ...] \
|
|
13
|
+
[-l, --ligand-charge <number|"RES:Q,...">] [--verbose/--no-verbose]
|
|
14
|
+
|
|
15
|
+
Examples
|
|
16
|
+
--------
|
|
17
|
+
# Minimal (ID-based substrate) with explicit total ligand charge
|
|
18
|
+
mlmm extract -i complex.pdb -c A:123 -o pocket.pdb -l -3
|
|
19
|
+
|
|
20
|
+
# Substrate provided as a PDB; per-resname charge mapping (others remain 0)
|
|
21
|
+
mlmm extract -i complex.pdb -c substrate.pdb -o pocket.pdb \
|
|
22
|
+
-l "GPP:-3,MMT:-1"
|
|
23
|
+
|
|
24
|
+
# Name-based substrate selection including all matches (WARNING is logged)
|
|
25
|
+
mlmm extract -i complex.pdb -c "GPP,MMT" -o pocket.pdb -l -4
|
|
26
|
+
|
|
27
|
+
# Multi-structure to single multi-MODEL output with hetero-hetero proximity enabled
|
|
28
|
+
mlmm extract -i complex1.pdb complex2.pdb -c A:123 \
|
|
29
|
+
-o pocket_multi.pdb --radius-het2het 2.6 -l -3 --verbose
|
|
30
|
+
|
|
31
|
+
Description
|
|
32
|
+
-----------
|
|
33
|
+
Extracts an active‑site pocket around specified substrate residues from a protein–substrate complex,
|
|
34
|
+
applies biochemically aware truncation (backbone/side‑chain capping with safeguards), and can append
|
|
35
|
+
link hydrogens for cut bonds. Supports single structures and ensembles (multi‑MODEL or per‑file outputs).
|
|
36
|
+
Typical use cases include QM/MM, ML/MM, and cluster QM models.
|
|
37
|
+
|
|
38
|
+
Residue inclusion
|
|
39
|
+
-----------------
|
|
40
|
+
- Always include the substrate residues.
|
|
41
|
+
- Standard cutoff (``--radius``, default 2.6 Å):
|
|
42
|
+
- If ``--no-exclude-backbone``: include any residue if **any atom** is within the cutoff.
|
|
43
|
+
- If ``--exclude-backbone`` (default): for **amino‑acid residues**, the qualifying atom
|
|
44
|
+
must be **non‑backbone** (not in {N, H*, CA, HA*, C, O, OXT}); non‑amino‑acid residues qualify by any atom.
|
|
45
|
+
- Independent hetero–hetero proximity (``--radius-het2het``):
|
|
46
|
+
add residues if a **substrate hetero atom (non‑C/H)** is within the cutoff of a **protein hetero atom**.
|
|
47
|
+
With ``--exclude-backbone``, amino‑acid neighbors must be **non‑backbone** atoms.
|
|
48
|
+
- Waters are included by default (``--include-H2O``; disable with ``--no-include-H2O``).
|
|
49
|
+
- ``--selected-resn`` force‑includes residues (chain and insertion codes supported).
|
|
50
|
+
- When ``--no-exclude-backbone`` is set and a selected residue’s **backbone atom** contacts the substrate
|
|
51
|
+
(within either cutoff), include its peptide‑adjacent N‑side and C‑side neighbors (C–N ≤ 1.9 Å). For true termini,
|
|
52
|
+
keep the respective terminal cap atoms (N/H* or C/O/OXT).
|
|
53
|
+
- **Disulfide safeguard:** if a selected CYS/CYX forms an SG–SG contact ≤ 2.5 Å, include both partners.
|
|
54
|
+
- **Proline safeguard:** if a selected **PRO** is not N‑terminal (peptide‑adjacent), include the immediately
|
|
55
|
+
preceding amino acid. For that neighbor, **CA is always kept**, and when backbone exclusion is on,
|
|
56
|
+
keep **C** and **O/OXT** (to preserve the bond into PRO–N).
|
|
57
|
+
|
|
58
|
+
Truncation (capping)
|
|
59
|
+
--------------------
|
|
60
|
+
- **Isolated residues** → keep **pure side‑chain** (remove N, CA, C, O, OXT and N/CA H*).
|
|
61
|
+
- **PRO/HYP** retain N, CA, HA, H* to keep the ring.
|
|
62
|
+
- **Continuous peptide stretches** keep internal backbone; only **terminal caps** are removed
|
|
63
|
+
(N‑cap: N/H*; C‑cap: C/O/OXT). TER‑aware segmentation prevents crossing chain breaks.
|
|
64
|
+
- With ``--exclude-backbone`` (default), delete main‑chain atoms on all **non‑substrate amino acids**,
|
|
65
|
+
except for the specific PRO/HYP retention and PRO‑adjacency preservation above.
|
|
66
|
+
- **Non‑amino‑acid residues**: atoms named like protein backbone ({"N","CA","HA","H","H1","H2","H3"})
|
|
67
|
+
are **never deleted** by capping logic.
|
|
68
|
+
|
|
69
|
+
Link hydrogens (--add-linkH)
|
|
70
|
+
----------------------------
|
|
71
|
+
- Adds **carbon‑only** link H at **1.09 Å** along the cut‑bond vector.
|
|
72
|
+
Normal residues: checks **CB–CA**, **CA–N**, **CA–C**; **PRO/HYP**: **CA–C** only.
|
|
73
|
+
- If any are added, append a **TER** then a contiguous **HETATM** block with atoms named **``HL``**
|
|
74
|
+
in residue **``LKH``** (chain ``L``); serials continue from the main block.
|
|
75
|
+
- In multi‑structure mode, link‑H **targets and ordering** are enforced to be **identical across models**;
|
|
76
|
+
coordinates remain model‑specific. The flag is honored in both single‑ and multi‑structure modes.
|
|
77
|
+
|
|
78
|
+
Charge summary
|
|
79
|
+
--------------
|
|
80
|
+
- **AMINO_ACIDS** dictionary supplies nominal integer charges for amino‑acid residues (common variants included).
|
|
81
|
+
- **ION** supplies charges for common ions (e.g., ZN, MG, FE2). Waters are 0.
|
|
82
|
+
- **Unknown residues** (not in AMINO_ACIDS/ION/WATER_RES) are **0** unless ``--ligand-charge`` is given.
|
|
83
|
+
- ``--ligand-charge <number>``: total charge distributed across **unknown substrate** residues
|
|
84
|
+
(or across all unknowns if no unknown substrate).
|
|
85
|
+
- ``--ligand-charge "RES1:Q1,RES2:Q2"``: set **per‑resname** charges; any other unknown residues remain 0.
|
|
86
|
+
- In multi‑structure mode the **charge summary is computed on the first input PDB** only.
|
|
87
|
+
|
|
88
|
+
Multi-structure ensembles
|
|
89
|
+
-------------------------
|
|
90
|
+
- Accepts multiple input PDBs (same **atom count**; ordering is **assumed identical** and is
|
|
91
|
+
**spot‑checked** at the beginning and end of the atom list).
|
|
92
|
+
- Each structure is selected independently; the **union** of selected residues is applied to all.
|
|
93
|
+
Disulfides, PRO‑adjacency, and (if enabled) backbone‑contact neighbor augmentation are also unioned.
|
|
94
|
+
- Outputs:
|
|
95
|
+
- Provide **one** output path → **multi‑MODEL** PDB (one MODEL per input).
|
|
96
|
+
- Provide **N** output paths where **N == number of inputs** → **N** single‑model PDBs.
|
|
97
|
+
- If ``-o`` is omitted with multiple inputs → per‑file outputs ``pocket_{original_filename}.pdb``.
|
|
98
|
+
- **Diagnostics:** atom counts (**raw** vs **after truncation**) are logged **per model**.
|
|
99
|
+
|
|
100
|
+
Substrate specification
|
|
101
|
+
-----------------------
|
|
102
|
+
``-c/--center`` accepts:
|
|
103
|
+
- a **PDB path** (exact coordinate match on the first input; IDs propagated to others),
|
|
104
|
+
- a list of **residue IDs**: ``"123,124"``, ``"A:123,B:456"``, ``"123A"``, ``"A:123A"`` (insertion codes OK),
|
|
105
|
+
- or a list of **residue names** (case‑insensitive), e.g., ``"GPP,MMT"``.
|
|
106
|
+
If multiple residues share the same name, **all** matches are used and a **WARNING** is logged.
|
|
107
|
+
|
|
108
|
+
Outputs (& Directory Layout)
|
|
109
|
+
----------------------------
|
|
110
|
+
<output>/ (default: pocket.pdb for single input; pocket_<source>.pdb per input when -o is omitted)
|
|
111
|
+
├─ pocket.pdb # default single-input pocket
|
|
112
|
+
├─ pocket_<original_filename>.pdb # default per-input pocket when multiple inputs and -o omitted
|
|
113
|
+
└─ <user_paths>.pdb # custom outputs; one path = multi-MODEL, N paths = per-structure
|
|
114
|
+
|
|
115
|
+
Link hydrogens, logs, and programmatic use
|
|
116
|
+
├─ Link-H block (when added) follows a TER as contiguous HETATM records named HL in residue LKH (chain L).
|
|
117
|
+
├─ INFO logs summarize residue selection, raw/kept atom counts, and the charge summary.
|
|
118
|
+
└─ ``extract(..., api=True)`` / ``extract_api(...)`` returns ``{"outputs": [...], "counts": [...], "charge_summary": {...}}``.
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
Notes
|
|
122
|
+
-----
|
|
123
|
+
- **Defaults / behavior:**
|
|
124
|
+
- ``--radius`` default: **2.6 Å**. If given **0**, internally nudged to **0.001 Å**.
|
|
125
|
+
- ``--radius-het2het`` default: **0 Å** (off). Internally treated as **0.001 Å** if ``0`` is given.
|
|
126
|
+
- ``--include-H2O`` default: **true**.
|
|
127
|
+
- ``--exclude-backbone`` default: **true**.
|
|
128
|
+
- ``--add-linkH`` default: **false**.
|
|
129
|
+
- ``--ligand-charge`` default: **None** (unknown residues counted as 0 unless set).
|
|
130
|
+
- Output default: single input → ``pocket.pdb``; multiple inputs → ``pocket_{original_filename}.pdb``.
|
|
131
|
+
- **Geometry thresholds and tolerances:**
|
|
132
|
+
- Peptide adjacency: **C(prev)–N(next) ≤ 1.9 Å** (distance‑based; practical TER awareness).
|
|
133
|
+
- Disulfide detection: **SG–SG ≤ 2.5 Å**.
|
|
134
|
+
- Link‑H distance: **1.09 Å** (C–H) along the cut‑bond vector.
|
|
135
|
+
- Exact match tolerance for substrate PDB: **1e‑3 Å** per atom.
|
|
136
|
+
- **Safeguards and special cases:**
|
|
137
|
+
- **PRO/HYP** retain N, CA, HA, H* in isolated truncations; PRO’s **N‑side neighbor** is auto‑included
|
|
138
|
+
when peptide‑adjacent; **CA** on that neighbor is always kept, and with backbone exclusion
|
|
139
|
+
**C** and **O/OXT** are preserved to maintain the peptide bond into PRO–N.
|
|
140
|
+
- **Non‑amino‑acid residues** never lose atoms named like backbone (``N, CA, HA, H, H1, H2, H3``).
|
|
141
|
+
- **Waters** (HOH/WAT/H2O/DOD/TIP/TIP3/SOL) are always neutral (charge 0) and included by default.
|
|
142
|
+
- **Dependencies:** Python ≥ **3.10** (PEP 604 unions), Biopython ≥ **1.80**, NumPy, Click.
|
|
143
|
+
- **Python API (for reference):**
|
|
144
|
+
- ``extract(args: argparse.Namespace | None = None, api=False)`` — main entry (CLI or programmatic).
|
|
145
|
+
- ``extract_api(...)`` — convenience wrapper that returns ``{'outputs','counts','charge_summary'}`` when used programmatically.
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
from __future__ import annotations
|
|
149
|
+
|
|
150
|
+
import argparse
|
|
151
|
+
import logging
|
|
152
|
+
import io as _io
|
|
153
|
+
import os
|
|
154
|
+
import re
|
|
155
|
+
import sys
|
|
156
|
+
from typing import Dict, List, Set, Tuple, Iterable, Any, Optional, Sequence
|
|
157
|
+
|
|
158
|
+
import click
|
|
159
|
+
import numpy as np
|
|
160
|
+
from Bio import PDB
|
|
161
|
+
from Bio.PDB import NeighborSearch
|
|
162
|
+
|
|
163
|
+
# Public API
|
|
164
|
+
__all__ = ["extract", "extract_api"]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
LOGGER = logging.getLogger(__name__)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class _ClickEchoHandler(logging.Handler):
    """Logging handler that prints formatted records through ``click.echo``.

    Records at ``WARNING`` level or above are echoed with ``err=True``;
    everything below that level is echoed with ``err=False``.
    """

    def emit(self, record: logging.LogRecord) -> None:
        """Format *record* and echo it.

        Any exception raised while formatting or echoing is handed to
        ``logging.Handler.handleError`` instead of propagating.
        """
        try:
            rendered = self.format(record)
            to_stderr = record.levelno >= logging.WARNING
            click.echo(rendered, err=to_stderr)
        except Exception:
            self.handleError(record)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _configure_extract_logger(verbose: bool) -> None:
    """Set up the module logger for CLI output, leaving global logging alone.

    Args:
        verbose: When true the logger emits ``INFO`` and above as bare
            messages; otherwise only ``WARNING`` and above, prefixed with
            the level name.
    """
    if verbose:
        threshold, layout = logging.INFO, "%(message)s"
    else:
        threshold, layout = logging.WARNING, "%(levelname)s: %(message)s"

    LOGGER.setLevel(threshold)
    # Keep records from reaching handlers on ancestor loggers.
    LOGGER.propagate = False

    # Drop handlers left over from an earlier call so output is not duplicated.
    for stale in tuple(LOGGER.handlers):
        LOGGER.removeHandler(stale)

    echo_handler = _ClickEchoHandler()
    echo_handler.setLevel(threshold)
    echo_handler.setFormatter(logging.Formatter(layout))
    LOGGER.addHandler(echo_handler)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------
|
|
198
|
+
# Constants
|
|
199
|
+
# ---------------------------------------------------------------------
|
|
200
|
+
# Atom names treated as protein main-chain atoms: the heavy atoms followed by
# the hydrogen names listed alongside them.
BACKBONE_ATOMS: Set[str] = set(
    ("N", "C", "O", "CA", "OXT")
    + ("H", "H1", "H2", "H3", "HN", "HA", "HA2", "HA3")
)
# With --exclude-backbone enabled the full main-chain set is removed.
# This is an alias of BACKBONE_ATOMS (the same set object), not a copy.
BACKBONE_ALL: Set[str] = BACKBONE_ATOMS
|
|
206
|
+
|
|
207
|
+
# Unified amino-acid dictionary: resname -> nominal integer charge
|
|
208
|
+
# (membership checks throughout the code use dictionary keys)
|
|
209
|
+
AMINO_ACIDS: Dict[str, int] = {
    # Maps residue name -> nominal integer charge. Each key must appear only
    # once: a repeated key in a dict literal silently overwrites the earlier
    # entry.

    # --- Standard 20 (L) ---
    "ALA": 0, "ARG": +1, "ASN": 0, "ASP": -1, "CYS": 0,
    "GLU": -1, "GLN": 0, "GLY": 0, "HIS": 0, "ILE": 0,
    "LEU": 0, "LYS": +1, "MET": 0, "PHE": 0, "PRO": 0,
    "SER": 0, "THR": 0, "TRP": 0, "TYR": 0, "VAL": 0,

    # --- Canonical extras ---
    "SEC": 0,   # selenocysteine
    "PYL": +1,  # pyrrolysine

    # --- Protonation / tautomers (Amber/CHARMM style) ---
    "HIP": +1,  # fully protonated His
    "HID": 0,   # Nδ-protonated His
    "HIE": 0,   # Nε-protonated His
    "ASH": 0,   # neutral Asp
    "GLH": 0,   # neutral Glu
    "LYN": 0,   # neutral Lys
    "ARN": 0,   # neutral Arg
    "TYM": -1,  # deprotonated Tyr (phenolate)

    # --- Phosphorylated residues ---
    "SEP": -2, "TPO": -2, "PTR": -2,
    "S1P": -1, "T1P": -1, "Y1P": -1,  # monoanionic phospho-Ser/Thr/Tyr

    # --- Phosphorylated histidines (phosaa19SB) ---
    "H1D": 0,   # ND1-phospho-His, neutral
    "H2D": -1,  # ND1-phospho-His, anionic
    "H1E": 0,   # NE2-phospho-His, neutral
    "H2E": -1,  # NE2-phospho-His, anionic

    # --- Cys family ---
    "CYX": 0,   # disulfide Cys
    "CSO": 0,   # Cys sulfenic acid
    "CSD": -1,  # Cys sulfinic acid
    "CSX": 0,   # generic Cys derivative
    "OCS": -1,  # cysteic acid
    "CYM": -1,  # deprotonated Cys

    # --- Lys variants / carboxylation ---
    "MLY": +1, "LLP": +1, "DLY": +1,
    "KCX": -1,  # Lysine Nz-Carboxylic Acid

    # --- D isomers (19 residues) ---
    # "DLY" (+1) belongs to this group as well, but it is already declared
    # with the Lys variants above, so it is not repeated here.
    "DAL": 0, "DAR": +1, "DSG": 0, "DAS": -1, "DCY": 0,
    "DGN": 0, "DGL": -1, "DHI": 0, "DIL": 0, "DLE": 0,
    "MED": 0, "DPN": 0, "DPR": 0, "DSN": 0,
    "DTH": 0, "DTR": 0, "DTY": 0, "DVA": 0,

    # --- Carboxylation / cyclization / others ---
    "CGU": -2,  # gamma-carboxy-glutamate
    "CGA": -1,  # carboxymethylated glutamate
    "PCA": 0,   # pyroglutamate
    "MSE": 0,   # selenomethionine
    "OMT": 0,   # methionine sulfone

    # --- Other modified residues possibly encountered ---
    "ASA": 0, "CIR": 0, "FOR": 0, "MVA": 0, "IIL": 0, "AIB": 0, "HTN": 0,
    "SAR": 0, "NMC": 0, "PFF": 0, "NFA": 0, "ALY": 0, "AZF": 0, "CNX": 0, "CYF": 0,

    # --- Hydroxyproline ---
    "HYP": 0,

    # --- All C-terminus ---
    "CALA": -1, "CARG": 0, "CASN": -1, "CASP": -2, "CCYS": -1,
    "CCYX": -1, "CGLN": -1, "CGLU": -2, "CGLY": -1, "CHID": -1,
    "CHIE": -1, "CHIP": 0, "CHYP": -1, "CILE": -1, "CLEU": -1,
    "CLYS": 0, "CMET": -1, "CPHE": -1, "CPRO": -1, "CSER": -1,
    "CTHR": -1, "CTRP": -1, "CTYR": -1, "CVAL": -1, "NHE": 0,
    "NME": 0,
    "CTER": -1,  # generic C-terminus

    # --- All N-terminus ---
    "NALA": +1, "NARG": +2, "NASN": +1, "NASP": 0, "NCYS": +1,
    "NCYX": +1, "NGLN": +1, "NGLU": 0, "NGLY": +1, "NHID": +1,
    "NHIE": +1, "NHIP": +2, "NILE": +1, "NLEU": +1, "NLYS": +2,
    "NMET": +1, "NPHE": +1, "NPRO": +1, "NSER": +1, "NTHR": +1,
    "NTRP": +1, "NTYR": +1, "NVAL": +1, "ACE": 0,
    "NTER": +1,  # generic N-terminus
}
|
|
289
|
+
|
|
290
|
+
# Common ions (by residue name) and their formal charges
|
|
291
|
+
# Formal charges of common ions, keyed by residue name. The mapping is built
# from one group per charge; groups are unpacked in order, so key order matches
# a flat literal listing the same names in sequence.
ION: Dict[str, int] = {
    # +1
    **dict.fromkeys(
        (
            "LI", "NA", "K", "RB", "CS", "TL", "AG", "CU1",
            "Ag", "K+", "Na+", "NH4", "H3O+", "HE+", "HZ+", "Tl",
        ),
        +1,
    ),
    # +2
    **dict.fromkeys(
        (
            "MG", "CA", "SR", "BA", "MN", "FE2", "CO", "NI",
            "CU", "ZN", "CD", "HG", "PB", "Be", "PD", "PT",
            "Sn", "Ra", "YB2", "V2+",
        ),
        +2,
    ),
    # +3
    **dict.fromkeys(
        (
            "FE", "AU3", "AL", "GA", "IN",
            "CE", "Ce", "CR", "Cr", "Dy", "EU", "EU3", "Er",
            "GD3", "LA", "LU", "Nd", "PR", "SM", "Sm", "TB",
            "Tm", "Y", "Pu",
        ),
        +3,
    ),
    # +4
    **dict.fromkeys(("U4+", "Th", "Hf", "Zr"), +4),
    # -1
    **dict.fromkeys(("F", "CL", "BR", "I", "Cl-", "IOD"), -1),
}
|
|
313
|
+
|
|
314
|
+
# Distance thresholds, in Å.
DISULFIDE_CUTOFF = 2.5  # Sγ–Sγ (SG–SG) cutoff
EXACT_EPS = 0.001  # tolerance for exact match
# Residue names classified as water.
WATER_RES = set("HOH WAT H2O DOD TIP TIP3 SOL".split())

# Cross-structure residue identity:
# (chain, hetflag, resseq, icode, resname)
ResidueKey = Tuple[str, str, int, str, str]
|
|
320
|
+
|
|
321
|
+
# ---------------------------------------------------------------------
|
|
322
|
+
# Helpers
|
|
323
|
+
# ---------------------------------------------------------------------
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _extract_short_help() -> str:
    """Return the condensed usage text for ``mlmm extract`` as one newline-joined string."""
    # NOTE(review): the spacing inside the option rows is reproduced exactly as it
    # appears in the reviewed copy; confirm column alignment against the repository.
    usage_lines = (
        "Usage: mlmm extract [OPTIONS]",
        "",
        "Extract a binding pocket around substrate residues.",
        "",
        "Core options:",
        " -i, --input PATH [PATH ...] Input complex PDB file(s).",
        " -c, --center SPEC Substrate selector (PDB / residue IDs / residue names).",
        " -o, --output PATH [PATH ...] Output pocket PDB path(s).",
        " -r, --radius FLOAT Pocket inclusion radius in angstrom.",
        " -l, --ligand-charge VALUE Total or mapped ligand charge.",
        " --help-advanced Show full extract options and exit.",
        "",
        "Use '--help-advanced' to see all extractor options.",
    )
    return "\n".join(usage_lines)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# ── Native Click command ─────────────────────────────────────────────
|
|
347
|
+
|
|
348
|
+
# Every option token the extract command knows about; any of these ends a run of
# variadic values collected by _gather_extract_variadic.
_EXTRACT_ALL_FLAGS = (
    "-i", "--input",
    "-c", "--center",
    "-o", "--output",
    "-r", "--radius",
    "--radius-het2het",
    "--include-H2O", "--include-h2o", "--no-include-H2O", "--no-include-h2o",
    "--exclude-backbone", "--no-exclude-backbone",
    "--add-linkH", "--no-add-linkH",
    "--selected-resn",
    "-l",
    "--ligand-charge",
    "-v", "--verbose", "--no-verbose",
    "-h", "--help", "--help-advanced",
)


def _gather_extract_variadic(
    ctx_args: List[str],
    flag_names: Sequence[str],
) -> List[str]:
    """
    Collect the values that follow any of *flag_names* in *ctx_args*.

    Scanning starts after each occurrence of a token from *flag_names* and keeps
    appending tokens until one listed in ``_EXTRACT_ALL_FLAGS`` is met. That
    terminating token is itself re-examined, so it can open a new run when it is
    also one of *flag_names*.

    Returns
    -------
    list[str]
        Collected values in their original order (empty when no flag is present).
    """
    wanted = set(flag_names)
    terminators = set(_EXTRACT_ALL_FLAGS)
    collected: List[str] = []
    capturing = False
    for token in ctx_args:
        if capturing and token not in terminators:
            collected.append(token)
            continue
        # Either we were idle, or a known flag just closed the current run;
        # in both cases this token decides whether a (new) run begins.
        capturing = token in wanted
    return collected
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@click.command(
    name="extract",
    help=(
        "Extract a binding pocket around substrate residues (from a PDB or "
        "residue IDs/names), with biochemically aware truncation and optional "
        "link-H; supports multi-structure input and multi-MODEL output."
    ),
    context_settings={
        "help_option_names": ["-h", "--help"],
        "ignore_unknown_options": True,
        "allow_extra_args": True,
    },
)
@click.option(
    "-i", "--input", "complex_pdb",
    type=str, multiple=True, required=True,
    help="Protein-substrate complex PDB(s). If multiple, they must have identical atom counts and ordering.",
)
@click.option(
    "-c", "--center", "substrate_pdb",
    type=str, required=True,
    help=(
        "Substrate specification: a PDB path, a comma/space-separated residue-ID list "
        "like '123,124' or 'A:123,B:456' (insertion codes supported), "
        "or a residue-name list like 'GPP,SAM'."
    ),
)
@click.option(
    "-o", "--output", "output_pdb",
    type=str, multiple=True, default=(),
    help=(
        "Output PDB path(s). One path for multi-MODEL PDB, or N paths for per-file output. "
        "If omitted: single input -> pocket.pdb; multiple inputs -> pocket_{filename}.pdb."
    ),
)
@click.option(
    "-r", "--radius",
    type=float, default=2.6, show_default=True,
    help="Cutoff (angstrom) around substrate atoms for pocket inclusion.",
)
@click.option(
    "--radius-het2het",
    type=float, default=0, show_default=True,
    help="Cutoff (angstrom) for substrate-protein hetero-atom proximity (non-C/H). 0 disables.",
)
@click.option(
    "--include-H2O/--no-include-H2O",
    "include_H2O",
    default=True, show_default=True,
    help="Include waters (HOH/WAT/H2O/DOD/TIP/TIP3/SOL).",
)
@click.option(
    "--exclude-backbone/--no-exclude-backbone",
    default=True, show_default=True,
    help="Delete main-chain atoms from non-substrate amino acids.",
)
@click.option(
    "--add-linkH/--no-add-linkH",
    "add_linkH",
    default=False, show_default=True,
    help="Add carbon-only link-H at 1.09 angstrom along cut-bond directions.",
)
@click.option(
    "--selected-resn",
    type=str, default="",
    help="Comma/space-separated residue IDs to force-include.",
)
@click.option(
    "-l",
    "--ligand-charge",
    type=str, default=None,
    help="Total charge number or per-resname mapping like 'GPP:-3,SAM:1'.",
)
@click.option(
    "-v", "--verbose/--no-verbose",
    default=True, show_default=True,
    help="Enable INFO-level logging.",
)
@click.pass_context
def cli(
    ctx: click.Context,
    complex_pdb: Sequence[str],
    substrate_pdb: str,
    output_pdb: Sequence[str],
    radius: float,
    radius_het2het: float,
    include_H2O: bool,
    exclude_backbone: bool,
    add_linkH: bool,
    selected_resn: str,
    ligand_charge: Optional[str],
    verbose: bool,
) -> None:
    # Click entry point: merge the parsed options with any extra values recovered
    # from ctx.args, pack them into an argparse.Namespace and hand off to extract().
    #
    # The extra-args scan is meant to support the space-separated syntax
    # ``-i a.pdb b.pdb`` in addition to the repeated form ``-i a.pdb -i b.pdb``.
    # NOTE(review): the scan only finds values when the '-i'/'-o' flag tokens are
    # themselves present in ctx.args — confirm that holds for how this command is
    # invoked.
    all_inputs = list(complex_pdb)
    all_inputs.extend(_gather_extract_variadic(ctx.args, ("-i", "--input")))

    all_outputs = list(output_pdb)
    all_outputs.extend(_gather_extract_variadic(ctx.args, ("-o", "--output")))

    extract(
        argparse.Namespace(
            complex_pdb=all_inputs,
            substrate_pdb=substrate_pdb,
            # None (not an empty list) signals "no output path given".
            output_pdb=all_outputs or None,
            radius=radius,
            radius_het2het=radius_het2het,
            include_H2O=include_H2O,
            exclude_backbone=exclude_backbone,
            add_linkH=add_linkH,
            selected_resn=selected_resn,
            ligand_charge=ligand_charge,
            verbose=verbose,
        )
    )
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _build_arg_parser(*, prog: str) -> argparse.ArgumentParser:
    """
    Build the argparse parser for the pocket-extraction options.

    Parameters
    ----------
    prog : str
        Program name shown in usage/help output.

    Returns
    -------
    argparse.ArgumentParser
        Parser whose namespace carries ``complex_pdb``, ``substrate_pdb``,
        ``output_pdb``, ``radius``, ``radius_het2het``, ``include_H2O``,
        ``exclude_backbone``, ``add_linkH``, ``selected_resn``,
        ``ligand_charge`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        prog=prog,
        description=(
            "Extract a binding pocket around substrate residues (from a PDB or residue IDs/names), "
            "with biochemically aware truncation and optional link‑H; supports multi‑structure input "
            "and multi‑MODEL output. Also logs pocket charge summary."
        ),
    )
    add = parser.add_argument
    # BooleanOptionalAction also generates the matching ``--no-<flag>`` forms.
    on_off = argparse.BooleanOptionalAction

    # ---- structures in / out ----
    add(
        "-i", "--input",
        dest="complex_pdb",
        required=True,
        nargs="+",
        metavar="complex.pdb",
        help="Protein–substrate complex PDB(s). If multiple, they must have identical atom counts and ordering.",
    )
    add(
        "-c", "--center",
        dest="substrate_pdb",
        required=True,
        metavar="substrate.pdb | '123,124' | 'A:123,B:456' | 'GPP,MMT'",
        help=(
            "Substrate specification: either a PDB containing exactly the substrate residue(s), "
            "a comma/space‑separated residue‑ID list like '123,124' or 'A:123,B:456' "
            "(insertion codes supported: '123A' / 'A:123A'), "
            "or a comma/space‑separated **residue‑name** list like 'GPP,MMT'. "
            "When residue names are used and multiple residues share a name, all are used and a WARNING is logged."
        ),
    )
    add(
        "-o", "--output",
        dest="output_pdb",
        required=False,
        nargs="+",
        metavar="pocket.pdb",
        default=None,
        help=(
            "Output PDB path(s). Provide one path to write a single multi‑MODEL PDB, "
            "or provide N paths where N == number of inputs to write N single‑model PDBs (one per input, in order). "
            "If omitted: single input → pocket.pdb; multiple inputs → pocket_{original_filename}.pdb."
        ),
    )

    # ---- selection radii ----
    add(
        "-r", "--radius",
        type=float,
        default=2.6,
        help=(
            "Cutoff (Å) around substrate atoms. With --exclude-backbone true (default), an **amino-acid** "
            "neighbor must have a **non-backbone** atom within this distance; otherwise **any atom** suffices. "
            "(default: 2.6)"
        ),
    )
    add(
        "--radius-het2het",
        type=float,
        default=0,
        help=(
            "Cutoff (Å) for substrate–protein hetero‑atom proximity (non‑C/H on both sides); "
            "applied independently of --radius. 0 conceptually disables this rule, "
            "but is internally treated as 0.001 Å. (default: 0)"
        ),
    )

    # ---- on/off switches ----
    add(
        "--include-H2O", "--include-h2o",
        dest="include_H2O",
        action=on_off,
        default=True,
        help="Include waters (HOH/WAT/TIP3/SOL). (default: True)",
    )
    add(
        "--exclude-backbone",
        dest="exclude_backbone",
        action=on_off,
        default=True,
        help="Delete main‑chain atoms (N, H*, CA, HA*, C, O) from non‑substrate amino acids; PRO/HYP keep N, CA, HA, H*. (default: True)",
    )
    add(
        "--add-linkH",
        dest="add_linkH",
        action=on_off,
        default=False,
        help="Add carbon‑only link‑H at 1.09 Å along cut‑bond directions; appended after a TER as HL/LKH HETATM records. (default: False)",
    )

    # ---- extra residues / charges ----
    add(
        "--selected-resn",
        dest="selected_resn",
        required=False,
        default="",
        help=(
            "Comma/space‑separated residue IDs to force‑include (e.g., '123,124', 'A:123,B:456'; "
            "insertion codes allowed: '123A' / 'A:123A')."
        ),
    )
    add(
        "-l", "--ligand-charge",
        type=str,
        default=None,
        help=(
            "Either a single **number** giving the **total** charge to distribute across unknown residues "
            "(preferring unknown substrate), or a comma/space‑separated **per‑resname** list like "
            "'GPP:-3,MMT:-1'. In mapping mode, any other unknown residues remain 0."
        ),
    )

    # ---- logging ----
    add(
        "-v", "--verbose",
        dest="verbose",
        action=on_off,
        default=True,
        help=(
            "Enable INFO-level logging."
            " default: True."
        ),
    )
    return parser
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Parse CLI arguments.

    Help handling is applied before argparse sees the arguments:

    * ``-h`` / ``--help`` without ``--help-advanced`` prints the short help from
      ``_extract_short_help()`` and exits with status 0.
    * ``--help-advanced`` is removed and replaced by ``--help`` (unless a help
      flag is already present), so argparse prints its full option listing.

    Parameters
    ----------
    argv : Optional[Sequence[str]]
        Command-line arguments to parse. If None, uses ``sys.argv[1:]``; the
        help handling above applies in that case as well.

    Returns
    -------
    argparse.Namespace
        Parameters for running the pocket extraction.

    Raises
    ------
    SystemExit
        After printing the short help, or from argparse on help/usage errors.
    """
    import sys  # local import: only needed to resolve the default argument list

    # Resolve the default here rather than inside argparse so that the custom
    # help flags are also recognised when the caller passes argv=None.
    argv_list = list(sys.argv[1:] if argv is None else argv)

    wants_adv = "--help-advanced" in argv_list
    wants_help = ("--help" in argv_list) or ("-h" in argv_list)

    if wants_help and not wants_adv:
        click.echo(_extract_short_help())
        raise SystemExit(0)

    if wants_adv:
        # argparse does not know this flag; translate it into a plain --help.
        argv_list = [a for a in argv_list if a != "--help-advanced"]
        if not wants_help:
            argv_list.append("--help")

    p = _build_arg_parser(prog="mlmm extract")
    return p.parse_args(args=argv_list)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def load_structure(path: str, name: str) -> PDB.Structure.Structure:
    """
    Parse the PDB file at *path* into a Biopython Structure labelled *name*.

    The parser is created with ``QUIET=True``.
    """
    return PDB.PDBParser(QUIET=True).get_structure(name, path)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
# ---------------------------------------------------------------------
|
|
633
|
+
# Formatting helpers (for logging / API)
|
|
634
|
+
# ---------------------------------------------------------------------
|
|
635
|
+
|
|
636
|
+
def _fmt_res_id(res: PDB.Residue.Residue) -> str:
    """
    Build a short label for *res*: ``'<chain>:<resseq><icode> <resname>'``.

    The ``'<chain>:'`` prefix is dropped when the parent chain id is empty, and
    a blank insertion code (``' '``) is omitted, e.g. ``'A:123A SER'`` or ``'123 SER'``.
    """
    chain_id = res.get_parent().id or ""
    _hetflag, seq_number, insertion = res.id
    prefix = f"{chain_id}:" if chain_id else ""
    suffix = insertion if insertion != " " else ""
    return f"{prefix}{seq_number}{suffix} {res.get_resname()}"
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def _fmt_fid(structure, fid: Tuple) -> str:
    """
    Look up the residue addressed by full-id *fid* and return its compact tag.

    ``fid[1]``, ``fid[2]`` and ``fid[3]`` are used as the model, chain and
    residue keys; the residue is then formatted with ``_fmt_res_id``.
    """
    chain = structure[fid[1]][fid[2]]
    return _fmt_res_id(chain.child_dict[fid[3]])
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
# ---------------------------------------------------------------------
|
|
656
|
+
# Substrate matching
|
|
657
|
+
# ---------------------------------------------------------------------
|
|
658
|
+
|
|
659
|
+
def is_exact_match(lig_atoms: Dict[str, PDB.Vector.Vector],
                   cand: PDB.Residue.Residue) -> bool:
    """
    Tell whether *cand* carries every ligand atom at (nearly) the same position.

    A ligand atom disqualifies the candidate when its name is missing from
    *cand* or when the two positions differ by more than ``EXACT_EPS``.
    Checking stops at the first disqualifying atom.
    """
    return not any(
        atom_name not in cand
        or (position - cand[atom_name].get_vector()).norm() > EXACT_EPS
        for atom_name, position in lig_atoms.items()
    )
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
def find_substrate_residues(complex_struct, substrate_struct) -> List[PDB.Residue.Residue]:
    """
    Locate each residue of *substrate_struct* inside *complex_struct*.

    A complex residue qualifies when it has the same residue name, the same
    number of atoms, and passes ``is_exact_match``. Only the first qualifying
    residue is kept for each substrate residue.

    Returns
    -------
    list[Bio.PDB.Residue.Residue]
        One complex residue per substrate residue, in substrate order.

    Raises
    ------
    ValueError
        If some substrate residue has no exact match in the complex.
    """
    matched: List[PDB.Residue.Residue] = []
    for lig in list(substrate_struct.get_residues()):
        lig_name = lig.get_resname()
        lig_atoms = {atom.get_name(): atom.get_vector() for atom in lig}

        hit = next(
            (
                cand
                for cand in complex_struct.get_residues()
                if cand.get_resname() == lig_name
                and len(cand) == len(lig_atoms)
                and is_exact_match(lig_atoms, cand)
            ),
            None,
        )
        if hit is not None:
            matched.append(hit)
            continue

        # No candidate matched: describe the offending substrate residue.
        chain_id = lig.get_full_id()[2] if len(lig.get_full_id()) > 2 else ""
        resseq = lig.id[1]
        icode = lig.id[2] if len(lig.id) > 2 else " "
        icode_str = "" if icode == " " else icode
        raise ValueError(
            f"Exact match not found for substrate residue {lig_name} chain {chain_id} {resseq}{icode_str}"
        )
    return matched
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
# ---------- Residue‑ID–based substrate selection ----------
|
|
699
|
+
|
|
700
|
+
_RES_TOKEN_RE = re.compile(r"""
    ^\s*
    (?:(?P<chain>[^:\s,]+)\s*:\s*)?   # optional chain like A or A_long
    (?P<resseq>\d+)                   # residue sequence number
    (?P<icode>[A-Za-z]?)              # optional insertion code (single letter)
    \s*$
""", re.VERBOSE)


def _parse_res_tokens(spec: str) -> List[Tuple[str | None, int, str | None]]:
    """
    Parse a residue specification string into (chain, resseq, icode) tuples.

    *spec* is split on commas and/or whitespace; each token must look like
    ``'123'``, ``'123A'``, ``'A:123'`` or ``'A:123A'``. A missing chain or
    insertion code is reported as ``None``.

    Raises
    ------
    ValueError
        If *spec* is empty/blank or any token does not match the pattern.
    """
    if not spec or not spec.strip():
        raise ValueError("Empty -c/--center specification.")
    tokens = [t.strip() for t in re.split(r"[,\s]+", spec) if t.strip()]
    parsed: List[Tuple[str | None, int, str | None]] = []
    for tok in tokens:
        m = _RES_TOKEN_RE.match(tok)
        if not m:
            raise ValueError(
                f"Invalid residue specifier '{tok}'. Use '123', '123A', 'A:123', or 'A:123A'."
            )
        # An empty insertion-code group means "not specified".
        parsed.append((m.group("chain"), int(m.group("resseq")), m.group("icode") or None))
    return parsed


def find_substrate_by_idspec(complex_struct, spec: str) -> List[PDB.Residue.Residue]:
    """
    Resolve a comma/space-separated residue list into residues within the complex.

    Matching rules
    --------------
    * Chain may be omitted (matches all chains).
    * Insertion code may be omitted (matches any insertion code for that resseq).
    * A residue matched by several tokens is returned once, at the position of
      the first token that matched it.

    Returns
    -------
    list[Bio.PDB.Residue.Residue]

    Raises
    ------
    ValueError
        If *spec* cannot be parsed, or a token matches no residue at all.
    """
    targets = _parse_res_tokens(spec)
    found: List[PDB.Residue.Residue] = []
    seen: Set[Tuple] = set()

    for chain_req, resseq_req, icode_req in targets:
        # Track "did this token match anything" separately from de-duplication:
        # a token whose residues were already collected by an earlier token
        # (e.g. '10, A:10' or a repeated '10,10') is not a lookup failure.
        matched_any = False
        for model in complex_struct:
            for chain in model:
                if chain_req is not None and chain.id != chain_req:
                    continue
                for res in chain.get_residues():
                    _, resseq, icode = res.id
                    if resseq != resseq_req:
                        continue
                    if icode_req is not None and icode != icode_req:
                        continue
                    matched_any = True
                    fid = res.get_full_id()
                    if fid not in seen:
                        seen.add(fid)
                        found.append(res)
        if not matched_any:
            chain_txt = f"{chain_req}:" if chain_req is not None else ""
            icode_txt = icode_req or ""
            raise ValueError(f"Residue '{chain_txt}{resseq_req}{icode_txt}' not found in complex.")

    return found
|
|
769
|
+
|
|
770
|
+
# ---------- Residue-name-based substrate selection ----------
|
|
771
|
+
|
|
772
|
+
def find_substrate_by_resname(complex_struct, spec: str) -> List[PDB.Residue.Residue]:
    """
    Collect every residue of the complex whose name appears in *spec*.

    *spec* is a comma/space-separated list of residue names (e.g. 'GPP,MMT').

    Behavior
    --------
    * Names are compared case-insensitively (both sides upper-cased).
    * When a name matches more than one residue, all of them are kept and a
      WARNING listing up to five of them is logged.
    * A residue is returned only once even if several names select it.

    Raises
    ------
    ValueError
        If *spec* is empty/blank or a name matches no residue.
    """
    if not (spec and spec.strip()):
        raise ValueError("Empty -c/--center specification.")

    # Splitting on commas/whitespace leaves only empty strings to discard.
    wanted_names = [piece.upper() for piece in re.split(r"[,\s]+", spec) if piece]

    found: List[PDB.Residue.Residue] = []
    seen_fids: Set[Tuple] = set()
    for rn in wanted_names:
        matches = [
            residue
            for residue in complex_struct.get_residues()
            if residue.get_resname().upper() == rn
        ]
        if not matches:
            raise ValueError(f"Residue name '{rn}' not found in complex.")

        if len(matches) > 1:
            # Formatting the sample is best-effort; it must not block selection.
            try:
                sample = ", ".join(_fmt_res_id(r) for r in matches[:5])
            except Exception:
                sample = "(list omitted)"
            LOGGER.warning("[extract] Multiple residues with resname '%s' found (%d). Using all: %s",
                           rn, len(matches), sample)

        for residue in matches:
            fid = residue.get_full_id()
            if fid in seen_fids:
                continue
            seen_fids.add(fid)
            found.append(residue)
    return found
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
def resolve_substrate_residues(complex_struct, center_spec: str) -> List[PDB.Residue.Residue]:
    """
    Determine substrate residues from a PDB path, residue-ID list, or residue-name list.

    Resolution order
    ----------------
    1. If *center_spec* names an existing path, load it as a substrate PDB and
       match its residues by exact coordinates.
    2. Otherwise, if it parses as a residue-ID list, look the IDs up.
    3. If it does not parse as IDs — or the ID lookup finds nothing — interpret
       it as a residue-name list (this keeps numeric-looking names resolvable).

    Raises
    ------
    ValueError
        If nothing can be resolved. When the spec parsed as an ID list and both
        the ID lookup and the name fallback fail, the ID-lookup error is the one
        reported, since that is how the spec was interpreted.
    """
    if os.path.exists(center_spec):
        substrate_struct = load_structure(center_spec, "substrate")
        return find_substrate_residues(complex_struct, substrate_struct)

    try:
        _parse_res_tokens(center_spec)
    except ValueError:
        # Not an ID list: interpret as residue names (e.g., 'GPP,MMT').
        return find_substrate_by_resname(complex_struct, center_spec)

    try:
        return find_substrate_by_idspec(complex_struct, center_spec)
    except ValueError as id_error:
        # Keep the long-standing fallback to residue names, but do not let its
        # failure replace the more relevant "residue ID not found" message.
        try:
            return find_substrate_by_resname(complex_struct, center_spec)
        except ValueError:
            raise id_error from None
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
# ---------------------------------------------------------------------
|
|
822
|
+
# Polypeptide adjacency (C–N) helper
|
|
823
|
+
# ---------------------------------------------------------------------
|
|
824
|
+
|
|
825
|
+
def are_peptide_adjacent(prev_res: PDB.Residue.Residue,
                         next_res: PDB.Residue.Residue,
                         max_cn_dist: float = 1.9) -> bool:
    """
    Decide whether *prev_res* and *next_res* are joined by a peptide bond.

    Both residues must be named in ``AMINO_ACIDS``, *prev_res* must have a ``C``
    atom and *next_res* an ``N`` atom, and the C–N distance must not exceed
    *max_cn_dist*.

    Notes
    -----
    The test is purely geometric. Missing atoms, a failing distance
    computation, a NaN distance, or a long C–N separation all yield False.
    """
    both_amino = (
        prev_res.get_resname() in AMINO_ACIDS
        and next_res.get_resname() in AMINO_ACIDS
    )
    if not both_amino:
        return False
    if not ("C" in prev_res and "N" in next_res):
        return False

    try:
        carbonyl_c = prev_res["C"].get_vector()
        amide_n = next_res["N"].get_vector()
        cn_dist = (carbonyl_c - amide_n).norm()
    except Exception:
        return False

    # NaN is the only value that is not equal to itself.
    return (cn_dist == cn_dist) and (cn_dist <= max_cn_dist)
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
# ---------------------------------------------------------------------
|
|
848
|
+
# Residue selection around the substrate
|
|
849
|
+
# ---------------------------------------------------------------------
|
|
850
|
+
|
|
851
|
+
def select_residues(complex_struct,
                    substrate_res_list: List[PDB.Residue.Residue],
                    r_as: float,
                    r_het: float,
                    include_h2o: bool,
                    exclude_backbone: bool) -> Tuple[Set[Tuple], Set[Tuple]]:
    """
    Pick the residues that make up the pocket around the substrate.

    Selection rule
    --------------
    * Substrate residues are always selected.
    * General cutoff (`r_as`): a residue is selected when one of its atoms lies
      within `r_as` of a substrate atom.
    * Hetero cutoff (`r_het`): a residue is selected when one of its non-C/H
      atoms lies within `r_het` of a non-C/H substrate atom.
    * With `exclude_backbone` **True**, backbone atoms (names in
      ``BACKBONE_ATOMS``) of amino-acid residues never trigger selection under
      either cutoff; other residues are unaffected.
    * With `include_h2o` **False**, residues named in ``WATER_RES`` are never
      added by either cutoff.

    Returns
    -------
    (selected_ids, backbone_contact_ids)
        selected_ids : set of residue full-ids to output
        backbone_contact_ids : full-ids of amino-acid residues that were selected
                               through a backbone atom (can only be non-empty when
                               exclude_backbone is False).
    """
    substrate_atoms = [atom for lig in substrate_res_list for atom in lig]
    substrate_het = [atom for atom in substrate_atoms if atom.element not in ("C", "H")]
    ns = NeighborSearch(list(complex_struct.get_atoms()))

    selected_ids: Set[Tuple] = {res.get_full_id() for res in substrate_res_list}
    backbone_contact_ids: Set[Tuple] = set()

    def register(neigh: PDB.Atom.Atom) -> None:
        """Apply the backbone/water filters to *neigh* and record its residue."""
        residue = neigh.get_parent()
        resname = residue.get_resname()
        is_amino = resname in AMINO_ACIDS
        on_backbone = neigh.get_name() in BACKBONE_ATOMS

        if exclude_backbone and is_amino and on_backbone:
            return  # amino acids must be reached through a non-backbone atom
        if resname in WATER_RES and not include_h2o:
            return

        fid = residue.get_full_id()
        selected_ids.add(fid)
        if on_backbone and is_amino:
            backbone_contact_ids.add(fid)

    # General cutoff: every atom type counts.
    for atom in substrate_atoms:
        for neigh in ns.search(atom.get_coord(), r_as):
            register(neigh)

    # Hetero cutoff: both partners must be something other than C or H.
    for atom in substrate_het:
        for neigh in ns.search(atom.get_coord(), r_het):
            if neigh.element not in ("C", "H"):
                register(neigh)

    return selected_ids, backbone_contact_ids
|
|
918
|
+
|
|
919
|
+
|
|
920
|
+
# ---------------------------------------------------------------------
|
|
921
|
+
# Disulfide augmentation
|
|
922
|
+
# ---------------------------------------------------------------------
|
|
923
|
+
|
|
924
|
+
def augment_disulfides(structure, selected_ids: Set[Tuple],
                       cutoff: float = DISULFIDE_CUTOFF):
    """
    Pull disulfide partners into *selected_ids* (modified in place).

    SG atoms are gathered from residues named CYS or CYX. For every pair of
    distinct SG atoms found within *cutoff* of each other, both residues are
    selected if at least one of them already is. Returns None.
    """
    sg_atoms = []
    for residue in structure.get_residues():
        if residue.get_resname() in {"CYS", "CYX"} and "SG" in residue:
            sg_atoms.append(residue["SG"])

    if not sg_atoms:
        return

    ns = NeighborSearch(sg_atoms)
    for sg in sg_atoms:
        own_fid = sg.get_parent().get_full_id()
        for partner in ns.search(sg.get_coord(), cutoff):
            if partner is sg:
                continue  # the search also returns the query atom itself
            partner_fid = partner.get_parent().get_full_id()
            if own_fid in selected_ids or partner_fid in selected_ids:
                selected_ids.add(own_fid)
                selected_ids.add(partner_fid)
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
# ---------------------------------------------------------------------
|
|
947
|
+
# Proline augmentation (N-side neighbor inclusion; TER-aware)
|
|
948
|
+
# ---------------------------------------------------------------------
|
|
949
|
+
|
|
950
|
+
def augment_proline_prev_neighbor(structure, selected_ids: Set[Tuple]):
    """
    Add the N-side neighbor of every selected PRO to *selected_ids* (in place).

    For each selected residue named PRO, the closest preceding residue in the
    chain that is named in ``AMINO_ACIDS`` is added, provided the two pass
    ``are_peptide_adjacent``. A PRO with no such neighbor is left alone.

    Notes
    -----
    Only residues selected when the function is entered are examined; residues
    added here are not themselves re-checked. The number of additions is logged
    at INFO level when non-zero.
    """
    newly_added = 0
    for fid in tuple(selected_ids):  # snapshot: the set grows while we iterate
        chain = structure[fid[1]][fid[2]]
        pro: PDB.Residue.Residue = chain.child_dict[fid[3]]
        if pro.get_resname() != "PRO":
            continue

        ordered: List[PDB.Residue.Residue] = list(chain.get_residues())
        position = next(
            (i for i, r in enumerate(ordered) if r.get_full_id() == fid),
            None,
        )
        if position is None:
            continue

        # Walk backwards from the proline to the nearest amino-acid residue.
        upstream = next(
            (r for r in reversed(ordered[:position]) if r.get_resname() in AMINO_ACIDS),
            None,
        )
        if upstream is None or not are_peptide_adjacent(upstream, pro):
            continue

        upstream_fid = upstream.get_full_id()
        if upstream_fid in selected_ids:
            continue
        selected_ids.add(upstream_fid)
        newly_added += 1

    if newly_added:
        LOGGER.info("[extract] Added %d N-side neighbor residues for PRO (TER-aware).", newly_added)
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
# ---------------------------------------------------------------------
|
|
990
|
+
# Backbone-contact neighbor augmentation (exclude_backbone == False; TER-aware)
|
|
991
|
+
# ---------------------------------------------------------------------
|
|
992
|
+
|
|
993
|
+
def augment_backbone_contact_neighbors(structure,
                                       selected_ids: Set[Tuple],
                                       backbone_contact_ids: Set[Tuple],
                                       substrate_ids: Set[Tuple]) -> Tuple[Set[Tuple], Set[Tuple]]:
    """
    Give backbone-contact residues their peptide-bonded neighbors, or mark them as termini.

    For every full-id in *backbone_contact_ids* that is not a substrate residue,
    the nearest amino-acid residue on each side (in chain order) is looked up:

    * if it exists and passes ``are_peptide_adjacent``, it is added to
      *selected_ids* (modified in place);
    * otherwise the residue is recorded as needing its terminal cap kept on
      that side.

    Returns
    -------
    keep_ncap_ids, keep_ccap_ids : sets of full-ids lacking a peptide-adjacent
        N-side / C-side neighbor, respectively.
    """
    keep_ncap_ids: Set[Tuple] = set()
    keep_ccap_ids: Set[Tuple] = set()
    neighbors_added = 0
    n_caps_kept = 0
    c_caps_kept = 0

    def nearest_amino(candidates):
        """Return the first residue in *candidates* named in AMINO_ACIDS, else None."""
        return next((r for r in candidates if r.get_resname() in AMINO_ACIDS), None)

    for fid in tuple(backbone_contact_ids):
        if fid in substrate_ids:
            continue  # substrate residues get no added context

        ordered: List[PDB.Residue.Residue] = list(structure[fid[1]][fid[2]].get_residues())
        position = next(
            (i for i, r in enumerate(ordered) if r.get_full_id() == fid),
            None,
        )
        if position is None:
            continue
        current = ordered[position]

        # N side: closest amino acid before the current residue.
        before = nearest_amino(reversed(ordered[:position]))
        if before is None or not are_peptide_adjacent(before, current):
            keep_ncap_ids.add(fid)
            n_caps_kept += 1
        else:
            before_fid = before.get_full_id()
            if before_fid not in selected_ids:
                selected_ids.add(before_fid)
                neighbors_added += 1

        # C side: closest amino acid after the current residue.
        after = nearest_amino(ordered[position + 1:])
        if after is None or not are_peptide_adjacent(current, after):
            keep_ccap_ids.add(fid)
            c_caps_kept += 1
        else:
            after_fid = after.get_full_id()
            if after_fid not in selected_ids:
                selected_ids.add(after_fid)
                neighbors_added += 1

    if neighbors_added or n_caps_kept or c_caps_kept:
        LOGGER.info("[extract] Backbone-contact context (TER-aware): added %d neighbors; kept N-cap on %d, C-cap on %d residues.",
                    neighbors_added, n_caps_kept, c_caps_kept)
    return keep_ncap_ids, keep_ccap_ids
|
|
1063
|
+
|
|
1064
|
+
|
|
1065
|
+
# ---------------------------------------------------------------------
|
|
1066
|
+
# Backbone trimming / skip-map generation
|
|
1067
|
+
# ---------------------------------------------------------------------
|
|
1068
|
+
|
|
1069
|
+
def mark_atoms_to_skip(structure, selected_ids: Set[Tuple], substrate_ids: Set[Tuple],
                       exclude_backbone: bool,
                       keep_ncap_ids: Set[Tuple] | None = None,
                       keep_ccap_ids: Set[Tuple] | None = None) -> Dict[Tuple, Set[str]]:
    """
    Decide which atoms to delete (truncation). Never delete substrate atoms.

    Args
    ----
    structure
        Indexed as ``structure[fid[1]][fid[2]].child_dict[fid[3]]`` to reach a residue
        (i.e. model id, chain id, residue id taken from each full-id).
    selected_ids : set[tuple]
        Full-ids of the residues kept in the output.
    substrate_ids : set[tuple]
        Full-ids that are never given cap/backbone deletions by the segment and
        ``exclude_backbone`` passes below.
    exclude_backbone : bool
        When true, every selected non-substrate, non-water amino acid additionally loses the
        atoms named in ``BACKBONE_ALL`` (PRO/HYP keep N, CA, HA and the N-bound hydrogens).
    keep_ncap_ids, keep_ccap_ids : set[tuple] | None
        Full-ids whose N-terminal / C-terminal cap atoms must NOT be deleted even when the
        residue ends up at a segment boundary. ``None`` is treated as an empty set.

    Returns
    -------
    dict[full-id -> set(atom_names_to_delete)]
        Entries may map to an empty set (created by ``setdefault`` in the PRO passes).
    """
    keep_ncap_ids = keep_ncap_ids or set()
    keep_ccap_ids = keep_ccap_ids or set()

    # start with the original truncation logic (except for substrate residues)
    # Group the candidate residues by (model id, chain id).
    chain_map: Dict[Tuple[str, str], List[Tuple]] = {}
    for fid in selected_ids:
        if fid in substrate_ids:
            continue  # never delete atoms from substrate residues
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        if res.get_resname() in WATER_RES:
            continue
        chain_map.setdefault((fid[1], fid[2]), []).append(fid)

    skip: Dict[Tuple, Set[str]] = {}

    # --- TER-aware segmentation: split by peptide adjacency in file order ---
    for (model, chain), fids in chain_map.items():
        chain_obj = structure[model][chain]
        residues_all: List[PDB.Residue.Residue] = list(chain_obj.get_residues())
        # Position of every residue of this chain in iteration order.
        index_map: Dict[Tuple, int] = {r.get_full_id(): i for i, r in enumerate(residues_all)}

        # sort by file order (ids missing from the chain sort last)
        fids.sort(key=lambda x: index_map.get(x, 10**9))

        # build segments by peptide-bond adjacency
        # A new segment starts whenever are_peptide_adjacent() (defined elsewhere in this
        # file) rejects the pair (previous kept residue, current residue).
        segs: List[List[Tuple]] = []
        cur_seg: List[Tuple] = []
        for k, fid in enumerate(fids):
            if not cur_seg:
                cur_seg = [fid]
                continue
            prev_fid = cur_seg[-1]
            prev_res = chain_obj.child_dict[prev_fid[3]]
            cur_res = chain_obj.child_dict[fid[3]]
            if are_peptide_adjacent(prev_res, cur_res):
                cur_seg.append(fid)
            else:
                segs.append(cur_seg)
                cur_seg = [fid]
        if cur_seg:
            segs.append(cur_seg)

        # apply cap deletions on these TER-aware segments
        for seg in segs:
            # First and last residue of the segment act as its N- and C-terminus.
            n_id, c_id = seg[0], seg[-1]
            single = len(seg) == 1

            def add(fid_local, names):
                # Accumulate atom names to delete for one residue.
                skip.setdefault(fid_local, set()).update(names)

            n_res = chain_obj.child_dict[n_id[3]]
            c_res = chain_obj.child_dict[c_id[3]]

            # N-terminal cap deletion (only for amino acids; skip if PRO/HYP or explicitly kept)
            if (n_res.get_resname() in AMINO_ACIDS) and (n_res.get_resname() not in {"PRO", "HYP"}) and (n_id not in keep_ncap_ids):
                add(n_id, {"N", "H", "H1", "H2", "H3", "HN"})
            # C-terminal cap deletion (only for amino acids; skip if explicitly kept)
            if (c_res.get_resname() in AMINO_ACIDS) and (c_id not in keep_ccap_ids):
                add(c_id, {"C", "O", "OXT"})

            # Isolated stretch – remove CA/HA* (only for amino acids; except PRO/HYP)
            # NOTE: this is not gated by keep_ncap_ids / keep_ccap_ids.
            if single and (n_res.get_resname() in AMINO_ACIDS) and (n_res.get_resname() not in {"PRO", "HYP"}):
                add(n_id, {"CA", "HA", "HA2", "HA3"})

    # ---------------------------------------------------------------------
    # Optional: remove *all* backbone atoms from every non-substrate residue
    # PRO/HYP keep N, CA, and HA* to preserve the ring.
    # ---------------------------------------------------------------------
    if exclude_backbone:
        for fid in selected_ids:
            if fid in substrate_ids:
                continue
            res = structure[fid[1]][fid[2]].child_dict[fid[3]]
            if res.get_resname() in WATER_RES:
                continue
            if res.get_resname() in AMINO_ACIDS:
                if res.get_resname() in {"PRO", "HYP"}:
                    to_remove = BACKBONE_ALL - {"N", "CA", "HA", "H", "H1", "H2", "H3"}
                else:
                    to_remove = BACKBONE_ALL
                skip.setdefault(fid, set()).update(to_remove)

        # Preserve peptide carbonyl on the N-side neighbor of PRO
        # NOTE(review): the flattened source this was reconstructed from does not show
        # whether this loop sits inside `if exclude_backbone:` or at function level; it is
        # placed inside here because the next loop's comment explicitly contrasts itself as
        # "independent of --exclude-backbone". Confirm against the repository file.
        # NOTE(review): only "PRO" is matched here (not "HYP") — confirm that is intended.
        for fid in selected_ids:
            res = structure[fid[1]][fid[2]].child_dict[fid[3]]
            if res.get_resname() != "PRO":
                continue
            chain = structure[fid[1]][fid[2]]
            residues: List[PDB.Residue.Residue] = list(chain.get_residues())
            try:
                idx = next(i for i, r in enumerate(residues) if r.get_full_id() == fid)
            except StopIteration:
                continue
            # Walk backwards to the closest preceding residue whose name is in AMINO_ACIDS.
            prev_res = None
            for j in range(idx - 1, -1, -1):
                rj = residues[j]
                if rj.get_resname() in AMINO_ACIDS:
                    prev_res = rj
                    break
            if prev_res is None:
                continue
            if not are_peptide_adjacent(prev_res, res):
                continue
            prev_fid = prev_res.get_full_id()
            if prev_fid in selected_ids:
                # Un-mark C/O/OXT so the neighbor keeps its carbonyl.
                sk = skip.setdefault(prev_fid, set())
                for nm in ("C", "O", "OXT"):
                    if nm in sk:
                        sk.remove(nm)

    # Always keep CA on the N-side neighbor of PRO (independent of --exclude-backbone)
    for fid in selected_ids:
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        if res.get_resname() != "PRO":
            continue
        chain = structure[fid[1]][fid[2]]
        residues: List[PDB.Residue.Residue] = list(chain.get_residues())
        try:
            idx = next(i for i, r in enumerate(residues) if r.get_full_id() == fid)
        except StopIteration:
            continue
        # Same backward search for the closest preceding amino acid as above.
        prev_res = None
        for j in range(idx - 1, -1, -1):
            rj = residues[j]
            if rj.get_resname() in AMINO_ACIDS:
                prev_res = rj
                break
        if prev_res is None:
            continue
        if not are_peptide_adjacent(prev_res, res):
            continue
        prev_fid = prev_res.get_full_id()
        if prev_fid in selected_ids:
            sk = skip.setdefault(prev_fid, set())
            if "CA" in sk:
                sk.remove("CA")

    return skip
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
def _atom_present_in_output(res: PDB.Residue.Residue, name: str, skip_set: Set[str]) -> bool:
    """
    Tell whether atom `name` is part of `res` and is not listed in `skip_set`
    (i.e. it will still be there after truncation).
    """
    if name not in res:
        return False
    return name not in skip_set
|
|
1225
|
+
|
|
1226
|
+
def _atom_removed_by_truncation(res: PDB.Residue.Residue, name: str, skip_set: Set[str]) -> bool:
    """
    Tell whether atom `name` is part of `res` but is listed in `skip_set`
    (i.e. truncation removes it).
    """
    if name not in res:
        return False
    return name in skip_set
|
|
1231
|
+
|
|
1232
|
+
def compute_linkH_atoms(structure,
                        selected_ids: Set[Tuple],
                        skip_map: Dict[Tuple, Set[str]]) -> List[Tuple[float, float, float]]:
    """
    Identify severed bonds created by truncation and compute link‑H coordinates.

    The geometry is delegated to `_compute_linkH_defs`, which applies the same rules and
    walks residues in file order. Iterating `selected_ids` (a set) directly would make the
    order of the returned coordinates depend on set iteration order.

    Rules
    -----
    * Normal residues: place H along **CB→CA**, **CA→N**, **CA→C** if partner was removed.
    * PRO/HYP: place H along **CA→C** only.
    * Parent atom **must be Carbon**; H is placed along (parent → removed_partner) at **1.09 Å**.

    Args
    ----
    structure
        Structure holding the residues referenced by `selected_ids`.
    selected_ids : set[tuple]
        Full-ids of the residues kept in the output.
    skip_map : dict[full-id -> set(atom names)]
        Atoms marked for deletion per residue (as produced by `mark_atoms_to_skip`).

    Returns
    -------
    list of (x, y, z) coordinates for link‑H atoms, ordered by residue file order and,
    within a residue, by cut type (CB-CA, CA-N, CA-C).
    """
    return [xyz for _cut, xyz in _compute_linkH_defs(structure, selected_ids, skip_map)]
|
|
1285
|
+
|
|
1286
|
+
|
|
1287
|
+
def _max_serial_from_pdb_text(pdb_text: str) -> int:
|
|
1288
|
+
"""
|
|
1289
|
+
Find the maximum atom serial number in PDB text.
|
|
1290
|
+
"""
|
|
1291
|
+
max_serial = 0
|
|
1292
|
+
for line in pdb_text.splitlines():
|
|
1293
|
+
if line.startswith("ATOM") or line.startswith("HETATM"):
|
|
1294
|
+
try:
|
|
1295
|
+
serial = int(line[6:11])
|
|
1296
|
+
if serial > max_serial:
|
|
1297
|
+
max_serial = serial
|
|
1298
|
+
except Exception:
|
|
1299
|
+
continue
|
|
1300
|
+
return max_serial
|
|
1301
|
+
|
|
1302
|
+
|
|
1303
|
+
def _format_linkH_block(link_coords: List[Tuple[float, float, float]],
|
|
1304
|
+
start_serial: int,
|
|
1305
|
+
chain_id: str = "L") -> str:
|
|
1306
|
+
"""
|
|
1307
|
+
Format a contiguous HETATM block for link‑H atoms.
|
|
1308
|
+
|
|
1309
|
+
Conventions
|
|
1310
|
+
-----------
|
|
1311
|
+
* Atom name: HL
|
|
1312
|
+
* Residue name: LKH
|
|
1313
|
+
* Chain: chain_id (default 'L')
|
|
1314
|
+
* Residue numbers: 1..N (one pseudo‑residue per H)
|
|
1315
|
+
"""
|
|
1316
|
+
lines: List[str] = []
|
|
1317
|
+
serial = start_serial
|
|
1318
|
+
resseq = 1
|
|
1319
|
+
for (x, y, z) in link_coords:
|
|
1320
|
+
serial += 1
|
|
1321
|
+
line = (
|
|
1322
|
+
f"HETATM{serial:5d} "
|
|
1323
|
+
f"{'HL':>4s} "
|
|
1324
|
+
f"{'LKH':>3s} "
|
|
1325
|
+
f"{chain_id}"
|
|
1326
|
+
f"{resseq:4d} "
|
|
1327
|
+
f"{x:8.3f}{y:8.3f}{z:8.3f}"
|
|
1328
|
+
f"{1.00:6.2f}{0.00:6.2f}"
|
|
1329
|
+
f" {'H':>2s}"
|
|
1330
|
+
)
|
|
1331
|
+
lines.append(line)
|
|
1332
|
+
resseq += 1
|
|
1333
|
+
return ("\n".join(lines) + ("\n" if lines else ""))
|
|
1334
|
+
|
|
1335
|
+
|
|
1336
|
+
# ---------------------------------------------------------------------
|
|
1337
|
+
# Charge calculation & logging
|
|
1338
|
+
# ---------------------------------------------------------------------
|
|
1339
|
+
|
|
1340
|
+
def _sorted_fids_by_file_order(structure, fids: Iterable[Tuple]) -> List[Tuple]:
|
|
1341
|
+
"""
|
|
1342
|
+
Sort full-ids by file order using a residue index map.
|
|
1343
|
+
"""
|
|
1344
|
+
order: Dict[Tuple, int] = {}
|
|
1345
|
+
idx = 0
|
|
1346
|
+
for model in structure:
|
|
1347
|
+
for chain in model:
|
|
1348
|
+
for res in chain.get_residues():
|
|
1349
|
+
order[res.get_full_id()] = idx
|
|
1350
|
+
idx += 1
|
|
1351
|
+
return sorted(set(fids), key=lambda fid: order.get(fid, 10**12))
|
|
1352
|
+
|
|
1353
|
+
def _residue_key_from_res(res: PDB.Residue.Residue) -> ResidueKey:
    """
    Derive the cross-structure key of a residue:
    (parent chain id, hetero flag, residue number as int, insertion code, residue name).

    A blank insertion code (" ") is stored as the empty string.
    """
    hetflag, resseq, icode = res.id
    return (
        res.get_parent().id,
        hetflag,
        int(resseq),
        "" if icode == " " else icode,
        res.get_resname(),
    )
|
|
1361
|
+
|
|
1362
|
+
def _residue_key_from_fid(structure, fid: Tuple) -> ResidueKey:
    """
    Look up the residue addressed by full-id `fid` in `structure` and return its
    cross-structure key (see `_residue_key_from_res`).
    """
    model_id, chain_id, res_id = fid[1], fid[2], fid[3]
    residue = structure[model_id][chain_id].child_dict[res_id]
    return _residue_key_from_res(residue)
|
|
1368
|
+
|
|
1369
|
+
# ---- helper for parsing --ligand-charge (number or 'RES:Q' mapping) ----
|
|
1370
|
+
def _parse_ligand_charge_option(ligand_charge: float | str | Dict[str, float] | None
|
|
1371
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
1372
|
+
"""
|
|
1373
|
+
Returns
|
|
1374
|
+
-------
|
|
1375
|
+
(total_charge, mapping)
|
|
1376
|
+
total_charge : float | None
|
|
1377
|
+
mapping : dict[RESNAME -> float] | None
|
|
1378
|
+
"""
|
|
1379
|
+
if ligand_charge is None:
|
|
1380
|
+
return None, None
|
|
1381
|
+
if isinstance(ligand_charge, (int, float)):
|
|
1382
|
+
return float(ligand_charge), None
|
|
1383
|
+
if isinstance(ligand_charge, dict):
|
|
1384
|
+
mapping = {str(k).upper(): float(v) for k, v in ligand_charge.items()}
|
|
1385
|
+
return None, mapping
|
|
1386
|
+
if isinstance(ligand_charge, str):
|
|
1387
|
+
s = ligand_charge.strip()
|
|
1388
|
+
if not s:
|
|
1389
|
+
return None, None
|
|
1390
|
+
# try numeric
|
|
1391
|
+
try:
|
|
1392
|
+
return float(s), None
|
|
1393
|
+
except ValueError:
|
|
1394
|
+
pass
|
|
1395
|
+
# mapping: tokens "RES:Q"
|
|
1396
|
+
tokens = [t for t in re.split(r"[,\s]+", s) if t]
|
|
1397
|
+
mapping: Dict[str, float] = {}
|
|
1398
|
+
for tok in tokens:
|
|
1399
|
+
if ":" not in tok:
|
|
1400
|
+
raise ValueError(f"Invalid --ligand-charge token '{tok}'. Use 'RES:Q' (e.g., GPP:-3) or a number (e.g., -3).")
|
|
1401
|
+
res, qtxt = tok.split(":", 1)
|
|
1402
|
+
resname = res.strip().upper()
|
|
1403
|
+
if not resname:
|
|
1404
|
+
raise ValueError(f"Invalid --ligand-charge token '{tok}': empty residue name.")
|
|
1405
|
+
try:
|
|
1406
|
+
qval = float(qtxt.strip())
|
|
1407
|
+
except ValueError:
|
|
1408
|
+
raise ValueError(f"Invalid --ligand-charge token '{tok}': '{qtxt}' is not a number.")
|
|
1409
|
+
mapping[resname] = qval
|
|
1410
|
+
if not mapping:
|
|
1411
|
+
raise ValueError("Empty --ligand-charge mapping.")
|
|
1412
|
+
return None, mapping
|
|
1413
|
+
raise TypeError(f"Unsupported type for ligand_charge: {type(ligand_charge)!r}")
|
|
1414
|
+
|
|
1415
|
+
def compute_charge_summary(structure,
                           selected_ids: Set[Tuple],
                           substrate_ids: Set[Tuple],
                           ligand_charge: float | str | Dict[str, float] | None = None) -> Dict[str, Any]:
    """
    Compute pocket charge summary.

    Residues are classified by upper-cased residue name: names in WATER_RES count as 0,
    names in AMINO_ACIDS / ION take the charge stored there, and everything else is an
    "unknown" residue that starts at 0 and may receive a charge from `ligand_charge`.

    Args
    ----
    structure : Bio.PDB.Structure.Structure
        The (first) structure to evaluate.
    selected_ids : set[tuple]
        Residues included in the pocket.
    substrate_ids : set[tuple]
        Residues designated as substrate.
    ligand_charge : float | str | dict[str,float] | None
        - float: total charge to assign across **unknown residues** (preferring unknown substrate).
                 The total is split evenly over the chosen residues.
        - str  : numeric string (total) or mapping like "GPP:-3,MMT:-1" (per‑resname).
        - dict : mapping {RESNAME: charge}. In mapping mode, other unknown residues remain 0.

        A specification that cannot be applied (a non-zero total with no unknown residue,
        or mapping names that match no unknown residue) is reported with a warning
        instead of being dropped silently; the returned numbers are not affected.

    Returns
    -------
    dict with keys:
      - total_charge            : float
      - protein_charge          : float
      - ligand_total_charge     : float
      - ion_total_charge        : float
      - ion_charges             : list[(str tag, float)]
      - unknown_residue_charges : dict[str -> float]  # for concise per‑resname log;
                                                      # last residue wins per resname
    """
    per_map: Dict[ResidueKey, float] = {}
    aa_charge = 0.0
    total = 0.0

    fids_in_order = _sorted_fids_by_file_order(structure, selected_ids)

    # First pass: dictionary/ion/water charges; collect unknowns and ions
    unknown_fids: List[Tuple] = []
    unknown_substrate_fids: List[Tuple] = []
    ion_entries: List[Tuple[str, float]] = []

    for fid in fids_in_order:
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        rn = res.get_resname().upper()
        key = _residue_key_from_res(res)
        if rn in WATER_RES:
            q = 0.0
        elif rn in AMINO_ACIDS:
            q = float(AMINO_ACIDS[rn])
            aa_charge += q
        elif rn in ION:
            q = float(ION[rn])
            ion_entries.append((_fmt_fid(structure, fid), q))
        else:
            q = 0.0
            unknown_fids.append(fid)
            if fid in substrate_ids:
                unknown_substrate_fids.append(fid)
        per_map[key] = q
        total += q

    # Apply --ligand-charge if provided
    total_spec, mapping_spec = _parse_ligand_charge_option(ligand_charge)

    if total_spec is not None:
        # Distribute total across unknown substrate if present, else across all unknowns
        targets = unknown_substrate_fids if unknown_substrate_fids else unknown_fids
        if targets:
            per_res_val = float(total_spec) / float(len(targets))
            for fid in targets:
                key = _residue_key_from_fid(structure, fid)
                per_map[key] = per_res_val
            # recompute totals
            total = sum(per_map.values())
            aa_charge = sum(q for k, q in per_map.items() if k[4] in AMINO_ACIDS)
        elif total_spec != 0.0:
            # Nothing can carry the requested charge; make that visible to the user.
            LOGGER.warning("[extract] --ligand-charge %g ignored: the selection contains no residue "
                           "of unknown type to assign it to.", total_spec)
    elif mapping_spec is not None:
        # Per‑resname mapping. Unspecified unknown residues remain 0.
        matched: Set[str] = set()
        for fid in unknown_fids:
            res = structure[fid[1]][fid[2]].child_dict[fid[3]]
            rn = res.get_resname().upper()
            if rn in mapping_spec:
                key = _residue_key_from_fid(structure, fid)
                per_map[key] = float(mapping_spec[rn])
                matched.add(rn)
        # recompute totals
        total = sum(per_map.values())
        aa_charge = sum(q for k, q in per_map.items() if k[4] in AMINO_ACIDS)
        unused = sorted(set(mapping_spec) - matched)
        if unused:
            # Typically a typo, or a name that is already a known amino acid/ion/water.
            LOGGER.warning("[extract] --ligand-charge entries not applied (no matching residue of "
                           "unknown type in the selection): %s", ", ".join(unused))

    # Net ligand and ion charges
    unknown_keys = {_residue_key_from_fid(structure, fid) for fid in unknown_fids}
    ligand_total = sum(per_map[k] for k in unknown_keys)
    ion_total = sum(q for _, q in ion_entries)

    # Build per‑resname mapping for unknown residues (after applying any overrides)
    unknown_residue_charges: Dict[str, float] = {}
    for fid in unknown_fids:
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        rn = res.get_resname().upper()
        key = _residue_key_from_fid(structure, fid)
        unknown_residue_charges[rn] = float(per_map[key])

    return {
        "total_charge": float(total),
        "protein_charge": float(aa_charge),
        "ligand_total_charge": float(ligand_total),
        "ion_total_charge": float(ion_total),
        "ion_charges": [(tag, float(q)) for tag, q in ion_entries],
        "unknown_residue_charges": unknown_residue_charges,
    }
|
|
1523
|
+
|
|
1524
|
+
def log_charge_summary(prefix: str,
                       summary: Dict[str, Any]):
    """
    Write a charge summary (as returned by `compute_charge_summary`) to LOGGER at INFO level.

    `total_charge` and `protein_charge` are required keys; the ligand, ion and
    per-resname entries fall back to neutral/empty defaults when absent.
    Every line is prefixed with `prefix`, except the per-ion detail lines.
    """
    pocket_total = summary["total_charge"]
    protein_net = summary["protein_charge"]
    ligand_net = summary.get("ligand_total_charge", 0.0)
    ions: List[Tuple[str, float]] = summary.get("ion_charges", [])
    ions_net = summary.get("ion_total_charge", sum(q for _, q in ions))
    per_resname: Dict[str, float] = summary.get("unknown_residue_charges", {}) or {}

    if not per_resname:
        LOGGER.info("%s Per-resname ligand charges: (none)", prefix)
    else:
        parts = [f"{res}: {q:g}" for res, q in sorted(per_resname.items())]
        LOGGER.info("%s Per-resname ligand charges: %s", prefix, ", ".join(parts))

    LOGGER.info("%s Net protein charge: %+g", prefix, protein_net)
    LOGGER.info("%s Net ligand charge: %+g", prefix, ligand_net)

    if not ions:
        LOGGER.info("%s Ion charges: (none)", prefix)
    else:
        LOGGER.info("%s Ion charges (each):", prefix)
        for tag, q in ions:
            LOGGER.info(" %s -> %+g", tag, q)
        LOGGER.info("%s Net ion charge: %+g", prefix, ions_net)

    LOGGER.info("%s Total pocket charge: %+g", prefix, pocket_total)
|
|
1552
|
+
|
|
1553
|
+
|
|
1554
|
+
# =========================== Cross-structure helpers ===========================
|
|
1555
|
+
# Multi-model driver utilities
|
|
1556
|
+
# ==============================================================================
|
|
1557
|
+
|
|
1558
|
+
def _build_key_maps(structure) -> Tuple[Dict[ResidueKey, Tuple], Dict[Tuple, ResidueKey]]:
    """
    Walk every residue of `structure` (models → chains → residues) and return two
    lookup tables: ResidueKey → full-id and full-id → ResidueKey.

    If several residues share a ResidueKey, the one visited last wins in the first table.
    """
    key2fid: Dict[ResidueKey, Tuple] = {}
    fid2key: Dict[Tuple, ResidueKey] = {}
    every_residue = (
        res
        for model in structure
        for chain in model
        for res in chain.get_residues()
    )
    for res in every_residue:
        res_key = _residue_key_from_res(res)
        full_id = res.get_full_id()
        fid2key[full_id] = res_key
        key2fid[res_key] = full_id
    return key2fid, fid2key
|
|
1572
|
+
|
|
1573
|
+
def _keys_to_fids(structure, keys: Iterable[ResidueKey]) -> Set[Tuple]:
    """
    Map each ResidueKey in `keys` to the full-id it has in `structure`.

    Raises
    ------
    ValueError
        If any key has no counterpart in `structure`; the message lists at most the
        first five missing keys.
    """
    key2fid = _build_key_maps(structure)[0]
    fids: Set[Tuple] = set()
    missing: List[ResidueKey] = []
    for key in keys:
        match = key2fid.get(key)
        if match is not None:
            fids.add(match)
        else:
            missing.append(key)
    if not missing:
        return fids
    suffix = " ..." if len(missing) > 5 else ""
    raise ValueError(f"Some residues not found in structure: {missing[:5]}{suffix}")
|
|
1589
|
+
|
|
1590
|
+
def _fids_to_keys(structure, fids: Iterable[Tuple]) -> Set[ResidueKey]:
    """
    Convert full-ids of `structure` into the corresponding set of ResidueKeys.
    """
    keys: Set[ResidueKey] = set()
    for fid in fids:
        keys.add(_residue_key_from_fid(structure, fid))
    return keys
|
|
1595
|
+
|
|
1596
|
+
def _substrate_residues_for_structs(structs: List[PDB.Structure.Structure],
                                    center_spec: str) -> List[List[PDB.Residue.Residue]]:
    """
    Resolve the substrate residues of every structure in `structs`.

    Behavior
    --------
    * `center_spec` names an existing path: the substrate is resolved on the first
      structure only (`resolve_substrate_residues`); an ID list ("CHAIN:NUM[ICODE]" or
      "NUM[ICODE]" when the chain id is blank) is derived from that match and looked up
      in every structure, the first included.
    * Otherwise `center_spec` is first tried as an ID list and applied to all structures.
    * If that raises ValueError it is treated as a residue‑name list and applied to all
      structures.

    Returns one list of residues per input structure, in input order.
    """
    if not os.path.exists(center_spec):
        # Distinguish ID-spec vs resname list by attempting to parse as IDs first.
        # NOTE(review): a ValueError raised by the ID lookup itself (not only by the
        # parser) also lands in the resname fallback — confirm this is intended.
        try:
            _parse_res_tokens(center_spec)
            return [find_substrate_by_idspec(st, center_spec) for st in structs]
        except ValueError:
            return [find_substrate_by_resname(st, center_spec) for st in structs]

    first_match = resolve_substrate_residues(structs[0], center_spec)
    tokens = []
    for res in first_match:
        chain = res.get_parent().id
        _, num, icode = res.id
        icode_txt = "" if icode == " " else icode
        # Blank chain ids are omitted from the token.
        chain_part = f"{chain}:" if (chain or "").strip() else ""
        tokens.append(f"{chain_part}{num}{icode_txt}")
    idspec = ",".join(tokens)
    return [find_substrate_by_idspec(st, idspec) for st in structs]
|
|
1633
|
+
|
|
1634
|
+
def _disulfide_partner_keys(structure, candidate_keys: Set[ResidueKey],
                            cutoff: float = DISULFIDE_CUTOFF) -> Set[ResidueKey]:
    """
    Return ResidueKeys of disulfide-bonded CYS/CYX pairs that touch the selection.

    Every CYS/CYX residue that has an "SG" atom is considered. Two such residues form a
    pair when their SG atoms lie within `cutoff` of each other (as reported by
    NeighborSearch). For each pair in which at least one member is in `candidate_keys`,
    the keys of BOTH members are returned, so the result can contain keys that are
    already selected.

    Args
    ----
    structure
        Structure providing `get_residues()`.
    candidate_keys : set[ResidueKey]
        Keys of the residues already selected.
    cutoff : float
        Maximum SG–SG distance, in the coordinate units of the structure.

    Returns
    -------
    set[ResidueKey]
        Empty when the structure has no CYS/CYX SG atoms.
    """
    sg_atoms: List[PDB.Atom.Atom] = []
    res_of_atom: Dict[PDB.Atom.Atom, ResidueKey] = {}
    for res in structure.get_residues():
        if res.get_resname() in {"CYS", "CYX"} and "SG" in res:
            at = res["SG"]
            sg_atoms.append(at)
            res_of_atom[at] = _residue_key_from_res(res)

    add: Set[ResidueKey] = set()
    if not sg_atoms:
        return add

    ns = NeighborSearch(sg_atoms)
    for at in sg_atoms:
        k1 = res_of_atom[at]
        for other in ns.search(at.get_coord(), cutoff):
            if other is at:
                continue  # the query atom always finds itself
            k2 = res_of_atom[other]
            if (k1 in candidate_keys) or (k2 in candidate_keys):
                add.update((k1, k2))
    return add
|
|
1660
|
+
|
|
1661
|
+
def _assert_atom_ordering_identical(structs: List[PDB.Structure.Structure]):
    """
    Light consistency check of inputs #2..#N against input #1.

    - The number of atoms must be identical.
    - The first and last (up to) ten atom labels are compared; an error is raised only if
      one of these windows differs. Differences confined to the middle of the atom list
      are therefore not detected.

    Raises
    ------
    ValueError
        On an atom-count mismatch or a detected ordering mismatch.
    """
    def signature(st: PDB.Structure.Structure) -> List[str]:
        # One "chain|hetflag|resseq+icode|resname|atomname" label per atom, in file order.
        labels: List[str] = []
        for model in st:
            for chain in model:
                for res in chain.get_residues():
                    het, resseq, icode = res.id
                    icode_txt = "" if icode == " " else icode
                    prefix = f"{chain.id}|{het}|{resseq}{icode_txt}|{res.get_resname()}"
                    labels.extend(f"{prefix}|{atom.get_name()}" for atom in res)
        return labels

    reference = signature(structs[0])
    n_ref = len(reference)
    # Head and tail windows used for the spot check.
    windows = ((0, min(10, n_ref)), (max(0, n_ref - 10), n_ref))

    for number, other in enumerate(structs[1:], start=2):
        current = signature(other)
        if len(current) != n_ref:
            raise ValueError(f"[multi] Atom count mismatch between input #1 and input #{number}: {n_ref} vs {len(current)}")
        spot_mismatch = any(reference[a:b] != current[a:b] for a, b in windows)
        if spot_mismatch and reference != current:
            raise ValueError(f"[multi] Atom order mismatch between input #1 and input #{number}.")
|
|
1693
|
+
|
|
1694
|
+
|
|
1695
|
+
def _strip_trailing_END(text: str) -> str:
|
|
1696
|
+
"""
|
|
1697
|
+
Remove trailing 'END' lines and ensure a final newline.
|
|
1698
|
+
"""
|
|
1699
|
+
lines = [ln for ln in text.splitlines() if ln.strip() != "END"]
|
|
1700
|
+
out = "\n".join(lines)
|
|
1701
|
+
if not out.endswith("\n"):
|
|
1702
|
+
out += "\n"
|
|
1703
|
+
return out
|
|
1704
|
+
|
|
1705
|
+
|
|
1706
|
+
def _compute_linkH_defs(structure,
                        selected_ids: Set[Tuple],
                        skip_map: Dict[Tuple, Set[str]]) -> List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]]:
    """
    Deterministic list of link‑H definitions and coordinates.

    For each selected non-water residue, a link hydrogen is generated for every candidate
    bond whose parent atom stays in the output while its partner is removed by truncation.
    The parent must be a carbon; the hydrogen is placed 1.09 from the parent along the
    parent → partner direction. Degenerate bonds (non-finite or near-zero length) are skipped.

    Returns
    -------
    list of ((ResidueKey, cut_type), (x, y, z)), where cut_type ∈ {"CB-CA","CA-N","CA-C"}.
    Ordering is by residue file order, then by cut_type in the sequence above.
    PRO/HYP residues only ever yield "CA-C".
    """
    # (parent atom, removed partner, cut label), in output order
    ring_cuts = (("CA", "C", "CA-C"),)
    regular_cuts = (("CB", "CA", "CB-CA"), ("CA", "N", "CA-N"), ("CA", "C", "CA-C"))
    bond_length = 1.09

    defs: List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]] = []
    for fid in _sorted_fids_by_file_order(structure, selected_ids):
        res: PDB.Residue.Residue = structure[fid[1]][fid[2]].child_dict[fid[3]]
        resname = res.get_resname()
        if resname in WATER_RES:
            continue
        skip_set = skip_map.get(fid, set())
        key = _residue_key_from_res(res)
        cuts = ring_cuts if resname in {"PRO", "HYP"} else regular_cuts

        for parent_name, partner_name, cut_type in cuts:
            if not _atom_present_in_output(res, parent_name, skip_set):
                continue
            if not _atom_removed_by_truncation(res, partner_name, skip_set):
                continue
            parent = res[parent_name]
            partner = res[partner_name]
            # Fall back to the first letter of the atom name when no element is recorded.
            element = (parent.element or parent.get_name()[0]).upper()
            if element != "C":
                continue
            origin = np.array(parent.get_coord(), dtype=float)
            direction = np.array(partner.get_coord(), dtype=float) - origin
            length = np.linalg.norm(direction)
            if not np.isfinite(length) or length < 1e-6:
                continue
            direction /= length
            h = origin + direction * bond_length
            defs.append(((key, cut_type), (float(h[0]), float(h[1]), float(h[2]))))
    return defs
|
|
1751
|
+
|
|
1752
|
+
|
|
1753
|
+
def extract_multi(args: argparse.Namespace, api=False) -> Optional[Dict[str, Any]]:
    """
    Multi-structure driver.

    Loads every path in ``args.complex_pdb``, builds one residue selection as the
    union (OR) of the per-structure selections, applies that unified selection to
    each structure, and writes either one PDB per input or a single multi-MODEL
    PDB.

    Args
    ----
    args : argparse.Namespace
        Parsed CLI arguments (or equivalent). Attributes read here:
        ``complex_pdb``, ``substrate_pdb``, ``output_pdb``, ``radius``,
        ``radius_het2het``, ``include_H2O``, ``exclude_backbone``, ``add_linkH``
        and, optionally, ``selected_resn`` and ``ligand_charge``.
    api : bool
        If truthy, return the result dictionary; otherwise return None.

    Returns
    -------
    dict | None
        When ``api`` is truthy::

            {
                'outputs': List[str],
                'counts': List[{'raw_atoms': int, 'kept_atoms': int}],  # per model
                'charge_summary': {...},                                # computed on model #1
            }

        Otherwise None.

    Raises
    ------
    ValueError
        If ``args.output_pdb`` holds neither exactly one path nor exactly one
        path per input structure.
    RuntimeError
        If the link-H targets (or their order) differ between model #1 and any
        other model.
    """
    paths: List[str] = args.complex_pdb

    # Validate the output layout up front so that a bad argument combination
    # fails before any structure is loaded or processed.
    per_file_outputs = (len(args.output_pdb) == len(paths))
    if not per_file_outputs and len(args.output_pdb) != 1:
        raise ValueError("[extract:multi] Provide either a single output path for a multi‑MODEL PDB "
                         "or exactly N output paths where N == number of inputs for per‑structure outputs.")

    structs: List[PDB.Structure.Structure] = [
        load_structure(path, f"complex{idx}") for idx, path in enumerate(paths, start=1)
    ]

    LOGGER.info("[extract:multi] Loaded %d structures.", len(structs))
    _assert_atom_ordering_identical(structs)

    # Substrates per structure (PDB-path -> first only, then propagate by IDs)
    subs_per_struct: List[List[PDB.Residue.Residue]] = _substrate_residues_for_structs(structs, args.substrate_pdb)

    # 1) Per-structure selection and backbone-contact -> OR unify as keys
    union_sel_keys: Set[ResidueKey] = set()
    union_bb_contact_keys: Set[ResidueKey] = set()

    for st, subs in zip(structs, subs_per_struct):
        selected_ids, bb_contact_ids = select_residues(
            st, subs, args.radius, args.radius_het2het, args.include_H2O, args.exclude_backbone
        )
        union_sel_keys |= _fids_to_keys(st, selected_ids)
        union_bb_contact_keys |= _fids_to_keys(st, bb_contact_ids)

    LOGGER.info("[extract:multi] Initial union selection: %d residues; backbone-contact: %d residues.",
                len(union_sel_keys), len(union_bb_contact_keys))

    # 1a) Force-include residues via --selected-resn (OR across structures)
    if getattr(args, "selected_resn", ""):
        forced_union: Set[ResidueKey] = set()
        for st in structs:
            forced_res = find_substrate_by_idspec(st, args.selected_resn)
            forced_union |= {_residue_key_from_res(r) for r in forced_res}
        if forced_union:
            LOGGER.info("[extract:multi] Force-include (--selected-resn): +%d residues.", len(forced_union))
            union_sel_keys |= forced_union

    # 2) Disulfide partners (OR across structures)
    dis_keys_union: Set[ResidueKey] = set()
    for st in structs:
        dis_keys_union |= _disulfide_partner_keys(st, union_sel_keys, DISULFIDE_CUTOFF)
    if dis_keys_union:
        LOGGER.info("[extract:multi] Disulfide partner addition (union): +%d residues.", len(dis_keys_union))
        union_sel_keys |= dis_keys_union

    # 3) Backbone-contact neighbor augmentation (if exclude_backbone == False)
    keep_ncap_union: Set[ResidueKey] = set()
    keep_ccap_union: Set[ResidueKey] = set()
    if not args.exclude_backbone and union_bb_contact_keys:
        added_neighbor_union: Set[ResidueKey] = set()
        for st, subs in zip(structs, subs_per_struct):
            sel_ids = _keys_to_fids(st, union_sel_keys)
            bb_ids = _keys_to_fids(st, union_bb_contact_keys & _fids_to_keys(st, sel_ids))
            sub_ids = {r.get_full_id() for r in subs}
            # single call performs neighbor augmentation and returns cap-preservation flags
            # NOTE(review): sel_ids is re-read right after the call, so the helper is
            # presumably expected to add neighbors to sel_ids in place — confirm.
            kn_fids, kc_fids = augment_backbone_contact_neighbors(st, sel_ids, bb_ids, sub_ids)
            after_keys = _fids_to_keys(st, sel_ids)
            added_neighbor_union |= (after_keys - union_sel_keys)
            keep_ncap_union |= _fids_to_keys(st, kn_fids)
            keep_ccap_union |= _fids_to_keys(st, kc_fids)
        if added_neighbor_union:
            LOGGER.info("[extract:multi] Backbone-contact neighbor addition (union): +%d residues.",
                        len(added_neighbor_union))
            union_sel_keys |= added_neighbor_union

    # 4) PRO N-side neighbor augmentation (OR across structures)
    pro_prev_add_union: Set[ResidueKey] = set()
    for st in structs:
        sel_ids = _keys_to_fids(st, union_sel_keys)
        # NOTE(review): the return value is ignored and sel_ids is diffed against the
        # union afterwards, so this presumably mutates sel_ids in place — confirm.
        augment_proline_prev_neighbor(st, sel_ids)
        pro_prev_add_union |= _fids_to_keys(st, sel_ids) - union_sel_keys
    if pro_prev_add_union:
        LOGGER.info("[extract:multi] PRO N-side neighbor addition (union): +%d residues.",
                    len(pro_prev_add_union))
        union_sel_keys |= pro_prev_add_union

    # ==== Build skip maps per structure (using unified selection and cap-keep flags) ====
    selected_ids_per_struct: List[Set[Tuple]] = []
    skip_maps_per_struct: List[Dict[Tuple, Set[str]]] = []
    substrate_idsets_per_struct: List[Set[Tuple]] = []

    for st, subs in zip(structs, subs_per_struct):
        sel_fids = _keys_to_fids(st, union_sel_keys)
        selected_ids_per_struct.append(sel_fids)
        sub_ids = {r.get_full_id() for r in subs}
        substrate_idsets_per_struct.append(sub_ids)
        # Cap-keep sets are only passed when backbone atoms are retained.
        kn_fids = _keys_to_fids(st, keep_ncap_union) if (not args.exclude_backbone) else None
        kc_fids = _keys_to_fids(st, keep_ccap_union) if (not args.exclude_backbone) else None
        skip_map = mark_atoms_to_skip(st, sel_fids, sub_ids, args.exclude_backbone, kn_fids, kc_fids)
        skip_maps_per_struct.append(skip_map)

    # ==== Compute link-H definitions for each model and ensure identical targets/order ====
    linkdefs_per_struct: List[List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]]] = [
        _compute_linkH_defs(st, sel_fids, skip_map)
        for st, sel_fids, skip_map in zip(structs, selected_ids_per_struct, skip_maps_per_struct)
    ]
    ref_targets = [ld[0] for ld in linkdefs_per_struct[0]]
    for model_no, linkdefs in enumerate(linkdefs_per_struct[1:], start=2):
        targets_i = [ld[0] for ld in linkdefs]
        if targets_i != ref_targets:
            raise RuntimeError(
                f"[multi] link-H targets/order differ between model #1 and model #{model_no}. "
                "Ensure inputs and options produce identical truncation across models."
            )
    LOGGER.info("[extract:multi] link-H targets common across models: %d.", len(ref_targets))

    # ==== Render each model to text ====
    io = PDB.PDBIO()
    model_texts: List[str] = []
    model_counts: List[Dict[str, int]] = []

    per_model = zip(structs, selected_ids_per_struct, skip_maps_per_struct, linkdefs_per_struct)
    for m, (st, sel_fids, skip_map, linkdefs) in enumerate(per_model, start=1):
        io.set_structure(st)
        buf = _io.StringIO()
        io.save(buf, AS_Select(sel_fids, skip_map))
        main_text = _strip_trailing_END(buf.getvalue())

        # Atom-count diagnostics
        raw_atoms = sum(len(st[f[1]][f[2]].child_dict[f[3]]) for f in sel_fids)
        kept_atoms = sum(
            1 for fid in sel_fids
            for a in st[fid[1]][fid[2]].child_dict[fid[3]]
            if a.get_name() not in skip_map.get(fid, set())
        )
        LOGGER.info("[extract:multi] Raw atoms (model %d): %d", m, raw_atoms)
        LOGGER.info("[extract:multi] Atoms after truncation (model %d): %d", m, kept_atoms)
        model_counts.append({"raw_atoms": raw_atoms, "kept_atoms": kept_atoms})

        # Append TER + link-H block (honor --add-linkH)
        link_coords = [coord for (_, coord) in linkdefs]
        if args.add_linkH and link_coords:
            if not main_text.endswith("\n"):
                main_text += "\n"
            parts = [main_text]
            last_line = main_text.splitlines()[-1].strip() if main_text.strip() else ""
            # Avoid emitting a second bare TER when the text already ends with one.
            if last_line != "TER":
                parts.append("TER\n")
            start_serial = _max_serial_from_pdb_text(main_text)
            parts.append(_format_linkH_block(link_coords, start_serial))
            main_text = "".join(parts)

        model_texts.append(main_text)

    # ==== Write outputs ====
    outputs: List[str] = []
    if per_file_outputs:
        # One file per input structure, each terminated by END.
        for out_path, text in zip(args.output_pdb, model_texts):
            content = text
            if not content.endswith("\n"):
                content += "\n"
            content += "END\n"
            with open(out_path, "w") as fh:
                fh.write(content)
            outputs.append(out_path)
            LOGGER.info("[extract:multi] Single‑model pocket saved to %s", out_path)
    else:
        # Single file: every model wrapped in MODEL/ENDMDL, one END at the very end.
        buf_models: List[str] = [
            "".join([f"MODEL {m}\n", text, "ENDMDL\n"])
            for m, text in enumerate(model_texts, start=1)
        ]
        out_path = args.output_pdb[0]
        with open(out_path, "w") as fh:
            for blk in buf_models:
                fh.write(blk)
            fh.write("END\n")
        outputs.append(out_path)
        LOGGER.info("[extract:multi] Multi‑MODEL pocket saved to %s", out_path)

    # ==== Charge summary (first model only) ====
    charge_summary = compute_charge_summary(
        structs[0],
        selected_ids_per_struct[0],
        substrate_idsets_per_struct[0],
        getattr(args, "ligand_charge", None)
    )
    log_charge_summary("[extract:multi]", charge_summary)

    if api:
        return {
            "outputs": outputs,
            "counts": model_counts,
            "charge_summary": charge_summary,
        }
    return None
|
|
1960
|
+
|
|
1961
|
+
|
|
1962
|
+
# ---------------------------------------------------------------------
|
|
1963
|
+
# PDB writer helper
|
|
1964
|
+
# ---------------------------------------------------------------------
|
|
1965
|
+
class AS_Select(PDB.Select):
    """
    Biopython ``Select`` filter driven by a residue whitelist and a per-residue
    atom skip map.

    A residue is written only if its full id is in ``selected_ids``; within a
    residue, an atom is written unless its name is listed in
    ``skip_map[residue_full_id]``.
    """

    def __init__(self, selected_ids: Set[Tuple], skip_map: Dict[Tuple, Set[str]]):
        # Full ids of the residues to keep.
        self.ids = selected_ids
        # Residue full id -> names of atoms to drop from that residue.
        self.skip = skip_map

    def accept_residue(self, residue):
        """Return True when the residue's full id is among the selected ids."""
        residue_fid = residue.get_full_id()
        return residue_fid in self.ids

    def accept_atom(self, atom):
        """Return True unless the atom's name is in its parent residue's skip set."""
        parent_fid = atom.get_parent().get_full_id()
        skipped_names = self.skip.get(parent_fid, set())
        return atom.get_name() not in skipped_names
|
|
1979
|
+
|
|
1980
|
+
|
|
1981
|
+
# ---------------------------------------------------------------------
|
|
1982
|
+
# Main driver (single or multi) — CLI or API
|
|
1983
|
+
# ---------------------------------------------------------------------
|
|
1984
|
+
|
|
1985
|
+
def extract(args: argparse.Namespace | None = None, api=False) -> Optional[Dict[str, Any]]:
    """
    Run from CLI (args=None -> parse_args()) or as an API with a pre-built Namespace.

    With exactly one input structure the pocket is extracted here; with any
    other number of inputs the work is delegated to ``extract_multi``.

    Args
    ----
    args : argparse.Namespace | None
        If None, parse CLI args. Otherwise, use the provided Namespace.
        ``args.radius``, ``args.radius_het2het`` and ``args.output_pdb`` may be
        modified in place (see below).
    api : bool
        If True, return a structured result dictionary; if False (CLI), return None.

    Returns
    -------
    dict | None
        When api=True, returns { 'outputs', 'counts', 'charge_summary' }. Otherwise, None.
    """
    if args is None:
        args = parse_args()

    _configure_extract_logger(bool(args.verbose))

    # An exact 0.0 cutoff is replaced by a tiny positive value.
    # NOTE(review): presumably the downstream neighbor search needs a strictly
    # positive radius — confirm against select_residues.
    if args.radius == 0.0:
        args.radius = 0.001
    if args.radius_het2het == 0.0:
        args.radius_het2het = 0.001

    # Log extract options
    LOGGER.info("[extract] Options: radius=%.2f, radius_het2het=%.2f, "
                "include_H2O=%s, exclude_backbone=%s, add_linkH=%s, "
                "selected_resn='%s'",
                args.radius, args.radius_het2het,
                args.include_H2O, args.exclude_backbone,
                # Attribute is spelled ``add_linkH`` everywhere else; the previous
                # lower-case lookup always fell back to False in the log.
                getattr(args, 'add_linkH', False),
                getattr(args, 'selected_resn', ''))

    # default output names
    if args.output_pdb is None:
        if len(args.complex_pdb) > 1:
            # multiple inputs -> per-file outputs: pocket_{original_filename}.pdb
            args.output_pdb = [
                f"pocket_{os.path.splitext(os.path.basename(p))[0]}.pdb"
                for p in args.complex_pdb
            ]
        else:
            args.output_pdb = ['pocket.pdb']

    # Multi-structure path
    if len(args.complex_pdb) != 1:
        return extract_multi(args, api=api)

    # Single-structure path
    complex_struct = load_structure(args.complex_pdb[0], "complex")

    # Resolve substrate residues from PDB path or residue-ID/name list
    substrate_residues = resolve_substrate_residues(complex_struct, args.substrate_pdb)
    substrate_ids = {r.get_full_id() for r in substrate_residues}
    LOGGER.info("[extract] Substrate residues matched: resseq %s",
                [r.id[1] for r in substrate_residues])

    selected_ids, backbone_contact_ids = select_residues(
        complex_struct, substrate_residues,
        args.radius, args.radius_het2het,
        args.include_H2O,
        args.exclude_backbone
    )

    # Force-include residues via --selected-resn
    if getattr(args, "selected_resn", ""):
        forced_res = find_substrate_by_idspec(complex_struct, args.selected_resn)
        add_n = 0
        for r in forced_res:
            fid = r.get_full_id()
            if fid not in selected_ids:
                selected_ids.add(fid)
                add_n += 1
        if add_n:
            LOGGER.info("[extract] Force-include (--selected-resn): +%d residues.", add_n)

    # NOTE(review): return value is discarded, so this presumably extends
    # selected_ids in place — confirm.
    augment_disulfides(complex_struct, selected_ids)

    # Backbone-contact context (if enabled)
    keep_ncap_ids: Set[Tuple] = set()
    keep_ccap_ids: Set[Tuple] = set()
    if not args.exclude_backbone and backbone_contact_ids:
        kn, kc = augment_backbone_contact_neighbors(
            complex_struct, selected_ids, backbone_contact_ids, substrate_ids
        )
        keep_ncap_ids.update(kn)
        keep_ccap_ids.update(kc)

    # Ensure PRO's N-side neighbor is included (TER-aware)
    augment_proline_prev_neighbor(complex_struct, selected_ids)

    # Atom counts
    raw = sum(len(complex_struct[f[1]][f[2]].child_dict[f[3]]) for f in selected_ids)
    LOGGER.info("[extract] Raw atoms: %d", raw)

    skip_map = mark_atoms_to_skip(
        complex_struct, selected_ids, substrate_ids,
        args.exclude_backbone,
        keep_ncap_ids if not args.exclude_backbone else None,
        keep_ccap_ids if not args.exclude_backbone else None
    )

    kept_atoms = sum(
        1 for fid in selected_ids
        for a in complex_struct[fid[1]][fid[2]].child_dict[fid[3]]
        if a.get_name() not in skip_map.get(fid, set())
    )
    LOGGER.info("[extract] Atoms after truncation: %d", kept_atoms)

    # Save structure (and optionally append link-H block)
    io = PDB.PDBIO()
    io.set_structure(complex_struct)

    buf = _io.StringIO()
    io.save(buf, AS_Select(selected_ids, skip_map))
    main_pdb_text = buf.getvalue()

    output_path = args.output_pdb[0]
    outputs: List[str] = []

    if args.add_linkH:
        link_coords = compute_linkH_atoms(complex_struct, selected_ids, skip_map)
        LOGGER.info("[extract] Link-H to add: %d", len(link_coords))

        # Drop every END line; a single END is re-appended after the link-H block.
        lines = [ln for ln in main_pdb_text.splitlines() if ln.strip() != "END"]
        ends_with_ter = bool(lines) and lines[-1].strip() == "TER"
        main_no_end = "\n".join(lines)
        if not main_no_end.endswith("\n"):
            main_no_end += "\n"

        final_parts = [main_no_end]
        if link_coords:
            # Same rule as the multi-structure path: do not emit a second bare
            # TER when the text already ends with one.
            if not ends_with_ter:
                final_parts.append("TER\n")
            start_serial = _max_serial_from_pdb_text(main_no_end)
            final_parts.append(_format_linkH_block(link_coords, start_serial))
        final_parts.append("END\n")

        with open(output_path, "w") as fh:
            fh.write("".join(final_parts))
        LOGGER.info("[extract] Binding-Pocket (Active Site) + link-H saved to %s", output_path)
        outputs.append(output_path)
    else:
        with open(output_path, "w") as fh:
            fh.write(main_pdb_text)
        LOGGER.info("[extract] Binding-Pocket (Active Site) saved to %s", output_path)
        outputs.append(output_path)

    # Charge summary (single model)
    charge_summary = compute_charge_summary(
        complex_struct, selected_ids, substrate_ids, getattr(args, "ligand_charge", None)
    )
    log_charge_summary("[extract]", charge_summary)

    if api:
        return {
            "outputs": outputs,
            "counts": [{"raw_atoms": raw, "kept_atoms": kept_atoms}],
            "charge_summary": charge_summary,
        }
    return None
|
|
2149
|
+
|
|
2150
|
+
|
|
2151
|
+
def extract_api(complex_pdb: List[str],
                center: str,
                output: Optional[List[str]] = None,
                radius: float = 2.6,
                radius_het2het: float = 0.0,
                include_H2O: bool = True,
                exclude_backbone: bool = True,
                add_linkH: bool = True,
                selected_resn: str = "",
                ligand_charge: Optional[float | str | Dict[str, float]] = None,
                verbose: bool = False) -> Dict[str, Any]:
    """
    Programmatic entry point: pack the keyword arguments into an
    ``argparse.Namespace`` and run ``extract(..., api=True)``.

    Args
    ----
    complex_pdb : list[str]
        Input PDB path(s). One path runs the single-structure route, several
        paths run the multi-structure route.
    center : str
        Substrate spec, forwarded as ``substrate_pdb``: a PDB path, a residue-ID
        list such as 'A:123,456' (insertion codes OK), or a residue-name list
        such as 'GPP,MMT'.
    output : list[str] | None
        Output path(s), forwarded as ``output_pdb``: one path for a multi-MODEL
        PDB, or N paths for per-file outputs. None or an empty list falls back
        to ['pocket.pdb'].
    radius : float
        Atom-atom cutoff (Å) for inclusion around substrate atoms.
    radius_het2het : float
        Independent hetero-hetero cutoff (Å) for non-C/H pairs.
    include_H2O : bool
        Include waters in the selection.
    exclude_backbone : bool
        Remove backbone atoms on non-substrate amino acids (with safeguards).
    add_linkH : bool
        Add link-H atoms for cut bonds (carbon-only) and append them as HL/LKH
        HETATM records.
    selected_resn : str
        Additional residues to force-include (comma/space separated).
    ligand_charge : float | str | dict[str,float] | None
        Either a total charge (float/str) for unknown residues (prefer unknown
        substrate), or a mapping like {'GPP': -3, 'MMT': -1}. In mapping mode,
        other unknown residues remain 0.
    verbose : bool
        Enable INFO logging.

    Returns
    -------
    dict
        Same structure as `extract(..., api=True)`.
    """
    # Any falsy value (None or an empty list) selects the default output name.
    output_paths = output if output else ['pocket.pdb']

    # Namespace attribute names mirror what extract() reads from CLI arguments.
    options = {
        "complex_pdb": complex_pdb,
        "substrate_pdb": center,
        "output_pdb": output_paths,
        "radius": radius,
        "radius_het2het": radius_het2het,
        "include_H2O": include_H2O,
        "exclude_backbone": exclude_backbone,
        "add_linkH": add_linkH,
        "selected_resn": selected_resn,
        "ligand_charge": ligand_charge,
        "verbose": verbose,
    }
    return extract(argparse.Namespace(**options), api=True)
|