mlmm-toolkit 0.2.2.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (372) hide show
  1. hessian_ff/__init__.py +50 -0
  2. hessian_ff/analytical_hessian.py +609 -0
  3. hessian_ff/constants.py +46 -0
  4. hessian_ff/forcefield.py +339 -0
  5. hessian_ff/loaders.py +608 -0
  6. hessian_ff/native/Makefile +8 -0
  7. hessian_ff/native/__init__.py +28 -0
  8. hessian_ff/native/analytical_hessian.py +88 -0
  9. hessian_ff/native/analytical_hessian_ext.cpp +258 -0
  10. hessian_ff/native/bonded.py +82 -0
  11. hessian_ff/native/bonded_ext.cpp +640 -0
  12. hessian_ff/native/loader.py +349 -0
  13. hessian_ff/native/nonbonded.py +118 -0
  14. hessian_ff/native/nonbonded_ext.cpp +1150 -0
  15. hessian_ff/prmtop_parmed.py +23 -0
  16. hessian_ff/system.py +107 -0
  17. hessian_ff/terms/__init__.py +14 -0
  18. hessian_ff/terms/angle.py +73 -0
  19. hessian_ff/terms/bond.py +44 -0
  20. hessian_ff/terms/cmap.py +406 -0
  21. hessian_ff/terms/dihedral.py +141 -0
  22. hessian_ff/terms/nonbonded.py +209 -0
  23. hessian_ff/tests/__init__.py +0 -0
  24. hessian_ff/tests/conftest.py +75 -0
  25. hessian_ff/tests/data/small/complex.parm7 +1346 -0
  26. hessian_ff/tests/data/small/complex.pdb +125 -0
  27. hessian_ff/tests/data/small/complex.rst7 +63 -0
  28. hessian_ff/tests/test_coords_input.py +44 -0
  29. hessian_ff/tests/test_energy_force.py +49 -0
  30. hessian_ff/tests/test_hessian.py +137 -0
  31. hessian_ff/tests/test_smoke.py +18 -0
  32. hessian_ff/tests/test_validation.py +40 -0
  33. hessian_ff/workflows.py +889 -0
  34. mlmm/__init__.py +36 -0
  35. mlmm/__main__.py +7 -0
  36. mlmm/_version.py +34 -0
  37. mlmm/add_elem_info.py +374 -0
  38. mlmm/advanced_help.py +91 -0
  39. mlmm/align_freeze_atoms.py +601 -0
  40. mlmm/all.py +3535 -0
  41. mlmm/bond_changes.py +231 -0
  42. mlmm/bool_compat.py +223 -0
  43. mlmm/cli.py +574 -0
  44. mlmm/cli_utils.py +166 -0
  45. mlmm/default_group.py +337 -0
  46. mlmm/defaults.py +467 -0
  47. mlmm/define_layer.py +526 -0
  48. mlmm/dft.py +1041 -0
  49. mlmm/energy_diagram.py +253 -0
  50. mlmm/extract.py +2213 -0
  51. mlmm/fix_altloc.py +464 -0
  52. mlmm/freq.py +1406 -0
  53. mlmm/harmonic_constraints.py +140 -0
  54. mlmm/hessian_cache.py +44 -0
  55. mlmm/hessian_calc.py +174 -0
  56. mlmm/irc.py +638 -0
  57. mlmm/mlmm_calc.py +2262 -0
  58. mlmm/mm_parm.py +945 -0
  59. mlmm/oniom_export.py +1983 -0
  60. mlmm/oniom_import.py +457 -0
  61. mlmm/opt.py +1742 -0
  62. mlmm/path_opt.py +1353 -0
  63. mlmm/path_search.py +2299 -0
  64. mlmm/preflight.py +88 -0
  65. mlmm/py.typed +1 -0
  66. mlmm/pysis_runner.py +45 -0
  67. mlmm/scan.py +1047 -0
  68. mlmm/scan2d.py +1226 -0
  69. mlmm/scan3d.py +1265 -0
  70. mlmm/scan_common.py +184 -0
  71. mlmm/summary_log.py +736 -0
  72. mlmm/trj2fig.py +448 -0
  73. mlmm/tsopt.py +2871 -0
  74. mlmm/utils.py +2309 -0
  75. mlmm/xtb_embedcharge_correction.py +475 -0
  76. mlmm_toolkit-0.2.2.dev0.dist-info/METADATA +1159 -0
  77. mlmm_toolkit-0.2.2.dev0.dist-info/RECORD +372 -0
  78. mlmm_toolkit-0.2.2.dev0.dist-info/WHEEL +5 -0
  79. mlmm_toolkit-0.2.2.dev0.dist-info/entry_points.txt +2 -0
  80. mlmm_toolkit-0.2.2.dev0.dist-info/licenses/LICENSE +674 -0
  81. mlmm_toolkit-0.2.2.dev0.dist-info/top_level.txt +4 -0
  82. pysisyphus/Geometry.py +1667 -0
  83. pysisyphus/LICENSE +674 -0
  84. pysisyphus/TableFormatter.py +63 -0
  85. pysisyphus/TablePrinter.py +74 -0
  86. pysisyphus/__init__.py +12 -0
  87. pysisyphus/calculators/AFIR.py +452 -0
  88. pysisyphus/calculators/AnaPot.py +20 -0
  89. pysisyphus/calculators/AnaPot2.py +48 -0
  90. pysisyphus/calculators/AnaPot3.py +12 -0
  91. pysisyphus/calculators/AnaPot4.py +20 -0
  92. pysisyphus/calculators/AnaPotBase.py +337 -0
  93. pysisyphus/calculators/AnaPotCBM.py +25 -0
  94. pysisyphus/calculators/AtomAtomTransTorque.py +154 -0
  95. pysisyphus/calculators/CFOUR.py +250 -0
  96. pysisyphus/calculators/Calculator.py +844 -0
  97. pysisyphus/calculators/CerjanMiller.py +24 -0
  98. pysisyphus/calculators/Composite.py +123 -0
  99. pysisyphus/calculators/ConicalIntersection.py +171 -0
  100. pysisyphus/calculators/DFTBp.py +430 -0
  101. pysisyphus/calculators/DFTD3.py +66 -0
  102. pysisyphus/calculators/DFTD4.py +84 -0
  103. pysisyphus/calculators/Dalton.py +61 -0
  104. pysisyphus/calculators/Dimer.py +681 -0
  105. pysisyphus/calculators/Dummy.py +20 -0
  106. pysisyphus/calculators/EGO.py +76 -0
  107. pysisyphus/calculators/EnergyMin.py +224 -0
  108. pysisyphus/calculators/ExternalPotential.py +264 -0
  109. pysisyphus/calculators/FakeASE.py +35 -0
  110. pysisyphus/calculators/FourWellAnaPot.py +28 -0
  111. pysisyphus/calculators/FreeEndNEBPot.py +39 -0
  112. pysisyphus/calculators/Gaussian09.py +18 -0
  113. pysisyphus/calculators/Gaussian16.py +726 -0
  114. pysisyphus/calculators/HardSphere.py +159 -0
  115. pysisyphus/calculators/IDPPCalculator.py +49 -0
  116. pysisyphus/calculators/IPIClient.py +133 -0
  117. pysisyphus/calculators/IPIServer.py +234 -0
  118. pysisyphus/calculators/LEPSBase.py +24 -0
  119. pysisyphus/calculators/LEPSExpr.py +139 -0
  120. pysisyphus/calculators/LennardJones.py +80 -0
  121. pysisyphus/calculators/MOPAC.py +219 -0
  122. pysisyphus/calculators/MullerBrownSympyPot.py +51 -0
  123. pysisyphus/calculators/MultiCalc.py +85 -0
  124. pysisyphus/calculators/NFK.py +45 -0
  125. pysisyphus/calculators/OBabel.py +87 -0
  126. pysisyphus/calculators/ONIOMv2.py +1129 -0
  127. pysisyphus/calculators/ORCA.py +893 -0
  128. pysisyphus/calculators/ORCA5.py +6 -0
  129. pysisyphus/calculators/OpenMM.py +88 -0
  130. pysisyphus/calculators/OpenMolcas.py +281 -0
  131. pysisyphus/calculators/OverlapCalculator.py +908 -0
  132. pysisyphus/calculators/Psi4.py +218 -0
  133. pysisyphus/calculators/PyPsi4.py +37 -0
  134. pysisyphus/calculators/PySCF.py +341 -0
  135. pysisyphus/calculators/PyXTB.py +73 -0
  136. pysisyphus/calculators/QCEngine.py +106 -0
  137. pysisyphus/calculators/Rastrigin.py +22 -0
  138. pysisyphus/calculators/Remote.py +76 -0
  139. pysisyphus/calculators/Rosenbrock.py +15 -0
  140. pysisyphus/calculators/SocketCalc.py +97 -0
  141. pysisyphus/calculators/TIP3P.py +111 -0
  142. pysisyphus/calculators/TransTorque.py +161 -0
  143. pysisyphus/calculators/Turbomole.py +965 -0
  144. pysisyphus/calculators/VRIPot.py +37 -0
  145. pysisyphus/calculators/WFOWrapper.py +333 -0
  146. pysisyphus/calculators/WFOWrapper2.py +341 -0
  147. pysisyphus/calculators/XTB.py +418 -0
  148. pysisyphus/calculators/__init__.py +81 -0
  149. pysisyphus/calculators/cosmo_data.py +139 -0
  150. pysisyphus/calculators/parser.py +150 -0
  151. pysisyphus/color.py +19 -0
  152. pysisyphus/config.py +133 -0
  153. pysisyphus/constants.py +65 -0
  154. pysisyphus/cos/AdaptiveNEB.py +230 -0
  155. pysisyphus/cos/ChainOfStates.py +725 -0
  156. pysisyphus/cos/FreeEndNEB.py +25 -0
  157. pysisyphus/cos/FreezingString.py +103 -0
  158. pysisyphus/cos/GrowingChainOfStates.py +71 -0
  159. pysisyphus/cos/GrowingNT.py +309 -0
  160. pysisyphus/cos/GrowingString.py +508 -0
  161. pysisyphus/cos/NEB.py +189 -0
  162. pysisyphus/cos/SimpleZTS.py +64 -0
  163. pysisyphus/cos/__init__.py +22 -0
  164. pysisyphus/cos/stiffness.py +199 -0
  165. pysisyphus/drivers/__init__.py +17 -0
  166. pysisyphus/drivers/afir.py +855 -0
  167. pysisyphus/drivers/barriers.py +271 -0
  168. pysisyphus/drivers/birkholz.py +138 -0
  169. pysisyphus/drivers/cluster.py +318 -0
  170. pysisyphus/drivers/diabatization.py +133 -0
  171. pysisyphus/drivers/merge.py +368 -0
  172. pysisyphus/drivers/merge_mol2.py +322 -0
  173. pysisyphus/drivers/opt.py +375 -0
  174. pysisyphus/drivers/perf.py +91 -0
  175. pysisyphus/drivers/pka.py +52 -0
  176. pysisyphus/drivers/precon_pos_rot.py +669 -0
  177. pysisyphus/drivers/rates.py +480 -0
  178. pysisyphus/drivers/replace.py +219 -0
  179. pysisyphus/drivers/scan.py +212 -0
  180. pysisyphus/drivers/spectrum.py +166 -0
  181. pysisyphus/drivers/thermo.py +31 -0
  182. pysisyphus/dynamics/Gaussian.py +103 -0
  183. pysisyphus/dynamics/__init__.py +20 -0
  184. pysisyphus/dynamics/colvars.py +136 -0
  185. pysisyphus/dynamics/driver.py +297 -0
  186. pysisyphus/dynamics/helpers.py +256 -0
  187. pysisyphus/dynamics/lincs.py +105 -0
  188. pysisyphus/dynamics/mdp.py +364 -0
  189. pysisyphus/dynamics/rattle.py +121 -0
  190. pysisyphus/dynamics/thermostats.py +128 -0
  191. pysisyphus/dynamics/wigner.py +266 -0
  192. pysisyphus/elem_data.py +3473 -0
  193. pysisyphus/exceptions.py +2 -0
  194. pysisyphus/filtertrj.py +69 -0
  195. pysisyphus/helpers.py +623 -0
  196. pysisyphus/helpers_pure.py +649 -0
  197. pysisyphus/init_logging.py +50 -0
  198. pysisyphus/intcoords/Bend.py +69 -0
  199. pysisyphus/intcoords/Bend2.py +25 -0
  200. pysisyphus/intcoords/BondedFragment.py +32 -0
  201. pysisyphus/intcoords/Cartesian.py +41 -0
  202. pysisyphus/intcoords/CartesianCoords.py +140 -0
  203. pysisyphus/intcoords/Coords.py +56 -0
  204. pysisyphus/intcoords/DLC.py +197 -0
  205. pysisyphus/intcoords/DistanceFunction.py +34 -0
  206. pysisyphus/intcoords/DummyImproper.py +70 -0
  207. pysisyphus/intcoords/DummyTorsion.py +72 -0
  208. pysisyphus/intcoords/LinearBend.py +105 -0
  209. pysisyphus/intcoords/LinearDisplacement.py +80 -0
  210. pysisyphus/intcoords/OutOfPlane.py +59 -0
  211. pysisyphus/intcoords/PrimTypes.py +286 -0
  212. pysisyphus/intcoords/Primitive.py +137 -0
  213. pysisyphus/intcoords/RedundantCoords.py +659 -0
  214. pysisyphus/intcoords/RobustTorsion.py +59 -0
  215. pysisyphus/intcoords/Rotation.py +147 -0
  216. pysisyphus/intcoords/Stretch.py +31 -0
  217. pysisyphus/intcoords/Torsion.py +101 -0
  218. pysisyphus/intcoords/Torsion2.py +25 -0
  219. pysisyphus/intcoords/Translation.py +45 -0
  220. pysisyphus/intcoords/__init__.py +61 -0
  221. pysisyphus/intcoords/augment_bonds.py +126 -0
  222. pysisyphus/intcoords/derivatives.py +10512 -0
  223. pysisyphus/intcoords/eval.py +80 -0
  224. pysisyphus/intcoords/exceptions.py +37 -0
  225. pysisyphus/intcoords/findiffs.py +48 -0
  226. pysisyphus/intcoords/generate_derivatives.py +414 -0
  227. pysisyphus/intcoords/helpers.py +235 -0
  228. pysisyphus/intcoords/logging_conf.py +10 -0
  229. pysisyphus/intcoords/mp_derivatives.py +10836 -0
  230. pysisyphus/intcoords/setup.py +962 -0
  231. pysisyphus/intcoords/setup_fast.py +176 -0
  232. pysisyphus/intcoords/update.py +272 -0
  233. pysisyphus/intcoords/valid.py +89 -0
  234. pysisyphus/interpolate/Geodesic.py +93 -0
  235. pysisyphus/interpolate/IDPP.py +55 -0
  236. pysisyphus/interpolate/Interpolator.py +116 -0
  237. pysisyphus/interpolate/LST.py +70 -0
  238. pysisyphus/interpolate/Redund.py +152 -0
  239. pysisyphus/interpolate/__init__.py +9 -0
  240. pysisyphus/interpolate/helpers.py +34 -0
  241. pysisyphus/io/__init__.py +22 -0
  242. pysisyphus/io/aomix.py +178 -0
  243. pysisyphus/io/cjson.py +24 -0
  244. pysisyphus/io/crd.py +101 -0
  245. pysisyphus/io/cube.py +220 -0
  246. pysisyphus/io/fchk.py +184 -0
  247. pysisyphus/io/hdf5.py +49 -0
  248. pysisyphus/io/hessian.py +72 -0
  249. pysisyphus/io/mol2.py +146 -0
  250. pysisyphus/io/molden.py +293 -0
  251. pysisyphus/io/orca.py +189 -0
  252. pysisyphus/io/pdb.py +269 -0
  253. pysisyphus/io/psf.py +79 -0
  254. pysisyphus/io/pubchem.py +31 -0
  255. pysisyphus/io/qcschema.py +34 -0
  256. pysisyphus/io/sdf.py +29 -0
  257. pysisyphus/io/xyz.py +61 -0
  258. pysisyphus/io/zmat.py +175 -0
  259. pysisyphus/irc/DWI.py +108 -0
  260. pysisyphus/irc/DampedVelocityVerlet.py +134 -0
  261. pysisyphus/irc/Euler.py +22 -0
  262. pysisyphus/irc/EulerPC.py +345 -0
  263. pysisyphus/irc/GonzalezSchlegel.py +187 -0
  264. pysisyphus/irc/IMKMod.py +164 -0
  265. pysisyphus/irc/IRC.py +878 -0
  266. pysisyphus/irc/IRCDummy.py +10 -0
  267. pysisyphus/irc/Instanton.py +307 -0
  268. pysisyphus/irc/LQA.py +53 -0
  269. pysisyphus/irc/ModeKill.py +136 -0
  270. pysisyphus/irc/ParamPlot.py +53 -0
  271. pysisyphus/irc/RK4.py +36 -0
  272. pysisyphus/irc/__init__.py +31 -0
  273. pysisyphus/irc/initial_displ.py +219 -0
  274. pysisyphus/linalg.py +411 -0
  275. pysisyphus/line_searches/Backtracking.py +88 -0
  276. pysisyphus/line_searches/HagerZhang.py +184 -0
  277. pysisyphus/line_searches/LineSearch.py +232 -0
  278. pysisyphus/line_searches/StrongWolfe.py +108 -0
  279. pysisyphus/line_searches/__init__.py +9 -0
  280. pysisyphus/line_searches/interpol.py +15 -0
  281. pysisyphus/modefollow/NormalMode.py +40 -0
  282. pysisyphus/modefollow/__init__.py +10 -0
  283. pysisyphus/modefollow/davidson.py +199 -0
  284. pysisyphus/modefollow/lanczos.py +95 -0
  285. pysisyphus/optimizers/BFGS.py +99 -0
  286. pysisyphus/optimizers/BacktrackingOptimizer.py +113 -0
  287. pysisyphus/optimizers/ConjugateGradient.py +98 -0
  288. pysisyphus/optimizers/CubicNewton.py +75 -0
  289. pysisyphus/optimizers/FIRE.py +113 -0
  290. pysisyphus/optimizers/HessianOptimizer.py +1176 -0
  291. pysisyphus/optimizers/LBFGS.py +228 -0
  292. pysisyphus/optimizers/LayerOpt.py +411 -0
  293. pysisyphus/optimizers/MicroOptimizer.py +169 -0
  294. pysisyphus/optimizers/NCOptimizer.py +90 -0
  295. pysisyphus/optimizers/Optimizer.py +1084 -0
  296. pysisyphus/optimizers/PreconLBFGS.py +260 -0
  297. pysisyphus/optimizers/PreconSteepestDescent.py +7 -0
  298. pysisyphus/optimizers/QuickMin.py +74 -0
  299. pysisyphus/optimizers/RFOptimizer.py +181 -0
  300. pysisyphus/optimizers/RSA.py +99 -0
  301. pysisyphus/optimizers/StabilizedQNMethod.py +248 -0
  302. pysisyphus/optimizers/SteepestDescent.py +23 -0
  303. pysisyphus/optimizers/StringOptimizer.py +173 -0
  304. pysisyphus/optimizers/__init__.py +41 -0
  305. pysisyphus/optimizers/closures.py +301 -0
  306. pysisyphus/optimizers/cls_map.py +58 -0
  307. pysisyphus/optimizers/exceptions.py +6 -0
  308. pysisyphus/optimizers/gdiis.py +280 -0
  309. pysisyphus/optimizers/guess_hessians.py +311 -0
  310. pysisyphus/optimizers/hessian_updates.py +355 -0
  311. pysisyphus/optimizers/poly_fit.py +285 -0
  312. pysisyphus/optimizers/precon.py +153 -0
  313. pysisyphus/optimizers/restrict_step.py +24 -0
  314. pysisyphus/pack.py +172 -0
  315. pysisyphus/peakdetect.py +948 -0
  316. pysisyphus/plot.py +1031 -0
  317. pysisyphus/run.py +2106 -0
  318. pysisyphus/socket_helper.py +74 -0
  319. pysisyphus/stocastic/FragmentKick.py +132 -0
  320. pysisyphus/stocastic/Kick.py +81 -0
  321. pysisyphus/stocastic/Pipeline.py +303 -0
  322. pysisyphus/stocastic/__init__.py +21 -0
  323. pysisyphus/stocastic/align.py +127 -0
  324. pysisyphus/testing.py +96 -0
  325. pysisyphus/thermo.py +156 -0
  326. pysisyphus/trj.py +824 -0
  327. pysisyphus/tsoptimizers/RSIRFOptimizer.py +56 -0
  328. pysisyphus/tsoptimizers/RSPRFOptimizer.py +182 -0
  329. pysisyphus/tsoptimizers/TRIM.py +59 -0
  330. pysisyphus/tsoptimizers/TSHessianOptimizer.py +463 -0
  331. pysisyphus/tsoptimizers/__init__.py +23 -0
  332. pysisyphus/wavefunction/Basis.py +239 -0
  333. pysisyphus/wavefunction/DIIS.py +76 -0
  334. pysisyphus/wavefunction/__init__.py +25 -0
  335. pysisyphus/wavefunction/build_ext.py +42 -0
  336. pysisyphus/wavefunction/cart2sph.py +190 -0
  337. pysisyphus/wavefunction/diabatization.py +304 -0
  338. pysisyphus/wavefunction/excited_states.py +435 -0
  339. pysisyphus/wavefunction/gen_ints.py +1811 -0
  340. pysisyphus/wavefunction/helpers.py +104 -0
  341. pysisyphus/wavefunction/ints/__init__.py +0 -0
  342. pysisyphus/wavefunction/ints/boys.py +193 -0
  343. pysisyphus/wavefunction/ints/boys_table_N_64_xasym_27.1_step_0.01.npy +0 -0
  344. pysisyphus/wavefunction/ints/cart_gto3d.py +176 -0
  345. pysisyphus/wavefunction/ints/coulomb3d.py +25928 -0
  346. pysisyphus/wavefunction/ints/diag_quadrupole3d.py +10036 -0
  347. pysisyphus/wavefunction/ints/dipole3d.py +8762 -0
  348. pysisyphus/wavefunction/ints/int2c2e3d.py +7198 -0
  349. pysisyphus/wavefunction/ints/int3c2e3d_sph.py +65040 -0
  350. pysisyphus/wavefunction/ints/kinetic3d.py +8240 -0
  351. pysisyphus/wavefunction/ints/ovlp3d.py +3777 -0
  352. pysisyphus/wavefunction/ints/quadrupole3d.py +15054 -0
  353. pysisyphus/wavefunction/ints/self_ovlp3d.py +198 -0
  354. pysisyphus/wavefunction/localization.py +458 -0
  355. pysisyphus/wavefunction/multipole.py +159 -0
  356. pysisyphus/wavefunction/normalization.py +36 -0
  357. pysisyphus/wavefunction/pop_analysis.py +134 -0
  358. pysisyphus/wavefunction/shells.py +1171 -0
  359. pysisyphus/wavefunction/wavefunction.py +504 -0
  360. pysisyphus/wrapper/__init__.py +11 -0
  361. pysisyphus/wrapper/exceptions.py +2 -0
  362. pysisyphus/wrapper/jmol.py +120 -0
  363. pysisyphus/wrapper/mwfn.py +169 -0
  364. pysisyphus/wrapper/packmol.py +71 -0
  365. pysisyphus/xyzloader.py +168 -0
  366. pysisyphus/yaml_mods.py +45 -0
  367. thermoanalysis/LICENSE +674 -0
  368. thermoanalysis/QCData.py +244 -0
  369. thermoanalysis/__init__.py +0 -0
  370. thermoanalysis/config.py +3 -0
  371. thermoanalysis/constants.py +20 -0
  372. thermoanalysis/thermo.py +1011 -0
mlmm/extract.py ADDED
@@ -0,0 +1,2213 @@
1
+ # mlmm/extract.py
2
+
3
+ """
4
+ extract — Automated binding‑pocket (active‑site) extractor
5
+ ====================================================================
6
+
7
+ Usage (CLI)
8
+ -----------
9
+ mlmm extract -i INPUT.pdb [INPUT2.pdb ...] -c <substrate_spec> \
10
+ [-o OUTPUT.pdb ...] [-r <Å>] [--radius-het2het <Å>] \
11
+ [--include-H2O/--no-include-H2O] [--exclude-backbone/--no-exclude-backbone] \
12
+ [--add-linkH/--no-add-linkH] [--selected-resn "CHAIN:RES" ...] \
13
+ [-l, --ligand-charge <number|"RES:Q,...">] [--verbose/--no-verbose]
14
+
15
+ Examples
16
+ --------
17
+ # Minimal (ID-based substrate) with explicit total ligand charge
18
+ mlmm extract -i complex.pdb -c A:123 -o pocket.pdb -l -3
19
+
20
+ # Substrate provided as a PDB; per-resname charge mapping (others remain 0)
21
+ mlmm extract -i complex.pdb -c substrate.pdb -o pocket.pdb \
22
+ -l "GPP:-3,MMT:-1"
23
+
24
+ # Name-based substrate selection including all matches (WARNING is logged)
25
+ mlmm extract -i complex.pdb -c "GPP,MMT" -o pocket.pdb -l -4
26
+
27
+ # Multi-structure to single multi-MODEL output with hetero-hetero proximity enabled
28
+ mlmm extract -i complex1.pdb complex2.pdb -c A:123 \
29
+ -o pocket_multi.pdb --radius-het2het 2.6 -l -3 --verbose
30
+
31
+ Description
32
+ -----------
33
+ Extracts an active‑site pocket around specified substrate residues from a protein–substrate complex,
34
+ applies biochemically aware truncation (backbone/side‑chain capping with safeguards), and can append
35
+ link hydrogens for cut bonds. Supports single structures and ensembles (multi‑MODEL or per‑file outputs).
36
+ Typical use cases include QM/MM, ML/MM, and cluster QM models.
37
+
38
+ Residue inclusion
39
+ -----
40
+ - Always include the substrate residues.
41
+ - Standard cutoff (``--radius``, default 2.6 Å):
42
+ - If ``--no-exclude-backbone``: include any residue if **any atom** is within the cutoff.
43
+ - If ``--exclude-backbone`` (default): for **amino‑acid residues**, the qualifying atom
44
+ must be **non‑backbone** (not in {N, H*, CA, HA*, C, O, OXT}); non‑amino‑acid residues qualify by any atom.
45
+ - Independent hetero–hetero proximity (``--radius-het2het``):
46
+ add residues if a **substrate hetero atom (non‑C/H)** is within the cutoff of a **protein hetero atom**.
47
+ With ``--exclude-backbone``, amino‑acid neighbors must be **non‑backbone** atoms.
48
+ - Waters are included by default (``--include-H2O``; disable with ``--no-include-H2O``).
49
+ - ``--selected-resn`` force‑includes residues (chain and insertion codes supported).
50
+ - When ``--no-exclude-backbone`` and a selected residue’s **backbone atom** contacts the substrate
51
+ (within either cutoff), include its peptide‑adjacent N‑side and C‑side neighbors (C–N ≤ 1.9 Å). For true termini,
52
+ keep the respective terminal cap atoms (N/H* or C/O/OXT).
53
+ - **Disulfide safeguard:** if a selected CYS/CYX forms an SG–SG contact ≤ 2.5 Å, include both partners.
54
+ - **Proline safeguard:** if a selected **PRO** is not N‑terminal (peptide‑adjacent), include the immediately
55
+ preceding amino acid. For that neighbor, **CA is always kept**, and when backbone exclusion is on,
56
+ keep **C** and **O/OXT** (to preserve the bond into PRO–N).
57
+
58
+ Truncation (capping)
59
+ -----
60
+ - **Isolated residues** → keep **pure side‑chain** (remove N, CA, C, O, OXT and N/CA H*).
61
+ - **PRO/HYP** retain N, CA, HA, H* to keep the ring.
62
+ - **Continuous peptide stretches** keep internal backbone; only **terminal caps** are removed
63
+ (N‑cap: N/H*; C‑cap: C/O/OXT). TER‑aware segmentation prevents crossing chain breaks.
64
+ - With ``--exclude-backbone`` (default), delete main‑chain atoms on all **non‑substrate amino acids**,
65
+ except for the specific PRO/HYP retention and PRO‑adjacency preservation above.
66
+ - **Non‑amino‑acid residues**: atoms named like protein backbone ({"N","CA","HA","H","H1","H2","H3"})
67
+ are **never deleted** by capping logic.
68
+
69
+ Link hydrogens (--add-linkH)
70
+ -----
71
+ - Adds **carbon‑only** link H at **1.09 Å** along the cut‑bond vector.
72
+ Normal residues: checks **CB–CA**, **CA–N**, **CA–C**; **PRO/HYP**: **CA–C** only.
73
+ - If any are added, append a **TER** then a contiguous **HETATM** block with atoms named **``HL``**
74
+ in residue **``LKH``** (chain ``L``); serials continue from the main block.
75
+ - In multi‑structure mode, link‑H **targets and ordering** are enforced to be **identical across models**;
76
+ coordinates remain model‑specific. The flag is honored in both single‑ and multi‑structure modes.
77
+
78
+ Charge summary
79
+ -----
80
+ - **AMINO_ACIDS** dictionary supplies nominal integer charges for amino‑acid residues (common variants included).
81
+ - **ION** supplies charges for common ions (e.g., ZN, MG, FE2). Waters are 0.
82
+ - **Unknown residues** (not in AMINO_ACIDS/ION/WATER) are **0** unless ``--ligand-charge`` is given.
83
+ - ``--ligand-charge <number>``: total charge distributed across **unknown substrate** residues
84
+ (or across all unknowns if no unknown substrate).
85
+ - ``--ligand-charge "RES1:Q1,RES2:Q2"``: set **per‑resname** charges; any other unknown residues remain 0.
86
+ - In multi‑structure mode the **charge summary is computed on the first input PDB** only.
87
+
88
+ Multi-structure ensembles
89
+ -----
90
+ - Accepts multiple input PDBs (same **atom count**; ordering is **assumed identical** and is
91
+ **spot‑checked** at the beginning and end of the atom list).
92
+ - Each structure is selected independently; the **union** of selected residues is applied to all.
93
+ Disulfides, PRO‑adjacency, and (if enabled) backbone‑contact neighbor augmentation are also unioned.
94
+ - Outputs:
95
+ - Provide **one** output path → **multi‑MODEL** PDB (one MODEL per input).
96
+ - Provide **N** output paths where **N == number of inputs** → **N** single‑model PDBs.
97
+ - If ``-o`` is omitted with multiple inputs → per‑file outputs ``pocket_{original_filename}.pdb``.
98
+ - **Diagnostics:** atom counts (**raw** vs **after truncation**) are logged **per model**.
99
+
100
+ Substrate specification
101
+ -----
102
+ ``-c/--center`` accepts:
103
+ - a **PDB path** (exact coordinate match on the first input; IDs propagated to others),
104
+ - a list of **residue IDs**: ``"123,124"``, ``"A:123,B:456"``, ``"123A"``, ``"A:123A"`` (insertion codes OK),
105
+ - or a list of **residue names** (case‑insensitive), e.g., ``"GPP,MMT"``.
106
+ If multiple residues share the same name, **all** matches are used and a **WARNING** is logged.
107
+
108
+ Outputs (& Directory Layout)
109
+ ----------------------------
110
+ <output>/ (default: pocket.pdb for single input; pocket_<source>.pdb per input when -o is omitted)
111
+ ├─ pocket.pdb # default single-input pocket
112
+ ├─ pocket_<original_filename>.pdb # default per-input pocket when multiple inputs and -o omitted
113
+ └─ <user_paths>.pdb # custom outputs; one path = multi-MODEL, N paths = per-structure
114
+
115
+ Link hydrogens, logs, and programmatic use
116
+ ├─ Link-H block (when added) follows a TER as contiguous HETATM records named HL in residue LKH (chain L).
117
+ ├─ INFO logs summarize residue selection, raw/kept atom counts, and the charge summary.
118
+ └─ ``extract(..., api=True)`` / ``extract_api(...)`` returns ``{"outputs": [...], "counts": [...], "charge_summary": {...}}``.
119
+
120
+
121
+ Notes
122
+ -----
123
+ - **Defaults / behavior:**
124
+ - ``--radius`` default: **2.6 Å**. If given **0**, internally nudged to **0.001 Å**.
125
+ - ``--radius-het2het`` default: **0 Å** (off). Internally treated as **0.001 Å** if ``0`` is given.
126
+ - ``--include-H2O`` default: **true**.
127
+ - ``--exclude-backbone`` default: **true**.
128
+ - ``--add-linkH`` default: **false**.
129
+ - ``--ligand-charge`` default: **None** (unknown residues counted as 0 unless set).
130
+ - Output default: single input → ``pocket.pdb``; multiple inputs → ``pocket_{original_filename}.pdb``.
131
+ - **Geometry thresholds and tolerances:**
132
+ - Peptide adjacency: **C(prev)–N(next) ≤ 1.9 Å** (distance‑based; practical TER awareness).
133
+ - Disulfide detection: **SG–SG ≤ 2.5 Å**.
134
+ - Link‑H distance: **1.09 Å** (C–H) along the cut‑bond vector.
135
+ - Exact match tolerance for substrate PDB: **1e‑3 Å** per atom.
136
+ - **Safeguards and special cases:**
137
+ - **PRO/HYP** retain N, CA, HA, H* in isolated truncations; PRO’s **N‑side neighbor** is auto‑included
138
+ when peptide‑adjacent; **CA** on that neighbor is always kept, and with backbone exclusion
139
+ **C** and **O/OXT** are preserved to maintain the peptide bond into PRO–N.
140
+ - **Non‑amino‑acid residues** never lose atoms named like backbone (``N, CA, HA, H, H1, H2, H3``).
141
+ - **Waters** (HOH/WAT/H2O/DOD/TIP/TIP3/SOL) are always neutral (charge 0) and included by default.
142
+ - **Dependencies:** Python ≥ **3.10** (PEP 604 unions), Biopython ≥ **1.80**, NumPy.
143
+ - **Python API (for reference):**
144
+ - ``extract(args: argparse.Namespace | None = None, api=False)`` — main entry (CLI or programmatic).
145
+ - ``extract_api(...)`` — convenience wrapper that returns ``{'outputs','counts','charge_summary'}`` when used programmatically.
146
+ """
147
+
148
+ from __future__ import annotations
149
+
150
+ import argparse
151
+ import logging
152
+ import io as _io
153
+ import os
154
+ import re
155
+ import sys
156
+ from typing import Dict, List, Set, Tuple, Iterable, Any, Optional, Sequence
157
+
158
+ import click
159
+ import numpy as np
160
+ from Bio import PDB
161
+ from Bio.PDB import NeighborSearch
162
+
163
+ # Public API
164
+ __all__ = ["extract", "extract_api"]
165
+
166
+
167
+ LOGGER = logging.getLogger(__name__)
168
+
169
+
170
class _ClickEchoHandler(logging.Handler):
    """Logging handler that prints formatted records through ``click.echo``.

    Records at WARNING level or above are echoed with ``err=True`` (stderr);
    everything below that is echoed with ``err=False``.
    """

    def emit(self, record: logging.LogRecord) -> None:
        """Format *record* and echo it; any failure is passed to ``handleError``."""
        try:
            to_stderr = record.levelno >= logging.WARNING
            click.echo(self.format(record), err=to_stderr)
        except Exception:
            # Same contract as the stdlib handlers: never let a logging
            # failure propagate into the caller.
            self.handleError(record)
178
+
179
+
180
def _configure_extract_logger(verbose: bool) -> None:
    """Set up the module-local ``LOGGER`` for CLI output.

    Only this module's logger is modified: propagation to ancestor loggers is
    switched off and any handlers previously attached to ``LOGGER`` are
    replaced by a single ``_ClickEchoHandler``.

    Args:
        verbose: When true, INFO and above are emitted as the bare message.
            Otherwise only WARNING and above are emitted, prefixed with the
            level name.
    """
    if verbose:
        threshold, pattern = logging.INFO, "%(message)s"
    else:
        threshold, pattern = logging.WARNING, "%(levelname)s: %(message)s"

    LOGGER.setLevel(threshold)
    LOGGER.propagate = False

    # Iterate over a snapshot: removeHandler mutates LOGGER.handlers.
    for stale in tuple(LOGGER.handlers):
        LOGGER.removeHandler(stale)

    echo_handler = _ClickEchoHandler()
    echo_handler.setLevel(threshold)
    echo_handler.setFormatter(logging.Formatter(pattern))
    LOGGER.addHandler(echo_handler)
195
+
196
+
197
+ # ---------------------------------------------------------------------
198
+ # Constants
199
+ # ---------------------------------------------------------------------
200
# Main-chain atom names: heavy atoms first, then the attached hydrogens.
BACKBONE_ATOMS: Set[str] = set(
    ("N", "C", "O", "CA", "OXT")
    + ("H", "H1", "H2", "H3", "HN", "HA", "HA2", "HA3")
)
# When --exclude-backbone true, remove the full main-chain set.
# NOTE: this is an alias of BACKBONE_ATOMS (the same set object), not a copy.
BACKBONE_ALL: Set[str] = BACKBONE_ATOMS
206
+
207
# Unified amino-acid dictionary: resname -> nominal integer charge
# (membership checks throughout the code use dictionary keys).
# Each key must appear exactly once: a repeated key in a dict literal is
# silently overwritten by the later entry.
AMINO_ACIDS: Dict[str, int] = {
    # --- Standard 20 (L) ---
    "ALA": 0, "ARG": +1, "ASN": 0, "ASP": -1, "CYS": 0,
    "GLU": -1, "GLN": 0, "GLY": 0, "HIS": 0, "ILE": 0,
    "LEU": 0, "LYS": +1, "MET": 0, "PHE": 0, "PRO": 0,
    "SER": 0, "THR": 0, "TRP": 0, "TYR": 0, "VAL": 0,

    # --- Canonical extras ---
    "SEC": 0,   # selenocysteine
    "PYL": +1,  # pyrrolysine

    # --- Protonation / tautomers (Amber/CHARMM style) ---
    "HIP": +1,  # fully protonated His
    "HID": 0,   # Nδ-protonated His
    "HIE": 0,   # Nε-protonated His
    "ASH": 0,   # neutral Asp
    "GLH": 0,   # neutral Glu
    "LYN": 0,   # neutral Lys
    "ARN": 0,   # neutral Arg
    "TYM": -1,  # deprotonated Tyr (phenolate)

    # --- Phosphorylated residues ---
    "SEP": -2, "TPO": -2, "PTR": -2,
    "S1P": -1, "T1P": -1, "Y1P": -1,  # monoanionic phospho-Ser/Thr/Tyr

    # --- Phosphorylated histidines (phosaa19SB) ---
    "H1D": 0,   # ND1-phospho-His, neutral
    "H2D": -1,  # ND1-phospho-His, anionic
    "H1E": 0,   # NE2-phospho-His, neutral
    "H2E": -1,  # NE2-phospho-His, anionic

    # --- Cys family ---
    "CYX": 0,   # disulfide Cys
    "CSO": 0,   # Cys sulfenic acid
    "CSD": -1,  # Cys sulfinic acid
    "CSX": 0,   # generic Cys derivative
    "OCS": -1,  # cysteic acid
    "CYM": -1,  # deprotonated Cys

    # --- Lys variants / carboxylation ---
    # ("DLY" is listed once, with the D isomers below.)
    "MLY": +1, "LLP": +1,
    "KCX": -1,  # Lysine Nz-Carboxylic Acid

    # --- D isomers (19 residues) ---
    "DAL": 0, "DAR": +1, "DSG": 0, "DAS": -1, "DCY": 0,
    "DGN": 0, "DGL": -1, "DHI": 0, "DIL": 0, "DLE": 0,
    "DLY": +1, "MED": 0, "DPN": 0, "DPR": 0, "DSN": 0,
    "DTH": 0, "DTR": 0, "DTY": 0, "DVA": 0,

    # --- Carboxylation / cyclization / others ---
    "CGU": -2,  # gamma-carboxy-glutamate
    "CGA": -1,  # carboxymethylated glutamate
    "PCA": 0,   # pyroglutamate
    "MSE": 0,   # selenomethionine
    "OMT": 0,   # methionine sulfone

    # --- Other modified residues possibly encountered ---
    "ASA": 0, "CIR": 0, "FOR": 0, "MVA": 0, "IIL": 0, "AIB": 0, "HTN": 0,
    "SAR": 0, "NMC": 0, "PFF": 0, "NFA": 0, "ALY": 0, "AZF": 0, "CNX": 0, "CYF": 0,

    # --- Hydroxyproline ---
    "HYP": 0,

    # --- All C-terminus ---
    "CALA": -1, "CARG": 0, "CASN": -1, "CASP": -2, "CCYS": -1,
    "CCYX": -1, "CGLN": -1, "CGLU": -2, "CGLY": -1, "CHID": -1,
    "CHIE": -1, "CHIP": 0, "CHYP": -1, "CILE": -1, "CLEU": -1,
    "CLYS": 0, "CMET": -1, "CPHE": -1, "CPRO": -1, "CSER": -1,
    "CTHR": -1, "CTRP": -1, "CTYR": -1, "CVAL": -1, "NHE": 0,
    "NME": 0,
    "CTER": -1,  # generic C-terminus

    # --- All N-terminus ---
    "NALA": +1, "NARG": +2, "NASN": +1, "NASP": 0, "NCYS": +1,
    "NCYX": +1, "NGLN": +1, "NGLU": 0, "NGLY": +1, "NHID": +1,
    "NHIE": +1, "NHIP": +2, "NILE": +1, "NLEU": +1, "NLYS": +2,
    "NMET": +1, "NPHE": +1, "NPRO": +1, "NSER": +1, "NTHR": +1,
    "NTRP": +1, "NTYR": +1, "NVAL": +1, "ACE": 0,
    "NTER": +1,  # generic N-terminus
}
289
+
290
# Common ions (by residue name) and their formal charges.
# The table is assembled from per-charge groups; insertion order follows the
# groups below (+1, +2, +3, +4, then -1).
ION: Dict[str, int] = {
    resname: charge
    for charge, resnames in (
        (+1, (
            "LI", "NA", "K", "RB", "CS", "TL", "AG", "CU1",
            "Ag", "K+", "Na+", "NH4", "H3O+", "HE+", "HZ+", "Tl",
        )),
        (+2, (
            "MG", "CA", "SR", "BA", "MN", "FE2", "CO", "NI",
            "CU", "ZN", "CD", "HG", "PB", "Be", "PD", "PT",
            "Sn", "Ra", "YB2", "V2+",
        )),
        (+3, (
            "FE", "AU3", "AL", "GA", "IN",
            "CE", "Ce", "CR", "Cr", "Dy", "EU", "EU3", "Er",
            "GD3", "LA", "LU", "Nd", "PR", "SM", "Sm", "TB",
            "Tm", "Y", "Pu",
        )),
        (+4, ("U4+", "Th", "Hf", "Zr")),
        (-1, ("F", "CL", "BR", "I", "Cl-", "IOD")),
    )
    for resname in resnames
}
313
+
314
# Maximum Sγ–Sγ (SG–SG) separation, in Å, used as the default disulfide cutoff.
DISULFIDE_CUTOFF = 2.5
# Positional tolerance, in Å, for treating two atoms as an exact match.
EXACT_EPS = 0.001
# Residue names that are treated as water.
WATER_RES = set("HOH WAT H2O DOD TIP TIP3 SOL".split())

# Type for cross-structure residue identity (chain, hetflag, resseq, icode, resname)
ResidueKey = Tuple[str, str, int, str, str]
320
+
321
+ # ---------------------------------------------------------------------
322
+ # Helpers
323
+ # ---------------------------------------------------------------------
324
+
325
+
326
+ def _extract_short_help() -> str:
327
+ return "\n".join(
328
+ [
329
+ "Usage: mlmm extract [OPTIONS]",
330
+ "",
331
+ "Extract a binding pocket around substrate residues.",
332
+ "",
333
+ "Core options:",
334
+ " -i, --input PATH [PATH ...] Input complex PDB file(s).",
335
+ " -c, --center SPEC Substrate selector (PDB / residue IDs / residue names).",
336
+ " -o, --output PATH [PATH ...] Output pocket PDB path(s).",
337
+ " -r, --radius FLOAT Pocket inclusion radius in angstrom.",
338
+ " -l, --ligand-charge VALUE Total or mapped ligand charge.",
339
+ " --help-advanced Show full extract options and exit.",
340
+ "",
341
+ "Use '--help-advanced' to see all extractor options.",
342
+ ]
343
+ )
344
+
345
+
346
+ # ── Native Click command ─────────────────────────────────────────────
347
+
348
+ _EXTRACT_ALL_FLAGS = (
349
+ "-i", "--input",
350
+ "-c", "--center",
351
+ "-o", "--output",
352
+ "-r", "--radius",
353
+ "--radius-het2het",
354
+ "--include-H2O", "--include-h2o", "--no-include-H2O", "--no-include-h2o",
355
+ "--exclude-backbone", "--no-exclude-backbone",
356
+ "--add-linkH", "--no-add-linkH",
357
+ "--selected-resn",
358
+ "-l",
359
+ "--ligand-charge",
360
+ "-v", "--verbose", "--no-verbose",
361
+ "-h", "--help", "--help-advanced",
362
+ )
363
+
364
+
365
+ def _gather_extract_variadic(
366
+ ctx_args: List[str],
367
+ flag_names: Sequence[str],
368
+ ) -> List[str]:
369
+ """Collect variadic positional values after *flag_names* from Click extra args."""
370
+ names_set = set(flag_names)
371
+ stop_set = set(_EXTRACT_ALL_FLAGS)
372
+ vals: List[str] = []
373
+ i = 0
374
+ while i < len(ctx_args):
375
+ tok = ctx_args[i]
376
+ if tok in names_set:
377
+ j = i + 1
378
+ while j < len(ctx_args) and ctx_args[j] not in stop_set:
379
+ vals.append(ctx_args[j])
380
+ j += 1
381
+ i = j
382
+ else:
383
+ i += 1
384
+ return vals
385
+
386
+
387
@click.command(
    name="extract",
    help=(
        "Extract a binding pocket around substrate residues (from a PDB or "
        "residue IDs/names), with biochemically aware truncation and optional "
        "link-H; supports multi-structure input and multi-MODEL output."
    ),
    context_settings={
        "help_option_names": ["-h", "--help"],
        "ignore_unknown_options": True,
        "allow_extra_args": True,
    },
)
@click.option(
    "-i", "--input", "complex_pdb",
    type=str, multiple=True, required=True,
    help="Protein-substrate complex PDB(s). If multiple, they must have identical atom counts and ordering.",
)
@click.option(
    "-c", "--center", "substrate_pdb",
    type=str, required=True,
    help=(
        "Substrate specification: a PDB path, a comma/space-separated residue-ID list "
        "like '123,124' or 'A:123,B:456' (insertion codes supported), "
        "or a residue-name list like 'GPP,SAM'."
    ),
)
@click.option(
    "-o", "--output", "output_pdb",
    type=str, multiple=True, default=(),
    help=(
        "Output PDB path(s). One path for multi-MODEL PDB, or N paths for per-file output. "
        "If omitted: single input -> pocket.pdb; multiple inputs -> pocket_{filename}.pdb."
    ),
)
@click.option(
    "-r", "--radius",
    type=float, default=2.6, show_default=True,
    help="Cutoff (angstrom) around substrate atoms for pocket inclusion.",
)
@click.option(
    "--radius-het2het",
    type=float, default=0, show_default=True,
    help="Cutoff (angstrom) for substrate-protein hetero-atom proximity (non-C/H). 0 disables.",
)
@click.option(
    "--include-H2O/--no-include-H2O",
    "include_H2O",
    default=True, show_default=True,
    help="Include waters (HOH/WAT/H2O/DOD/TIP/TIP3/SOL).",
)
@click.option(
    "--exclude-backbone/--no-exclude-backbone",
    default=True, show_default=True,
    help="Delete main-chain atoms from non-substrate amino acids.",
)
@click.option(
    "--add-linkH/--no-add-linkH",
    "add_linkH",
    default=False, show_default=True,
    help="Add carbon-only link-H at 1.09 angstrom along cut-bond directions.",
)
@click.option(
    "--selected-resn",
    type=str, default="",
    help="Comma/space-separated residue IDs to force-include.",
)
@click.option(
    "-l",
    "--ligand-charge",
    type=str, default=None,
    help="Total charge number or per-resname mapping like 'GPP:-3,SAM:1'.",
)
@click.option(
    "-v", "--verbose/--no-verbose",
    default=True, show_default=True,
    help="Enable INFO-level logging.",
)
@click.pass_context
def cli(
    ctx: click.Context,
    complex_pdb: Sequence[str],
    substrate_pdb: str,
    output_pdb: Sequence[str],
    radius: float,
    radius_het2het: float,
    include_H2O: bool,
    exclude_backbone: bool,
    add_linkH: bool,
    selected_resn: str,
    ligand_charge: Optional[str],
    verbose: bool,
) -> None:
    """Click entry point: gather the options into a namespace and run ``extract``.

    Values that Click parsed for ``-i`` / ``-o`` are merged with any extra
    values recovered from ``ctx.args`` (to support the space-separated syntax
    ``-i a.pdb b.pdb`` in addition to ``-i a.pdb -i b.pdb``). When no output
    path is supplied at all, ``output_pdb`` is passed on as ``None``.
    """
    trailing_inputs = _gather_extract_variadic(ctx.args, ("-i", "--input"))
    trailing_outputs = _gather_extract_variadic(ctx.args, ("-o", "--output"))

    all_inputs: List[str] = [*complex_pdb, *trailing_inputs]
    merged_outputs: List[str] = [*output_pdb, *trailing_outputs]
    # An empty merge means neither Click nor the extra args supplied a path.
    all_outputs: Optional[List[str]] = merged_outputs if merged_outputs else None

    options = argparse.Namespace(
        complex_pdb=all_inputs,
        substrate_pdb=substrate_pdb,
        output_pdb=all_outputs,
        radius=radius,
        radius_het2het=radius_het2het,
        include_H2O=include_H2O,
        exclude_backbone=exclude_backbone,
        add_linkH=add_linkH,
        selected_resn=selected_resn,
        ligand_charge=ligand_charge,
        verbose=verbose,
    )
    extract(options)
506
+
507
+
508
+ def _build_arg_parser(*, prog: str) -> argparse.ArgumentParser:
509
+ p = argparse.ArgumentParser(
510
+ prog=prog,
511
+ description=(
512
+ "Extract a binding pocket around substrate residues (from a PDB or residue IDs/names), "
513
+ "with biochemically aware truncation and optional link‑H; supports multi‑structure input "
514
+ "and multi‑MODEL output. Also logs pocket charge summary."
515
+ )
516
+ )
517
+
518
+ p.add_argument(
519
+ "-i", "--input", dest="complex_pdb", required=True, nargs="+",
520
+ metavar="complex.pdb",
521
+ help="Protein–substrate complex PDB(s). If multiple, they must have identical atom counts and ordering."
522
+ )
523
+ p.add_argument(
524
+ "-c", "--center", dest="substrate_pdb", required=True,
525
+ metavar="substrate.pdb | '123,124' | 'A:123,B:456' | 'GPP,MMT'",
526
+ help=("Substrate specification: either a PDB containing exactly the substrate residue(s), "
527
+ "a comma/space‑separated residue‑ID list like '123,124' or 'A:123,B:456' "
528
+ "(insertion codes supported: '123A' / 'A:123A'), "
529
+ "or a comma/space‑separated **residue‑name** list like 'GPP,MMT'. "
530
+ "When residue names are used and multiple residues share a name, all are used and a WARNING is logged.")
531
+ )
532
+ p.add_argument(
533
+ "-o", "--output", dest="output_pdb", required=False, nargs="+",
534
+ metavar="pocket.pdb", default=None,
535
+ help=("Output PDB path(s). Provide one path to write a single multi‑MODEL PDB, "
536
+ "or provide N paths where N == number of inputs to write N single‑model PDBs (one per input, in order). "
537
+ "If omitted: single input → pocket.pdb; multiple inputs → pocket_{original_filename}.pdb.")
538
+ )
539
+ p.add_argument(
540
+ "-r", "--radius", type=float, default=2.6,
541
+ help=("Cutoff (Å) around substrate atoms. With --exclude-backbone true (default), an **amino-acid** "
542
+ "neighbor must have a **non-backbone** atom within this distance; otherwise **any atom** suffices. "
543
+ "(default: 2.6)")
544
+ )
545
+ p.add_argument(
546
+ "--radius-het2het", type=float, default=0,
547
+ help=("Cutoff (Å) for substrate–protein hetero‑atom proximity (non‑C/H on both sides); "
548
+ "applied independently of --radius. 0 conceptually disables this rule, "
549
+ "but is internally treated as 0.001 Å. (default: 0)")
550
+ )
551
+ p.add_argument(
552
+ "--include-H2O", "--include-h2o",
553
+ dest="include_H2O",
554
+ action=argparse.BooleanOptionalAction,
555
+ default=True,
556
+ help="Include waters (HOH/WAT/TIP3/SOL). (default: True)"
557
+ )
558
+ p.add_argument(
559
+ "--exclude-backbone",
560
+ dest="exclude_backbone",
561
+ action=argparse.BooleanOptionalAction,
562
+ default=True,
563
+ help="Delete main‑chain atoms (N, H*, CA, HA*, C, O) from non‑substrate amino acids; PRO/HYP keep N, CA, HA, H*. (default: True)"
564
+ )
565
+ p.add_argument(
566
+ "--add-linkH",
567
+ dest="add_linkH",
568
+ action=argparse.BooleanOptionalAction,
569
+ default=False,
570
+ help="Add carbon‑only link‑H at 1.09 Å along cut‑bond directions; appended after a TER as HL/LKH HETATM records. (default: False)"
571
+ )
572
+ p.add_argument(
573
+ "--selected-resn", dest="selected_resn", required=False, default="",
574
+ help=("Comma/space‑separated residue IDs to force‑include (e.g., '123,124', 'A:123,B:456'; "
575
+ "insertion codes allowed: '123A' / 'A:123A').")
576
+ )
577
+ p.add_argument(
578
+ "-l", "--ligand-charge", type=str, default=None,
579
+ help=("Either a single **number** giving the **total** charge to distribute across unknown residues "
580
+ "(preferring unknown substrate), or a comma/space‑separated **per‑resname** list like "
581
+ "'GPP:-3,MMT:-1'. In mapping mode, any other unknown residues remain 0.")
582
+ )
583
+ p.add_argument(
584
+ "-v", "--verbose",
585
+ dest="verbose",
586
+ action=argparse.BooleanOptionalAction,
587
+ default=True,
588
+ help=("Enable INFO-level logging."
589
+ " default: True.")
590
+ )
591
+ return p
592
+
593
+
594
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Parse CLI arguments.

    Help handling is applied before argparse sees the arguments:

    * ``-h`` / ``--help`` without ``--help-advanced`` prints the short help
      and exits with status 0.
    * ``--help-advanced`` is stripped and replaced by ``--help`` so that
      argparse prints its full option listing.

    Parameters
    ----------
    argv : Optional[Sequence[str]]
        Command-line arguments to parse. If None, uses ``sys.argv[1:]``; the
        help handling above applies in that case as well.

    Returns
    -------
    argparse.Namespace
        Parameters for running the pocket extraction.

    Raises
    ------
    SystemExit
        After printing help, or when argparse rejects the arguments.
    """
    import sys  # local import so this block does not rely on a module-level one

    # Resolve the default explicitly: previously the help interception was
    # skipped when argv was None, so '--help-advanced' reached argparse as an
    # unrecognised option and '-h' bypassed the short help.
    argv_list = list(argv) if argv is not None else list(sys.argv[1:])

    wants_adv = "--help-advanced" in argv_list
    wants_help = ("--help" in argv_list) or ("-h" in argv_list)
    if wants_help and not wants_adv:
        click.echo(_extract_short_help())
        raise SystemExit(0)
    if wants_adv:
        # argparse does not know '--help-advanced'; translate it to '--help'.
        argv_list = [a for a in argv_list if a != "--help-advanced"]
        if not wants_help:
            argv_list.append("--help")

    p = _build_arg_parser(prog="mlmm extract")
    return p.parse_args(args=argv_list)
622
+
623
+
624
def load_structure(path: str, name: str) -> PDB.Structure.Structure:
    """
    Parse the PDB file at *path* with a quiet ``PDBParser`` and return the
    resulting structure, labelled *name*.
    """
    return PDB.PDBParser(QUIET=True).get_structure(name, path)
630
+
631
+
632
+ # ---------------------------------------------------------------------
633
+ # Formatting helpers (for logging / API)
634
+ # ---------------------------------------------------------------------
635
+
636
def _fmt_res_id(res: PDB.Residue.Residue) -> str:
    """
    Build a short label for *res*: ``[chain:]resseq[icode] RESNAME``,
    e.g. 'A:123A SER' or '123 SER'. The chain prefix is dropped when the
    parent id is empty and a blank insertion code is omitted.
    """
    parent_id = res.get_parent().id
    prefix = f"{parent_id}:" if parent_id else ""
    _hetflag, number, insertion = res.id
    suffix = insertion if insertion != " " else ""
    return f"{prefix}{number}{suffix} {res.get_resname()}"
645
+
646
+
647
def _fmt_fid(structure, fid: Tuple) -> str:
    """
    Look up the residue addressed by full-id *fid* in *structure*
    (model ``fid[1]``, chain ``fid[2]``, residue ``fid[3]``) and return its
    compact tag as produced by ``_fmt_res_id``.
    """
    model_id, chain_id, residue_id = fid[1], fid[2], fid[3]
    chain = structure[model_id][chain_id]
    return _fmt_res_id(chain.child_dict[residue_id])
653
+
654
+
655
+ # ---------------------------------------------------------------------
656
+ # Substrate matching
657
+ # ---------------------------------------------------------------------
658
+
659
def is_exact_match(lig_atoms: Dict[str, PDB.Vector.Vector],
                   cand: PDB.Residue.Residue) -> bool:
    """
    Tell whether every ligand atom has a same-named atom in *cand* whose
    position does not deviate by more than EXACT_EPS.

    An empty *lig_atoms* mapping trivially matches.
    """
    return all(
        # "not (x > eps)" rather than "x <= eps" keeps the original's
        # treatment of non-comparable (NaN) distances.
        atom_name in cand
        and not ((position - cand[atom_name].get_vector()).norm() > EXACT_EPS)
        for atom_name, position in lig_atoms.items()
    )
670
+
671
+
672
def find_substrate_residues(complex_struct, substrate_struct) -> List[PDB.Residue.Residue]:
    """
    Locate, for each residue of the substrate structure, the residue of the
    complex that matches it **exactly** (same residue name, same atom count,
    same atom names and coordinates within EXACT_EPS).

    Returns the matched complex residues in substrate order.

    Raises
    ------
    ValueError
        If any substrate residue has no exact counterpart in the complex.
    """
    hits: List[PDB.Residue.Residue] = []
    for ligand in list(substrate_struct.get_residues()):
        ligand_name = ligand.get_resname()
        atom_positions = {atom.get_name(): atom.get_vector() for atom in ligand}

        # Cheap pre-filter on name and atom count before comparing coordinates.
        same_shape = [
            residue
            for residue in complex_struct.get_residues()
            if residue.get_resname() == ligand_name and len(residue) == len(atom_positions)
        ]
        counterpart = next(
            (residue for residue in same_shape if is_exact_match(atom_positions, residue)),
            None,
        )
        if counterpart is not None:
            hits.append(counterpart)
            continue

        full_id = ligand.get_full_id()
        chain_id = full_id[2] if len(full_id) > 2 else ""
        resseq = ligand.id[1]
        icode = ligand.id[2] if len(ligand.id) > 2 else " "
        icode_str = icode if icode != " " else ""
        raise ValueError(
            f"Exact match not found for substrate residue {ligand_name} chain {chain_id} {resseq}{icode_str}"
        )
    return hits
696
+
697
+
698
+ # ---------- Residue‑ID–based substrate selection ----------
699
+
700
+ _RES_TOKEN_RE = re.compile(r"""
701
+ ^\s*
702
+ (?:(?P<chain>[^:\s,]+)\s*:\s*)? # optional chain like A or A_long
703
+ (?P<resseq>\d+) # residue sequence number
704
+ (?P<icode>[A-Za-z]?) # optional insertion code (single letter)
705
+ \s*$
706
+ """, re.VERBOSE)
707
+
708
+ def _parse_res_tokens(spec: str) -> List[Tuple[str | None, int, str | None]]:
709
+ """
710
+ Parse a residue specification string into (chain, resseq, icode) tuples.
711
+ """
712
+ if not spec or not spec.strip():
713
+ raise ValueError("Empty -c/--center specification.")
714
+ tokens = [t.strip() for t in re.split(r"[,\s]+", spec) if t.strip()]
715
+ parsed: List[Tuple[str | None, int, str | None]] = []
716
+ for tok in tokens:
717
+ m = _RES_TOKEN_RE.match(tok)
718
+ if not m:
719
+ raise ValueError(
720
+ f"Invalid residue specifier '{tok}'. Use '123', '123A', 'A:123', or 'A:123A'."
721
+ )
722
+ chain = m.group("chain")
723
+ resseq = int(m.group("resseq"))
724
+ icode = m.group("icode") or None
725
+ parsed.append((chain, resseq, icode))
726
+ return parsed
727
+
728
+
729
def find_substrate_by_idspec(complex_struct, spec: str) -> List[PDB.Residue.Residue]:
    """
    Resolve a comma/space-separated residue list into residues within the complex.

    Matching rules
    --------------
    * Chain may be omitted (matches all chains).
    * Insertion code may be omitted (matches any insertion code for that resseq).
    * Every model of the structure is searched.
    * A residue addressed by more than one specifier (e.g. ``'123,A:123'`` or a
      repeated token) is returned once; the repeat is not an error.

    Returns
    -------
    list[Bio.PDB.Residue.Residue]
        Unique residues in first-match order.

    Raises
    ------
    ValueError
        If *spec* is empty/invalid, or a specifier matches no residue at all.
    """
    targets = _parse_res_tokens(spec)
    found: List[PDB.Residue.Residue] = []
    seen: Set[Tuple] = set()

    for chain_req, resseq_req, icode_req in targets:
        # Track "did this specifier address anything" separately from the
        # de-duplication: a residue already collected by an earlier specifier
        # must not make this one look unresolved.
        matched_any = False
        for model in complex_struct:
            for chain in model:
                if chain_req is not None and chain.id != chain_req:
                    continue
                for res in chain.get_residues():
                    _, resseq, icode = res.id
                    if resseq != resseq_req:
                        continue
                    if icode_req is not None and icode != icode_req:
                        continue
                    matched_any = True
                    fid = res.get_full_id()
                    if fid not in seen:
                        seen.add(fid)
                        found.append(res)
        if not matched_any:
            chain_txt = f"{chain_req}:" if chain_req is not None else ""
            icode_txt = icode_req or ""
            raise ValueError(f"Residue '{chain_txt}{resseq_req}{icode_txt}' not found in complex.")

    return found
769
+
770
+ # ---------- Residue-name-based substrate selection ----------
771
+
772
def find_substrate_by_resname(complex_struct, spec: str) -> List[PDB.Residue.Residue]:
    """
    Collect the residues of the complex whose residue name appears in *spec*,
    a comma/space-separated name list such as 'GPP,MMT'.

    Behavior
    --------
    * Names are compared case-insensitively.
    * When a name occurs on several residues, **all** of them are returned and
      a **WARNING** listing up to five of them is logged.
    * Each residue is returned at most once.

    Raises
    ------
    ValueError
        If *spec* is empty/blank or a requested name is absent from the complex.
    """
    if not (spec and spec.strip()):
        raise ValueError("Empty -c/--center specification.")

    wanted_names = [piece.upper() for piece in re.split(r"[,\s]+", spec) if piece]
    collected: List[PDB.Residue.Residue] = []
    already: Set[Tuple] = set()

    for resname in wanted_names:
        hits = [
            residue
            for residue in complex_struct.get_residues()
            if residue.get_resname().upper() == resname
        ]
        if not hits:
            raise ValueError(f"Residue name '{resname}' not found in complex.")

        if len(hits) > 1:
            # Formatting the preview is best-effort; it must never block selection.
            try:
                preview = ", ".join(map(_fmt_res_id, hits[:5]))
            except Exception:
                preview = "(list omitted)"
            LOGGER.warning("[extract] Multiple residues with resname '%s' found (%d). Using all: %s",
                           resname, len(hits), preview)

        for residue in hits:
            key = residue.get_full_id()
            if key in already:
                continue
            already.add(key)
            collected.append(residue)
    return collected
803
+
804
+
805
def resolve_substrate_residues(complex_struct, center_spec: str) -> List[PDB.Residue.Residue]:
    """
    Determine substrate residues from a PDB path, residue-ID list, or residue-name list.

    Resolution order
    ----------------
    1. If *center_spec* is an existing path, load it as a substrate PDB and
       match its residues by exact coordinates.
    2. If it does not parse as a residue-ID list, treat it as a residue-name list.
    3. Otherwise resolve it as residue IDs. Should that lookup fail, the spec is
       retried as residue names (a name such as '12P' also parses as an ID);
       if that fails too, the original ID error is raised so the message
       refers to what the user most likely meant.

    Raises
    ------
    ValueError
        If the specification is empty or cannot be resolved.
    """
    if os.path.exists(center_spec):
        substrate_struct = load_structure(center_spec, "substrate")
        return find_substrate_residues(complex_struct, substrate_struct)

    # Only the syntax check decides between the ID and name interpretations.
    try:
        _parse_res_tokens(center_spec)
    except ValueError:
        # Not an ID list: interpret as residue-name list (e.g., 'GPP,MMT').
        return find_substrate_by_resname(complex_struct, center_spec)

    try:
        return find_substrate_by_idspec(complex_struct, center_spec)
    except ValueError as id_error:
        # Keep the fallback for ID-looking residue names, but do not let its
        # failure replace the accurate "residue ID not found" message.
        try:
            return find_substrate_by_resname(complex_struct, center_spec)
        except ValueError:
            raise id_error from None
819
+
820
+
821
+ # ---------------------------------------------------------------------
822
+ # Polypeptide adjacency (C–N) helper
823
+ # ---------------------------------------------------------------------
824
+
825
def are_peptide_adjacent(prev_res: PDB.Residue.Residue,
                         next_res: PDB.Residue.Residue,
                         max_cn_dist: float = 1.9) -> bool:
    """
    Decide whether *prev_res* and *next_res* are joined by a peptide bond,
    judged by the distance between C of the former and N of the latter.

    Notes
    -----
    Returns False when either residue is not in AMINO_ACIDS, when the C or N
    atom is missing, when the distance cannot be computed, or when it is NaN
    or exceeds *max_cn_dist*. Being purely distance-based, the test fails
    across chain breaks where atoms are absent or far apart.
    """
    both_amino = (prev_res.get_resname() in AMINO_ACIDS
                  and next_res.get_resname() in AMINO_ACIDS)
    if not both_amino:
        return False
    if not ("C" in prev_res and "N" in next_res):
        return False
    try:
        carbonyl_c = prev_res["C"].get_vector()
        amide_n = next_res["N"].get_vector()
        gap = (carbonyl_c - amide_n).norm()
    except Exception:
        return False
    # A NaN distance is unequal to itself and therefore rejected.
    return (gap == gap) and (gap <= max_cn_dist)
845
+
846
+
847
+ # ---------------------------------------------------------------------
848
+ # Residue selection around the substrate
849
+ # ---------------------------------------------------------------------
850
+
851
def select_residues(complex_struct,
                    substrate_res_list: List[PDB.Residue.Residue],
                    r_as: float,
                    r_het: float,
                    include_h2o: bool,
                    exclude_backbone: bool) -> Tuple[Set[Tuple], Set[Tuple]]:
    """
    Pick the residues that make up the pocket around the substrate.

    Selection rule
    --------------
    * Substrate residues are always part of the result.
    * Standard pass (`r_as`, probing from every substrate atom): a residue is
      taken when one of its atoms lies within `r_as`. With `exclude_backbone`
      **True**, backbone atoms of **amino acids** do not count.
    * Hetero pass (`r_het`, probing from substrate atoms that are not C/H):
      same as above, but the neighbor atom must itself be neither C nor H.
    * Water residues (names in WATER_RES) are skipped unless `include_h2o`.

    Returns
    -------
    (selected_ids, backbone_contact_ids)
        selected_ids         : full-ids of all residues to output
        backbone_contact_ids : full-ids of amino-acid residues that were picked
                               up through a backbone atom (can only be
                               populated when exclude_backbone is False)
    """
    ligand_atoms = [atom for residue in substrate_res_list for atom in residue]
    ligand_hetero = [atom for atom in ligand_atoms if atom.element not in ("C", "H")]
    searcher = NeighborSearch(list(complex_struct.get_atoms()))

    selected_ids: Set[Tuple] = set()
    for residue in substrate_res_list:
        selected_ids.add(residue.get_full_id())
    backbone_contact_ids: Set[Tuple] = set()

    def _scan(probes, radius: float, hetero_only: bool) -> None:
        """Register the owners of all qualifying atoms within *radius* of *probes*."""
        for probe in probes:
            for hit in searcher.search(probe.get_coord(), radius):
                if hetero_only and hit.element in ("C", "H"):
                    continue
                owner = hit.get_parent()
                owner_name = owner.get_resname()
                is_amino = owner_name in AMINO_ACIDS
                on_backbone = hit.get_name() in BACKBONE_ATOMS
                if exclude_backbone and is_amino and on_backbone:
                    continue  # amino acids must be reached via a non-backbone atom
                if owner_name in WATER_RES and not include_h2o:
                    continue
                owner_id = owner.get_full_id()
                selected_ids.add(owner_id)
                if on_backbone and is_amino:
                    backbone_contact_ids.add(owner_id)

    # standard radius: any atom of the substrate as probe
    _scan(ligand_atoms, r_as, hetero_only=False)
    # hetero-hetero radius: non-C/H atoms on both sides
    _scan(ligand_hetero, r_het, hetero_only=True)

    return selected_ids, backbone_contact_ids
918
+
919
+
920
+ # ---------------------------------------------------------------------
921
+ # Disulfide augmentation
922
+ # ---------------------------------------------------------------------
923
+
924
def augment_disulfides(structure, selected_ids: Set[Tuple],
                       cutoff: float = DISULFIDE_CUTOFF):
    """
    Pull in disulfide partners: for every pair of CYS/CYX residues whose SG
    atoms lie within *cutoff* of each other, add both residues to
    *selected_ids* if at least one of them is already selected.

    *selected_ids* is modified in place; nothing is returned.
    """
    sulfurs = []
    for residue in structure.get_residues():
        if residue.get_resname() in {"CYS", "CYX"} and "SG" in residue:
            sulfurs.append(residue["SG"])
    if not sulfurs:
        return

    finder = NeighborSearch(sulfurs)
    for sg in sulfurs:
        own_id = sg.get_parent().get_full_id()
        for partner in finder.search(sg.get_coord(), cutoff):
            if partner is sg:
                continue  # the search also returns the probe atom itself
            partner_id = partner.get_parent().get_full_id()
            if own_id in selected_ids or partner_id in selected_ids:
                selected_ids.add(own_id)
                selected_ids.add(partner_id)
944
+
945
+
946
+ # ---------------------------------------------------------------------
947
+ # Proline augmentation (N-side neighbor inclusion; TER-aware)
948
+ # ---------------------------------------------------------------------
949
+
950
def augment_proline_prev_neighbor(structure, selected_ids: Set[Tuple]):
    """
    For each selected PRO, also select the closest preceding amino-acid
    residue of the same chain, provided the two are peptide-bond adjacent.

    Notes
    -----
    * Adjacency is decided by ``are_peptide_adjacent`` (C–N distance), so a
      PRO that starts a segment gains no neighbor.
    * Works on a snapshot of *selected_ids*: residues added here are not
      themselves examined. *selected_ids* is modified in place.
    """
    newly_added = 0
    for fid in tuple(selected_ids):
        chain = structure[fid[1]][fid[2]]
        proline = chain.child_dict[fid[3]]
        if proline.get_resname() != "PRO":
            continue

        ordered = list(chain.get_residues())
        position = next(
            (i for i, residue in enumerate(ordered) if residue.get_full_id() == fid),
            None,
        )
        if position is None:
            continue

        # Walk backwards from the proline to the first amino-acid residue.
        upstream = next(
            (residue for residue in reversed(ordered[:position])
             if residue.get_resname() in AMINO_ACIDS),
            None,
        )
        if upstream is None or not are_peptide_adjacent(upstream, proline):
            continue

        upstream_id = upstream.get_full_id()
        if upstream_id not in selected_ids:
            selected_ids.add(upstream_id)
            newly_added += 1

    if newly_added:
        LOGGER.info("[extract] Added %d N-side neighbor residues for PRO (TER-aware).", newly_added)
987
+
988
+
989
+ # ---------------------------------------------------------------------
990
+ # Backbone-contact neighbor augmentation (exclude_backbone == False; TER-aware)
991
+ # ---------------------------------------------------------------------
992
+
993
def augment_backbone_contact_neighbors(structure,
                                       selected_ids: Set[Tuple],
                                       backbone_contact_ids: Set[Tuple],
                                       substrate_ids: Set[Tuple]) -> Tuple[Set[Tuple], Set[Tuple]]:
    """
    Add sequence context around residues that were reached through a backbone atom.

    For every id in *backbone_contact_ids* that is not a substrate residue, the
    nearest amino-acid residue on the N-side and on the C-side of the same
    chain is added to *selected_ids* (in place) **if it is peptide-bond
    adjacent**. When a side has no such neighbor, the residue is recorded so
    that its terminal atoms on that side are kept by the caller.

    Returns
    -------
    keep_ncap_ids, keep_ccap_ids : sets of full-ids lacking a peptide-adjacent
        N-side / C-side neighbor, respectively
    """
    keep_ncap_ids: Set[Tuple] = set()
    keep_ccap_ids: Set[Tuple] = set()
    neighbors_added = 0
    n_caps_kept = 0
    c_caps_kept = 0

    def _first_amino(candidates):
        """Return the first residue in *candidates* that is an amino acid, else None."""
        return next((r for r in candidates if r.get_resname() in AMINO_ACIDS), None)

    for fid in list(backbone_contact_ids):
        if fid in substrate_ids:
            continue  # substrate residues get no sequence context

        ordered = list(structure[fid[1]][fid[2]].get_residues())
        position = next(
            (i for i, residue in enumerate(ordered) if residue.get_full_id() == fid),
            None,
        )
        if position is None:
            continue
        current = ordered[position]

        # N-side: closest amino acid before the residue, walking backwards
        before = _first_amino(reversed(ordered[:position]))
        if before is not None and are_peptide_adjacent(before, current):
            before_id = before.get_full_id()
            if before_id not in selected_ids:
                selected_ids.add(before_id)
                neighbors_added += 1
        else:
            keep_ncap_ids.add(fid)
            n_caps_kept += 1

        # C-side: closest amino acid after the residue
        after = _first_amino(ordered[position + 1:])
        if after is not None and are_peptide_adjacent(current, after):
            after_id = after.get_full_id()
            if after_id not in selected_ids:
                selected_ids.add(after_id)
                neighbors_added += 1
        else:
            keep_ccap_ids.add(fid)
            c_caps_kept += 1

    if neighbors_added or n_caps_kept or c_caps_kept:
        LOGGER.info("[extract] Backbone-contact context (TER-aware): added %d neighbors; kept N-cap on %d, C-cap on %d residues.",
                    neighbors_added, n_caps_kept, c_caps_kept)
    return keep_ncap_ids, keep_ccap_ids
1063
+
1064
+
1065
+ # ---------------------------------------------------------------------
1066
+ # Backbone trimming / skip-map generation
1067
+ # ---------------------------------------------------------------------
1068
+
1069
+ def mark_atoms_to_skip(structure, selected_ids: Set[Tuple], substrate_ids: Set[Tuple],
1070
+ exclude_backbone: bool,
1071
+ keep_ncap_ids: Set[Tuple] | None = None,
1072
+ keep_ccap_ids: Set[Tuple] | None = None) -> Dict[Tuple, Set[str]]:
1073
+ """
1074
+ Decide which atoms to delete (truncation). Never delete substrate atoms.
1075
+
1076
+ Returns
1077
+ -------
1078
+ dict[full-id -> set(atom_names_to_delete)]
1079
+ """
1080
+ keep_ncap_ids = keep_ncap_ids or set()
1081
+ keep_ccap_ids = keep_ccap_ids or set()
1082
+
1083
+ # start with the original truncation logic (except for substrate residues)
1084
+ chain_map: Dict[Tuple[str, str], List[Tuple]] = {}
1085
+ for fid in selected_ids:
1086
+ if fid in substrate_ids:
1087
+ continue # never delete atoms from substrate residues
1088
+ res = structure[fid[1]][fid[2]].child_dict[fid[3]]
1089
+ if res.get_resname() in WATER_RES:
1090
+ continue
1091
+ chain_map.setdefault((fid[1], fid[2]), []).append(fid)
1092
+
1093
+ skip: Dict[Tuple, Set[str]] = {}
1094
+
1095
+ # --- TER-aware segmentation: split by peptide adjacency in file order ---
1096
+ for (model, chain), fids in chain_map.items():
1097
+ chain_obj = structure[model][chain]
1098
+ residues_all: List[PDB.Residue.Residue] = list(chain_obj.get_residues())
1099
+ index_map: Dict[Tuple, int] = {r.get_full_id(): i for i, r in enumerate(residues_all)}
1100
+
1101
+ # sort by file order
1102
+ fids.sort(key=lambda x: index_map.get(x, 10**9))
1103
+
1104
+ # build segments by peptide-bond adjacency
1105
+ segs: List[List[Tuple]] = []
1106
+ cur_seg: List[Tuple] = []
1107
+ for k, fid in enumerate(fids):
1108
+ if not cur_seg:
1109
+ cur_seg = [fid]
1110
+ continue
1111
+ prev_fid = cur_seg[-1]
1112
+ prev_res = chain_obj.child_dict[prev_fid[3]]
1113
+ cur_res = chain_obj.child_dict[fid[3]]
1114
+ if are_peptide_adjacent(prev_res, cur_res):
1115
+ cur_seg.append(fid)
1116
+ else:
1117
+ segs.append(cur_seg)
1118
+ cur_seg = [fid]
1119
+ if cur_seg:
1120
+ segs.append(cur_seg)
1121
+
1122
+ # apply cap deletions on these TER-aware segments
1123
+ for seg in segs:
1124
+ n_id, c_id = seg[0], seg[-1]
1125
+ single = len(seg) == 1
1126
+
1127
+ def add(fid_local, names):
1128
+ skip.setdefault(fid_local, set()).update(names)
1129
+
1130
+ n_res = chain_obj.child_dict[n_id[3]]
1131
+ c_res = chain_obj.child_dict[c_id[3]]
1132
+
1133
+ # N-terminal cap deletion (only for amino acids; skip if PRO/HYP or explicitly kept)
1134
+ if (n_res.get_resname() in AMINO_ACIDS) and (n_res.get_resname() not in {"PRO", "HYP"}) and (n_id not in keep_ncap_ids):
1135
+ add(n_id, {"N", "H", "H1", "H2", "H3", "HN"})
1136
+ # C-terminal cap deletion (only for amino acids; skip if explicitly kept)
1137
+ if (c_res.get_resname() in AMINO_ACIDS) and (c_id not in keep_ccap_ids):
1138
+ add(c_id, {"C", "O", "OXT"})
1139
+
1140
+ # Isolated stretch – remove CA/HA* (only for amino acids; except PRO/HYP)
1141
+ if single and (n_res.get_resname() in AMINO_ACIDS) and (n_res.get_resname() not in {"PRO", "HYP"}):
1142
+ add(n_id, {"CA", "HA", "HA2", "HA3"})
1143
+
1144
+ # ---------------------------------------------------------------------
1145
+ # Optional: remove *all* backbone atoms from every non-substrate residue
1146
+ # PRO/HYP keep N, CA, and HA* to preserve the ring.
1147
+ # ---------------------------------------------------------------------
1148
+ if exclude_backbone:
1149
+ for fid in selected_ids:
1150
+ if fid in substrate_ids:
1151
+ continue
1152
+ res = structure[fid[1]][fid[2]].child_dict[fid[3]]
1153
+ if res.get_resname() in WATER_RES:
1154
+ continue
1155
+ if res.get_resname() in AMINO_ACIDS:
1156
+ if res.get_resname() in {"PRO", "HYP"}:
1157
+ to_remove = BACKBONE_ALL - {"N", "CA", "HA", "H", "H1", "H2", "H3"}
1158
+ else:
1159
+ to_remove = BACKBONE_ALL
1160
+ skip.setdefault(fid, set()).update(to_remove)
1161
+
1162
+ # Preserve peptide carbonyl on the N-side neighbor of PRO
1163
+ for fid in selected_ids:
1164
+ res = structure[fid[1]][fid[2]].child_dict[fid[3]]
1165
+ if res.get_resname() != "PRO":
1166
+ continue
1167
+ chain = structure[fid[1]][fid[2]]
1168
+ residues: List[PDB.Residue.Residue] = list(chain.get_residues())
1169
+ try:
1170
+ idx = next(i for i, r in enumerate(residues) if r.get_full_id() == fid)
1171
+ except StopIteration:
1172
+ continue
1173
+ prev_res = None
1174
+ for j in range(idx - 1, -1, -1):
1175
+ rj = residues[j]
1176
+ if rj.get_resname() in AMINO_ACIDS:
1177
+ prev_res = rj
1178
+ break
1179
+ if prev_res is None:
1180
+ continue
1181
+ if not are_peptide_adjacent(prev_res, res):
1182
+ continue
1183
+ prev_fid = prev_res.get_full_id()
1184
+ if prev_fid in selected_ids:
1185
+ sk = skip.setdefault(prev_fid, set())
1186
+ for nm in ("C", "O", "OXT"):
1187
+ if nm in sk:
1188
+ sk.remove(nm)
1189
+
1190
+ # Always keep CA on the N-side neighbor of PRO (independent of --exclude-backbone)
1191
+ for fid in selected_ids:
1192
+ res = structure[fid[1]][fid[2]].child_dict[fid[3]]
1193
+ if res.get_resname() != "PRO":
1194
+ continue
1195
+ chain = structure[fid[1]][fid[2]]
1196
+ residues: List[PDB.Residue.Residue] = list(chain.get_residues())
1197
+ try:
1198
+ idx = next(i for i, r in enumerate(residues) if r.get_full_id() == fid)
1199
+ except StopIteration:
1200
+ continue
1201
+ prev_res = None
1202
+ for j in range(idx - 1, -1, -1):
1203
+ rj = residues[j]
1204
+ if rj.get_resname() in AMINO_ACIDS:
1205
+ prev_res = rj
1206
+ break
1207
+ if prev_res is None:
1208
+ continue
1209
+ if not are_peptide_adjacent(prev_res, res):
1210
+ continue
1211
+ prev_fid = prev_res.get_full_id()
1212
+ if prev_fid in selected_ids:
1213
+ sk = skip.setdefault(prev_fid, set())
1214
+ if "CA" in sk:
1215
+ sk.remove("CA")
1216
+
1217
+ return skip
1218
+
1219
+
1220
+ def _atom_present_in_output(res: PDB.Residue.Residue, name: str, skip_set: Set[str]) -> bool:
1221
+ """
1222
+ True if the atom exists originally AND is not marked for deletion.
1223
+ """
1224
+ return (name in res) and (name not in skip_set)
1225
+
1226
+ def _atom_removed_by_truncation(res: PDB.Residue.Residue, name: str, skip_set: Set[str]) -> bool:
1227
+ """
1228
+ True if the atom exists originally AND is marked for deletion.
1229
+ """
1230
+ return (name in res) and (name in skip_set)
1231
+
1232
def compute_linkH_atoms(structure,
                        selected_ids: Set[Tuple],
                        skip_map: Dict[Tuple, Set[str]]) -> List[Tuple[float, float, float]]:
    """
    Identify severed bonds created by truncation and compute link‑H coordinates.

    Rules
    -----
    * Normal residues: place H along **CB→CA**, **CA→N**, **CA→C** if partner was removed.
    * PRO/HYP: place H along **CA→C** only.
    * Parent atom **must be Carbon**; H is placed along (parent → removed_partner) at **1.09 Å**.

    The placement rules live in `_compute_linkH_defs`; this function returns only
    the coordinates it produces. Because that helper walks residues in file order,
    the returned list is deterministic (iterating the `selected_ids` set directly
    would yield an arbitrary order).

    Returns
    -------
    list of (x, y, z) coordinates for link‑H atoms, ordered by residue file order
    and then by cut type (CB-CA, CA-N, CA-C).
    """
    return [coord for _target, coord in _compute_linkH_defs(structure, selected_ids, skip_map)]
1285
+
1286
+
1287
+ def _max_serial_from_pdb_text(pdb_text: str) -> int:
1288
+ """
1289
+ Find the maximum atom serial number in PDB text.
1290
+ """
1291
+ max_serial = 0
1292
+ for line in pdb_text.splitlines():
1293
+ if line.startswith("ATOM") or line.startswith("HETATM"):
1294
+ try:
1295
+ serial = int(line[6:11])
1296
+ if serial > max_serial:
1297
+ max_serial = serial
1298
+ except Exception:
1299
+ continue
1300
+ return max_serial
1301
+
1302
+
1303
+ def _format_linkH_block(link_coords: List[Tuple[float, float, float]],
1304
+ start_serial: int,
1305
+ chain_id: str = "L") -> str:
1306
+ """
1307
+ Format a contiguous HETATM block for link‑H atoms.
1308
+
1309
+ Conventions
1310
+ -----------
1311
+ * Atom name: HL
1312
+ * Residue name: LKH
1313
+ * Chain: chain_id (default 'L')
1314
+ * Residue numbers: 1..N (one pseudo‑residue per H)
1315
+ """
1316
+ lines: List[str] = []
1317
+ serial = start_serial
1318
+ resseq = 1
1319
+ for (x, y, z) in link_coords:
1320
+ serial += 1
1321
+ line = (
1322
+ f"HETATM{serial:5d} "
1323
+ f"{'HL':>4s} "
1324
+ f"{'LKH':>3s} "
1325
+ f"{chain_id}"
1326
+ f"{resseq:4d} "
1327
+ f"{x:8.3f}{y:8.3f}{z:8.3f}"
1328
+ f"{1.00:6.2f}{0.00:6.2f}"
1329
+ f" {'H':>2s}"
1330
+ )
1331
+ lines.append(line)
1332
+ resseq += 1
1333
+ return ("\n".join(lines) + ("\n" if lines else ""))
1334
+
1335
+
1336
+ # ---------------------------------------------------------------------
1337
+ # Charge calculation & logging
1338
+ # ---------------------------------------------------------------------
1339
+
1340
+ def _sorted_fids_by_file_order(structure, fids: Iterable[Tuple]) -> List[Tuple]:
1341
+ """
1342
+ Sort full-ids by file order using a residue index map.
1343
+ """
1344
+ order: Dict[Tuple, int] = {}
1345
+ idx = 0
1346
+ for model in structure:
1347
+ for chain in model:
1348
+ for res in chain.get_residues():
1349
+ order[res.get_full_id()] = idx
1350
+ idx += 1
1351
+ return sorted(set(fids), key=lambda fid: order.get(fid, 10**12))
1352
+
1353
+ def _residue_key_from_res(res: PDB.Residue.Residue) -> ResidueKey:
1354
+ """
1355
+ Build a cross-structure residue key from a residue.
1356
+ """
1357
+ chain_id = res.get_parent().id
1358
+ hetflag, resseq, icode = res.id
1359
+ icode_str = icode if icode != " " else ""
1360
+ return (chain_id, hetflag, int(resseq), icode_str, res.get_resname())
1361
+
1362
def _residue_key_from_fid(structure, fid: Tuple) -> ResidueKey:
    """
    Build a cross-structure residue key from a full-id.

    The residue is looked up through fid[1] (model), fid[2] (chain) and
    fid[3] (residue id) and then converted with `_residue_key_from_res`.
    """
    chain = structure[fid[1]][fid[2]]
    return _residue_key_from_res(chain.child_dict[fid[3]])
1368
+
1369
+ # ---- helper for parsing --ligand-charge (number or 'RES:Q' mapping) ----
1370
+ def _parse_ligand_charge_option(ligand_charge: float | str | Dict[str, float] | None
1371
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
1372
+ """
1373
+ Returns
1374
+ -------
1375
+ (total_charge, mapping)
1376
+ total_charge : float | None
1377
+ mapping : dict[RESNAME -> float] | None
1378
+ """
1379
+ if ligand_charge is None:
1380
+ return None, None
1381
+ if isinstance(ligand_charge, (int, float)):
1382
+ return float(ligand_charge), None
1383
+ if isinstance(ligand_charge, dict):
1384
+ mapping = {str(k).upper(): float(v) for k, v in ligand_charge.items()}
1385
+ return None, mapping
1386
+ if isinstance(ligand_charge, str):
1387
+ s = ligand_charge.strip()
1388
+ if not s:
1389
+ return None, None
1390
+ # try numeric
1391
+ try:
1392
+ return float(s), None
1393
+ except ValueError:
1394
+ pass
1395
+ # mapping: tokens "RES:Q"
1396
+ tokens = [t for t in re.split(r"[,\s]+", s) if t]
1397
+ mapping: Dict[str, float] = {}
1398
+ for tok in tokens:
1399
+ if ":" not in tok:
1400
+ raise ValueError(f"Invalid --ligand-charge token '{tok}'. Use 'RES:Q' (e.g., GPP:-3) or a number (e.g., -3).")
1401
+ res, qtxt = tok.split(":", 1)
1402
+ resname = res.strip().upper()
1403
+ if not resname:
1404
+ raise ValueError(f"Invalid --ligand-charge token '{tok}': empty residue name.")
1405
+ try:
1406
+ qval = float(qtxt.strip())
1407
+ except ValueError:
1408
+ raise ValueError(f"Invalid --ligand-charge token '{tok}': '{qtxt}' is not a number.")
1409
+ mapping[resname] = qval
1410
+ if not mapping:
1411
+ raise ValueError("Empty --ligand-charge mapping.")
1412
+ return None, mapping
1413
+ raise TypeError(f"Unsupported type for ligand_charge: {type(ligand_charge)!r}")
1414
+
1415
def compute_charge_summary(structure,
                           selected_ids: Set[Tuple],
                           substrate_ids: Set[Tuple],
                           ligand_charge: float | str | Dict[str, float] | None = None) -> Dict[str, Any]:
    """
    Compute pocket charge summary.

    Args
    ----
    structure : Bio.PDB.Structure.Structure
        The (first) structure to evaluate.
    selected_ids : set[tuple]
        Residues included in the pocket.
    substrate_ids : set[tuple]
        Residues designated as substrate.
    ligand_charge : float | str | dict[str,float] | None
        - float: total charge to assign across **unknown residues** (preferring unknown substrate).
        - str  : numeric string (total) or mapping like "GPP:-3,MMT:-1" (per‑resname).
        - dict : mapping {RESNAME: charge}. In mapping mode, other unknown residues remain 0.

    Returns
    -------
    dict with keys:
        - total_charge : float
        - protein_charge : float
        - ligand_total_charge : float
        - ion_total_charge : float
        - ion_charges : list[(str tag, float)]
        - unknown_residue_charges : dict[str -> float]  # for concise per‑resname log
    """
    per_map: Dict[ResidueKey, float] = {}
    aa_charge = 0.0
    total = 0.0

    fids_in_order = _sorted_fids_by_file_order(structure, selected_ids)

    # First pass: dictionary/ion/water charges; collect unknowns and ions
    unknown_fids: List[Tuple] = []
    unknown_substrate_fids: List[Tuple] = []
    ion_entries: List[Tuple[str, float]] = []

    for fid in fids_in_order:
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        rn = res.get_resname().upper()
        key = _residue_key_from_res(res)
        if rn in WATER_RES:
            q = 0.0
        elif rn in AMINO_ACIDS:
            q = float(AMINO_ACIDS[rn])
            aa_charge += q
        elif rn in ION:
            q = float(ION[rn])
            ion_entries.append((_fmt_fid(structure, fid), q))
        else:
            # Not in any table: charge stays 0 unless --ligand-charge overrides it.
            q = 0.0
            unknown_fids.append(fid)
            if fid in substrate_ids:
                unknown_substrate_fids.append(fid)
        per_map[key] = q
        total += q

    # Apply --ligand-charge if provided
    total_spec, mapping_spec = _parse_ligand_charge_option(ligand_charge)

    if total_spec is not None:
        # Distribute total across unknown substrate if present, else across all unknowns
        targets = unknown_substrate_fids if unknown_substrate_fids else unknown_fids
        if targets:
            per_res_val = float(total_spec) / float(len(targets))
            for fid in targets:
                per_map[_residue_key_from_fid(structure, fid)] = per_res_val
    elif mapping_spec is not None:
        # Per‑resname mapping. Unspecified unknown residues remain 0.
        for fid in unknown_fids:
            res = structure[fid[1]][fid[2]].child_dict[fid[3]]
            rn = res.get_resname().upper()
            if rn in mapping_spec:
                per_map[_residue_key_from_fid(structure, fid)] = float(mapping_spec[rn])

    if total_spec is not None or mapping_spec is not None:
        # Recompute totals from the (possibly overridden) per-residue map.
        # Key element 4 is the raw residue name; upper-case it so the amino-acid
        # test matches the first pass, which classifies by upper-cased name.
        total = sum(per_map.values())
        aa_charge = sum(q for k, q in per_map.items() if k[4].upper() in AMINO_ACIDS)

    # Net ligand and ion charges
    unknown_keys = {_residue_key_from_fid(structure, fid) for fid in unknown_fids}
    ligand_total = sum(per_map[k] for k in unknown_keys)
    ion_total = sum(q for _, q in ion_entries)

    # Build per‑resname mapping for unknown residues (after applying any overrides);
    # when several residues share a name the last one in file order wins.
    unknown_residue_charges: Dict[str, float] = {}
    for fid in unknown_fids:
        res = structure[fid[1]][fid[2]].child_dict[fid[3]]
        rn = res.get_resname().upper()
        key = _residue_key_from_fid(structure, fid)
        unknown_residue_charges[rn] = float(per_map[key])

    return {
        "total_charge": float(total),
        "protein_charge": float(aa_charge),
        "ligand_total_charge": float(ligand_total),
        "ion_total_charge": float(ion_total),
        "ion_charges": [(tag, float(q)) for tag, q in ion_entries],
        "unknown_residue_charges": unknown_residue_charges,
    }
1523
+
1524
def log_charge_summary(prefix: str,
                       summary: Dict[str, Any]):
    """
    Emit concise charge summary logs.

    Logs, at INFO level and each tagged with `prefix`: the per-resname ligand
    charges, the net protein / ligand charges, every ion charge plus their net
    (when ions exist), and finally the total pocket charge.
    `summary` must contain "total_charge" and "protein_charge"; the remaining
    keys are optional.
    """
    total = summary["total_charge"]
    protein = summary["protein_charge"]
    ligand = summary.get("ligand_total_charge", 0.0)
    ion_list: List[Tuple[str, float]] = summary.get("ion_charges", [])
    ion_total = summary.get("ion_total_charge", sum(q for _, q in ion_list))
    unk_map: Dict[str, float] = summary.get("unknown_residue_charges", {}) or {}

    if not unk_map:
        LOGGER.info("%s Per-resname ligand charges: (none)", prefix)
    else:
        pieces = [f"{res}: {q:g}" for res, q in sorted(unk_map.items())]
        LOGGER.info("%s Per-resname ligand charges: %s", prefix, ", ".join(pieces))

    LOGGER.info("%s Net protein charge: %+g", prefix, protein)
    LOGGER.info("%s Net ligand charge: %+g", prefix, ligand)

    if not ion_list:
        LOGGER.info("%s Ion charges: (none)", prefix)
    else:
        LOGGER.info("%s Ion charges (each):", prefix)
        for tag, q in ion_list:
            LOGGER.info(" %s -> %+g", tag, q)
        LOGGER.info("%s Net ion charge: %+g", prefix, ion_total)

    LOGGER.info("%s Total pocket charge: %+g", prefix, total)
1552
+
1553
+
1554
+ # =========================== Cross-structure helpers ===========================
1555
+ # Multi-model driver utilities
1556
+ # ==============================================================================
1557
+
1558
def _build_key_maps(structure) -> Tuple[Dict[ResidueKey, Tuple], Dict[Tuple, ResidueKey]]:
    """
    Create maps between ResidueKey and full-id for a structure.

    Returns
    -------
    (key2fid, fid2key) covering every residue of every chain of every model.
    When two residues share a ResidueKey, the later one wins in key2fid.
    """
    key2fid: Dict[ResidueKey, Tuple] = {}
    fid2key: Dict[Tuple, ResidueKey] = {}
    all_residues = (
        res
        for model in structure
        for chain in model
        for res in chain.get_residues()
    )
    for res in all_residues:
        key = _residue_key_from_res(res)
        fid = res.get_full_id()
        key2fid[key] = fid
        fid2key[fid] = key
    return key2fid, fid2key
1572
+
1573
def _keys_to_fids(structure, keys: Iterable[ResidueKey]) -> Set[Tuple]:
    """
    Translate a set of ResidueKeys into full-ids for this structure.

    Raises
    ------
    ValueError
        If any key has no residue in `structure`; the message lists at most the
        first five missing keys.
    """
    lookup, _ = _build_key_maps(structure)
    found: Set[Tuple] = set()
    absent: List[ResidueKey] = []
    for key in keys:
        fid = lookup.get(key)
        if fid is not None:
            found.add(fid)
        else:
            absent.append(key)
    if absent:
        suffix = ' ...' if len(absent) > 5 else ''
        raise ValueError(f"Some residues not found in structure: {absent[:5]}{suffix}")
    return found
1589
+
1590
def _fids_to_keys(structure, fids: Iterable[Tuple]) -> Set[ResidueKey]:
    """
    Translate full-ids into the set of their ResidueKeys for this structure.
    """
    keys: Set[ResidueKey] = set()
    for fid in fids:
        keys.add(_residue_key_from_fid(structure, fid))
    return keys
1595
+
1596
def _substrate_residues_for_structs(structs: List[PDB.Structure.Structure],
                                    center_spec: str) -> List[List[PDB.Residue.Residue]]:
    """
    Resolve substrate residues per structure.

    Behavior
    --------
    * If `center_spec` is an existing path: resolve it against the first structure
      only, turn the matched residues into an ID list ("CHAIN:NUM[ICODE]" or
      "NUM[ICODE]" when the chain id is blank), and look that list up in every
      structure.
    * Otherwise, try `center_spec` as an ID list and apply it to all structures.
    * If that raises ValueError, treat `center_spec` as a residue‑name list and
      apply it to all structures.

    Returns
    -------
    One list of substrate residues per input structure, in input order.
    """
    if not os.path.exists(center_spec):
        # Distinguish ID-spec vs resname list by attempting to parse as IDs first.
        try:
            _parse_res_tokens(center_spec)
            return [find_substrate_by_idspec(st, center_spec) for st in structs]
        except ValueError:
            return [find_substrate_by_resname(st, center_spec) for st in structs]

    id_tokens = []
    for res in resolve_substrate_residues(structs[0], center_spec):
        chain_id = res.get_parent().id
        has_chain = bool((chain_id or "").strip())
        _het, num, icode = res.id
        icode_txt = "" if icode == " " else icode
        id_tokens.append(f"{chain_id}:{num}{icode_txt}" if has_chain else f"{num}{icode_txt}")
    idspec = ",".join(id_tokens)

    per_struct: List[List[PDB.Residue.Residue]] = []
    for st in structs:
        per_struct.append(find_substrate_by_idspec(st, idspec))
    return per_struct
1633
+
1634
def _disulfide_partner_keys(structure, candidate_keys: Set[ResidueKey],
                            cutoff: float = DISULFIDE_CUTOFF) -> Set[ResidueKey]:
    """
    Return ResidueKeys of disulfide partners to include for any selected CYS/CYX.

    Every CYS/CYX residue that has an SG atom is considered. For each pair of
    distinct SG atoms found within `cutoff` of one another, both residues' keys
    are returned when at least one of them is in `candidate_keys`.

    Returns
    -------
    set of ResidueKey (empty when the structure has no CYS/CYX SG atoms).
    """
    sg_atoms: List[PDB.Atom.Atom] = []
    res_of_atom: Dict[PDB.Atom.Atom, ResidueKey] = {}
    for res in structure.get_residues():
        if res.get_resname() in {"CYS", "CYX"} and "SG" in res:
            sg = res["SG"]
            sg_atoms.append(sg)
            res_of_atom[sg] = _residue_key_from_res(res)

    partners: Set[ResidueKey] = set()
    if not sg_atoms:
        return partners

    ns = NeighborSearch(sg_atoms)
    for at in sg_atoms:
        k1 = res_of_atom[at]
        for other in ns.search(at.get_coord(), cutoff):
            if other is at:
                # the search also returns the query atom itself
                continue
            k2 = res_of_atom[other]
            if (k1 in candidate_keys) or (k2 in candidate_keys):
                partners.update((k1, k2))
    return partners
1660
+
1661
+ def _assert_atom_ordering_identical(structs: List[PDB.Structure.Structure]):
1662
+ """
1663
+ Light consistency check across inputs:
1664
+ - Enforce identical atom counts.
1665
+ - Spot‑check ordering at the beginning and end of the atom list; if mismatched there (and overall lists differ),
1666
+ raise an error.
1667
+ """
1668
+ def signature(st: PDB.Structure.Structure) -> List[str]:
1669
+ sig: List[str] = []
1670
+ for model in st:
1671
+ for chain in model:
1672
+ for res in chain.get_residues():
1673
+ het, resseq, icode = res.id
1674
+ icode_txt = icode if icode != " " else ""
1675
+ base = f"{chain.id}|{het}|{resseq}{icode_txt}|{res.get_resname()}"
1676
+ for atom in res:
1677
+ sig.append(base + f"|{atom.get_name()}")
1678
+ return sig
1679
+ sig0 = signature(structs[0])
1680
+ for i in range(1, len(structs)):
1681
+ sigi = signature(structs[i])
1682
+ if len(sigi) != len(sig0):
1683
+ raise ValueError(f"[multi] Atom count mismatch between input #1 and input #{i+1}: {len(sig0)} vs {len(sigi)}")
1684
+ check_pairs = [(0, min(10, len(sig0))),
1685
+ (max(0, len(sig0)-10), len(sig0))]
1686
+ mismatch = False
1687
+ for a, b in check_pairs:
1688
+ if sig0[a:b] != sigi[a:b]:
1689
+ mismatch = True
1690
+ break
1691
+ if mismatch and sig0 != sigi:
1692
+ raise ValueError(f"[multi] Atom order mismatch between input #1 and input #{i+1}.")
1693
+
1694
+
1695
+ def _strip_trailing_END(text: str) -> str:
1696
+ """
1697
+ Remove trailing 'END' lines and ensure a final newline.
1698
+ """
1699
+ lines = [ln for ln in text.splitlines() if ln.strip() != "END"]
1700
+ out = "\n".join(lines)
1701
+ if not out.endswith("\n"):
1702
+ out += "\n"
1703
+ return out
1704
+
1705
+
1706
def _compute_linkH_defs(structure,
                        selected_ids: Set[Tuple],
                        skip_map: Dict[Tuple, Set[str]]) -> List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]]:
    """
    Deterministic list of link‑H definitions and coordinates.

    For each selected non-water residue, a link‑H is produced for every
    (parent, partner) bond where the parent atom is kept, the partner atom is
    removed by truncation, and the parent is a carbon. The H sits 1.09 Å from
    the parent along the parent → partner direction. PRO/HYP only consider the
    CA–C bond.

    Returns
    -------
    list of ((ResidueKey, cut_type), (x, y, z)), where cut_type ∈ {"CB-CA","CA-N","CA-C"}.
    Ordering is by residue file order, then by cut_type in the sequence above.
    """
    ring_cuts = (("CA", "C", "CA-C"),)
    standard_cuts = (
        ("CB", "CA", "CB-CA"),
        ("CA", "N", "CA-N"),
        ("CA", "C", "CA-C"),
    )
    bond_length = 1.09

    defs: List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]] = []
    for fid in _sorted_fids_by_file_order(structure, selected_ids):
        res: PDB.Residue.Residue = structure[fid[1]][fid[2]].child_dict[fid[3]]
        if res.get_resname() in WATER_RES:
            continue
        skipped = skip_map.get(fid, set())
        key = _residue_key_from_res(res)
        cuts = ring_cuts if res.get_resname() in {"PRO", "HYP"} else standard_cuts

        for parent_name, partner_name, cut_type in cuts:
            if not _atom_present_in_output(res, parent_name, skipped):
                continue
            if not _atom_removed_by_truncation(res, partner_name, skipped):
                continue
            parent = res[parent_name]
            partner = res[partner_name]
            # fall back to the first letter of the atom name when no element is set
            if (parent.element or parent.get_name()[0]).upper() != "C":
                continue
            origin = np.array(parent.get_coord(), dtype=float)
            direction = np.array(partner.get_coord(), dtype=float) - origin
            length = np.linalg.norm(direction)
            if not np.isfinite(length) or length < 1e-6:
                # degenerate or invalid geometry: no direction to place the H along
                continue
            direction /= length
            h = origin + direction * bond_length
            defs.append(((key, cut_type), (float(h[0]), float(h[1]), float(h[2]))))
    return defs
1751
+
1752
+
1753
def extract_multi(args: argparse.Namespace, api=False) -> Optional[Dict[str, Any]]:
    """
    Multi‑structure driver.

    Loads every input structure, unifies the pocket selection across them
    (OR over structures), truncates each one identically, writes the pockets
    either as one file per input or as a single multi‑MODEL PDB, and logs the
    charge summary of the first structure.

    Args
    ----
    args : argparse.Namespace
        Parsed CLI arguments (or equivalent) controlling selection, truncation, outputs.
    api : bool
        When true, return the result dictionary; otherwise return None.

    Returns
    -------
    dict | None
        {
          'outputs': List[str],
          'counts': List[{'raw_atoms': int, 'kept_atoms': int}],  # per model
          'charge_summary': {...},                                # computed on model #1
        }

    Raises
    ------
    RuntimeError
        Link‑H targets or their order differ between models.
    ValueError
        The number of output paths is neither 1 nor the number of inputs.
    """
    paths: List[str] = args.complex_pdb
    names = [f"complex{i+1}" for i in range(len(paths))]
    structs: List[PDB.Structure.Structure] = [load_structure(p, n) for p, n in zip(paths, names)]

    LOGGER.info("[extract:multi] Loaded %d structures.", len(structs))
    _assert_atom_ordering_identical(structs)

    # Substrates per structure (PDB-path -> first only, then propagate by IDs)
    subs_per_struct: List[List[PDB.Residue.Residue]] = _substrate_residues_for_structs(structs, args.substrate_pdb)

    # 1) Per-structure selection and backbone-contact → OR unify as keys
    union_sel_keys: Set[ResidueKey] = set()
    union_bb_contact_keys: Set[ResidueKey] = set()

    for st, subs in zip(structs, subs_per_struct):
        selected_ids, bb_contact_ids = select_residues(st, subs, args.radius, args.radius_het2het, args.include_H2O, args.exclude_backbone)
        union_sel_keys |= _fids_to_keys(st, selected_ids)
        union_bb_contact_keys |= _fids_to_keys(st, bb_contact_ids)

    LOGGER.info("[extract:multi] Initial union selection: %d residues; backbone-contact: %d residues.",
                len(union_sel_keys), len(union_bb_contact_keys))

    # 1a) Force-include residues via --selected-resn (OR across structures)
    if getattr(args, "selected_resn", ""):
        forced_union: Set[ResidueKey] = set()
        for st in structs:
            forced_res = find_substrate_by_idspec(st, args.selected_resn)
            forced_union |= {_residue_key_from_res(r) for r in forced_res}
        if forced_union:
            LOGGER.info("[extract:multi] Force-include (--selected-resn): +%d residues.", len(forced_union))
            union_sel_keys |= forced_union

    # 2) Disulfide partners (OR across structures)
    dis_keys_union: Set[ResidueKey] = set()
    for st in structs:
        dis_keys_union |= _disulfide_partner_keys(st, union_sel_keys, DISULFIDE_CUTOFF)
    if dis_keys_union:
        LOGGER.info("[extract:multi] Disulfide partner addition (union): +%d residues.", len(dis_keys_union))
        union_sel_keys |= dis_keys_union

    # 3) Backbone-contact neighbor augmentation (if exclude_backbone == False)
    keep_ncap_union: Set[ResidueKey] = set()
    keep_ccap_union: Set[ResidueKey] = set()
    if not args.exclude_backbone and union_bb_contact_keys:
        added_neighbor_union: Set[ResidueKey] = set()
        for st, subs in zip(structs, subs_per_struct):
            sel_ids = _keys_to_fids(st, union_sel_keys)
            bb_ids = _keys_to_fids(st, union_bb_contact_keys & _fids_to_keys(st, sel_ids))
            sub_ids = {r.get_full_id() for r in subs}
            # single call performs neighbor augmentation (sel_ids is updated in place,
            # as the difference taken below relies on) and returns cap-preservation flags
            kn_fids, kc_fids = augment_backbone_contact_neighbors(st, sel_ids, bb_ids, sub_ids)
            after_keys = _fids_to_keys(st, sel_ids)
            added_neighbor_union |= (after_keys - union_sel_keys)
            keep_ncap_union |= _fids_to_keys(st, kn_fids)
            keep_ccap_union |= _fids_to_keys(st, kc_fids)
        if added_neighbor_union:
            LOGGER.info("[extract:multi] Backbone-contact neighbor addition (union): +%d residues.",
                        len(added_neighbor_union))
            union_sel_keys |= added_neighbor_union

    # 4) PRO N-side neighbor augmentation (OR across structures)
    pro_prev_add_union: Set[ResidueKey] = set()
    for st in structs:
        sel_ids = _keys_to_fids(st, union_sel_keys)
        augment_proline_prev_neighbor(st, sel_ids)
        pro_prev_add_union |= _fids_to_keys(st, sel_ids) - union_sel_keys
    if pro_prev_add_union:
        LOGGER.info("[extract:multi] PRO N-side neighbor addition (union): +%d residues.",
                    len(pro_prev_add_union))
        union_sel_keys |= pro_prev_add_union

    # ==== Build skip maps per structure (using unified selection and cap-keep flags) ====
    selected_ids_per_struct: List[Set[Tuple]] = []
    skip_maps_per_struct: List[Dict[Tuple, Set[str]]] = []
    substrate_idsets_per_struct: List[Set[Tuple]] = []

    for st, subs in zip(structs, subs_per_struct):
        sel_fids = _keys_to_fids(st, union_sel_keys)
        selected_ids_per_struct.append(sel_fids)
        sub_ids = {r.get_full_id() for r in subs}
        substrate_idsets_per_struct.append(sub_ids)
        kn_fids = _keys_to_fids(st, keep_ncap_union) if (not args.exclude_backbone) else None
        kc_fids = _keys_to_fids(st, keep_ccap_union) if (not args.exclude_backbone) else None
        skip_map = mark_atoms_to_skip(st, sel_fids, sub_ids, args.exclude_backbone, kn_fids, kc_fids)
        skip_maps_per_struct.append(skip_map)

    # ==== Compute link‑H definitions for each model and ensure identical targets/order ====
    linkdefs_per_struct: List[List[Tuple[Tuple[ResidueKey, str], Tuple[float, float, float]]]] = [
        _compute_linkH_defs(st, sel_fids, skip_map)
        for st, sel_fids, skip_map in zip(structs, selected_ids_per_struct, skip_maps_per_struct)
    ]
    ref_targets = [ld[0] for ld in linkdefs_per_struct[0]]
    for model_no, linkdefs in enumerate(linkdefs_per_struct[1:], start=2):
        if [ld[0] for ld in linkdefs] != ref_targets:
            raise RuntimeError(
                f"[multi] link-H targets/order differ between model #1 and model #{model_no}. "
                f"Ensure inputs and options produce identical truncation across models."
            )
    LOGGER.info("[extract:multi] link-H targets common across models: %d.", len(ref_targets))

    # ==== Write outputs ====
    per_file_outputs = (len(args.output_pdb) == len(paths))
    if not per_file_outputs and len(args.output_pdb) != 1:
        raise ValueError("[extract:multi] Provide either a single output path for a multi‑MODEL PDB "
                         "or exactly N output paths where N == number of inputs for per‑structure outputs.")

    io = PDB.PDBIO()
    model_texts: List[str] = []
    model_counts: List[Dict[str, int]] = []

    per_model = zip(structs, selected_ids_per_struct, skip_maps_per_struct, linkdefs_per_struct)
    for m, (st, sel_fids, skip_map, linkdefs) in enumerate(per_model, start=1):
        io.set_structure(st)
        buf = _io.StringIO()
        io.save(buf, AS_Select(sel_fids, skip_map))
        main_text = _strip_trailing_END(buf.getvalue())

        # Atom-count diagnostics
        raw_atoms = sum(len(st[f[1]][f[2]].child_dict[f[3]]) for f in sel_fids)
        kept_atoms = sum(
            1 for fid in sel_fids
            for a in st[fid[1]][fid[2]].child_dict[fid[3]]
            if a.get_name() not in skip_map.get(fid, set())
        )
        LOGGER.info("[extract:multi] Raw atoms (model %d): %d", m, raw_atoms)
        LOGGER.info("[extract:multi] Atoms after truncation (model %d): %d", m, kept_atoms)
        model_counts.append({"raw_atoms": raw_atoms, "kept_atoms": kept_atoms})

        # Append TER + link‑H block (honor --add-linkH)
        link_coords = [coord for (_, coord) in linkdefs]
        if args.add_linkH and link_coords:
            if not main_text.endswith("\n"):
                main_text += "\n"
            parts = [main_text]
            last_line = main_text.splitlines()[-1].strip() if main_text.strip() else ""
            if last_line != "TER":
                parts.append("TER\n")
            # link-H serials continue after the highest serial already written
            start_serial = _max_serial_from_pdb_text(main_text)
            parts.append(_format_linkH_block(link_coords, start_serial))
            main_text = "".join(parts)

        model_texts.append(main_text)

    outputs: List[str] = []
    if per_file_outputs:
        for out_path, text in zip(args.output_pdb, model_texts):
            content = text if text.endswith("\n") else text + "\n"
            content += "END\n"
            with open(out_path, "w") as fh:
                fh.write(content)
            outputs.append(out_path)
            LOGGER.info("[extract:multi] Single‑model pocket saved to %s", out_path)
    else:
        out_path = args.output_pdb[0]
        with open(out_path, "w") as fh:
            for m, text in enumerate(model_texts, start=1):
                # MODEL record: serial right-justified in columns 11-14 (PDB format)
                fh.write(f"MODEL     {m:4d}\n")
                fh.write(text)
                fh.write("ENDMDL\n")
            fh.write("END\n")
        outputs.append(out_path)
        LOGGER.info("[extract:multi] Multi‑MODEL pocket saved to %s", out_path)

    # ==== Charge summary (first model only) ====
    charge_summary = compute_charge_summary(
        structs[0],
        selected_ids_per_struct[0],
        substrate_idsets_per_struct[0],
        getattr(args, "ligand_charge", None)
    )
    log_charge_summary("[extract:multi]", charge_summary)

    if not api:
        # CLI mode: results are on disk and in the log; nothing is returned.
        return None
    return {
        "outputs": outputs,
        "counts": model_counts,
        "charge_summary": charge_summary,
    }
1960
+
1961
+
1962
+ # ---------------------------------------------------------------------
1963
+ # PDB writer helper
1964
+ # ---------------------------------------------------------------------
1965
class AS_Select(PDB.Select):
    """
    Biopython ``Select`` filter applied while writing the pocket PDB.

    A residue passes when its full id is a member of ``selected_ids``. An atom
    passes unless its name is listed in ``skip_map`` under the full id of its
    parent residue.
    """

    def __init__(self, selected_ids: Set[Tuple], skip_map: Dict[Tuple, Set[str]]):
        # Both containers are stored by reference (no copy is made).
        self.ids, self.skip = selected_ids, skip_map

    def accept_residue(self, residue):
        """Return True when the residue's full id is in the selected set."""
        residue_key = residue.get_full_id()
        return residue_key in self.ids

    def accept_atom(self, atom):
        """Return False only for atom names listed in the parent residue's skip entry."""
        residue_key = atom.get_parent().get_full_id()
        if residue_key not in self.skip:
            # No skip entry for this residue: every atom is kept.
            return True
        return atom.get_name() not in self.skip[residue_key]
1979
+
1980
+
1981
+ # ---------------------------------------------------------------------
1982
+ # Main driver (single or multi) — CLI or API
1983
+ # ---------------------------------------------------------------------
1984
+
1985
def extract(args: argparse.Namespace | None = None, api=False) -> Optional[Dict[str, Any]]:
    """
    Run pocket extraction from the CLI or from a pre-built Namespace.

    With exactly one entry in ``args.complex_pdb`` the single-structure
    pipeline below runs; otherwise the call is delegated to ``extract_multi``.

    Args
    ----
    args : argparse.Namespace | None
        If None, CLI arguments are parsed via ``parse_args()``. Otherwise the
        provided Namespace is used (and mutated: zero radii are bumped and
        ``output_pdb`` is filled in when it is None).
    api : bool
        If True, return a structured result dictionary; if False (CLI), return None.

    Returns
    -------
    dict | None
        When api=True, returns { 'outputs', 'counts', 'charge_summary' }. Otherwise, None.
    """
    if args is None:
        args = parse_args()

    _configure_extract_logger(bool(args.verbose))

    # A cutoff of exactly zero is replaced by a tiny positive value.
    # NOTE(review): presumably the downstream search needs a strictly positive
    # radius — confirm against select_residues.
    if args.radius == 0.0:
        args.radius = 0.001
    if args.radius_het2het == 0.0:
        args.radius_het2het = 0.001

    # Log extract options. The attribute is spelled `add_linkH` everywhere else
    # in this function; the lowercase spelling never matched and always logged False.
    LOGGER.info("[extract] Options: radius=%.2f, radius_het2het=%.2f, "
                "include_H2O=%s, exclude_backbone=%s, add_linkH=%s, "
                "selected_resn='%s'",
                args.radius, args.radius_het2het,
                args.include_H2O, args.exclude_backbone,
                getattr(args, 'add_linkH', False),
                getattr(args, 'selected_resn', ''))

    # Default output names
    if args.output_pdb is None:
        if len(args.complex_pdb) > 1:
            # multiple inputs → per-file outputs: pocket_{original_filename}.pdb
            args.output_pdb = [
                f"pocket_{os.path.splitext(os.path.basename(p))[0]}.pdb"
                for p in args.complex_pdb
            ]
        else:
            args.output_pdb = ['pocket.pdb']

    # Multi-structure path (anything other than exactly one input)
    if len(args.complex_pdb) != 1:
        return extract_multi(args, api=api)

    # ---- Single-structure path ----
    complex_struct = load_structure(args.complex_pdb[0], "complex")

    # Resolve substrate residues from PDB path or residue-ID/name list
    substrate_residues = resolve_substrate_residues(complex_struct, args.substrate_pdb)
    substrate_ids = {r.get_full_id() for r in substrate_residues}
    LOGGER.info("[extract] Substrate residues matched: resseq %s",
                [r.id[1] for r in substrate_residues])

    selected_ids, backbone_contact_ids = select_residues(
        complex_struct, substrate_residues,
        args.radius, args.radius_het2het,
        args.include_H2O,
        args.exclude_backbone
    )

    # Force-include residues via --selected-resn
    if getattr(args, "selected_resn", ""):
        forced_res = find_substrate_by_idspec(complex_struct, args.selected_resn)
        add_n = 0
        for r in forced_res:
            fid = r.get_full_id()
            if fid not in selected_ids:
                selected_ids.add(fid)
                add_n += 1
        if add_n:
            LOGGER.info("[extract] Force-include (--selected-resn): +%d residues.", add_n)

    augment_disulfides(complex_struct, selected_ids)

    # Backbone-contact context (if enabled)
    keep_ncap_ids: Set[Tuple] = set()
    keep_ccap_ids: Set[Tuple] = set()
    if not args.exclude_backbone and backbone_contact_ids:
        kn, kc = augment_backbone_contact_neighbors(
            complex_struct, selected_ids, backbone_contact_ids, substrate_ids
        )
        keep_ncap_ids.update(kn)
        keep_ccap_ids.update(kc)

    # Ensure PRO's N-side neighbor is included (TER-aware)
    augment_proline_prev_neighbor(complex_struct, selected_ids)

    # Atom counts: a full id is indexed as (structure, model, chain, residue-id).
    raw = sum(len(complex_struct[f[1]][f[2]].child_dict[f[3]]) for f in selected_ids)
    LOGGER.info("[extract] Raw atoms: %d", raw)

    skip_map = mark_atoms_to_skip(
        complex_struct, selected_ids, substrate_ids,
        args.exclude_backbone,
        keep_ncap_ids if not args.exclude_backbone else None,
        keep_ccap_ids if not args.exclude_backbone else None
    )

    kept_atoms = sum(
        1 for fid in selected_ids
        for a in complex_struct[fid[1]][fid[2]].child_dict[fid[3]]
        if a.get_name() not in skip_map.get(fid, set())
    )
    LOGGER.info("[extract] Atoms after truncation: %d", kept_atoms)

    # Render the selection to text first so a link-H block can be appended.
    io = PDB.PDBIO()
    io.set_structure(complex_struct)

    buf = _io.StringIO()
    io.save(buf, AS_Select(selected_ids, skip_map))
    main_pdb_text = buf.getvalue()

    output_path = args.output_pdb[0]
    outputs: List[str] = []

    if args.add_linkH:
        link_coords = compute_linkH_atoms(complex_struct, selected_ids, skip_map)
        LOGGER.info("[extract] Link-H to add: %d", len(link_coords))

        # Drop END records so the link-H block can follow; END is re-added last.
        lines = [ln for ln in main_pdb_text.splitlines() if ln.strip() != "END"]
        main_no_end = "\n".join(lines)
        if not main_no_end.endswith("\n"):
            main_no_end += "\n"

        final_parts = [main_no_end]
        if link_coords:
            # Separate the link-H block with TER, but do not duplicate a bare
            # TER that already ends the text (same rule as the multi path).
            if not lines or lines[-1].strip() != "TER":
                final_parts.append("TER\n")
            start_serial = _max_serial_from_pdb_text(main_no_end)
            final_parts.append(_format_linkH_block(link_coords, start_serial))
        final_parts.append("END\n")

        with open(output_path, "w") as fh:
            fh.write("".join(final_parts))
        LOGGER.info("[extract] Binding-Pocket (Active Site) + link-H saved to %s", output_path)
        outputs.append(output_path)
    else:
        with open(output_path, "w") as fh:
            fh.write(main_pdb_text)
        LOGGER.info("[extract] Binding-Pocket (Active Site) saved to %s", output_path)
        outputs.append(output_path)

    # Charge summary (single model)
    charge_summary = compute_charge_summary(
        complex_struct, selected_ids, substrate_ids, getattr(args, "ligand_charge", None)
    )
    log_charge_summary("[extract]", charge_summary)

    if not api:
        return None

    return {
        "outputs": outputs,
        "counts": [{"raw_atoms": raw, "kept_atoms": kept_atoms}],
        "charge_summary": charge_summary,
    }
2149
+
2150
+
2151
def extract_api(complex_pdb: List[str],
                center: str,
                output: Optional[List[str]] = None,
                radius: float = 2.6,
                radius_het2het: float = 0.0,
                include_H2O: bool = True,
                exclude_backbone: bool = True,
                add_linkH: bool = True,
                selected_resn: str = "",
                ligand_charge: Optional[float | str | Dict[str, float]] = None,
                verbose: bool = False) -> Dict[str, Any]:
    """
    Programmatic entry point: pack keyword arguments into a Namespace and call
    ``extract(..., api=True)``.

    Args
    ----
    complex_pdb : list[str]
        Input PDB path(s). len==1 → single, len>1 → multi.
    center : str
        Substrate spec, forwarded as ``substrate_pdb``: a PDB path, a residue‑ID
        list 'A:123,456' (insertion codes OK), or a residue‑name list 'GPP,MMT'.
    output : list[str] | None
        Output path(s): one path for multi‑MODEL PDB, or N paths for per‑file outputs.
        If None or empty, ['pocket.pdb'] is used.
    radius : float
        Atom–atom cutoff (Å) for inclusion around substrate atoms.
    radius_het2het : float
        Independent hetero‑hetero cutoff (Å) for non‑C/H pairs.
    include_H2O : bool
        Include waters in the selection.
    exclude_backbone : bool
        Remove backbone atoms on non‑substrate amino acids (with safeguards).
    add_linkH : bool
        Add link‑H atoms for cut bonds (carbon‑only) and append as HL/LKH HETATM records.
    selected_resn : str
        Additional residues to force‑include (comma/space separated).
    ligand_charge : float | str | dict[str,float] | None
        Either a total charge (float/str) for unknown residues (prefer unknown substrate),
        or a mapping like {'GPP': -3, 'MMT': -1}. In mapping mode, other unknown residues remain 0.
    verbose : bool
        Enable INFO logging.

    Returns
    -------
    dict
        Same structure as `extract(..., api=True)`.
    """
    # Any falsy `output` (None or an empty list) falls back to the default name.
    resolved_output = output if output else ['pocket.pdb']

    # Attribute names mirror what `extract` reads from its Namespace.
    namespace_fields = {
        "complex_pdb": complex_pdb,
        "substrate_pdb": center,
        "output_pdb": resolved_output,
        "radius": radius,
        "radius_het2het": radius_het2het,
        "include_H2O": include_H2O,
        "exclude_backbone": exclude_backbone,
        "add_linkH": add_linkH,
        "selected_resn": selected_resn,
        "ligand_charge": ligand_charge,
        "verbose": verbose,
    }
    return extract(argparse.Namespace(**namespace_fields), api=True)