biotite 1.5.0__cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-313-x86_64-linux-gnu.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-313-x86_64-linux-gnu.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-313-x86_64-linux-gnu.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-313-x86_64-linux-gnu.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-313-x86_64-linux-gnu.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-313-x86_64-linux-gnu.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-313-x86_64-linux-gnu.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-313-x86_64-linux-gnu.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-313-x86_64-linux-gnu.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-313-x86_64-linux-gnu.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-313-x86_64-linux-gnu.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-313-x86_64-linux-gnu.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-313-x86_64-linux-gnu.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-313-x86_64-linux-gnu.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-313-x86_64-linux-gnu.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-313-x86_64-linux-gnu.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-313-x86_64-linux-gnu.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-313-x86_64-linux-gnu.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-313-x86_64-linux-gnu.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-313-x86_64-linux-gnu.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-313-x86_64-linux-gnu.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1403 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This module provides functions for base pair identification.
7
+ """
8
+
9
+ __name__ = "biotite.structure"
10
+ __author__ = "Tom David Müller"
11
+ __all__ = [
12
+ "base_pairs",
13
+ "map_nucleotide",
14
+ "base_stacking",
15
+ "base_pairs_edge",
16
+ "Edge",
17
+ "base_pairs_glycosidic_bond",
18
+ "GlycosidicBond",
19
+ ]
20
+
21
+ import warnings
22
+ from enum import IntEnum
23
+ import numpy as np
24
+ from biotite.structure.atoms import Atom, array
25
+ from biotite.structure.celllist import CellList
26
+ from biotite.structure.compare import rmsd
27
+ from biotite.structure.error import (
28
+ BadStructureError,
29
+ IncompleteStructureWarning,
30
+ UnexpectedStructureWarning,
31
+ )
32
+ from biotite.structure.filter import filter_nucleotides
33
+ from biotite.structure.hbond import hbond
34
+ from biotite.structure.info.standardize import standardize_order
35
+ from biotite.structure.residues import get_residue_masks, get_residue_starts_for
36
+ from biotite.structure.superimpose import superimpose
37
+ from biotite.structure.util import distance, norm_vector
38
+
39
+
40
def _get_std_adenine():
    """
    Build the reference adenine base and its characteristic centers.

    Returns
    -------
    standard_base : AtomArray
        The atoms of the adenine base with their standard coordinates,
        named according to the PDB File Format V3 nomenclature.
    coordinates : tuple (ndarray, ndarray, ndarray, dtype=float)
        The base center according to the SCHNaP paper referenced in
        ``base_pairs`` (the midpoint between ``N1`` and ``C4``),
        the center of the pyrimidine ring and the center of the
        imidazole ring, in this order.
    """
    # Atom name and standard position, listed in the order in which the
    # atoms appear in the returned array
    template = (
        ("N9", [-1.291, 4.498, 0.000]),
        ("C8", [0.024, 4.897, 0.000]),
        ("N7", [0.877, 3.902, 0.000]),
        ("C5", [0.071, 2.771, 0.000]),
        ("C6", [0.369, 1.398, 0.000]),
        ("N6", [1.611, 0.909, 0.000]),
        ("N1", [-0.668, 0.532, 0.000]),
        ("C2", [-1.912, 1.023, 0.000]),
        ("N3", [-2.320, 2.290, 0.000]),
        ("C4", [-1.267, 3.124, 0.000]),
    )
    atoms = {
        name: Atom(position, atom_name=name, res_name="A")
        for name, position in template
    }
    adenine = array(list(atoms.values()))

    def _center(*names):
        # Mean position of the atoms with the given names
        return np.mean([atoms[name].coord for name in names], axis=0)

    centers = (
        # Midpoint between the N1 and C4 atoms
        _center("N1", "C4"),
        # Center of the six-membered (pyrimidine) ring
        _center("C5", "C6", "N1", "C2", "N3", "C4"),
        # Center of the five-membered (imidazole) ring
        _center("N9", "C8", "N7", "C5", "C4"),
    )
    return adenine, centers
82
+
83
+
84
def _get_std_cytosine():
    """
    Build the reference cytosine base and its characteristic centers.

    Returns
    -------
    standard_base : AtomArray
        The atoms of the cytosine base with their standard coordinates,
        named according to the PDB File Format V3 nomenclature.
    coordinates : tuple (ndarray, ndarray, dtype=float)
        The base center according to the SCHNaP paper referenced in
        ``base_pairs`` (the midpoint between ``N3`` and ``C6``)
        and the center of the pyrimidine ring, in this order.
    """
    # Atom name and standard position, listed in the order in which the
    # atoms appear in the returned array
    template = (
        ("N1", [-1.285, 4.542, 0.000]),
        ("C2", [-1.472, 3.158, 0.000]),
        ("O2", [-2.628, 2.709, 0.000]),
        ("N3", [-0.391, 2.344, 0.000]),
        ("C4", [0.837, 2.868, 0.000]),
        ("N4", [1.875, 2.027, 0.000]),
        ("C5", [1.056, 4.275, 0.000]),
        ("C6", [-0.023, 5.068, 0.000]),
    )
    atoms = {
        name: Atom(position, atom_name=name, res_name="C")
        for name, position in template
    }
    cytosine = array(list(atoms.values()))

    def _center(*names):
        # Mean position of the atoms with the given names
        return np.mean([atoms[name].coord for name in names], axis=0)

    centers = (
        # Midpoint between the N3 and C6 atoms
        _center("N3", "C6"),
        # Center of the aromatic (pyrimidine) ring
        _center("N1", "C2", "N3", "C4", "C5", "C6"),
    )
    return cytosine, centers
118
+
119
+
120
def _get_std_guanine():
    """
    Build the reference guanine base and its characteristic centers.

    Returns
    -------
    standard_base : AtomArray
        The atoms of the guanine base with their standard coordinates,
        named according to the PDB File Format V3 nomenclature.
    coordinates : tuple (ndarray, ndarray, ndarray, dtype=float)
        The base center according to the SCHNaP paper referenced in
        ``base_pairs`` (the midpoint between ``N1`` and ``C4``),
        the center of the pyrimidine ring and the center of the
        imidazole ring, in this order.
    """
    # Atom name and standard position, listed in the order in which the
    # atoms appear in the returned array
    template = (
        ("N9", [-1.289, 4.551, 0.000]),
        ("C8", [0.023, 4.962, 0.000]),
        ("N7", [0.870, 3.969, 0.000]),
        ("C5", [0.071, 2.833, 0.000]),
        ("C6", [0.424, 1.460, 0.000]),
        ("O6", [1.554, 0.955, 0.000]),
        ("N1", [-0.700, 0.641, 0.000]),
        ("C2", [-1.999, 1.087, 0.000]),
        ("N2", [-2.949, 0.139, -0.001]),
        ("N3", [-2.342, 2.364, 0.001]),
        ("C4", [-1.265, 3.177, 0.000]),
    )
    atoms = {
        name: Atom(position, atom_name=name, res_name="G")
        for name, position in template
    }
    guanine = array(list(atoms.values()))

    def _center(*names):
        # Mean position of the atoms with the given names
        return np.mean([atoms[name].coord for name in names], axis=0)

    centers = (
        # Midpoint between the N1 and C4 atoms
        _center("N1", "C4"),
        # Center of the six-membered (pyrimidine) ring
        _center("C5", "C6", "N1", "C2", "N3", "C4"),
        # Center of the five-membered (imidazole) ring
        _center("N9", "C8", "N7", "C5", "C4"),
    )
    return guanine, centers
170
+
171
+
172
def _get_std_thymine():
    """
    Build the reference thymine base and its characteristic centers.

    Returns
    -------
    standard_base : AtomArray
        The atoms of the thymine base with their standard coordinates,
        named according to the PDB File Format V3 nomenclature.
    coordinates : tuple (ndarray, ndarray, dtype=float)
        The base center according to the SCHNaP paper referenced in
        ``base_pairs`` (the midpoint between ``N3`` and ``C6``)
        and the center of the pyrimidine ring, in this order.
    """
    # Atom name and standard position, listed in the order in which the
    # atoms appear in the returned array
    template = (
        ("N1", [-1.284, 4.500, 0.000]),
        ("C2", [-1.462, 3.135, 0.000]),
        ("O2", [-2.562, 2.608, 0.000]),
        ("N3", [-0.298, 2.407, 0.000]),
        ("C4", [0.994, 2.897, 0.000]),
        ("O4", [1.944, 2.119, 0.000]),
        ("C5", [1.106, 4.338, 0.000]),
        ("C7", [2.466, 4.961, 0.001]),
        ("C6", [-0.024, 5.057, 0.000]),
    )
    atoms = {
        name: Atom(position, atom_name=name, res_name="T")
        for name, position in template
    }
    thymine = array(list(atoms.values()))

    def _center(*names):
        # Mean position of the atoms with the given names
        return np.mean([atoms[name].coord for name in names], axis=0)

    centers = (
        # Midpoint between the N3 and C6 atoms
        _center("N3", "C6"),
        # Center of the aromatic (pyrimidine) ring
        _center("N1", "C2", "N3", "C4", "C5", "C6"),
    )
    return thymine, centers
207
+
208
+
209
def _get_std_uracil():
    """
    Build the reference uracil base and its characteristic centers.

    Returns
    -------
    standard_base : AtomArray
        The atoms of the uracil base with their standard coordinates,
        named according to the PDB File Format V3 nomenclature.
    coordinates : tuple (ndarray, ndarray, dtype=float)
        The base center according to the SCHNaP paper referenced in
        ``base_pairs`` (the midpoint between ``N3`` and ``C6``)
        and the center of the pyrimidine ring, in this order.
    """
    # Atom name and standard position, listed in the order in which the
    # atoms appear in the returned array
    template = (
        ("N1", [-1.284, 4.500, 0.000]),
        ("C2", [-1.462, 3.131, 0.000]),
        ("O2", [-2.563, 2.608, 0.000]),
        ("N3", [-0.302, 2.397, 0.000]),
        ("C4", [0.989, 2.884, 0.000]),
        ("O4", [1.935, 2.094, -0.001]),
        ("C5", [1.089, 4.311, 0.000]),
        ("C6", [-0.024, 5.053, 0.000]),
    )
    atoms = {
        name: Atom(position, atom_name=name, res_name="U")
        for name, position in template
    }
    uracil = array(list(atoms.values()))

    def _center(*names):
        # Mean position of the atoms with the given names
        return np.mean([atoms[name].coord for name in names], axis=0)

    centers = (
        # Midpoint between the N3 and C6 atoms
        _center("N3", "C6"),
        # Center of the aromatic (pyrimidine) ring
        _center("N1", "C2", "N3", "C4", "C5", "C6"),
    )
    return uracil, centers
243
+
244
+
245
# Reference bases and their precomputed center coordinates,
# built once when the module is imported
(_STD_ADENINE, _STD_ADENINE_RING_CENTERS) = _get_std_adenine()
(_STD_CYTOSINE, _STD_CYTOSINE_RING_CENTERS) = _get_std_cytosine()
(_STD_GUANINE, _STD_GUANINE_RING_CENTERS) = _get_std_guanine()
(_STD_THYMINE, _STD_THYMINE_RING_CENTERS) = _get_std_thymine()
(_STD_URACIL, _STD_URACIL_RING_CENTERS) = _get_std_uracil()

# Residue names, grouped by the base they contain
_ADENINE_CONTAINING_NUCLEOTIDES = ["A", "DA"]
_THYMINE_CONTAINING_NUCLEOTIDES = ["T", "DT"]
_CYTOSINE_CONTAINING_NUCLEOTIDES = ["C", "DC"]
_GUANINE_CONTAINING_NUCLEOTIDES = ["G", "DG"]
_URACIL_CONTAINING_NUCLEOTIDES = ["U", "DU"]
# All residue names for which a reference base is available
_REFERENCE_NUCLEOTIDE_NAMES = [
    *_ADENINE_CONTAINING_NUCLEOTIDES,
    *_THYMINE_CONTAINING_NUCLEOTIDES,
    *_CYTOSINE_CONTAINING_NUCLEOTIDES,
    *_GUANINE_CONTAINING_NUCLEOTIDES,
    *_URACIL_CONTAINING_NUCLEOTIDES,
]

# Atoms that are part of respective base edges according to the
# Leontis-Westhof nomenclature, keyed by the one-letter base name
_WATSON_CRICK_EDGE = dict(
    A=["N6", "N1"],
    G=["O6", "N1", "N2"],
    U=["O4", "N3", "O2"],
    T=["O4", "N3", "O2"],
    C=["N4", "N3", "O2"],
)
_HOOGSTEEN_EDGE = dict(
    A=["N6", "N7"],
    G=["O6", "N7"],
    U=["O4"],
    T=["O4"],
    C=["N4"],
)
_SUGAR_EDGE = dict(
    A=["N3", "O2'"],
    G=["N2", "N3", "O2'"],
    U=["O2", "O2'"],
    T=["O2", "O2'"],
    C=["O2", "O2'"],
)
# Edge tables in the order Watson-Crick, Hoogsteen, sugar
_EDGES = [_WATSON_CRICK_EDGE, _HOOGSTEEN_EDGE, _SUGAR_EDGE]
288
+
289
+
290
class Edge(IntEnum):
    """
    This enum type represents the interacting edge for a given base.

    The members are plain integers, so they can be stored in and
    compared with integer arrays.
    """

    # No edge could be assigned to the base
    INVALID = 0
    # The base interacts via its Watson-Crick edge
    WATSON_CRICK = 1
    # The base interacts via its Hoogsteen edge
    HOOGSTEEN = 2
    # The base interacts via its sugar edge
    SUGAR = 3
299
+
300
+
301
class GlycosidicBond(IntEnum):
    """
    This enum type represents the relative glycosidic bond orientation
    for a given base pair.

    The members are plain integers, so they can be stored in and
    compared with integer arrays.
    """

    # No orientation could be assigned to the base pair
    INVALID = 0
    # The glycosidic bonds are in cis orientation
    CIS = 1
    # The glycosidic bonds are in trans orientation
    TRANS = 2
310
+
311
+
312
def base_pairs_edge(atom_array, base_pairs):
    """
    Determine, for every base of the given base pairs, the edge that
    takes part in the pairing according to the Leontis-Westhof
    nomenclature.
    :footcite:`Leontis2001`

    The :class:`AtomArray` must contain hydrogens as it relies on
    :func:`hbond()`.

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` containing the bases.
    base_pairs : ndarray, dtype=int, shape=(n,2)
        Each row is equivalent to one base pair and contains the first
        indices of the residues corresponding to each base. The
        structure of the ``ndarray`` is the same as the output of
        :func:`base_pairs()`.

    Returns
    -------
    results : ndarray, dtype=uint8, shape=(n,2)
        The ``ndarray`` has the same dimensions as ``base_pairs``. Each
        cell corresponds to the interacting edge of the referenced base
        in ``base_pairs``. The edge type is stored as integer that is
        interpreted as member of the :class:`Edge` enum.

    See Also
    --------
    base_pairs : Get the base pairs required for this function.
    base_pairs_glycosidic_bond : Determine the orientation for each base pair.

    Notes
    -----
    If a base is not a canonical base (``A``, ``C``, ``G``, ``T``,
    ``U``) or no hydrogen bonds are found between the bases that conform
    to the interacting edges described by Leontis and Westhof, 0 is
    returned (corresponding to ``Edge.INVALID``).

    The edge returned always corresponds to the edge with the most
    hydrogen bonding interactions.

    References
    ----------

    .. footbibliography::

    Examples
    --------
    Compute the interacting base edges for the dna helix with the PDB
    id 1QXB:

    >>> from os.path import join
    >>> dna_helix = load_structure(
    ...     join(path_to_structures, "base_pairs", "1qxb.cif")
    ... )
    >>> basepairs = base_pairs(dna_helix)
    >>> interacting_edges = base_pairs_edge(dna_helix, basepairs)
    >>> print(interacting_edges)
    [[1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]
     [1 1]]

    The resulting integers can be interpreted as :class:`Edge` ``Enum``:

    >>> for interaction in interacting_edges:
    ...     print(f"{Edge(interaction[0]).name} to {Edge(interaction[1]).name}")
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    WATSON_CRICK to WATSON_CRICK
    """
    # Pre-fill the output with 0, i.e. ``Edge.INVALID``; it has the same
    # shape as the input
    results = np.zeros_like(base_pairs, dtype="uint8")

    # One residue mask per referenced base, regrouped so that the first
    # axis runs over the base pairs and the second over the two partners
    pair_masks = get_residue_masks(atom_array, base_pairs.flatten()).reshape(
        (base_pairs.shape[0], 2, atom_array.shape[0])
    )

    for pair_index, masks in enumerate(pair_masks):
        # Rows: the two bases, columns: number of hydrogen bonded atoms
        # on the Watson-Crick, Hoogsteen and Sugar edge
        edge_counts = _get_edge_matrix(atom_array, masks)

        for partner_index, counts in enumerate(edge_counts):
            # The edge with the most bonded atoms wins; a row of zeros
            # keeps the pre-filled ``Edge.INVALID``
            best_edge = np.argmax(counts)
            if counts[best_edge] != 0:
                results[pair_index, partner_index] = best_edge + 1
    return results
423
+
424
+
425
def _get_edge_matrix(atom_array, base_masks):
    """
    Count, for both bases of a pair, how many hydrogen bonded
    donor/acceptor atoms lie on each of the three base edges.

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` containing the bases.
    base_masks : ndarray, dtype=bool, shape=(2,n)
        Boolean masks for the interacting bases

    Returns
    -------
    matrix : ndarray, dtype=int, shape=(2,3)
        The edge matrix. Each row corresponds to a base and each column
        to the number of Watson-Crick-, Hoogsteen- and Sugar-edge
        interactions respectively.

    Raises
    ------
    BadStructureError
        If no hydrogen bond is found between the two bases.
    """
    bonds = hbond(atom_array, base_masks[0], base_masks[1])
    if len(bonds) == 0:
        raise BadStructureError(
            f"No hydrogen bonds between nucleotides with residue start "
            f"indices {np.argmax(base_masks[0])} and "
            f"{np.argmax(base_masks[1])}"
        )
    # Keep only the donor/acceptor heteroatom columns and flatten them
    # into one list of atom indices
    heteroatom_indices = bonds[:, (0, 2)].flatten()

    # Rows: bases, columns: edge types in the order of ``_EDGES``
    matrix = np.zeros((2, 3), dtype="int32")

    for atom, atom_index in zip(atom_array[heteroatom_indices], heteroatom_indices):
        # Edge definitions only exist for the reference nucleotides
        if atom.res_name not in _REFERENCE_NUCLEOTIDE_NAMES:
            continue
        # The last character of the residue name is the base letter
        base_letter = atom.res_name[-1]

        for base_index, base_mask in enumerate(base_masks):
            # Only tally the atom for the base it belongs to
            if not base_mask[atom_index]:
                continue
            for edge_index, edge_atoms in enumerate(_EDGES):
                if atom.atom_name in edge_atoms[base_letter]:
                    matrix[base_index, edge_index] += 1
    return matrix
477
+
478
+
479
def base_pairs_glycosidic_bond(atom_array, base_pairs):
    """
    Calculate the glycosidic bond orientation for given base pairs in an
    :class:`AtomArray` according to the Leontis-Westhof nomenclature.
    :footcite:`Leontis2001`

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` containing the bases.
    base_pairs : ndarray, dtype=int, shape=(n,2)
        Each row is equivalent to one base pair and contains the first
        indices of the residues corresponding to each base. The
        structure of the ``ndarray`` is the same as the output of
        :func:`base_pairs()`.

    Returns
    -------
    results : ndarray, dtype=uint8, shape=(n,)
        Each element corresponds to the respective row (base pair) in
        ``base_pairs``. The glycosidic bond orientation is stored as
        integer that is interpreted as member of the
        :class:`GlycosidicBond` class.

    See Also
    --------
    base_pairs : Get the base pairs required for this function.
    base_pairs_edge : Determine the interacting edge for each base pair.
    GlycosidicBond : The Enum type for interpretation of the return value.

    Notes
    -----
    The orientation is found using the geometric centers of the bases
    and the glycosidic bonds as described in :footcite:`Yang2003`.

    ``GlycosidicBond.INVALID`` is returned for a base pair, if one of
    its residues is not one of the reference nucleotides (``A``, ``C``,
    ``G``, ``T``, ``U`` and their ``D``-prefixed variants) or cannot be
    matched to a standard base.

    References
    ----------

    .. footbibliography::

    Examples
    --------
    Compute the glycosidic bond orientations for the dna helix with the
    PDB ID 1QXB:

    >>> from os.path import join
    >>> dna_helix = load_structure(
    ...     join(path_to_structures, "base_pairs", "1qxb.cif")
    ... )
    >>> basepairs = base_pairs(dna_helix)
    >>> orientations = base_pairs_glycosidic_bond(dna_helix, basepairs)
    >>> print(orientations)
    [1 1 1 1 1 1 1 1 1 1 1 1]

    The resulting integers can be interpreted as :class:`GlycosidicBond`
    ``Enum``:

    >>> for orientation in orientations:
    ...     print(GlycosidicBond(orientation).name)
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    CIS
    """
    results = np.zeros(len(base_pairs), dtype="uint8")

    # Get the residue masks for each residue
    base_pairs_masks = get_residue_masks(atom_array, base_pairs.flatten())

    # Group every two masks together for easy iteration (each 'row' is
    # respective to a row in ``base_pairs``)
    base_pairs_masks = base_pairs_masks.reshape(
        (base_pairs.shape[0], 2, atom_array.shape[0])
    )

    for i, pair_masks in enumerate(base_pairs_masks):
        # position vectors of each bases geometric center
        geometric_centers = np.zeros((2, 3))
        # direction vectors of the glycosidic bonds
        glycosidic_bonds = np.zeros((2, 3))

        for base_index, base_mask in enumerate(pair_masks):
            base = atom_array[base_mask]

            # ``_match_base()`` returns ``None`` if the residue cannot
            # be matched to a standard base; such a pair has no defined
            # orientation
            base_vectors = _match_base(base, 3)
            if base_vectors is None:
                results[i] = GlycosidicBond.INVALID
                break
            # Rows 3 and onwards are the aromatic ring centers
            ring_center = base_vectors[3:]

            # For Purines the glycosidic bond is between the C1' and the
            # N9 atoms, for pyrimidines it is between the C1' atom and
            # the N1 atom
            if (
                base.res_name[0] in _ADENINE_CONTAINING_NUCLEOTIDES
                or base.res_name[0] in _GUANINE_CONTAINING_NUCLEOTIDES
            ):
                # Purines have two rings: use the mean of both centers
                geometric_centers[base_index] = (ring_center[0] + ring_center[1]) / 2
                base_atom = base[base.atom_name == "N9"][0]

            elif (
                base.res_name[0] in _THYMINE_CONTAINING_NUCLEOTIDES
                or base.res_name[0] in _URACIL_CONTAINING_NUCLEOTIDES
                or base.res_name[0] in _CYTOSINE_CONTAINING_NUCLEOTIDES
            ):
                geometric_centers[base_index] = ring_center[0]
                base_atom = base[base.atom_name == "N1"][0]

            else:
                results[i] = GlycosidicBond.INVALID
                break

            sugar_atom = base[base.atom_name == "C1'"][0]

            # Calculate the glycosidic bond direction vector
            glycosidic_bonds[base_index] = sugar_atom.coord - base_atom.coord

        # Only reached if the loop above did not ``break``, i.e. both
        # bases are valid: compute the orientation
        else:
            # Calculate the direction vector between the geometric centers
            geometric_centers_dir = geometric_centers[1] - geometric_centers[0]

            # Check the orientation of the glycosidic bonds: they point
            # to opposite sides of the line connecting the base centers
            # if the cross products are anti-parallel
            if (
                np.dot(
                    np.cross(geometric_centers_dir, glycosidic_bonds[0]),
                    np.cross(geometric_centers_dir, glycosidic_bonds[1]),
                )
                < 0
            ):
                results[i] = GlycosidicBond.TRANS

            else:
                results[i] = GlycosidicBond.CIS

    return results
620
+
621
+
622
def base_stacking(atom_array, min_atoms_per_base=3):
    """
    Find pi-stacking interactions between aromatic rings
    in nucleic acids.

    The presence of base stacking is assumed if the following criteria
    are met :footcite:`Gabb1996`:

    (i) Distance between aromatic ring centers <=4.5 Å

    (ii) Angle between the ring normal vectors <=23°

    (iii) Angle between normalized distance vector between two ring
          centers and both bases' normal vectors <=40°

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` to find stacked bases in.
    min_atoms_per_base : integer, optional
        The number of atoms a nucleotides' base must have to be
        considered a candidate for a stacking interaction.

    Returns
    -------
    stacked_bases : ndarray, dtype=int, shape=(n,2)
        Each row is equivalent to one pair of stacked bases and
        contains the indices to the first atom for each one of both
        paired residues.

    Notes
    -----
    Please note that ring normal vectors are assumed to be equal to the
    base normal vectors.

    Candidate pairs in which one of the residues cannot be matched to a
    standard base are skipped.

    References
    ----------

    .. footbibliography::

    Examples
    --------
    Compute the stacking interactions for a DNA-double-helix (PDB ID
    1BNA):

    >>> from os.path import join
    >>> dna_helix = load_structure(
    ...     join(path_to_structures, "base_pairs", "1bna.pdb")
    ... )
    >>> stacking_interactions = base_stacking(dna_helix)
    >>> print(dna_helix[stacking_interactions].res_id)
    [[ 1  2]
     [ 2  3]
     [ 3  4]
     [ 4  5]
     [ 5  6]
     [ 6  7]
     [ 7  8]
     [ 8  9]
     [ 9 10]
     [11 12]
     [14 15]
     [15 16]
     [16 17]
     [17 18]
     [18 19]
     [19 20]
     [20 21]
     [21 22]
     [22 23]
     [23 24]]
    """
    # Get the stacking candidates according to a cutoff distance, where
    # each base is identified as the first index of its respective
    # residue.
    # The diameter from the C1'-sugar-atom across a purine base is ~5Å
    # and the distance between the base centers can be at most 4.5Å.
    # Thus, accounting for buffer, a cutoff of 15Å between the
    # nucleotides' C1'-atoms was chosen.
    c1_mask = filter_nucleotides(atom_array) & (atom_array.atom_name == "C1'")
    stacking_candidates, _ = _get_proximate_residues(atom_array, c1_mask, 15)

    # Contains the plausible pairs of stacked bases
    stacked_bases = []

    # Get the residue masks for each residue
    base_masks = get_residue_masks(atom_array, stacking_candidates.flatten())

    # Group every two masks together for easy iteration (each 'row' is
    # respective to a row in ``stacking_candidates``)
    base_masks = base_masks.reshape(
        (stacking_candidates.shape[0], 2, atom_array.shape[0])
    )

    for (base1_index, base2_index), (base1_mask, base2_mask) in zip(
        stacking_candidates, base_masks
    ):
        bases = (atom_array[base1_mask], atom_array[base2_mask])

        # A list containing ndarray for each base with transformed
        # vectors from the standard base reference frame to the
        # structures' coordinates. The layout is as follows:
        #
        # [Origin coordinates]
        # [Base normal vector]
        # [SCHNAaP origin coordinates]
        # [Aromatic Ring Center coordinates]
        transformed_std_vectors = []

        # Generate the data necessary for analysis of each base.
        # Stop at the first base that cannot be matched, so the second
        # base is not processed needlessly.
        for base in bases:
            base_vectors = _match_base(base, min_atoms_per_base)
            if base_vectors is None:
                break
            transformed_std_vectors.append(base_vectors)

        # Without reference vectors for both bases the stacking
        # criteria cannot be evaluated: skip this candidate pair
        if len(transformed_std_vectors) < 2:
            continue

        normal_vectors = np.vstack(
            (transformed_std_vectors[0][1], transformed_std_vectors[1][1])
        )
        aromatic_ring_centers = [
            transformed_std_vectors[0][3:],
            transformed_std_vectors[1][3:],
        ]

        # Check if the base pairs are stacked.
        stacked = _check_base_stacking(aromatic_ring_centers, normal_vectors)

        # If a stacking interaction is found, append the first indices
        # of the bases' residues to the output.
        if stacked:
            stacked_bases.append((base1_index, base2_index))

    return np.array(stacked_bases)
757
+
758
+
759
def base_pairs(atom_array, min_atoms_per_base=3, unique=True):
    """
    Use DSSR criteria to find the base pairs in an :class:`AtomArray`.

    The algorithm is able to identify canonical and non-canonical
    base pairs between the 5 common bases Adenine, Guanine, Thymine,
    Cytosine, and Uracil bound to Deoxyribose and Ribose.
    Each Base is mapped to the 5 common bases Adenine, Guanine, Thymine,
    Cytosine, and Uracil in a standard reference frame described in
    :footcite:`Olson2001` using :func:`map_nucleotide()`.

    The DSSR Criteria are as follows :footcite:`Lu2015`:

    (i) Distance between base origins <=15 Å

    (ii) Vertical separation between the base planes <=2.5 Å

    (iii) Angle between the base normal vectors <=65°

    (iv) Absence of stacking between the two bases

    (v) Presence of at least one hydrogen bond involving a base atom

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` to find base pairs in.
    min_atoms_per_base : integer, optional
        The number of atoms a nucleotides' base must have to be
        considered a candidate for a base pair.
    unique : bool, optional
        If ``True``, each base is assumed to be only paired with one
        other base. If multiple pairings are plausible, the pairing with
        the most hydrogen bonds is selected.

    Returns
    -------
    basepairs : ndarray, dtype=int, shape=(n,2)
        Each row is equivalent to one base pair and contains the first
        indices of the residues corresponding to each base.

    Notes
    -----
    The bases from the standard reference frame described in
    :footcite:`Olson2001` were modified such that only the base atoms
    are implemented.
    Sugar atoms (specifically C1') were disregarded, as nucleosides such
    as PSU do not possess the usual N-glycosidic linkage, thus leading
    to inaccurate results.

    The vertical separation is implemented as the scalar
    projection of the distance vectors between the base origins
    according to :footcite:`Lu1997` onto the averaged base normal
    vectors.

    The presence of base stacking is assumed if the following criteria
    are met :footcite:`Gabb1996`:

    (i) Distance between aromatic ring centers <=4.5 Å

    (ii) Angle between the ring normal vectors <=23°

    (iii) Angle between normalized distance vector between two ring
          centers and both bases' normal vectors <=40°

    Please note that ring normal vectors are assumed to be equal to the
    base normal vectors.

    For structures without hydrogens the accuracy of the algorithm is
    limited as the hydrogen bonds can only be checked for
    plausibility.
    A hydrogen bond is considered as plausible if a cutoff of 3.6 Å
    between N/O atom pairs is met. 3.6Å was chosen as hydrogen bonds are
    typically 1.5-2.5Å in length. N-H and O-H bonds have a length of
    1.00Å and 0.96Å respectively. Thus, including some buffer, a 3.6Å
    cutoff should cover all hydrogen bonds.

    References
    ----------

    .. footbibliography::

    Examples
    --------
    Compute the base pairs for the structure with the PDB ID 1QXB:

    >>> from os.path import join
    >>> dna_helix = load_structure(
    ...     join(path_to_structures, "base_pairs", "1qxb.cif")
    ... )
    >>> basepairs = base_pairs(dna_helix)
    >>> print(dna_helix[basepairs].res_name)
    [['DC' 'DG']
     ['DG' 'DC']
     ['DC' 'DG']
     ['DG' 'DC']
     ['DA' 'DT']
     ['DA' 'DT']
     ['DT' 'DA']
     ['DT' 'DA']
     ['DC' 'DG']
     ['DG' 'DC']
     ['DC' 'DG']
     ['DG' 'DC']]
    """

    # Get the nucleotides for the given atom_array
    nucleotides_boolean = filter_nucleotides(atom_array)

    # Disregard the phosphate-backbone
    non_phosphate_boolean = ~np.isin(
        atom_array.atom_name, ["O5'", "P", "OP1", "OP2", "OP3", "HOP2", "HOP3"]
    )

    # Combine the two boolean masks
    boolean_mask = nucleotides_boolean & non_phosphate_boolean

    # Get only nucleosides
    # NOTE: from here on all indices refer to ``nucleosides``, not to
    # ``atom_array``; they are mapped back at the end of the function
    nucleosides = atom_array[boolean_mask]

    # Get the base pair candidates according to a N/O cutoff distance,
    # where each base is identified as the first index of its respective
    # residue
    n_o_mask = np.isin(nucleosides.element, ["N", "O"])
    basepair_candidates, n_o_matches = _get_proximate_residues(
        nucleosides, n_o_mask, 3.6
    )

    # Contains the plausible base pairs
    basepairs = []
    # Contains the number of hydrogen bonds for each plausible base pair
    # (only filled if ``unique`` is true)
    basepairs_hbonds = []

    # Get the residue masks for each residue
    base_masks = get_residue_masks(nucleosides, basepair_candidates.flatten())

    # Group every two masks together for easy iteration (each 'row' is
    # respective to a row in ``basepair_candidates``)
    base_masks = base_masks.reshape(
        (basepair_candidates.shape[0], 2, nucleosides.shape[0])
    )

    for (base1_index, base2_index), (base1_mask, base2_mask), n_o_pairs in zip(
        basepair_candidates, base_masks, n_o_matches
    ):
        base1 = nucleosides[base1_mask]
        base2 = nucleosides[base2_mask]

        # -1: criteria not met, None: criteria met but hydrogens are
        # missing, otherwise the number of hydrogen bonds
        hbonds = _check_dssr_criteria((base1, base2), min_atoms_per_base, unique)

        # If no hydrogens are present use the number N/O pairs to
        # decide between multiple pairing possibilities.

        if hbonds is None:
            # Each N/O-pair is detected twice. Thus, the number of
            # matches must be divided by two.
            hbonds = n_o_pairs / 2
        if hbonds != -1:
            basepairs.append((base1_index, base2_index))
            if unique:
                basepairs_hbonds.append(hbonds)

    basepair_array = np.array(basepairs)

    if unique:
        # Contains all non-unique base pairs that are flagged to be
        # removed
        to_remove = []

        # Get all bases that have non-unique pairing interactions
        base_indices, occurrences = np.unique(basepairs, return_counts=True)
        for base_index, occurrence in zip(base_indices, occurrences):
            if occurrence > 1:
                # Write the non-unique base pairs to a dictionary as
                # 'index: number of hydrogen bonds'
                remove_candidates = {}
                for i, row in enumerate(np.asarray(basepair_array == base_index)):
                    if np.any(row):
                        remove_candidates[i] = basepairs_hbonds[i]
                # Flag all non-unique base pairs for removal except the
                # one that has the most hydrogen bonds
                del remove_candidates[max(remove_candidates, key=remove_candidates.get)]
                to_remove += list(remove_candidates.keys())
        # Remove all flagged base pairs from the output `ndarray`
        basepair_array = np.delete(basepair_array, to_remove, axis=0)

    # Remap values to original atom array
    if len(basepair_array) > 0:
        # Translate indices into ``nucleosides`` to indices into
        # ``atom_array`` ...
        basepair_array = np.where(boolean_mask)[0][basepair_array]
        # ... and move each index to the first atom of its residue
        for i, row in enumerate(basepair_array):
            basepair_array[i] = get_residue_starts_for(atom_array, row)
    return basepair_array
951
+
952
+
953
def _check_dssr_criteria(basepair, min_atoms_per_base, unique):
    """
    Check the DSSR criteria of a potential base pair.

    Parameters
    ----------
    basepair : tuple (AtomArray, AtomArray)
        The two bases to check the criteria for as :class:`AtomArray`.
    min_atoms_per_base : int
        The number of atoms a nucleotides' base must have to be
        considered a candidate for a base pair.
    unique : bool
        Currently not evaluated by this function: the number of
        hydrogen bonds is returned for plausible base pairs regardless
        of this value.

    Returns
    -------
    satisfied : int or None
        ``-1`` if the base pair does not satisfy the criteria.
        If both bases contain hydrogen atoms and the criteria are
        satisfied, the number of hydrogen bonds (``> 0``) is returned.
        ``None`` is returned if the geometric criteria are satisfied,
        but at least one base lacks hydrogen atoms, so that the
        hydrogen bonds cannot be checked.
    """

    # A list containing ndarray for each base with transformed
    # vectors from the standard base reference frame to the structures'
    # coordinates. The layout is as follows:
    #
    # [Origin coordinates]
    # [Base normal vector]
    # [SCHNAaP origin coordinates]
    # [Aromatic Ring Center coordinates]
    transformed_std_vectors = [None] * 2

    # Generate the data necessary for analysis of each base.
    for i in range(2):
        transformed_std_vectors[i] = _match_base(basepair[i], min_atoms_per_base)

        if transformed_std_vectors[i] is None:
            return -1

    origins = np.vstack((transformed_std_vectors[0][0], transformed_std_vectors[1][0]))
    normal_vectors = np.vstack(
        (transformed_std_vectors[0][1], transformed_std_vectors[1][1])
    )
    schnaap_origins = np.vstack(
        (transformed_std_vectors[0][2], transformed_std_vectors[1][2])
    )
    aromatic_ring_centers = [
        transformed_std_vectors[0][3:],
        transformed_std_vectors[1][3:],
    ]

    # Criterion 1: Distance between origins <=15 Å
    if not (distance(origins[0], origins[1]) <= 15):
        return -1

    # Criterion 2: Vertical separation <=2.5 Å
    #
    # Average the base normal vectors. If the angle between the vectors
    # is >=90°, flip one vector before averaging
    normal_vectors_dot = np.dot(normal_vectors[0], normal_vectors[1])
    mean_normal_vector = (
        normal_vectors[0] + (normal_vectors[1] * np.sign(normal_vectors_dot))
    ) / 2
    # ``norm_vector()`` normalizes in-place
    norm_vector(mean_normal_vector)
    # Calculate the distance vector between the two SCHNAaP origins
    origin_distance_vector = schnaap_origins[1] - schnaap_origins[0]

    # The scalar projection of the distance vector between the two
    # origins onto the averaged normal vectors is the vertical
    # separation
    if not abs(np.dot(origin_distance_vector, mean_normal_vector)) <= 2.5:
        return -1

    # Criterion 3: Angle between normal vectors <=65°
    #
    # The check is expressed as 'angle >= 115°', i.e. the supplementary
    # angle is <=65°.
    # NOTE(review): presumably this is because the normal vectors of
    # paired bases point into opposite directions - confirm.
    #
    # Rounding errors may push the dot product of two unit vectors
    # slightly outside of [-1, 1], where ``arccos()`` returns NaN.
    # Without clipping, (almost) perfectly anti-parallel normal vectors
    # would be rejected, as every comparison with NaN is false.
    normal_vectors_angle = np.arccos(np.clip(normal_vectors_dot, -1.0, 1.0))
    if not (normal_vectors_angle >= ((115 * np.pi) / 180)):
        return -1

    # Criterion 4: Absence of stacking
    if _check_base_stacking(aromatic_ring_centers, normal_vectors):
        return -1

    # Criterion 5: Presence of at least one hydrogen bond
    #
    # Check if both bases came with hydrogens.
    if ("H" in basepair[0].element) and ("H" in basepair[1].element):
        # For Structures that contain hydrogens, check for their
        # presence directly.
        #
        # Generate input atom array for ``hbond``
        potential_basepair = basepair[0] + basepair[1]

        # Get the number of hydrogen bonds
        bonds = len(
            hbond(
                potential_basepair,
                np.ones_like(potential_basepair, dtype=bool),
                np.ones_like(potential_basepair, dtype=bool),
            )
        )

        if bonds > 0:
            return bonds
        return -1

    else:
        # If the structure does not contain hydrogens return None
        return None
1064
+
1065
+
1066
def _check_base_stacking(aromatic_ring_centers, normal_vectors):
    """
    Decide whether two bases stack onto each other.

    Parameters
    ----------
    aromatic_ring_centers : list [ndarray, ndarray]
        A list with the aromatic ring center coordinates as
        :class:`ndarray`. Each row represents a ring center.
    normal_vectors : ndarray shape=(2, 3)
        The normal vectors of the bases.

    Returns
    -------
    base_stacking : bool
        ``True`` if base stacking is detected and ``False`` if not
    """
    # Criterion 1: Distance between aromatic ring centers <=4.5 Å
    #
    # Collect the normalized connecting vector of every pair of ring
    # centers (one from each base) that is close enough
    connecting_directions = []
    for center_a in aromatic_ring_centers[0]:
        for center_b in aromatic_ring_centers[1]:
            if not distance(center_a, center_b) <= 4.5:
                continue
            direction = center_b - center_a
            # Normalization happens in-place
            norm_vector(direction)
            connecting_directions.append(direction)
    # No pair of ring centers is close enough
    if not connecting_directions:
        return False

    # Criterion 2: Angle between normal vectors or its supplement <=23°
    normal_angle = np.rad2deg(np.arccos(np.dot(normal_vectors[0], normal_vectors[1])))
    if 23 <= normal_angle <= 157:
        return False

    # Criterion 3: Angle between one normalized distance vector and
    # each of the bases' normal vector or supplement <=40°
    #
    # Every collected connecting vector is tested against both normal
    # vectors; a single angle within [40°, 140°] rules out stacking
    for normal in normal_vectors:
        for direction in connecting_directions:
            tilt_angle = np.rad2deg(np.arccos(np.dot(normal, direction)))
            if 40 <= tilt_angle <= 140:
                return False

    return True
1117
+
1118
+
1119
def _match_base(nucleotide, min_atoms_per_base):
    """
    Match the nucleotide to a corresponding standard base reference
    frame.

    Parameters
    ----------
    nucleotide : AtomArray
        The nucleotide to be matched to a standard base.
    min_atoms_per_base : integer
        The number of atoms a base must have to be considered a
        candidate for a base pair.

    Returns
    -------
    vectors : ndarray, dtype=float, shape=(n,3) or None
        The vectors of the standard reference frame transformed to the
        coordinates of the nucleotide.
        The first row contains the origin coordinates, the second row
        the normalized base normal vector and the remaining rows the
        transformed ring center entries of the standard base (callers
        interpret row 2 as SCHNAaP origin and the rows from 3 onwards
        as aromatic ring centers).
        ``None`` if the nucleotide cannot be mapped to a standard base
        or has less than `min_atoms_per_base` atoms in common with it.

    Warns
    -----
    IncompleteStructureWarning
        If the nucleotide has less than `min_atoms_per_base` atoms in
        common with the standard base.
    """

    # Standard vectors containing the origin and the base normal vectors
    vectors = np.array([[0, 0, 0], [0, 0, 1]], dtype=float)

    # Map the nucleotide to a reference base
    one_letter_code, _ = map_nucleotide(nucleotide, min_atoms_per_base)

    if one_letter_code is None:
        return None

    # Standard base and its ring centers for each one-letter-code
    std_references = {
        "A": (_STD_ADENINE, _STD_ADENINE_RING_CENTERS),
        "T": (_STD_THYMINE, _STD_THYMINE_RING_CENTERS),
        "C": (_STD_CYTOSINE, _STD_CYTOSINE_RING_CENTERS),
        "G": (_STD_GUANINE, _STD_GUANINE_RING_CENTERS),
        "U": (_STD_URACIL, _STD_URACIL_RING_CENTERS),
    }
    std_base, std_ring_centers = std_references[one_letter_code]

    # Add the ring centers to the array of vectors to be transformed.
    vectors = np.vstack((vectors, std_ring_centers))

    # Select the matching atoms of the nucleotide and the standard base
    nucleotide_matched = nucleotide[np.isin(nucleotide.atom_name, std_base.atom_name)]
    std_base_matched = std_base[np.isin(std_base.atom_name, nucleotide.atom_name)]
    # Ensure the nucleotide does not contain duplicate atom names
    _, unique_indices = np.unique(nucleotide_matched.atom_name, return_index=True)
    nucleotide_matched = nucleotide_matched[unique_indices]
    # Only continue if minimum number of matching atoms is reached
    if len(nucleotide_matched) < min_atoms_per_base:
        warnings.warn(
            f"Nucleotide with res_id {nucleotide.res_id[0]} and "
            f"chain_id {nucleotide.chain_id[0]} has less than "
            f"{min_atoms_per_base} base atoms, unable to check for base pair.",
            IncompleteStructureWarning,
        )
        return None
    # Reorder the atoms of the nucleotide to obtain the standard RCSB
    # PDB atom order.
    nucleotide_matched = nucleotide_matched[standardize_order(nucleotide_matched)]

    # Match the selected std_base to the base.
    _, transformation = superimpose(nucleotide_matched, std_base_matched)
    vectors = transformation.apply(vectors)
    # The transformed normal vector is still given as a point: convert
    # it into a direction relative to the transformed origin and
    # normalize it in-place
    vectors[1, :] = vectors[1, :] - vectors[0, :]
    norm_vector(vectors[1, :])

    return vectors
1194
+
1195
+
1196
def map_nucleotide(residue, min_atoms_per_base=3, rmsd_cutoff=0.28):
    """
    Map a nucleotide to one of the 5 common bases Adenine, Guanine,
    Thymine, Cytosine, and Uracil. If one of those bases bound to
    Deoxyribose or Ribose is detected as input, the corresponding one-
    letter-code (``A``, ``G``, ``T``, ``C``, ``U``) is returned.

    If a different nucleotide is given, it is mapped to the best
    fitting base using the algorithm described below.

    (i) The number of matching atom names with the reference bases is counted.
    If the number of matching atoms with all reference bases is less than the
    specified `min_atoms_per_base` the nucleotide cannot be mapped and ``None`` is
    returned as one-letter-code.

    (ii) The bases with maximum number of matching atoms are selected and superimposed
    with each reference.
    The base with lowest RMSD is chosen.
    If the RMSD is not below the specified `rmsd_cutoff`, the nucleotide cannot be
    mapped and ``None`` is returned as one-letter-code.

    Parameters
    ----------
    residue : AtomArray
        The nucleotide to be mapped.
    min_atoms_per_base : int, optional
        The number of atoms the residue must have in common with the
        reference.
    rmsd_cutoff : float, optional
        The maximum RMSD that is allowed for a mapping to occur.

    Returns
    -------
    one_letter_code : str or None
        The one-letter-code of the mapped base. ``None`` if no base can
        be mapped.
    exact_match : bool
        Whether or not the residue name exactly matches one of the common
        bases, i.e. the ``res_name`` of the input `residue` is one of
        ``A``, ``G``, ``T``, ``C``, ``U``, ``DA``, ``DG``, ``DT``,
        ``DC`` or ``DU``.

    Warns
    -----
    IncompleteStructureWarning
        If the overlap with every reference base is smaller than
        `min_atoms_per_base`.
    UnexpectedStructureWarning
        If no reference base could be fitted below `rmsd_cutoff`.

    Notes
    -----
    The default RMSD cutoff was chosen according to :footcite:`Lu2015`,
    where the same cutoff is used to detect if a given base is a
    nucleotide, by superimposing the base ring atoms onto a reference
    structure.

    References
    ----------

    .. footbibliography::
    """
    # Check if the residue is a 'standard' nucleotide
    if residue.res_name[0] in _REFERENCE_NUCLEOTIDE_NAMES:
        return residue.res_name[0][-1], True

    # List of the standard bases for easy iteration
    std_base_list = [
        _STD_ADENINE,
        _STD_THYMINE,
        _STD_CYTOSINE,
        _STD_GUANINE,
        _STD_URACIL,
    ]

    # The number of matched atoms for each 'standard' base
    matched_atom_no = [
        np.sum(np.isin(ref_base.atom_name, residue.atom_name))
        for ref_base in std_base_list
    ]
    max_matched = max(matched_atom_no)

    if max_matched < min_atoms_per_base:
        warnings.warn(
            f"Base with res_id {residue.res_id[0]} and chain_id "
            f"{residue.chain_id[0]} has an overlap with the reference "
            f"bases which is less than {min_atoms_per_base} atoms. "
            f"Unable to map nucleotide.",
            IncompleteStructureWarning,
        )
        return None, False

    # Only the reference bases sharing the maximum number of atom names
    # with the residue are candidates for the superimposition
    candidate_bases = [
        ref_base
        for ref_base, matched in zip(std_base_list, matched_atom_no)
        if matched == max_matched
    ]

    # The one letter code of the best matching reference base
    best_base = None
    # The lowest RMSD found so far; starts at the cutoff, so that only
    # fits below the cutoff can ever be accepted
    best_rmsd = rmsd_cutoff

    for ref_base in candidate_bases:
        # Copy the residue as the res_name property of the ``AtomArray``
        # has to be modified for later function calls.
        nuc = residue.copy()

        # Select the matching atoms of the nucleotide and the reference
        # base
        nuc = nuc[np.isin(nuc.atom_name, ref_base.atom_name)]
        ref_base_matched = ref_base[np.isin(ref_base.atom_name, nuc.atom_name)]

        # Set the res_name property to the same as the reference base.
        # This is a requirement for ``standardize_order``
        # NOTE(review): if `residue` contains duplicate atom names, `nuc`
        # may hold more atoms than `ref_base_matched` - confirm that this
        # assignment cannot fail before the ``try`` block is reached.
        nuc.res_name = ref_base_matched.res_name
        # Reorder the atoms of the nucleotide to obtain the standard
        # RCSB PDB atom order. If a residue contains multiple atoms with
        # the same ``atom_name`` an exception is thrown by
        # ``standardize_order``. The exception is caught and the
        # selected reference is disregarded
        try:
            nuc = nuc[standardize_order(nuc)]
        except Exception:
            continue

        # Superimpose the nucleotide to the reference base
        fitted, _ = superimpose(ref_base_matched, nuc)

        # If the RMSD is lower than the specified cutoff or better than
        # a previous found reference, the current reference is selected
        # as best base
        current_rmsd = rmsd(fitted, ref_base_matched)
        if current_rmsd < best_rmsd:
            best_rmsd = current_rmsd
            best_base = ref_base_matched.res_name[0][-1]

    if best_base is None:
        warnings.warn(
            f"Base Type {residue.res_name[0]} not supported. ",
            UnexpectedStructureWarning,
        )
        # Return a tuple like every other exit path, so that callers
        # unpacking ``(one_letter_code, exact_match)`` do not fail
        return None, False

    return best_base, False
1327
+
1328
+
1329
def _get_proximate_residues(atom_array, boolean_mask, cutoff):
    """
    Find residue pairs whose selected atoms are close to each other.

    Parameters
    ----------
    atom_array : AtomArray, shape=(n,)
        The :class:`AtomArray` to find basepair candidates in.
    boolean_mask : ndarray, dtype=bool, shape=(n,)
        The selection of atoms taken into account.
    cutoff : integer
        The maximum distance between the atoms of the two residues.

    Returns
    -------
    pairs : ndarray, dtype=int, shape=(n,2)
        The basepair candidates. Each row represents one potential
        basepair, given as the (ascendingly sorted) first atom indices
        of the two residues involved. Each pair appears only once.
    count : ndarray, dtype=int, shape=(n,)
        The number of atom pairs between the residues within the
        specified cutoff.
    """
    # Indices (with respect to `atom_array`) of the selected atoms
    selected_indices = np.argwhere(boolean_mask)[:, 0]

    # For each selected atom, look up the selected atoms within `cutoff`
    cell_list = CellList(atom_array, cutoff, selection=boolean_mask)
    neighbor_rows = cell_list.get_atoms(atom_array.coord[boolean_mask], cutoff)

    # Collect (atom, neighbor) index tuples; entries equal to -1 are
    # skipped (presumably padding of the returned index array)
    atom_pairs = np.array(
        [
            (atom_index, neighbor)
            for atom_index, neighbors in zip(selected_indices, neighbor_rows)
            for neighbor in neighbors
            if neighbor != -1
        ]
    )

    # Replace each atom index by the start index of its residue,
    # keeping the two-column layout
    residue_pairs = get_residue_starts_for(
        atom_array, atom_pairs.flatten()
    ).reshape(atom_pairs.shape)

    # Drop pairs where both atoms belong to the same residue
    different_residue = residue_pairs[:, 0] != residue_pairs[:, 1]
    residue_pairs = residue_pairs[different_residue]
    # Bring the two residue starts of each pair into ascending order,
    # so that (a, b) and (b, a) become identical rows
    residue_pairs = np.sort(residue_pairs, axis=1)
    # List each pair only once and count how often it occurred
    pairs, count = np.unique(residue_pairs, axis=0, return_counts=True)

    return pairs, count
1384
+
1385
+
1386
def _filter_atom_type(atom_array, atom_names):
    """
    Create a mask selecting all atoms with one of the given atom names.

    Parameters
    ----------
    atom_array : AtomArray
        The :class:`AtomArray` to filter.
    atom_names : array_like
        The desired atom names.

    Returns
    -------
    filter : ndarray, dtype=bool
        This array is ``True`` for all indices in the :class:`AtomArray`,
        where the atom has one of the desired atom names and its
        ``res_id`` is not ``-1``.
    """
    has_desired_name = np.isin(atom_array.atom_name, atom_names)
    has_valid_res_id = atom_array.res_id != -1
    return has_desired_name & has_valid_res_id