biotite 1.5.0__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cp314-win_amd64.pyd +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cp314-win_amd64.pyd +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cp314-win_amd64.pyd +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cp314-win_amd64.pyd +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cp314-win_amd64.pyd +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cp314-win_amd64.pyd +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cp314-win_amd64.pyd +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cp314-win_amd64.pyd +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cp314-win_amd64.pyd +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cp314-win_amd64.pyd +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cp314-win_amd64.pyd +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cp314-win_amd64.pyd +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cp314-win_amd64.pyd +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cp314-win_amd64.pyd +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cp314-win_amd64.pyd +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cp314-win_amd64.pyd +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cp314-win_amd64.pyd +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cp314-win_amd64.pyd +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cp314-win_amd64.pyd +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cp314-win_amd64.pyd +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cp314-win_amd64.pyd +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +4 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,2113 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdbx"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
7
+ __all__ = [
8
+ "get_sequence",
9
+ "get_model_count",
10
+ "get_structure",
11
+ "set_structure",
12
+ "get_component",
13
+ "set_component",
14
+ "list_assemblies",
15
+ "get_assembly",
16
+ "get_unit_cell",
17
+ "get_sse",
18
+ ]
19
+
20
+ import itertools
21
+ import warnings
22
+ from collections import defaultdict
23
+ import numpy as np
24
+ from biotite.file import InvalidFileError
25
+ from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
26
+ from biotite.structure.atoms import (
27
+ AtomArray,
28
+ AtomArrayStack,
29
+ concatenate,
30
+ repeat,
31
+ )
32
+ from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
33
+ from biotite.structure.box import (
34
+ coord_to_fraction,
35
+ fraction_to_coord,
36
+ space_group_transforms,
37
+ unitcell_from_vectors,
38
+ vectors_from_unitcell,
39
+ )
40
+ from biotite.structure.error import BadStructureError
41
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
42
+ from biotite.structure.filter import (
43
+ _canonical_nucleotide_list as canonical_nucleotide_list,
44
+ )
45
+ from biotite.structure.filter import (
46
+ filter_first_altloc,
47
+ filter_highest_occupancy_altloc,
48
+ )
49
+ from biotite.structure.geometry import centroid
50
+ from biotite.structure.io.pdbx.bcif import (
51
+ BinaryCIFBlock,
52
+ BinaryCIFColumn,
53
+ BinaryCIFFile,
54
+ )
55
+ from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
56
+ from biotite.structure.io.pdbx.component import MaskValue
57
+ from biotite.structure.io.pdbx.encoding import StringArrayEncoding
58
+ from biotite.structure.repair import create_continuous_res_ids
59
+ from biotite.structure.residues import (
60
+ get_residue_count,
61
+ get_residue_positions,
62
+ get_residue_starts_for,
63
+ )
64
+ from biotite.structure.transform import AffineTransformation
65
+
66
+ # Bond types in `struct_conn` category that refer to covalent bonds
67
+ PDBX_BOND_TYPE_ID_TO_TYPE = {
68
+ # Although a covalent bond, could in theory have a higher bond order,
69
+ # practically inter-residue bonds are always single
70
+ "covale": BondType.SINGLE,
71
+ "covale_base": BondType.SINGLE,
72
+ "covale_phosphate": BondType.SINGLE,
73
+ "covale_sugar": BondType.SINGLE,
74
+ "disulf": BondType.SINGLE,
75
+ "modres": BondType.SINGLE,
76
+ "modres_link": BondType.SINGLE,
77
+ "metalc": BondType.COORDINATION,
78
+ }
79
+ PDBX_BOND_TYPE_TO_TYPE_ID = {
80
+ BondType.ANY: "covale",
81
+ BondType.SINGLE: "covale",
82
+ BondType.DOUBLE: "covale",
83
+ BondType.TRIPLE: "covale",
84
+ BondType.QUADRUPLE: "covale",
85
+ BondType.AROMATIC_SINGLE: "covale",
86
+ BondType.AROMATIC_DOUBLE: "covale",
87
+ BondType.AROMATIC_TRIPLE: "covale",
88
+ BondType.COORDINATION: "metalc",
89
+ }
90
+ PDBX_BOND_TYPE_TO_ORDER = {
91
+ BondType.SINGLE: "sing",
92
+ BondType.DOUBLE: "doub",
93
+ BondType.TRIPLE: "trip",
94
+ BondType.QUADRUPLE: "quad",
95
+ BondType.AROMATIC_SINGLE: "sing",
96
+ BondType.AROMATIC_DOUBLE: "doub",
97
+ BondType.AROMATIC_TRIPLE: "trip",
98
+ # These are masked later, it is merely added here to avoid a KeyError
99
+ BondType.ANY: "",
100
+ BondType.AROMATIC: "",
101
+ BondType.COORDINATION: "",
102
+ }
103
+ # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
104
+ COMP_BOND_ORDER_TO_TYPE = {
105
+ ("SING", "N"): BondType.SINGLE,
106
+ ("DOUB", "N"): BondType.DOUBLE,
107
+ ("TRIP", "N"): BondType.TRIPLE,
108
+ ("QUAD", "N"): BondType.QUADRUPLE,
109
+ ("SING", "Y"): BondType.AROMATIC_SINGLE,
110
+ ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
111
+ ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
112
+ ("AROM", "Y"): BondType.AROMATIC,
113
+ }
114
+ # ...and vice versa
115
+ COMP_BOND_TYPE_TO_ORDER = {
116
+ bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
117
+ }
118
+ CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
119
+ # It was observed that when the number of rows in `atom_site` and `struct_conn`
120
+ # exceeds a certain threshold,
121
+ # a dictionary approach is less computation and memory intensive than the dense
122
+ # vectorized approach.
123
+ # https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
124
+ FIND_MATCHES_SWITCH_THRESHOLD = 4000000
125
+
126
+ _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
127
+ _nucleotideseq_type_list = [
128
+ "polydeoxyribonucleotide",
129
+ "polyribonucleotide",
130
+ "polydeoxyribonucleotide/polyribonucleotide hybrid",
131
+ ]
132
+ _other_type_list = [
133
+ "cyclic-pseudo-peptide",
134
+ "other",
135
+ "peptide nucleic acid",
136
+ "polysaccharide(D)",
137
+ "polysaccharide(L)",
138
+ ]
139
+
140
+
141
def _filter(category, index):
    """
    Reduce the given category to the rows selected by the given index.

    Parameters
    ----------
    category
        The category to be reduced.
        Its type must provide ``subcomponent_class()`` giving the column
        type, which in turn provides ``subcomponent_class()`` giving the
        data type.
    index
        The index applied to the ``array`` of the data and, if present,
        of the mask of each column.

    Returns
    -------
    filtered_category
        A new object of the same type as `category`, containing the
        reduced columns under the same keys.
    """
    category_cls = type(category)
    column_cls = category_cls.subcomponent_class()
    data_cls = column_cls.subcomponent_class()

    reduced_columns = {}
    for name, column in category.items():
        reduced_data = data_cls(column.data.array[index])
        # A column without a mask stays without a mask
        if column.mask is not None:
            reduced_mask = data_cls(column.mask.array[index])
        else:
            reduced_mask = None
        reduced_columns[name] = column_cls(reduced_data, reduced_mask)
    return category_cls(reduced_columns)
158
+
159
+
160
def get_sequence(pdbx_file, data_block=None):
    """
    Read the protein and nucleotide sequences stored in the
    ``entity_poly.pdbx_seq_one_letter_code_can`` field.

    Supported polymer types (``_entity_poly.type``) are:
    ``'polypeptide(D)'``, ``'polypeptide(L)'``,
    ``'polydeoxyribonucleotide'``, ``'polyribonucleotide'`` and
    ``'polydeoxyribonucleotide/polyribonucleotide hybrid'``.
    Uracil is converted to Thymine.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    sequence_dict : Dictionary of Sequences
        Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
        (equivalent to ``atom_site.auth_asym_id``).
        Dictionary values are sequences.
        All strand IDs listed (comma-separated) for one entity map to
        the same sequence.
        Entities for which no sequence could be created are left out.

    Notes
    -----
    The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the
    initial complete sequence.
    If the structure represents a truncated or spliced version of this
    initial sequence, it will include only a subset of the initial
    sequence.
    Use biotite.structure.get_residues to retrieve only the residues
    that are represented in the structure.
    """
    block = _get_block(pdbx_file, data_block)
    entity_poly = block["entity_poly"]

    one_letter_codes = entity_poly["pdbx_seq_one_letter_code_can"].as_array(str)
    polymer_types = entity_poly["type"].as_array(str)
    sequences = [
        _convert_string_to_sequence(code, polymer_type)
        for code, polymer_type in zip(one_letter_codes, polymer_types)
    ]

    # Each entity may be represented by multiple comma-separated strands
    strand_id_lists = [
        joined_ids.split(",")
        for joined_ids in entity_poly["pdbx_strand_id"].as_array(str)
    ]

    sequence_dict = {}
    for sequence, chain_ids in zip(sequences, strand_id_lists):
        if sequence is None:
            continue
        for chain_id in chain_ids:
            sequence_dict[chain_id] = sequence
    return sequence_dict
220
+
221
+
222
def get_model_count(pdbx_file, data_block=None):
    """
    Count the models contained in a file.

    The count is the number of distinct values in the
    ``atom_site.pdbx_PDB_model_num`` field.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    model_count : int
        The number of models.
    """
    atom_site = _get_block(pdbx_file, data_block)["atom_site"]
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    return len(np.unique(model_numbers))
244
+
245
+
246
+ def get_structure(
247
+ pdbx_file,
248
+ model=None,
249
+ data_block=None,
250
+ altloc="first",
251
+ extra_fields=None,
252
+ use_author_fields=True,
253
+ include_bonds=False,
254
+ ):
255
+ """
256
+ Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
257
+ ``atom_site`` category in a file.
258
+
259
+ Parameters
260
+ ----------
261
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
262
+ The file object.
263
+ model : int, optional
264
+ If this parameter is given, the function will return an
265
+ :class:`AtomArray` from the atoms corresponding to the given
266
+ model number (starting at 1).
267
+ Negative values are used to index models starting from the last
268
+ model instead of the first model.
269
+ If this parameter is omitted, an :class:`AtomArrayStack`
270
+ containing all models will be returned, even if the structure
271
+ contains only one model.
272
+ data_block : str, optional
273
+ The name of the data block.
274
+ Default is the first (and most times only) data block of the
275
+ file.
276
+ If the data block object is passed directly to `pdbx_file`,
277
+ this parameter is ignored.
278
+ altloc : {'first', 'occupancy', 'all'}
279
+ This parameter defines how *altloc* IDs are handled:
280
+ - ``'first'`` - Use atoms that have the first *altloc* ID
281
+ appearing in a residue.
282
+ - ``'occupancy'`` - Use atoms that have the *altloc* ID
283
+ with the highest occupancy for a residue.
284
+ - ``'all'`` - Use all atoms.
285
+ Note that this leads to duplicate atoms.
286
+ When this option is chosen, the ``altloc_id`` annotation
287
+ array is added to the returned structure.
288
+ extra_fields : list of str, optional
289
+ The strings in the list are entry names, that are
290
+ additionally added as annotation arrays.
291
+ The annotation category name will be the same as the PDBx
292
+ subcategory name.
293
+ The array type is always `str`.
294
+ An exception are the special field identifiers:
295
+ ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
296
+ These will convert the fitting subcategory into an
297
+ annotation array with reasonable type.
298
+ use_author_fields : bool, optional
299
+ Some fields can be read from two alternative sources,
300
+ for example both, ``label_seq_id`` and ``auth_seq_id`` describe
301
+ the ID of the residue.
302
+ While, the ``label_xxx`` fields can be used as official pointers
303
+ to other categories in the file, the ``auth_xxx``
304
+ fields are set by the author(s) of the structure and are
305
+ consistent with the corresponding values in PDB files.
306
+ If `use_author_fields` is true, the annotation arrays will be
307
+ read from the ``auth_xxx`` fields (if applicable),
308
+ otherwise from the ``label_xxx`` fields.
309
+ If the requested field is not available, the respective other
310
+ field is taken as fallback.
311
+ include_bonds : bool, optional
312
+ If set to true, a :class:`BondList` will be created for the
313
+ resulting :class:`AtomArray` containing the bond information
314
+ from the file.
315
+ Inter-residue bonds, will be read from the ``struct_conn``
316
+ category.
317
+ Intra-residue bonds will be read from the ``chem_comp_bond``, if
318
+ available, otherwise they will be derived from the Chemical
319
+ Component Dictionary.
320
+
321
+ Returns
322
+ -------
323
+ array : AtomArray or AtomArrayStack
324
+ The return type depends on the `model` parameter.
325
+
326
+ Examples
327
+ --------
328
+
329
+ >>> import os.path
330
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
331
+ >>> arr = get_structure(file, model=1)
332
+ >>> print(len(arr))
333
+ 304
334
+ """
335
+ block = _get_block(pdbx_file, data_block)
336
+
337
+ extra_fields = set() if extra_fields is None else set(extra_fields)
338
+
339
+ atom_site = block.get("atom_site")
340
+ if atom_site is None:
341
+ raise InvalidFileError("Missing 'atom_site' category in file")
342
+
343
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
344
+ model_count = len(np.unique(models))
345
+ atom_count = len(models)
346
+
347
+ if model is None:
348
+ # For a stack, the annotations are derived from the first model
349
+ model_atom_site = _filter_model(atom_site, 1)
350
+ # Any field of the category would work here to get the length
351
+ model_length = model_atom_site.row_count
352
+ atoms = AtomArrayStack(model_count, model_length)
353
+
354
+ # Check if each model has the same amount of atoms
355
+ # If not, raise exception
356
+ if model_length * model_count != atom_count:
357
+ raise InvalidFileError(
358
+ "The models in the file have unequal "
359
+ "amount of atoms, give an explicit model "
360
+ "instead"
361
+ )
362
+
363
+ atoms.coord[:, :, 0] = (
364
+ atom_site["Cartn_x"]
365
+ .as_array(np.float32)
366
+ .reshape((model_count, model_length))
367
+ )
368
+ atoms.coord[:, :, 1] = (
369
+ atom_site["Cartn_y"]
370
+ .as_array(np.float32)
371
+ .reshape((model_count, model_length))
372
+ )
373
+ atoms.coord[:, :, 2] = (
374
+ atom_site["Cartn_z"]
375
+ .as_array(np.float32)
376
+ .reshape((model_count, model_length))
377
+ )
378
+
379
+ box = _get_box(block)
380
+ if box is not None:
381
+ # Duplicate same box for each model
382
+ atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
383
+
384
+ else:
385
+ if model == 0:
386
+ raise ValueError("The model index must not be 0")
387
+ # Negative models mean model indexing starting from last model
388
+ model = model_count + model + 1 if model < 0 else model
389
+ if model > model_count:
390
+ raise ValueError(
391
+ f"The file has {model_count} models, "
392
+ f"the given model {model} does not exist"
393
+ )
394
+
395
+ model_atom_site = _filter_model(atom_site, model)
396
+ # Any field of the category would work here to get the length
397
+ model_length = model_atom_site.row_count
398
+ atoms = AtomArray(model_length)
399
+
400
+ atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
401
+ atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
402
+ atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
403
+
404
+ atoms.box = _get_box(block)
405
+
406
+ # The below part is the same for both, AtomArray and AtomArrayStack
407
+ _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
408
+
409
+ atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
410
+
411
+ if include_bonds:
412
+ if altloc == "all":
413
+ raise ValueError(
414
+ "Bond computation is not supported with `altloc='all', consider using "
415
+ "'connect_via_residue_names()' afterwards"
416
+ )
417
+
418
+ if "chem_comp_bond" in block:
419
+ try:
420
+ custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
421
+ except KeyError:
422
+ warnings.warn(
423
+ "The 'chem_comp_bond' category has missing columns, "
424
+ "falling back to using Chemical Component Dictionary",
425
+ UserWarning,
426
+ )
427
+ custom_bond_dict = None
428
+ bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict)
429
+ else:
430
+ bonds = connect_via_residue_names(atoms)
431
+ if "struct_conn" in block:
432
+ bonds = bonds.merge(
433
+ _parse_inter_residue_bonds(
434
+ altloc_filtered_atom_site,
435
+ block["struct_conn"],
436
+ atom_count=atoms.array_length(),
437
+ )
438
+ )
439
+ atoms.bonds = bonds
440
+
441
+ return atoms
442
+
443
+
444
def _get_block(pdbx_component, block_name):
    """
    Resolve the data block a reader function should operate on.

    Parameters
    ----------
    pdbx_component : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        Either a block itself or a container of blocks.
    block_name : str or None
        Name of the block to select from a container.
        Ignored if `pdbx_component` already is a block.

    Returns
    -------
    block : CIFBlock or BinaryCIFBlock
        `pdbx_component` itself if it is a block, otherwise the entry
        `block_name` of the container or, if `block_name` is ``None``,
        the container's ``block`` attribute.
    """
    # A block was passed directly -> nothing to look up
    if isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
        return pdbx_component
    # Explicitly named block
    if block_name is not None:
        return pdbx_component[block_name]
    # No name given -> let the container choose its default block
    return pdbx_component.block
453
+
454
+
455
def _get_or_fallback(category, key, fallback_key):
    """
    Look up the column `key` in `category` and resort to the column
    `fallback_key`, if `key` is absent.

    Parameters
    ----------
    category : CIFCategory or BinaryCIFCategory
        The category to take the column from.
    key : str
        Name of the preferred column.
    fallback_key : str
        Name of the column used if `key` does not exist.

    Returns
    -------
    column
        The column stored under `key` or, if missing, under
        `fallback_key`.

    Raises
    ------
    InvalidFileError
        If neither `key` nor `fallback_key` is present.

    Warns
    -----
    UserWarning
        If `key` is absent, i.e. whenever the fallback is attempted.
    """
    if key in category:
        return category[key]

    # The warning is issued before it is known whether the fallback exists
    warnings.warn(
        f"Attribute '{key}' not found within 'atom_site' category. "
        f"The fallback attribute '{fallback_key}' will be used instead",
        UserWarning,
    )
    try:
        fallback_column = category[fallback_key]
    except KeyError as key_exc:
        raise InvalidFileError(
            f"Fallback attribute '{fallback_key}' not found within "
            "'atom_site' category"
        ) from key_exc
    return fallback_column
474
+
475
+
476
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
    """Fill atom_site annotations in atom array or atom array stack.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        Atom array or stack which will be annotated.
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category with values for one model.
    extra_fields : iterable of str or None
        Entry names, that are additionally added as annotation arrays.
        The special names ``'atom_id'``, ``'b_factor'``, ``'occupancy'``
        and ``'charge'`` are converted into typed annotations, all other
        names are read as string column of the same name.
        The given collection is not modified.
    use_author_fields : bool
        Define if alternate fields prefixed with ``auth_`` should be used
        instead of ``label_``.
    """
    # Work on a de-duplicated, order-preserving private copy:
    # the caller's collection stays untouched and duplicate entries
    # cannot leak into the custom field handling below
    requested_fields = dict.fromkeys(() if extra_fields is None else extra_fields)
    special_fields = ("atom_id", "b_factor", "occupancy", "charge")

    prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth")

    array.set_annotation(
        "chain_id",
        _get_or_fallback(
            atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
        ).as_array(str),
    )
    array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
    array.set_annotation(
        "res_name",
        _get_or_fallback(
            atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
        ).as_array(str),
    )
    array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM")
    array.set_annotation(
        "atom_name",
        _get_or_fallback(
            atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
        ).as_array(str),
    )
    array.set_annotation("element", atom_site["type_symbol"].as_array(str))

    # Special handling for `res_id`, as the `label_seq_id` is equal (`.`) for all
    # hetero residues, which makes distinguishing subsequent residues from another
    # difficult (https://github.com/biotite-dev/biotite/issues/553)
    res_id = _get_or_fallback(
        atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
    ).as_array(int, -1)
    if not use_author_fields and "auth_seq_id" in atom_site:
        # Therefore, the `auth_seq_id` is still used to determine residue starts
        # in `create_continuous_res_ids()`, even if `use_author_fields = False`.
        res_id_for_residue_starts = atom_site["auth_seq_id"].as_array(int, -1)
        # Temporarily set the author IDs, as `create_continuous_res_ids()`
        # works on the annotations of `array`
        array.set_annotation("res_id", res_id_for_residue_starts)
        fallback_res_ids = create_continuous_res_ids(array)
        # Masked `label_seq_id` values (-1) are replaced by the fallback IDs
        array.set_annotation("res_id", np.where(res_id == -1, fallback_res_ids, res_id))
    else:
        array.set_annotation("res_id", res_id)

    if "atom_id" in requested_fields:
        if "id" in atom_site:
            array.set_annotation("atom_id", atom_site["id"].as_array(int))
        else:
            warnings.warn(
                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
                UserWarning,
            )
            array.set_annotation("atom_id", np.arange(array.array_length()))
    if "b_factor" in requested_fields:
        if "B_iso_or_equiv" in atom_site:
            array.set_annotation(
                "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
            )
        else:
            warnings.warn(
                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
                UserWarning,
            )
            array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
    if "occupancy" in requested_fields:
        if "occupancy" in atom_site:
            array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
        else:
            warnings.warn(
                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
                UserWarning,
            )
            array.set_annotation(
                "occupancy", np.ones(array.array_length(), dtype=float)
            )
    if "charge" in requested_fields:
        if "pdbx_formal_charge" in atom_site:
            array.set_annotation(
                "charge",
                atom_site["pdbx_formal_charge"].as_array(
                    int, 0
                ),  # masked values are set to 0
            )
        else:
            warnings.warn(
                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
                UserWarning,
            )
            array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))

    # Handle all remaining custom fields
    for field in requested_fields:
        if field in special_fields:
            # Already handled above
            continue
        array.set_annotation(field, atom_site[field].as_array(str))
585
+
586
+
587
def _parse_intra_residue_bonds(chem_comp_bond):
    """
    Build a `custom_bond_dict`, as accepted by
    :func:`connect_via_residue_names()`, from the ``chem_comp_bond``
    category.

    Parameters
    ----------
    chem_comp_bond : CIFCategory or BinaryCIFCategory
        The ``chem_comp_bond`` category.

    Returns
    -------
    custom_bond_dict : dict
        Maps each residue name to a dictionary, which in turn maps a
        tuple of two atom names to the bond type.
        Combinations of bond order and aromatic flag that are not listed
        in ``COMP_BOND_ORDER_TO_TYPE`` become ``BondType.ANY``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # The columns are accessed in this order, each as string array
    column_names = (
        "comp_id",
        "atom_id_1",
        "atom_id_2",
        "value_order",
        "pdbx_aromatic_flag",
    )
    columns = [chem_comp_bond[name].as_array(str) for name in column_names]

    bonds_per_residue = {}
    for res_name, atom_1, atom_2, order, aromatic_flag in zip(*columns):
        bond_type = COMP_BOND_ORDER_TO_TYPE.get(
            (order.upper(), aromatic_flag), BondType.ANY
        )
        residue_bonds = bonds_per_residue.setdefault(res_name, {})
        residue_bonds[atom_1.item(), atom_2.item()] = bond_type
    return bonds_per_residue
607
+
608
+
609
def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
    """
    Create inter-residue bonds by parsing the ``struct_conn`` category.
    The atom indices of each bond are found by matching the bond labels
    to the ``atom_site`` category.
    If atom_count is None, it will be inferred from the ``atom_site`` category.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category the bond partners are looked up in.
    struct_conn : CIFCategory or BinaryCIFCategory
        The ``struct_conn`` category describing the connections.
    atom_count : int, optional
        The atom count passed to the :class:`BondList` constructor.
        By default ``atom_site.row_count`` is used.

    Returns
    -------
    bonds : BondList
        The bonds whose type ID is listed in ``PDBX_BOND_TYPE_ID_TO_TYPE``
        and whose both partners could be found in `atom_site`.
    """
    # Identity symmetry operation
    IDENTITY = "1_555"
    # Columns in 'atom_site' that should be matched by 'struct_conn'
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "label_alt_id",
        "auth_asym_id",
        "auth_comp_id",
        "auth_seq_id",
        "pdbx_PDB_ins_code",
    ]

    # Only keep connections whose type has a known bond type...
    covale_mask = np.isin(
        struct_conn["conn_type_id"].as_array(str),
        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
    )
    # ...and where both partners use the identity symmetry operation
    # (`IDENTITY` is also passed as second argument to `as_array()`,
    # presumably the replacement for masked values, so that missing
    # symmetry information counts as identity)
    if "ptnr1_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
    if "ptnr2_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY

    # Find the 'atom_site' row index for each of the two bond partners
    atom_indices = [None] * 2
    for i in range(2):
        reference_arrays = []
        query_arrays = []
        for col_name in COLUMNS:
            struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1)
            # Only columns that exist in both categories can be compared
            if col_name not in atom_site or struct_conn_col_name not in struct_conn:
                continue
            # Ensure both arrays have the same dtype to allow comparison
            reference = atom_site[col_name].as_array()
            dtype = reference.dtype
            query = struct_conn[struct_conn_col_name].as_array(dtype)
            if np.issubdtype(reference.dtype, str):
                # The mask value is not necessarily consistent
                # between query and reference
                # -> make it consistent
                reference[reference == "?"] = "."
                query[query == "?"] = "."
            reference_arrays.append(reference)
            query_arrays.append(query[covale_mask])
        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
        # in 'atom_site' and 'struct_conn'
        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
    atoms_indices_1 = atom_indices[0]
    atoms_indices_2 = atom_indices[1]

    # Some bonds in 'struct_conn' may not be found in 'atom_site'
    # This is okay,
    # as 'atom_site' might already be reduced to a single model
    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]

    bond_type_id = struct_conn["conn_type_id"].as_array()
    # Consecutively apply the same masks as applied to the atom indices
    # Logical combination does not work here,
    # as the second mask was created based on already filtered data
    bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
    # The type ID is always present in the dictionary,
    # as it was used to filter the applicable bonds
    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]

    # One row per bond: (first atom index, second atom index, bond type)
    return BondList(
        atom_count if atom_count is not None else atom_site.row_count,
        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
    )
686
+
687
+
688
def _find_matches(query_arrays, reference_arrays):
    """
    For each row of the `query_arrays` find the row index in the
    `reference_arrays`, where the values of all columns are equal.

    Parameters
    ----------
    query_arrays, reference_arrays : list of ndarray
        One array per column.
        Arrays at the same list position are compared with each other.

    Returns
    -------
    match_indices : ndarray
        The matching reference index for each query row.
        If no match is found for a query, the corresponding index is -1.

    Notes
    -----
    The dense array implementation is used, as long as the number of
    query rows multiplied by the number of reference rows does not
    exceed ``FIND_MATCHES_SWITCH_THRESHOLD``, otherwise the dictionary
    based implementation is chosen.
    """
    n_comparisons = query_arrays[0].shape[0] * reference_arrays[0].shape[0]
    if n_comparisons > FIND_MATCHES_SWITCH_THRESHOLD:
        return _find_matches_by_dict(query_arrays, reference_arrays)
    return _find_matches_by_dense_array(query_arrays, reference_arrays)
702
+
703
+
704
def _find_matches_by_dense_array(query_arrays, reference_arrays):
    """
    Find the matching reference row for each query row by comparing
    every query row with every reference row.

    Parameters
    ----------
    query_arrays, reference_arrays : list of ndarray
        One array per column.
        Arrays at the same list position are compared with each other.

    Returns
    -------
    match_indices : ndarray, dtype=int
        The matching reference index for each query row or -1, if there
        is no match.

    Raises
    ------
    InvalidFileError
        If a query row matches more than one reference row.
    """
    # One boolean (query x reference) matrix per column
    equal_per_column = [
        query[:, np.newaxis] == reference[np.newaxis, :]
        for query, reference in zip(query_arrays, reference_arrays)
    ]
    # A row only matches, if the values in all columns agree
    row_matches = np.stack(equal_per_column, axis=-1).all(axis=-1)
    hit_queries, hit_references = np.where(row_matches)

    # Duplicate matches indicate that an atom from the query cannot
    # be uniquely matched to an atom in the reference
    matched_queries, hit_counts = np.unique(hit_queries, return_counts=True)
    is_ambiguous = hit_counts > 1
    if is_ambiguous.any():
        ambiguous_query = matched_queries[is_ambiguous][0]
        raise InvalidFileError(
            f"The covalent bond in the 'struct_conn' category at index "
            f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
            f"the 'atom_site' category"
        )

    # -1 indicates that no match was found in the reference
    match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
    match_indices[hit_queries] = hit_references
    return match_indices
730
+
731
+
732
def _find_matches_by_dict(query_arrays, reference_arrays):
    """
    Find the matching reference row for each query row using a
    dictionary built from the reference rows.

    Parameters
    ----------
    query_arrays, reference_arrays : list of ndarray
        One array per column.
        Arrays at the same list position are compared with each other.

    Returns
    -------
    match_indices : ndarray, dtype=int
        The matching reference index for each query row or -1, if there
        is no match.
        The array has an integer dtype, even if there are no query rows.

    Raises
    ------
    InvalidFileError
        If a query row matches a reference row that appears more than
        once.
    """
    # Convert reference arrays to a dictionary for O(1) lookups
    reference_dict = {}
    ambiguous_keys = set()
    for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
        ref_key = tuple(ref_row)
        if ref_key in reference_dict:
            # Keep the first occurrence, but remember the ambiguity
            ambiguous_keys.add(ref_key)
            continue
        reference_dict[ref_key] = ref_idx

    match_indices = []
    for query_idx, query_row in enumerate(zip(*query_arrays)):
        query_key = tuple(query_row)
        occurrence = reference_dict.get(query_key)

        if occurrence is None:
            # -1 indicates that no match was found in the reference
            match_indices.append(-1)
        elif query_key in ambiguous_keys:
            # The query cannot be uniquely matched to an atom in the reference
            raise InvalidFileError(
                f"The covalent bond in the 'struct_conn' category at index "
                f"{query_idx} cannot be unambiguously assigned to atoms in "
                f"the 'atom_site' category"
            )
        else:
            match_indices.append(occurrence)

    # Explicit dtype: an empty list would otherwise become a float array,
    # which is not usable as index array and differs from the result of
    # the dense array implementation
    return np.array(match_indices, dtype=int)
762
+
763
+
764
def _get_struct_conn_col_name(col_name, partner):
    """
    Translate a column name of ``atom_site`` into the name of the
    corresponding column in ``struct_conn``.

    Parameters
    ----------
    col_name : str
        The column name in ``atom_site``.
    partner : int
        The number of the bond partner, inserted after ``ptnr``.

    Returns
    -------
    struct_conn_col_name : str
        The column name in ``struct_conn``.
    """
    pdbx_prefix = "pdbx_"
    partner_tag = f"ptnr{partner}"

    # The altloc ID gets a 'pdbx_' prefix although it has none in 'atom_site'
    if col_name == "label_alt_id":
        return f"{pdbx_prefix}{partner_tag}_label_alt_id"
    if col_name.startswith(pdbx_prefix):
        # 'pdbx_' stays in front, the partner tag follows directly after it
        remainder = col_name[len(pdbx_prefix):]
        return f"{pdbx_prefix}{partner_tag}_{remainder}"
    return f"{partner_tag}_{col_name}"
776
+
777
+
778
def _filter_altloc(array, atom_site, altloc):
    """
    Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
    specified by the given *altloc* identifier.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure to be filtered.
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category the structure was created from.
    altloc : {'first', 'occupancy', 'all'}
        ``'all'`` keeps all atoms and adds the ``altloc_id`` annotation.
        ``'occupancy'`` applies :func:`filter_highest_occupancy_altloc()`
        and ``'first'`` applies :func:`filter_first_altloc()`.
        If the category has no ``occupancy`` column, ``'occupancy'``
        behaves like ``'first'``.

    Returns
    -------
    array : AtomArray or AtomArrayStack
        The filtered structure.
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category filtered to the same rows.

    Raises
    ------
    ValueError
        If the category contains altloc IDs and `altloc` is not one of
        the supported options.
    """
    altloc_ids = atom_site.get("label_alt_id")
    occupancy = atom_site.get("occupancy")

    if altloc == "all":
        if altloc_ids is None:
            # The file has no 'label_alt_id' column at all
            # -> annotate every atom with the placeholder '.' instead of failing
            altloc_annotation = np.full(array.array_length(), ".")
        else:
            altloc_annotation = altloc_ids.as_array(str)
        array.set_annotation("altloc_id", altloc_annotation)
        return array, atom_site
    elif altloc_ids is None or (
        altloc_ids.mask is not None
        and (altloc_ids.mask.array != MaskValue.PRESENT).all()
    ):
        # No altlocs in atom_site category
        return array, atom_site
    elif altloc == "occupancy" and occupancy is not None:
        mask = filter_highest_occupancy_altloc(
            array, altloc_ids.as_array(str), occupancy.as_array(float)
        )
        return array[..., mask], _filter(atom_site, mask)
    # 'first' is also fallback if file has no occupancy information:
    # 'occupancy' only reaches this branch, if the occupancy column is missing
    elif altloc in ("first", "occupancy"):
        mask = filter_first_altloc(array, altloc_ids.as_array(str))
        return array[..., mask], _filter(atom_site, mask)
    else:
        raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
806
+
807
+
808
def _filter_model(atom_site, model):
    """
    Restrict the ``atom_site`` category to the rows of a single model.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category containing all models.
    model : int
        The model to be selected.
        The value is used as 1-based position among the models ordered
        by their first appearance in the category.

    Returns
    -------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category containing only the selected model.
    """
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    # Row index of the first atom of each model, in order of appearance
    first_rows = np.sort(np.unique(model_numbers, return_index=True)[1])
    # The total row count serves as exclusive stop of the last model
    boundaries = np.append(first_rows, [atom_site.row_count])
    # Model numbering starts at 1, while indexing starts at 0
    position = model - 1
    rows = slice(boundaries[position], boundaries[position + 1])
    return _filter(atom_site, rows)
834
+
835
+
836
def _get_box(block):
    """
    Read the box vectors from the ``cell`` category of the given block.

    Parameters
    ----------
    block : CIFBlock or BinaryCIFBlock
        The block to read the ``cell`` category from.

    Returns
    -------
    box : ndarray or None
        The result of :func:`vectors_from_unitcell()` for the cell
        lengths and the cell angles converted from degrees to radians.
        ``None``, if the block has no ``cell`` category or if one of the
        values cannot be converted into a float.
    """
    cell = block.get("cell")
    if cell is None:
        return None

    length_names = ("length_a", "length_b", "length_c")
    angle_names = ("angle_alpha", "angle_beta", "angle_gamma")
    try:
        lengths = [float(cell[name].as_item()) for name in length_names]
        angles = [np.deg2rad(float(cell[name].as_item())) for name in angle_names]
    except ValueError:
        # The category has no proper unit cell values, e.g. '?'
        return None
    return vectors_from_unitcell(*lengths, *angles)
853
+
854
+
855
def set_structure(
    pdbx_file,
    array,
    data_block=None,
    include_bonds=False,
    extra_fields=None,
):
    """
    Set the ``atom_site`` category with atom information from an
    :class:`AtomArray` or :class:`AtomArrayStack`.

    This will save the coordinates, the mandatory annotation categories
    and the optional annotation categories
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of continuous
    numbering.
    Furthermore, if the structure has bonds, inter-residue bonds will be
    written into the ``struct_conn`` category and intra-residue bonds
    into the ``chem_comp_bond`` category.
    If the structure has a box, it is written into the ``cell`` category.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray or AtomArrayStack
        The structure to be written. If a stack is given, each array in
        the stack will be in a separate model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
        If the file is empty, a new data block will be created.
    include_bonds : bool, optional
        DEPRECATED: Has no effect anymore.
    extra_fields : list of str, optional
        List of additional fields from the ``atom_site`` category
        that should be written into the file.
        By default (``None``) no additional fields are written,
        which is equivalent to an empty list.

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack` or if a name in `extra_fields` is
        reserved.

    Notes
    -----
    In some cases, the written inter-residue bonds cannot be read again
    due to ambiguity to which atoms the bond refers.
    This is the case, when two equal residues in the same chain have
    the same (or a masked) `res_id`.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile()
    >>> set_structure(file, atom_array)
    >>> file.write(os.path.join(path_to_directory, "structure.cif"))
    """
    # `None` replaces a mutable `[]` default, that would be shared between calls
    if extra_fields is None:
        extra_fields = []

    if include_bonds:
        warnings.warn(
            "`include_bonds` parameter is deprecated, "
            "intra-residue are always written, if available",
            DeprecationWarning,
        )

    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    # Use the category/column classes that fit the given file type
    Category = block.subcomponent_class()
    Column = Category.subcomponent_class()

    # Fill PDBx columns from information
    # in structures' attribute arrays as good as possible
    atom_site = Category()
    atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM")
    atom_site["type_symbol"] = np.copy(array.element)
    atom_site["label_atom_id"] = np.copy(array.atom_name)
    atom_site["label_alt_id"] = Column(
        # AtomArrays do not store altloc atoms
        np.full(array.array_length(), "."),
        np.full(array.array_length(), MaskValue.INAPPLICABLE),
    )
    atom_site["label_comp_id"] = np.copy(array.res_name)
    atom_site["label_asym_id"] = np.copy(array.chain_id)
    atom_site["label_entity_id"] = (
        np.copy(array.label_entity_id)
        if "label_entity_id" in array.get_annotation_categories()
        else _determine_entity_id(array.chain_id)
    )
    atom_site["label_seq_id"] = np.copy(array.res_id)
    atom_site["pdbx_PDB_ins_code"] = Column(
        np.copy(array.ins_code),
        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT),
    )
    # The structure has only one set of annotations
    # -> the 'auth_xxx' columns are equal to the 'label_xxx' columns
    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
    atom_site["auth_atom_id"] = atom_site["label_atom_id"]

    annot_categories = array.get_annotation_categories()
    if "atom_id" in annot_categories:
        atom_site["id"] = np.copy(array.atom_id)
    if "b_factor" in annot_categories:
        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
    if "occupancy" in annot_categories:
        atom_site["occupancy"] = np.copy(array.occupancy)
    if "charge" in annot_categories:
        # A charge of 0 is written as missing value
        atom_site["pdbx_formal_charge"] = Column(
            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
        )

    # Handle all remaining custom fields
    if len(extra_fields) > 0:
        # ... check to avoid clashes with standard annotations
        _standard_annotations = [
            "hetero",
            "element",
            "atom_name",
            "res_name",
            "chain_id",
            "res_id",
            "ins_code",
            "atom_id",
            "b_factor",
            "occupancy",
            "charge",
        ]
        _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations

        for annot in extra_fields:
            if annot in _reserved_annotation_names:
                raise ValueError(
                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
                    "Please choose another name."
                )
            atom_site[annot] = np.copy(array.get_annotation(annot))

    # The bond categories are created before the coordinate columns are
    # added and before the category is repeated for multiple models
    if array.bonds is not None:
        struct_conn = _set_inter_residue_bonds(array, atom_site)
        if struct_conn is not None:
            block["struct_conn"] = struct_conn
        chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
        if chem_comp_bond is not None:
            block["chem_comp_bond"] = chem_comp_bond

    # In case of a single model handle each coordinate
    # simply like a flattened array
    if isinstance(array, AtomArray) or (
        isinstance(array, AtomArrayStack) and array.stack_depth() == 1
    ):
        # 'ravel' flattens coord without copy
        # in case of stack with stack_depth = 1
        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
        atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32)
    # In case of multiple models repeat annotations
    # and use model specific coordinates
    else:
        atom_site = _repeat(atom_site, array.stack_depth())
        coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3))
        atom_site["Cartn_x"] = np.copy(coord[:, 0])
        atom_site["Cartn_y"] = np.copy(coord[:, 1])
        atom_site["Cartn_z"] = np.copy(coord[:, 2])
        atom_site["pdbx_PDB_model_num"] = np.repeat(
            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
            repeats=array.array_length(),
        )
    if "atom_id" not in annot_categories:
        # Count from 1
        atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1)
    block["atom_site"] = atom_site

    # Write box into file
    if array.box is not None:
        # PDBx files can only store one box for all models
        # -> Use first box
        if array.box.ndim == 3:
            box = array.box[0]
        else:
            box = array.box
        len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
        cell = Category()
        cell["length_a"] = len_a
        cell["length_b"] = len_b
        cell["length_c"] = len_c
        cell["angle_alpha"] = np.rad2deg(alpha)
        cell["angle_beta"] = np.rad2deg(beta)
        cell["angle_gamma"] = np.rad2deg(gamma)
        block["cell"] = cell
1044
+
1045
+
1046
def _check_non_empty(array):
    """
    Ensure that the given structure contains at least one atom
    (and, in case of a stack, at least one model).

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure to check.

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
1058
+
1059
+
1060
def _get_or_create_block(pdbx_component, block_name):
    """
    Get the data block to write into and create it, if it does not exist.

    Parameters
    ----------
    pdbx_component : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        Either a file or a block.
    block_name : str or None
        The name of the block within the file.
        If ``None``, the first key of the file is used or, if the file
        is empty, the name ``'structure'``.
        Ignored if `pdbx_component` is not a file.

    Returns
    -------
    block : CIFBlock or BinaryCIFBlock
        The (possibly newly created) block of the file or
        `pdbx_component` itself, if it is not a file.
    """
    Block = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        # Take the first existing block;
        # if the file is empty, invent a new block name
        block_name = (
            next(iter(pdbx_component.keys()))
            if len(pdbx_component) > 0
            else "structure"
        )

    if block_name not in pdbx_component:
        pdbx_component[block_name] = Block()
    return pdbx_component[block_name]
1078
+
1079
+
1080
def _determine_entity_id(chain_id):
    """
    Derive an entity ID for each atom from its chain ID.

    Parameters
    ----------
    chain_id : ndarray
        The chain ID of each atom.

    Returns
    -------
    entity_id : ndarray, dtype=int
        The entity ID of each atom.
        Each distinct chain ID gets its own entity ID, numbered from 1
        in the order the chain IDs appear for the first time.
    """
    # Translates each chain ID seen so far into its entity ID
    entity_of_chain = {}
    entity_id = np.zeros(len(chain_id), dtype=int)
    for i, chain in enumerate(chain_id):
        # A chain ID not seen before gets the next free entity ID
        entity_id[i] = entity_of_chain.setdefault(chain, len(entity_of_chain) + 1)
    return entity_id
1094
+
1095
+
1096
def _repeat(category, repetitions):
    """
    Create a new category, in which the data (and mask, if present) of
    each column of the given category is tiled `repetitions` times.

    Parameters
    ----------
    category : CIFCategory or BinaryCIFCategory
        The category to be repeated.
    repetitions : int
        The number of repetitions passed to :func:`numpy.tile()`.

    Returns
    -------
    repeated : CIFCategory or BinaryCIFCategory
        A new category of the same type as `category`.
    """
    # Derive the column and data classes from the type of the input category
    Category = type(category)
    Column = Category.subcomponent_class()
    Data = Column.subcomponent_class()

    category_dict = {}
    for key, column in category.items():
        if isinstance(column, BinaryCIFColumn):
            # Keep the encoding of the original data
            data_encoding = column.data.encoding
            # Optimization: The repeated string array has the same
            # unique values, as the original string array
            # -> Use same unique values (faster due to shorter array)
            # NOTE(review): this assigns to the encoding object obtained from
            # the input column - presumably it is shared with `column`, confirm
            if isinstance(data_encoding[0], StringArrayEncoding):
                data_encoding[0].strings = np.unique(column.data.array)
            data = Data(np.tile(column.data.array, repetitions), data_encoding)
        else:
            data = Data(np.tile(column.data.array, repetitions))
        # The mask is optional and repeated in the same way as the data
        mask = (
            Data(np.tile(column.mask.array, repetitions))
            if column.mask is not None
            else None
        )
        category_dict[key] = Column(data, mask)
    return Category(category_dict)
1120
+
1121
+
1122
def _set_intra_residue_bonds(array, atom_site):
    """
    Build the ``chem_comp_bond`` category from the intra-residue bonds
    of the given structure.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure containing the bonds.
    atom_site : CIFCategory or BinaryCIFCategory
        Only used to infer the right :class:`Category` type
        (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).

    Returns
    -------
    chem_comp_bond : CIFCategory or BinaryCIFCategory or None
        The category with one row per unique combination of residue
        name and the two atom names.
        ``None``, if there are no intra-residue bonds.

    Raises
    ------
    BadStructureError
        If the structure contains an empty residue name or atom name.
    """
    if (array.res_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty residue name, "
            "but it is required to write intra-residue bonds"
        )
    if (array.atom_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty atom name, "
            "but it is required to write intra-residue bonds"
        )

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    intra_bonds = _filter_bonds(array, "intra")
    n_intra_bonds = len(intra_bonds)
    if n_intra_bonds == 0:
        return None

    bond_types = intra_bonds[:, 2]
    is_any = bond_types == BondType.ANY
    value_order = np.zeros(n_intra_bonds, dtype="U4")
    aromatic_flag = np.zeros(n_intra_bonds, dtype="U1")
    for i, bond_type in enumerate(bond_types):
        if is_any[i]:
            # ANY bonds will be masked anyway, no need to set the value
            continue
        value_order[i], aromatic_flag[i] = COMP_BOND_TYPE_TO_ORDER[bond_type]

    first_atoms = intra_bonds[:, 0]
    second_atoms = intra_bonds[:, 1]
    atom_id_1 = array.atom_name[first_atoms]
    atom_id_2 = array.atom_name[second_atoms]
    # Both atoms of an intra-residue bond share the residue name,
    # hence it is taken from the first atom
    comp_id = array.res_name[first_atoms]

    # A residue that appears multiple times in the structure gives
    # duplicate residue/atom name combinations -> keep only the first
    # occurrence of each, in original order
    bond_labels = np.stack([comp_id, atom_id_1, atom_id_2], axis=-1)
    keep = np.sort(np.unique(bond_labels, axis=0, return_index=True)[1])
    n_bonds = len(keep)

    chem_comp_bond = Category()
    chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32)
    chem_comp_bond["comp_id"] = comp_id[keep]
    chem_comp_bond["atom_id_1"] = atom_id_1[keep]
    chem_comp_bond["atom_id_2"] = atom_id_2[keep]
    # Bond order and aromaticity are unknown for ANY bonds -> mask them
    chem_comp_bond["value_order"] = Column(
        value_order[keep],
        np.where(is_any[keep], MaskValue.MISSING, MaskValue.PRESENT),
    )
    chem_comp_bond["pdbx_aromatic_flag"] = Column(
        aromatic_flag[keep],
        np.where(is_any[keep], MaskValue.MISSING, MaskValue.PRESENT),
    )
    # No stereo information is available from the bonds
    # -> the entire column is masked as missing
    chem_comp_bond["pdbx_stereo_config"] = Column(
        np.zeros(n_bonds, dtype="U1"),
        np.full(n_bonds, MaskValue.MISSING),
    )
    return chem_comp_bond
1190
+
1191
+
1192
def _set_inter_residue_bonds(array, atom_site):
    """
    Build the ``struct_conn`` category that describes the bonds between
    different residues.

    The bonded atoms are referenced via their annotations in the given
    ``atom_site`` category.
    ``None`` is returned, if the structure has no inter-residue bonds or
    if all of them are 'standard' backbone links.
    """
    # The 'atom_site' columns that identify a bond partner
    PARTNER_COLUMNS = (
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "pdbx_PDB_ins_code",
    )

    # Create the new category with the same classes 'atom_site' is built from
    Category = type(atom_site)
    Column = Category.subcomponent_class()

    inter_bonds = _filter_bonds(array, "inter")
    if len(inter_bonds) == 0:
        return None

    # 'Standard' links, i.e. backbone bonds between adjacent canonical
    # nucleotide/amino acid residues, are not written
    is_canonical = _filter_canonical_links(array, inter_bonds)
    inter_bonds = inter_bonds[~is_canonical]
    if len(inter_bonds) == 0:
        return None

    bond_types = inter_bonds[:, 2]

    struct_conn = Category()
    struct_conn["id"] = np.arange(1, len(inter_bonds) + 1)
    struct_conn["conn_type_id"] = [
        PDBX_BOND_TYPE_TO_TYPE_ID[bond_type] for bond_type in bond_types
    ]
    # The bond order is marked as missing for 'ANY' and 'COORDINATION' bonds
    struct_conn["pdbx_value_order"] = Column(
        np.array([PDBX_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_types]),
        np.where(
            np.isin(bond_types, (BondType.ANY, BondType.COORDINATION)),
            MaskValue.MISSING,
            MaskValue.PRESENT,
        ),
    )
    # Copy each identifying annotation for the first and second bond partner
    for col_name in PARTNER_COLUMNS:
        annot = atom_site[col_name].as_array()
        for partner in (1, 2):
            partner_annot = annot[inter_bonds[:, partner - 1]]
            struct_conn[_get_struct_conn_col_name(col_name, partner)] = partner_annot
    return struct_conn
1243
+
1244
+
1245
def _filter_bonds(array, connection):
    """
    Select either the intra-residue or the inter-residue bonds of a
    structure.

    `connection` must be either ``"intra"`` or ``"inter"``, otherwise a
    ``ValueError`` is raised.
    The selected bonds are rows of ``array.bonds.as_array()``.
    """
    all_bonds = array.bonds.as_array()
    # A single 'get_residue_starts_for()' call covers the first and second
    # atom of each bond, which saves computation time
    partner_indices = all_bonds[:, :2].flatten()
    residue_starts = get_residue_starts_for(array, partner_indices).reshape(-1, 2)
    # Both atoms are part of the same residue, if their residue starts agree
    is_intra = residue_starts[:, 0] == residue_starts[:, 1]

    if connection == "intra":
        return all_bonds[is_intra]
    if connection == "inter":
        return all_bonds[~is_intra]
    raise ValueError("Invalid 'connection' option")
1262
+
1263
+
1264
def _filter_canonical_links(array, bond_array):
    """
    Identify the 'standard' backbone links between adjacent canonical
    amino acid or nucleotide residues.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray
        The bonds to check.
        The first two columns contain the indices of the bonded atoms.

    Returns
    -------
    mask : ndarray, dtype=bool
        True for each bond in `bond_array`, where both residues are listed
        in ``CANONICAL_RESIDUE_LIST``, the first atom is named ``C`` or
        ``O3'``, the second atom is named ``N`` or ``P`` and the residue
        of the second atom directly follows the residue of the first atom.
    """
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
        -1, 2
    )

    return (
        # Must be canonical residues
        np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
        np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
        # Must be backbone bond
        np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
        np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
        # Must connect adjacent residues
        # The parentheses are required: '&' binds tighter than '==', so
        # without them the whole conjunction would be bitwise-combined with
        # the index difference first, accepting every odd residue distance
        (residue_indices[:, 1] - residue_indices[:, 0] == 1)
    )  # fmt: skip
1283
+
1284
+
1285
def get_component(
    pdbx_file,
    data_block=None,
    use_ideal_coord=True,
    res_name=None,
    allow_missing_coord=False,
):
    """
    Create an :class:`AtomArray` for a chemical component from the
    ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
    category in a file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    use_ideal_coord : bool, optional
        If true, the *ideal* coordinates are read from the file
        (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
        originating from computations.
        If set to false, alternative coordinates are read
        (``model_Cartn_<dim>`` fields).
        If the selected fields are absent or contain missing values,
        a warning is raised and the respective other fields are used as
        fallback.
    res_name : str, optional
        In rare cases the categories may contain rows for multiple
        components.
        In this case, the component with the given residue name is
        read.
        By default, all rows would be read in this case.
    allow_missing_coord : bool, optional
        Whether to allow missing coordinate values in components.
        If ``True``, these will be represented as ``nan`` values.
        If ``False``, a ``ValueError`` is raised when missing coordinates
        are encountered.

    Returns
    -------
    array : AtomArray
        The parsed chemical component.

    Raises
    ------
    InvalidFileError
        If the ``chem_comp_atom`` category is missing.
    KeyError
        If `res_name` is given, but ``chem_comp_atom`` has no row for it.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(
    ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
    ... )
    >>> comp = get_component(file)
    >>> print(comp)
    HET 0 TYR N N 1.320 0.952 1.428
    HET 0 TYR CA C -0.018 0.429 1.734
    HET 0 TYR C C -0.103 0.094 3.201
    HET 0 TYR O O 0.886 -0.254 3.799
    HET 0 TYR CB C -0.274 -0.831 0.907
    HET 0 TYR CG C -0.189 -0.496 -0.559
    HET 0 TYR CD1 C 1.022 -0.589 -1.219
    HET 0 TYR CD2 C -1.324 -0.102 -1.244
    HET 0 TYR CE1 C 1.103 -0.282 -2.563
    HET 0 TYR CE2 C -1.247 0.210 -2.587
    HET 0 TYR CZ C -0.032 0.118 -3.252
    HET 0 TYR OH O 0.044 0.420 -4.574
    HET 0 TYR OXT O -1.279 0.184 3.842
    HET 0 TYR H H 1.977 0.225 1.669
    HET 0 TYR H2 H 1.365 1.063 0.426
    HET 0 TYR HA H -0.767 1.183 1.489
    HET 0 TYR HB2 H 0.473 -1.585 1.152
    HET 0 TYR HB3 H -1.268 -1.219 1.134
    HET 0 TYR HD1 H 1.905 -0.902 -0.683
    HET 0 TYR HD2 H -2.269 -0.031 -0.727
    HET 0 TYR HE1 H 2.049 -0.354 -3.078
    HET 0 TYR HE2 H -2.132 0.523 -3.121
    HET 0 TYR HH H -0.123 -0.399 -5.059
    HET 0 TYR HXT H -1.333 -0.030 4.784
    """
    block = _get_block(pdbx_file, data_block)

    try:
        atom_category = block["chem_comp_atom"]
    except KeyError:
        raise InvalidFileError("Missing 'chem_comp_atom' category in file")
    if res_name is not None:
        # Keep only the rows of the requested component
        atom_category = _filter(
            atom_category, atom_category["comp_id"].as_array() == res_name
        )
        if atom_category.row_count == 0:
            raise KeyError(
                f"No rows with residue name '{res_name}' found in "
                f"'chem_comp_atom' category"
            )

    array = AtomArray(atom_category.row_count)

    array.set_annotation("hetero", np.full(len(atom_category["comp_id"]), True))
    for annot_name, field, as_array_args in (
        ("res_name", "comp_id", (str,)),
        ("atom_name", "atom_id", (str,)),
        ("element", "type_symbol", (str,)),
        ("charge", "charge", (int, 0)),
    ):
        array.set_annotation(annot_name, atom_category[field].as_array(*as_array_args))

    ideal_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
    model_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
    if use_ideal_coord:
        coord_fields, alt_coord_fields = ideal_fields, model_fields
    else:
        coord_fields, alt_coord_fields = model_fields, ideal_fields

    try:
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in coord_fields]
        )
    except Exception as err:
        # Only absent fields and missing values trigger the fallback,
        # any other problem is passed on to the caller
        if isinstance(err, KeyError):
            key = err.args[0]
            warnings.warn(
                f"Attribute '{key}' not found within 'chem_comp_atom' category. "
                f"The fallback coordinates will be used instead",
                UserWarning,
            )
        elif isinstance(err, ValueError):
            warnings.warn(
                "The coordinates are missing for some atoms. "
                "The fallback coordinates will be used instead",
                UserWarning,
            )
        else:
            raise
        # Missing values are only tolerated for the fallback coordinates
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in alt_coord_fields],
            allow_missing=allow_missing_coord,
        )

    try:
        bond_category = block["chem_comp_bond"]
        if res_name is not None:
            bond_category = _filter(
                bond_category, bond_category["comp_id"].as_array() == res_name
            )
    except KeyError:
        warnings.warn(
            "Category 'chem_comp_bond' not found. No bonds will be parsed",
            UserWarning,
        )
    else:
        bonds = BondList(array.array_length())
        atom_names = array.atom_name
        bond_rows = zip(
            bond_category["atom_id_1"].as_array(str),
            bond_category["atom_id_2"].as_array(str),
            bond_category["value_order"].as_array(str),
            bond_category["pdbx_aromatic_flag"].as_array(str),
        )
        for name_1, name_2, order, aromatic_flag in bond_rows:
            # Bonds refer to atom names -> use the first atom with that name
            bonds.add_bond(
                np.where(atom_names == name_1)[0][0],
                np.where(atom_names == name_2)[0][0],
                COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag],
            )
        array.bonds = bonds

    return array
1444
+
1445
+
1446
def _parse_component_coordinates(coord_columns, allow_missing=False):
    """
    Combine the x, y and z coordinate columns into a ``(n, 3)`` ``float32``
    array.

    If a column has masked values, a ``ValueError`` is raised, unless
    `allow_missing` is true.
    In that case a warning is raised for each affected column and ``nan`` is
    passed as ``masked_value`` to the ``as_array()`` call of the column.
    """
    n_atoms = len(coord_columns[0])
    coord = np.zeros((n_atoms, 3), dtype=np.float32)
    for dim, column in enumerate(coord_columns):
        has_missing = column.mask is not None and column.mask.array.any()
        if has_missing and not allow_missing:
            raise ValueError(
                "Missing coordinates for some atoms",
            )
        if has_missing:
            warnings.warn(
                "Missing coordinates for some atoms. Those will be set to nan",
                UserWarning,
            )
        coord[:, dim] = column.as_array(np.float32, masked_value=np.nan)
    return coord
1461
+
1462
+
1463
def set_component(pdbx_file, array, data_block=None):
    """
    Set the ``chem_comp_atom`` and, if bonds are available,
    ``chem_comp_bond`` category with atom information from an
    :class:`AtomArray`.

    This will save the coordinates, the mandatory annotation categories
    and the optional ``charge`` category as well as an associated
    :class:`BondList`, if available.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray
        The chemical component to be written.
        Must contain only a single residue.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the file is empty, a new data block will be created.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Raises
    ------
    BadStructureError
        If `array` comprises more than one residue.
    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()

    if get_residue_count(array) > 1:
        raise BadStructureError("The input atom array must comprise only one residue")
    res_name = array.res_name[0]
    n_atoms = array.array_length()

    # Without a 'charge' annotation the charge of each atom is written as '?'
    if "charge" in array.get_annotation_categories():
        charge = array.charge.astype("U2")
    else:
        charge = np.full(n_atoms, "?", dtype="U2")

    atom_cat = Category()
    atom_cat["comp_id"] = np.full(n_atoms, res_name)
    atom_cat["atom_id"] = np.copy(array.atom_name)
    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
    atom_cat["type_symbol"] = np.copy(array.element)
    atom_cat["charge"] = charge
    for dim_index, dim in enumerate(("x", "y", "z")):
        atom_cat[f"model_Cartn_{dim}"] = np.copy(array.coord[:, dim_index])
    # The ideal coordinates reuse the columns of the model coordinates
    for dim in ("x", "y", "z"):
        atom_cat[f"pdbx_model_Cartn_{dim}_ideal"] = atom_cat[f"model_Cartn_{dim}"]
    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
    atom_cat["pdbx_ordinal"] = np.arange(1, n_atoms + 1).astype(str)
    block["chem_comp_atom"] = atom_cat

    if array.bonds is None or array.bonds.get_bond_count() == 0:
        # No bonds -> no 'chem_comp_bond' category is written
        return

    bond_array = array.bonds.as_array()
    n_bonds = len(bond_array)
    # Each bond type maps to a pair of bond order and aromaticity flag
    order_and_aromaticity = [
        COMP_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_array[:, 2]
    ]

    bond_cat = Category()
    bond_cat["comp_id"] = np.full(n_bonds, res_name)
    bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]]
    bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]]
    bond_cat["value_order"] = np.array([order for order, _ in order_and_aromaticity])
    bond_cat["pdbx_aromatic_flag"] = np.array(
        [aromatic for _, aromatic in order_and_aromaticity]
    )
    bond_cat["pdbx_ordinal"] = np.arange(1, n_bonds + 1).astype(str)
    block["chem_comp_bond"] = bond_cat
1537
+
1538
+
1539
def list_assemblies(pdbx_file, data_block=None):
    """
    List the biological assemblies that are available for the structure
    in the given file.

    This function receives the data from the ``pdbx_struct_assembly``
    category in the file.
    Consequently, this category must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    assemblies : dict of str -> str
        A dictionary that maps an assembly ID to a description of the
        corresponding assembly.

    Raises
    ------
    InvalidFileError
        If the file has no ``pdbx_struct_assembly`` category.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly_ids = list_assemblies(file)
    >>> for key, val in assembly_ids.items():
    ...     print(f"'{key}' : '{val}'")
    '1' : 'complete icosahedral assembly'
    '2' : 'icosahedral asymmetric unit'
    '3' : 'icosahedral pentamer'
    '4' : 'icosahedral 23 hexamer'
    '5' : 'icosahedral asymmetric unit, std point frame'
    '6' : 'crystal asymmetric unit, crystal frame'
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_category = block["pdbx_struct_assembly"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly' category")

    # Pair each assembly ID with the description in the same row
    assembly_ids = assembly_category["id"].as_array(str)
    descriptions = assembly_category["details"].as_array(str)
    return dict(zip(assembly_ids, descriptions))
1593
+
1594
+
1595
def get_assembly(
    pdbx_file,
    assembly_id=None,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build the given biological assembly.

    This function receives the data from the
    ``pdbx_struct_assembly_gen``, ``pdbx_struct_oper_list`` and
    ``atom_site`` categories in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        By default, the first assembly listed in
        ``pdbx_struct_assembly_gen`` is built.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:

        - ``'first'`` - Use atoms that have the first *altloc* ID
          appearing in a residue.
        - ``'occupancy'`` - Use atoms that have the *altloc* ID
          with the highest occupancy for a residue.
        - ``'all'`` - Use all atoms.
          Note that this leads to duplicate atoms.
          When this option is chosen, the ``altloc_id`` annotation
          array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds, will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
        unit in the assembly.

    Raises
    ------
    InvalidFileError
        If the ``pdbx_struct_assembly_gen`` or ``pdbx_struct_oper_list``
        category is missing.
    KeyError
        If the given `assembly_id` is not listed in
        ``pdbx_struct_assembly_gen``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly = get_assembly(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_gen_category = block["pdbx_struct_assembly_gen"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category")

    try:
        struct_oper_category = block["pdbx_struct_oper_list"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

    gen_assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
    if assembly_id is None:
        assembly_id = gen_assembly_ids[0]
    elif assembly_id not in gen_assembly_ids:
        raise KeyError(f"File has no Assembly ID '{assembly_id}'")

    ### Calculate all possible transformations
    transformations = _get_transformations(struct_oper_category)

    ### Get structure according to additional parameters
    extra_fields = [] if extra_fields is None else extra_fields
    asym_id_requested = "label_asym_id" in extra_fields
    # The operations apply on asym IDs
    # -> 'label_asym_id' is always read to select the correct atoms
    fields_with_asym = (
        extra_fields if asym_id_requested else extra_fields + ["label_asym_id"]
    )
    structure = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        fields_with_asym,
        use_author_fields,
        include_bonds,
    )

    ### Collect the operations that apply to each asym ID
    ops_per_asym_id = {}
    for gen_id, op_expr, asym_id_expr in zip(
        gen_assembly_ids,
        assembly_gen_category["oper_expression"].as_array(str),
        assembly_gen_category["asym_id_list"].as_array(str),
    ):
        # Only rows for the selected assembly are relevant;
        # it was already checked above that the ID is present
        if gen_id != assembly_id:
            continue
        for asym_id in asym_id_expr.split(","):
            ops_per_asym_id.setdefault(asym_id, []).extend(
                _parse_operation_expression(op_expr)
            )

    ### Apply the transformations to the atoms of each asym ID
    sub_assemblies = [
        _apply_transformations(
            structure[..., structure.label_asym_id == asym_id],
            transformations,
            op_list,
        )
        for asym_id, op_list in ops_per_asym_id.items()
    ]
    assembly = concatenate(sub_assemblies)

    # Sort AtomArray or AtomArrayStack by 'sym_id'
    n_sym_ids = assembly.sym_id.max() + 1
    assembly = concatenate(
        [assembly[..., assembly.sym_id == sym_id] for sym_id in range(n_sym_ids)]
    )

    # 'label_asym_id' is only kept, if the user asked for it
    if not asym_id_requested:
        assembly.del_annotation("label_asym_id")

    return assembly
1766
+
1767
+
1768
def _apply_transformations(structure, transformation_dict, operations):
    """
    Create a subassembly by applying each of the given operations to the
    input structure, which contains the affected asym IDs.

    Each operation is a sequence of operation IDs, whose transformations
    from `transformation_dict` are applied one after another.
    The copies in the returned structure are enumerated by the ``sym_id``
    annotation.
    """
    n_copies = len(operations)
    # Additional first dimension for 'structure.repeat()'
    assembly_coord = np.zeros((n_copies, *structure.coord.shape))
    for copy_index, op_steps in enumerate(operations):
        # Chain the transformation steps of the operation expression
        coord = structure.coord
        for step_id in op_steps:
            coord = transformation_dict[step_id].apply(coord)
        assembly_coord[copy_index] = coord

    assembly = repeat(structure, assembly_coord)
    sym_ids = np.repeat(np.arange(n_copies), structure.array_length())
    assembly.set_annotation("sym_id", sym_ids)
    return assembly
1789
+
1790
+
1791
def _get_transformations(struct_oper):
    """
    Get affine transformation for each operation ID in ``pdbx_struct_oper_list``.

    Parameters
    ----------
    struct_oper
        The ``pdbx_struct_oper_list`` category.
        The columns ``id``, ``matrix[i][j]`` and ``vector[i]``
        (``i`` and ``j`` ranging from 1 to 3) are read.

    Returns
    -------
    transformation_dict : dict
        Maps each operation ID to an ``AffineTransformation`` built from
        the rotation matrix and translation vector in the same row.
    """
    op_ids = struct_oper["id"].as_array(str)
    if len(op_ids) == 0:
        # No operation -> the matrix/vector columns need not be read
        return {}

    # Convert each column only once instead of once per operation ID:
    # 'rotations[k, i, j]' is the value of 'matrix[i+1][j+1]' in row 'k'
    rotations = np.stack(
        [
            np.stack(
                [
                    struct_oper[f"matrix[{i}][{j}]"].as_array(float)
                    for j in (1, 2, 3)
                ],
                axis=-1,
            )
            for i in (1, 2, 3)
        ],
        axis=-2,
    )
    # 'translations[k, i]' is the value of 'vector[i+1]' in row 'k'
    translations = np.stack(
        [struct_oper[f"vector[{i}]"].as_array(float) for i in (1, 2, 3)],
        axis=-1,
    )

    transformation_dict = {}
    for op_id, rotation_matrix, translation_vector in zip(
        op_ids, rotations, translations
    ):
        transformation_dict[op_id] = AffineTransformation(
            np.zeros(3), rotation_matrix, translation_vector
        )
    return transformation_dict
1813
+
1814
+
1815
def _parse_operation_expression(expression):
    """
    Resolve an ``oper_expression`` into the operations it describes.

    Each returned operation is a tuple of operation IDs, given in the
    order in which the steps are applied.
    If the expression consists of multiple parenthesized groups,
    the cartesian product of the groups is formed.
    """
    # Split groups by parentheses:
    # the closing parenthesis is removed and the opening one is the delimiter
    # example: '(X0)(1-10,21-25)' from 1a34
    groups = [group for group in expression.replace(")", "").split("(") if group]

    ids_per_step = []
    # Important: Operations are applied from right to left
    for group in reversed(groups):
        step_ids = []
        for token in group.split(","):
            if "-" not in token:
                # Single operation ID
                step_ids.append(token)
                continue
            # Range of operation IDs, they must be integers
            first, last = token.split("-")
            step_ids.extend(str(op_id) for op_id in range(int(first), int(last) + 1))
        ids_per_step.append(step_ids)

    # Cartesian product of operations
    return list(itertools.product(*ids_per_step))
1847
+
1848
+
1849
def _convert_string_to_sequence(string, stype):
    """
    Convert a sequence string into a sequence object that fits the given
    ``_entity_poly.type``.

    A ``ProteinSequence`` is returned, if `stype` is contained in
    ``_proteinseq_type_list``, and a ``NucleotideSequence``, if `stype` is
    contained in ``_nucleotideseq_type_list``.
    In the latter case each ``U`` is replaced by ``T`` beforehand.
    For types in ``_other_type_list``, ``None`` is returned.
    Any other type raises an ``InvalidFileError``.
    """
    # The sequence may be stored as multiline string
    one_line = string.replace("\n", "")
    if stype in _proteinseq_type_list:
        return ProteinSequence(one_line)
    if stype in _nucleotideseq_type_list:
        return NucleotideSequence(one_line.replace("U", "T"))
    if stype in _other_type_list:
        return None
    raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
1866
+
1867
+
1868
def get_unit_cell(
    pdbx_file,
    center=True,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build a structure model containing all symmetric copies of the structure within a
    single unit cell.

    This function receives the data from the ``symmetry`` and ``atom_site`` categories
    in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    center : bool, optional
        If set to true, each symmetric copy will be moved inside the unit cell
        dimensions, if its centroid is outside.
        If set to false, the copies are created using the raw space group
        transformations, which may put them one unit cell length further away.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:

        - ``'first'`` - Use atoms that have the first *altloc* ID
          appearing in a residue.
        - ``'occupancy'`` - Use atoms that have the *altloc* ID
          with the highest occupancy for a residue.
        - ``'all'`` - Use all atoms.
          Note that this leads to duplicate atoms.
          When this option is chosen, the ``altloc_id`` annotation
          array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds, will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    unit_cell : AtomArray or AtomArrayStack
        The structure representing the unit cell.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
        unit in the unit cell.

    Raises
    ------
    InvalidFileError
        If the file has no ``symmetry.space_group_name_H-M`` field.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> unit_cell = get_unit_cell(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        space_group = block["symmetry"]["space_group_name_H-M"].as_item()
    except KeyError:
        raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
    transforms = space_group_transforms(space_group)

    asym = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        extra_fields,
        use_author_fields,
        include_bonds,
    )
    box = asym.box

    # The space group transformations are applied to fractional coordinates
    asym_fractions = coord_to_fraction(asym.coord, box)
    copies = []
    for transform in transforms:
        fractions = transform.apply(asym_fractions)
        if center:
            # If the centroid is outside the box, move the copy inside the box
            centroid_before = centroid(fractions)
            centroid_after = centroid_before % 1
            fractions += (centroid_after - centroid_before)[..., np.newaxis, :]
        copies.append(fraction_to_coord(fractions, box))

    unit_cell = repeat(asym, np.stack(copies, axis=0))
    sym_ids = np.repeat(np.arange(len(transforms)), asym.array_length())
    unit_cell.set_annotation("sym_id", sym_ids)
    return unit_cell
2000
+
2001
+
2002
def get_sse(pdbx_file, data_block=None, match_model=None):
    """
    Read the annotated secondary structure of each chain from a PDBx file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
        The following categories are required:

        - ``entity_poly``
        - ``struct_conf`` (if alpha-helices are present)
        - ``struct_sheet_range`` (if beta-strands are present)
        - ``atom_site`` (if `match_model` is set)

    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    match_model : int, optional
        If a model number is given, the secondary structure is restricted to
        those residues that are resolved in that model, i.e. residues that
        have at least one entry in ``atom_site`` for the model.
        Secondary structure elements of residues that would not appear
        in a corresponding :class:`AtomArray` from :func:`get_structure()`
        are dropped.
        By default, all residues in the sequence are kept.

    Returns
    -------
    sse_dict : dict of str -> ndarray, dtype=str
        The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
        secondary structure of the respective chain.

        - ``"a"``: alpha-helix
        - ``"b"``: beta-strand
        - ``"c"``: coil or not an amino acid

        Each secondary structure element corresponds to the ``label_seq_id`` of the
        ``atom_site`` category.
        This means that the 0-th position of the array corresponds to the residue
        in ``atom_site`` with ``label_seq_id`` ``1``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
    >>> sse = get_sse(file, match_model=1)
    >>> print(sse)
    {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
           'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
           'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
           'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
          dtype='<U1')}

    If only secondary structure elements for resolved residues are requested, the length
    of the returned array matches the number of peptide residues in the structure.

    >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
    >>> print(len(get_sse(file, match_model=1)["A"]))
    128
    >>> atoms = get_structure(file, model=1)
    >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
    >>> print(get_residue_count(atoms))
    128
    """
    block = _get_block(pdbx_file, data_block)

    # Every chain starts out as pure coil ("c"), one entry per sequence position
    sse_dict = {}
    for chain_id, sequence in get_sequence(block).items():
        sse_dict[chain_id] = np.repeat("c", len(sequence))

    # Overwrite the coil default with the annotated elements.
    # Every row of a category is applied; rows are not filtered by type.
    symbol_for_category = (
        ("a", "struct_conf"),
        ("b", "struct_sheet_range"),
    )
    for sse_symbol, category_name in symbol_for_category:
        if category_name not in block:
            # The category is optional: nothing of this kind is annotated
            continue
        category = block[category_name]
        element_chains = category["beg_auth_asym_id"].as_array(str)
        first_seq_ids = category["beg_label_seq_id"].as_array(int)
        last_seq_ids = category["end_label_seq_id"].as_array(int)

        for chain, first, last in zip(element_chains, first_seq_ids, last_seq_ids):
            # PDBx positions are 1-based and inclusive,
            # hence the slice '[first - 1 : last]' covers the same residues
            sse_dict[chain][first - 1 : last] = sse_symbol

    if match_model is None:
        return sse_dict

    # Keep only positions whose residue has atoms in the requested model
    model_atom_site = _filter_model(block["atom_site"], match_model)
    atom_chain_ids = model_atom_site["auth_asym_id"].as_array(str)
    atom_seq_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
    # Atoms with a masked 'label_seq_id' are not part of a chain -> ignore them
    in_chain = atom_seq_ids != -1
    atom_chain_ids = atom_chain_ids[in_chain]
    atom_seq_ids = atom_seq_ids[in_chain]
    return {
        # 'np.unique()' collapses the atoms of a residue into a single sorted
        # entry; '- 1' converts the 1-based residue ID into a 0-based index
        chain_id: sse[np.unique(atom_seq_ids[atom_chain_ids == chain_id]) - 1]
        for chain_id, sse in sse_dict.items()
    }