biotite 1.3.0__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (354) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +191 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +160 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1226 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  266. biotite/structure/bonds.pyx +1975 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +276 -0
  271. biotite/structure/charges.cpython-312-darwin.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +681 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +590 -0
  278. biotite/structure/geometry.py +655 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +90 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +388 -0
  311. biotite/structure/io/pdb/file.py +1356 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +671 -0
  319. biotite/structure/io/pdbx/cif.py +1088 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +358 -0
  322. biotite/structure/io/pdbx/convert.py +2097 -0
  323. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1047 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +544 -0
  337. biotite/structure/rings.py +335 -0
  338. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +292 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +168 -0
  349. biotite/version.py +21 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.3.0.dist-info/METADATA +162 -0
  352. biotite-1.3.0.dist-info/RECORD +354 -0
  353. biotite-1.3.0.dist-info/WHEEL +6 -0
  354. biotite-1.3.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,2097 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdbx"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
7
+ __all__ = [
8
+ "get_sequence",
9
+ "get_model_count",
10
+ "get_structure",
11
+ "set_structure",
12
+ "get_component",
13
+ "set_component",
14
+ "list_assemblies",
15
+ "get_assembly",
16
+ "get_unit_cell",
17
+ "get_sse",
18
+ ]
19
+
20
+ import itertools
21
+ import warnings
22
+ from collections import defaultdict
23
+ import numpy as np
24
+ from biotite.file import InvalidFileError
25
+ from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
26
+ from biotite.structure.atoms import (
27
+ AtomArray,
28
+ AtomArrayStack,
29
+ concatenate,
30
+ repeat,
31
+ )
32
+ from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
33
+ from biotite.structure.box import (
34
+ coord_to_fraction,
35
+ fraction_to_coord,
36
+ space_group_transforms,
37
+ unitcell_from_vectors,
38
+ vectors_from_unitcell,
39
+ )
40
+ from biotite.structure.error import BadStructureError
41
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
42
+ from biotite.structure.filter import (
43
+ _canonical_nucleotide_list as canonical_nucleotide_list,
44
+ )
45
+ from biotite.structure.filter import (
46
+ filter_first_altloc,
47
+ filter_highest_occupancy_altloc,
48
+ )
49
+ from biotite.structure.geometry import centroid
50
+ from biotite.structure.io.pdbx.bcif import (
51
+ BinaryCIFBlock,
52
+ BinaryCIFColumn,
53
+ BinaryCIFFile,
54
+ )
55
+ from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
56
+ from biotite.structure.io.pdbx.component import MaskValue
57
+ from biotite.structure.io.pdbx.encoding import StringArrayEncoding
58
+ from biotite.structure.residues import (
59
+ get_residue_count,
60
+ get_residue_positions,
61
+ get_residue_starts_for,
62
+ )
63
+ from biotite.structure.transform import AffineTransformation
64
+
65
# Lookup tables translating between PDBx/mmCIF bond vocabulary and `BondType`.

# Bond types in `struct_conn` category that refer to covalent bonds.
# Maps the `struct_conn` type ID string to the `BondType` used for that bond.
PDBX_BOND_TYPE_ID_TO_TYPE = {
    # Although a covalent bond could in theory have a higher bond order,
    # practically inter-residue bonds are always single
    "covale": BondType.SINGLE,
    "covale_base": BondType.SINGLE,
    "covale_phosphate": BondType.SINGLE,
    "covale_sugar": BondType.SINGLE,
    "disulf": BondType.SINGLE,
    "modres": BondType.SINGLE,
    "modres_link": BondType.SINGLE,
    "metalc": BondType.COORDINATION,
}
# Opposite direction: every listed `BondType` maps to the generic "covale"
# type ID, except `COORDINATION`, which maps to "metalc".
# NOTE(review): `BondType.AROMATIC` has no entry here although it has one in
# `PDBX_BOND_TYPE_TO_ORDER` below - confirm that lookups with this table can
# never receive `BondType.AROMATIC`.
PDBX_BOND_TYPE_TO_TYPE_ID = {
    BondType.ANY: "covale",
    BondType.SINGLE: "covale",
    BondType.DOUBLE: "covale",
    BondType.TRIPLE: "covale",
    BondType.QUADRUPLE: "covale",
    BondType.AROMATIC_SINGLE: "covale",
    BondType.AROMATIC_DOUBLE: "covale",
    BondType.AROMATIC_TRIPLE: "covale",
    BondType.COORDINATION: "metalc",
}
# Maps a `BondType` to a lowercase bond order string.
# Aromatic variants share the order string of their non-aromatic counterpart.
PDBX_BOND_TYPE_TO_ORDER = {
    BondType.SINGLE: "sing",
    BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip",
    BondType.QUADRUPLE: "quad",
    BondType.AROMATIC_SINGLE: "sing",
    BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
    # These are masked later; they are merely added here to avoid a KeyError
    BondType.ANY: "",
    BondType.AROMATIC: "",
    BondType.COORDINATION: "",
}
# Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
# Keys are `(order, aromatic_flag)` tuples, the flag being "Y" or "N".
COMP_BOND_ORDER_TO_TYPE = {
    ("SING", "N"): BondType.SINGLE,
    ("DOUB", "N"): BondType.DOUBLE,
    ("TRIP", "N"): BondType.TRIPLE,
    ("QUAD", "N"): BondType.QUADRUPLE,
    ("SING", "Y"): BondType.AROMATIC_SINGLE,
    ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
    ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
    ("AROM", "Y"): BondType.AROMATIC,
}
# ...and vice versa: `BondType` -> `(order, aromatic_flag)` tuple,
# derived by inverting the dictionary above
COMP_BOND_TYPE_TO_ORDER = {
    bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
}
# Concatenation of the canonical amino acid and canonical nucleotide lists
# imported from `biotite.structure.filter`
CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
# It was observed that when the number of rows in `atom_site` and `struct_conn`
# exceeds a certain threshold, a dictionary approach is less computation and
# memory intensive than the dense vectorized approach.
# https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
FIND_MATCHES_SWITCH_THRESHOLD = 4000000

# Polymer type strings (``entity_poly.type``), grouped by kind.
# The protein and nucleotide lists hold the types `get_sequence()` documents
# as supported.
_proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
_nucleotideseq_type_list = [
    "polydeoxyribonucleotide",
    "polyribonucleotide",
    "polydeoxyribonucleotide/polyribonucleotide hybrid",
]
# Remaining polymer types; presumably those without a sequence representation
# (usage is outside this section - TODO confirm)
_other_type_list = [
    "cyclic-pseudo-peptide",
    "other",
    "peptide nucleic acid",
    "polysaccharide(D)",
    "polysaccharide(L)",
]
138
+
139
+
140
def _filter(category, index):
    """
    Create a copy of a category that keeps only the rows picked by an index.

    Parameters
    ----------
    category : object
        The category to reduce.
        Its type must offer ``subcomponent_class()`` to obtain the column
        class, which in turn must offer ``subcomponent_class()`` to obtain
        the data class.
    index : object
        The index applied to the ``array`` of the data (and of the mask,
        if present) of each column.

    Returns
    -------
    filtered_category : object
        A new object of the same type as `category`, containing the
        reduced columns under the same keys.
    """
    category_class = type(category)
    column_class = category_class.subcomponent_class()
    data_class = column_class.subcomponent_class()

    reduced_columns = {}
    for name, col in category.items():
        reduced_data = data_class(col.data.array[index])
        # A column without mask stays without mask
        if col.mask is not None:
            reduced_mask = data_class(col.mask.array[index])
        else:
            reduced_mask = None
        reduced_columns[name] = column_class(reduced_data, reduced_mask)

    return category_class(reduced_columns)
157
+
158
+
159
def get_sequence(pdbx_file, data_block=None):
    """
    Read the protein and nucleotide sequences stored in the
    ``entity_poly.pdbx_seq_one_letter_code_can`` field.

    The following polymer types (``_entity_poly.type``) are supported:
    ``'polypeptide(D)'``, ``'polypeptide(L)'``,
    ``'polydeoxyribonucleotide'``, ``'polyribonucleotide'`` and
    ``'polydeoxyribonucleotide/polyribonucleotide hybrid'``.
    Uracil is converted to Thymine.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        By default the first (and in most cases only) data block of the
        file is used.
        This parameter is ignored, if a data block object is given
        directly as `pdbx_file`.

    Returns
    -------
    sequence_dict : Dictionary of Sequences
        The keys are taken from ``entity_poly.pdbx_strand_id``
        (equivalent to ``atom_site.auth_asym_id``), the values are the
        corresponding sequences.
        Each strand ID of the comma-separated ``pdbx_strand_id`` value
        gets its own key.

    Notes
    -----
    ``entity_poly.pdbx_seq_one_letter_code_can`` holds the initial
    complete sequence. If the structure is a truncated or spliced version
    of it, the structure covers only a subset of this sequence.
    Use biotite.structure.get_residues to obtain only the residues that
    are actually represented in the structure.
    """
    entity_poly = _get_block(pdbx_file, data_block)["entity_poly"]

    one_letter_codes = entity_poly["pdbx_seq_one_letter_code_can"].as_array(str)
    polymer_types = entity_poly["type"].as_array(str)
    sequences = []
    for code, polymer_type in zip(one_letter_codes, polymer_types):
        sequences.append(_convert_string_to_sequence(code, polymer_type))

    # Each entity may be represented by multiple comma-separated strands
    joined_strand_ids = entity_poly["pdbx_strand_id"].as_array(str)
    strand_ids_per_entity = [joined.split(",") for joined in joined_strand_ids]

    sequence_dict = {}
    for sequence, entity_strand_ids in zip(sequences, strand_ids_per_entity):
        # Entities for which no sequence was created are left out
        if sequence is None:
            continue
        for strand_id in entity_strand_ids:
            sequence_dict[strand_id] = sequence
    return sequence_dict
219
+
220
+
221
def get_model_count(pdbx_file, data_block=None):
    """
    Count the models contained in a file.

    The count is the number of distinct values in
    ``atom_site.pdbx_PDB_model_num``.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        By default the first (and in most cases only) data block of the
        file is used.
        This parameter is ignored, if a data block object is given
        directly as `pdbx_file`.

    Returns
    -------
    model_count : int
        The number of models.
    """
    atom_site = _get_block(pdbx_file, data_block)["atom_site"]
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    distinct_models = np.unique(model_numbers)
    return len(distinct_models)
243
+
244
+
245
+ def get_structure(
246
+ pdbx_file,
247
+ model=None,
248
+ data_block=None,
249
+ altloc="first",
250
+ extra_fields=None,
251
+ use_author_fields=True,
252
+ include_bonds=False,
253
+ ):
254
+ """
255
+ Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
256
+ ``atom_site`` category in a file.
257
+
258
+ Parameters
259
+ ----------
260
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
261
+ The file object.
262
+ model : int, optional
263
+ If this parameter is given, the function will return an
264
+ :class:`AtomArray` from the atoms corresponding to the given
265
+ model number (starting at 1).
266
+ Negative values are used to index models starting from the last
267
+ model instead of the first model.
268
+ If this parameter is omitted, an :class:`AtomArrayStack`
269
+ containing all models will be returned, even if the structure
270
+ contains only one model.
271
+ data_block : str, optional
272
+ The name of the data block.
273
+ Default is the first (and most times only) data block of the
274
+ file.
275
+ If the data block object is passed directly to `pdbx_file`,
276
+ this parameter is ignored.
277
+ altloc : {'first', 'occupancy', 'all'}
278
+ This parameter defines how *altloc* IDs are handled:
279
+ - ``'first'`` - Use atoms that have the first *altloc* ID
280
+ appearing in a residue.
281
+ - ``'occupancy'`` - Use atoms that have the *altloc* ID
282
+ with the highest occupancy for a residue.
283
+ - ``'all'`` - Use all atoms.
284
+ Note that this leads to duplicate atoms.
285
+ When this option is chosen, the ``altloc_id`` annotation
286
+ array is added to the returned structure.
287
+ extra_fields : list of str, optional
288
+ The strings in the list are entry names, that are
289
+ additionally added as annotation arrays.
290
+ The annotation category name will be the same as the PDBx
291
+ subcategory name.
292
+ The array type is always `str`.
293
+ An exception are the special field identifiers:
294
+ ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
295
+ These will convert the fitting subcategory into an
296
+ annotation array with reasonable type.
297
+ use_author_fields : bool, optional
298
+ Some fields can be read from two alternative sources,
299
+ for example both, ``label_seq_id`` and ``auth_seq_id`` describe
300
+ the ID of the residue.
301
+ While, the ``label_xxx`` fields can be used as official pointers
302
+ to other categories in the file, the ``auth_xxx``
303
+ fields are set by the author(s) of the structure and are
304
+ consistent with the corresponding values in PDB files.
305
+ If `use_author_fields` is true, the annotation arrays will be
306
+ read from the ``auth_xxx`` fields (if applicable),
307
+ otherwise from the ``label_xxx`` fields.
308
+ If the requested field is not available, the respective other
309
+ field is taken as fallback.
310
+ include_bonds : bool, optional
311
+ If set to true, a :class:`BondList` will be created for the
312
+ resulting :class:`AtomArray` containing the bond information
313
+ from the file.
314
+ Inter-residue bonds, will be read from the ``struct_conn``
315
+ category.
316
+ Intra-residue bonds will be read from the ``chem_comp_bond``, if
317
+ available, otherwise they will be derived from the Chemical
318
+ Component Dictionary.
319
+
320
+ Returns
321
+ -------
322
+ array : AtomArray or AtomArrayStack
323
+ The return type depends on the `model` parameter.
324
+
325
+ Examples
326
+ --------
327
+
328
+ >>> import os.path
329
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
330
+ >>> arr = get_structure(file, model=1)
331
+ >>> print(len(arr))
332
+ 304
333
+ """
334
+ block = _get_block(pdbx_file, data_block)
335
+
336
+ extra_fields = set() if extra_fields is None else set(extra_fields)
337
+
338
+ atom_site = block.get("atom_site")
339
+ if atom_site is None:
340
+ raise InvalidFileError("Missing 'atom_site' category in file")
341
+
342
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
343
+ model_count = len(np.unique(models))
344
+ atom_count = len(models)
345
+
346
+ if model is None:
347
+ # For a stack, the annotations are derived from the first model
348
+ model_atom_site = _filter_model(atom_site, 1)
349
+ # Any field of the category would work here to get the length
350
+ model_length = model_atom_site.row_count
351
+ atoms = AtomArrayStack(model_count, model_length)
352
+
353
+ # Check if each model has the same amount of atoms
354
+ # If not, raise exception
355
+ if model_length * model_count != atom_count:
356
+ raise InvalidFileError(
357
+ "The models in the file have unequal "
358
+ "amount of atoms, give an explicit model "
359
+ "instead"
360
+ )
361
+
362
+ atoms.coord[:, :, 0] = (
363
+ atom_site["Cartn_x"]
364
+ .as_array(np.float32)
365
+ .reshape((model_count, model_length))
366
+ )
367
+ atoms.coord[:, :, 1] = (
368
+ atom_site["Cartn_y"]
369
+ .as_array(np.float32)
370
+ .reshape((model_count, model_length))
371
+ )
372
+ atoms.coord[:, :, 2] = (
373
+ atom_site["Cartn_z"]
374
+ .as_array(np.float32)
375
+ .reshape((model_count, model_length))
376
+ )
377
+
378
+ box = _get_box(block)
379
+ if box is not None:
380
+ # Duplicate same box for each model
381
+ atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)
382
+
383
+ else:
384
+ if model == 0:
385
+ raise ValueError("The model index must not be 0")
386
+ # Negative models mean model indexing starting from last model
387
+ model = model_count + model + 1 if model < 0 else model
388
+ if model > model_count:
389
+ raise ValueError(
390
+ f"The file has {model_count} models, "
391
+ f"the given model {model} does not exist"
392
+ )
393
+
394
+ model_atom_site = _filter_model(atom_site, model)
395
+ # Any field of the category would work here to get the length
396
+ model_length = model_atom_site.row_count
397
+ atoms = AtomArray(model_length)
398
+
399
+ atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
400
+ atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
401
+ atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)
402
+
403
+ atoms.box = _get_box(block)
404
+
405
+ # The below part is the same for both, AtomArray and AtomArrayStack
406
+ _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
407
+
408
+ atoms, altloc_filtered_atom_site = _filter_altloc(atoms, model_atom_site, altloc)
409
+
410
+ if include_bonds:
411
+ if altloc == "all":
412
+ raise ValueError(
413
+ "Bond computation is not supported with `altloc='all', consider using "
414
+ "'connect_via_residue_names()' afterwards"
415
+ )
416
+
417
+ if "chem_comp_bond" in block:
418
+ try:
419
+ custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
420
+ except KeyError:
421
+ warnings.warn(
422
+ "The 'chem_comp_bond' category has missing columns, "
423
+ "falling back to using Chemical Component Dictionary",
424
+ UserWarning,
425
+ )
426
+ custom_bond_dict = None
427
+ bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict)
428
+ else:
429
+ bonds = connect_via_residue_names(atoms)
430
+ if "struct_conn" in block:
431
+ bonds = bonds.merge(
432
+ _parse_inter_residue_bonds(
433
+ altloc_filtered_atom_site,
434
+ block["struct_conn"],
435
+ atom_count=atoms.array_length(),
436
+ )
437
+ )
438
+ atoms.bonds = bonds
439
+
440
+ return atoms
441
+
442
+
443
def _get_block(pdbx_component, block_name):
    """
    Resolve the data block to read from.

    Parameters
    ----------
    pdbx_component : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        Either a block itself or a file-like component containing blocks.
    block_name : str or None
        Name of the block to pick from a file component.
        If ``None``, the ``block`` attribute of the component is used.
        Ignored, if `pdbx_component` already is a block.

    Returns
    -------
    block : CIFBlock or BinaryCIFBlock
        The selected block.
    """
    if isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
        # Already a block -> nothing to look up
        return pdbx_component
    if block_name is None:
        return pdbx_component.block
    return pdbx_component[block_name]
452
+
453
+
454
+ def _get_or_fallback(category, key, fallback_key):
455
+ """
456
+ Return column related to key in category if it exists,
457
+ otherwise try to get the column related to fallback key.
458
+ """
459
+ if key not in category:
460
+ warnings.warn(
461
+ f"Attribute '{key}' not found within 'atom_site' category. "
462
+ f"The fallback attribute '{fallback_key}' will be used instead",
463
+ UserWarning,
464
+ )
465
+ try:
466
+ return category[fallback_key]
467
+ except KeyError as key_exc:
468
+ raise InvalidFileError(
469
+ f"Fallback attribute '{fallback_key}' not found within "
470
+ "'atom_site' category"
471
+ ) from key_exc
472
+ return category[key]
473
+
474
+
475
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
    """
    Fill atom_site annotations in atom array or atom array stack.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        Atom array or stack which will be annotated.
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category with values for one model.
    extra_fields : iterable of str
        Entry names, that are additionally added as annotation arrays.
        The special names ``'atom_id'``, ``'b_factor'``, ``'occupancy'``
        and ``'charge'`` are converted into typed annotations, all other
        names are read as string annotation from the column of the same
        name.
        The given collection is not modified.
    use_author_fields : bool
        Define if alternate fields prefixed with ``auth_`` should be used
        instead of ``label_``.
    """
    prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth")

    def preferred_column(suffix):
        # Column with the preferred prefix, or its counterpart as fallback
        return _get_or_fallback(
            atom_site, f"{prefix}_{suffix}", f"{alt_prefix}_{suffix}"
        )

    array.set_annotation("chain_id", preferred_column("asym_id").as_array(str))
    array.set_annotation("res_id", preferred_column("seq_id").as_array(int, -1))
    array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
    array.set_annotation("res_name", preferred_column("comp_id").as_array(str))
    array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM")
    array.set_annotation("atom_name", preferred_column("atom_id").as_array(str))
    array.set_annotation("element", atom_site["type_symbol"].as_array(str))

    # Work on a de-duplicated private copy:
    # The caller's collection is neither modified nor required to be mutable
    requested_fields = list(dict.fromkeys(extra_fields))
    # Fields that get a dedicated, typed conversion below
    special_fields = ("atom_id", "b_factor", "occupancy", "charge")

    if "atom_id" in requested_fields:
        if "id" in atom_site:
            array.set_annotation("atom_id", atom_site["id"].as_array(int))
        else:
            warnings.warn(
                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
                UserWarning,
            )
            array.set_annotation("atom_id", np.arange(array.array_length()))
    if "b_factor" in requested_fields:
        if "B_iso_or_equiv" in atom_site:
            array.set_annotation(
                "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
            )
        else:
            warnings.warn(
                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
                UserWarning,
            )
            array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
    if "occupancy" in requested_fields:
        if "occupancy" in atom_site:
            array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
        else:
            warnings.warn(
                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
                UserWarning,
            )
            array.set_annotation(
                "occupancy", np.ones(array.array_length(), dtype=float)
            )
    if "charge" in requested_fields:
        if "pdbx_formal_charge" in atom_site:
            array.set_annotation(
                "charge",
                # Masked values are set to 0
                atom_site["pdbx_formal_charge"].as_array(int, 0),
            )
        else:
            warnings.warn(
                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
                UserWarning,
            )
            array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))

    # Handle all remaining custom fields
    for field in requested_fields:
        if field in special_fields:
            continue
        array.set_annotation(field, atom_site[field].as_array(str))
574
+
575
+
576
def _parse_intra_residue_bonds(chem_comp_bond):
    """
    Convert the ``chem_comp_bond`` category into a `custom_bond_dict`
    that can be passed to :func:`connect_via_residue_names()`.

    The result maps each residue name to a dictionary, which in turn
    maps a tuple of two atom names to the corresponding bond type.
    Order/aromaticity combinations that are not part of
    ``COMP_BOND_ORDER_TO_TYPE`` become ``BondType.ANY``.
    """
    # A missing column raises a KeyError here, before any bond is parsed
    columns = [
        chem_comp_bond[col_name].as_array(str)
        for col_name in (
            "comp_id",
            "atom_id_1",
            "atom_id_2",
            "value_order",
            "pdbx_aromatic_flag",
        )
    ]

    bonds_per_residue = {}
    for comp_id, name_1, name_2, bond_order, is_aromatic in zip(*columns):
        residue_bonds = bonds_per_residue.setdefault(comp_id, {})
        residue_bonds[name_1.item(), name_2.item()] = COMP_BOND_ORDER_TO_TYPE.get(
            (bond_order.upper(), is_aromatic), BondType.ANY
        )
    return bonds_per_residue
596
+
597
+
598
def _parse_inter_residue_bonds(atom_site, struct_conn, atom_count=None):
    """
    Build a :class:`BondList` of inter-residue bonds from the
    ``struct_conn`` category.

    Each bond partner described in ``struct_conn`` is mapped to an atom
    index by matching its identifying columns against the respective
    columns of the ``atom_site`` category.
    If `atom_count` is ``None``, the row count of ``atom_site`` is used
    as atom count of the bond list.
    """
    # Identity symmetry operation
    IDENTITY = "1_555"
    # Columns in 'atom_site' that should be matched by 'struct_conn'
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "label_alt_id",
        "auth_asym_id",
        "auth_comp_id",
        "auth_seq_id",
        "pdbx_PDB_ins_code",
    ]

    # Only keep connection types that have a bond type counterpart...
    is_covalent = np.isin(
        struct_conn["conn_type_id"].as_array(str),
        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
    )
    # ...and where both partners use the identity symmetry operation
    for symmetry_col in ("ptnr1_symmetry", "ptnr2_symmetry"):
        if symmetry_col in struct_conn:
            is_covalent &= (
                struct_conn[symmetry_col].as_array(str, IDENTITY) == IDENTITY
            )

    partner_indices = []
    for partner in (1, 2):
        reference_arrays = []
        query_arrays = []
        for col_name in COLUMNS:
            conn_col_name = _get_struct_conn_col_name(col_name, partner)
            if col_name not in atom_site or conn_col_name not in struct_conn:
                continue
            reference = atom_site[col_name].as_array()
            # Use the dtype of the reference to allow comparison
            query = struct_conn[conn_col_name].as_array(reference.dtype)
            if np.issubdtype(reference.dtype, str):
                # The mask value may differ between query and reference
                # -> unify it
                reference[reference == "?"] = "."
                query[query == "?"] = "."
            reference_arrays.append(reference)
            query_arrays.append(query[is_covalent])
        # Match the combination of all identifying columns
        partner_indices.append(_find_matches(query_arrays, reference_arrays))
    indices_1, indices_2 = partner_indices

    # Bonds without counterpart in 'atom_site' are dropped:
    # 'atom_site' might already be reduced to a single model
    is_mapped = (indices_1 != -1) & (indices_2 != -1)
    indices_1 = indices_1[is_mapped]
    indices_2 = indices_2[is_mapped]

    # The second mask refers to the already filtered rows
    # -> the masks must be applied one after another
    type_ids = struct_conn["conn_type_id"].as_array()[is_covalent][is_mapped]
    # Every remaining type ID is a key of the dictionary,
    # as the dictionary keys were used for filtering
    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in type_ids]

    if atom_count is None:
        atom_count = atom_site.row_count
    return BondList(
        atom_count,
        np.stack([indices_1, indices_2, bond_types], axis=-1),
    )
675
+
676
+
677
def _find_matches(query_arrays, reference_arrays):
    """
    Find for each row of the `query_arrays` the index of the row in the
    `reference_arrays`, where the values of all columns are equal.

    Rows without match get the index -1.
    The implementation is chosen based on the number of
    query-reference pairs.
    """
    n_pairs = query_arrays[0].shape[0] * reference_arrays[0].shape[0]
    if n_pairs > FIND_MATCHES_SWITCH_THRESHOLD:
        return _find_matches_by_dict(query_arrays, reference_arrays)
    return _find_matches_by_dense_array(query_arrays, reference_arrays)
691
+
692
+
693
+ def _find_matches_by_dense_array(query_arrays, reference_arrays):
694
+ match_masks_for_all_columns = np.stack(
695
+ [
696
+ query[:, np.newaxis] == reference[np.newaxis, :]
697
+ for query, reference in zip(query_arrays, reference_arrays)
698
+ ],
699
+ axis=-1,
700
+ )
701
+ match_masks = np.all(match_masks_for_all_columns, axis=-1)
702
+ query_matches, reference_matches = np.where(match_masks)
703
+
704
+ # Duplicate matches indicate that an atom from the query cannot
705
+ # be uniquely matched to an atom in the reference
706
+ unique_query_matches, counts = np.unique(query_matches, return_counts=True)
707
+ if np.any(counts > 1):
708
+ ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
709
+ raise InvalidFileError(
710
+ f"The covalent bond in the 'struct_conn' category at index "
711
+ f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
712
+ f"the 'atom_site' category"
713
+ )
714
+
715
+ # -1 indicates that no match was found in the reference
716
+ match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
717
+ match_indices[query_matches] = reference_matches
718
+ return match_indices
719
+
720
+
721
+ def _find_matches_by_dict(query_arrays, reference_arrays):
722
+ # Convert reference arrays to a dictionary for O(1) lookups
723
+ reference_dict = {}
724
+ ambiguous_keys = set()
725
+ for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
726
+ ref_key = tuple(ref_row)
727
+ if ref_key in reference_dict:
728
+ ambiguous_keys.add(ref_key)
729
+ continue
730
+ reference_dict[ref_key] = ref_idx
731
+
732
+ match_indices = []
733
+ for query_idx, query_row in enumerate(zip(*query_arrays)):
734
+ query_key = tuple(query_row)
735
+ occurrence = reference_dict.get(query_key)
736
+
737
+ if occurrence is None:
738
+ # -1 indicates that no match was found in the reference
739
+ match_indices.append(-1)
740
+ elif query_key in ambiguous_keys:
741
+ # The query cannot be uniquely matched to an atom in the reference
742
+ raise InvalidFileError(
743
+ f"The covalent bond in the 'struct_conn' category at index "
744
+ f"{query_idx} cannot be unambiguously assigned to atoms in "
745
+ f"the 'atom_site' category"
746
+ )
747
+ else:
748
+ match_indices.append(occurrence)
749
+
750
+ return np.array(match_indices)
751
+
752
+
753
+ def _get_struct_conn_col_name(col_name, partner):
754
+ """
755
+ For a column name in ``atom_site`` get the corresponding column name
756
+ in ``struct_conn``.
757
+ """
758
+ if col_name == "label_alt_id":
759
+ return f"pdbx_ptnr{partner}_label_alt_id"
760
+ elif col_name.startswith("pdbx_"):
761
+ # Move 'pdbx_' to front
762
+ return f"pdbx_ptnr{partner}_{col_name[5:]}"
763
+ else:
764
+ return f"ptnr{partner}_{col_name}"
765
+
766
+
767
+ def _filter_altloc(array, atom_site, altloc):
768
+ """
769
+ Filter the given :class:`AtomArray` and ``atom_site`` category to the rows
770
+ specified by the given *altloc* identifier.
771
+ """
772
+ altloc_ids = atom_site.get("label_alt_id")
773
+ occupancy = atom_site.get("occupancy")
774
+
775
+ if altloc == "all":
776
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
777
+ return array, atom_site
778
+ elif altloc_ids is None or (altloc_ids.mask.array != MaskValue.PRESENT).all():
779
+ # No altlocs in atom_site category
780
+ return array, atom_site
781
+ elif altloc == "occupancy" and occupancy is not None:
782
+ mask = filter_highest_occupancy_altloc(
783
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
784
+ )
785
+ return array[..., mask], _filter(atom_site, mask)
786
+ # 'first' is also fallback if file has no occupancy information
787
+ elif altloc == "first":
788
+ mask = filter_first_altloc(array, altloc_ids.as_array(str))
789
+ return array[..., mask], _filter(atom_site, mask)
790
+ else:
791
+ raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
792
+
793
+
794
def _filter_model(atom_site, model):
    """
    Select the rows of the ``atom_site`` category that belong to the
    given model.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category containing all models.
    model : int
        The model to be selected.
        The first model in the category has the number 1.

    Returns
    -------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category containing only the selected model.
    """
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    # Row index of the first appearance of each model number
    _, first_rows = np.unique(model_numbers, return_index=True)
    # The row count serves as exclusive stop of the last model
    boundaries = np.append(np.sort(first_rows), [atom_site.row_count])
    # Model numbers start at 1, whereas the boundaries are indexed from 0
    start, stop = boundaries[model - 1], boundaries[model]
    return _filter(atom_site, slice(start, stop))
820
+
821
+
822
+ def _get_box(block):
823
+ cell = block.get("cell")
824
+ if cell is None:
825
+ return None
826
+ try:
827
+ len_a, len_b, len_c = [
828
+ float(cell[length].as_item())
829
+ for length in ["length_a", "length_b", "length_c"]
830
+ ]
831
+ alpha, beta, gamma = [
832
+ np.deg2rad(float(cell[angle].as_item()))
833
+ for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
834
+ ]
835
+ except ValueError:
836
+ # 'cell_dict' has no proper unit cell values, e.g. '?'
837
+ return None
838
+ return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
839
+
840
+
841
def set_structure(
    pdbx_file,
    array,
    data_block=None,
    include_bonds=False,
    extra_fields=None,
):
    """
    Set the ``atom_site`` category with atom information from an
    :class:`AtomArray` or :class:`AtomArrayStack`.

    This will save the coordinates, the mandatory annotation categories
    and the optional annotation categories
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of continuous
    numbering.
    Furthermore, inter-residue bonds will be written into the
    ``struct_conn`` category.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray or AtomArrayStack
        The structure to be written. If a stack is given, each array in
        the stack will be in a separate model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
        If the file is empty, a new data block will be created.
    include_bonds : bool, optional
        If set to true and `array` has associated ``bonds`` , the
        intra-residue bonds will be written into the ``chem_comp_bond``
        category.
        Inter-residue bonds will be written into the ``struct_conn``
        independent of this parameter.
    extra_fields : list of str, optional
        List of additional fields from the ``atom_site`` category
        that should be written into the file.
        By default, no additional fields are written.

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`, or if a name in `extra_fields` clashes
        with a standard annotation or an already written column.

    Notes
    -----
    In some cases, the written inter-residue bonds cannot be read again
    due to ambiguity to which atoms the bond refers.
    This is the case, when two equal residues in the same chain have
    the same (or a masked) `res_id`.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile()
    >>> set_structure(file, atom_array)
    >>> file.write(os.path.join(path_to_directory, "structure.cif"))
    """
    _check_non_empty(array)

    # 'None' means "no extra fields":
    # This avoids a mutable object as default argument
    if extra_fields is None:
        extra_fields = []

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()
    Column = Category.subcomponent_class()

    # Fill PDBx columns from information
    # in structures' attribute arrays as good as possible
    atom_site = Category()
    atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM")
    atom_site["type_symbol"] = np.copy(array.element)
    atom_site["label_atom_id"] = np.copy(array.atom_name)
    atom_site["label_alt_id"] = Column(
        # AtomArrays do not store altloc atoms
        np.full(array.array_length(), "."),
        np.full(array.array_length(), MaskValue.INAPPLICABLE),
    )
    atom_site["label_comp_id"] = np.copy(array.res_name)
    atom_site["label_asym_id"] = np.copy(array.chain_id)
    atom_site["label_entity_id"] = (
        np.copy(array.label_entity_id)
        if "label_entity_id" in array.get_annotation_categories()
        else _determine_entity_id(array.chain_id)
    )
    atom_site["label_seq_id"] = np.copy(array.res_id)
    atom_site["pdbx_PDB_ins_code"] = Column(
        np.copy(array.ins_code),
        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT),
    )
    # The 'auth_xxx' columns get the same values as the 'label_xxx' columns
    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
    atom_site["auth_atom_id"] = atom_site["label_atom_id"]

    annot_categories = array.get_annotation_categories()
    if "atom_id" in annot_categories:
        atom_site["id"] = np.copy(array.atom_id)
    if "b_factor" in annot_categories:
        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
    if "occupancy" in annot_categories:
        atom_site["occupancy"] = np.copy(array.occupancy)
    if "charge" in annot_categories:
        # A charge of 0 is written as missing value
        atom_site["pdbx_formal_charge"] = Column(
            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
        )

    # Handle all remaining custom fields
    if len(extra_fields) > 0:
        # ... check to avoid clashes with standard annotations
        _standard_annotations = [
            "hetero",
            "element",
            "atom_name",
            "res_name",
            "chain_id",
            "res_id",
            "ins_code",
            "atom_id",
            "b_factor",
            "occupancy",
            "charge",
        ]
        _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations

        for annot in extra_fields:
            if annot in _reserved_annotation_names:
                raise ValueError(
                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
                    "Please choose another name."
                )
            atom_site[annot] = np.copy(array.get_annotation(annot))

    if array.bonds is not None:
        struct_conn = _set_inter_residue_bonds(array, atom_site)
        if struct_conn is not None:
            block["struct_conn"] = struct_conn
        if include_bonds:
            chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
            if chem_comp_bond is not None:
                block["chem_comp_bond"] = chem_comp_bond

    # In case of a single model handle each coordinate
    # simply like a flattened array
    if isinstance(array, AtomArray) or (
        isinstance(array, AtomArrayStack) and array.stack_depth() == 1
    ):
        # 'ravel' flattens coord without copy
        # in case of stack with stack_depth = 1
        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
        atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32)
    # In case of multiple models repeat annotations
    # and use model specific coordinates
    else:
        atom_site = _repeat(atom_site, array.stack_depth())
        coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3))
        atom_site["Cartn_x"] = np.copy(coord[:, 0])
        atom_site["Cartn_y"] = np.copy(coord[:, 1])
        atom_site["Cartn_z"] = np.copy(coord[:, 2])
        atom_site["pdbx_PDB_model_num"] = np.repeat(
            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
            repeats=array.array_length(),
        )
    if "atom_id" not in annot_categories:
        # Count from 1
        atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1)
    block["atom_site"] = atom_site

    # Write box into file
    if array.box is not None:
        # PDBx files can only store one box for all models
        # -> Use first box
        if array.box.ndim == 3:
            box = array.box[0]
        else:
            box = array.box
        len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
        cell = Category()
        cell["length_a"] = len_a
        cell["length_b"] = len_b
        cell["length_c"] = len_c
        cell["angle_alpha"] = np.rad2deg(alpha)
        cell["angle_beta"] = np.rad2deg(beta)
        cell["angle_gamma"] = np.rad2deg(gamma)
        block["cell"] = cell
1028
+
1029
+
1030
def _check_non_empty(array):
    """
    Ensure that the given structure contains at least one atom
    (and at least one model in case of a stack).

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
1042
+
1043
+
1044
def _get_or_create_block(pdbx_component, block_name):
    """
    Get the block to write into.

    If `pdbx_component` is a file, the block with the given name is
    returned and created beforehand, if it does not exist yet.
    If `block_name` is ``None``, the first block of the file is used,
    or the name ``'structure'``, if the file is empty.
    Any other component is taken as block and returned unchanged.
    """
    Block = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        block_name = (
            next(iter(pdbx_component.keys()))
            if len(pdbx_component) > 0
            # File is empty -> invent a new block name
            else "structure"
        )

    if block_name not in pdbx_component:
        pdbx_component[block_name] = Block()
    return pdbx_component[block_name]
1062
+
1063
+
1064
+ def _determine_entity_id(chain_id):
1065
+ entity_id = np.zeros(len(chain_id), dtype=int)
1066
+ # Dictionary that translates chain_id to entity_id
1067
+ id_translation = {}
1068
+ id = 1
1069
+ for i in range(len(chain_id)):
1070
+ try:
1071
+ entity_id[i] = id_translation[chain_id[i]]
1072
+ except KeyError:
1073
+ # chain_id is not in dictionary -> new entry
1074
+ id_translation[chain_id[i]] = id
1075
+ entity_id[i] = id_translation[chain_id[i]]
1076
+ id += 1
1077
+ return entity_id
1078
+
1079
+
1080
def _repeat(category, repetitions):
    """
    Create a category of the same type, where the values (and masks) of
    each column are the values of the input column, tiled `repetitions`
    times.
    """
    Category = type(category)
    Column = Category.subcomponent_class()
    Data = Column.subcomponent_class()

    repeated_columns = {}
    for key, column in category.items():
        if isinstance(column, BinaryCIFColumn):
            encoding = column.data.encoding
            # Optimization: Tiling does not change the set of unique strings
            # -> Compute the unique strings from the shorter original array
            if isinstance(encoding[0], StringArrayEncoding):
                encoding[0].strings = np.unique(column.data.array)
            data = Data(np.tile(column.data.array, repetitions), encoding)
        else:
            data = Data(np.tile(column.data.array, repetitions))

        if column.mask is None:
            mask = None
        else:
            mask = Data(np.tile(column.mask.array, repetitions))
        repeated_columns[key] = Column(data, mask)
    return Category(repeated_columns)
1104
+
1105
+
1106
def _set_intra_residue_bonds(array, atom_site):
    """
    Build the ``chem_comp_bond`` category from the intra-residue bonds
    of the structure.

    ``atom_site`` is only required to get the fitting :class:`Category`
    type (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).
    ``None`` is returned, if the structure has no intra-residue bonds.

    Raises
    ------
    BadStructureError
        If an atom has an empty residue name or an empty atom name.
    """
    for annot_name, description in (
        ("res_name", "residue name"),
        ("atom_name", "atom name"),
    ):
        if (getattr(array, annot_name) == "").any():
            raise BadStructureError(
                f"Structure contains atoms with empty {description}, "
                "but it is required to write intra-residue bonds"
            )

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "intra")
    if len(bond_array) == 0:
        return None

    bond_types = bond_array[:, 2]
    any_mask = bond_types == BondType.ANY
    value_order = np.zeros(len(bond_array), dtype="U4")
    aromatic_flag = np.zeros(len(bond_array), dtype="U1")
    # ANY bonds are masked in the written columns
    # -> their values can stay empty
    for i in np.where(~any_mask)[0]:
        value_order[i], aromatic_flag[i] = COMP_BOND_TYPE_TO_ORDER[bond_types[i]]

    # Both atoms of an intra-residue bond are part of the same residue
    # -> take the residue name from the first atom
    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    comp_id = array.res_name[first_atoms]
    atom_id_1 = array.atom_name[first_atoms]
    atom_id_2 = array.atom_name[second_atoms]
    # A residue that appears multiple times in the structure gives
    # repeated name combinations -> keep only the first one of each
    _, unique_indices = np.unique(
        np.stack([comp_id, atom_id_1, atom_id_2], axis=-1), axis=0, return_index=True
    )
    unique_indices.sort()
    n_bonds = len(unique_indices)

    def maskable_column(values):
        # The value of an ANY bond is marked as missing
        return Column(
            values[unique_indices],
            np.where(any_mask[unique_indices], MaskValue.MISSING, MaskValue.PRESENT),
        )

    chem_comp_bond = Category()
    chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32)
    chem_comp_bond["comp_id"] = comp_id[unique_indices]
    chem_comp_bond["atom_id_1"] = atom_id_1[unique_indices]
    chem_comp_bond["atom_id_2"] = atom_id_2[unique_indices]
    chem_comp_bond["value_order"] = maskable_column(value_order)
    chem_comp_bond["pdbx_aromatic_flag"] = maskable_column(aromatic_flag)
    # BondList does not contain stereo information
    # -> all values are missing
    chem_comp_bond["pdbx_stereo_config"] = Column(
        np.zeros(n_bonds, dtype="U1"),
        np.full(n_bonds, MaskValue.MISSING),
    )
    return chem_comp_bond
1174
+
1175
+
1176
def _set_inter_residue_bonds(array, atom_site):
    """
    Build the ``struct_conn`` category from the inter-residue bonds of
    the structure.

    Each bond partner is described by the values of the identifying
    ``atom_site`` columns for the respective atom.
    ``None`` is returned, if no bond remains to be written.
    """
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "pdbx_PDB_ins_code",
    ]

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "inter")
    if len(bond_array) == 0:
        return None

    # 'Standard' links, i.e. backbone bonds between adjacent canonical
    # nucleotide/amino acid residues, are not written
    bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
    if len(bond_array) == 0:
        return None

    bond_types = bond_array[:, 2]
    struct_conn = Category()
    struct_conn["id"] = np.arange(1, len(bond_array) + 1)
    struct_conn["conn_type_id"] = [
        PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_types
    ]
    struct_conn["pdbx_value_order"] = Column(
        np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_types]),
        # The order of ANY and COORDINATION bonds is marked as missing
        np.where(
            np.isin(bond_types, (BondType.ANY, BondType.COORDINATION)),
            MaskValue.MISSING,
            MaskValue.PRESENT,
        ),
    )
    # Write each identifying annotation for both bond partners
    for col_name in COLUMNS:
        annot = atom_site[col_name].as_array()
        for partner in (1, 2):
            struct_conn[_get_struct_conn_col_name(col_name, partner)] = annot[
                bond_array[:, partner - 1]
            ]
    return struct_conn
1227
+
1228
+
1229
def _filter_bonds(array, connection):
    """
    Get the bond array of `array.bonds`, restricted to either the
    intra-residue (``connection="intra"``) or the inter-residue
    (``connection="inter"``) bonds.

    Any other `connection` value raises a :class:`ValueError`.
    """
    all_bonds = array.bonds.as_array()
    # A single 'get_residue_starts_for()' call covers the first and the
    # second atom of each bond, which saves computation time
    partner_indices = all_bonds[:, :2].flatten()
    residue_starts = get_residue_starts_for(array, partner_indices).reshape(-1, 2)
    # Both partners are in the same residue, if their residue starts agree
    is_same_residue = residue_starts[:, 0] == residue_starts[:, 1]

    if connection == "intra":
        return all_bonds[is_same_residue]
    if connection == "inter":
        return all_bonds[~is_same_residue]
    raise ValueError("Invalid 'connection' option")
1246
+
1247
+
1248
def _filter_canonical_links(array, bond_array):
    """
    Find the 'standard' backbone links between adjacent canonical
    residues among the given bonds.

    A bond qualifies, if both bonded residues are listed in
    ``CANONICAL_RESIDUE_LIST``, the first atom is named ``C`` or ``O3'``,
    the second atom is named ``N`` or ``P`` and the residue of the second
    atom directly follows the residue of the first atom.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray
        The bonds to check.
        The first two columns contain the indices of the bonded atoms.

    Returns
    -------
    mask : ndarray, dtype=bool
        ``True`` for each bond in `bond_array` that is such a canonical
        link.
    """
    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(
        array, bond_array[:, :2].flatten()
    ).reshape(-1, 2)

    # Must be canonical residues
    is_canonical = (
        np.isin(array.res_name[first_atoms], CANONICAL_RESIDUE_LIST)
        & np.isin(array.res_name[second_atoms], CANONICAL_RESIDUE_LIST)
    )
    # Must be backbone bond
    is_backbone = (
        np.isin(array.atom_name[first_atoms], ("C", "O3'"))
        & np.isin(array.atom_name[second_atoms], ("N", "P"))
    )
    # Must connect adjacent residues.
    # The comparison is evaluated on its own on purpose: '&' binds tighter
    # than '==', so chaining it unparenthesized after the '&' terms would
    # bitwise-AND the index difference into the mask and accept every odd
    # residue distance instead of only a distance of 1
    is_adjacent = (residue_indices[:, 1] - residue_indices[:, 0]) == 1

    return is_canonical & is_backbone & is_adjacent
1267
+
1268
+
1269
def get_component(
    pdbx_file,
    data_block=None,
    use_ideal_coord=True,
    res_name=None,
    allow_missing_coord=False,
):
    """
    Read a chemical component as :class:`AtomArray` from the
    ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
    category in a file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    use_ideal_coord : bool, optional
        If true, the *ideal* coordinates are read from the file
        (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
        originating from computations.
        If set to false, alternative coordinates are read
        (``model_Cartn_<dim>`` fields).
        If the selected fields are absent or contain missing values,
        a warning is raised and the respective other fields are used
        as fallback.
    res_name : str, optional
        In rare cases the categories may contain rows for multiple
        components.
        In this case, the component with the given residue name is
        read.
        By default, all rows would be read in this case.
    allow_missing_coord : bool, optional
        Whether to allow missing coordinate values in components.
        If ``True``, these will be represented as ``nan`` values.
        If ``False``, a ``ValueError`` is raised when missing coordinates
        are encountered.
        This only applies to the fallback coordinates, as missing values
        in the selected coordinate fields always trigger the fallback.

    Returns
    -------
    array : AtomArray
        The parsed chemical component.

    Raises
    ------
    InvalidFileError
        If the ``chem_comp_atom`` category is missing.
    KeyError
        If `res_name` is given, but no row has this residue name.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(
    ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
    ... )
    >>> comp = get_component(file)
    >>> print(comp)
    HET 0 TYR N N 1.320 0.952 1.428
    HET 0 TYR CA C -0.018 0.429 1.734
    HET 0 TYR C C -0.103 0.094 3.201
    HET 0 TYR O O 0.886 -0.254 3.799
    HET 0 TYR CB C -0.274 -0.831 0.907
    HET 0 TYR CG C -0.189 -0.496 -0.559
    HET 0 TYR CD1 C 1.022 -0.589 -1.219
    HET 0 TYR CD2 C -1.324 -0.102 -1.244
    HET 0 TYR CE1 C 1.103 -0.282 -2.563
    HET 0 TYR CE2 C -1.247 0.210 -2.587
    HET 0 TYR CZ C -0.032 0.118 -3.252
    HET 0 TYR OH O 0.044 0.420 -4.574
    HET 0 TYR OXT O -1.279 0.184 3.842
    HET 0 TYR H H 1.977 0.225 1.669
    HET 0 TYR H2 H 1.365 1.063 0.426
    HET 0 TYR HA H -0.767 1.183 1.489
    HET 0 TYR HB2 H 0.473 -1.585 1.152
    HET 0 TYR HB3 H -1.268 -1.219 1.134
    HET 0 TYR HD1 H 1.905 -0.902 -0.683
    HET 0 TYR HD2 H -2.269 -0.031 -0.727
    HET 0 TYR HE1 H 2.049 -0.354 -3.078
    HET 0 TYR HE2 H -2.132 0.523 -3.121
    HET 0 TYR HH H -0.123 -0.399 -5.059
    HET 0 TYR HXT H -1.333 -0.030 4.784
    """
    block = _get_block(pdbx_file, data_block)

    try:
        atom_category = block["chem_comp_atom"]
    except KeyError:
        raise InvalidFileError("Missing 'chem_comp_atom' category in file")
    if res_name is not None:
        # Keep only the rows of the requested component
        is_requested = atom_category["comp_id"].as_array() == res_name
        atom_category = _filter(atom_category, is_requested)
        if atom_category.row_count == 0:
            raise KeyError(
                f"No rows with residue name '{res_name}' found in "
                f"'chem_comp_atom' category"
            )

    array = AtomArray(atom_category.row_count)

    array.set_annotation("hetero", np.full(len(atom_category["comp_id"]), True))
    array.set_annotation("res_name", atom_category["comp_id"].as_array(str))
    array.set_annotation("atom_name", atom_category["atom_id"].as_array(str))
    array.set_annotation("element", atom_category["type_symbol"].as_array(str))
    array.set_annotation("charge", atom_category["charge"].as_array(int, 0))

    dimensions = ("x", "y", "z")
    ideal_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in dimensions]
    model_fields = [f"model_Cartn_{dim}" for dim in dimensions]
    if use_ideal_coord:
        preferred_fields, fallback_fields = ideal_fields, model_fields
    else:
        preferred_fields, fallback_fields = model_fields, ideal_fields

    try:
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in preferred_fields]
        )
    except (KeyError, ValueError) as err:
        # Any other exception type is not handled here and propagates
        if isinstance(err, KeyError):
            key = err.args[0]
            warnings.warn(
                f"Attribute '{key}' not found within 'chem_comp_atom' category. "
                f"The fallback coordinates will be used instead",
                UserWarning,
            )
        else:
            warnings.warn(
                "The coordinates are missing for some atoms. "
                "The fallback coordinates will be used instead",
                UserWarning,
            )
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in fallback_fields],
            allow_missing=allow_missing_coord,
        )

    try:
        bond_category = block["chem_comp_bond"]
        if res_name is not None:
            bond_category = _filter(
                bond_category, bond_category["comp_id"].as_array() == res_name
            )
    except KeyError:
        warnings.warn(
            "Category 'chem_comp_bond' not found. No bonds will be parsed",
            UserWarning,
        )
        # Without bond information the component is returned as it is
        return array

    atom_names = array.atom_name
    bonds = BondList(array.array_length())
    bond_rows = zip(
        bond_category["atom_id_1"].as_array(str),
        bond_category["atom_id_2"].as_array(str),
        bond_category["value_order"].as_array(str),
        bond_category["pdbx_aromatic_flag"].as_array(str),
    )
    for name_1, name_2, order, aromatic_flag in bond_rows:
        # The bond partners are referenced by atom name
        # -> use the first atom with a matching name
        index_1 = np.where(atom_names == name_1)[0][0]
        index_2 = np.where(atom_names == name_2)[0][0]
        bonds.add_bond(index_1, index_2, COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag])
    array.bonds = bonds

    return array
1428
+
1429
+
1430
def _parse_component_coordinates(coord_columns, allow_missing=False):
    """
    Combine the given coordinate columns into a single ``float32``
    coordinate array with one row per atom.

    A column counts as incomplete, if it has a mask with at least one
    truthy element.
    For an incomplete column a :class:`ValueError` is raised, unless
    `allow_missing` is true.
    In the latter case a warning is raised instead and the masked values
    are set to ``nan``.
    """
    n_atoms = len(coord_columns[0])
    coord = np.zeros((n_atoms, 3), dtype=np.float32)
    for dim, column in enumerate(coord_columns):
        has_missing = column.mask is not None and column.mask.array.any()
        if has_missing and not allow_missing:
            raise ValueError(
                "Missing coordinates for some atoms",
            )
        if has_missing:
            warnings.warn(
                "Missing coordinates for some atoms. Those will be set to nan",
                UserWarning,
            )
        coord[:, dim] = column.as_array(np.float32, masked_value=np.nan)
    return coord
1445
+
1446
+
1447
def set_component(pdbx_file, array, data_block=None):
    """
    Write a chemical component given as :class:`AtomArray` into the
    ``chem_comp_atom`` and, if bonds are available, the
    ``chem_comp_bond`` category.

    This will save the coordinates, the mandatory annotation categories
    and the optional ``charge`` category as well as an associated
    :class:`BondList`, if available.
    The same coordinates are written into the ``model_Cartn_<dim>`` and
    the ``pdbx_model_Cartn_<dim>_ideal`` fields.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray
        The chemical component to be written.
        Must contain only a single residue.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the file is empty, a new data block will be created.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Raises
    ------
    BadStructureError
        If `array` comprises more than one residue.
    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()

    if get_residue_count(array) > 1:
        raise BadStructureError("The input atom array must comprise only one residue")
    res_name = array.res_name[0]
    n_atoms = array.array_length()

    if "charge" in array.get_annotation_categories():
        charge = array.charge.astype("U2")
    else:
        # Without a 'charge' annotation the charge is written as unknown
        charge = np.full(n_atoms, "?", dtype="U2")

    dimensions = ("x", "y", "z")
    atom_cat = Category()
    atom_cat["comp_id"] = np.full(n_atoms, res_name)
    atom_cat["atom_id"] = np.copy(array.atom_name)
    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
    atom_cat["type_symbol"] = np.copy(array.element)
    atom_cat["charge"] = charge
    for axis, dim in enumerate(dimensions):
        atom_cat[f"model_Cartn_{dim}"] = np.copy(array.coord[:, axis])
    # The ideal coordinates reuse the model coordinates
    for dim in dimensions:
        atom_cat[f"pdbx_model_Cartn_{dim}_ideal"] = atom_cat[f"model_Cartn_{dim}"]
    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
    atom_cat["pdbx_ordinal"] = np.arange(1, n_atoms + 1).astype(str)
    block["chem_comp_atom"] = atom_cat

    bond_list = array.bonds
    if bond_list is None or bond_list.get_bond_count() == 0:
        # No bonds available -> 'chem_comp_bond' is not written
        return

    bond_array = bond_list.as_array()
    n_bonds = len(bond_array)
    # Each bond type maps to a pair of order and aromaticity flag
    order_and_aromatic = [
        COMP_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_array[:, 2]
    ]

    bond_cat = Category()
    bond_cat["comp_id"] = np.full(n_bonds, res_name)
    bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]]
    bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]]
    bond_cat["value_order"] = np.array([order for order, _ in order_and_aromatic])
    bond_cat["pdbx_aromatic_flag"] = np.array(
        [aromatic for _, aromatic in order_and_aromatic]
    )
    bond_cat["pdbx_ordinal"] = np.arange(1, n_bonds + 1).astype(str)
    block["chem_comp_bond"] = bond_cat
1521
+
1522
+
1523
def list_assemblies(pdbx_file, data_block=None):
    """
    List the biological assemblies that are available for the structure
    in the given file.

    The information is taken from the ``pdbx_struct_assembly`` category,
    so this category must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    assemblies : dict of str -> str
        A dictionary that maps an assembly ID to a description of the
        corresponding assembly.

    Raises
    ------
    InvalidFileError
        If the file has no ``pdbx_struct_assembly`` category.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly_ids = list_assemblies(file)
    >>> for key, val in assembly_ids.items():
    ...     print(f"'{key}' : '{val}'")
    '1' : 'complete icosahedral assembly'
    '2' : 'icosahedral asymmetric unit'
    '3' : 'icosahedral pentamer'
    '4' : 'icosahedral 23 hexamer'
    '5' : 'icosahedral asymmetric unit, std point frame'
    '6' : 'crystal asymmetric unit, crystal frame'
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_category = block["pdbx_struct_assembly"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly' category")

    assembly_ids = assembly_category["id"].as_array(str)
    descriptions = assembly_category["details"].as_array(str)
    return dict(zip(assembly_ids, descriptions))
1577
+
1578
+
1579
def get_assembly(
    pdbx_file,
    assembly_id=None,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build the given biological assembly.

    This function receives the data from the
    ``pdbx_struct_assembly_gen``, ``pdbx_struct_oper_list`` and
    ``atom_site`` categories in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        By default, the first assembly ID listed in the
        ``pdbx_struct_assembly_gen`` category is used.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the
        asymmetric unit in the assembly.

    Raises
    ------
    InvalidFileError
        If the ``pdbx_struct_assembly_gen`` or ``pdbx_struct_oper_list``
        category is missing.
    KeyError
        If the given `assembly_id` is not listed in the file.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly = get_assembly(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_gen_category = block["pdbx_struct_assembly_gen"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category")

    try:
        struct_oper_category = block["pdbx_struct_oper_list"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

    assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
    if assembly_id is None:
        assembly_id = assembly_ids[0]
    elif assembly_id not in assembly_ids:
        raise KeyError(f"File has no Assembly ID '{assembly_id}'")

    ### Calculate all possible transformations
    transformations = _get_transformations(struct_oper_category)

    ### Get structure according to additional parameters
    # The operations apply on asym IDs
    # -> 'label_asym_id' is required as annotation to select the correct atoms
    extra_fields = [] if extra_fields is None else extra_fields
    asym_id_requested = "label_asym_id" in extra_fields
    if asym_id_requested:
        extra_fields_and_asym = extra_fields
    else:
        extra_fields_and_asym = extra_fields + ["label_asym_id"]
    structure = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        extra_fields_and_asym,
        use_author_fields,
        include_bonds,
    )

    ### Get transformations and apply them to the affected asym IDs
    chain_ops = defaultdict(list)
    assembly_gen_rows = zip(
        assembly_ids,
        assembly_gen_category["oper_expression"].as_array(str),
        assembly_gen_category["asym_id_list"].as_array(str),
    )
    for row_assembly_id, op_expr, asym_id_expr in assembly_gen_rows:
        # Only the rows for the selected assembly ID are relevant;
        # the ID was already asserted to be present
        if row_assembly_id != assembly_id:
            continue
        operations = _parse_operation_expression(op_expr)
        for chain_id in asym_id_expr.split(","):
            chain_ops[chain_id].extend(operations)

    # Create the transformed copies separately for each chain
    # and merge them afterwards
    sub_assemblies = [
        _apply_transformations(
            structure[..., structure.label_asym_id == asym_id],
            transformations,
            op_list,
        )
        for asym_id, op_list in chain_ops.items()
    ]
    assembly = concatenate(sub_assemblies)

    # Sort AtomArray or AtomArrayStack by 'sym_id'
    max_sym_id = assembly.sym_id.max()
    assembly = concatenate(
        [assembly[..., assembly.sym_id == sym_id] for sym_id in range(max_sym_id + 1)]
    )

    # Remove 'label_asym_id', if it was not included in the original
    # user-supplied 'extra_fields'
    if not asym_id_requested:
        assembly.del_annotation("label_asym_id")

    return assembly
1750
+
1751
+
1752
def _apply_transformations(structure, transformation_dict, operations):
    """
    Create a subassembly from `structure` that contains one transformed
    copy for each element in `operations`.

    Each operation is a sequence of operation IDs.
    The corresponding transformations from `transformation_dict` are
    applied one after another to the coordinates.
    The copies are enumerated by the ``sym_id`` annotation.
    """
    n_copies = len(operations)
    # Additional first dimension for 'repeat()'
    assembly_coord = np.zeros((n_copies,) + structure.coord.shape)
    for copy_index, op_steps in enumerate(operations):
        # Start each copy from the untransformed coordinates
        # and run through all steps of the operation expression
        transformed = structure.coord
        for step_id in op_steps:
            transformed = transformation_dict[step_id].apply(transformed)
        assembly_coord[copy_index] = transformed

    assembly = repeat(structure, assembly_coord)
    sym_ids = np.repeat(np.arange(n_copies), structure.array_length())
    assembly.set_annotation("sym_id", sym_ids)
    return assembly
1773
+
1774
+
1775
def _get_transformations(struct_oper):
    """
    Get the affine transformation for each operation ID in
    ``pdbx_struct_oper_list``.

    Parameters
    ----------
    struct_oper
        The ``pdbx_struct_oper_list`` category.
        The ``id``, ``matrix[i][j]`` and ``vector[i]`` columns
        (with ``i`` and ``j`` running from 1 to 3) are read.

    Returns
    -------
    transformation_dict : dict
        Maps each operation ID to the :class:`AffineTransformation`
        built from the rotation matrix and translation vector in the
        corresponding row.
    """
    oper_ids = struct_oper["id"].as_array(str)
    if len(oper_ids) == 0:
        # Nothing to convert -> the matrix/vector columns are not touched
        return {}

    # Convert every column only once for all operations, instead of
    # converting the entire column again for each single operation
    # -> shape (n_operations, 3, 3), indexed as [operation, i, j]
    rotation_matrices = np.stack(
        [
            np.stack(
                [struct_oper[f"matrix[{i}][{j}]"].as_array(float) for j in (1, 2, 3)],
                axis=-1,
            )
            for i in (1, 2, 3)
        ],
        axis=-2,
    )
    # -> shape (n_operations, 3)
    translation_vectors = np.stack(
        [struct_oper[f"vector[{i}]"].as_array(float) for i in (1, 2, 3)],
        axis=-1,
    )

    return {
        oper_id: AffineTransformation(np.zeros(3), rotation_matrix, translation_vector)
        for oper_id, rotation_matrix, translation_vector in zip(
            oper_ids, rotation_matrices, translation_vectors
        )
    }
1797
+
1798
+
1799
def _parse_operation_expression(expression):
    """
    Resolve an ``oper_expression`` into the operations it describes.

    Each returned operation is a tuple of operation IDs in the order
    they are applied.
    If the expression consists of multiple parenthesized groups,
    the cartesian product of the groups is formed.
    """
    # The opening parenthesis delimits the groups, the closing parenthesis
    # carries no additional information and is dropped
    # example: '(X0)(1-10,21-25)' from 1a34
    groups = [group for group in expression.replace(")", "").split("(") if group]

    id_choices_per_step = []
    # Important: Operations are applied from right to left
    for group in reversed(groups):
        step_ids = []
        for token in group.split(","):
            if "-" not in token:
                # Single operation ID
                step_ids.append(token)
                continue
            # Range of operation IDs, they must be integers
            start, stop = token.split("-")
            step_ids.extend(str(number) for number in range(int(start), int(stop) + 1))
        id_choices_per_step.append(step_ids)

    # Cartesian product of operations
    return list(itertools.product(*id_choices_per_step))
1831
+
1832
+
1833
def _convert_string_to_sequence(string, stype):
    """
    Convert `string` into a sequence object based on the polymer type
    `stype`.

    A ``ProteinSequence`` is created, if `stype` is contained in
    ``_proteinseq_type_list`` and a ``NucleotideSequence``, if `stype` is
    contained in ``_nucleotideseq_type_list``.
    For types in ``_other_type_list`` ``None`` is returned.
    Any other type raises an :class:`InvalidFileError`.
    """
    # sequence may be stored as multiline string
    sequence_str = string.replace("\n", "")

    if stype in _proteinseq_type_list:
        return ProteinSequence(sequence_str)
    if stype in _nucleotideseq_type_list:
        # 'U' is replaced by 'T' before the sequence is created
        return NucleotideSequence(sequence_str.replace("U", "T"))
    if stype in _other_type_list:
        return None
    raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
1850
+
1851
+
1852
def get_unit_cell(
    pdbx_file,
    center=True,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build a structure model containing all symmetric copies of the structure
    within a single unit cell.

    This function receives the data from the ``symmetry`` and ``atom_site``
    categories in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    center : bool, optional
        If set to true, each symmetric copy will be moved inside the unit cell
        dimensions, if its centroid is outside.
        If set to false, the copies are created using the raw space group
        transformations, which may put them one unit cell length further away.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    unit_cell : AtomArray or AtomArrayStack
        The structure representing the unit cell.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of the
        asymmetric unit in the unit cell.

    Raises
    ------
    InvalidFileError
        If the ``symmetry.space_group_name_H-M`` field is missing.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> unit_cell = get_unit_cell(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        space_group = block["symmetry"]["space_group_name_H-M"].as_item()
    except KeyError:
        raise InvalidFileError("File has no 'symmetry.space_group_name_H-M' field")
    transforms = space_group_transforms(space_group)

    asym = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        extra_fields,
        use_author_fields,
        include_bonds,
    )
    box = asym.box

    # The space group transformations are applied on fractional coordinates
    asym_fraction = coord_to_fraction(asym.coord, box)
    copy_coords = []
    for transform in transforms:
        copy_fraction = transform.apply(asym_fraction)
        if center:
            # If the centroid is outside the box, move the copy inside the box
            copy_centroid = centroid(copy_fraction)
            shift = copy_centroid % 1 - copy_centroid
            copy_fraction += shift[..., np.newaxis, :]
        copy_coords.append(fraction_to_coord(copy_fraction, box))

    unit_cell = repeat(asym, np.stack(copy_coords, axis=0))
    sym_ids = np.repeat(np.arange(len(transforms)), asym.array_length())
    unit_cell.set_annotation("sym_id", sym_ids)
    return unit_cell
1984
+
1985
+
1986
def get_sse(pdbx_file, data_block=None, match_model=None):
    """
    Read the per-residue secondary structure of each chain from a PDBx file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
        The following categories are required:

        - ``entity_poly``
        - ``struct_conf`` (if alpha-helices are present)
        - ``struct_sheet_range`` (if beta-strands are present)
        - ``atom_site`` (if `match_model` is set)

    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    match_model : int, optional
        If a model number is given, the secondary structure elements are restricted
        to those residues that are resolved in the given model.
        This means secondary structure elements for residues that would not appear
        in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
        By default, all residues in the sequence are kept.

    Returns
    -------
    sse_dict : dict of str -> ndarray, dtype=str
        The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
        secondary structure of the respective chain.

        - ``"a"``: alpha-helix
        - ``"b"``: beta-strand
        - ``"c"``: coil or not an amino acid

        Each secondary structure element corresponds to the ``label_seq_id`` of the
        ``atom_site`` category.
        This means that the 0-th position of the array corresponds to the residue
        in ``atom_site`` with ``label_seq_id`` ``1``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
    >>> sse = get_sse(file, match_model=1)
    >>> print(sse)
    {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
           'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
           'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
           'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
           'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
           'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
           'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
          dtype='<U1')}

    If only secondary structure elements for resolved residues are requested, the length
    of the returned array matches the number of peptide residues in the structure.

    >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
    >>> print(len(get_sse(file, match_model=1)["A"]))
    128
    >>> atoms = get_structure(file, model=1)
    >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
    >>> print(get_residue_count(atoms))
    128
    """
    block = _get_block(pdbx_file, data_block)

    # Every sequence position starts out as coil ("c");
    # the annotated ranges overwrite this default below
    sse_dict = {}
    for chain_id, sequence in get_sequence(block).items():
        sse_dict[chain_id] = np.repeat("c", len(sequence))

    # Each of these categories lists residue ranges that get the paired symbol
    range_categories = (
        ("struct_conf", "a"),
        ("struct_sheet_range", "b"),
    )
    for category_name, sse_symbol in range_categories:
        if category_name not in block:
            # The file contains no ranges for this symbol
            continue
        category = block[category_name]
        chains = category["beg_auth_asym_id"].as_array(str)
        start_positions = category["beg_label_seq_id"].as_array(int)
        end_positions = category["end_label_seq_id"].as_array(int)

        # Mark the positions of each annotated range (helix or strand)
        for chain, start, end in zip(chains, start_positions, end_positions):
            # PDBx positions are 1-based with an inclusive end, which is
            # equivalent to the 0-based half-open slice [start - 1, end)
            sse_dict[chain][start - 1 : end] = sse_symbol

    if match_model is None:
        # No model to match against: report the entire sequence
        return sse_dict

    model_atom_site = _filter_model(block["atom_site"], match_model)
    chain_ids = model_atom_site["auth_asym_id"].as_array(str)
    res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
    # Masked ``label_seq_id`` values are read as -1:
    # such atoms belong to residues that are not part of a chain
    in_chain = res_ids != -1
    chain_ids = chain_ids[in_chain]
    res_ids = res_ids[in_chain]

    resolved_sse = {}
    for chain_id, sse in sse_dict.items():
        # One 0-based index per distinct (1-based) residue ID present in the model
        resolved_indices = np.unique(res_ids[chain_ids == chain_id]) - 1
        resolved_sse[chain_id] = sse[resolved_indices]
    return resolved_sse