biotite 0.41.1__cp310-cp310-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-310-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1597 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdbx"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann"
7
+ __all__ = [
8
+ "get_sequence",
9
+ "get_model_count",
10
+ "get_structure",
11
+ "set_structure",
12
+ "get_component",
13
+ "set_component",
14
+ "list_assemblies",
15
+ "get_assembly",
16
+ ]
17
+
18
+ import itertools
19
+ import warnings
20
+ import numpy as np
21
+ from ....file import InvalidFileError
22
+ from ....sequence.seqtypes import NucleotideSequence, ProteinSequence
23
+ from ...atoms import AtomArray, AtomArrayStack, repeat
24
+ from ...bonds import BondList, BondType, connect_via_residue_names
25
+ from ...box import unitcell_from_vectors, vectors_from_unitcell
26
+ from ...filter import filter_first_altloc, filter_highest_occupancy_altloc
27
+ from ...residues import get_residue_count, get_residue_starts_for
28
+ from ...error import BadStructureError
29
+ from ...util import matrix_rotate
30
+ from .legacy import PDBxFile
31
+ from .component import MaskValue
32
+ from .cif import CIFFile, CIFBlock
33
+ from .bcif import BinaryCIFFile, BinaryCIFBlock, BinaryCIFColumn
34
+ from .encoding import StringArrayEncoding
35
+
36
+
37
# Connection types of the `struct_conn` category that are treated as
# covalent bonds
PDBX_COVALENT_TYPES = [
    "covale",
    "covale_base",
    "covale_phosphate",
    "covale_sugar",
    "disulf",
    "modres",
    "modres_link",
    "metalc",
]
42
# Translation of 'struct_conn' bond order strings into 'BondType' members...
PDBX_BOND_ORDER_TO_TYPE = {
    "": BondType.ANY,
    "sing": BondType.SINGLE,
    "doub": BondType.DOUBLE,
    "trip": BondType.TRIPLE,
    "quad": BondType.QUADRUPLE,
}
# ...and the reverse direction
PDBX_BOND_TYPE_TO_ORDER = {
    # 'ANY' gets masked at a later point, the entry only exists to
    # prevent a KeyError during lookup
    BondType.ANY: "",
    BondType.SINGLE: "sing",
    BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip",
    BondType.QUADRUPLE: "quad",
    # Aromatic bond types map to the order string of their
    # non-aromatic counterpart
    BondType.AROMATIC_SINGLE: "sing",
    BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
}
# Translation of 'chem_comp_bond' (bond order, aromaticity flag) pairs
# into 'BondType' members...
COMP_BOND_ORDER_TO_TYPE = {
    ("SING", "N"): BondType.SINGLE,
    ("DOUB", "N"): BondType.DOUBLE,
    ("TRIP", "N"): BondType.TRIPLE,
    ("QUAD", "N"): BondType.QUADRUPLE,
    ("SING", "Y"): BondType.AROMATIC_SINGLE,
    ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
    ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
}
# ...and the reverse direction, derived by inverting the mapping above
COMP_BOND_TYPE_TO_ORDER = {
    comp_bond_type: order_and_flag
    for order_and_flag, comp_bond_type in COMP_BOND_ORDER_TO_TYPE.items()
}
76
+
77
# Polymer type names, grouped by the kind of polymer they describe:
# peptide chains, ...
_proteinseq_type_list = [
    "polypeptide(D)",
    "polypeptide(L)",
]
# ...nucleic acid chains...
_nucleotideseq_type_list = [
    "polydeoxyribonucleotide",
    "polyribonucleotide",
    "polydeoxyribonucleotide/polyribonucleotide hybrid",
]
# ...and the remaining polymer types
_other_type_list = [
    "cyclic-pseudo-peptide",
    "other",
    "peptide nucleic acid",
    "polysaccharide(D)",
    "polysaccharide(L)",
]
90
+
91
+
92
def _filter(category, index):
    """
    Create a copy of the given category that contains only the rows
    selected by `index`.

    The returned object has the same type as `category`.
    Each column is rebuilt from its indexed data array and, if the
    column has a mask, its indexed mask array.
    """
    # Derive the concrete classes from the input, so the function does
    # not need to know which category flavor it is working on
    category_cls = type(category)
    column_cls = category_cls.subcomponent_class()
    data_cls = column_cls.subcomponent_class()

    filtered_columns = {}
    for name, column in category.items():
        data = data_cls(column.data.array[index])
        if column.mask is not None:
            mask = data_cls(column.mask.array[index])
        else:
            mask = None
        filtered_columns[name] = column_cls(data, mask)
    return category_cls(filtered_columns)
111
+
112
+
113
def get_sequence(pdbx_file, data_block=None):
    """
    Read the protein and nucleotide sequences stored in the
    ``entity_poly.pdbx_seq_one_letter_code_can`` entry.

    Supported polymer types (``_entity_poly.type``) are:
    ``'polypeptide(D)'``, ``'polypeptide(L)'``,
    ``'polydeoxyribonucleotide'``, ``'polyribonucleotide'`` and
    ``'polydeoxyribonucleotide/polyribonucleotide hybrid'``.
    Uracil is converted to Thymine.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    sequences : list of Sequence
        The protein and nucleotide sequences for each entity
        (equivalent to chains in most cases).
        Entities for which the conversion yields ``None`` are omitted.
    """
    entity_poly = _get_block(pdbx_file, data_block)["entity_poly"]
    one_letter_codes = entity_poly["pdbx_seq_one_letter_code_can"] \
        .as_array(str)
    polymer_types = entity_poly["type"].as_array(str)

    # Lazily convert each entity and keep only successful conversions
    converted = (
        _convert_string_to_sequence(code, polymer_type)
        for code, polymer_type in zip(one_letter_codes, polymer_types)
    )
    return [sequence for sequence in converted if sequence is not None]
152
+
153
+
154
def get_model_count(pdbx_file, data_block=None):
    """
    Determine how many models the ``atom_site`` category of a PDBx
    file contains.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    model_count : int
        The number of models.
    """
    atom_site = _get_block(pdbx_file, data_block)["atom_site"]
    model_numbers = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    # There is one start position for each model
    model_starts = _get_model_starts(model_numbers)
    return len(model_starts)
178
+
179
+
180
def get_structure(pdbx_file, model=None, data_block=None, altloc="first",
                  extra_fields=None, use_author_fields=True,
                  include_bonds=False):
    """
    Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
    ``atom_site`` category in a :class:`PDBxFile`.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While the ``label_xxx`` fields can be used as official pointers
        to other categories in the :class:`PDBxFile`, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
        If the requested field is not available, the respective other
        field is taken as fallback.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    array : AtomArray or AtomArrayStack
        The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the ``atom_site`` category is missing or if all models are
        requested, but the models contain different numbers of atoms.
    ValueError
        If `model` is 0 or refers to a model that does not exist in the
        file, counted either from the first or from the last model.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
    >>> arr = get_structure(file, model=1)
    >>> print(len(arr))
    304

    """
    block = _get_block(pdbx_file, data_block)

    extra_fields = set() if extra_fields is None else set(extra_fields)

    atom_site = block.get("atom_site")
    if atom_site is None:
        raise InvalidFileError("Missing 'atom_site' category in file")

    models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    model_starts = _get_model_starts(models)
    model_count = len(model_starts)
    atom_count = len(models)

    if model is None:
        # For a stack, the annotations are derived from the first model
        model_atom_site = _filter_model(atom_site, model_starts, 1)
        # Any field of the category would work here to get the length
        model_length = model_atom_site.row_count

        # Check if each model has the same amount of atoms
        # If not, raise exception
        # This is done before the stack is allocated, as the coordinates
        # could not be reshaped into it otherwise
        if model_length * model_count != atom_count:
            raise InvalidFileError(
                "The models in the file have unequal "
                "amount of atoms, give an explicit model "
                "instead"
            )

        atoms = AtomArrayStack(model_count, model_length)
        stack_shape = (model_count, model_length)
        for dim, field in enumerate(("Cartn_x", "Cartn_y", "Cartn_z")):
            atoms.coord[:, :, dim] = atom_site[field].as_array(np.float32) \
                                     .reshape(stack_shape)

        box = _get_box(block)
        if box is not None:
            # Duplicate same box for each model
            atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)

    else:
        if model == 0:
            raise ValueError("The model index must not be 0")
        requested_model = model
        # Negative models mean model indexing starting from last model
        if model < 0:
            model = model_count + model + 1
        # The lower bound catches negative indices whose magnitude
        # exceeds the number of models: these would otherwise be
        # forwarded as non-positive model number
        if model < 1 or model > model_count:
            raise ValueError(
                f"The file has {model_count} models, "
                f"the given model {requested_model} does not exist"
            )

        model_atom_site = _filter_model(atom_site, model_starts, model)
        # Any field of the category would work here to get the length
        model_length = model_atom_site.row_count
        atoms = AtomArray(model_length)

        for dim, field in enumerate(("Cartn_x", "Cartn_y", "Cartn_z")):
            atoms.coord[:, dim] = model_atom_site[field].as_array(np.float32)

        atoms.box = _get_box(block)

    # The below part is the same for both, AtomArray and AtomArrayStack
    _fill_annotations(
        atoms, model_atom_site, extra_fields, use_author_fields
    )
    if include_bonds:
        if "chem_comp_bond" in block:
            try:
                custom_bond_dict = _parse_intra_residue_bonds(
                    block["chem_comp_bond"]
                )
            except KeyError:
                warnings.warn(
                    "The 'chem_comp_bond' category has missing columns, "
                    "falling back to using Chemical Component Dictionary",
                    UserWarning
                )
                custom_bond_dict = None
            bonds = connect_via_residue_names(
                atoms, custom_bond_dict=custom_bond_dict
            )
        else:
            bonds = connect_via_residue_names(atoms)
        if "struct_conn" in block:
            bonds = bonds.merge(_parse_inter_residue_bonds(
                model_atom_site, block["struct_conn"]
            ))
        atoms.bonds = bonds
    # The altloc filtering comes last, so that annotations and bonds
    # refer to the same, still unfiltered, atoms of 'model_atom_site'
    atoms = _filter_altloc(atoms, model_atom_site, altloc)

    return atoms
357
+
358
+
359
def _get_block(pdbx_component, block_name):
    """
    Resolve the given file or block object to the block to read from.

    Parameters
    ----------
    pdbx_component : PDBxFile or file object or CIFBlock or BinaryCIFBlock
        The object passed by the user.
        A deprecated ``PDBxFile`` is replaced by its ``cif_file``
        attribute first.
    block_name : str or None
        The name of the block to select from a file object.
        If ``None``, the ``block`` attribute of the file object is
        returned.
        Ignored if a block object is given.

    Returns
    -------
    block : CIFBlock or BinaryCIFBlock
        The selected block.
    """
    if isinstance(pdbx_component, PDBxFile):
        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
        pdbx_component = pdbx_component.cif_file

    # A block can be used as it is
    if isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
        return pdbx_component

    # Otherwise the block must be picked from the file
    if block_name is None:
        return pdbx_component.block
    return pdbx_component[block_name]
372
+
373
+
374
+ def _get_or_fallback(category, key, fallback_key):
375
+ """
376
+ Return column related to key in category if it exists,
377
+ otherwise try to get the column related to fallback key.
378
+ """
379
+ if key not in category:
380
+ warnings.warn(
381
+ f"Attribute '{key}' not found within 'atom_site' category. "
382
+ f"The fallback attribute '{fallback_key}' will be used instead",
383
+ UserWarning
384
+ )
385
+ try:
386
+ return category[fallback_key]
387
+ except KeyError as key_exc:
388
+ raise InvalidFileError(
389
+ f"Fallback attribute '{fallback_key}' not found within "
390
+ "'atom_site' category"
391
+ ) from key_exc
392
+ return category[key]
393
+
394
+
395
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
    """Fill atom_site annotations in atom array or atom array stack.

    The given `extra_fields` sequence is not modified.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        Atom array or stack which will be annotated.
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category with values for one model.
    extra_fields : list of str
        Entry names, that are additionally added as annotation arrays.
        The names ``atom_id``, ``b_factor``, ``occupancy`` and
        ``charge`` are mapped to their dedicated ``atom_site`` columns.
        Every other name is read from the equally named column as
        string array.
        ``None`` is treated like an empty list.
    use_author_fields : bool
        Define if alternate fields prefixed with ``auth_`` should be used
        instead of ``label_``.
    """
    # Work on a private copy: the recognized names are removed below and
    # the list object owned by the caller must not change as side effect
    remaining_fields = [] if extra_fields is None else list(extra_fields)

    prefix, alt_prefix = (
        ("auth", "label") if use_author_fields else ("label", "auth")
    )

    array.set_annotation(
        "chain_id",
        _get_or_fallback(
            atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
        ).as_array("U4")
    )
    array.set_annotation(
        "res_id",
        _get_or_fallback(
            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
        ).as_array(int, -1)
    )
    array.set_annotation(
        "ins_code",
        atom_site["pdbx_PDB_ins_code"].as_array("U1", "")
    )
    array.set_annotation(
        "res_name",
        _get_or_fallback(
            atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
        ).as_array("U5")
    )
    array.set_annotation(
        "hetero",
        atom_site["group_PDB"].as_array(str) == "HETATM"
    )
    array.set_annotation(
        "atom_name",
        _get_or_fallback(
            atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
        ).as_array("U6")
    )
    array.set_annotation(
        "element",
        atom_site["type_symbol"].as_array("U2")
    )

    if "atom_id" in remaining_fields:
        array.set_annotation(
            "atom_id",
            atom_site["id"].as_array(int)
        )
        remaining_fields.remove("atom_id")
    if "b_factor" in remaining_fields:
        array.set_annotation(
            "b_factor",
            atom_site["B_iso_or_equiv"].as_array(float)
        )
        remaining_fields.remove("b_factor")
    if "occupancy" in remaining_fields:
        array.set_annotation(
            "occupancy",
            atom_site["occupancy"].as_array(float)
        )
        remaining_fields.remove("occupancy")
    if "charge" in remaining_fields:
        array.set_annotation(
            "charge",
            atom_site["pdbx_formal_charge"].as_array(int, 0)
        )
        remaining_fields.remove("charge")

    # Handle all remaining custom fields
    for field in remaining_fields:
        array.set_annotation(
            field,
            atom_site[field].as_array(str)
        )
483
+
484
+
485
def _parse_intra_residue_bonds(chem_comp_bond):
    """
    Build a `custom_bond_dict`, as accepted by
    :func:`connect_via_residue_names()`, from the ``chem_comp_bond``
    category.

    The returned dictionary maps each residue name to a dictionary,
    which in turn maps a tuple of two atom names to the bond type.
    Order/aromaticity combinations that are not known are mapped to
    ``BondType.ANY``.
    """
    # The columns are accessed in this order, a missing one raises KeyError
    columns = [
        chem_comp_bond[name].as_array(str)
        for name in (
            "comp_id", "atom_id_1", "atom_id_2",
            "value_order", "pdbx_aromatic_flag",
        )
    ]

    custom_bond_dict = {}
    for res_name, atom_1, atom_2, order, aromatic_flag in zip(*columns):
        bonds_in_residue = custom_bond_dict.setdefault(res_name, {})
        # The order is compared case-insensitively
        bond_type = COMP_BOND_ORDER_TO_TYPE.get(
            (order.upper(), aromatic_flag), BondType.ANY
        )
        bonds_in_residue[atom_1.item(), atom_2.item()] = bond_type
    return custom_bond_dict
505
+
506
+
507
def _parse_inter_residue_bonds(atom_site, struct_conn):
    """
    Create inter-residue bonds by parsing the ``struct_conn`` category.
    The atom indices of each bond are found by matching the bond labels
    to the ``atom_site`` category.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category the bond partners are looked up in.
    struct_conn : CIFCategory or BinaryCIFCategory
        The ``struct_conn`` category describing the connections.

    Returns
    -------
    bonds : BondList
        The bonds whose both partners were found in `atom_site`.
        The atom count of the bond list is ``atom_site.row_count``.

    Notes
    -----
    Only rows whose ``conn_type_id`` is contained in
    ``PDBX_COVALENT_TYPES`` are considered.
    If the ``ptnr1_symmetry``/``ptnr2_symmetry`` columns exist,
    rows referring to another symmetry operation than the identity
    (``1_555``) are ignored as well.
    """
    # Identity symmetry operation
    IDENTITY = "1_555"
    # Columns in 'atom_site' that should be matched by 'struct_conn'
    COLUMNS = [
        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
        "label_alt_id", "auth_asym_id", "auth_comp_id", "auth_seq_id",
        "pdbx_PDB_ins_code"
    ]

    # Boolean mask over the rows of 'struct_conn': True for covalent bonds
    covale_mask = np.isin(
        struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
    )
    # A masked symmetry value is interpreted as identity
    if "ptnr1_symmetry" in struct_conn:
        covale_mask &= (
            struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
        )
    if "ptnr2_symmetry" in struct_conn:
        covale_mask &= (
            struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY
        )

    # One index array for each of the two bond partners
    atom_indices = [None] * 2
    for i in range(2):
        reference_arrays = []
        query_arrays = []
        for col_name in COLUMNS:
            struct_conn_col_name = _get_struct_conn_col_name(col_name, i+1)
            # Only columns present in both categories can be compared
            if (
                col_name not in atom_site
                or struct_conn_col_name not in struct_conn
            ):
                continue
            # Ensure both arrays have the same dtype to allow comparison
            reference = atom_site[col_name].as_array()
            dtype = reference.dtype
            query = struct_conn[struct_conn_col_name].as_array(dtype)
            if np.issubdtype(reference.dtype, str):
                # The mask value is not necessarily consistent
                # between query and reference
                # -> make it consistent
                reference[reference == "?"] = "."
                query[query == "?"] = "."
            reference_arrays.append(reference)
            # The query is restricted to the accepted covalent rows
            query_arrays.append(query[covale_mask])
        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
        # in 'atom_site' and 'struct_conn'
        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
    atoms_indices_1 = atom_indices[0]
    atoms_indices_2 = atom_indices[1]

    # Some bonds in 'struct_conn' may not be found in 'atom_site'
    # This is okay,
    # as 'atom_site' might already be reduced to a single model
    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]

    # Interpret missing values as ANY bonds
    bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
    # Consecutively apply the same masks as applied to the atom indices
    # Logical combination does not work here,
    # as the second mask was created based on already filtered data
    bond_order = bond_order[covale_mask][mapping_exists_mask]
    bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]

    # Each row of the stacked array is (first atom, second atom, bond type)
    return BondList(
        atom_site.row_count,
        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1)
    )
582
+
583
+
584
+ def _find_matches(query_arrays, reference_arrays):
585
+ """
586
+ For each index in the `query_arrays` find the indices in the
587
+ `reference_arrays` where all query values the reference counterpart.
588
+ If no match is found for a query, the corresponding index is -1.
589
+ """
590
+ match_masks_for_all_columns = np.stack([
591
+ query[:, np.newaxis] == reference[np.newaxis, :]
592
+ for query, reference in zip(query_arrays, reference_arrays)
593
+ ], axis=-1)
594
+ match_masks = np.all(match_masks_for_all_columns, axis=-1)
595
+ query_matches, reference_matches = np.where(match_masks)
596
+
597
+ # Duplicate matches indicate that an atom from the query cannot
598
+ # be uniquely matched to an atom in the reference
599
+ unique_query_matches, counts = np.unique(query_matches, return_counts=True)
600
+ if np.any(counts > 1):
601
+ ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
602
+ raise InvalidFileError(
603
+ f"The covalent bond in the 'struct_conn' category at index "
604
+ f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
605
+ f"the 'atom_site' category"
606
+ )
607
+
608
+ # -1 indicates that no match was found in the reference
609
+ match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
610
+ match_indices[query_matches] = reference_matches
611
+ return match_indices
612
+
613
+
614
+ def _get_struct_conn_col_name(col_name, partner):
615
+ """
616
+ For a column name in ``atom_site`` get the corresponding column name
617
+ in ``struct_conn``.
618
+ """
619
+ if col_name == "label_alt_id":
620
+ return f"pdbx_ptnr{partner}_label_alt_id"
621
+ elif col_name.startswith("pdbx_"):
622
+ # Move 'pdbx_' to front
623
+ return f"pdbx_ptnr{partner}_{col_name[5:]}"
624
+ else:
625
+ return f"ptnr{partner}_{col_name}"
626
+
627
+
628
+ def _filter_altloc(array, atom_site, altloc):
629
+ altloc_ids = atom_site.get("label_alt_id")
630
+ occupancy = atom_site.get("occupancy")
631
+
632
+ # Filter altloc IDs and return
633
+ if altloc_ids is None:
634
+ return array
635
+ elif altloc == "occupancy" and occupancy is not None:
636
+ return array[
637
+ ...,
638
+ filter_highest_occupancy_altloc(
639
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
640
+ ),
641
+ ]
642
+ # 'first' is also fallback if file has no occupancy information
643
+ elif altloc == "first":
644
+ return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
645
+ elif altloc == "all":
646
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
647
+ return array
648
+ else:
649
+ raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
650
+
651
+
652
+ def _get_model_starts(model_array):
653
+ """
654
+ Get the start index for each model in the arrays of the
655
+ ``atom_site`` category.
656
+ """
657
+ _, indices = np.unique(model_array, return_index=True)
658
+ indices.sort()
659
+ return indices
660
+
661
+
662
def _filter_model(atom_site, model_starts, model):
    """
    Reduce the ``atom_site`` category to the values for the given
    model.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        The category containing all models.
    model_starts : ndarray
        The index of the first row of each model in ascending order.
    model : int
        The model number, where the first model has the number 1.

    Returns
    -------
    model_atom_site : CIFCategory or BinaryCIFCategory
        The category restricted to the rows of the selected model.
    """
    # Append exclusive stop
    model_boundaries = np.append(model_starts, [atom_site.row_count])
    # Indexing starts at 0, but model number starts at 1
    model_index = model - 1
    model_rows = slice(
        model_boundaries[model_index], model_boundaries[model_index + 1]
    )
    return _filter(atom_site, model_rows)
679
+
680
+
681
+ def _get_box(block):
682
+ cell = block.get("cell")
683
+ if cell is None:
684
+ return None
685
+ try:
686
+ len_a, len_b, len_c = [
687
+ float(cell[length].as_item())
688
+ for length in ["length_a", "length_b", "length_c"]
689
+ ]
690
+ alpha, beta, gamma = [
691
+ np.deg2rad(float(cell[angle].as_item()))
692
+ for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
693
+ ]
694
+ except ValueError:
695
+ # 'cell_dict' has no proper unit cell values, e.g. '?'
696
+ return None
697
+ return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
698
+
699
+
700
def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
    """
    Set the ``atom_site`` category with atom information from an
    :class:`AtomArray` or :class:`AtomArrayStack`.

    This will save the coordinates, the mandatory annotation categories
    and the optional annotation categories
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of continuous
    numbering.
    Furthermore, inter-residue bonds will be written into the
    ``struct_conn`` category.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray or AtomArrayStack
        The structure to be written. If a stack is given, each array in
        the stack will be in a separate model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
        If the file is empty, a new data block will be created.
    include_bonds : bool, optional
        If set to true and `array` has associated ``bonds`` , the
        intra-residue bonds will be written into the ``chem_comp_bond``
        category.
        Inter-residue bonds will be written into the ``struct_conn``
        independent of this parameter.

    Raises
    ------
    BadStructureError
        If `array` is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.

    Notes
    -----
    In some cases, the written inter-residue bonds cannot be read again
    due to ambiguity to which atoms the bond refers.
    This is the case, when two equal residues in the same chain have
    the same (or a masked) `res_id`.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile()
    >>> set_structure(file, atom_array)
    >>> file.write(os.path.join(path_to_directory, "structure.cif"))

    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()
    Column = Category.subcomponent_class()

    # Fill PDBx columns from information
    # in structures' attribute arrays as good as possible
    atom_site = Category()
    atom_site["group_PDB"] = np.where(
        array.hetero, "HETATM", "ATOM"
    )
    atom_site["type_symbol"] = np.copy(array.element)
    atom_site["label_atom_id"] = np.copy(array.atom_name)
    atom_site["label_alt_id"] = Column(
        # AtomArrays do not store altloc atoms
        np.full(array.array_length(), "."),
        np.full(array.array_length(), MaskValue.INAPPLICABLE),
    )
    atom_site["label_comp_id"] = np.copy(array.res_name)
    atom_site["label_asym_id"] = np.copy(array.chain_id)
    atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
    atom_site["label_seq_id"] = np.copy(array.res_id)
    atom_site["pdbx_PDB_ins_code"] = Column(
        np.copy(array.ins_code),
        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT)
    )
    # The author fields simply mirror the label fields
    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
    atom_site["auth_atom_id"] = atom_site["label_atom_id"]

    annot_categories = array.get_annotation_categories()
    if "atom_id" in annot_categories:
        atom_site["id"] = np.copy(array.atom_id)
    if "b_factor" in annot_categories:
        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
    if "occupancy" in annot_categories:
        atom_site["occupancy"] = np.copy(array.occupancy)
    if "charge" in annot_categories:
        # A charge of 0 is written as masked value
        atom_site["pdbx_formal_charge"] = Column(
            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT)
        )

    # The bonds are derived before the category is repeated for
    # multiple models, i.e. 'atom_site' still has one row per atom
    if array.bonds is not None:
        struct_conn = _set_inter_residue_bonds(array, atom_site)
        if struct_conn is not None:
            block["struct_conn"] = struct_conn
        if include_bonds:
            chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
            if chem_comp_bond is not None:
                block["chem_comp_bond"] = chem_comp_bond

    # In case of a single model handle each coordinate
    # simply like a flattened array
    # 'isinstance()' is used in accordance with '_check_non_empty()',
    # so subclasses are handled as well
    is_single_model = isinstance(array, AtomArray) or (
        isinstance(array, AtomArrayStack) and array.stack_depth() == 1
    )
    if is_single_model:
        # 'ravel' flattens coord without copy
        # in case of stack with stack_depth = 1
        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
        atom_site["pdbx_PDB_model_num"] = np.ones(
            array.array_length(), dtype=np.int32
        )
    # In case of multiple models repeat annotations
    # and use model specific coordinates
    else:
        atom_site = _repeat(atom_site, array.stack_depth())
        coord = np.reshape(
            array.coord, (array.stack_depth() * array.array_length(), 3)
        )
        atom_site["Cartn_x"] = np.copy(coord[:, 0])
        atom_site["Cartn_y"] = np.copy(coord[:, 1])
        atom_site["Cartn_z"] = np.copy(coord[:, 2])
        atom_site["pdbx_PDB_model_num"] = np.repeat(
            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
            repeats=array.array_length(),
        )
    if "atom_id" not in annot_categories:
        # Count from 1
        atom_site["id"] = np.arange(
            1, len(atom_site["group_PDB"]) + 1
        )
    block["atom_site"] = atom_site

    # Write box into file
    if array.box is not None:
        # PDBx files can only store one box for all models
        # -> Use first box
        if array.box.ndim == 3:
            box = array.box[0]
        else:
            box = array.box
        len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
        cell = Category()
        cell["length_a"] = len_a
        cell["length_b"] = len_b
        cell["length_c"] = len_c
        # The angles are written in degrees
        cell["angle_alpha"] = np.rad2deg(alpha)
        cell["angle_beta"] = np.rad2deg(beta)
        cell["angle_gamma"] = np.rad2deg(gamma)
        block["cell"] = cell
856
+
857
+
858
def _check_non_empty(array):
    """
    Ensure that the given structure contains at least one atom and,
    in case of a stack, at least one model.

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
870
+
871
+
872
def _get_or_create_block(pdbx_component, block_name):
    """
    Resolve the given file or block object to the block to write into.

    Parameters
    ----------
    pdbx_component : PDBxFile or CIFFile or BinaryCIFFile or block
        The object passed by the user.
        A deprecated ``PDBxFile`` is replaced by its ``cif_file``
        attribute first.
    block_name : str or None
        The name of the block to select from a file object.
        If ``None``, the first block name of the file is used, or
        ``"structure"`` if the file is empty.
        Ignored if `pdbx_component` is not a file object.

    Returns
    -------
    block
        The block with the selected name, which is newly created and
        added to the file, if it does not exist yet.
        If `pdbx_component` is not a file object, it is returned
        itself.
    """
    if isinstance(pdbx_component, PDBxFile):
        # The deprecated 'PDBxFile' is a thin wrapper around 'CIFFile'
        pdbx_component = pdbx_component.cif_file

    Block = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        has_blocks = len(pdbx_component) > 0
        # If the file is empty, invent a new block name
        block_name = (
            next(iter(pdbx_component.keys())) if has_blocks else "structure"
        )

    if block_name not in pdbx_component:
        pdbx_component[block_name] = Block()
    return pdbx_component[block_name]
894
+
895
+
896
+ def _determine_entity_id(chain_id):
897
+ entity_id = np.zeros(len(chain_id), dtype=int)
898
+ # Dictionary that translates chain_id to entity_id
899
+ id_translation = {}
900
+ id = 1
901
+ for i in range(len(chain_id)):
902
+ try:
903
+ entity_id[i] = id_translation[chain_id[i]]
904
+ except:
905
+ # chain_id is not in dictionary -> new entry
906
+ id_translation[chain_id[i]] = id
907
+ entity_id[i] = id_translation[chain_id[i]]
908
+ id += 1
909
+ return entity_id
910
+
911
+
912
def _repeat(category, repetitions):
    """
    Create a new category, in which the values (and masks) of each
    column of `category` are tiled `repetitions` times.

    Parameters
    ----------
    category : CIFCategory or BinaryCIFCategory
        The category to be repeated.
    repetitions : int
        The number of repetitions.

    Returns
    -------
    repeated : CIFCategory or BinaryCIFCategory
        The repeated category, which has the same type as `category`.
    """
    Category = type(category)
    Column = Category.subcomponent_class()
    Data = Column.subcomponent_class()

    repeated_columns = {}
    for key, column in category.items():
        if isinstance(column, BinaryCIFColumn):
            data_encoding = column.data.encoding
            # Optimization: The repeated string array has the same
            # unique values, as the original string array
            # -> Use same unique values (faster due to shorter array)
            if isinstance(data_encoding[0], StringArrayEncoding):
                data_encoding[0].strings = np.unique(column.data.array)
            repeated_values = np.tile(column.data.array, repetitions)
            data = Data(repeated_values, data_encoding)
        else:
            repeated_values = np.tile(column.data.array, repetitions)
            data = Data(repeated_values)

        # A column without mask stays a column without mask
        if column.mask is None:
            mask = None
        else:
            mask = Data(np.tile(column.mask.array, repetitions))
        repeated_columns[key] = Column(data, mask)
    return Category(repeated_columns)
933
+
934
+
935
def _set_intra_residue_bonds(array, atom_site):
    """
    Build the ``chem_comp_bond`` category from the intra-residue bonds
    of the given structure.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure containing the bonds.
    atom_site : CIFCategory or BinaryCIFCategory
        Only used to infer the right :class:`Category` type
        (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).

    Returns
    -------
    chem_comp_bond : CIFCategory or BinaryCIFCategory or None
        The created category.
        ``None``, if the structure has no intra-residue bonds.

    Raises
    ------
    BadStructureError
        If any atom has an empty residue name or an empty atom name.
    """
    if (array.res_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty residue name, "
            "but it is required to write intra-residue bonds"
        )
    if (array.atom_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty atom name, "
            "but it is required to write intra-residue bonds"
        )

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "intra")
    bond_count = len(bond_array)
    if bond_count == 0:
        return None

    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    bond_types = bond_array[:, 2]
    any_mask = bond_types == BondType.ANY

    value_order = np.zeros(bond_count, dtype="U4")
    aromatic_flag = np.zeros(bond_count, dtype="U1")
    # ANY bonds will be masked anyway, no need to set the value
    for i in np.flatnonzero(~any_mask):
        order, aromatic = COMP_BOND_TYPE_TO_ORDER[bond_types[i]]
        value_order[i] = order
        aromatic_flag[i] = aromatic

    chem_comp_bond = Category()
    # Take the residue name from the first atom index, as the residue
    # name is the same for both atoms, since we have only intra bonds
    chem_comp_bond["comp_id"] = array.res_name[first_atoms]
    chem_comp_bond["atom_id_1"] = array.atom_name[first_atoms]
    chem_comp_bond["atom_id_2"] = array.atom_name[second_atoms]
    # Each column gets its own mask array
    chem_comp_bond["value_order"] = Column(
        value_order,
        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
    )
    chem_comp_bond["pdbx_aromatic_flag"] = Column(
        aromatic_flag,
        np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
    )
    # BondList does not contain stereo information
    # -> all values are missing
    chem_comp_bond["pdbx_stereo_config"] = Column(
        np.zeros(bond_count, dtype="U1"),
        np.full(bond_count, MaskValue.MISSING)
    )
    chem_comp_bond["pdbx_ordinal"] = np.arange(
        1, bond_count + 1, dtype=np.int32
    )
    return chem_comp_bond
994
+
995
+
996
def _set_inter_residue_bonds(array, atom_site):
    """
    Build the ``struct_conn`` category from the inter-residue bonds of
    the given structure.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure containing the bonds.
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category corresponding to `array`.
        The annotations identifying the bonded atoms are taken from
        it.

    Returns
    -------
    struct_conn : CIFCategory or BinaryCIFCategory or None
        The created category.
        ``None``, if the structure has no inter-residue bonds.
    """
    COLUMNS = [
        "label_asym_id", "label_comp_id", "label_seq_id", "label_atom_id",
        "pdbx_PDB_ins_code"
    ]

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    bond_array = _filter_bonds(array, "inter")
    bond_count = len(bond_array)
    if bond_count == 0:
        return None
    bond_types = bond_array[:, 2]

    struct_conn = Category()
    struct_conn["id"] = np.arange(1, bond_count + 1)
    struct_conn["conn_type_id"] = np.full(bond_count, "covale")
    # The order of ANY bonds is marked as missing
    bond_orders = np.array(
        [PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_types]
    )
    order_mask = np.where(
        bond_types == BondType.ANY,
        MaskValue.MISSING, MaskValue.PRESENT,
    )
    struct_conn["pdbx_value_order"] = Column(bond_orders, order_mask)

    # Write the identifying annotation...
    for col_name in COLUMNS:
        annot = atom_site[col_name].as_array()
        # ...for each bond partner
        for partner in (1, 2):
            partner_indices = bond_array[:, partner - 1]
            partner_col_name = _get_struct_conn_col_name(col_name, partner)
            struct_conn[partner_col_name] = annot[partner_indices]
    return struct_conn
1035
+
1036
+
1037
def _filter_bonds(array, connection):
    """
    Get the bonds of the structure as array, restricted to either
    intra-residue or inter-residue bonds.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure containing the bonds.
    connection : {'intra', 'inter'}
        Which kind of bonds is kept.

    Returns
    -------
    bond_array : ndarray
        The selected rows of ``array.bonds.as_array()``.

    Raises
    ------
    ValueError
        If `connection` is none of the supported options.
    """
    bond_array = array.bonds.as_array()
    # To save computation time call 'get_residue_starts_for()' only once
    # with indices of the first and second atom of each bond
    bonded_atom_indices = bond_array[:, :2].flatten()
    residue_starts = get_residue_starts_for(
        array, bonded_atom_indices
    ).reshape(-1, 2)
    # Two atoms are in the same residue, if their residue starts equal
    in_same_residue = residue_starts[:, 0] == residue_starts[:, 1]

    if connection == "intra":
        return bond_array[in_same_residue]
    if connection == "inter":
        return bond_array[~in_same_residue]
    raise ValueError("Invalid 'connection' option")
1054
+
1055
+
1056
def get_component(pdbx_file, data_block=None, use_ideal_coord=True,
                  res_name=None):
    """
    Create an :class:`AtomArray` for a chemical component from the
    ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
    category in a :class:`PDBxFile`.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    use_ideal_coord : bool, optional
        If true, the *ideal* coordinates are read from the file
        (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
        originating from computations.
        If set to false, alternative coordinates are read
        (``model_Cartn_<dim>`` fields).
        If the selected fields are missing, a warning is raised and
        the respective other fields are used.
    res_name : str, optional
        In rare cases the categories may contain rows for multiple
        components.
        In this case, the component with the given residue name is
        read.
        By default, all rows would be read in this case.

    Returns
    -------
    array : AtomArray
        The parsed chemical component.

    Raises
    ------
    InvalidFileError
        If the ``chem_comp_atom`` category is missing.
    KeyError
        If `res_name` is given, but the ``chem_comp_atom`` category
        contains no row with this residue name.

    Notes
    -----
    The bond order is compared case-insensitively.
    Bonds with an unknown combination of order and aromaticity get
    ``BondType.ANY``.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(
    ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
    ... )
    >>> comp = get_component(file)
    >>> print(comp)
    HET         0  TYR N      N         1.320    0.952    1.428
    HET         0  TYR CA     C        -0.018    0.429    1.734
    HET         0  TYR C      C        -0.103    0.094    3.201
    HET         0  TYR O      O         0.886   -0.254    3.799
    HET         0  TYR CB     C        -0.274   -0.831    0.907
    HET         0  TYR CG     C        -0.189   -0.496   -0.559
    HET         0  TYR CD1    C         1.022   -0.589   -1.219
    HET         0  TYR CD2    C        -1.324   -0.102   -1.244
    HET         0  TYR CE1    C         1.103   -0.282   -2.563
    HET         0  TYR CE2    C        -1.247    0.210   -2.587
    HET         0  TYR CZ     C        -0.032    0.118   -3.252
    HET         0  TYR OH     O         0.044    0.420   -4.574
    HET         0  TYR OXT    O        -1.279    0.184    3.842
    HET         0  TYR H      H         1.977    0.225    1.669
    HET         0  TYR H2     H         1.365    1.063    0.426
    HET         0  TYR HA     H        -0.767    1.183    1.489
    HET         0  TYR HB2    H         0.473   -1.585    1.152
    HET         0  TYR HB3    H        -1.268   -1.219    1.134
    HET         0  TYR HD1    H         1.905   -0.902   -0.683
    HET         0  TYR HD2    H        -2.269   -0.031   -0.727
    HET         0  TYR HE1    H         2.049   -0.354   -3.078
    HET         0  TYR HE2    H        -2.132    0.523   -3.121
    HET         0  TYR HH     H        -0.123   -0.399   -5.059
    HET         0  TYR HXT    H        -1.333   -0.030    4.784
    """
    block = _get_block(pdbx_file, data_block)

    try:
        atom_category = block["chem_comp_atom"]
    except KeyError as err:
        raise InvalidFileError(
            "Missing 'chem_comp_atom' category in file"
        ) from err
    if res_name is not None:
        atom_category = _filter(
            atom_category, atom_category["comp_id"].as_array() == res_name
        )
        if atom_category.row_count == 0:
            raise KeyError(
                f"No rows with residue name '{res_name}' found in "
                f"'chem_comp_atom' category"
            )

    array = AtomArray(atom_category.row_count)

    array.hetero[:] = True
    array.res_name = atom_category["comp_id"].as_array("U5")
    array.atom_name = atom_category["atom_id"].as_array("U6")
    array.element = atom_category["type_symbol"].as_array("U2")
    array.add_annotation("charge", int)
    # A masked charge is interpreted as 0
    array.charge = atom_category["charge"].as_array(int, 0)

    coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
    alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
    if not use_ideal_coord:
        # Swap with the fallback option
        coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
    try:
        for i, field in enumerate(coord_fields):
            array.coord[:,i] = atom_category[field].as_array(np.float32)
    except KeyError as err:
        key = err.args[0]
        warnings.warn(
            f"Attribute '{key}' not found within 'chem_comp_atom' category. "
            f"The fallback coordinates will be used instead",
            UserWarning
        )
        for i, field in enumerate(alt_coord_fields):
            array.coord[:,i] = atom_category[field].as_array(np.float32)

    try:
        bond_category = block["chem_comp_bond"]
        if res_name is not None:
            bond_category = _filter(
                bond_category, bond_category["comp_id"].as_array() == res_name
            )
    except KeyError:
        warnings.warn(
            "Category 'chem_comp_bond' not found. "
            "No bonds will be parsed",
            UserWarning
        )
    else:
        bonds = BondList(array.array_length())
        for atom1, atom2, order, aromatic_flag in zip(
            bond_category["atom_id_1"].as_array(str),
            bond_category["atom_id_2"].as_array(str),
            bond_category["value_order"].as_array(str),
            bond_category["pdbx_aromatic_flag"].as_array(str)
        ):
            # Use the first atom having the respective name
            atom_i = np.where(array.atom_name == atom1)[0][0]
            atom_j = np.where(array.atom_name == atom2)[0][0]
            # Same interpretation as in '_parse_intra_residue_bonds()':
            # The order is case-insensitive and unknown combinations
            # become ANY bonds instead of raising a 'KeyError'
            bond_type = COMP_BOND_ORDER_TO_TYPE.get(
                (order.upper(), aromatic_flag), BondType.ANY
            )
            bonds.add_bond(atom_i, atom_j, bond_type)
        array.bonds = bonds

    return array
1195
+
1196
+
1197
def set_component(pdbx_file, array, data_block=None):
    """
    Write a chemical component into the ``chem_comp_atom`` category
    and, if bonds are available, into the ``chem_comp_bond`` category
    of the given file.

    The coordinates, the mandatory annotation categories and the
    optional ``charge`` category are taken from the given
    :class:`AtomArray`.
    An associated :class:`BondList` is written as well, if it contains
    at least one bond.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray
        The chemical component to be written.
        Must contain only a single residue.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the file is empty, a new data block will be created.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Raises
    ------
    BadStructureError
        If `array` comprises more than one residue.
    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()

    if get_residue_count(array) > 1:
        raise BadStructureError(
            "The input atom array must comprise only one residue"
        )

    atom_count = array.array_length()
    comp_name = array.res_name[0]

    # Unknown charges are represented by the '?' placeholder
    if "charge" in array.get_annotation_categories():
        charge_column = array.charge.astype("U2")
    else:
        charge_column = np.full(atom_count, "?", dtype="U2")

    # The order of the assignments defines the order of the columns
    atom_cat = Category()
    atom_cat["comp_id"] = np.full(atom_count, comp_name)
    atom_cat["atom_id"] = np.copy(array.atom_name)
    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
    atom_cat["type_symbol"] = np.copy(array.element)
    atom_cat["charge"] = charge_column
    atom_cat["model_Cartn_x"] = np.copy(array.coord[:, 0])
    atom_cat["model_Cartn_y"] = np.copy(array.coord[:, 1])
    atom_cat["model_Cartn_z"] = np.copy(array.coord[:, 2])
    # The ideal coordinates are simply duplicates of the model ones
    atom_cat["pdbx_model_Cartn_x_ideal"] = atom_cat["model_Cartn_x"]
    atom_cat["pdbx_model_Cartn_y_ideal"] = atom_cat["model_Cartn_y"]
    atom_cat["pdbx_model_Cartn_z_ideal"] = atom_cat["model_Cartn_z"]
    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
    atom_cat["pdbx_ordinal"] = np.arange(1, atom_count + 1).astype(str)
    block["chem_comp_atom"] = atom_cat

    # The bond category is only written, if there is at least one bond
    bond_list = array.bonds
    if bond_list is None or bond_list.get_bond_count() <= 0:
        return

    # Columns of the bond table: first atom index, second atom index
    # and bond type
    bond_table = bond_list.as_array()
    bond_count = len(bond_table)
    order_and_aromaticity = [
        COMP_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_table[:, 2]
    ]
    order_flags = [order for order, _ in order_and_aromaticity]
    aromatic_flags = [aromatic for _, aromatic in order_and_aromaticity]

    bond_cat = Category()
    bond_cat["comp_id"] = np.full(bond_count, comp_name)
    bond_cat["atom_id_1"] = array.atom_name[bond_table[:, 0]]
    bond_cat["atom_id_2"] = array.atom_name[bond_table[:, 1]]
    bond_cat["value_order"] = np.array(order_flags)
    bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
    bond_cat["pdbx_ordinal"] = np.arange(1, bond_count + 1).astype(str)
    block["chem_comp_bond"] = bond_cat
1277
+
1278
def list_assemblies(pdbx_file, data_block=None):
    """
    List the biological assemblies that are available for the structure
    in the given file.

    The information is taken from the ``pdbx_struct_assembly``
    category, hence this category must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    assemblies : dict of str -> str
        A dictionary that maps an assembly ID to a description of the
        corresponding assembly.

    Raises
    ------
    InvalidFileError
        If the file has no ``pdbx_struct_assembly`` category.

    Examples
    --------
    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly_ids = list_assemblies(file)
    >>> for key, val in assembly_ids.items():
    ...     print(f"'{key}' : '{val}'")
    '1' : 'complete icosahedral assembly'
    '2' : 'icosahedral asymmetric unit'
    '3' : 'icosahedral pentamer'
    '4' : 'icosahedral 23 hexamer'
    '5' : 'icosahedral asymmetric unit, std point frame'
    '6' : 'crystal asymmetric unit, crystal frame'
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_category = block["pdbx_struct_assembly"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly' category")

    # Each row of the category describes one assembly
    assembly_ids = assembly_category["id"].as_array(str)
    descriptions = assembly_category["details"].as_array(str)
    return dict(zip(assembly_ids, descriptions))
1331
+
1332
+
1333
def get_assembly(pdbx_file, assembly_id=None, model=None, data_block=None,
                 altloc="first", extra_fields=None, use_author_fields=True,
                 include_bonds=False):
    """
    Build the given biological assembly.

    This function receives the data from the
    ``pdbx_struct_assembly_gen``, ``pdbx_struct_oper_list`` and
    ``atom_site`` categories in the file.
    Consequently, these categories must be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        By default, the first assembly listed in the
        ``pdbx_struct_assembly_gen`` category is built.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While the ``label_xxx`` fields can be used as official pointers
        to other categories in the :class:`PDBxFile`, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly. The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the file has no ``pdbx_struct_assembly_gen`` or
        ``pdbx_struct_oper_list`` category.
    KeyError
        If the given `assembly_id` is not listed in the file.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly = get_assembly(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_gen_category = block["pdbx_struct_assembly_gen"]
    except KeyError:
        raise InvalidFileError(
            "File has no 'pdbx_struct_assembly_gen' category"
        )

    try:
        struct_oper_category = block["pdbx_struct_oper_list"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

    available_ids = assembly_gen_category["assembly_id"].as_array(str)
    if assembly_id is None:
        # Fall back to the first assembly listed in the file
        assembly_id = available_ids[0]
    elif assembly_id not in available_ids:
        raise KeyError(f"File has no Assembly ID '{assembly_id}'")

    ### Calculate all possible transformations
    transformations = _get_transformations(struct_oper_category)

    ### Get structure according to additional parameters
    # The operations apply on asym IDs
    # -> 'label_asym_id' must be available as annotation array
    #    to select the correct atoms
    extra_fields = [] if extra_fields is None else extra_fields
    asym_id_requested = "label_asym_id" in extra_fields
    if asym_id_requested:
        fields_with_asym = extra_fields
    else:
        fields_with_asym = extra_fields + ["label_asym_id"]
    structure = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        fields_with_asym,
        use_author_fields,
        include_bonds
    )

    ### Get transformations and apply them to the affected asym IDs
    assembly = None
    for row_assembly_id, op_expr, asym_id_expr in zip(
        assembly_gen_category["assembly_id"].as_array(str),
        assembly_gen_category["oper_expression"].as_array(str),
        assembly_gen_category["asym_id_list"].as_array(str),
    ):
        # Only rows belonging to the requested assembly are relevant
        # It was already checked that at least one such row exists
        if row_assembly_id != assembly_id:
            continue

        operations = _parse_operation_expression(op_expr)
        affected_asym_ids = asym_id_expr.split(",")
        # Restrict the structure to the chains affected by this row
        asym_mask = np.isin(structure.label_asym_id, affected_asym_ids)
        sub_assembly = _apply_transformations(
            structure[..., asym_mask], transformations, operations
        )
        # Merge the transformed chains of this row
        # with the chains from the previous rows
        if assembly is None:
            assembly = sub_assembly
        else:
            assembly += sub_assembly

    # 'label_asym_id' is only kept, if the user explicitly asked for it
    if not asym_id_requested:
        assembly.del_annotation("label_asym_id")

    return assembly
1495
+
1496
+
1497
def _apply_transformations(structure, transformation_dict, operations):
    """
    Create a subassembly by applying each of the given operations to
    the input structure, which contains the affected asym IDs.

    Each operation is a sequence of operation IDs, that are looked up
    in `transformation_dict` and applied successively.
    The result contains one copy of the structure per operation.
    """
    # One set of coordinates for each copy:
    # the additional first dimension is required by 'repeat()'
    copy_coord = np.zeros((len(operations), *structure.coord.shape))

    for copy_index, step_ids in enumerate(operations):
        transformed = structure.coord
        for step_id in step_ids:
            rotation, translation = transformation_dict[step_id]
            # Rotate first...
            transformed = matrix_rotate(transformed, rotation)
            # ...then translate in-place
            # NOTE(review): this presumably relies on 'matrix_rotate()'
            # returning a new array, otherwise the coordinates of the
            # input structure would be altered - confirm
            transformed += translation
        copy_coord[copy_index] = transformed

    return repeat(structure, copy_coord)
1519
+
1520
+
1521
def _get_transformations(struct_oper):
    """
    Get transformation operation in terms of rotation matrix and
    translation for each operation ID in ``pdbx_struct_oper_list``.

    Parameters
    ----------
    struct_oper : dict-like
        The ``pdbx_struct_oper_list`` category.
        The columns ``id``, ``matrix[i][j]`` and ``vector[i]``
        (``i`` and ``j`` ranging from 1 to 3) are read from it.

    Returns
    -------
    transformation_dict : dict
        Maps each operation ID to a tuple of the ``(3, 3)`` rotation
        matrix and the ``(3,)`` translation vector of the operation.
    """
    axes = (1, 2, 3)
    # Convert each column only once up front:
    # 'as_array()' yields the values for all operations, so calling it
    # again for every single operation would scale quadratically with
    # the number of operations
    op_ids = struct_oper["id"].as_array(str)
    matrix_columns = {
        (i, j): struct_oper[f"matrix[{i}][{j}]"].as_array(float)
        for i in axes
        for j in axes
    }
    vector_columns = {
        i: struct_oper[f"vector[{i}]"].as_array(float) for i in axes
    }

    transformation_dict = {}
    for index, op_id in enumerate(op_ids):
        rotation_matrix = np.array(
            [[matrix_columns[i, j][index] for j in axes] for i in axes]
        )
        translation_vector = np.array(
            [vector_columns[i][index] for i in axes]
        )
        transformation_dict[op_id] = (rotation_matrix, translation_vector)
    return transformation_dict
1543
+
1544
+
1545
def _parse_operation_expression(expression):
    """
    Resolve the given ``oper_expression`` into the operations it
    describes.

    Each returned operation is a tuple of operation IDs (strings),
    given in the order the steps are applied.
    If the expression consists of multiple parenthesized groups,
    the cartesian product of the groups is formed.
    """
    # Each parenthesized group is one transformation step:
    # drop the closing parentheses and split at the opening ones
    # example: '(X0)(1-10,21-25)' from 1a34
    step_expressions = [
        group for group in expression.replace(")", "").split("(") if group
    ]

    ids_per_step = []
    # Important: Operations are applied from right to left
    for step_expression in reversed(step_expressions):
        step_ids = []
        for item in step_expression.split(","):
            if "-" not in item:
                # Single operation ID
                step_ids.append(item)
                continue
            # Inclusive range of operation IDs, they must be integers
            start, stop = item.split("-")
            step_ids.extend(
                str(number) for number in range(int(start), int(stop) + 1)
            )
        ids_per_step.append(step_ids)

    # Cartesian product of operations
    return list(itertools.product(*ids_per_step))
1577
+
1578
+
1579
def _convert_string_to_sequence(string, stype):
    """
    Convert the given string into a sequence object based on the
    polymer type `stype`.

    A :class:`ProteinSequence` is returned, if `stype` is contained in
    ``_proteinseq_type_list`` and a :class:`NucleotideSequence`, if
    `stype` is contained in ``_nucleotideseq_type_list``.
    For types in ``_other_type_list`` ``None`` is returned.
    Any other type raises an :class:`InvalidFileError`.
    """
    # sequence may be stored as multiline string
    flat_string = string.replace("\n", "")

    if stype in _proteinseq_type_list:
        return ProteinSequence(flat_string)
    if stype in _nucleotideseq_type_list:
        # Each 'U' is replaced by 'T' before creating the sequence
        return NucleotideSequence(flat_string.replace("U", "T"))
    if stype in _other_type_list:
        return None
    raise InvalidFileError(
        "mmCIF _entity_poly.type unsupported type: " + stype
    )