biotite 1.1.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (332) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +452 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +57 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +206 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +60 -0
  35. biotite/database/entrez/dbnames.py +91 -0
  36. biotite/database/entrez/download.py +229 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +262 -0
  39. biotite/database/error.py +16 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +258 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +830 -0
  44. biotite/database/pubchem/throttle.py +98 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +159 -0
  47. biotite/database/rcsb/query.py +964 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +40 -0
  50. biotite/database/uniprot/download.py +129 -0
  51. biotite/database/uniprot/query.py +293 -0
  52. biotite/file.py +232 -0
  53. biotite/sequence/__init__.py +84 -0
  54. biotite/sequence/align/__init__.py +203 -0
  55. biotite/sequence/align/alignment.py +680 -0
  56. biotite/sequence/align/banded.cp313-win_amd64.pyd +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +71 -0
  59. biotite/sequence/align/cigar.py +425 -0
  60. biotite/sequence/align/kmeralphabet.cp313-win_amd64.pyd +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +595 -0
  62. biotite/sequence/align/kmersimilarity.cp313-win_amd64.pyd +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cp313-win_amd64.pyd +0 -0
  65. biotite/sequence/align/kmertable.pyx +3411 -0
  66. biotite/sequence/align/localgapped.cp313-win_amd64.pyd +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cp313-win_amd64.pyd +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +622 -0
  71. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  72. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  81. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  87. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  99. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  100. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  101. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  102. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  103. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  104. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  105. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  155. biotite/sequence/align/matrix_data/PB.license +21 -0
  156. biotite/sequence/align/matrix_data/PB.mat +18 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  160. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  161. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  162. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  163. biotite/sequence/align/multiple.cp313-win_amd64.pyd +0 -0
  164. biotite/sequence/align/multiple.pyx +620 -0
  165. biotite/sequence/align/pairwise.cp313-win_amd64.pyd +0 -0
  166. biotite/sequence/align/pairwise.pyx +587 -0
  167. biotite/sequence/align/permutation.cp313-win_amd64.pyd +0 -0
  168. biotite/sequence/align/permutation.pyx +313 -0
  169. biotite/sequence/align/primes.txt +821 -0
  170. biotite/sequence/align/selector.cp313-win_amd64.pyd +0 -0
  171. biotite/sequence/align/selector.pyx +954 -0
  172. biotite/sequence/align/statistics.py +264 -0
  173. biotite/sequence/align/tracetable.cp313-win_amd64.pyd +0 -0
  174. biotite/sequence/align/tracetable.pxd +64 -0
  175. biotite/sequence/align/tracetable.pyx +370 -0
  176. biotite/sequence/alphabet.py +555 -0
  177. biotite/sequence/annotation.py +830 -0
  178. biotite/sequence/codec.cp313-win_amd64.pyd +0 -0
  179. biotite/sequence/codec.pyx +155 -0
  180. biotite/sequence/codon.py +477 -0
  181. biotite/sequence/codon_tables.txt +202 -0
  182. biotite/sequence/graphics/__init__.py +33 -0
  183. biotite/sequence/graphics/alignment.py +1115 -0
  184. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  185. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  186. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  187. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  188. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  189. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  190. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  192. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  193. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  194. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  195. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  196. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  197. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  198. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  199. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  200. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  201. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  202. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  203. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  204. biotite/sequence/graphics/colorschemes.py +170 -0
  205. biotite/sequence/graphics/dendrogram.py +229 -0
  206. biotite/sequence/graphics/features.py +544 -0
  207. biotite/sequence/graphics/logo.py +104 -0
  208. biotite/sequence/graphics/plasmid.py +712 -0
  209. biotite/sequence/io/__init__.py +12 -0
  210. biotite/sequence/io/fasta/__init__.py +22 -0
  211. biotite/sequence/io/fasta/convert.py +284 -0
  212. biotite/sequence/io/fasta/file.py +265 -0
  213. biotite/sequence/io/fastq/__init__.py +19 -0
  214. biotite/sequence/io/fastq/convert.py +117 -0
  215. biotite/sequence/io/fastq/file.py +507 -0
  216. biotite/sequence/io/genbank/__init__.py +17 -0
  217. biotite/sequence/io/genbank/annotation.py +269 -0
  218. biotite/sequence/io/genbank/file.py +573 -0
  219. biotite/sequence/io/genbank/metadata.py +336 -0
  220. biotite/sequence/io/genbank/sequence.py +171 -0
  221. biotite/sequence/io/general.py +201 -0
  222. biotite/sequence/io/gff/__init__.py +26 -0
  223. biotite/sequence/io/gff/convert.py +128 -0
  224. biotite/sequence/io/gff/file.py +450 -0
  225. biotite/sequence/phylo/__init__.py +36 -0
  226. biotite/sequence/phylo/nj.cp313-win_amd64.pyd +0 -0
  227. biotite/sequence/phylo/nj.pyx +221 -0
  228. biotite/sequence/phylo/tree.cp313-win_amd64.pyd +0 -0
  229. biotite/sequence/phylo/tree.pyx +1169 -0
  230. biotite/sequence/phylo/upgma.cp313-win_amd64.pyd +0 -0
  231. biotite/sequence/phylo/upgma.pyx +164 -0
  232. biotite/sequence/profile.py +567 -0
  233. biotite/sequence/search.py +118 -0
  234. biotite/sequence/seqtypes.py +713 -0
  235. biotite/sequence/sequence.py +374 -0
  236. biotite/setup_ccd.py +197 -0
  237. biotite/structure/__init__.py +133 -0
  238. biotite/structure/alphabet/__init__.py +25 -0
  239. biotite/structure/alphabet/encoder.py +332 -0
  240. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  241. biotite/structure/alphabet/i3d.py +110 -0
  242. biotite/structure/alphabet/layers.py +86 -0
  243. biotite/structure/alphabet/pb.license +21 -0
  244. biotite/structure/alphabet/pb.py +171 -0
  245. biotite/structure/alphabet/unkerasify.py +122 -0
  246. biotite/structure/atoms.py +1554 -0
  247. biotite/structure/basepairs.py +1404 -0
  248. biotite/structure/bonds.cp313-win_amd64.pyd +0 -0
  249. biotite/structure/bonds.pyx +1972 -0
  250. biotite/structure/box.py +588 -0
  251. biotite/structure/celllist.cp313-win_amd64.pyd +0 -0
  252. biotite/structure/celllist.pyx +849 -0
  253. biotite/structure/chains.py +314 -0
  254. biotite/structure/charges.cp313-win_amd64.pyd +0 -0
  255. biotite/structure/charges.pyx +520 -0
  256. biotite/structure/compare.py +274 -0
  257. biotite/structure/density.py +109 -0
  258. biotite/structure/dotbracket.py +214 -0
  259. biotite/structure/error.py +39 -0
  260. biotite/structure/filter.py +590 -0
  261. biotite/structure/geometry.py +655 -0
  262. biotite/structure/graphics/__init__.py +13 -0
  263. biotite/structure/graphics/atoms.py +243 -0
  264. biotite/structure/graphics/rna.py +295 -0
  265. biotite/structure/hbond.py +428 -0
  266. biotite/structure/info/__init__.py +24 -0
  267. biotite/structure/info/atom_masses.json +121 -0
  268. biotite/structure/info/atoms.py +81 -0
  269. biotite/structure/info/bonds.py +149 -0
  270. biotite/structure/info/ccd.py +202 -0
  271. biotite/structure/info/components.bcif +0 -0
  272. biotite/structure/info/groups.py +131 -0
  273. biotite/structure/info/masses.py +121 -0
  274. biotite/structure/info/misc.py +138 -0
  275. biotite/structure/info/radii.py +197 -0
  276. biotite/structure/info/standardize.py +186 -0
  277. biotite/structure/integrity.py +215 -0
  278. biotite/structure/io/__init__.py +29 -0
  279. biotite/structure/io/dcd/__init__.py +13 -0
  280. biotite/structure/io/dcd/file.py +67 -0
  281. biotite/structure/io/general.py +243 -0
  282. biotite/structure/io/gro/__init__.py +14 -0
  283. biotite/structure/io/gro/file.py +344 -0
  284. biotite/structure/io/mol/__init__.py +20 -0
  285. biotite/structure/io/mol/convert.py +112 -0
  286. biotite/structure/io/mol/ctab.py +415 -0
  287. biotite/structure/io/mol/header.py +120 -0
  288. biotite/structure/io/mol/mol.py +149 -0
  289. biotite/structure/io/mol/sdf.py +914 -0
  290. biotite/structure/io/netcdf/__init__.py +13 -0
  291. biotite/structure/io/netcdf/file.py +64 -0
  292. biotite/structure/io/pdb/__init__.py +20 -0
  293. biotite/structure/io/pdb/convert.py +307 -0
  294. biotite/structure/io/pdb/file.py +1290 -0
  295. biotite/structure/io/pdb/hybrid36.cp313-win_amd64.pyd +0 -0
  296. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  297. biotite/structure/io/pdbqt/__init__.py +15 -0
  298. biotite/structure/io/pdbqt/convert.py +113 -0
  299. biotite/structure/io/pdbqt/file.py +688 -0
  300. biotite/structure/io/pdbx/__init__.py +23 -0
  301. biotite/structure/io/pdbx/bcif.py +656 -0
  302. biotite/structure/io/pdbx/cif.py +1075 -0
  303. biotite/structure/io/pdbx/component.py +245 -0
  304. biotite/structure/io/pdbx/compress.py +321 -0
  305. biotite/structure/io/pdbx/convert.py +1745 -0
  306. biotite/structure/io/pdbx/encoding.cp313-win_amd64.pyd +0 -0
  307. biotite/structure/io/pdbx/encoding.pyx +1031 -0
  308. biotite/structure/io/trajfile.py +693 -0
  309. biotite/structure/io/trr/__init__.py +13 -0
  310. biotite/structure/io/trr/file.py +43 -0
  311. biotite/structure/io/xtc/__init__.py +13 -0
  312. biotite/structure/io/xtc/file.py +43 -0
  313. biotite/structure/mechanics.py +73 -0
  314. biotite/structure/molecules.py +352 -0
  315. biotite/structure/pseudoknots.py +628 -0
  316. biotite/structure/rdf.py +245 -0
  317. biotite/structure/repair.py +304 -0
  318. biotite/structure/residues.py +572 -0
  319. biotite/structure/sasa.cp313-win_amd64.pyd +0 -0
  320. biotite/structure/sasa.pyx +322 -0
  321. biotite/structure/segments.py +178 -0
  322. biotite/structure/sequence.py +111 -0
  323. biotite/structure/sse.py +308 -0
  324. biotite/structure/superimpose.py +689 -0
  325. biotite/structure/transform.py +530 -0
  326. biotite/structure/util.py +168 -0
  327. biotite/version.py +16 -0
  328. biotite/visualize.py +265 -0
  329. biotite-1.1.0.dist-info/METADATA +190 -0
  330. biotite-1.1.0.dist-info/RECORD +332 -0
  331. biotite-1.1.0.dist-info/WHEEL +4 -0
  332. biotite-1.1.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1745 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdbx"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann"
7
+ __all__ = [
8
+ "get_sequence",
9
+ "get_model_count",
10
+ "get_structure",
11
+ "set_structure",
12
+ "get_component",
13
+ "set_component",
14
+ "list_assemblies",
15
+ "get_assembly",
16
+ ]
17
+
18
+ import itertools
19
+ import warnings
20
+ import numpy as np
21
+ from biotite.file import InvalidFileError
22
+ from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
23
+ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
24
+ from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
25
+ from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
26
+ from biotite.structure.error import BadStructureError
27
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
28
+ from biotite.structure.filter import (
29
+ _canonical_nucleotide_list as canonical_nucleotide_list,
30
+ )
31
+ from biotite.structure.filter import (
32
+ filter_first_altloc,
33
+ filter_highest_occupancy_altloc,
34
+ )
35
+ from biotite.structure.io.pdbx.bcif import (
36
+ BinaryCIFBlock,
37
+ BinaryCIFColumn,
38
+ BinaryCIFFile,
39
+ )
40
+ from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
41
+ from biotite.structure.io.pdbx.component import MaskValue
42
+ from biotite.structure.io.pdbx.encoding import StringArrayEncoding
43
+ from biotite.structure.residues import (
44
+ get_residue_count,
45
+ get_residue_positions,
46
+ get_residue_starts_for,
47
+ )
48
+ from biotite.structure.util import matrix_rotate
49
+
50
# Bond types in `struct_conn` category that refer to covalent bonds
PDBX_BOND_TYPE_ID_TO_TYPE = {
    # Although a covalent bond could in theory have a higher bond order,
    # practically inter-residue bonds are always single
    **dict.fromkeys(
        (
            "covale",
            "covale_base",
            "covale_phosphate",
            "covale_sugar",
            "disulf",
            "modres",
            "modres_link",
        ),
        BondType.SINGLE,
    ),
    "metalc": BondType.COORDINATION,
}
# Reverse direction: which `struct_conn` type ID is used for a `BondType`
PDBX_BOND_TYPE_TO_TYPE_ID = {
    **dict.fromkeys(
        (
            BondType.ANY,
            BondType.SINGLE,
            BondType.DOUBLE,
            BondType.TRIPLE,
            BondType.QUADRUPLE,
            BondType.AROMATIC_SINGLE,
            BondType.AROMATIC_DOUBLE,
            BondType.AROMATIC_TRIPLE,
        ),
        "covale",
    ),
    BondType.COORDINATION: "metalc",
}
# Bond order strings for each `BondType`
PDBX_BOND_TYPE_TO_ORDER = {
    BondType.SINGLE: "sing",
    BondType.DOUBLE: "doub",
    BondType.TRIPLE: "trip",
    BondType.QUADRUPLE: "quad",
    BondType.AROMATIC_SINGLE: "sing",
    BondType.AROMATIC_DOUBLE: "doub",
    BondType.AROMATIC_TRIPLE: "trip",
    # The following two are masked later,
    # they are merely listed here to avoid a KeyError
    BondType.ANY: "",
    BondType.COORDINATION: "",
}
# Map 'chem_comp_bond' bond orders and aromaticity flags to 'BondType'...
COMP_BOND_ORDER_TO_TYPE = {
    ("SING", "N"): BondType.SINGLE,
    ("DOUB", "N"): BondType.DOUBLE,
    ("TRIP", "N"): BondType.TRIPLE,
    ("QUAD", "N"): BondType.QUADRUPLE,
    ("SING", "Y"): BondType.AROMATIC_SINGLE,
    ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
    ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
}
# ...and vice versa
COMP_BOND_TYPE_TO_ORDER = {
    bond_type: order_and_flag
    for order_and_flag, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
}
# Concatenation of the canonical amino acid and nucleotide residue lists
CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list

# Polymer type names, grouped by the kind of polymer they name
_proteinseq_type_list = [
    "polypeptide(D)",
    "polypeptide(L)",
]
_nucleotideseq_type_list = [
    "polydeoxyribonucleotide",
    "polyribonucleotide",
    "polydeoxyribonucleotide/polyribonucleotide hybrid",
]
_other_type_list = [
    "cyclic-pseudo-peptide",
    "other",
    "peptide nucleic acid",
    "polysaccharide(D)",
    "polysaccharide(L)",
]
115
+
116
+
117
+ def _filter(category, index):
118
+ """
119
+ Reduce the ``atom_site`` category to the values for the given
120
+ model.
121
+ """
122
+ Category = type(category)
123
+ Column = Category.subcomponent_class()
124
+ Data = Column.subcomponent_class()
125
+
126
+ return Category(
127
+ {
128
+ key: Column(
129
+ Data(column.data.array[index]),
130
+ (Data(column.mask.array[index]) if column.mask is not None else None),
131
+ )
132
+ for key, column in category.items()
133
+ }
134
+ )
135
+
136
+
137
def get_sequence(pdbx_file, data_block=None):
    """
    Get the protein and nucleotide sequences from the
    ``entity_poly.pdbx_seq_one_letter_code_can`` entry.

    Supported polymer types (``_entity_poly.type``) are:
    ``'polypeptide(D)'``, ``'polypeptide(L)'``,
    ``'polydeoxyribonucleotide'``, ``'polyribonucleotide'`` and
    ``'polydeoxyribonucleotide/polyribonucleotide hybrid'``.
    Uracil is converted to Thymine.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    sequence_dict : Dictionary of Sequences
        Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
        (in most cases equivalent to chain_id and
        ``atom_site.auth_asym_id``).
        Dictionary values are sequences.
        Entities, for which the conversion gives ``None`` instead of a
        sequence, are omitted.

    Notes
    -----
    The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial
    complete sequence. If the structure represents a truncated or spliced
    version of this initial sequence, it will include only a subset of the
    initial sequence. Use biotite.structure.get_residues to retrieve only
    the residues that are represented in the structure.
    """
    block = _get_block(pdbx_file, data_block)
    entity_poly = block["entity_poly"]

    one_letter_codes = entity_poly["pdbx_seq_one_letter_code_can"].as_array(str)
    polymer_types = entity_poly["type"].as_array(str)
    sequences = [
        _convert_string_to_sequence(code, polymer_type)
        for code, polymer_type in zip(one_letter_codes, polymer_types)
    ]

    # Each entry is a comma-separated list of the strands
    # that share the sequence of the respective entity
    strand_id_entries = entity_poly["pdbx_strand_id"].as_array(str)

    sequence_dict = {}
    for sequence, strand_id_entry in zip(sequences, strand_id_entries):
        if sequence is None:
            continue
        for strand_id in strand_id_entry.split(","):
            sequence_dict[strand_id] = sequence
    return sequence_dict
197
+
198
+
199
def get_model_count(pdbx_file, data_block=None):
    """
    Get the number of models contained in a file.

    The count is the number of model starts found in the
    ``atom_site.pdbx_PDB_model_num`` column.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    model_count : int
        The number of models.
    """
    block = _get_block(pdbx_file, data_block)
    model_numbers = block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32)
    model_starts = _get_model_starts(model_numbers)
    return len(model_starts)
223
+
224
+
225
def get_structure(
    pdbx_file,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Create an :class:`AtomArray` or :class:`AtomArrayStack` from the
    ``atom_site`` category in a file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
        If the requested field is not available, the respective other
        field is taken as fallback.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Inter-residue bonds, will be read from the ``struct_conn``
        category.
        Intra-residue bonds will be read from the ``chem_comp_bond``, if
        available, otherwise they will be derived from the Chemical
        Component Dictionary.

    Returns
    -------
    array : AtomArray or AtomArrayStack
        The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the ``atom_site`` category is missing, or if all models are
        requested (``model=None``) but the models do not contain the
        same number of atoms.
    ValueError
        If `model` is 0 or refers to a model that does not exist in the
        file, regardless of whether it is given as positive or negative
        index.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1l2y.cif"))
    >>> arr = get_structure(file, model=1)
    >>> print(len(arr))
    304

    """
    block = _get_block(pdbx_file, data_block)

    extra_fields = set() if extra_fields is None else set(extra_fields)

    atom_site = block.get("atom_site")
    if atom_site is None:
        raise InvalidFileError("Missing 'atom_site' category in file")

    models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
    model_starts = _get_model_starts(models)
    model_count = len(model_starts)
    atom_count = len(models)

    if model is None:
        # For a stack, the annotations are derived from the first model
        model_atom_site = _filter_model(atom_site, model_starts, 1)
        # Any field of the category would work here to get the length
        model_length = model_atom_site.row_count
        atoms = AtomArrayStack(model_count, model_length)

        # Check if each model has the same amount of atoms
        # If not, raise exception
        if model_length * model_count != atom_count:
            raise InvalidFileError(
                "The models in the file have unequal "
                "amount of atoms, give an explicit model "
                "instead"
            )

        atoms.coord[:, :, 0] = (
            atom_site["Cartn_x"]
            .as_array(np.float32)
            .reshape((model_count, model_length))
        )
        atoms.coord[:, :, 1] = (
            atom_site["Cartn_y"]
            .as_array(np.float32)
            .reshape((model_count, model_length))
        )
        atoms.coord[:, :, 2] = (
            atom_site["Cartn_z"]
            .as_array(np.float32)
            .reshape((model_count, model_length))
        )

        box = _get_box(block)
        if box is not None:
            # Duplicate same box for each model
            atoms.box = np.repeat(box[np.newaxis, ...], model_count, axis=0)

    else:
        if model == 0:
            raise ValueError("The model index must not be 0")
        # Keep the value given by the caller for the error message
        requested_model = model
        # Negative models mean model indexing starting from last model
        if model < 0:
            model = model_count + model + 1
        # The lower bound must be checked as well:
        # A negative index with a magnitude above the model count resolves
        # to a model number below 1, which must not be passed on
        if model < 1 or model > model_count:
            raise ValueError(
                f"The file has {model_count} models, "
                f"the given model {requested_model} does not exist"
            )

        model_atom_site = _filter_model(atom_site, model_starts, model)
        # Any field of the category would work here to get the length
        model_length = model_atom_site.row_count
        atoms = AtomArray(model_length)

        atoms.coord[:, 0] = model_atom_site["Cartn_x"].as_array(np.float32)
        atoms.coord[:, 1] = model_atom_site["Cartn_y"].as_array(np.float32)
        atoms.coord[:, 2] = model_atom_site["Cartn_z"].as_array(np.float32)

        atoms.box = _get_box(block)

    # The below part is the same for both, AtomArray and AtomArrayStack
    _fill_annotations(atoms, model_atom_site, extra_fields, use_author_fields)
    if include_bonds:
        if "chem_comp_bond" in block:
            try:
                custom_bond_dict = _parse_intra_residue_bonds(block["chem_comp_bond"])
            except KeyError:
                warnings.warn(
                    "The 'chem_comp_bond' category has missing columns, "
                    "falling back to using Chemical Component Dictionary",
                    UserWarning,
                )
                custom_bond_dict = None
            bonds = connect_via_residue_names(atoms, custom_bond_dict=custom_bond_dict)
        else:
            bonds = connect_via_residue_names(atoms)
        if "struct_conn" in block:
            bonds = bonds.merge(
                _parse_inter_residue_bonds(model_atom_site, block["struct_conn"])
            )
        atoms.bonds = bonds
    # The altloc filter is applied last, after annotations and bonds are set
    atoms = _filter_altloc(atoms, model_atom_site, altloc)

    return atoms
411
+
412
+
413
def _get_block(pdbx_component, block_name):
    """
    Resolve the data block to operate on.

    A :class:`CIFBlock` or :class:`BinaryCIFBlock` is passed through
    unchanged.
    Any other component is treated as a file: if `block_name` is
    ``None`` its ``block`` attribute is returned, otherwise the block
    is looked up by name.
    """
    if isinstance(pdbx_component, (CIFBlock, BinaryCIFBlock)):
        # Already a block -> 'block_name' is ignored
        return pdbx_component
    if block_name is None:
        return pdbx_component.block
    return pdbx_component[block_name]
422
+
423
+
424
+ def _get_or_fallback(category, key, fallback_key):
425
+ """
426
+ Return column related to key in category if it exists,
427
+ otherwise try to get the column related to fallback key.
428
+ """
429
+ if key not in category:
430
+ warnings.warn(
431
+ f"Attribute '{key}' not found within 'atom_site' category. "
432
+ f"The fallback attribute '{fallback_key}' will be used instead",
433
+ UserWarning,
434
+ )
435
+ try:
436
+ return category[fallback_key]
437
+ except KeyError as key_exc:
438
+ raise InvalidFileError(
439
+ f"Fallback attribute '{fallback_key}' not found within "
440
+ "'atom_site' category"
441
+ ) from key_exc
442
+ return category[key]
443
+
444
+
445
def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
    """Fill atom_site annotations in atom array or atom array stack.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        Atom array or stack which will be annotated.
    atom_site : CIFCategory or BinaryCIFCategory
        ``atom_site`` category with values for one model.
    extra_fields : list of str
        Entry names, that are additionally added as annotation arrays.
        The names ``atom_id``, ``b_factor``, ``occupancy`` and
        ``charge`` are mapped to their dedicated ``atom_site`` columns
        (with a warning and a default value if the column is missing).
        Every other name is read verbatim from ``atom_site`` as string
        array.
        The given collection is not modified.
    use_author_fields : bool
        Define if alternate fields prefixed with ``auth_`` should be used
        instead of ``label_``.
    """
    # Extra fields that get dedicated handling below and hence must not be
    # treated as custom 'atom_site' columns afterwards
    handled_fields = ("atom_id", "b_factor", "occupancy", "charge")

    prefix, alt_prefix = ("auth", "label") if use_author_fields else ("label", "auth")

    array.set_annotation(
        "chain_id",
        _get_or_fallback(
            atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
        ).as_array(str),
    )
    array.set_annotation(
        "res_id",
        _get_or_fallback(
            atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
        ).as_array(int, -1),
    )
    array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
    array.set_annotation(
        "res_name",
        _get_or_fallback(
            atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
        ).as_array(str),
    )
    array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM")
    array.set_annotation(
        "atom_name",
        _get_or_fallback(
            atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
        ).as_array(str),
    )
    array.set_annotation("element", atom_site["type_symbol"].as_array(str))

    if "atom_id" in extra_fields:
        if "id" in atom_site:
            array.set_annotation("atom_id", atom_site["id"].as_array(int))
        else:
            warnings.warn(
                "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
                UserWarning,
            )
            array.set_annotation("atom_id", np.arange(array.array_length()))
    if "b_factor" in extra_fields:
        if "B_iso_or_equiv" in atom_site:
            array.set_annotation(
                "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
            )
        else:
            warnings.warn(
                "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
                UserWarning,
            )
            array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
    if "occupancy" in extra_fields:
        if "occupancy" in atom_site:
            array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
        else:
            warnings.warn(
                "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
                UserWarning,
            )
            array.set_annotation(
                "occupancy", np.ones(array.array_length(), dtype=float)
            )
    if "charge" in extra_fields:
        if "pdbx_formal_charge" in atom_site:
            array.set_annotation(
                "charge",
                atom_site["pdbx_formal_charge"].as_array(
                    int, 0
                ),  # masked values are set to 0
            )
        else:
            warnings.warn(
                "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
                UserWarning,
            )
            array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))

    # Handle all remaining custom fields
    # The handled names are skipped here instead of being removed from
    # 'extra_fields', so the caller's collection stays untouched
    for field in extra_fields:
        if field in handled_fields:
            continue
        array.set_annotation(field, atom_site[field].as_array(str))
544
+
545
+
546
def _parse_intra_residue_bonds(chem_comp_bond):
    """
    Build a `custom_bond_dict` suitable for
    :func:`connect_via_residue_names()` from the ``chem_comp_bond``
    category.

    The result maps each residue name to a dictionary that maps a pair
    of atom names to the corresponding bond type.
    Combinations of bond order and aromaticity flag that are unknown
    are translated into ``BondType.ANY``.
    """
    rows = zip(
        chem_comp_bond["comp_id"].as_array(str),
        chem_comp_bond["atom_id_1"].as_array(str),
        chem_comp_bond["atom_id_2"].as_array(str),
        chem_comp_bond["value_order"].as_array(str),
        chem_comp_bond["pdbx_aromatic_flag"].as_array(str),
    )

    bonds_per_residue = {}
    for comp_name, first_atom, second_atom, bond_order, aromaticity in rows:
        # Create the per-residue dictionary on first encounter
        residue_bonds = bonds_per_residue.setdefault(comp_name, {})
        residue_bonds[first_atom.item(), second_atom.item()] = (
            COMP_BOND_ORDER_TO_TYPE.get(
                (bond_order.upper(), aromaticity), BondType.ANY
            )
        )
    return bonds_per_residue
566
+
567
+
568
def _parse_inter_residue_bonds(atom_site, struct_conn):
    """
    Create inter-residue bonds by parsing the ``struct_conn`` category.
    The atom indices of each bond are found by matching the bond labels
    to the ``atom_site`` category.

    Only rows whose ``conn_type_id`` is a key of
    ``PDBX_BOND_TYPE_ID_TO_TYPE`` are considered.
    If the ``ptnr1_symmetry``/``ptnr2_symmetry`` columns exist, rows
    where either partner is not at the identity operation ``1_555``
    are dropped as well.
    Rows for which one of the partners cannot be found in `atom_site`
    are silently skipped.

    Parameters
    ----------
    atom_site : CIFCategory or BinaryCIFCategory
        The ``atom_site`` category the bond partners are matched to.
    struct_conn : CIFCategory or BinaryCIFCategory
        The ``struct_conn`` category describing the connections.

    Returns
    -------
    bonds : BondList
        The bonds with atom indices pointing into the rows of
        `atom_site`.
    """
    # Identity symmetry operation
    IDENTITY = "1_555"
    # Columns in 'atom_site' that should be matched by 'struct_conn'
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "label_alt_id",
        "auth_asym_id",
        "auth_comp_id",
        "auth_seq_id",
        "pdbx_PDB_ins_code",
    ]

    # Select the rows whose connection type can be translated into a bond type
    covale_mask = np.isin(
        struct_conn["conn_type_id"].as_array(str),
        list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
    )
    # Masked symmetry values are replaced by the identity operation,
    # i.e. they do not exclude a row
    if "ptnr1_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
    if "ptnr2_symmetry" in struct_conn:
        covale_mask &= struct_conn["ptnr2_symmetry"].as_array(str, IDENTITY) == IDENTITY

    # One array of 'atom_site' row indices for each of the two bond partners
    atom_indices = [None] * 2
    for i in range(2):
        reference_arrays = []
        query_arrays = []
        for col_name in COLUMNS:
            struct_conn_col_name = _get_struct_conn_col_name(col_name, i + 1)
            # Only columns present in both categories can be compared
            if col_name not in atom_site or struct_conn_col_name not in struct_conn:
                continue
            # Ensure both arrays have the same dtype to allow comparison
            reference = atom_site[col_name].as_array()
            dtype = reference.dtype
            query = struct_conn[struct_conn_col_name].as_array(dtype)
            if np.issubdtype(reference.dtype, str):
                # The mask value is not necessarily consistent
                # between query and reference
                # -> make it consistent
                reference[reference == "?"] = "."
                query[query == "?"] = "."
            reference_arrays.append(reference)
            # The query is restricted to the selected rows,
            # the reference keeps all 'atom_site' rows
            query_arrays.append(query[covale_mask])
        # Match the combination of 'label_asym_id', 'label_comp_id', etc.
        # in 'atom_site' and 'struct_conn'
        atom_indices[i] = _find_matches(query_arrays, reference_arrays)
    atoms_indices_1 = atom_indices[0]
    atoms_indices_2 = atom_indices[1]

    # Some bonds in 'struct_conn' may not be found in 'atom_site'
    # This is okay,
    # as 'atom_site' might already be reduced to a single model
    mapping_exists_mask = (atoms_indices_1 != -1) & (atoms_indices_2 != -1)
    atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
    atoms_indices_2 = atoms_indices_2[mapping_exists_mask]

    bond_type_id = struct_conn["conn_type_id"].as_array()
    # Consecutively apply the same masks as applied to the atom indices
    # Logical combination does not work here,
    # as the second mask was created based on already filtered data
    bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
    # The type ID is always present in the dictionary,
    # as it was used to filter the applicable bonds
    bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]

    # Each row of the stacked array is (atom index 1, atom index 2, bond type)
    return BondList(
        atom_site.row_count,
        np.stack([atoms_indices_1, atoms_indices_2, bond_types], axis=-1),
    )
644
+
645
+
646
+ def _find_matches(query_arrays, reference_arrays):
647
+ """
648
+ For each index in the `query_arrays` find the indices in the
649
+ `reference_arrays` where all query values match the reference counterpart.
650
+ If no match is found for a query, the corresponding index is -1.
651
+ """
652
+ match_masks_for_all_columns = np.stack(
653
+ [
654
+ query[:, np.newaxis] == reference[np.newaxis, :]
655
+ for query, reference in zip(query_arrays, reference_arrays)
656
+ ],
657
+ axis=-1,
658
+ )
659
+ match_masks = np.all(match_masks_for_all_columns, axis=-1)
660
+ query_matches, reference_matches = np.where(match_masks)
661
+
662
+ # Duplicate matches indicate that an atom from the query cannot
663
+ # be uniquely matched to an atom in the reference
664
+ unique_query_matches, counts = np.unique(query_matches, return_counts=True)
665
+ if np.any(counts > 1):
666
+ ambiguous_query = unique_query_matches[np.where(counts > 1)[0][0]]
667
+ raise InvalidFileError(
668
+ f"The covalent bond in the 'struct_conn' category at index "
669
+ f"{ambiguous_query} cannot be unambiguously assigned to atoms in "
670
+ f"the 'atom_site' category"
671
+ )
672
+
673
+ # -1 indicates that no match was found in the reference
674
+ match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
675
+ match_indices[query_matches] = reference_matches
676
+ return match_indices
677
+
678
+
679
+ def _get_struct_conn_col_name(col_name, partner):
680
+ """
681
+ For a column name in ``atom_site`` get the corresponding column name
682
+ in ``struct_conn``.
683
+ """
684
+ if col_name == "label_alt_id":
685
+ return f"pdbx_ptnr{partner}_label_alt_id"
686
+ elif col_name.startswith("pdbx_"):
687
+ # Move 'pdbx_' to front
688
+ return f"pdbx_ptnr{partner}_{col_name[5:]}"
689
+ else:
690
+ return f"ptnr{partner}_{col_name}"
691
+
692
+
693
+ def _filter_altloc(array, atom_site, altloc):
694
+ altloc_ids = atom_site.get("label_alt_id")
695
+ occupancy = atom_site.get("occupancy")
696
+
697
+ # Filter altloc IDs and return
698
+ if altloc_ids is None:
699
+ return array
700
+ elif altloc == "occupancy" and occupancy is not None:
701
+ return array[
702
+ ...,
703
+ filter_highest_occupancy_altloc(
704
+ array, altloc_ids.as_array(str), occupancy.as_array(float)
705
+ ),
706
+ ]
707
+ # 'first' is also fallback if file has no occupancy information
708
+ elif altloc == "first":
709
+ return array[..., filter_first_altloc(array, altloc_ids.as_array(str))]
710
+ elif altloc == "all":
711
+ array.set_annotation("altloc_id", altloc_ids.as_array(str))
712
+ return array
713
+ else:
714
+ raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
715
+
716
+
717
+ def _get_model_starts(model_array):
718
+ """
719
+ Get the start index for each model in the arrays of the
720
+ ``atom_site`` category.
721
+ """
722
+ _, indices = np.unique(model_array, return_index=True)
723
+ indices.sort()
724
+ return indices
725
+
726
+
727
def _filter_model(atom_site, model_starts, model):
    """
    Restrict the ``atom_site`` category to the rows of the given model.

    `model` is the 1-based position of the model within `model_starts`.
    """
    # The row count acts as exclusive stop of the last model
    boundaries = np.append(model_starts, [atom_site.row_count])
    # Model numbering starts at 1, hence the model spans
    # 'boundaries[model - 1]' to 'boundaries[model]'
    first_row = boundaries[model - 1]
    stop_row = boundaries[model]
    return _filter(atom_site, slice(first_row, stop_row))
738
+
739
+
740
+ def _get_box(block):
741
+ cell = block.get("cell")
742
+ if cell is None:
743
+ return None
744
+ try:
745
+ len_a, len_b, len_c = [
746
+ float(cell[length].as_item())
747
+ for length in ["length_a", "length_b", "length_c"]
748
+ ]
749
+ alpha, beta, gamma = [
750
+ np.deg2rad(float(cell[angle].as_item()))
751
+ for angle in ["angle_alpha", "angle_beta", "angle_gamma"]
752
+ ]
753
+ except ValueError:
754
+ # 'cell_dict' has no proper unit cell values, e.g. '?'
755
+ return None
756
+ return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
757
+
758
+
759
def set_structure(
    pdbx_file,
    array,
    data_block=None,
    include_bonds=False,
    extra_fields=[],
):
    """
    Set the ``atom_site`` category with atom information from an
    :class:`AtomArray` or :class:`AtomArrayStack`.

    This will save the coordinates, the mandatory annotation categories
    and the optional annotation categories
    ``atom_id``, ``b_factor``, ``occupancy`` and ``charge``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of continuous
    numbering.
    Furthermore, inter-residue bonds will be written into the
    ``struct_conn`` category.
    If the structure has an associated box, it is written into the
    ``cell`` category.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray or AtomArrayStack
        The structure to be written. If a stack is given, each array in
        the stack will be in a separate model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
        If the file is empty, a new data block will be created.
    include_bonds : bool, optional
        If set to true and `array` has associated ``bonds`` , the
        intra-residue bonds will be written into the ``chem_comp_bond``
        category.
        Inter-residue bonds will be written into the ``struct_conn``
        independent of this parameter.
    extra_fields : list of str, optional
        List of additional annotation categories of `array` that
        should be written as columns of the same name into the
        ``atom_site`` category.
        Default is an empty list.

    Raises
    ------
    BadStructureError
        If `array` contains no atoms (or no models).
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`, or if a name in `extra_fields` clashes
        with a standard annotation or an already written ``atom_site``
        column.

    Notes
    -----
    In some cases, the written inter-residue bonds cannot be read again
    due to ambiguity to which atoms the bond refers.
    This is the case, when two equal residues in the same chain have
    the same (or a masked) `res_id`.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile()
    >>> set_structure(file, atom_array)
    >>> file.write(os.path.join(path_to_directory, "structure.cif"))

    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    # Obtain the category and column classes that belong to the block type
    Category = block.subcomponent_class()
    Column = Category.subcomponent_class()

    # Fill PDBx columns from information
    # in structures' attribute arrays as good as possible
    atom_site = Category()
    atom_site["group_PDB"] = np.where(array.hetero, "HETATM", "ATOM")
    atom_site["type_symbol"] = np.copy(array.element)
    atom_site["label_atom_id"] = np.copy(array.atom_name)
    atom_site["label_alt_id"] = Column(
        # AtomArrays do not store altloc atoms
        np.full(array.array_length(), "."),
        np.full(array.array_length(), MaskValue.INAPPLICABLE),
    )
    atom_site["label_comp_id"] = np.copy(array.res_name)
    atom_site["label_asym_id"] = np.copy(array.chain_id)
    atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
    atom_site["label_seq_id"] = np.copy(array.res_id)
    atom_site["pdbx_PDB_ins_code"] = Column(
        np.copy(array.ins_code),
        # An empty insertion code is written as masked value
        np.where(array.ins_code == "", MaskValue.INAPPLICABLE, MaskValue.PRESENT),
    )
    # The 'auth_' columns reuse the respective 'label_' columns
    atom_site["auth_seq_id"] = atom_site["label_seq_id"]
    atom_site["auth_comp_id"] = atom_site["label_comp_id"]
    atom_site["auth_asym_id"] = atom_site["label_asym_id"]
    atom_site["auth_atom_id"] = atom_site["label_atom_id"]

    # Optional annotations are only written, if the structure has them
    annot_categories = array.get_annotation_categories()
    if "atom_id" in annot_categories:
        atom_site["id"] = np.copy(array.atom_id)
    if "b_factor" in annot_categories:
        atom_site["B_iso_or_equiv"] = np.copy(array.b_factor)
    if "occupancy" in annot_categories:
        atom_site["occupancy"] = np.copy(array.occupancy)
    if "charge" in annot_categories:
        # A charge of 0 is written as missing value ('?'),
        # all other charges are written with explicit sign
        atom_site["pdbx_formal_charge"] = Column(
            np.array([f"{c:+d}" if c != 0 else "?" for c in array.charge]),
            np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
        )

    # Handle all remaining custom fields
    if len(extra_fields) > 0:
        # ... check to avoid clashes with standard annotations
        _standard_annotations = [
            "hetero",
            "element",
            "atom_name",
            "res_name",
            "chain_id",
            "res_id",
            "ins_code",
            "atom_id",
            "b_factor",
            "occupancy",
            "charge",
        ]
        # Columns written so far are reserved as well
        _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations

        for annot in extra_fields:
            if annot in _reserved_annotation_names:
                raise ValueError(
                    f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
                    "Please choose another name."
                )
            atom_site[annot] = np.copy(array.get_annotation(annot))

    # Bonds are written before 'atom_site' is repeated for multiple models,
    # i.e. the helpers see one row per atom
    if array.bonds is not None:
        struct_conn = _set_inter_residue_bonds(array, atom_site)
        if struct_conn is not None:
            block["struct_conn"] = struct_conn
        if include_bonds:
            chem_comp_bond = _set_intra_residue_bonds(array, atom_site)
            if chem_comp_bond is not None:
                block["chem_comp_bond"] = chem_comp_bond

    # In case of a single model handle each coordinate
    # simply like a flattened array
    if isinstance(array, AtomArray) or (
        isinstance(array, AtomArrayStack) and array.stack_depth() == 1
    ):
        # 'ravel' flattens coord without copy
        # in case of stack with stack_depth = 1
        atom_site["Cartn_x"] = np.copy(np.ravel(array.coord[..., 0]))
        atom_site["Cartn_y"] = np.copy(np.ravel(array.coord[..., 1]))
        atom_site["Cartn_z"] = np.copy(np.ravel(array.coord[..., 2]))
        atom_site["pdbx_PDB_model_num"] = np.ones(array.array_length(), dtype=np.int32)
    # In case of multiple models repeat annotations
    # and use model specific coordinates
    else:
        atom_site = _repeat(atom_site, array.stack_depth())
        # Flatten the model dimension: one row per atom and model
        coord = np.reshape(array.coord, (array.stack_depth() * array.array_length(), 3))
        atom_site["Cartn_x"] = np.copy(coord[:, 0])
        atom_site["Cartn_y"] = np.copy(coord[:, 1])
        atom_site["Cartn_z"] = np.copy(coord[:, 2])
        # Model numbers count from 1, each repeated for all atoms of the model
        atom_site["pdbx_PDB_model_num"] = np.repeat(
            np.arange(1, array.stack_depth() + 1, dtype=np.int32),
            repeats=array.array_length(),
        )
    if "atom_id" not in annot_categories:
        # Count from 1
        atom_site["id"] = np.arange(1, len(atom_site["group_PDB"]) + 1)
    block["atom_site"] = atom_site

    # Write box into file
    if array.box is not None:
        # PDBx files can only store one box for all models
        # -> Use first box
        if array.box.ndim == 3:
            box = array.box[0]
        else:
            box = array.box
        len_a, len_b, len_c, alpha, beta, gamma = unitcell_from_vectors(box)
        cell = Category()
        cell["length_a"] = len_a
        cell["length_b"] = len_b
        cell["length_c"] = len_c
        # The angles are written in degrees
        cell["angle_alpha"] = np.rad2deg(alpha)
        cell["angle_beta"] = np.rad2deg(beta)
        cell["angle_gamma"] = np.rad2deg(gamma)
        block["cell"] = cell
943
+
944
+
945
def _check_non_empty(array):
    """
    Ensure that the given structure contains at least one atom
    (and at least one model in case of an atom array stack).

    Raises
    ------
    BadStructureError
        If the structure is empty.
    ValueError
        If `array` is neither an :class:`AtomArray` nor an
        :class:`AtomArrayStack`.
    """
    if isinstance(array, AtomArray):
        is_empty = array.array_length() == 0
    elif isinstance(array, AtomArrayStack):
        is_empty = array.array_length() == 0 or array.stack_depth() == 0
    else:
        raise ValueError(
            "Structure must be AtomArray or AtomArrayStack, "
            f"but got {type(array).__name__}"
        )

    if is_empty:
        raise BadStructureError("Structure must not be empty")
957
+
958
+
959
def _get_or_create_block(pdbx_component, block_name):
    """
    Get the block to write into.

    A component that is not a file is assumed to be a block already and
    is returned as it is.
    For a file the block with the given name is returned and created
    beforehand, if it does not exist yet.
    If `block_name` is ``None``, the first block of the file is used,
    or the name ``"structure"`` if the file is empty.
    """
    Block = pdbx_component.subcomponent_class()

    if not isinstance(pdbx_component, (CIFFile, BinaryCIFFile)):
        # Already a block
        return pdbx_component

    if block_name is None:
        if len(pdbx_component) > 0:
            block_name = next(iter(pdbx_component.keys()))
        else:
            # File is empty -> invent a new block name
            block_name = "structure"

    if block_name not in pdbx_component:
        pdbx_component[block_name] = Block()
    return pdbx_component[block_name]
977
+
978
+
979
+ def _determine_entity_id(chain_id):
980
+ entity_id = np.zeros(len(chain_id), dtype=int)
981
+ # Dictionary that translates chain_id to entity_id
982
+ id_translation = {}
983
+ id = 1
984
+ for i in range(len(chain_id)):
985
+ try:
986
+ entity_id[i] = id_translation[chain_id[i]]
987
+ except KeyError:
988
+ # chain_id is not in dictionary -> new entry
989
+ id_translation[chain_id[i]] = id
990
+ entity_id[i] = id_translation[chain_id[i]]
991
+ id += 1
992
+ return entity_id
993
+
994
+
995
def _repeat(category, repetitions):
    """
    Create a new category of the same type, where the values (and masks)
    of each column are repeated `repetitions` times via ``np.tile()``.
    """
    Category = type(category)
    Column = Category.subcomponent_class()
    Data = Column.subcomponent_class()

    repeated_columns = {}
    for name, column in category.items():
        tiled_values = np.tile(column.data.array, repetitions)
        if isinstance(column, BinaryCIFColumn):
            encoding = column.data.encoding
            # Optimization: The repeated string array has the same
            # unique values, as the original string array
            # -> Use same unique values (faster due to shorter array)
            if isinstance(encoding[0], StringArrayEncoding):
                encoding[0].strings = np.unique(column.data.array)
            data = Data(tiled_values, encoding)
        else:
            data = Data(tiled_values)

        if column.mask is None:
            mask = None
        else:
            mask = Data(np.tile(column.mask.array, repetitions))
        repeated_columns[name] = Column(data, mask)
    return Category(repeated_columns)
1019
+
1020
+
1021
def _set_intra_residue_bonds(array, atom_site):
    """
    Build the ``chem_comp_bond`` category from the intra-residue bonds
    of the structure.

    `atom_site` only serves to derive the matching :class:`Category`
    type (either :class:`CIFCategory` or :class:`BinaryCIFCategory`).
    ``None`` is returned, if the structure has no intra-residue bonds.
    """
    if (array.res_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty residue name, "
            "but it is required to write intra-residue bonds"
        )
    if (array.atom_name == "").any():
        raise BadStructureError(
            "Structure contains atoms with empty atom name, "
            "but it is required to write intra-residue bonds"
        )

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    intra_bonds = _filter_bonds(array, "intra")
    if len(intra_bonds) == 0:
        return None

    bond_types = intra_bonds[:, 2]
    is_any = bond_types == BondType.ANY
    value_order = np.zeros(len(intra_bonds), dtype="U4")
    aromatic_flag = np.zeros(len(intra_bonds), dtype="U1")
    for i, bond_type in enumerate(bond_types):
        # ANY bonds will be masked anyway, no need to set the value
        if bond_type != BondType.ANY:
            value_order[i], aromatic_flag[i] = COMP_BOND_TYPE_TO_ORDER[bond_type]

    # Both atoms of an intra-residue bond share the same residue name,
    # hence it is taken from the first atom of each bond
    first_atoms = intra_bonds[:, 0]
    second_atoms = intra_bonds[:, 1]
    comp_id = array.res_name[first_atoms]
    atom_id_1 = array.atom_name[first_atoms]
    atom_id_2 = array.atom_name[second_atoms]

    # A residue that appears multiple times in the structure would lead
    # to duplicate rows -> keep only the first occurrence of each
    # residue and atom name combination, preserving the original order
    identifiers = np.stack([comp_id, atom_id_1, atom_id_2], axis=-1)
    kept = np.sort(np.unique(identifiers, axis=0, return_index=True)[1])
    n_bonds = len(kept)
    any_kept = is_any[kept]

    chem_comp_bond = Category()
    chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32)
    chem_comp_bond["comp_id"] = comp_id[kept]
    chem_comp_bond["atom_id_1"] = atom_id_1[kept]
    chem_comp_bond["atom_id_2"] = atom_id_2[kept]
    chem_comp_bond["value_order"] = Column(
        value_order[kept],
        np.where(any_kept, MaskValue.MISSING, MaskValue.PRESENT),
    )
    chem_comp_bond["pdbx_aromatic_flag"] = Column(
        aromatic_flag[kept],
        np.where(any_kept, MaskValue.MISSING, MaskValue.PRESENT),
    )
    # BondList does not contain stereo information
    # -> all values are missing
    chem_comp_bond["pdbx_stereo_config"] = Column(
        np.zeros(n_bonds, dtype="U1"),
        np.full(n_bonds, MaskValue.MISSING),
    )
    return chem_comp_bond
1089
+
1090
+
1091
def _set_inter_residue_bonds(array, atom_site):
    """
    Build the ``struct_conn`` category from the inter-residue bonds of
    the structure.

    The bond partners are identified by means of the respective
    columns of the ``atom_site`` category.
    Bonds selected by :func:`_filter_canonical_links()` are not written.
    ``None`` is returned, if no bond remains to be written.
    """
    COLUMNS = [
        "label_asym_id",
        "label_comp_id",
        "label_seq_id",
        "label_atom_id",
        "pdbx_PDB_ins_code",
    ]

    Category = type(atom_site)
    Column = Category.subcomponent_class()

    inter_bonds = _filter_bonds(array, "inter")
    if len(inter_bonds) == 0:
        return None

    # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
    # nucleotide/amino acid residues
    is_canonical_link = _filter_canonical_links(array, inter_bonds)
    inter_bonds = inter_bonds[~is_canonical_link]
    if len(inter_bonds) == 0:
        return None

    bond_types = inter_bonds[:, 2]
    value_order_mask = np.where(
        np.isin(bond_types, (BondType.ANY, BondType.COORDINATION)),
        MaskValue.MISSING,
        MaskValue.PRESENT,
    )

    struct_conn = Category()
    struct_conn["id"] = np.arange(1, len(inter_bonds) + 1)
    struct_conn["conn_type_id"] = [
        PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_types
    ]
    struct_conn["pdbx_value_order"] = Column(
        np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_types]),
        value_order_mask,
    )
    # Write each identifying annotation for both bond partners
    partner_atom_indices = (inter_bonds[:, 0], inter_bonds[:, 1])
    for col_name in COLUMNS:
        annot = atom_site[col_name].as_array()
        for partner, atom_indices in enumerate(partner_atom_indices, start=1):
            target_name = _get_struct_conn_col_name(col_name, partner)
            struct_conn[target_name] = annot[atom_indices]
    return struct_conn
1142
+
1143
+
1144
def _filter_bonds(array, connection):
    """
    Select the bonds of the structure that connect either atoms within
    the same residue (``connection="intra"``) or atoms from different
    residues (``connection="inter"``).

    The selected rows of the bond array are returned.
    Any other `connection` value raises a :class:`ValueError`.
    """
    bonds = array.bonds.as_array()
    # A single 'get_residue_starts_for()' call for the atoms of all bonds
    # saves computation time; afterwards each row holds the residue starts
    # of the first and second atom of a bond
    residue_starts = get_residue_starts_for(array, bonds[:, :2].flatten()).reshape(
        -1, 2
    )
    in_same_residue = residue_starts[:, 0] == residue_starts[:, 1]

    if connection == "intra":
        return bonds[in_same_residue]
    if connection == "inter":
        return bonds[~in_same_residue]
    raise ValueError("Invalid 'connection' option")
1161
+
1162
+
1163
def _filter_canonical_links(array, bond_array):
    """
    Identify the 'standard' backbone links, i.e. the bonds between
    adjacent canonical amino acid (``C`` -> ``N``) or nucleotide
    (``O3'`` -> ``P``) residues.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray, shape=(n,3)
        The bonds to be checked.
        The first two columns contain the atom indices.

    Returns
    -------
    mask : ndarray, shape=(n,), dtype=bool
        True for each bond that is a canonical link.
    """
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
        -1, 2
    )

    return (
        # Must be canonical residues
        np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
        np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
        # Must be backbone bond
        np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
        np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
        # Must connect adjacent residues
        # The parentheses are required: '&' binds tighter than '==',
        # without them the entire '&' chain would be compared to 1,
        # which would accept every odd residue distance
        (residue_indices[:, 1] - residue_indices[:, 0] == 1)
    )  # fmt: skip
1182
+
1183
+
1184
def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
    """
    Read a chemical component into an :class:`AtomArray`.

    The atoms are taken from the ``chem_comp_atom`` category and the
    bonds, if that category is available, from the ``chem_comp_bond``
    category of the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    use_ideal_coord : bool, optional
        If true, the *ideal* coordinates are read from the file
        (``pdbx_model_Cartn_<dim>_ideal`` fields), typically
        originating from computations.
        If set to false, alternative coordinates are read
        (``model_Cartn_<dim>`` fields).
        If a preferred field is absent or coordinates are missing for
        some atoms, a warning is issued and the respective other set of
        fields is used as fallback.
    res_name : str, optional
        In rare cases the categories may contain rows for multiple
        components.
        In this case, the component with the given residue name is
        read.
        By default, all rows would be read in this case.

    Returns
    -------
    array : AtomArray
        The parsed chemical component.

    Raises
    ------
    InvalidFileError
        If the file has no ``chem_comp_atom`` category.
    KeyError
        If `res_name` is given, but the ``chem_comp_atom`` category
        contains no row for it.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(
    ...     os.path.join(path_to_structures, "molecules", "TYR.cif")
    ... )
    >>> comp = get_component(file)
    >>> print(comp)
    HET 0 TYR N N 1.320 0.952 1.428
    HET 0 TYR CA C -0.018 0.429 1.734
    HET 0 TYR C C -0.103 0.094 3.201
    HET 0 TYR O O 0.886 -0.254 3.799
    HET 0 TYR CB C -0.274 -0.831 0.907
    HET 0 TYR CG C -0.189 -0.496 -0.559
    HET 0 TYR CD1 C 1.022 -0.589 -1.219
    HET 0 TYR CD2 C -1.324 -0.102 -1.244
    HET 0 TYR CE1 C 1.103 -0.282 -2.563
    HET 0 TYR CE2 C -1.247 0.210 -2.587
    HET 0 TYR CZ C -0.032 0.118 -3.252
    HET 0 TYR OH O 0.044 0.420 -4.574
    HET 0 TYR OXT O -1.279 0.184 3.842
    HET 0 TYR H H 1.977 0.225 1.669
    HET 0 TYR H2 H 1.365 1.063 0.426
    HET 0 TYR HA H -0.767 1.183 1.489
    HET 0 TYR HB2 H 0.473 -1.585 1.152
    HET 0 TYR HB3 H -1.268 -1.219 1.134
    HET 0 TYR HD1 H 1.905 -0.902 -0.683
    HET 0 TYR HD2 H -2.269 -0.031 -0.727
    HET 0 TYR HE1 H 2.049 -0.354 -3.078
    HET 0 TYR HE2 H -2.132 0.523 -3.121
    HET 0 TYR HH H -0.123 -0.399 -5.059
    HET 0 TYR HXT H -1.333 -0.030 4.784
    """
    block = _get_block(pdbx_file, data_block)

    ### Atoms
    try:
        atom_category = block["chem_comp_atom"]
    except KeyError:
        raise InvalidFileError("Missing 'chem_comp_atom' category in file")
    if res_name is not None:
        # Keep only the rows belonging to the requested component
        is_requested = atom_category["comp_id"].as_array() == res_name
        atom_category = _filter(atom_category, is_requested)
        if atom_category.row_count == 0:
            raise KeyError(
                f"No rows with residue name '{res_name}' found in "
                f"'chem_comp_atom' category"
            )

    array = AtomArray(atom_category.row_count)

    array.set_annotation("hetero", np.full(len(atom_category["comp_id"]), True))
    for annot_name, field_name in (
        ("res_name", "comp_id"),
        ("atom_name", "atom_id"),
        ("element", "type_symbol"),
    ):
        array.set_annotation(annot_name, atom_category[field_name].as_array(str))
    array.set_annotation("charge", atom_category["charge"].as_array(int, 0))

    ### Coordinates
    ideal_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
    model_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
    if use_ideal_coord:
        coord_fields, alt_coord_fields = ideal_fields, model_fields
    else:
        coord_fields, alt_coord_fields = model_fields, ideal_fields

    use_fallback = False
    try:
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in coord_fields]
        )
    except KeyError as err:
        # One of the preferred coordinate fields does not exist
        key = err.args[0]
        warnings.warn(
            f"Attribute '{key}' not found within 'chem_comp_atom' category. "
            f"The fallback coordinates will be used instead",
            UserWarning,
        )
        use_fallback = True
    except ValueError:
        # The preferred fields exist, but are not usable for all atoms
        warnings.warn(
            "The coordinates are missing for some atoms. "
            "The fallback coordinates will be used instead",
            UserWarning,
        )
        use_fallback = True
    if use_fallback:
        array.coord = _parse_component_coordinates(
            [atom_category[field] for field in alt_coord_fields]
        )

    ### Bonds
    try:
        bond_category = block["chem_comp_bond"]
        if res_name is not None:
            bond_category = _filter(
                bond_category, bond_category["comp_id"].as_array() == res_name
            )
    except KeyError:
        warnings.warn(
            "Category 'chem_comp_bond' not found. No bonds will be parsed",
            UserWarning,
        )
        # Without bond information the array is returned without a BondList
        return array

    bonds = BondList(array.array_length())
    atom_names = array.atom_name
    bond_rows = zip(
        bond_category["atom_id_1"].as_array(str),
        bond_category["atom_id_2"].as_array(str),
        bond_category["value_order"].as_array(str),
        bond_category["pdbx_aromatic_flag"].as_array(str),
    )
    for name_1, name_2, order, aromatic_flag in bond_rows:
        # Bonds reference atoms by name -> use the first atom with that name
        index_1 = np.flatnonzero(atom_names == name_1)[0]
        index_2 = np.flatnonzero(atom_names == name_2)[0]
        bonds.add_bond(index_1, index_2, COMP_BOND_ORDER_TO_TYPE[order, aromatic_flag])
    array.bonds = bonds

    return array
1331
+
1332
+
1333
def _parse_component_coordinates(coord_columns):
    """
    Combine the given coordinate columns (one per dimension) into a
    single ``(n, 3)`` ``float32`` array.

    A :class:`ValueError` is raised, as soon as a column is encountered
    whose mask marks any value.
    """
    n_atoms = len(coord_columns[0])
    xyz = np.zeros((n_atoms, 3), dtype=np.float32)
    for dim, col in enumerate(coord_columns):
        mask = col.mask
        # A column without mask has a value in every row
        if mask is not None and mask.array.any():
            raise ValueError("Missing coordinates for some atoms")
        xyz[:, dim] = col.as_array(np.float32)
    return xyz
1342
+
1343
+
1344
def set_component(pdbx_file, array, data_block=None):
    """
    Write a chemical component given as :class:`AtomArray` into the
    ``chem_comp_atom`` and, if bonds are available, the
    ``chem_comp_bond`` category.

    The coordinates, the mandatory annotation categories, the optional
    ``charge`` category and an associated :class:`BondList`
    (if available) are saved.
    The same coordinates are written to both, the ``model_Cartn_<dim>``
    and the ``pdbx_model_Cartn_<dim>_ideal`` fields.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    array : AtomArray
        The chemical component to be written.
        Must contain only a single residue.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the file is empty, a new data block will be created.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Raises
    ------
    BadStructureError
        If `array` comprises more than one residue.
    """
    _check_non_empty(array)

    block = _get_or_create_block(pdbx_file, data_block)
    Category = block.subcomponent_class()

    if get_residue_count(array) > 1:
        raise BadStructureError("The input atom array must comprise only one residue")
    res_name = array.res_name[0]
    n_atoms = array.array_length()

    # Atoms without charge information get the '?' placeholder
    if "charge" in array.get_annotation_categories():
        charge = array.charge.astype("U2")
    else:
        charge = np.full(n_atoms, "?", dtype="U2")

    ### Atoms
    atom_cat = Category()
    atom_cat["comp_id"] = np.full(n_atoms, res_name)
    atom_cat["atom_id"] = np.copy(array.atom_name)
    atom_cat["alt_atom_id"] = atom_cat["atom_id"]
    atom_cat["type_symbol"] = np.copy(array.element)
    atom_cat["charge"] = charge
    for axis, dim in enumerate(("x", "y", "z")):
        atom_cat[f"model_Cartn_{dim}"] = np.copy(array.coord[:, axis])
    for dim in ("x", "y", "z"):
        # The ideal coordinates reuse the columns written above
        atom_cat[f"pdbx_model_Cartn_{dim}_ideal"] = atom_cat[f"model_Cartn_{dim}"]
    atom_cat["pdbx_component_atom_id"] = atom_cat["atom_id"]
    atom_cat["pdbx_component_comp_id"] = atom_cat["comp_id"]
    atom_cat["pdbx_ordinal"] = np.arange(1, n_atoms + 1).astype(str)
    block["chem_comp_atom"] = atom_cat

    ### Bonds
    bond_list = array.bonds
    if bond_list is None or bond_list.get_bond_count() <= 0:
        # No bonds -> the 'chem_comp_bond' category is not written
        return

    bond_array = bond_list.as_array()
    n_bonds = len(bond_array)
    flag_pairs = [COMP_BOND_TYPE_TO_ORDER[bond_type] for bond_type in bond_array[:, 2]]
    order_flags = [order_flag for order_flag, _ in flag_pairs]
    aromatic_flags = [aromatic_flag for _, aromatic_flag in flag_pairs]

    bond_cat = Category()
    bond_cat["comp_id"] = np.full(n_bonds, res_name)
    bond_cat["atom_id_1"] = array.atom_name[bond_array[:, 0]]
    bond_cat["atom_id_2"] = array.atom_name[bond_array[:, 1]]
    bond_cat["value_order"] = np.array(order_flags)
    bond_cat["pdbx_aromatic_flag"] = np.array(aromatic_flags)
    bond_cat["pdbx_ordinal"] = np.arange(1, n_bonds + 1).astype(str)
    block["chem_comp_bond"] = bond_cat
1418
+
1419
+
1420
def list_assemblies(pdbx_file, data_block=None):
    """
    List the biological assemblies that are available for the structure
    in the given file.

    The information is read from the ``pdbx_struct_assembly`` category,
    which must therefore be present in the file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.

    Returns
    -------
    assemblies : dict of str -> str
        A dictionary that maps an assembly ID to a description of the
        corresponding assembly.

    Raises
    ------
    InvalidFileError
        If the file has no ``pdbx_struct_assembly`` category.

    Examples
    --------
    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly_ids = list_assemblies(file)
    >>> for key, val in assembly_ids.items():
    ...     print(f"'{key}' : '{val}'")
    '1' : 'complete icosahedral assembly'
    '2' : 'icosahedral asymmetric unit'
    '3' : 'icosahedral pentamer'
    '4' : 'icosahedral 23 hexamer'
    '5' : 'icosahedral asymmetric unit, std point frame'
    '6' : 'crystal asymmetric unit, crystal frame'
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_category = block["pdbx_struct_assembly"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly' category")

    assembly_ids = assembly_category["id"].as_array(str)
    descriptions = assembly_category["details"].as_array(str)
    # Both columns are row-aligned -> pair them up directly
    return dict(zip(assembly_ids, descriptions))
1473
+
1474
+
1475
def get_assembly(
    pdbx_file,
    assembly_id=None,
    model=None,
    data_block=None,
    altloc="first",
    extra_fields=None,
    use_author_fields=True,
    include_bonds=False,
):
    """
    Build the given biological assembly.

    The required information is read from the
    ``pdbx_struct_assembly_gen``, ``pdbx_struct_oper_list`` and
    ``atom_site`` categories, which must therefore be present in the
    file.

    Parameters
    ----------
    pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
        The file object.
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        By default, the assembly ID of the first row in the
        ``pdbx_struct_assembly_gen`` category is used.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the last
        model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the structure
        contains only one model.
    data_block : str, optional
        The name of the data block.
        Default is the first (and most times only) data block of the
        file.
        If the data block object is passed directly to `pdbx_file`,
        this parameter is ignored.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first *altloc* ID
              appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id`` annotation
              array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are entry names, that are
        additionally added as annotation arrays.
        The annotation category name will be the same as the PDBx
        subcategory name.
        The array type is always `str`.
        An exception are the special field identifiers:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
        These will convert the fitting subcategory into an
        annotation array with reasonable type.
    use_author_fields : bool, optional
        Some fields can be read from two alternative sources,
        for example both, ``label_seq_id`` and ``auth_seq_id`` describe
        the ID of the residue.
        While, the ``label_xxx`` fields can be used as official pointers
        to other categories in the file, the ``auth_xxx``
        fields are set by the author(s) of the structure and are
        consistent with the corresponding values in PDB files.
        If `use_author_fields` is true, the annotation arrays will be
        read from the ``auth_xxx`` fields (if applicable),
        otherwise from the ``label_xxx`` fields.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.
        Contains the `sym_id` annotation, which enumerates the copies of
        the asymmetric unit in the assembly.

    Raises
    ------
    InvalidFileError
        If the ``pdbx_struct_assembly_gen`` or ``pdbx_struct_oper_list``
        category is missing in the file.
    KeyError
        If the given `assembly_id` does not appear in the file.

    Examples
    --------

    >>> import os.path
    >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
    >>> assembly = get_assembly(file, model=1)
    """
    block = _get_block(pdbx_file, data_block)

    try:
        assembly_gen_category = block["pdbx_struct_assembly_gen"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_assembly_gen' category")

    try:
        struct_oper_category = block["pdbx_struct_oper_list"]
    except KeyError:
        raise InvalidFileError("File has no 'pdbx_struct_oper_list' category")

    gen_assembly_ids = assembly_gen_category["assembly_id"].as_array(str)
    if assembly_id is None:
        assembly_id = gen_assembly_ids[0]
    elif assembly_id not in gen_assembly_ids:
        raise KeyError(f"File has no Assembly ID '{assembly_id}'")

    ### Calculate all possible transformations
    transformations = _get_transformations(struct_oper_category)

    ### Get structure according to additional parameters
    # The operations are defined in terms of asym IDs
    # -> 'label_asym_id' is required as annotation array to select the
    # affected atoms, even if the user did not ask for it
    user_fields = [] if extra_fields is None else extra_fields
    asym_requested = "label_asym_id" in user_fields
    if asym_requested:
        fields_with_asym = user_fields
    else:
        fields_with_asym = user_fields + ["label_asym_id"]
    structure = get_structure(
        pdbx_file,
        model,
        data_block,
        altloc,
        fields_with_asym,
        use_author_fields,
        include_bonds,
    )

    ### Get transformations and apply them to the affected asym IDs
    assembly = None
    gen_rows = zip(
        gen_assembly_ids,
        assembly_gen_category["oper_expression"].as_array(str),
        assembly_gen_category["asym_id_list"].as_array(str),
    )
    for row_assembly_id, op_expr, asym_id_expr in gen_rows:
        # Only rows of the requested assembly are relevant;
        # it was checked above that at least one such row exists
        if row_assembly_id != assembly_id:
            continue
        operations = _parse_operation_expression(op_expr)
        affected_asym_ids = asym_id_expr.split(",")
        # Restrict the structure to the chains this row applies to
        asym_mask = np.isin(structure.label_asym_id, affected_asym_ids)
        sub_assembly = _apply_transformations(
            structure[..., asym_mask], transformations, operations
        )
        # Merge the chains of this row with the chains from the other rows
        if assembly is None:
            assembly = sub_assembly
        else:
            assembly += sub_assembly

    # 'label_asym_id' was only added for internal purposes
    # -> remove it again, unless the user requested it
    if not asym_requested:
        assembly.del_annotation("label_asym_id")

    return assembly
1643
+
1644
+
1645
def _apply_transformations(structure, transformation_dict, operations):
    """
    Create a subassembly from the input structure (containing only the
    affected asym IDs): one copy of the structure is created for each
    given operation.

    Each operation is a sequence of operation IDs, whose
    ``(rotation_matrix, translation_vector)`` tuples are looked up in
    `transformation_dict` and applied one after another.
    The copies are enumerated in the ``sym_id`` annotation.
    """
    n_copies = len(operations)
    # The additional first dimension is required by 'repeat()'
    copies_coord = np.zeros((n_copies,) + structure.coord.shape)
    for copy_index, op_steps in enumerate(operations):
        transformed = structure.coord
        for step_id in op_steps:
            rotation, translation = transformation_dict[step_id]
            # NOTE(review): the in-place translation below relies on
            # 'matrix_rotate()' returning a new array, otherwise
            # 'structure.coord' would be altered - confirm
            transformed = matrix_rotate(transformed, rotation)
            transformed += translation
        copies_coord[copy_index] = transformed

    # Atoms of the same copy share the same 'sym_id'
    sym_ids = np.repeat(np.arange(n_copies), structure.array_length())
    assembly = repeat(structure, copies_coord)
    assembly.set_annotation("sym_id", sym_ids)
    return assembly
1670
+
1671
+
1672
def _get_transformations(struct_oper):
    """
    Get transformation operation in terms of rotation matrix and
    translation for each operation ID in ``pdbx_struct_oper_list``.

    Parameters
    ----------
    struct_oper : mapping of str -> column
        The ``pdbx_struct_oper_list`` category.
        The ``id``, ``matrix[i][j]`` and ``vector[i]``
        (``i``, ``j`` in 1..3) columns are read from it.

    Returns
    -------
    transformation_dict : dict of str -> tuple(ndarray, ndarray)
        Maps each operation ID to its ``(3, 3)`` rotation matrix and
        its ``(3,)`` translation vector.
    """
    op_ids = struct_oper["id"].as_array(str)
    if len(op_ids) == 0:
        # Nothing to look up -> the matrix/vector columns are not accessed
        return {}

    # Convert each column only once for all operations,
    # instead of converting all 12 columns again for every single operation
    # Resulting shape: (3, 3, n_operations)
    rotation_elements = np.array(
        [
            [struct_oper[f"matrix[{i}][{j}]"].as_array(float) for j in (1, 2, 3)]
            for i in (1, 2, 3)
        ]
    )
    # Resulting shape: (3, n_operations)
    translation_elements = np.array(
        [struct_oper[f"vector[{i}]"].as_array(float) for i in (1, 2, 3)]
    )

    # Copies give each operation its own independent arrays
    return {
        op_id: (
            rotation_elements[:, :, index].copy(),
            translation_elements[:, index].copy(),
        )
        for index, op_id in enumerate(op_ids)
    }
1693
+
1694
+
1695
def _parse_operation_expression(expression):
    """
    Resolve an ``oper_expression`` into the operations it describes.

    Each returned operation is a tuple of operation IDs in the order
    they need to be applied.
    If the expression consists of multiple parenthesized groups, the
    cartesian product of the groups is formed.
    """
    # Each opening parenthesis starts a new group, while the closing
    # parenthesis carries no additional information
    # example: '(X0)(1-10,21-25)' from 1a34
    groups = [group for group in expression.replace(")", "").split("(") if group]

    ids_per_step = []
    # Important: Operations are applied from right to left
    for group in reversed(groups):
        step_ids = []
        for token in group.split(","):
            if "-" not in token:
                # Single operation ID
                step_ids.append(token)
                continue
            # Range of operation IDs, they must be integers
            first, last = token.split("-")
            step_ids.extend(str(number) for number in range(int(first), int(last) + 1))
        ids_per_step.append(step_ids)

    # Cartesian product of operations
    return list(itertools.product(*ids_per_step))
1727
+
1728
+
1729
def _convert_string_to_sequence(string, stype):
    """
    Convert a sequence string into a sequence object based on `stype`.

    A ``ProteinSequence`` is returned, if `stype` is contained in
    ``_proteinseq_type_list``, and a ``NucleotideSequence`` (with each
    ``U`` replaced by ``T``), if it is contained in
    ``_nucleotideseq_type_list``.
    For types in ``_other_type_list`` ``None`` is returned, all other
    types raise an :class:`InvalidFileError`.
    """
    # The sequence may be stored as multiline string
    sequence_str = string.replace("\n", "")

    if stype in _proteinseq_type_list:
        return ProteinSequence(sequence_str)
    if stype in _nucleotideseq_type_list:
        return NucleotideSequence(sequence_str.replace("U", "T"))
    if stype in _other_type_list:
        return None
    raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)