biotite 0.41.1__cp312-cp312-macosx_10_16_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (340) hide show
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-312-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-312-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-312-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-312-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-312-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1240 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ __name__ = "biotite.structure.io.pdb"
6
+ __author__ = "Patrick Kunzmann, Daniel Bauer, Claude J. Rogers"
7
+ __all__ = ["PDBFile"]
8
+
9
+ import warnings
10
+ import numpy as np
11
+ from ...atoms import AtomArray, AtomArrayStack, repeat
12
+ from ...bonds import BondList, connect_via_residue_names
13
+ from ...box import vectors_from_unitcell, unitcell_from_vectors
14
+ from ....file import TextFile, InvalidFileError
15
+ from ...repair import infer_elements
16
+ from ...error import BadStructureError
17
+ from ...filter import (
18
+ filter_first_altloc,
19
+ filter_highest_occupancy_altloc,
20
+ filter_solvent,
21
+ )
22
+ from ...util import matrix_rotate
23
+ from .hybrid36 import encode_hybrid36, decode_hybrid36, max_hybrid36_number
24
+
25
+
26
# NOTE(review): presumably the largest values that fit into the
# 5-column atom ID field and the 4-column residue ID field of a
# base-10 PDB file (see ``_atom_id`` and ``_res_id`` below) - confirm
_PDB_MAX_ATOMS = 99999
_PDB_MAX_RESIDUES = 9999

# slice objects for readability
# Each slice selects the columns of one field in a fixed-width record
# line; as Python slices they are 0-based with an exclusive end,
# e.g. ``line[_coord_x]`` gives the text of the x-coordinate field
# ATOM/HETATM
_record = slice(0, 6)
_atom_id = slice(6, 11)
_atom_name = slice(12, 16)
_alt_loc = slice(16, 17)
_res_name = slice(17, 20)
_chain_id = slice(21, 22)
_res_id = slice(22, 26)
_ins_code = slice(26, 27)
_coord_x = slice(30, 38)
_coord_y = slice(38, 46)
_coord_z = slice(46, 54)
_occupancy = slice(54, 60)
_temp_f = slice(60, 66)
_element = slice(76, 78)
_charge = slice(78, 80)
# CRYST1
# NOTE(review): the names suggest unit cell lengths (a, b, c) and
# angles (alpha, beta, gamma) - confirm against the code using them
_a = slice(6, 15)
_b = slice(15, 24)
_c = slice(24, 33)
_alpha = slice(33, 40)
_beta = slice(40, 47)
_gamma = slice(47, 54)
53
+
54
+
55
+ class PDBFile(TextFile):
56
+ r"""
57
+ This class represents a PDB file.
58
+
59
+ The usage of :mod:`biotite.structure.io.pdbx` is encouraged in favor
60
+ of this class.
61
+
62
+ This class only provides support for reading/writing the pure atom
63
+ information (*ATOM*, *HETATM*, *MODEL* and *ENDMDL* records). *TER*
64
+ records cannot be written.
65
+ Additionally, *REMARK* records can be read
66
+
67
+ See also
68
+ --------
69
+ CIFFile
70
+ BinaryCIFFile
71
+
72
+ Examples
73
+ --------
74
+ Load a `\\*.pdb` file, modify the structure and save the new
75
+ structure into a new file:
76
+
77
+ >>> import os.path
78
+ >>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
79
+ >>> array_stack = file.get_structure()
80
+ >>> array_stack_mod = rotate(array_stack, [1,2,3])
81
+ >>> file = PDBFile()
82
+ >>> file.set_structure(array_stack_mod)
83
+ >>> file.write(os.path.join(path_to_directory, "1l2y_mod.pdb"))
84
+ """
85
@classmethod
def read(cls, file):
    """
    Read a PDB file.

    The file is loaded by the ``read()`` method of the parent class.
    Afterwards each line is padded to 80 characters and the model
    and atom records are indexed.

    Parameters
    ----------
    file
        Passed unchanged to the ``read()`` method of the parent
        class.

    Returns
    -------
    pdb_file
        The object created by the parent class with padded lines.
    """
    pdb_file = super().read(file)
    # Records are sliced by fixed column ranges, hence lines that
    # are shorter than the required 80 characters are filled up
    # with whitespace
    padded_lines = []
    for raw_line in pdb_file.lines:
        padded_lines.append(raw_line.ljust(80))
    pdb_file.lines = padded_lines
    pdb_file._index_models_and_atoms()
    return pdb_file
93
+
94
+
95
def get_remark(self, number):
    r"""
    Get the lines containing the *REMARK* records with the given
    `number`.

    Parameters
    ----------
    number : int
        The *REMARK* number, i.e. the `XXX` in ``REMARK XXX``.

    Returns
    -------
    remark_lines : None or list of str
        The content of the selected *REMARK* lines.
        Each line is an element of this list.
        The ``REMARK XXX `` part of each line is omitted.
        Furthermore, the first line, which always must be empty, is
        not included.
        ``None`` is returned, if the selected *REMARK* records do not
        exist in the file.

    Raises
    ------
    ValueError
        If `number` is not in the range 0-999.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1l2y.pdb"))
    >>> remarks = file.get_remark(900)
    >>> print("\n".join(remarks))
    RELATED ENTRIES
    RELATED ID: 5292 RELATED DB: BMRB
    BMRB 5292 IS CHEMICAL SHIFTS FOR TC5B IN BUFFER AND BUFFER
    CONTAINING 30 VOL-% TFE.
    RELATED ID: 1JRJ RELATED DB: PDB
    1JRJ IS AN ANALAGOUS C-TERMINAL STRUCTURE.
    >>> nonexistent_remark = file.get_remark(999)
    >>> print(nonexistent_remark)
    None
    """
    # The record content starts after the ``REMARK XXX `` part
    CONTENT_START_COLUMN = 11

    # in case a non-integer is accidentally given
    number = int(number)
    if not 0 <= number <= 999:
        raise ValueError("The number must be in range 0-999")

    # The number is right-aligned in a field of three characters
    remark_string = f"REMARK {number:>3d}"
    # Find lines and omit ``REMARK XXX `` part
    remark_lines = [
        line[CONTENT_START_COLUMN:] for line in self.lines
        if line.startswith(remark_string)
    ]
    if not remark_lines:
        return None
    # Remove first empty line
    return remark_lines[1:]
151
+
152
+
153
def get_model_count(self):
    """
    Count the models stored in this PDB file.

    Returns
    -------
    model_count : int
        The number of models, i.e. the number of indexed model
        starts.
    """
    model_starts = self._model_start_i
    return len(model_starts)
163
+
164
+
165
def get_coord(self, model=None):
    """
    Read the atom coordinates from the PDB file, without creating a
    structure.

    Parameters
    ----------
    model : int, optional
        The number of the model (starting at 1) to take the
        coordinates from.
        In this case a 2D coordinate array is returned.
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, a 3D coordinate array
        containing all models is returned, even if the structure
        contains only one model.

    Returns
    -------
    coord : ndarray, shape=(m,n,3) or shape=(n,3), dtype=float32
        The coordinates read from the ATOM and HETATM records of the
        file.

    Notes
    -----
    Note that :func:`get_coord()` may output more coordinates than
    the atom array (stack) from the corresponding
    :func:`get_structure()` call has.
    The reason for this is, that :func:`get_structure()` filters
    *altloc* IDs, while `get_coord()` does not.

    Examples
    --------
    Read an :class:`AtomArrayStack` from multiple PDB files, where
    each PDB file contains the same atoms but different positions.
    This is an efficient approach when a trajectory is spread into
    multiple PDB files, as done e.g. by the *Rosetta* modeling
    software.

    For the purpose of this example, the PDB files are created from
    an existing :class:`AtomArrayStack`.

    >>> import os.path
    >>> from tempfile import gettempdir
    >>> file_names = []
    >>> for i in range(atom_array_stack.stack_depth()):
    ...     pdb_file = PDBFile()
    ...     pdb_file.set_structure(atom_array_stack[i])
    ...     file_name = os.path.join(gettempdir(), f"model_{i+1}.pdb")
    ...     pdb_file.write(file_name)
    ...     file_names.append(file_name)
    >>> print(file_names)
    ['...model_1.pdb', '...model_2.pdb', ..., '...model_38.pdb']

    Now the PDB files are used to create an :class:`AtomArrayStack`,
    where each model originates from a different file.

    Construct a new :class:`AtomArrayStack` with annotations taken
    from one of the created files used as template and coordinates
    from all of the PDB files.

    >>> template_file = PDBFile.read(file_names[0])
    >>> template = template_file.get_structure()
    >>> coord = []
    >>> for i, file_name in enumerate(file_names):
    ...     pdb_file = PDBFile.read(file_name)
    ...     coord.append(pdb_file.get_coord(model=1))
    >>> new_stack = from_template(template, np.array(coord))

    The newly created :class:`AtomArrayStack` should now be equal to
    the :class:`AtomArrayStack` the PDB files were created from.

    >>> print(np.allclose(new_stack.coord, atom_array_stack.coord))
    True
    """
    if model is not None:
        # Single model: only the records of that model are parsed
        record_indices = self._get_atom_record_indices_for_model(model)
        coord = np.zeros((len(record_indices), 3), dtype=np.float32)
        for atom_i, line_i in enumerate(record_indices):
            record = self.lines[line_i]
            coord[atom_i] = (
                float(record[_coord_x]),
                float(record[_coord_y]),
                float(record[_coord_z]),
            )
        return coord

    # All models: walk through all atom records in file order and
    # switch to the next model as soon as a record lies behind the
    # start of that model
    model_starts = self._model_start_i
    last_model_i = len(model_starts) - 1
    coord = np.zeros(
        (len(model_starts), self._get_model_length(), 3),
        dtype=np.float32
    )
    model_i = 0
    atom_i = 0
    for line_i in self._atom_line_i:
        if model_i < last_model_i and line_i > model_starts[model_i + 1]:
            model_i += 1
            # The atom index restarts in each model
            atom_i = 0
        record = self.lines[line_i]
        coord[model_i, atom_i] = (
            float(record[_coord_x]),
            float(record[_coord_y]),
            float(record[_coord_z]),
        )
        atom_i += 1
    return coord
269
+
270
+
271
def get_b_factor(self, model=None):
    """
    Read the B-factors from the PDB file, without creating a
    structure.

    Parameters
    ----------
    model : int, optional
        The number of the model (starting at 1) to take the
        B-factors from.
        In this case a 1D B-factor array is returned.
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, a 2D B-factor array
        containing all models is returned, even if the structure
        contains only one model.

    Returns
    -------
    b_factor : ndarray, shape=(m,n) or shape=(n,), dtype=float32
        The B-factors read from the ATOM and HETATM records of the
        file.

    Notes
    -----
    Note that :func:`get_b_factor()` may output more B-factors
    than the atom array (stack) from the corresponding
    :func:`get_structure()` call has atoms.
    The reason for this is, that :func:`get_structure()` filters
    *altloc* IDs, while `get_b_factor()` does not.
    """
    if model is not None:
        # Single model: only the records of that model are parsed
        record_indices = self._get_atom_record_indices_for_model(model)
        b_factor = np.zeros(len(record_indices), dtype=np.float32)
        for atom_i, line_i in enumerate(record_indices):
            b_factor[atom_i] = float(self.lines[line_i][_temp_f])
        return b_factor

    # All models: walk through all atom records in file order and
    # switch to the next model as soon as a record lies behind the
    # start of that model
    model_starts = self._model_start_i
    last_model_i = len(model_starts) - 1
    b_factor = np.zeros(
        (len(model_starts), self._get_model_length()),
        dtype=np.float32
    )
    model_i = 0
    atom_i = 0
    for line_i in self._atom_line_i:
        if model_i < last_model_i and line_i > model_starts[model_i + 1]:
            model_i += 1
            # The atom index restarts in each model
            atom_i = 0
        b_factor[model_i, atom_i] = float(self.lines[line_i][_temp_f])
        atom_i += 1
    return b_factor
327
+
328
+
329
+ def get_structure(self, model=None, altloc="first", extra_fields=[],
330
+ include_bonds=False):
331
+ """
332
+ Get an :class:`AtomArray` or :class:`AtomArrayStack` from the PDB file.
333
+
334
+ This function parses standard base-10 PDB files as well as
335
+ hybrid-36 PDB.
336
+
337
+ Parameters
338
+ ----------
339
+ model : int, optional
340
+ If this parameter is given, the function will return an
341
+ :class:`AtomArray` from the atoms corresponding to the given
342
+ model number (starting at 1).
343
+ Negative values are used to index models starting from the
344
+ last model instead of the first model.
345
+ If this parameter is omitted, an :class:`AtomArrayStack`
346
+ containing all models will be returned, even if the
347
+ structure contains only one model.
348
+ altloc : {'first', 'occupancy', 'all'}
349
+ This parameter defines how *altloc* IDs are handled:
350
+ - ``'first'`` - Use atoms that have the first
351
+ *altloc* ID appearing in a residue.
352
+ - ``'occupancy'`` - Use atoms that have the *altloc* ID
353
+ with the highest occupancy for a residue.
354
+ - ``'all'`` - Use all atoms.
355
+ Note that this leads to duplicate atoms.
356
+ When this option is chosen, the ``altloc_id``
357
+ annotation array is added to the returned structure.
358
+ extra_fields : list of str, optional
359
+ The strings in the list are optional annotation categories
360
+ that should be stored in the output array or stack.
361
+ These are valid values:
362
+ ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
363
+ ``'charge'``.
364
+ include_bonds : bool, optional
365
+ If set to true, a :class:`BondList` will be created for the
366
+ resulting :class:`AtomArray` containing the bond information
367
+ from the file.
368
+ Bonds, whose order could not be determined from the
369
+ *Chemical Component Dictionary*
370
+ (e.g. especially inter-residue bonds),
371
+ have :attr:`BondType.ANY`, since the PDB format itself does
372
+ not support bond orders.
373
+
374
+ Returns
375
+ -------
376
+ array : AtomArray or AtomArrayStack
377
+ The return type depends on the `model` parameter.
378
+ """
379
+ if model is None:
380
+ depth = len(self._model_start_i)
381
+ length = self._get_model_length()
382
+ array = AtomArrayStack(depth, length)
383
+ # Record indices for annotation determination
384
+ # Annotation is determined from model 1
385
+ annot_i = self._get_atom_record_indices_for_model(1)
386
+ # Record indices for coordinate determination
387
+ coord_i = self._atom_line_i
388
+
389
+ else:
390
+ annot_i = coord_i = self._get_atom_record_indices_for_model(model)
391
+ array = AtomArray(len(coord_i))
392
+
393
+ # Create mandatory and optional annotation arrays
394
+ chain_id = np.zeros(array.array_length(), array.chain_id.dtype)
395
+ res_id = np.zeros(array.array_length(), array.res_id.dtype)
396
+ ins_code = np.zeros(array.array_length(), array.ins_code.dtype)
397
+ res_name = np.zeros(array.array_length(), array.res_name.dtype)
398
+ hetero = np.zeros(array.array_length(), array.hetero.dtype)
399
+ atom_name = np.zeros(array.array_length(), array.atom_name.dtype)
400
+ element = np.zeros(array.array_length(), array.element.dtype)
401
+ atom_id_raw = np.zeros(array.array_length(), "U5")
402
+ charge_raw = np.zeros(array.array_length(), "U2")
403
+ occupancy = np.zeros(array.array_length(), float)
404
+ b_factor = np.zeros(array.array_length(), float)
405
+ altloc_id = np.zeros(array.array_length(), dtype="U1")
406
+
407
+ # Fill annotation array
408
+ # i is index in array, line_i is line index
409
+ for i, line_i in enumerate(annot_i):
410
+ line = self.lines[line_i]
411
+ chain_id[i] = line[_chain_id].strip()
412
+ res_id[i] = decode_hybrid36(line[_res_id])
413
+ ins_code[i] = line[_ins_code].strip()
414
+ res_name[i] = line[_res_name].strip()
415
+ hetero[i] = line[_record] == "HETATM"
416
+ atom_name[i] = line[_atom_name].strip()
417
+ element[i] = line[_element].strip()
418
+ altloc_id[i] = line[_alt_loc]
419
+ atom_id_raw[i] = line[_atom_id]
420
+ # turn "1-" into "-1", if necessary
421
+ if line[_charge][0] in "+-":
422
+ charge_raw[i] = line[_charge]
423
+ else:
424
+ charge_raw[i] = line[_charge][::-1]
425
+ occupancy[i] = float(line[_occupancy].strip())
426
+ b_factor[i] = float(line[_temp_f].strip())
427
+
428
+ if include_bonds or \
429
+ (extra_fields is not None and "atom_id" in extra_fields):
430
+ # The atom IDs are only required in these two cases
431
+ atom_id = np.array(
432
+ [decode_hybrid36(raw_id.item()) for raw_id in atom_id_raw],
433
+ dtype=int
434
+ )
435
+ else:
436
+ atom_id = None
437
+
438
+ # Add annotation arrays to atom array (stack)
439
+ array.chain_id = chain_id
440
+ array.res_id = res_id
441
+ array.ins_code = ins_code
442
+ array.res_name = res_name
443
+ array.hetero = hetero
444
+ array.atom_name = atom_name
445
+ array.element = element
446
+
447
+ for field in (extra_fields if extra_fields is not None else []):
448
+ if field == "atom_id":
449
+ # Copy is necessary to avoid double masking in
450
+ # later altloc ID filtering
451
+ array.set_annotation("atom_id", atom_id.copy())
452
+ elif field == "charge":
453
+ charge = np.array(charge_raw)
454
+ array.set_annotation("charge", np.where(
455
+ charge == " ", "0", charge
456
+ ).astype(int))
457
+ elif field == "occupancy":
458
+ array.set_annotation("occupancy", occupancy)
459
+ elif field == "b_factor":
460
+ array.set_annotation("b_factor", b_factor)
461
+ else:
462
+ raise ValueError(f"Unknown extra field: {field}")
463
+
464
+ # Replace empty strings for elements with guessed types
465
+ # This is used e.g. for PDB files created by Gromacs
466
+ empty_element_mask = array.element == ""
467
+ if empty_element_mask.any():
468
+ warnings.warn(
469
+ f"{np.count_nonzero(empty_element_mask)} elements "
470
+ "were guessed from atom name"
471
+ )
472
+ array.element[empty_element_mask] = infer_elements(
473
+ array.atom_name[empty_element_mask]
474
+ )
475
+
476
+ # Fill in coordinates
477
+ if isinstance(array, AtomArray):
478
+ for i, line_i in enumerate(coord_i):
479
+ line = self.lines[line_i]
480
+ array.coord[i, 0] = float(line[_coord_x])
481
+ array.coord[i, 1] = float(line[_coord_y])
482
+ array.coord[i, 2] = float(line[_coord_z])
483
+
484
+ elif isinstance(array, AtomArrayStack):
485
+ m = 0
486
+ i = 0
487
+ for line_i in self._atom_line_i:
488
+ if m < len(self._model_start_i)-1 and line_i > self._model_start_i[m+1]:
489
+ m += 1
490
+ i = 0
491
+ line = self.lines[line_i]
492
+ array.coord[m, i, 0] = float(line[_coord_x])
493
+ array.coord[m, i, 1] = float(line[_coord_y])
494
+ array.coord[m, i, 2] = float(line[_coord_z])
495
+ i += 1
496
+
497
+ # Fill in box vectors
498
+ # PDB does not support changing box dimensions. CRYST1 is a one-time
499
+ # record so we can extract it directly
500
+ for line in self.lines:
501
+ if line.startswith("CRYST1"):
502
+ try:
503
+ len_a = float(line[_a])
504
+ len_b = float(line[_b])
505
+ len_c = float(line[_c])
506
+ alpha = np.deg2rad(float(line[_alpha]))
507
+ beta = np.deg2rad(float(line[_beta]))
508
+ gamma = np.deg2rad(float(line[_gamma]))
509
+ box = vectors_from_unitcell(
510
+ len_a, len_b, len_c, alpha, beta, gamma
511
+ )
512
+ except ValueError:
513
+ # File contains invalid 'CRYST1' record
514
+ warnings.warn(
515
+ "File contains invalid 'CRYST1' record, box is ignored"
516
+ )
517
+ break
518
+
519
+ if isinstance(array, AtomArray):
520
+ array.box = box
521
+ else:
522
+ array.box = np.repeat(
523
+ box[np.newaxis, ...], array.stack_depth(), axis=0
524
+ )
525
+ break
526
+
527
+ # Filter altloc IDs
528
+ if altloc == "occupancy":
529
+ filter = filter_highest_occupancy_altloc(
530
+ array, altloc_id, occupancy
531
+ )
532
+ array = array[..., filter]
533
+ atom_id = atom_id[filter] if atom_id is not None else None
534
+ elif altloc == "first":
535
+ filter = filter_first_altloc(array, altloc_id)
536
+ array = array[..., filter]
537
+ atom_id = atom_id[filter] if atom_id is not None else None
538
+ elif altloc == "all":
539
+ array.set_annotation("altloc_id", altloc_id)
540
+ else:
541
+ raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
542
+
543
+ # Read bonds
544
+ if include_bonds:
545
+ bond_list = self._get_bonds(atom_id)
546
+ bond_list = bond_list.merge(connect_via_residue_names(array))
547
+ array.bonds = bond_list
548
+
549
+ return array
550
+
551
+
552
def set_structure(self, array, hybrid36=False):
    """
    Set the :class:`AtomArray` or :class:`AtomArrayStack` for the
    file.

    This makes also use of the optional annotation arrays
    ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and ``'charge'``.
    If the atom array (stack) contains the annotation ``'atom_id'``,
    these values will be used for atom numbering instead of
    continuous numbering.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The array or stack to be saved into this file. If a stack
        is given, each array in the stack is saved as separate
        model.
    hybrid36: bool, optional
        Defines whether the file should be written in hybrid-36
        format.

    Notes
    -----
    If `array` has an associated :class:`BondList`, ``CONECT``
    records are also written for all non-water hetero residues
    and all inter-residue connections.

    All field widths are given as explicit format specifications
    instead of literal runs of spaces, so that each ``ATOM``/``HETATM``
    record is exactly 80 columns wide.
    """
    _check_pdb_compatibility(array, hybrid36)

    natoms = array.array_length()
    annot_categories = array.get_annotation_categories()
    record = np.char.array(np.where(array.hetero, "HETATM", "ATOM"))
    # Check for optional annotation categories
    if "atom_id" in annot_categories:
        atom_id = array.atom_id
    else:
        atom_id = np.arange(1, natoms + 1)
    if "b_factor" in annot_categories:
        b_factor = np.char.array([f"{b:>6.2f}" for b in array.b_factor])
    else:
        # Default value, padded to the full 6-column field
        b_factor = np.char.array(
            np.full(natoms, f"{0.0:>6.2f}", dtype="U6")
        )
    if "occupancy" in annot_categories:
        occupancy = np.char.array([f"{o:>6.2f}" for o in array.occupancy])
    else:
        # Default value, padded to the full 6-column field
        occupancy = np.char.array(
            np.full(natoms, f"{1.0:>6.2f}", dtype="U6")
        )
    if "charge" in annot_categories:
        # PDB notation puts the sign behind the digit, e.g. '1-'
        charge = np.char.array(
            [str(np.abs(c)) + "+" if c > 0 else
             (str(np.abs(c)) + "-" if c < 0 else "")
             for c in array.get_annotation("charge")]
        )
    else:
        # Blank 2-column charge field
        charge = np.char.array(np.full(natoms, " " * 2, dtype="U2"))

    if hybrid36:
        pdb_atom_id = np.char.array(
            [encode_hybrid36(i, 5) for i in atom_id]
        )
        pdb_res_id = np.char.array(
            [encode_hybrid36(i, 4) for i in array.res_id]
        )
    else:
        # Atom IDs are supported up to 99999,
        # but negative IDs are also possible
        pdb_atom_id = np.char.array(np.where(
            atom_id > 0,
            ((atom_id - 1) % _PDB_MAX_ATOMS) + 1,
            atom_id
        ).astype(str))
        # Residue IDs are supported up to 9999,
        # but negative IDs are also possible
        pdb_res_id = np.char.array(np.where(
            array.res_id > 0,
            ((array.res_id - 1) % _PDB_MAX_RESIDUES) + 1,
            array.res_id
        ).astype(str))

    # Atom names of single-letter elements start in the second column
    # of the 4-column name field
    names = np.char.array(
        [f" {atm}" if len(elem) == 1 and len(atm) < 4 else atm
         for atm, elem in zip(array.atom_name, array.element)]
    )
    res_names = np.char.array(array.res_name)
    chain_ids = np.char.array(array.chain_id)
    ins_codes = np.char.array(array.ins_code)
    spaces = np.char.array(np.full(natoms, " ", dtype="U1"))
    elements = np.char.array(array.element)

    # Columns 1-27: everything in front of the coordinates
    first_half = (
        record.ljust(6) +
        pdb_atom_id.rjust(5) +
        spaces +
        names.ljust(4) +
        spaces + res_names.rjust(3) + spaces + chain_ids +
        pdb_res_id.rjust(4) + ins_codes.rjust(1)
    )

    # Columns 55-80: everything behind the coordinates
    second_half = (
        occupancy + b_factor + 10 * spaces +
        elements.rjust(2) + charge.rjust(2)
    )

    coords = array.coord
    if coords.ndim == 2:
        coords = coords[np.newaxis, ...]

    self.lines = []
    # Prepend a single CRYST1 record if we have box information
    if array.box is not None:
        box = array.box
        if len(box.shape) == 3:
            box = box[0]
        a, b, c, alpha, beta, gamma = unitcell_from_vectors(box)
        # Space group 'P 1' is left-justified in an 11-column field,
        # followed by the 4-column Z value; pad the record to 80 columns
        self.lines.append(
            (
                f"CRYST1{a:>9.3f}{b:>9.3f}{c:>9.3f}"
                f"{np.rad2deg(alpha):>7.2f}{np.rad2deg(beta):>7.2f}"
                f"{np.rad2deg(gamma):>7.2f} {'P 1':<11}{1:>4}"
            ).ljust(80)
        )
    is_stack = coords.shape[0] > 1
    for model_num, coord_i in enumerate(coords, start=1):
        # for an AtomArray, this is run once
        # only add model lines if is_stack
        if is_stack:
            # Model serial number is right-justified in columns 11-14
            self.lines.append(f"{'MODEL':<10}{model_num:>4}")
        # Bundle non-coordinate data to simplify iteration
        # Elements of a chararray lose trailing whitespace on access,
        # hence both halves are padded to their full width again;
        # columns 28-30 between prefix and coordinates are blank
        self.lines.extend(
            [f"{start:27}{'':3}{x:>8.3f}{y:>8.3f}{z:>8.3f}{end:26}"
             for start, (x, y, z), end in
             zip(first_half, coord_i, second_half)]
        )
        if is_stack:
            self.lines.append("ENDMDL")

    # Add CONECT records if bonds are present
    if array.bonds is not None:
        # Only non-water hetero records and connections between
        # residues are added to the records
        hetero_indices = np.where(array.hetero & ~filter_solvent(array))[0]
        bond_array = array.bonds.as_array()
        bond_array = bond_array[
            np.isin(bond_array[:,0], hetero_indices) |
            np.isin(bond_array[:,1], hetero_indices) |
            (array.res_id  [bond_array[:,0]] != array.res_id  [bond_array[:,1]]) |
            (array.chain_id[bond_array[:,0]] != array.chain_id[bond_array[:,1]])
        ]
        self._set_bonds(
            BondList(array.array_length(), bond_array), pdb_atom_id
        )

    self._index_models_and_atoms()
701
+
702
+
703
def list_assemblies(self):
    """
    Report the IDs of the biological assemblies described in this
    file.

    The IDs are taken from the first line of the ``REMARK 300``
    records.
    Consequently, this remark must be present in the file.

    Returns
    -------
    assemblies : list of str
        A list that contains the available assembly IDs.

    Raises
    ------
    InvalidFileError
        If the file has no ``REMARK 300`` records.

    Examples
    --------
    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1f2n.pdb"))
    >>> print(file.list_assemblies())
    ['1']
    """
    remark_300 = self.get_remark(300)
    if remark_300 is None:
        raise InvalidFileError(
            "File does not contain assembly information (REMARK 300)"
        )
    # The comma-separated ID list follows the first 12 characters
    # of the first remark line
    id_field = remark_300[0][12:]
    return list(map(str.strip, id_field.split(",")))
734
+
735
+
736
def get_assembly(self, assembly_id=None, model=None, altloc="first",
                 extra_fields=[], include_bonds=False):
    """
    Build the given biological assembly.

    This function receives the data from ``REMARK 350`` records in
    the file.
    Consequently, this remark must be present in the file.

    Parameters
    ----------
    assembly_id : str, optional
        The assembly to build.
        Available assembly IDs can be obtained via
        :func:`list_assemblies()`.
        If omitted, the first assembly listed in the file is built.
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    assembly : AtomArray or AtomArrayStack
        The assembly.
        The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the file has no ``REMARK 350`` records, no assembly entry
        at all or no transformation lines for a set of chains.
    KeyError
        If the given `assembly_id` is not listed in the file.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1f2n.pdb"))
    >>> assembly = file.get_assembly(model=1)
    """
    # Get base structure
    structure = self.get_structure(
        model,
        altloc,
        extra_fields,
        include_bonds,
    )

    # Get lines containing transformations for chosen assembly
    remark_lines = self.get_remark(350)
    if remark_lines is None:
        raise InvalidFileError(
            "File does not contain assembly information (REMARK 350)"
        )
    # Get lines corresponding to selected assembly ID
    assembly_start_i = None
    # In case of the final assembly of the file,
    # the 'stop' is the end of REMARK 350 lines
    assembly_stop_i = len(remark_lines)
    for i, line in enumerate(remark_lines):
        if line.startswith("BIOMOLECULE"):
            current_assembly_id = line[12:].strip()
            if assembly_start_i is not None:
                # Start was already found -> this is the next entry
                # -> this is the stop
                assembly_stop_i = i
                break
            if current_assembly_id == assembly_id or assembly_id is None:
                assembly_start_i = i
    if assembly_start_i is None:
        if assembly_id is None:
            raise InvalidFileError(
                "File does not contain transformation "
                "expressions for assemblies"
            )
        else:
            raise KeyError(
                f"The assembly ID '{assembly_id}' is not found"
            )
    assembly_lines = remark_lines[assembly_start_i : assembly_stop_i]

    # Get transformations for a set of chains
    chain_set_start_indices = [
        i for i, line in enumerate(assembly_lines)
        if line.startswith("APPLY THE FOLLOWING TO CHAINS")
    ]
    # Add exclusive stop at end of records
    chain_set_start_indices.append(len(assembly_lines))
    assembly = None
    for start, stop in zip(
        chain_set_start_indices[:-1], chain_set_start_indices[1:]
    ):
        # Read affected chain IDs from the following line(s)
        affected_chain_ids = []
        transform_start = None
        for j, line in enumerate(assembly_lines[start : stop]):
            # Continuation lines are indented by a run of spaces,
            # so match them independent of the amount of whitespace
            if line.startswith("APPLY THE FOLLOWING TO CHAINS:") or \
               line.lstrip().startswith("AND CHAINS:"):
                # The comma-separated chain IDs follow the colon
                chain_field = line.split(":", 1)[1]
                affected_chain_ids += [
                    chain_id.strip()
                    for chain_id in chain_field.split(",")
                ]
            else:
                # Chain specification has finished
                # BIOMT lines start directly after chain specification
                transform_start = start + j
                break
        # Parse transformations from BIOMT lines
        if transform_start is None:
            raise InvalidFileError(
                "No 'BIOMT' records found for chosen assembly"
            )
        rotations, translations = _parse_transformations(
            assembly_lines[transform_start : stop]
        )
        # Filter affected chains
        sub_structure = structure[
            ..., np.isin(structure.chain_id, affected_chain_ids)
        ]
        sub_assembly = _apply_transformations(
            sub_structure, rotations, translations
        )
        # Merge the chains with IDs for this transformation
        # with chains from other transformations
        if assembly is None:
            assembly = sub_assembly
        else:
            assembly += sub_assembly

    return assembly
890
+
891
+
892
def get_symmetry_mates(self, model=None, altloc="first",
                       extra_fields=[], include_bonds=False):
    """
    Build a structure model containing all symmetric copies
    of the structure within a single unit cell, given by the space
    group.

    This function receives the data from ``REMARK 290`` records in
    the file.
    Consequently, this remark must be present in the file, which is
    usually only true for crystal structures.

    Parameters
    ----------
    model : int, optional
        If this parameter is given, the function will return an
        :class:`AtomArray` from the atoms corresponding to the given
        model number (starting at 1).
        Negative values are used to index models starting from the
        last model instead of the first model.
        If this parameter is omitted, an :class:`AtomArrayStack`
        containing all models will be returned, even if the
        structure contains only one model.
    altloc : {'first', 'occupancy', 'all'}
        This parameter defines how *altloc* IDs are handled:
            - ``'first'`` - Use atoms that have the first
              *altloc* ID appearing in a residue.
            - ``'occupancy'`` - Use atoms that have the *altloc* ID
              with the highest occupancy for a residue.
            - ``'all'`` - Use all atoms.
              Note that this leads to duplicate atoms.
              When this option is chosen, the ``altloc_id``
              annotation array is added to the returned structure.
    extra_fields : list of str, optional
        The strings in the list are optional annotation categories
        that should be stored in the output array or stack.
        These are valid values:
        ``'atom_id'``, ``'b_factor'``, ``'occupancy'`` and
        ``'charge'``.
    include_bonds : bool, optional
        If set to true, a :class:`BondList` will be created for the
        resulting :class:`AtomArray` containing the bond information
        from the file.
        Bonds, whose order could not be determined from the
        *Chemical Component Dictionary*
        (e.g. especially inter-residue bonds),
        have :attr:`BondType.ANY`, since the PDB format itself does
        not support bond orders.

    Returns
    -------
    symmetry_mates : AtomArray or AtomArrayStack
        All atoms within a single unit cell.
        The return type depends on the `model` parameter.

    Raises
    ------
    InvalidFileError
        If the file has no ``REMARK 290`` records.

    Notes
    -----
    To expand the structure beyond a single unit cell, use
    :func:`repeat_box()` with the return value as its
    input.

    Examples
    --------

    >>> import os.path
    >>> file = PDBFile.read(os.path.join(path_to_structures, "1aki.pdb"))
    >>> atoms_in_unit_cell = file.get_symmetry_mates(model=1)
    """
    # Get base structure
    structure = self.get_structure(
        model,
        altloc,
        extra_fields,
        include_bonds,
    )
    # Get lines containing transformations for crystallographic symmetry
    remark_lines = self.get_remark(290)
    if remark_lines is None:
        raise InvalidFileError(
            "File does not contain crystallographic symmetry "
            "information (REMARK 290)"
        )
    # The 'SMTRYn' lines are indented, so match them independent of
    # the exact amount of leading whitespace
    transform_lines = [
        line for line in remark_lines
        if line.lstrip().startswith("SMTRY")
    ]
    rotations, translations = _parse_transformations(
        transform_lines
    )
    return _apply_transformations(
        structure, rotations, translations
    )
983
+
984
+
985
+
986
+
987
+ def _index_models_and_atoms(self):
988
+ # Line indices where a new model starts
989
+ self._model_start_i = np.array(
990
+ [
991
+ i for i in range(len(self.lines))
992
+ if self.lines[i].startswith(("MODEL"))
993
+ ],
994
+ dtype=int
995
+ )
996
+ if len(self._model_start_i) == 0:
997
+ # It could be an empty file or a file with a single model,
998
+ # where the 'MODEL' line is missing
999
+ for line in self.lines:
1000
+ if line.startswith(("ATOM", "HETATM")):
1001
+ # Single model
1002
+ self._model_start_i = np.array([0])
1003
+ break
1004
+
1005
+ # Line indices with ATOM or HETATM records
1006
+ self._atom_line_i = np.array(
1007
+ [
1008
+ i for i in range(len(self.lines))
1009
+ if self.lines[i].startswith(("ATOM", "HETATM"))
1010
+ ],
1011
+ dtype=int
1012
+ )
1013
+
1014
+
1015
+ def _get_atom_record_indices_for_model(self, model):
1016
+ last_model = len(self._model_start_i)
1017
+ if model == 0:
1018
+ raise ValueError("The model index must not be 0")
1019
+ # Negative models mean index starting from last model
1020
+ model = last_model + model + 1 if model < 0 else model
1021
+
1022
+ if model < last_model:
1023
+ line_filter = (
1024
+ (self._atom_line_i >= self._model_start_i[model-1]) &
1025
+ (self._atom_line_i < self._model_start_i[model ])
1026
+ )
1027
+ elif model == last_model:
1028
+ line_filter = (self._atom_line_i >= self._model_start_i[model-1])
1029
+ else:
1030
+ raise ValueError(
1031
+ f"The file has {last_model} models, "
1032
+ f"the given model {model} does not exist"
1033
+ )
1034
+ return self._atom_line_i[line_filter]
1035
+
1036
+
1037
+ def _get_model_length(self):
1038
+ """
1039
+ Determine length of models and check that all models
1040
+ have equal length.
1041
+ """
1042
+ n_models = len(self._model_start_i)
1043
+ length = None
1044
+ for model_i in range(len(self._model_start_i)):
1045
+ model_start = self._model_start_i[model_i]
1046
+ model_stop = self._model_start_i[model_i+1] \
1047
+ if model_i+1 < n_models else len(self.lines)
1048
+ model_length = np.count_nonzero(
1049
+ (self._atom_line_i >= model_start) &
1050
+ (self._atom_line_i < model_stop)
1051
+ )
1052
+ if length is None:
1053
+ length = model_length
1054
+ if model_length != length:
1055
+ raise InvalidFileError(
1056
+ f"Model {model_i+1} has {model_length} atoms, "
1057
+ f"but model 1 has {length} atoms, must be equal"
1058
+ )
1059
+ return length
1060
+
1061
+
1062
def _get_bonds(self, atom_ids):
    """
    Create a :class:`BondList` from the ``CONECT`` records of the
    file.

    Parameters
    ----------
    atom_ids : ndarray, dtype=int
        The atom ID of each atom in the structure the bonds are
        created for.

    Returns
    -------
    bond_list : BondList
        The bonds between the atoms given by `atom_ids`.
        Bonds referring to an atom ID that is not part of `atom_ids`
        (e.g. atoms removed by *altloc* filtering) are omitted.

    Raises
    ------
    InvalidFileError
        If an atom ID is larger than the last atom ID.
    """
    if len(atom_ids) == 0:
        # No atoms -> no bonds
        return BondList(0)

    conect_lines = [line for line in self.lines
                    if line.startswith("CONECT")]

    # Mapping from atom ids to indices in an AtomArray
    # -1 marks atom IDs that do not appear in the structure
    atom_id_to_index = np.full(atom_ids[-1]+1, -1, dtype=int)
    try:
        for index, atom_id in enumerate(atom_ids):
            atom_id_to_index[atom_id] = index
    except IndexError as e:
        raise InvalidFileError(
            "Atom IDs are not strictly increasing"
        ) from e

    def index_of(atom_id):
        # IDs beyond the table belong to no atom of the structure
        try:
            return atom_id_to_index[atom_id]
        except IndexError:
            return -1

    bonds = []
    for line in conect_lines:
        center_index = index_of(decode_hybrid36(line[6 : 11]))
        for column in range(11, 31, 5):
            id_string = line[column : column+5]
            try:
                partner_id = decode_hybrid36(id_string)
            except ValueError:
                # String is empty -> no further IDs
                break
            partner_index = index_of(partner_id)
            if center_index == -1 or partner_index == -1:
                # At least one bond partner is not in the structure
                continue
            bonds.append((center_index, partner_index))

    # The length of the 'atom_ids' array
    # is equal to the length of the AtomArray
    return BondList(len(atom_ids), np.array(bonds, dtype=np.uint32))
1091
+
1092
+
1093
+ def _set_bonds(self, bond_list, atom_ids):
1094
+ # Bond type is unused since PDB does not support bond orders
1095
+ bonds, _ = bond_list.get_all_bonds()
1096
+
1097
+ for center_i, bonded_indices in enumerate(bonds):
1098
+ n_added = 0
1099
+ for bonded_i in bonded_indices:
1100
+ if bonded_i == -1:
1101
+ # Reached padding values
1102
+ break
1103
+ if n_added == 0:
1104
+ # Add new record
1105
+ line = f"CONECT{atom_ids[center_i]:>5}"
1106
+ line += f"{atom_ids[bonded_i]:>5}"
1107
+ n_added += 1
1108
+ if n_added == 4:
1109
+ # Only a maximum of 4 bond partners can be put
1110
+ # into a single line
1111
+ # If there are more, use an extra record
1112
+ n_added = 0
1113
+ self.lines.append(line)
1114
+ if n_added > 0:
1115
+ self.lines.append(line)
1116
+
1117
+
1118
+ def _parse_transformations(lines):
1119
+ """
1120
+ Parse the rotation and translation transformations from
1121
+ *REMARK* 290 or 350.
1122
+ Return as array of matrices and vectors respectively
1123
+ """
1124
+ # Each transformation requires 3 lines for the (x,y,z) components
1125
+ if len(lines) % 3 != 0:
1126
+ raise InvalidFileError("Invalid number of transformation vectors")
1127
+ n_transformations = len(lines) // 3
1128
+
1129
+ rotations = np.zeros((n_transformations, 3, 3), dtype=float)
1130
+ translations = np.zeros((n_transformations, 3), dtype=float)
1131
+
1132
+ transformation_i = 0
1133
+ component_i = 0
1134
+ for line in lines:
1135
+ # The first two elements (component and
1136
+ # transformation index) are not used
1137
+ transformations = [float(e) for e in line.split()[2:]]
1138
+ if len(transformations) != 4:
1139
+ raise InvalidFileError(
1140
+ "Invalid number of transformation vector elements"
1141
+ )
1142
+ rotations[transformation_i, component_i, :] = transformations[:3]
1143
+ translations[transformation_i, component_i] = transformations[3]
1144
+
1145
+ component_i += 1
1146
+ if component_i == 3:
1147
+ # All (x,y,z) components were parsed
1148
+ # -> head to the next transformation
1149
+ transformation_i += 1
1150
+ component_i = 0
1151
+
1152
+ return rotations, translations
1153
+
1154
+
1155
def _apply_transformations(structure, rotations, translations):
    """
    Create one transformed copy of the input structure for each
    rotation/translation pair and combine the copies into a single
    structure via :func:`repeat()`.
    """
    n_copies = len(rotations)
    # Additional first dimension for 'structure.repeat()'
    copy_coord = np.zeros((n_copies,) + structure.coord.shape)

    for copy_i, (rot_matrix, shift) in enumerate(zip(rotations, translations)):
        # Rotate first, then translate the rotated coordinates
        transformed = matrix_rotate(structure.coord, rot_matrix)
        transformed += shift
        copy_coord[copy_i] = transformed

    return repeat(structure, copy_coord)
1173
+
1174
+
1175
def _check_pdb_compatibility(array, hybrid36):
    """
    Check whether the given atom array (stack) can be written into
    the fixed-width columns of a PDB file.

    Atom or residue IDs above the supported maximum only give a
    warning.
    ``NaN`` coordinates, too long chain IDs, residue names or atom
    names and numeric values with too many pre-decimal digits raise
    a :class:`BadStructureError`.
    """
    categories = array.get_annotation_categories()

    if hybrid36:
        atom_limit = max_hybrid36_number(5)
        residue_limit = max_hybrid36_number(4)
    else:
        atom_limit = _PDB_MAX_ATOMS
        residue_limit = _PDB_MAX_RESIDUES

    # Without explicit atom IDs the atoms are numbered continuously
    if "atom_id" in categories:
        highest_atom_id = np.max(array.atom_id)
    else:
        highest_atom_id = array.array_length()

    if highest_atom_id > atom_limit:
        warnings.warn(f"Atom IDs exceed {atom_limit:,}, will be wrapped")
    if (array.res_id > residue_limit).any():
        warnings.warn(f"Residue IDs exceed {residue_limit:,}, will be wrapped")
    if np.isnan(array.coord).any():
        raise BadStructureError("Coordinates contain 'NaN' values")

    # Maximum number of characters for the string annotations
    string_limits = (
        (array.chain_id, 1, "Some chain IDs exceed 1 character"),
        (array.res_name, 3, "Some residue names exceed 3 characters"),
        (array.atom_name, 4, "Some atom names exceed 4 characters"),
    )
    for strings, max_length, message in string_limits:
        if any(len(string) > max_length for string in strings):
            raise BadStructureError(message)

    def require_digits(values, n_available, availability):
        # Raise if the values need more pre-decimal columns
        # than the format provides
        n_required = _number_of_integer_digits(values)
        if n_required > n_available:
            raise BadStructureError(
                availability + f"but array would require {n_required}"
            )

    for axis, coord_name in enumerate("xyz"):
        require_digits(
            array.coord[..., axis], 4,
            f"4 pre-decimal columns for {coord_name}-coordinates are "
            "available, "
        )
    if "b_factor" in categories:
        require_digits(
            array.b_factor, 3,
            "3 pre-decimal columns for B-factor are available, "
        )
    if "occupancy" in categories:
        require_digits(
            array.occupancy, 3,
            "3 pre-decimal columns for occupancy are available, "
        )
    if "charge" in categories:
        # The sign can be omitted as it is put into the adjacent column
        require_digits(
            np.abs(array.charge), 1,
            "1 column for charge is available, "
        )
1229
+
1230
+
1231
+ def _number_of_integer_digits(values):
1232
+ """
1233
+ Get the maximum number of characters needed to represent the
1234
+ pre-decimal positions of the given numeric values.
1235
+ """
1236
+ values = values.astype(int, copy=False)
1237
+ n_digits = 0
1238
+ n_digits = max(n_digits, len(str(np.min(values))))
1239
+ n_digits = max(n_digits, len(str(np.max(values))))
1240
+ return n_digits