biotite 1.0.1__cp310-cp310-win_amd64.whl → 1.2.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (177) hide show
  1. biotite/application/application.py +3 -3
  2. biotite/application/autodock/app.py +1 -1
  3. biotite/application/blast/webapp.py +1 -1
  4. biotite/application/clustalo/app.py +1 -1
  5. biotite/application/dssp/app.py +13 -3
  6. biotite/application/localapp.py +36 -2
  7. biotite/application/msaapp.py +10 -10
  8. biotite/application/muscle/app3.py +5 -18
  9. biotite/application/muscle/app5.py +5 -5
  10. biotite/application/sra/app.py +0 -5
  11. biotite/application/util.py +22 -2
  12. biotite/application/viennarna/rnaalifold.py +8 -8
  13. biotite/application/viennarna/rnaplot.py +9 -3
  14. biotite/application/viennarna/util.py +1 -1
  15. biotite/application/webapp.py +1 -1
  16. biotite/database/afdb/__init__.py +12 -0
  17. biotite/database/afdb/download.py +191 -0
  18. biotite/database/entrez/dbnames.py +10 -0
  19. biotite/database/entrez/download.py +9 -10
  20. biotite/database/entrez/key.py +1 -1
  21. biotite/database/entrez/query.py +5 -4
  22. biotite/database/pubchem/download.py +6 -6
  23. biotite/database/pubchem/error.py +10 -0
  24. biotite/database/pubchem/query.py +12 -23
  25. biotite/database/rcsb/download.py +3 -2
  26. biotite/database/rcsb/query.py +8 -9
  27. biotite/database/uniprot/check.py +22 -17
  28. biotite/database/uniprot/download.py +3 -6
  29. biotite/database/uniprot/query.py +4 -5
  30. biotite/file.py +14 -2
  31. biotite/interface/__init__.py +19 -0
  32. biotite/interface/openmm/__init__.py +16 -0
  33. biotite/interface/openmm/state.py +93 -0
  34. biotite/interface/openmm/system.py +227 -0
  35. biotite/interface/pymol/__init__.py +198 -0
  36. biotite/interface/pymol/cgo.py +346 -0
  37. biotite/interface/pymol/convert.py +185 -0
  38. biotite/interface/pymol/display.py +267 -0
  39. biotite/interface/pymol/object.py +1226 -0
  40. biotite/interface/pymol/shapes.py +178 -0
  41. biotite/interface/pymol/startup.py +169 -0
  42. biotite/interface/rdkit/__init__.py +15 -0
  43. biotite/interface/rdkit/mol.py +490 -0
  44. biotite/interface/version.py +71 -0
  45. biotite/interface/warning.py +19 -0
  46. biotite/sequence/align/__init__.py +0 -4
  47. biotite/sequence/align/alignment.py +49 -14
  48. biotite/sequence/align/banded.cp310-win_amd64.pyd +0 -0
  49. biotite/sequence/align/banded.pyx +26 -26
  50. biotite/sequence/align/cigar.py +2 -2
  51. biotite/sequence/align/kmeralphabet.cp310-win_amd64.pyd +0 -0
  52. biotite/sequence/align/kmeralphabet.pyx +19 -2
  53. biotite/sequence/align/kmersimilarity.cp310-win_amd64.pyd +0 -0
  54. biotite/sequence/align/kmertable.cp310-win_amd64.pyd +0 -0
  55. biotite/sequence/align/kmertable.pyx +58 -48
  56. biotite/sequence/align/localgapped.cp310-win_amd64.pyd +0 -0
  57. biotite/sequence/align/localgapped.pyx +47 -47
  58. biotite/sequence/align/localungapped.cp310-win_amd64.pyd +0 -0
  59. biotite/sequence/align/localungapped.pyx +10 -10
  60. biotite/sequence/align/matrix.py +284 -57
  61. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  62. biotite/sequence/align/matrix_data/PB.license +21 -0
  63. biotite/sequence/align/matrix_data/PB.mat +18 -0
  64. biotite/sequence/align/multiple.cp310-win_amd64.pyd +0 -0
  65. biotite/sequence/align/pairwise.cp310-win_amd64.pyd +0 -0
  66. biotite/sequence/align/pairwise.pyx +35 -35
  67. biotite/sequence/align/permutation.cp310-win_amd64.pyd +0 -0
  68. biotite/sequence/align/selector.cp310-win_amd64.pyd +0 -0
  69. biotite/sequence/align/selector.pyx +2 -2
  70. biotite/sequence/align/statistics.py +1 -1
  71. biotite/sequence/align/tracetable.cp310-win_amd64.pyd +0 -0
  72. biotite/sequence/alphabet.py +5 -2
  73. biotite/sequence/annotation.py +19 -13
  74. biotite/sequence/codec.cp310-win_amd64.pyd +0 -0
  75. biotite/sequence/codon.py +1 -2
  76. biotite/sequence/graphics/alignment.py +25 -39
  77. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  78. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  79. biotite/sequence/graphics/colorschemes.py +44 -11
  80. biotite/sequence/graphics/dendrogram.py +4 -2
  81. biotite/sequence/graphics/features.py +2 -2
  82. biotite/sequence/graphics/logo.py +10 -12
  83. biotite/sequence/io/fasta/convert.py +1 -2
  84. biotite/sequence/io/fasta/file.py +1 -1
  85. biotite/sequence/io/fastq/file.py +3 -3
  86. biotite/sequence/io/genbank/file.py +3 -3
  87. biotite/sequence/io/genbank/sequence.py +2 -0
  88. biotite/sequence/io/gff/convert.py +1 -1
  89. biotite/sequence/io/gff/file.py +1 -2
  90. biotite/sequence/phylo/nj.cp310-win_amd64.pyd +0 -0
  91. biotite/sequence/phylo/tree.cp310-win_amd64.pyd +0 -0
  92. biotite/sequence/phylo/upgma.cp310-win_amd64.pyd +0 -0
  93. biotite/sequence/profile.py +105 -29
  94. biotite/sequence/search.py +0 -1
  95. biotite/sequence/seqtypes.py +136 -8
  96. biotite/sequence/sequence.py +1 -2
  97. biotite/setup_ccd.py +197 -0
  98. biotite/structure/__init__.py +6 -3
  99. biotite/structure/alphabet/__init__.py +25 -0
  100. biotite/structure/alphabet/encoder.py +332 -0
  101. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  102. biotite/structure/alphabet/i3d.py +109 -0
  103. biotite/structure/alphabet/layers.py +86 -0
  104. biotite/structure/alphabet/pb.license +21 -0
  105. biotite/structure/alphabet/pb.py +170 -0
  106. biotite/structure/alphabet/unkerasify.py +128 -0
  107. biotite/structure/atoms.py +163 -66
  108. biotite/structure/basepairs.py +26 -26
  109. biotite/structure/bonds.cp310-win_amd64.pyd +0 -0
  110. biotite/structure/bonds.pyx +79 -25
  111. biotite/structure/box.py +19 -21
  112. biotite/structure/celllist.cp310-win_amd64.pyd +0 -0
  113. biotite/structure/celllist.pyx +83 -67
  114. biotite/structure/chains.py +5 -37
  115. biotite/structure/charges.cp310-win_amd64.pyd +0 -0
  116. biotite/structure/compare.py +420 -13
  117. biotite/structure/density.py +1 -1
  118. biotite/structure/dotbracket.py +27 -28
  119. biotite/structure/filter.py +8 -8
  120. biotite/structure/geometry.py +74 -127
  121. biotite/structure/hbond.py +17 -19
  122. biotite/structure/info/__init__.py +1 -0
  123. biotite/structure/info/atoms.py +24 -15
  124. biotite/structure/info/bonds.py +12 -6
  125. biotite/structure/info/ccd.py +125 -34
  126. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  127. biotite/structure/info/groups.py +62 -19
  128. biotite/structure/info/masses.py +9 -6
  129. biotite/structure/info/misc.py +15 -22
  130. biotite/structure/info/radii.py +92 -22
  131. biotite/structure/info/standardize.py +4 -4
  132. biotite/structure/integrity.py +4 -6
  133. biotite/structure/io/general.py +2 -2
  134. biotite/structure/io/gro/file.py +8 -9
  135. biotite/structure/io/mol/convert.py +1 -1
  136. biotite/structure/io/mol/ctab.py +33 -28
  137. biotite/structure/io/mol/mol.py +1 -1
  138. biotite/structure/io/mol/sdf.py +80 -53
  139. biotite/structure/io/pdb/convert.py +4 -3
  140. biotite/structure/io/pdb/file.py +85 -25
  141. biotite/structure/io/pdb/hybrid36.cp310-win_amd64.pyd +0 -0
  142. biotite/structure/io/pdbqt/file.py +36 -36
  143. biotite/structure/io/pdbx/__init__.py +1 -0
  144. biotite/structure/io/pdbx/bcif.py +54 -15
  145. biotite/structure/io/pdbx/cif.py +92 -66
  146. biotite/structure/io/pdbx/component.py +15 -4
  147. biotite/structure/io/pdbx/compress.py +321 -0
  148. biotite/structure/io/pdbx/convert.py +410 -75
  149. biotite/structure/io/pdbx/encoding.cp310-win_amd64.pyd +0 -0
  150. biotite/structure/io/pdbx/encoding.pyx +98 -17
  151. biotite/structure/io/trajfile.py +9 -6
  152. biotite/structure/io/util.py +38 -0
  153. biotite/structure/mechanics.py +0 -1
  154. biotite/structure/molecules.py +141 -156
  155. biotite/structure/pseudoknots.py +7 -13
  156. biotite/structure/repair.py +2 -4
  157. biotite/structure/residues.py +13 -24
  158. biotite/structure/rings.py +335 -0
  159. biotite/structure/sasa.cp310-win_amd64.pyd +0 -0
  160. biotite/structure/sasa.pyx +2 -1
  161. biotite/structure/segments.py +69 -11
  162. biotite/structure/sequence.py +0 -1
  163. biotite/structure/sse.py +0 -2
  164. biotite/structure/superimpose.py +74 -62
  165. biotite/structure/tm.py +581 -0
  166. biotite/structure/transform.py +12 -25
  167. biotite/structure/util.py +76 -4
  168. biotite/version.py +9 -4
  169. biotite/visualize.py +111 -1
  170. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
  171. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
  172. biotite/structure/info/ccd/README.rst +0 -8
  173. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  174. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  175. biotite/structure/info/ccd/nucleotides.txt +0 -798
  176. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
  177. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -3,7 +3,7 @@
3
3
  # information.
4
4
 
5
5
  __name__ = "biotite.structure.io.pdbx"
6
- __author__ = "Fabrice Allain, Patrick Kunzmann"
6
+ __author__ = "Fabrice Allain, Patrick Kunzmann, Cheyenne Ziegler"
7
7
  __all__ = [
8
8
  "get_sequence",
9
9
  "get_model_count",
@@ -13,6 +13,7 @@ __all__ = [
13
13
  "set_component",
14
14
  "list_assemblies",
15
15
  "get_assembly",
16
+ "get_sse",
16
17
  ]
17
18
 
18
19
  import itertools
@@ -24,6 +25,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
24
25
  from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
25
26
  from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
26
27
  from biotite.structure.error import BadStructureError
28
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
29
+ from biotite.structure.filter import (
30
+ _canonical_nucleotide_list as canonical_nucleotide_list,
31
+ )
27
32
  from biotite.structure.filter import (
28
33
  filter_first_altloc,
29
34
  filter_highest_occupancy_altloc,
@@ -36,32 +41,38 @@ from biotite.structure.io.pdbx.bcif import (
36
41
  from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
37
42
  from biotite.structure.io.pdbx.component import MaskValue
38
43
  from biotite.structure.io.pdbx.encoding import StringArrayEncoding
39
- from biotite.structure.residues import get_residue_count, get_residue_starts_for
44
+ from biotite.structure.residues import (
45
+ get_residue_count,
46
+ get_residue_positions,
47
+ get_residue_starts_for,
48
+ )
40
49
  from biotite.structure.util import matrix_rotate
41
50
 
42
- # Cond types in `struct_conn` category that refer to covalent bonds
43
- PDBX_COVALENT_TYPES = [
44
- "covale",
45
- "covale_base",
46
- "covale_phosphate",
47
- "covale_sugar",
48
- "disulf",
49
- "modres",
50
- "modres_link",
51
- "metalc",
52
- ]
53
- # Map 'struct_conn' bond orders to 'BondType'...
54
- PDBX_BOND_ORDER_TO_TYPE = {
55
- "": BondType.ANY,
56
- "sing": BondType.SINGLE,
57
- "doub": BondType.DOUBLE,
58
- "trip": BondType.TRIPLE,
59
- "quad": BondType.QUADRUPLE,
51
+ # Bond types in `struct_conn` category that refer to covalent bonds
52
+ PDBX_BOND_TYPE_ID_TO_TYPE = {
53
+ # Although a covalent bond could in theory have a higher bond order,
54
+ # in practice inter-residue bonds are always single
55
+ "covale": BondType.SINGLE,
56
+ "covale_base": BondType.SINGLE,
57
+ "covale_phosphate": BondType.SINGLE,
58
+ "covale_sugar": BondType.SINGLE,
59
+ "disulf": BondType.SINGLE,
60
+ "modres": BondType.SINGLE,
61
+ "modres_link": BondType.SINGLE,
62
+ "metalc": BondType.COORDINATION,
63
+ }
64
+ PDBX_BOND_TYPE_TO_TYPE_ID = {
65
+ BondType.ANY: "covale",
66
+ BondType.SINGLE: "covale",
67
+ BondType.DOUBLE: "covale",
68
+ BondType.TRIPLE: "covale",
69
+ BondType.QUADRUPLE: "covale",
70
+ BondType.AROMATIC_SINGLE: "covale",
71
+ BondType.AROMATIC_DOUBLE: "covale",
72
+ BondType.AROMATIC_TRIPLE: "covale",
73
+ BondType.COORDINATION: "metalc",
60
74
  }
61
- # ...and vice versa
62
75
  PDBX_BOND_TYPE_TO_ORDER = {
63
- # 'ANY' is masked later, it is merely added here to avoid a KeyError
64
- BondType.ANY: "",
65
76
  BondType.SINGLE: "sing",
66
77
  BondType.DOUBLE: "doub",
67
78
  BondType.TRIPLE: "trip",
@@ -69,6 +80,10 @@ PDBX_BOND_TYPE_TO_ORDER = {
69
80
  BondType.AROMATIC_SINGLE: "sing",
70
81
  BondType.AROMATIC_DOUBLE: "doub",
71
82
  BondType.AROMATIC_TRIPLE: "trip",
83
+ # These are masked later; they are merely added here to avoid a KeyError
84
+ BondType.ANY: "",
85
+ BondType.AROMATIC: "",
86
+ BondType.COORDINATION: "",
72
87
  }
73
88
  # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
74
89
  COMP_BOND_ORDER_TO_TYPE = {
@@ -79,11 +94,19 @@ COMP_BOND_ORDER_TO_TYPE = {
79
94
  ("SING", "Y"): BondType.AROMATIC_SINGLE,
80
95
  ("DOUB", "Y"): BondType.AROMATIC_DOUBLE,
81
96
  ("TRIP", "Y"): BondType.AROMATIC_TRIPLE,
97
+ ("AROM", "Y"): BondType.AROMATIC,
82
98
  }
83
99
  # ...and vice versa
84
100
  COMP_BOND_TYPE_TO_ORDER = {
85
101
  bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
86
102
  }
103
+ CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
104
+ # It was observed that when the product of the number of rows in `atom_site`
105
+ # and `struct_conn` exceeds a certain threshold,
106
+ # a dictionary approach is less computation and memory intensive than the dense
107
+ # vectorized approach.
108
+ # https://github.com/biotite-dev/biotite/pull/765#issuecomment-2708867357
109
+ FIND_MATCHES_SWITCH_THRESHOLD = 4000000
87
110
 
88
111
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
89
112
  _nucleotideseq_type_list = [
@@ -146,8 +169,8 @@ def get_sequence(pdbx_file, data_block=None):
146
169
  -------
147
170
  sequence_dict : Dictionary of Sequences
148
171
  Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
149
- (often equivalent to chain_id and atom_site.auth_asym_id
150
- in most cases). Dictionary values are sequences.
172
+ (equivalent to ``atom_site.auth_asym_id``).
173
+ Dictionary values are sequences.
151
174
 
152
175
  Notes
153
176
  -----
@@ -203,9 +226,7 @@ def get_model_count(pdbx_file, data_block=None):
203
226
  The number of models.
204
227
  """
205
228
  block = _get_block(pdbx_file, data_block)
206
- return len(
207
- _get_model_starts(block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))
208
- )
229
+ return len(np.unique((block["atom_site"]["pdbx_PDB_model_num"].as_array(np.int32))))
209
230
 
210
231
 
211
232
  def get_structure(
@@ -296,7 +317,6 @@ def get_structure(
296
317
  >>> arr = get_structure(file, model=1)
297
318
  >>> print(len(arr))
298
319
  304
299
-
300
320
  """
301
321
  block = _get_block(pdbx_file, data_block)
302
322
 
@@ -307,13 +327,12 @@ def get_structure(
307
327
  raise InvalidFileError("Missing 'atom_site' category in file")
308
328
 
309
329
  models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
310
- model_starts = _get_model_starts(models)
311
- model_count = len(model_starts)
330
+ model_count = len(np.unique(models))
312
331
  atom_count = len(models)
313
332
 
314
333
  if model is None:
315
334
  # For a stack, the annotations are derived from the first model
316
- model_atom_site = _filter_model(atom_site, model_starts, 1)
335
+ model_atom_site = _filter_model(atom_site, 1)
317
336
  # Any field of the category would work here to get the length
318
337
  model_length = model_atom_site.row_count
319
338
  atoms = AtomArrayStack(model_count, model_length)
@@ -359,7 +378,7 @@ def get_structure(
359
378
  f"the given model {model} does not exist"
360
379
  )
361
380
 
362
- model_atom_site = _filter_model(atom_site, model_starts, model)
381
+ model_atom_site = _filter_model(atom_site, model)
363
382
  # Any field of the category would work here to get the length
364
383
  model_length = model_atom_site.row_count
365
384
  atoms = AtomArray(model_length)
@@ -475,16 +494,53 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
475
494
  array.set_annotation("element", atom_site["type_symbol"].as_array(str))
476
495
 
477
496
  if "atom_id" in extra_fields:
478
- array.set_annotation("atom_id", atom_site["id"].as_array(int))
497
+ if "id" in atom_site:
498
+ array.set_annotation("atom_id", atom_site["id"].as_array(int))
499
+ else:
500
+ warnings.warn(
501
+ "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
502
+ UserWarning,
503
+ )
504
+ array.set_annotation("atom_id", np.arange(array.array_length()))
479
505
  extra_fields.remove("atom_id")
480
506
  if "b_factor" in extra_fields:
481
- array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
507
+ if "B_iso_or_equiv" in atom_site:
508
+ array.set_annotation(
509
+ "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
510
+ )
511
+ else:
512
+ warnings.warn(
513
+ "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
514
+ UserWarning,
515
+ )
516
+ array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
482
517
  extra_fields.remove("b_factor")
483
518
  if "occupancy" in extra_fields:
484
- array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
519
+ if "occupancy" in atom_site:
520
+ array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
521
+ else:
522
+ warnings.warn(
523
+ "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
524
+ UserWarning,
525
+ )
526
+ array.set_annotation(
527
+ "occupancy", np.ones(array.array_length(), dtype=float)
528
+ )
485
529
  extra_fields.remove("occupancy")
486
530
  if "charge" in extra_fields:
487
- array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
531
+ if "pdbx_formal_charge" in atom_site:
532
+ array.set_annotation(
533
+ "charge",
534
+ atom_site["pdbx_formal_charge"].as_array(
535
+ int, 0
536
+ ), # masked values are set to 0
537
+ )
538
+ else:
539
+ warnings.warn(
540
+ "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
541
+ UserWarning,
542
+ )
543
+ array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
488
544
  extra_fields.remove("charge")
489
545
 
490
546
  # Handle all remaining custom fields
@@ -536,7 +592,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
536
592
  ]
537
593
 
538
594
  covale_mask = np.isin(
539
- struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
595
+ struct_conn["conn_type_id"].as_array(str),
596
+ list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
540
597
  )
541
598
  if "ptnr1_symmetry" in struct_conn:
542
599
  covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
@@ -576,13 +633,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
576
633
  atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
577
634
  atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
578
635
 
579
- # Interpret missing values as ANY bonds
580
- bond_order = struct_conn["pdbx_value_order"].as_array(str, "")
636
+ bond_type_id = struct_conn["conn_type_id"].as_array()
581
637
  # Consecutively apply the same masks as applied to the atom indices
582
638
  # Logical combination does not work here,
583
639
  # as the second mask was created based on already filtered data
584
- bond_order = bond_order[covale_mask][mapping_exists_mask]
585
- bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
640
+ bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
641
+ # The type ID is always present in the dictionary,
642
+ # as it was used to filter the applicable bonds
643
+ bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
586
644
 
587
645
  return BondList(
588
646
  atom_site.row_count,
@@ -593,9 +651,20 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
593
651
  def _find_matches(query_arrays, reference_arrays):
594
652
  """
595
653
  For each index in the `query_arrays` find the indices in the
596
- `reference_arrays` where all query values the reference counterpart.
654
+ `reference_arrays` where all query values match the reference counterpart.
597
655
  If no match is found for a query, the corresponding index is -1.
598
656
  """
657
+ if (
658
+ query_arrays[0].shape[0] * reference_arrays[0].shape[0]
659
+ <= FIND_MATCHES_SWITCH_THRESHOLD
660
+ ):
661
+ match_indices = _find_matches_by_dense_array(query_arrays, reference_arrays)
662
+ else:
663
+ match_indices = _find_matches_by_dict(query_arrays, reference_arrays)
664
+ return match_indices
665
+
666
+
667
+ def _find_matches_by_dense_array(query_arrays, reference_arrays):
599
668
  match_masks_for_all_columns = np.stack(
600
669
  [
601
670
  query[:, np.newaxis] == reference[np.newaxis, :]
@@ -623,6 +692,38 @@ def _find_matches(query_arrays, reference_arrays):
623
692
  return match_indices
624
693
 
625
694
 
695
+ def _find_matches_by_dict(query_arrays, reference_arrays):
696
+ # Convert reference arrays to a dictionary for O(1) lookups
697
+ reference_dict = {}
698
+ ambiguous_keys = set()
699
+ for ref_idx, ref_row in enumerate(zip(*reference_arrays)):
700
+ ref_key = tuple(ref_row)
701
+ if ref_key in reference_dict:
702
+ ambiguous_keys.add(ref_key)
703
+ continue
704
+ reference_dict[ref_key] = ref_idx
705
+
706
+ match_indices = []
707
+ for query_idx, query_row in enumerate(zip(*query_arrays)):
708
+ query_key = tuple(query_row)
709
+ occurrence = reference_dict.get(query_key)
710
+
711
+ if occurrence is None:
712
+ # -1 indicates that no match was found in the reference
713
+ match_indices.append(-1)
714
+ elif query_key in ambiguous_keys:
715
+ # The query cannot be uniquely matched to an atom in the reference
716
+ raise InvalidFileError(
717
+ f"The covalent bond in the 'struct_conn' category at index "
718
+ f"{query_idx} cannot be unambiguously assigned to atoms in "
719
+ f"the 'atom_site' category"
720
+ )
721
+ else:
722
+ match_indices.append(occurrence)
723
+
724
+ return np.array(match_indices)
725
+
726
+
626
727
  def _get_struct_conn_col_name(col_name, partner):
627
728
  """
628
729
  For a column name in ``atom_site`` get the corresponding column name
@@ -661,21 +762,26 @@ def _filter_altloc(array, atom_site, altloc):
661
762
  raise ValueError(f"'{altloc}' is not a valid 'altloc' option")
662
763
 
663
764
 
664
- def _get_model_starts(model_array):
665
- """
666
- Get the start index for each model in the arrays of the
667
- ``atom_site`` category.
668
- """
669
- _, indices = np.unique(model_array, return_index=True)
670
- indices.sort()
671
- return indices
672
-
673
-
674
- def _filter_model(atom_site, model_starts, model):
765
+ def _filter_model(atom_site, model):
675
766
  """
676
767
  Reduce the ``atom_site`` category to the values for the given
677
768
  model.
769
+
770
+ Parameters
771
+ ----------
772
+ atom_site : CIFCategory or BinaryCIFCategory
773
+ ``atom_site`` category containing all models.
774
+ model : int
775
+ The model to be selected.
776
+
777
+ Returns
778
+ -------
779
+ atom_site : CIFCategory or BinaryCIFCategory
780
+ The ``atom_site`` category containing only the selected model.
678
781
  """
782
+ models = atom_site["pdbx_PDB_model_num"].as_array(np.int32)
783
+ _, model_starts = np.unique(models, return_index=True)
784
+ model_starts.sort()
679
785
  # Append exclusive stop
680
786
  model_starts = np.append(model_starts, [atom_site.row_count])
681
787
  # Indexing starts at 0, but model number starts at 1
@@ -703,7 +809,13 @@ def _get_box(block):
703
809
  return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
704
810
 
705
811
 
706
- def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
812
+ def set_structure(
813
+ pdbx_file,
814
+ array,
815
+ data_block=None,
816
+ include_bonds=False,
817
+ extra_fields=[],
818
+ ):
707
819
  """
708
820
  Set the ``atom_site`` category with atom information from an
709
821
  :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -737,6 +849,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
737
849
  category.
738
850
  Inter-residue bonds will be written into the ``struct_conn``
739
851
  independent of this parameter.
852
+ extra_fields : list of str, optional
853
+ List of additional fields from the ``atom_site`` category
854
+ that should be written into the file.
855
+ Default is an empty list.
740
856
 
741
857
  Notes
742
858
  -----
@@ -752,7 +868,6 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
752
868
  >>> file = CIFFile()
753
869
  >>> set_structure(file, atom_array)
754
870
  >>> file.write(os.path.join(path_to_directory, "structure.cif"))
755
-
756
871
  """
757
872
  _check_non_empty(array)
758
873
 
@@ -773,7 +888,11 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
773
888
  )
774
889
  atom_site["label_comp_id"] = np.copy(array.res_name)
775
890
  atom_site["label_asym_id"] = np.copy(array.chain_id)
776
- atom_site["label_entity_id"] = _determine_entity_id(array.chain_id)
891
+ atom_site["label_entity_id"] = (
892
+ np.copy(array.label_entity_id)
893
+ if "label_entity_id" in array.get_annotation_categories()
894
+ else _determine_entity_id(array.chain_id)
895
+ )
777
896
  atom_site["label_seq_id"] = np.copy(array.res_id)
778
897
  atom_site["pdbx_PDB_ins_code"] = Column(
779
898
  np.copy(array.ins_code),
@@ -797,6 +916,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
797
916
  np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
798
917
  )
799
918
 
919
+ # Handle all remaining custom fields
920
+ if len(extra_fields) > 0:
921
+ # ... check to avoid clashes with standard annotations
922
+ _standard_annotations = [
923
+ "hetero",
924
+ "element",
925
+ "atom_name",
926
+ "res_name",
927
+ "chain_id",
928
+ "res_id",
929
+ "ins_code",
930
+ "atom_id",
931
+ "b_factor",
932
+ "occupancy",
933
+ "charge",
934
+ ]
935
+ _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
936
+
937
+ for annot in extra_fields:
938
+ if annot in _reserved_annotation_names:
939
+ raise ValueError(
940
+ f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
941
+ "Please choose another name."
942
+ )
943
+ atom_site[annot] = np.copy(array.get_annotation(annot))
944
+
800
945
  if array.bonds is not None:
801
946
  struct_conn = _set_inter_residue_bonds(array, atom_site)
802
947
  if struct_conn is not None:
@@ -1021,13 +1166,21 @@ def _set_inter_residue_bonds(array, atom_site):
1021
1166
  if len(bond_array) == 0:
1022
1167
  return None
1023
1168
 
1169
+ # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
1170
+ # nucleotide/amino acid residues
1171
+ bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
1172
+ if len(bond_array) == 0:
1173
+ return None
1174
+
1024
1175
  struct_conn = Category()
1025
1176
  struct_conn["id"] = np.arange(1, len(bond_array) + 1)
1026
- struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
1177
+ struct_conn["conn_type_id"] = [
1178
+ PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
1179
+ ]
1027
1180
  struct_conn["pdbx_value_order"] = Column(
1028
1181
  np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
1029
1182
  np.where(
1030
- bond_array[:, 2] == BondType.ANY,
1183
+ np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
1031
1184
  MaskValue.MISSING,
1032
1185
  MaskValue.PRESENT,
1033
1186
  ),
@@ -1063,7 +1216,34 @@ def _filter_bonds(array, connection):
1063
1216
  raise ValueError("Invalid 'connection' option")
1064
1217
 
1065
1218
 
1066
- def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
1219
def _filter_canonical_links(array, bond_array):
    """
    Identify 'standard' links, i.e. backbone bonds between adjacent
    canonical amino acid or nucleotide residues.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray, shape=(n,3)
        The bonds to check.
        The first two columns are the indices of the bonded atoms in
        `array`.

    Returns
    -------
    mask : ndarray, shape=(n,), dtype=bool
        True for each bond that is such a standard link.
        Note that the bonds are not removed here, the caller applies
        the (inverted) mask.
    """
    first_atoms = bond_array[:, 0]
    second_atoms = bond_array[:, 1]
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(
        array, bond_array[:, :2].flatten()
    ).reshape(-1, 2)

    return (
        # Must be canonical residues
        np.isin(array.res_name[first_atoms], CANONICAL_RESIDUE_LIST)
        & np.isin(array.res_name[second_atoms], CANONICAL_RESIDUE_LIST)
        # Must be backbone bond
        & np.isin(array.atom_name[first_atoms], ("C", "O3'"))
        & np.isin(array.atom_name[second_atoms], ("N", "P"))
        # Must connect adjacent residues
        # The parentheses are essential: '&' binds tighter than '==',
        # so without them the residue distance would be bitwise-ANDed
        # with the masks above before the comparison, which would accept
        # every odd residue distance
        & (residue_indices[:, 1] - residue_indices[:, 0] == 1)
    )
1238
+
1239
+
1240
+ def get_component(
1241
+ pdbx_file,
1242
+ data_block=None,
1243
+ use_ideal_coord=True,
1244
+ res_name=None,
1245
+ allow_missing_coord=False,
1246
+ ):
1067
1247
  """
1068
1248
  Create an :class:`AtomArray` for a chemical component from the
1069
1249
  ``chem_comp_atom`` and, if available, the ``chem_comp_bond``
@@ -1091,6 +1271,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1091
1271
  In this case, the component with the given residue name is
1092
1272
  read.
1093
1273
  By default, all rows would be read in this case.
1274
+ allow_missing_coord : bool, optional
1275
+ Whether to allow missing coordinate values in components.
1276
+ If ``True``, these will be represented as ``nan`` values.
1277
+ If ``False``, a ``ValueError`` is raised when missing coordinates
1278
+ are encountered.
1094
1279
 
1095
1280
  Returns
1096
1281
  -------
@@ -1161,17 +1346,29 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1161
1346
  # Swap with the fallback option
1162
1347
  coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
1163
1348
  try:
1164
- for i, field in enumerate(coord_fields):
1165
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1166
- except KeyError as err:
1167
- key = err.args[0]
1168
- warnings.warn(
1169
- f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1170
- f"The fallback coordinates will be used instead",
1171
- UserWarning,
1349
+ array.coord = _parse_component_coordinates(
1350
+ [atom_category[field] for field in coord_fields]
1351
+ )
1352
+ except Exception as err:
1353
+ if isinstance(err, KeyError):
1354
+ key = err.args[0]
1355
+ warnings.warn(
1356
+ f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1357
+ f"The fallback coordinates will be used instead",
1358
+ UserWarning,
1359
+ )
1360
+ elif isinstance(err, ValueError):
1361
+ warnings.warn(
1362
+ "The coordinates are missing for some atoms. "
1363
+ "The fallback coordinates will be used instead",
1364
+ UserWarning,
1365
+ )
1366
+ else:
1367
+ raise
1368
+ array.coord = _parse_component_coordinates(
1369
+ [atom_category[field] for field in alt_coord_fields],
1370
+ allow_missing=allow_missing_coord,
1172
1371
  )
1173
- for i, field in enumerate(alt_coord_fields):
1174
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1175
1372
 
1176
1373
  try:
1177
1374
  bond_category = block["chem_comp_bond"]
@@ -1181,7 +1378,7 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1181
1378
  )
1182
1379
  except KeyError:
1183
1380
  warnings.warn(
1184
- "Category 'chem_comp_bond' not found. " "No bonds will be parsed",
1381
+ "Category 'chem_comp_bond' not found. No bonds will be parsed",
1185
1382
  UserWarning,
1186
1383
  )
1187
1384
  else:
@@ -1201,6 +1398,23 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1201
1398
  return array
1202
1399
 
1203
1400
 
1401
+ def _parse_component_coordinates(coord_columns, allow_missing=False):
1402
+ coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
1403
+ for i, column in enumerate(coord_columns):
1404
+ if column.mask is not None and column.mask.array.any():
1405
+ if allow_missing:
1406
+ warnings.warn(
1407
+ "Missing coordinates for some atoms. Those will be set to nan",
1408
+ UserWarning,
1409
+ )
1410
+ else:
1411
+ raise ValueError(
1412
+ "Missing coordinates for some atoms",
1413
+ )
1414
+ coord[:, i] = column.as_array(np.float32, masked_value=np.nan)
1415
+ return coord
1416
+
1417
+
1204
1418
  def set_component(pdbx_file, array, data_block=None):
1205
1419
  """
1206
1420
  Set the ``chem_comp_atom`` and, if bonds are available,
@@ -1305,6 +1519,7 @@ def list_assemblies(pdbx_file, data_block=None):
1305
1519
 
1306
1520
  Examples
1307
1521
  --------
1522
+
1308
1523
  >>> import os.path
1309
1524
  >>> file = CIFFile.read(os.path.join(path_to_structures, "1f2n.cif"))
1310
1525
  >>> assembly_ids = list_assemblies(file)
@@ -1417,7 +1632,10 @@ def get_assembly(
1417
1632
  Returns
1418
1633
  -------
1419
1634
  assembly : AtomArray or AtomArrayStack
1420
- The assembly. The return type depends on the `model` parameter.
1635
+ The assembly.
1636
+ The return type depends on the `model` parameter.
1637
+ Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
1638
+ unit in the assembly.
1421
1639
 
1422
1640
  Examples
1423
1641
  --------
@@ -1506,7 +1724,6 @@ def _apply_transformations(structure, transformation_dict, operations):
1506
1724
  """
1507
1725
  # Additional first dimesion for 'structure.repeat()'
1508
1726
  assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
1509
-
1510
1727
  # Apply corresponding transformation for each copy in the assembly
1511
1728
  for i, operation in enumerate(operations):
1512
1729
  coord = structure.coord
@@ -1520,7 +1737,11 @@ def _apply_transformations(structure, transformation_dict, operations):
1520
1737
  coord += translation_vector
1521
1738
  assembly_coord[i] = coord
1522
1739
 
1523
- return repeat(structure, assembly_coord)
1740
+ assembly = repeat(structure, assembly_coord)
1741
+ assembly.set_annotation(
1742
+ "sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
1743
+ )
1744
+ return assembly
1524
1745
 
1525
1746
 
1526
1747
  def _get_transformations(struct_oper):
@@ -1596,4 +1817,118 @@ def _convert_string_to_sequence(string, stype):
1596
1817
  elif stype in _other_type_list:
1597
1818
  return None
1598
1819
  else:
1599
- raise InvalidFileError("mmCIF _entity_poly.type unsupported" " type: " + stype)
1820
+ raise InvalidFileError("mmCIF _entity_poly.type unsupported type: " + stype)
1821
+
1822
+
1823
+ def get_sse(pdbx_file, data_block=None, match_model=None):
1824
+ """
1825
+ Get the secondary structure from a PDBx file.
1826
+
1827
+ Parameters
1828
+ ----------
1829
+ pdbx_file : CIFFile or CIFBlock or BinaryCIFFile or BinaryCIFBlock
1830
+ The file object.
1831
+ The following categories are required:
1832
+
1833
+ - ``entity_poly``
1834
+ - ``struct_conf`` (if alpha-helices are present)
1835
+ - ``struct_sheet_range`` (if beta-strands are present)
1836
+ - ``atom_site`` (if `match_model` is set)
1837
+
1838
+ data_block : str, optional
1839
+ The name of the data block.
1840
+ Default is the first (and most times only) data block of the
1841
+ file.
1842
+ If the data block object is passed directly to `pdbx_file`,
1843
+ this parameter is ignored.
1844
+ match_model : None, optional
1845
+ If a model number is given, only secondary structure elements for residues are
1846
+ kept, that are resolved in the given model.
1847
+ This means secondary structure elements for residues that would not appear
1848
+ in a corresponding :class:`AtomArray` from :func:`get_structure()` are removed.
1849
+ By default, all residues in the sequence are kept.
1850
+
1851
+ Returns
1852
+ -------
1853
+ sse_dict : dict of str -> ndarray, dtype=str
1854
+ The dictionary maps the chain ID (derived from ``auth_asym_id``) to the
1855
+ secondary structure of the respective chain.
1856
+
1857
+ - ``"a"``: alpha-helix
1858
+ - ``"b"``: beta-strand
1859
+ - ``"c"``: coil or not an amino acid
1860
+
1861
+ Each secondary structure element corresponds to the ``label_seq_id`` of the
1862
+ ``atom_site`` category.
1863
+ This means that the 0-th position of the array corresponds to the residue
1864
+ in ``atom_site`` with ``label_seq_id`` ``1``.
1865
+
1866
+ Examples
1867
+ --------
1868
+
1869
+ >>> import os.path
1870
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "1aki.cif"))
1871
+ >>> sse = get_sse(file, match_model=1)
1872
+ >>> print(sse)
1873
+ {'A': array(['c', 'c', 'c', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
1874
+ 'a', 'c', 'c', 'c', 'c', 'c', 'a', 'a', 'a', 'c', 'c', 'a', 'a',
1875
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
1876
+ 'c', 'c', 'c', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'b', 'b',
1877
+ 'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
1878
+ 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c',
1879
+ 'c', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'a', 'a', 'a',
1880
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'a',
1881
+ 'a', 'a', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c',
1882
+ 'c', 'c', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c'],
1883
+ dtype='<U1')}
1884
+
1885
+ If only secondary structure elements for resolved residues are requested, the length
1886
+ of the returned array matches the number of peptide residues in the structure.
1887
+
1888
+ >>> file = CIFFile.read(os.path.join(path_to_structures, "3o5r.cif"))
1889
+ >>> print(len(get_sse(file, match_model=1)["A"]))
1890
+ 128
1891
+ >>> atoms = get_structure(file, model=1)
1892
+ >>> atoms = atoms[filter_amino_acids(atoms) & (atoms.chain_id == "A")]
1893
+ >>> print(get_residue_count(atoms))
1894
+ 128
1895
+ """
1896
+ block = _get_block(pdbx_file, data_block)
1897
+
1898
+ # Init all chains with "c" for coil
1899
+ sse_dict = {
1900
+ chain_id: np.repeat("c", len(sequence))
1901
+ for chain_id, sequence in get_sequence(block).items()
1902
+ }
1903
+
1904
+ # Populate SSE arrays with helices and strands
1905
+ for sse_symbol, category_name in [
1906
+ ("a", "struct_conf"),
1907
+ ("b", "struct_sheet_range"),
1908
+ ]:
1909
+ if category_name in block:
1910
+ category = block[category_name]
1911
+ chains = category["beg_auth_asym_id"].as_array(str)
1912
+ start_positions = category["beg_label_seq_id"].as_array(int)
1913
+ end_positions = category["end_label_seq_id"].as_array(int)
1914
+
1915
+ # set alpha helix positions
1916
+ for chain, start, end in zip(chains, start_positions, end_positions):
1917
+ # Translate the 1-based positions from PDBx into 0-based array indices
1918
+ sse_dict[chain][start - 1 : end] = sse_symbol
1919
+
1920
+ if match_model is not None:
1921
+ model_atom_site = _filter_model(block["atom_site"], match_model)
1922
+ chain_ids = model_atom_site["auth_asym_id"].as_array(str)
1923
+ res_ids = model_atom_site["label_seq_id"].as_array(int, masked_value=-1)
1924
+ # Filter out masked residues, i.e. residues not part of a chain
1925
+ mask = res_ids != -1
1926
+ chain_ids = chain_ids[mask]
1927
+ res_ids = res_ids[mask]
1928
+ for chain_id, sse in sse_dict.items():
1929
+ res_ids_in_chain = res_ids[chain_ids == chain_id]
1930
+ # Transform from 1-based residue ID to 0-based index
1931
+ indices = np.unique(res_ids_in_chain) - 1
1932
+ sse_dict[chain_id] = sse[indices]
1933
+
1934
+ return sse_dict