biotite 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (92) hide show
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  20. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  26. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  28. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  37. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +156 -43
  51. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  54. biotite/structure/charges.cpython-311-darwin.so +0 -0
  55. biotite/structure/filter.py +1 -1
  56. biotite/structure/geometry.py +60 -113
  57. biotite/structure/info/__init__.py +1 -0
  58. biotite/structure/info/atoms.py +13 -13
  59. biotite/structure/info/bonds.py +12 -6
  60. biotite/structure/info/ccd.py +125 -32
  61. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  62. biotite/structure/info/groups.py +63 -17
  63. biotite/structure/info/masses.py +9 -6
  64. biotite/structure/info/misc.py +15 -21
  65. biotite/structure/info/standardize.py +3 -2
  66. biotite/structure/io/mol/sdf.py +41 -40
  67. biotite/structure/io/pdb/convert.py +2 -0
  68. biotite/structure/io/pdb/file.py +74 -3
  69. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  70. biotite/structure/io/pdbqt/file.py +32 -32
  71. biotite/structure/io/pdbx/__init__.py +1 -0
  72. biotite/structure/io/pdbx/bcif.py +32 -8
  73. biotite/structure/io/pdbx/cif.py +148 -107
  74. biotite/structure/io/pdbx/component.py +9 -4
  75. biotite/structure/io/pdbx/compress.py +321 -0
  76. biotite/structure/io/pdbx/convert.py +227 -68
  77. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  78. biotite/structure/io/pdbx/encoding.pyx +98 -17
  79. biotite/structure/io/trajfile.py +16 -16
  80. biotite/structure/molecules.py +141 -141
  81. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  82. biotite/structure/segments.py +1 -2
  83. biotite/structure/util.py +73 -1
  84. biotite/version.py +2 -2
  85. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
  86. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
  87. biotite/structure/info/ccd/README.rst +0 -8
  88. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  89. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  90. biotite/structure/info/ccd/nucleotides.txt +0 -798
  91. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  92. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -24,6 +24,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
24
24
  from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
25
25
  from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
26
26
  from biotite.structure.error import BadStructureError
27
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
28
+ from biotite.structure.filter import (
29
+ _canonical_nucleotide_list as canonical_nucleotide_list,
30
+ )
27
31
  from biotite.structure.filter import (
28
32
  filter_first_altloc,
29
33
  filter_highest_occupancy_altloc,
@@ -36,32 +40,38 @@ from biotite.structure.io.pdbx.bcif import (
36
40
  from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
37
41
  from biotite.structure.io.pdbx.component import MaskValue
38
42
  from biotite.structure.io.pdbx.encoding import StringArrayEncoding
39
- from biotite.structure.residues import get_residue_count, get_residue_starts_for
43
+ from biotite.structure.residues import (
44
+ get_residue_count,
45
+ get_residue_positions,
46
+ get_residue_starts_for,
47
+ )
40
48
  from biotite.structure.util import matrix_rotate
41
49
 
42
- # Cond types in `struct_conn` category that refer to covalent bonds
43
- PDBX_COVALENT_TYPES = [
44
- "covale",
45
- "covale_base",
46
- "covale_phosphate",
47
- "covale_sugar",
48
- "disulf",
49
- "modres",
50
- "modres_link",
51
- "metalc",
52
- ]
53
- # Map 'struct_conn' bond orders to 'BondType'...
54
- PDBX_BOND_ORDER_TO_TYPE = {
55
- "": BondType.ANY,
56
- "sing": BondType.SINGLE,
57
- "doub": BondType.DOUBLE,
58
- "trip": BondType.TRIPLE,
59
- "quad": BondType.QUADRUPLE,
50
+ # Bond types in `struct_conn` category that refer to covalent bonds
51
+ PDBX_BOND_TYPE_ID_TO_TYPE = {
52
+ # Although a covalent bond could in theory have a higher bond order,
53
+ # practically inter-residue bonds are always single
54
+ "covale": BondType.SINGLE,
55
+ "covale_base": BondType.SINGLE,
56
+ "covale_phosphate": BondType.SINGLE,
57
+ "covale_sugar": BondType.SINGLE,
58
+ "disulf": BondType.SINGLE,
59
+ "modres": BondType.SINGLE,
60
+ "modres_link": BondType.SINGLE,
61
+ "metalc": BondType.COORDINATION,
62
+ }
63
+ PDBX_BOND_TYPE_TO_TYPE_ID = {
64
+ BondType.ANY: "covale",
65
+ BondType.SINGLE: "covale",
66
+ BondType.DOUBLE: "covale",
67
+ BondType.TRIPLE: "covale",
68
+ BondType.QUADRUPLE: "covale",
69
+ BondType.AROMATIC_SINGLE: "covale",
70
+ BondType.AROMATIC_DOUBLE: "covale",
71
+ BondType.AROMATIC_TRIPLE: "covale",
72
+ BondType.COORDINATION: "metalc",
60
73
  }
61
- # ...and vice versa
62
74
  PDBX_BOND_TYPE_TO_ORDER = {
63
- # 'ANY' is masked later, it is merely added here to avoid a KeyError
64
- BondType.ANY: "",
65
75
  BondType.SINGLE: "sing",
66
76
  BondType.DOUBLE: "doub",
67
77
  BondType.TRIPLE: "trip",
@@ -69,6 +79,9 @@ PDBX_BOND_TYPE_TO_ORDER = {
69
79
  BondType.AROMATIC_SINGLE: "sing",
70
80
  BondType.AROMATIC_DOUBLE: "doub",
71
81
  BondType.AROMATIC_TRIPLE: "trip",
82
+ # These are masked later; they are merely added here to avoid a KeyError
83
+ BondType.ANY: "",
84
+ BondType.COORDINATION: "",
72
85
  }
73
86
  # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
74
87
  COMP_BOND_ORDER_TO_TYPE = {
@@ -84,6 +97,7 @@ COMP_BOND_ORDER_TO_TYPE = {
84
97
  COMP_BOND_TYPE_TO_ORDER = {
85
98
  bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
86
99
  }
100
+ CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
87
101
 
88
102
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
89
103
  _nucleotideseq_type_list = [
@@ -450,7 +464,7 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
450
464
  "chain_id",
451
465
  _get_or_fallback(
452
466
  atom_site, f"{prefix}_asym_id", f"{alt_prefix}_asym_id"
453
- ).as_array("U4"),
467
+ ).as_array(str),
454
468
  )
455
469
  array.set_annotation(
456
470
  "res_id",
@@ -458,33 +472,70 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
458
472
  atom_site, f"{prefix}_seq_id", f"{alt_prefix}_seq_id"
459
473
  ).as_array(int, -1),
460
474
  )
461
- array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array("U1", ""))
475
+ array.set_annotation("ins_code", atom_site["pdbx_PDB_ins_code"].as_array(str, ""))
462
476
  array.set_annotation(
463
477
  "res_name",
464
478
  _get_or_fallback(
465
479
  atom_site, f"{prefix}_comp_id", f"{alt_prefix}_comp_id"
466
- ).as_array("U5"),
480
+ ).as_array(str),
467
481
  )
468
482
  array.set_annotation("hetero", atom_site["group_PDB"].as_array(str) == "HETATM")
469
483
  array.set_annotation(
470
484
  "atom_name",
471
485
  _get_or_fallback(
472
486
  atom_site, f"{prefix}_atom_id", f"{alt_prefix}_atom_id"
473
- ).as_array("U6"),
487
+ ).as_array(str),
474
488
  )
475
- array.set_annotation("element", atom_site["type_symbol"].as_array("U2"))
489
+ array.set_annotation("element", atom_site["type_symbol"].as_array(str))
476
490
 
477
491
  if "atom_id" in extra_fields:
478
- array.set_annotation("atom_id", atom_site["id"].as_array(int))
492
+ if "id" in atom_site:
493
+ array.set_annotation("atom_id", atom_site["id"].as_array(int))
494
+ else:
495
+ warnings.warn(
496
+ "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
497
+ UserWarning,
498
+ )
499
+ array.set_annotation("atom_id", np.arange(array.array_length()))
479
500
  extra_fields.remove("atom_id")
480
501
  if "b_factor" in extra_fields:
481
- array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
502
+ if "B_iso_or_equiv" in atom_site:
503
+ array.set_annotation(
504
+ "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
505
+ )
506
+ else:
507
+ warnings.warn(
508
+ "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
509
+ UserWarning,
510
+ )
511
+ array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
482
512
  extra_fields.remove("b_factor")
483
513
  if "occupancy" in extra_fields:
484
- array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
514
+ if "occupancy" in atom_site:
515
+ array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
516
+ else:
517
+ warnings.warn(
518
+ "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
519
+ UserWarning,
520
+ )
521
+ array.set_annotation(
522
+ "occupancy", np.ones(array.array_length(), dtype=float)
523
+ )
485
524
  extra_fields.remove("occupancy")
486
525
  if "charge" in extra_fields:
487
- array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
526
+ if "pdbx_formal_charge" in atom_site:
527
+ array.set_annotation(
528
+ "charge",
529
+ atom_site["pdbx_formal_charge"].as_array(
530
+ int, 0
531
+ ), # masked values are set to 0
532
+ )
533
+ else:
534
+ warnings.warn(
535
+ "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
536
+ UserWarning,
537
+ )
538
+ array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
488
539
  extra_fields.remove("charge")
489
540
 
490
541
  # Handle all remaining custom fields
@@ -536,7 +587,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
536
587
  ]
537
588
 
538
589
  covale_mask = np.isin(
539
- struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
590
+ struct_conn["conn_type_id"].as_array(str),
591
+ list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
540
592
  )
541
593
  if "ptnr1_symmetry" in struct_conn:
542
594
  covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
@@ -576,13 +628,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
576
628
  atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
577
629
  atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
578
630
 
579
- # Interpret missing values as ANY bonds
580
- bond_order = struct_conn["pdbx_value_order"].as_array("U4", "")
631
+ bond_type_id = struct_conn["conn_type_id"].as_array()
581
632
  # Consecutively apply the same masks as applied to the atom indices
582
633
  # Logical combination does not work here,
583
634
  # as the second mask was created based on already filtered data
584
- bond_order = bond_order[covale_mask][mapping_exists_mask]
585
- bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
635
+ bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
636
+ # The type ID is always present in the dictionary,
637
+ # as it was used to filter the applicable bonds
638
+ bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
586
639
 
587
640
  return BondList(
588
641
  atom_site.row_count,
@@ -593,7 +646,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
593
646
  def _find_matches(query_arrays, reference_arrays):
594
647
  """
595
648
  For each index in the `query_arrays` find the indices in the
596
- `reference_arrays` where all query values the reference counterpart.
649
+ `reference_arrays` where all query values match the reference counterpart.
597
650
  If no match is found for a query, the corresponding index is -1.
598
651
  """
599
652
  match_masks_for_all_columns = np.stack(
@@ -703,7 +756,13 @@ def _get_box(block):
703
756
  return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
704
757
 
705
758
 
706
- def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
759
+ def set_structure(
760
+ pdbx_file,
761
+ array,
762
+ data_block=None,
763
+ include_bonds=False,
764
+ extra_fields=[],
765
+ ):
707
766
  """
708
767
  Set the ``atom_site`` category with atom information from an
709
768
  :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -737,6 +796,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
737
796
  category.
738
797
  Inter-residue bonds will be written into the ``struct_conn``
739
798
  independent of this parameter.
799
+ extra_fields : list of str, optional
800
+ List of additional annotations of `array` that should be written
801
+ as extra columns into the ``atom_site`` category.
802
+ Default is an empty list.
740
803
 
741
804
  Notes
742
805
  -----
@@ -797,6 +860,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
797
860
  np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
798
861
  )
799
862
 
863
+ # Handle all remaining custom fields
864
+ if len(extra_fields) > 0:
865
+ # ... check to avoid clashes with standard annotations
866
+ _standard_annotations = [
867
+ "hetero",
868
+ "element",
869
+ "atom_name",
870
+ "res_name",
871
+ "chain_id",
872
+ "res_id",
873
+ "ins_code",
874
+ "atom_id",
875
+ "b_factor",
876
+ "occupancy",
877
+ "charge",
878
+ ]
879
+ _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
880
+
881
+ for annot in extra_fields:
882
+ if annot in _reserved_annotation_names:
883
+ raise ValueError(
884
+ f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
885
+ "Please choose another name."
886
+ )
887
+ atom_site[annot] = np.copy(array.get_annotation(annot))
888
+
800
889
  if array.bonds is not None:
801
890
  struct_conn = _set_inter_residue_bonds(array, atom_site)
802
891
  if struct_conn is not None:
@@ -964,25 +1053,38 @@ def _set_intra_residue_bonds(array, atom_site):
964
1053
  aromatic_flag[i] = aromatic
965
1054
  any_mask = bond_array[:, 2] == BondType.ANY
966
1055
 
967
- chem_comp_bond = Category()
1056
+ # Remove already existing residue and atom name combinations
1057
+ # These appear when the structure contains a residue multiple times
1058
+ atom_id_1 = array.atom_name[bond_array[:, 0]]
1059
+ atom_id_2 = array.atom_name[bond_array[:, 1]]
968
1060
  # Take the residue name from the first atom index, as the residue
969
1061
  # name is the same for both atoms, since we have only intra bonds
970
- chem_comp_bond["comp_id"] = array.res_name[bond_array[:, 0]]
971
- chem_comp_bond["atom_id_1"] = array.atom_name[bond_array[:, 0]]
972
- chem_comp_bond["atom_id_2"] = array.atom_name[bond_array[:, 1]]
1062
+ comp_id = array.res_name[bond_array[:, 0]]
1063
+ _, unique_indices = np.unique(
1064
+ np.stack([comp_id, atom_id_1, atom_id_2], axis=-1), axis=0, return_index=True
1065
+ )
1066
+ unique_indices.sort()
1067
+
1068
+ chem_comp_bond = Category()
1069
+ n_bonds = len(unique_indices)
1070
+ chem_comp_bond["pdbx_ordinal"] = np.arange(1, n_bonds + 1, dtype=np.int32)
1071
+ chem_comp_bond["comp_id"] = comp_id[unique_indices]
1072
+ chem_comp_bond["atom_id_1"] = atom_id_1[unique_indices]
1073
+ chem_comp_bond["atom_id_2"] = atom_id_2[unique_indices]
973
1074
  chem_comp_bond["value_order"] = Column(
974
- value_order, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
1075
+ value_order[unique_indices],
1076
+ np.where(any_mask[unique_indices], MaskValue.MISSING, MaskValue.PRESENT),
975
1077
  )
976
1078
  chem_comp_bond["pdbx_aromatic_flag"] = Column(
977
- aromatic_flag, np.where(any_mask, MaskValue.MISSING, MaskValue.PRESENT)
1079
+ aromatic_flag[unique_indices],
1080
+ np.where(any_mask[unique_indices], MaskValue.MISSING, MaskValue.PRESENT),
978
1081
  )
979
1082
  # BondList does not contain stereo information
980
1083
  # -> all values are missing
981
1084
  chem_comp_bond["pdbx_stereo_config"] = Column(
982
- np.zeros(len(bond_array), dtype="U1"),
983
- np.full(len(bond_array), MaskValue.MISSING),
1085
+ np.zeros(n_bonds, dtype="U1"),
1086
+ np.full(n_bonds, MaskValue.MISSING),
984
1087
  )
985
- chem_comp_bond["pdbx_ordinal"] = np.arange(1, len(bond_array) + 1, dtype=np.int32)
986
1088
  return chem_comp_bond
987
1089
 
988
1090
 
@@ -1007,13 +1109,22 @@ def _set_inter_residue_bonds(array, atom_site):
1007
1109
  bond_array = _filter_bonds(array, "inter")
1008
1110
  if len(bond_array) == 0:
1009
1111
  return None
1112
+
1113
+ # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
1114
+ # nucleotide/amino acid residues
1115
+ bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
1116
+ if len(bond_array) == 0:
1117
+ return None
1118
+
1010
1119
  struct_conn = Category()
1011
1120
  struct_conn["id"] = np.arange(1, len(bond_array) + 1)
1012
- struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
1121
+ struct_conn["conn_type_id"] = [
1122
+ PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
1123
+ ]
1013
1124
  struct_conn["pdbx_value_order"] = Column(
1014
1125
  np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
1015
1126
  np.where(
1016
- bond_array[:, 2] == BondType.ANY,
1127
+ np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
1017
1128
  MaskValue.MISSING,
1018
1129
  MaskValue.PRESENT,
1019
1130
  ),
@@ -1049,6 +1160,27 @@ def _filter_bonds(array, connection):
1049
1160
  raise ValueError("Invalid 'connection' option")
1050
1161
 
1051
1162
 
1163
def _filter_canonical_links(array, bond_array):
    """
    Identify 'standard' backbone links between adjacent canonical residues.

    A bond is flagged if both bonded atoms belong to residues listed in
    ``CANONICAL_RESIDUE_LIST`` (canonical amino acids and nucleotides),
    the first atom is named ``C`` or ``O3'``, the second atom is named
    ``N`` or ``P`` and the residue of the second atom directly follows
    the residue of the first atom.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure the bonds refer to.
    bond_array : ndarray
        The bonds to check.
        The first two columns are used as atom indices into `array`.

    Returns
    -------
    mask : ndarray, dtype=bool
        True for each bond in `bond_array` that is such a canonical link.
    """
    # Get the residue index for each bonded atom
    residue_indices = get_residue_positions(
        array, bond_array[:, :2].flatten()
    ).reshape(-1, 2)

    return (
        # Must be canonical residues
        np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
        np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
        # Must be backbone bond
        np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
        np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
        # Must connect adjacent residues
        # The parentheses are required: '-' and '&' bind tighter than '==',
        # so without them the whole '&' chain would be combined bitwise with
        # the residue distance before the comparison, accepting any odd
        # (or negative odd) distance as 'adjacent'
        (residue_indices[:, 1] - residue_indices[:, 0] == 1)
    )  # fmt: skip
1182
+
1183
+
1052
1184
  def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
1053
1185
  """
1054
1186
  Create an :class:`AtomArray` for a chemical component from the
@@ -1135,12 +1267,11 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1135
1267
 
1136
1268
  array = AtomArray(atom_category.row_count)
1137
1269
 
1138
- array.hetero[:] = True
1139
- array.res_name = atom_category["comp_id"].as_array("U5")
1140
- array.atom_name = atom_category["atom_id"].as_array("U6")
1141
- array.element = atom_category["type_symbol"].as_array("U2")
1142
- array.add_annotation("charge", int)
1143
- array.charge = atom_category["charge"].as_array(int, 0)
1270
+ array.set_annotation("hetero", np.full(len(atom_category["comp_id"]), True))
1271
+ array.set_annotation("res_name", atom_category["comp_id"].as_array(str))
1272
+ array.set_annotation("atom_name", atom_category["atom_id"].as_array(str))
1273
+ array.set_annotation("element", atom_category["type_symbol"].as_array(str))
1274
+ array.set_annotation("charge", atom_category["charge"].as_array(int, 0))
1144
1275
 
1145
1276
  coord_fields = [f"pdbx_model_Cartn_{dim}_ideal" for dim in ("x", "y", "z")]
1146
1277
  alt_coord_fields = [f"model_Cartn_{dim}" for dim in ("x", "y", "z")]
@@ -1148,17 +1279,28 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1148
1279
  # Swap with the fallback option
1149
1280
  coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
1150
1281
  try:
1151
- for i, field in enumerate(coord_fields):
1152
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1153
- except KeyError as err:
1154
- key = err.args[0]
1155
- warnings.warn(
1156
- f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1157
- f"The fallback coordinates will be used instead",
1158
- UserWarning,
1282
+ array.coord = _parse_component_coordinates(
1283
+ [atom_category[field] for field in coord_fields]
1284
+ )
1285
+ except Exception as err:
1286
+ if isinstance(err, KeyError):
1287
+ key = err.args[0]
1288
+ warnings.warn(
1289
+ f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1290
+ f"The fallback coordinates will be used instead",
1291
+ UserWarning,
1292
+ )
1293
+ elif isinstance(err, ValueError):
1294
+ warnings.warn(
1295
+ "The coordinates are missing for some atoms. "
1296
+ "The fallback coordinates will be used instead",
1297
+ UserWarning,
1298
+ )
1299
+ else:
1300
+ raise
1301
+ array.coord = _parse_component_coordinates(
1302
+ [atom_category[field] for field in alt_coord_fields]
1159
1303
  )
1160
- for i, field in enumerate(alt_coord_fields):
1161
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1162
1304
 
1163
1305
  try:
1164
1306
  bond_category = block["chem_comp_bond"]
@@ -1188,6 +1330,17 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1188
1330
  return array
1189
1331
 
1190
1332
 
1333
def _parse_component_coordinates(coord_columns):
    """
    Assemble a float32 coordinate array from per-dimension columns.

    Parameters
    ----------
    coord_columns : list
        The columns holding the coordinate values, one per dimension.
        Each column is written into the corresponding column of the
        returned array.

    Returns
    -------
    coord : ndarray, shape=(n,3), dtype=float32
        The coordinates, where *n* is the length of the first column.

    Raises
    ------
    ValueError
        If any column has a mask containing a nonzero entry.
    """
    n_atoms = len(coord_columns[0])
    coord = np.zeros((n_atoms, 3), dtype=np.float32)
    for dim, col in enumerate(coord_columns):
        mask = col.mask
        # A column without mask is taken as fully present
        has_missing = mask is not None and mask.array.any()
        if has_missing:
            raise ValueError("Missing coordinates for some atoms")
        coord[:, dim] = col.as_array(np.float32)
    return coord
1342
+
1343
+
1191
1344
  def set_component(pdbx_file, array, data_block=None):
1192
1345
  """
1193
1346
  Set the ``chem_comp_atom`` and, if bonds are available,
@@ -1404,7 +1557,10 @@ def get_assembly(
1404
1557
  Returns
1405
1558
  -------
1406
1559
  assembly : AtomArray or AtomArrayStack
1407
- The assembly. The return type depends on the `model` parameter.
1560
+ The assembly.
1561
+ The return type depends on the `model` parameter.
1562
+ Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
1563
+ unit in the assembly.
1408
1564
 
1409
1565
  Examples
1410
1566
  --------
@@ -1493,7 +1649,6 @@ def _apply_transformations(structure, transformation_dict, operations):
1493
1649
  """
1494
1650
  # Additional first dimension for 'structure.repeat()'
1495
1651
  assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
1496
-
1497
1652
  # Apply corresponding transformation for each copy in the assembly
1498
1653
  for i, operation in enumerate(operations):
1499
1654
  coord = structure.coord
@@ -1507,7 +1662,11 @@ def _apply_transformations(structure, transformation_dict, operations):
1507
1662
  coord += translation_vector
1508
1663
  assembly_coord[i] = coord
1509
1664
 
1510
- return repeat(structure, assembly_coord)
1665
+ assembly = repeat(structure, assembly_coord)
1666
+ assembly.set_annotation(
1667
+ "sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
1668
+ )
1669
+ return assembly
1511
1670
 
1512
1671
 
1513
1672
  def _get_transformations(struct_oper):