biotite 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (90) hide show
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  20. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  26. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  28. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  37. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +129 -40
  51. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  54. biotite/structure/charges.cpython-311-darwin.so +0 -0
  55. biotite/structure/geometry.py +60 -113
  56. biotite/structure/info/__init__.py +1 -0
  57. biotite/structure/info/atoms.py +13 -13
  58. biotite/structure/info/bonds.py +12 -6
  59. biotite/structure/info/ccd.py +125 -32
  60. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  61. biotite/structure/info/groups.py +63 -17
  62. biotite/structure/info/masses.py +9 -6
  63. biotite/structure/info/misc.py +15 -21
  64. biotite/structure/info/standardize.py +3 -2
  65. biotite/structure/io/mol/sdf.py +41 -40
  66. biotite/structure/io/pdb/convert.py +2 -0
  67. biotite/structure/io/pdb/file.py +74 -3
  68. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  69. biotite/structure/io/pdbqt/file.py +32 -32
  70. biotite/structure/io/pdbx/__init__.py +1 -0
  71. biotite/structure/io/pdbx/bcif.py +32 -8
  72. biotite/structure/io/pdbx/cif.py +72 -59
  73. biotite/structure/io/pdbx/component.py +9 -4
  74. biotite/structure/io/pdbx/compress.py +321 -0
  75. biotite/structure/io/pdbx/convert.py +194 -48
  76. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  77. biotite/structure/io/pdbx/encoding.pyx +98 -17
  78. biotite/structure/molecules.py +141 -141
  79. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  80. biotite/structure/segments.py +1 -2
  81. biotite/structure/util.py +73 -1
  82. biotite/version.py +2 -2
  83. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/METADATA +3 -1
  84. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/RECORD +86 -76
  85. biotite/structure/info/ccd/README.rst +0 -8
  86. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  87. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  88. biotite/structure/info/ccd/nucleotides.txt +0 -798
  89. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  90. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -24,6 +24,10 @@ from biotite.structure.atoms import AtomArray, AtomArrayStack, repeat
24
24
  from biotite.structure.bonds import BondList, BondType, connect_via_residue_names
25
25
  from biotite.structure.box import unitcell_from_vectors, vectors_from_unitcell
26
26
  from biotite.structure.error import BadStructureError
27
+ from biotite.structure.filter import _canonical_aa_list as canonical_aa_list
28
+ from biotite.structure.filter import (
29
+ _canonical_nucleotide_list as canonical_nucleotide_list,
30
+ )
27
31
  from biotite.structure.filter import (
28
32
  filter_first_altloc,
29
33
  filter_highest_occupancy_altloc,
@@ -36,32 +40,38 @@ from biotite.structure.io.pdbx.bcif import (
36
40
  from biotite.structure.io.pdbx.cif import CIFBlock, CIFFile
37
41
  from biotite.structure.io.pdbx.component import MaskValue
38
42
  from biotite.structure.io.pdbx.encoding import StringArrayEncoding
39
- from biotite.structure.residues import get_residue_count, get_residue_starts_for
43
+ from biotite.structure.residues import (
44
+ get_residue_count,
45
+ get_residue_positions,
46
+ get_residue_starts_for,
47
+ )
40
48
  from biotite.structure.util import matrix_rotate
41
49
 
42
- # Cond types in `struct_conn` category that refer to covalent bonds
43
- PDBX_COVALENT_TYPES = [
44
- "covale",
45
- "covale_base",
46
- "covale_phosphate",
47
- "covale_sugar",
48
- "disulf",
49
- "modres",
50
- "modres_link",
51
- "metalc",
52
- ]
53
- # Map 'struct_conn' bond orders to 'BondType'...
54
- PDBX_BOND_ORDER_TO_TYPE = {
55
- "": BondType.ANY,
56
- "sing": BondType.SINGLE,
57
- "doub": BondType.DOUBLE,
58
- "trip": BondType.TRIPLE,
59
- "quad": BondType.QUADRUPLE,
50
+ # Bond types in `struct_conn` category that refer to covalent bonds
51
+ PDBX_BOND_TYPE_ID_TO_TYPE = {
52
+ # Although a covalent bond could in theory have a higher bond order,
53
+ # practically inter-residue bonds are always single
54
+ "covale": BondType.SINGLE,
55
+ "covale_base": BondType.SINGLE,
56
+ "covale_phosphate": BondType.SINGLE,
57
+ "covale_sugar": BondType.SINGLE,
58
+ "disulf": BondType.SINGLE,
59
+ "modres": BondType.SINGLE,
60
+ "modres_link": BondType.SINGLE,
61
+ "metalc": BondType.COORDINATION,
62
+ }
63
+ PDBX_BOND_TYPE_TO_TYPE_ID = {
64
+ BondType.ANY: "covale",
65
+ BondType.SINGLE: "covale",
66
+ BondType.DOUBLE: "covale",
67
+ BondType.TRIPLE: "covale",
68
+ BondType.QUADRUPLE: "covale",
69
+ BondType.AROMATIC_SINGLE: "covale",
70
+ BondType.AROMATIC_DOUBLE: "covale",
71
+ BondType.AROMATIC_TRIPLE: "covale",
72
+ BondType.COORDINATION: "metalc",
60
73
  }
61
- # ...and vice versa
62
74
  PDBX_BOND_TYPE_TO_ORDER = {
63
- # 'ANY' is masked later, it is merely added here to avoid a KeyError
64
- BondType.ANY: "",
65
75
  BondType.SINGLE: "sing",
66
76
  BondType.DOUBLE: "doub",
67
77
  BondType.TRIPLE: "trip",
@@ -69,6 +79,9 @@ PDBX_BOND_TYPE_TO_ORDER = {
69
79
  BondType.AROMATIC_SINGLE: "sing",
70
80
  BondType.AROMATIC_DOUBLE: "doub",
71
81
  BondType.AROMATIC_TRIPLE: "trip",
82
+ # These are masked later; they are merely added here to avoid a KeyError
83
+ BondType.ANY: "",
84
+ BondType.COORDINATION: "",
72
85
  }
73
86
  # Map 'chem_comp_bond' bond orders and aromaticity to 'BondType'...
74
87
  COMP_BOND_ORDER_TO_TYPE = {
@@ -84,6 +97,7 @@ COMP_BOND_ORDER_TO_TYPE = {
84
97
  COMP_BOND_TYPE_TO_ORDER = {
85
98
  bond_type: order for order, bond_type in COMP_BOND_ORDER_TO_TYPE.items()
86
99
  }
100
+ CANONICAL_RESIDUE_LIST = canonical_aa_list + canonical_nucleotide_list
87
101
 
88
102
  _proteinseq_type_list = ["polypeptide(D)", "polypeptide(L)"]
89
103
  _nucleotideseq_type_list = [
@@ -475,16 +489,53 @@ def _fill_annotations(array, atom_site, extra_fields, use_author_fields):
475
489
  array.set_annotation("element", atom_site["type_symbol"].as_array(str))
476
490
 
477
491
  if "atom_id" in extra_fields:
478
- array.set_annotation("atom_id", atom_site["id"].as_array(int))
492
+ if "id" in atom_site:
493
+ array.set_annotation("atom_id", atom_site["id"].as_array(int))
494
+ else:
495
+ warnings.warn(
496
+ "Missing 'id' in 'atom_site' category. 'atom_id' generated automatically.",
497
+ UserWarning,
498
+ )
499
+ array.set_annotation("atom_id", np.arange(array.array_length()))
479
500
  extra_fields.remove("atom_id")
480
501
  if "b_factor" in extra_fields:
481
- array.set_annotation("b_factor", atom_site["B_iso_or_equiv"].as_array(float))
502
+ if "B_iso_or_equiv" in atom_site:
503
+ array.set_annotation(
504
+ "b_factor", atom_site["B_iso_or_equiv"].as_array(float)
505
+ )
506
+ else:
507
+ warnings.warn(
508
+ "Missing 'B_iso_or_equiv' in 'atom_site' category. 'b_factor' will be set to `nan`.",
509
+ UserWarning,
510
+ )
511
+ array.set_annotation("b_factor", np.full(array.array_length(), np.nan))
482
512
  extra_fields.remove("b_factor")
483
513
  if "occupancy" in extra_fields:
484
- array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
514
+ if "occupancy" in atom_site:
515
+ array.set_annotation("occupancy", atom_site["occupancy"].as_array(float))
516
+ else:
517
+ warnings.warn(
518
+ "Missing 'occupancy' in 'atom_site' category. 'occupancy' will be assumed to be 1.0",
519
+ UserWarning,
520
+ )
521
+ array.set_annotation(
522
+ "occupancy", np.ones(array.array_length(), dtype=float)
523
+ )
485
524
  extra_fields.remove("occupancy")
486
525
  if "charge" in extra_fields:
487
- array.set_annotation("charge", atom_site["pdbx_formal_charge"].as_array(int, 0))
526
+ if "pdbx_formal_charge" in atom_site:
527
+ array.set_annotation(
528
+ "charge",
529
+ atom_site["pdbx_formal_charge"].as_array(
530
+ int, 0
531
+ ), # masked values are set to 0
532
+ )
533
+ else:
534
+ warnings.warn(
535
+ "Missing 'pdbx_formal_charge' in 'atom_site' category. 'charge' will be set to 0",
536
+ UserWarning,
537
+ )
538
+ array.set_annotation("charge", np.zeros(array.array_length(), dtype=int))
488
539
  extra_fields.remove("charge")
489
540
 
490
541
  # Handle all remaining custom fields
@@ -536,7 +587,8 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
536
587
  ]
537
588
 
538
589
  covale_mask = np.isin(
539
- struct_conn["conn_type_id"].as_array(str), PDBX_COVALENT_TYPES
590
+ struct_conn["conn_type_id"].as_array(str),
591
+ list(PDBX_BOND_TYPE_ID_TO_TYPE.keys()),
540
592
  )
541
593
  if "ptnr1_symmetry" in struct_conn:
542
594
  covale_mask &= struct_conn["ptnr1_symmetry"].as_array(str, IDENTITY) == IDENTITY
@@ -576,13 +628,14 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
576
628
  atoms_indices_1 = atoms_indices_1[mapping_exists_mask]
577
629
  atoms_indices_2 = atoms_indices_2[mapping_exists_mask]
578
630
 
579
- # Interpret missing values as ANY bonds
580
- bond_order = struct_conn["pdbx_value_order"].as_array(str, "")
631
+ bond_type_id = struct_conn["conn_type_id"].as_array()
581
632
  # Consecutively apply the same masks as applied to the atom indices
582
633
  # Logical combination does not work here,
583
634
  # as the second mask was created based on already filtered data
584
- bond_order = bond_order[covale_mask][mapping_exists_mask]
585
- bond_types = [PDBX_BOND_ORDER_TO_TYPE[order] for order in bond_order]
635
+ bond_type_id = bond_type_id[covale_mask][mapping_exists_mask]
636
+ # The type ID is always present in the dictionary,
637
+ # as it was used to filter the applicable bonds
638
+ bond_types = [PDBX_BOND_TYPE_ID_TO_TYPE[type_id] for type_id in bond_type_id]
586
639
 
587
640
  return BondList(
588
641
  atom_site.row_count,
@@ -593,7 +646,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
593
646
  def _find_matches(query_arrays, reference_arrays):
594
647
  """
595
648
  For each index in the `query_arrays` find the indices in the
596
- `reference_arrays` where all query values the reference counterpart.
649
+ `reference_arrays` where all query values match the reference counterpart.
597
650
  If no match is found for a query, the corresponding index is -1.
598
651
  """
599
652
  match_masks_for_all_columns = np.stack(
@@ -703,7 +756,13 @@ def _get_box(block):
703
756
  return vectors_from_unitcell(len_a, len_b, len_c, alpha, beta, gamma)
704
757
 
705
758
 
706
- def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
759
+ def set_structure(
760
+ pdbx_file,
761
+ array,
762
+ data_block=None,
763
+ include_bonds=False,
764
+ extra_fields=[],
765
+ ):
707
766
  """
708
767
  Set the ``atom_site`` category with atom information from an
709
768
  :class:`AtomArray` or :class:`AtomArrayStack`.
@@ -737,6 +796,10 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
737
796
  category.
738
797
  Inter-residue bonds will be written into the ``struct_conn``
739
798
  independent of this parameter.
799
+ extra_fields : list of str, optional
800
+ List of additional annotations of `array` that should be written
801
+ as extra columns into the ``atom_site`` category of the file.
802
+ Default is an empty list.
740
803
 
741
804
  Notes
742
805
  -----
@@ -797,6 +860,32 @@ def set_structure(pdbx_file, array, data_block=None, include_bonds=False):
797
860
  np.where(array.charge == 0, MaskValue.MISSING, MaskValue.PRESENT),
798
861
  )
799
862
 
863
+ # Handle all remaining custom fields
864
+ if len(extra_fields) > 0:
865
+ # ... check to avoid clashes with standard annotations
866
+ _standard_annotations = [
867
+ "hetero",
868
+ "element",
869
+ "atom_name",
870
+ "res_name",
871
+ "chain_id",
872
+ "res_id",
873
+ "ins_code",
874
+ "atom_id",
875
+ "b_factor",
876
+ "occupancy",
877
+ "charge",
878
+ ]
879
+ _reserved_annotation_names = list(atom_site.keys()) + _standard_annotations
880
+
881
+ for annot in extra_fields:
882
+ if annot in _reserved_annotation_names:
883
+ raise ValueError(
884
+ f"Annotation name '{annot}' is reserved and cannot be written to as extra field. "
885
+ "Please choose another name."
886
+ )
887
+ atom_site[annot] = np.copy(array.get_annotation(annot))
888
+
800
889
  if array.bonds is not None:
801
890
  struct_conn = _set_inter_residue_bonds(array, atom_site)
802
891
  if struct_conn is not None:
@@ -1021,13 +1110,21 @@ def _set_inter_residue_bonds(array, atom_site):
1021
1110
  if len(bond_array) == 0:
1022
1111
  return None
1023
1112
 
1113
+ # Filter out 'standard' links, i.e. backbone bonds between adjacent canonical
1114
+ # nucleotide/amino acid residues
1115
+ bond_array = bond_array[~_filter_canonical_links(array, bond_array)]
1116
+ if len(bond_array) == 0:
1117
+ return None
1118
+
1024
1119
  struct_conn = Category()
1025
1120
  struct_conn["id"] = np.arange(1, len(bond_array) + 1)
1026
- struct_conn["conn_type_id"] = np.full(len(bond_array), "covale")
1121
+ struct_conn["conn_type_id"] = [
1122
+ PDBX_BOND_TYPE_TO_TYPE_ID[btype] for btype in bond_array[:, 2]
1123
+ ]
1027
1124
  struct_conn["pdbx_value_order"] = Column(
1028
1125
  np.array([PDBX_BOND_TYPE_TO_ORDER[btype] for btype in bond_array[:, 2]]),
1029
1126
  np.where(
1030
- bond_array[:, 2] == BondType.ANY,
1127
+ np.isin(bond_array[:, 2], (BondType.ANY, BondType.COORDINATION)),
1031
1128
  MaskValue.MISSING,
1032
1129
  MaskValue.PRESENT,
1033
1130
  ),
@@ -1063,6 +1160,27 @@ def _filter_bonds(array, connection):
1063
1160
  raise ValueError("Invalid 'connection' option")
1064
1161
 
1065
1162
 
1163
+ def _filter_canonical_links(array, bond_array):
1164
+ """
1165
+ Create a mask that is true for backbone bonds between adjacent canonical amino acid or nucleotide residues.
1166
+ """
1167
+ # Get the residue index for each bonded atom
1168
+ residue_indices = get_residue_positions(array, bond_array[:, :2].flatten()).reshape(
1169
+ -1, 2
1170
+ )
1171
+
1172
+ return (
1173
+ # Must be canonical residues
1174
+ np.isin(array.res_name[bond_array[:, 0]], CANONICAL_RESIDUE_LIST) &
1175
+ np.isin(array.res_name[bond_array[:, 1]], CANONICAL_RESIDUE_LIST) &
1176
+ # Must be backbone bond
1177
+ np.isin(array.atom_name[bond_array[:, 0]], ("C", "O3'")) &
1178
+ np.isin(array.atom_name[bond_array[:, 1]], ("N", "P")) &
1179
+ # Must connect adjacent residues
1180
+ residue_indices[:, 1] - residue_indices[:, 0] == 1
1181
+ ) # fmt: skip
1182
+
1183
+
1066
1184
  def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=None):
1067
1185
  """
1068
1186
  Create an :class:`AtomArray` for a chemical component from the
@@ -1161,17 +1279,28 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1161
1279
  # Swap with the fallback option
1162
1280
  coord_fields, alt_coord_fields = alt_coord_fields, coord_fields
1163
1281
  try:
1164
- for i, field in enumerate(coord_fields):
1165
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1166
- except KeyError as err:
1167
- key = err.args[0]
1168
- warnings.warn(
1169
- f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1170
- f"The fallback coordinates will be used instead",
1171
- UserWarning,
1282
+ array.coord = _parse_component_coordinates(
1283
+ [atom_category[field] for field in coord_fields]
1284
+ )
1285
+ except Exception as err:
1286
+ if isinstance(err, KeyError):
1287
+ key = err.args[0]
1288
+ warnings.warn(
1289
+ f"Attribute '{key}' not found within 'chem_comp_atom' category. "
1290
+ f"The fallback coordinates will be used instead",
1291
+ UserWarning,
1292
+ )
1293
+ elif isinstance(err, ValueError):
1294
+ warnings.warn(
1295
+ "The coordinates are missing for some atoms. "
1296
+ "The fallback coordinates will be used instead",
1297
+ UserWarning,
1298
+ )
1299
+ else:
1300
+ raise
1301
+ array.coord = _parse_component_coordinates(
1302
+ [atom_category[field] for field in alt_coord_fields]
1172
1303
  )
1173
- for i, field in enumerate(alt_coord_fields):
1174
- array.coord[:, i] = atom_category[field].as_array(np.float32)
1175
1304
 
1176
1305
  try:
1177
1306
  bond_category = block["chem_comp_bond"]
@@ -1201,6 +1330,17 @@ def get_component(pdbx_file, data_block=None, use_ideal_coord=True, res_name=Non
1201
1330
  return array
1202
1331
 
1203
1332
 
1333
+ def _parse_component_coordinates(coord_columns):
1334
+ coord = np.zeros((len(coord_columns[0]), 3), dtype=np.float32)
1335
+ for i, column in enumerate(coord_columns):
1336
+ if column.mask is not None and column.mask.array.any():
1337
+ raise ValueError(
1338
+ "Missing coordinates for some atoms",
1339
+ )
1340
+ coord[:, i] = column.as_array(np.float32)
1341
+ return coord
1342
+
1343
+
1204
1344
  def set_component(pdbx_file, array, data_block=None):
1205
1345
  """
1206
1346
  Set the ``chem_comp_atom`` and, if bonds are available,
@@ -1417,7 +1557,10 @@ def get_assembly(
1417
1557
  Returns
1418
1558
  -------
1419
1559
  assembly : AtomArray or AtomArrayStack
1420
- The assembly. The return type depends on the `model` parameter.
1560
+ The assembly.
1561
+ The return type depends on the `model` parameter.
1562
+ Contains the `sym_id` annotation, which enumerates the copies of the asymmetric
1563
+ unit in the assembly.
1421
1564
 
1422
1565
  Examples
1423
1566
  --------
@@ -1506,7 +1649,6 @@ def _apply_transformations(structure, transformation_dict, operations):
1506
1649
  """
1507
1650
  # Additional first dimension for 'structure.repeat()'
1508
1651
  assembly_coord = np.zeros((len(operations),) + structure.coord.shape)
1509
-
1510
1652
  # Apply corresponding transformation for each copy in the assembly
1511
1653
  for i, operation in enumerate(operations):
1512
1654
  coord = structure.coord
@@ -1520,7 +1662,11 @@ def _apply_transformations(structure, transformation_dict, operations):
1520
1662
  coord += translation_vector
1521
1663
  assembly_coord[i] = coord
1522
1664
 
1523
- return repeat(structure, assembly_coord)
1665
+ assembly = repeat(structure, assembly_coord)
1666
+ assembly.set_annotation(
1667
+ "sym_id", np.repeat(np.arange(len(operations)), structure.array_length())
1668
+ )
1669
+ return assembly
1524
1670
 
1525
1671
 
1526
1672
  def _get_transformations(struct_oper):
@@ -287,7 +287,8 @@ class FixedPointEncoding(Encoding):
287
287
  The data type of the array to be encoded.
288
288
  Either a NumPy dtype or a *BinaryCIF* type code is accepted.
289
289
  The dtype must be a float type.
290
- If omitted, 32-bit floats are assumed.
290
+ If omitted, the data type is taken from the data the
291
+ first time :meth:`encode()` is called.
291
292
 
292
293
  Attributes
293
294
  ----------
@@ -304,7 +305,7 @@ class FixedPointEncoding(Encoding):
304
305
  [987 654]
305
306
  """
306
307
  factor: ...
307
- src_type: ... = TypeCode.FLOAT32
308
+ src_type: ... = None
308
309
 
309
310
  def __post_init__(self):
310
311
  if self.src_type is not None:
@@ -315,6 +316,14 @@ class FixedPointEncoding(Encoding):
315
316
  )
316
317
 
317
318
  def encode(self, data):
319
+ # If not given in constructor, it is determined from the data
320
+ if self.src_type is None:
321
+ self.src_type = TypeCode.from_dtype(data.dtype)
322
+ if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
323
+ raise ValueError(
324
+ "Only floating point types are supported"
325
+ )
326
+
318
327
  # Round to avoid wrong values due to floating point inaccuracies
319
328
  return np.round(data * self.factor).astype(np.int32)
320
329
 
@@ -340,7 +349,8 @@ class IntervalQuantizationEncoding(Encoding):
340
349
  The data type of the array to be encoded.
341
350
  Either a NumPy dtype or a *BinaryCIF* type code is accepted.
342
351
  The dtype must be a float type.
343
- If omitted, 32-bit floats are assumed.
352
+ If omitted, the data type is taken from the data the
353
+ first time :meth:`encode()` is called.
344
354
 
345
355
  Attributes
346
356
  ----------
@@ -367,13 +377,17 @@ class IntervalQuantizationEncoding(Encoding):
367
377
  min: ...
368
378
  max: ...
369
379
  num_steps: ...
370
- src_type: ... = TypeCode.FLOAT32
380
+ src_type: ... = None
371
381
 
372
382
  def __post_init__(self):
373
383
  if self.src_type is not None:
374
384
  self.src_type = TypeCode.from_dtype(self.src_type)
375
385
 
376
386
  def encode(self, data):
387
+ # If not given in constructor, it is determined from the data
388
+ if self.src_type is None:
389
+ self.src_type = TypeCode.from_dtype(data.dtype)
390
+
377
391
  steps = np.linspace(
378
392
  self.min, self.max, self.num_steps, dtype=data.dtype
379
393
  )
@@ -524,7 +538,8 @@ class DeltaEncoding(Encoding):
524
538
  first time :meth:`encode()` is called.
525
539
  origin : int, optional
526
540
  The starting value from which the differences are calculated.
527
- If omitted, the origin is set to 0.
541
+ If omitted, the value is taken from the first array element the
542
+ first time :meth:`encode()` is called.
528
543
 
529
544
  Attributes
530
545
  ----------
@@ -535,11 +550,14 @@ class DeltaEncoding(Encoding):
535
550
  --------
536
551
 
537
552
  >>> data = np.array([1, 1, 2, 3, 5, 8])
538
- >>> print(DeltaEncoding().encode(data))
539
- [1 0 1 1 2 3]
553
+ >>> encoding = DeltaEncoding()
554
+ >>> print(encoding.encode(data))
555
+ [0 0 1 1 2 3]
556
+ >>> print(encoding.origin)
557
+ 1
540
558
  """
541
559
  src_type: ... = None
542
- origin: ... = 0
560
+ origin: ... = None
543
561
 
544
562
  def __post_init__(self):
545
563
  if self.src_type is not None:
@@ -549,6 +567,8 @@ class DeltaEncoding(Encoding):
549
567
  # If not given in constructor, it is determined from the data
550
568
  if self.src_type is None:
551
569
  self.src_type = TypeCode.from_dtype(data.dtype)
570
+ if self.origin is None:
571
+ self.origin = data[0]
552
572
 
553
573
  data = data - self.origin
554
574
  return np.diff(data, prepend=0).astype(np.int32, copy=False)
@@ -582,7 +602,8 @@ class IntegerPackingEncoding(Encoding):
582
602
  is_unsigned : bool, optional
583
603
  Whether the values should be packed into signed or unsigned
584
604
  integers.
585
- If omitted, the values are packed into signed integers.
605
+ If omitted, the first time :meth:`encode()` is called it is determined whether
606
+ the values fit into unsigned integers.
586
607
 
587
608
  Attributes
588
609
  ----------
@@ -601,7 +622,7 @@ class IntegerPackingEncoding(Encoding):
601
622
  """
602
623
  byte_count: ...
603
624
  src_size: ... = None
604
- is_unsigned: ... = False
625
+ is_unsigned: ... = None
605
626
 
606
627
  def encode(self, data):
607
628
  if self.src_size is None:
@@ -610,6 +631,9 @@ class IntegerPackingEncoding(Encoding):
610
631
  raise IndexError(
611
632
  "Given source size does not match actual data size"
612
633
  )
634
+ if self.is_unsigned is None:
635
+ # Only positive values -> use unsigned integers
636
+ self.is_unsigned = data.min().item() >= 0
613
637
 
614
638
  data = data.astype(np.int32, copy=False)
615
639
  return self._encode(
@@ -672,7 +696,7 @@ class IntegerPackingEncoding(Encoding):
672
696
  # Get length of output array
673
697
  # by summing up required length of each element
674
698
  cdef int number
675
- cdef int length = 0
699
+ cdef long length = 0
676
700
  for i in range(data.shape[0]):
677
701
  number = data[i]
678
702
  if number < 0:
@@ -750,7 +774,7 @@ class StringArrayEncoding(Encoding):
750
774
  If omitted, the unique strings are determined from the data the
751
775
  first time :meth:`encode()` is called.
752
776
  data_encoding : list of Encoding, optional
753
- The encodings that are applied to the indiy array.
777
+ The encodings that are applied to the index array.
754
778
  If omitted, the array is directly encoded into bytes without
755
779
  further compression.
756
780
  offset_encoding : list of Encoding, optional
@@ -837,8 +861,11 @@ class StringArrayEncoding(Encoding):
837
861
  raise TypeError("Data must be of string type")
838
862
 
839
863
  if self.strings is None:
840
- # 'unique()' already sorts the strings
841
- self.strings = np.unique(data)
864
+ # 'unique()' already sorts the strings, but this is not necessarily
865
+ # desired, as this makes efficient encoding of the indices more difficult
866
+ # -> Bring into the original order
867
+ _, unique_indices = np.unique(data, return_index=True)
868
+ self.strings = data[np.sort(unique_indices)]
842
869
  check_present = False
843
870
  else:
844
871
  check_present = True
@@ -888,6 +915,19 @@ _encoding_classes_kinds = {
888
915
 
889
916
 
890
917
  def deserialize_encoding(content):
918
+ """
919
+ Create a :class:`Encoding` by deserializing the given *BinaryCIF* content.
920
+
921
+ Parameters
922
+ ----------
923
+ content : dict
924
+ The encoding represented as *BinaryCIF* dictionary.
925
+
926
+ Returns
927
+ -------
928
+ encoding : Encoding
929
+ The deserialized encoding.
930
+ """
891
931
  try:
892
932
  encoding_class = _encoding_classes[content["kind"]]
893
933
  except KeyError:
@@ -898,28 +938,69 @@ def deserialize_encoding(content):
898
938
 
899
939
 
900
940
  def create_uncompressed_encoding(array):
901
- dtype = array.dtype
941
+ """
942
+ Create a simple encoding for the given array that does not compress the data.
902
943
 
903
- if np.issubdtype(dtype, np.str_):
944
+ Parameters
945
+ ----------
946
+ array : ndarray
947
+ The array to create the encoding for.
948
+
949
+ Returns
950
+ -------
951
+ encoding : list of Encoding
952
+ The encoding for the data.
953
+ """
954
+ if np.issubdtype(array.dtype, np.str_):
904
955
  return [StringArrayEncoding()]
905
956
  else:
906
957
  return [ByteArrayEncoding()]
907
958
 
908
959
 
909
960
  def encode_stepwise(data, encoding):
961
+ """
962
+ Apply a list of encodings stepwise to the given data.
963
+
964
+ Parameters
965
+ ----------
966
+ data : ndarray
967
+ The data to be encoded.
968
+ encoding : list of Encoding
969
+ The encodings to be applied.
970
+
971
+ Returns
972
+ -------
973
+ encoded_data : ndarray or bytes
974
+ The encoded data.
975
+ """
910
976
  for encoding in encoding:
911
977
  data = encoding.encode(data)
912
978
  return data
913
979
 
914
980
 
915
981
  def decode_stepwise(data, encoding):
982
+ """
983
+ Decode the given data by applying a list of encodings stepwise in reverse order.
984
+
985
+ Parameters
986
+ ----------
987
+ data : ndarray or bytes
988
+ The data to be decoded.
989
+ encoding : list of Encoding
990
+ The encodings to be applied.
991
+
992
+ Returns
993
+ -------
994
+ decoded_data : ndarray
995
+ The decoded data.
996
+ """
916
997
  for enc in reversed(encoding):
917
998
  data = enc.decode(data)
918
999
  return data
919
1000
 
920
1001
 
921
1002
  def _camel_to_snake_case(attribute_name):
922
- return re.sub(CAMEL_CASE_PATTERN, "_", attribute_name).lower()
1003
+ return CAMEL_CASE_PATTERN.sub("_", attribute_name).lower()
923
1004
 
924
1005
 
925
1006
  def _snake_to_camel_case(attribute_name):