biotite 1.0.1__cp311-cp311-win_amd64.whl → 1.1.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (90) hide show
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cp311-win_amd64.pyd +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cp311-win_amd64.pyd +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cp311-win_amd64.pyd +0 -0
  17. biotite/sequence/align/kmertable.cp311-win_amd64.pyd +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cp311-win_amd64.pyd +0 -0
  20. biotite/sequence/align/localungapped.cp311-win_amd64.pyd +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cp311-win_amd64.pyd +0 -0
  26. biotite/sequence/align/pairwise.cp311-win_amd64.pyd +0 -0
  27. biotite/sequence/align/permutation.cp311-win_amd64.pyd +0 -0
  28. biotite/sequence/align/selector.cp311-win_amd64.pyd +0 -0
  29. biotite/sequence/align/tracetable.cp311-win_amd64.pyd +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cp311-win_amd64.pyd +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cp311-win_amd64.pyd +0 -0
  36. biotite/sequence/phylo/tree.cp311-win_amd64.pyd +0 -0
  37. biotite/sequence/phylo/upgma.cp311-win_amd64.pyd +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +129 -40
  51. biotite/structure/bonds.cp311-win_amd64.pyd +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cp311-win_amd64.pyd +0 -0
  54. biotite/structure/charges.cp311-win_amd64.pyd +0 -0
  55. biotite/structure/geometry.py +60 -113
  56. biotite/structure/info/__init__.py +1 -0
  57. biotite/structure/info/atoms.py +13 -13
  58. biotite/structure/info/bonds.py +12 -6
  59. biotite/structure/info/ccd.py +125 -32
  60. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  61. biotite/structure/info/groups.py +63 -17
  62. biotite/structure/info/masses.py +9 -6
  63. biotite/structure/info/misc.py +15 -21
  64. biotite/structure/info/standardize.py +3 -2
  65. biotite/structure/io/mol/sdf.py +41 -40
  66. biotite/structure/io/pdb/convert.py +2 -0
  67. biotite/structure/io/pdb/file.py +74 -3
  68. biotite/structure/io/pdb/hybrid36.cp311-win_amd64.pyd +0 -0
  69. biotite/structure/io/pdbqt/file.py +32 -32
  70. biotite/structure/io/pdbx/__init__.py +1 -0
  71. biotite/structure/io/pdbx/bcif.py +32 -8
  72. biotite/structure/io/pdbx/cif.py +72 -59
  73. biotite/structure/io/pdbx/component.py +9 -4
  74. biotite/structure/io/pdbx/compress.py +321 -0
  75. biotite/structure/io/pdbx/convert.py +194 -48
  76. biotite/structure/io/pdbx/encoding.cp311-win_amd64.pyd +0 -0
  77. biotite/structure/io/pdbx/encoding.pyx +98 -17
  78. biotite/structure/molecules.py +141 -141
  79. biotite/structure/sasa.cp311-win_amd64.pyd +0 -0
  80. biotite/structure/segments.py +1 -2
  81. biotite/structure/util.py +73 -1
  82. biotite/version.py +2 -2
  83. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/METADATA +3 -1
  84. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/RECORD +86 -76
  85. biotite/structure/info/ccd/README.rst +0 -8
  86. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  87. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  88. biotite/structure/info/ccd/nucleotides.txt +0 -798
  89. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  90. {biotite-1.0.1.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -7,7 +7,6 @@ __author__ = "Patrick Kunzmann"
7
7
  __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]
8
8
 
9
9
  import itertools
10
- import re
11
10
  from collections.abc import MutableMapping, Sequence
12
11
  import numpy as np
13
12
  from biotite.file import (
@@ -357,7 +356,7 @@ class CIFCategory(_Component, MutableMapping):
357
356
  return CIFBlock
358
357
 
359
358
  @staticmethod
360
- def deserialize(text, expect_whitespace=True):
359
+ def deserialize(text):
361
360
  lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]
362
361
 
363
362
  if _is_loop_start(lines[0]):
@@ -372,7 +371,7 @@ class CIFCategory(_Component, MutableMapping):
372
371
 
373
372
  lines = _to_single(lines)
374
373
  if is_looped:
375
- category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
374
+ category_dict = CIFCategory._deserialize_looped(lines)
376
375
  else:
377
376
  category_dict = CIFCategory._deserialize_single(lines)
378
377
  return CIFCategory(category_dict, category_name)
@@ -416,6 +415,9 @@ class CIFCategory(_Component, MutableMapping):
416
415
  raise ValueError("At least one column must remain")
417
416
  del self._columns[key]
418
417
 
418
+ def __contains__(self, key):
419
+ return key in self._columns
420
+
419
421
  def __iter__(self):
420
422
  return iter(self._columns)
421
423
 
@@ -442,7 +444,7 @@ class CIFCategory(_Component, MutableMapping):
442
444
  line_i = 0
443
445
  while line_i < len(lines):
444
446
  line = lines[line_i]
445
- parts = _split_one_line(line)
447
+ parts = list(_split_one_line(line))
446
448
  if len(parts) == 2:
447
449
  # Standard case -> name and value in one line
448
450
  name_part, value_part = parts
@@ -450,7 +452,7 @@ class CIFCategory(_Component, MutableMapping):
450
452
  elif len(parts) == 1:
451
453
  # Value is a multiline value on the next line
452
454
  name_part = parts[0]
453
- parts = _split_one_line(lines[line_i + 1])
455
+ parts = list(_split_one_line(lines[line_i + 1]))
454
456
  if len(parts) == 1:
455
457
  value_part = parts[0]
456
458
  else:
@@ -464,7 +466,7 @@ class CIFCategory(_Component, MutableMapping):
464
466
  return category_dict
465
467
 
466
468
  @staticmethod
467
- def _deserialize_looped(lines, expect_whitespace):
469
+ def _deserialize_looped(lines):
468
470
  """
469
471
  Process a category where each field has multiple values
470
472
  (category is a table).
@@ -487,20 +489,7 @@ class CIFCategory(_Component, MutableMapping):
487
489
  # row-line-alignment at all and simply cycle through columns
488
490
  column_indices = itertools.cycle(range(len(column_names)))
489
491
  for data_line in data_lines:
490
- # If whitespace is expected in quote protected values,
491
- # use regex-based _split_one_line() to split
492
- # Otherwise use much more faster whitespace split
493
- # and quote removal if applicable.
494
- if expect_whitespace:
495
- values = _split_one_line(data_line)
496
- else:
497
- values = data_line.split()
498
- for k in range(len(values)):
499
- # Remove quotes
500
- if (values[k][0] == '"' and values[k][-1] == '"') or (
501
- values[k][0] == "'" and values[k][-1] == "'"
502
- ):
503
- values[k] = values[k][1:-1]
492
+ values = _split_one_line(data_line)
504
493
  for val in values:
505
494
  column_index = next(column_indices)
506
495
  column_name = column_names[column_index]
@@ -569,6 +558,17 @@ class CIFBlock(_Component, MutableMapping):
569
558
  The keys are the category names and the values are the
570
559
  :class:`CIFCategory` objects.
571
560
  By default, an empty block is created.
561
+ name : str, optional
562
+ The name of the block.
563
+ This is only used for serialization and is automatically set,
564
+ when the :class:`CIFBlock` is added to a :class:`CIFFile`.
565
+ It only needs to be set manually, when the block is directly
566
+ serialized.
567
+
568
+ Attributes
569
+ ----------
570
+ name : str
571
+ The name of the block.
572
572
 
573
573
  Notes
574
574
  -----
@@ -580,13 +580,15 @@ class CIFBlock(_Component, MutableMapping):
580
580
  --------
581
581
 
582
582
  >>> # Add category on creation
583
- >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
583
+ >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
584
584
  >>> # Add category later on
585
585
  >>> block["bar"] = CIFCategory({"another_column": [2, 3]})
586
586
  >>> # Access a column
587
587
  >>> print(block["bar"]["another_column"].as_array())
588
588
  ['2' '3']
589
589
  >>> print(block.serialize())
590
+ data_baz
591
+ #
590
592
  _foo.some_column 1
591
593
  #
592
594
  loop_
@@ -596,11 +598,20 @@ class CIFBlock(_Component, MutableMapping):
596
598
  #
597
599
  """
598
600
 
599
- def __init__(self, categories=None):
601
+ def __init__(self, categories=None, name=None):
602
+ self._name = name
600
603
  if categories is None:
601
604
  categories = {}
602
605
  self._categories = categories
603
606
 
607
+ @property
608
+ def name(self):
609
+ return self._name
610
+
611
+ @name.setter
612
+ def name(self, name):
613
+ self._name = name
614
+
604
615
  @staticmethod
605
616
  def subcomponent_class():
606
617
  return CIFCategory
@@ -634,7 +645,10 @@ class CIFBlock(_Component, MutableMapping):
634
645
  return CIFBlock(_create_element_dict(lines, category_names, category_starts))
635
646
 
636
647
  def serialize(self):
637
- text_blocks = []
648
+ if self._name is None:
649
+ raise SerializationError("Block name is required")
650
+ # The block starts with the block name line followed by a comment line
651
+ text_blocks = ["data_" + self._name + "\n#\n"]
638
652
  for category_name, category in self._categories.items():
639
653
  if isinstance(category, str):
640
654
  # Category is already stored as lines
@@ -657,15 +671,7 @@ class CIFBlock(_Component, MutableMapping):
657
671
  # Element is stored in serialized form
658
672
  # -> must be deserialized first
659
673
  try:
660
- # Special optimization for "atom_site":
661
- # Even if the values are quote protected,
662
- # no whitespace is expected in escaped values
663
- # Therefore slow regex-based _split_one_line() call is not necessary
664
- if key == "atom_site":
665
- expect_whitespace = False
666
- else:
667
- expect_whitespace = True
668
- category = CIFCategory.deserialize(category, expect_whitespace)
674
+ category = CIFCategory.deserialize(category)
669
675
  except Exception:
670
676
  raise DeserializationError(f"Failed to deserialize category '{key}'")
671
677
  # Update with deserialized object
@@ -683,6 +689,9 @@ class CIFBlock(_Component, MutableMapping):
683
689
  def __delitem__(self, key):
684
690
  del self._categories[key]
685
691
 
692
+ def __contains__(self, key):
693
+ return key in self._categories
694
+
686
695
  def __iter__(self):
687
696
  return iter(self._categories)
688
697
 
@@ -806,14 +815,12 @@ class CIFFile(_Component, File, MutableMapping):
806
815
  def serialize(self):
807
816
  text_blocks = []
808
817
  for block_name, block in self._blocks.items():
809
- text_blocks.append("data_" + block_name + "\n")
810
- # A comment line is set after the block indicator
811
- text_blocks.append("#\n")
812
818
  if isinstance(block, str):
813
819
  # Block is already stored as text
814
820
  text_blocks.append(block)
815
821
  else:
816
822
  try:
823
+ block.name = block_name
817
824
  text_blocks.append(block.serialize())
818
825
  except Exception:
819
826
  raise SerializationError(
@@ -884,11 +891,15 @@ class CIFFile(_Component, File, MutableMapping):
884
891
  def __setitem__(self, key, block):
885
892
  if not isinstance(block, CIFBlock):
886
893
  raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
894
+ block.name = key
887
895
  self._blocks[key] = block
888
896
 
889
897
  def __delitem__(self, key):
890
898
  del self._blocks[key]
891
899
 
900
+ def __contains__(self, key):
901
+ return key in self._blocks
902
+
892
903
  def __iter__(self):
893
904
  return iter(self._blocks)
894
905
 
@@ -921,7 +932,7 @@ def _create_element_dict(lines, element_names, element_starts):
921
932
  # Lazy deserialization
922
933
  # -> keep as text for now and deserialize later if needed
923
934
  return {
924
- element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
935
+ element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) + "\n"
925
936
  for i, element_name in enumerate(element_names)
926
937
  }
927
938
 
@@ -1029,29 +1040,31 @@ def _split_one_line(line):
1029
1040
  """
1030
1041
  # Special case of multiline value, where the line starts with ';'
1031
1042
  if line[0] == ";":
1032
- return [line[1:]]
1033
-
1034
- # Define the patterns for different types of fields
1035
- single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
1036
- double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
1037
- unquoted_pattern = r"([^\s]+)"
1038
-
1039
- # Combine the patterns using alternation
1040
- combined_pattern = (
1041
- f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
1042
- )
1043
-
1044
- # Find all matches
1045
- matches = re.findall(combined_pattern, line)
1046
-
1047
- # Extract non-empty groups from the matches
1048
- fields = []
1049
- for match in matches:
1050
- field = next(group for group in match if group)
1051
- if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
1052
- field = field[1:-1]
1053
- fields.append(field)
1054
- return fields
1043
+ yield line[1:]
1044
+ elif "'" in line or '"' in line:
1045
+ # Quoted values in the line
1046
+ while line:
1047
+ # Strip leading whitespace(s)
1048
+ stripped_line = line.lstrip()
1049
+ # Split the line on whitespace
1050
+ word, _, line = stripped_line.partition(" ")
1051
+ # Handle the case where the word starts with a quote
1052
+ if word.startswith(("'", '"')):
1053
+ # Set the separator to the quote found
1054
+ separator = word[0]
1055
+ # Handle the case of a quoted word without space
1056
+ if word.endswith(separator) and len(word) > 1:
1057
+ # Yield the word without the opening and closing quotes
1058
+ yield word[1:-1]
1059
+ continue
1060
+ # Split the word on the separator
1061
+ word, _, line = stripped_line[1:].partition(separator)
1062
+
1063
+ yield word
1064
+ else:
1065
+ # No quoted values in the line -> simple whitespace split
1066
+ for line in line.split():
1067
+ yield line
1055
1068
 
1056
1069
 
1057
1070
  def _arrayfy(data):
@@ -171,10 +171,10 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
171
171
  Parameters
172
172
  ----------
173
173
  store_key_in: str, optional
174
- If given, the key of each element is stored as value in the
175
- serialized element.
176
- This is basically the reverse operation of `take_key_from` in
177
- :meth:`_deserialize_elements()`.
174
+ If given, the key of each element is stored as value in the
175
+ serialized element.
176
+ This is basically the reverse operation of `take_key_from` in
177
+ :meth:`_deserialize_elements()`.
178
178
  """
179
179
  serialized_elements = []
180
180
  for key, element in self._elements.items():
@@ -223,6 +223,11 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
223
223
  def __delitem__(self, key):
224
224
  del self._elements[key]
225
225
 
226
+ # Implement `__contains__()` explicitly,
227
+ # because the mixin method unnecessarily deserializes the value, if available
228
+ def __contains__(self, key):
229
+ return key in self._elements
230
+
226
231
  def __iter__(self):
227
232
  return iter(self._elements)
228
233
 
@@ -0,0 +1,321 @@
1
+ __all__ = ["compress"]
2
+ __name__ = "biotite.structure.io.pdbx"
3
+ __author__ = "Patrick Kunzmann"
4
+
5
+ import itertools
6
+ import msgpack
7
+ import numpy as np
8
+ import biotite.structure.io.pdbx.bcif as bcif
9
+ from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
10
+ from biotite.structure.io.pdbx.encoding import (
11
+ ByteArrayEncoding,
12
+ DeltaEncoding,
13
+ FixedPointEncoding,
14
+ IntegerPackingEncoding,
15
+ RunLengthEncoding,
16
+ StringArrayEncoding,
17
+ )
18
+
19
+
20
+ def compress(data, float_tolerance=1e-6):
21
+ """
22
+ Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
23
+ different data encodings for each data array and selecting the one that results in
24
+ the smallest size.
25
+
26
+ Parameters
27
+ ----------
28
+ data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
29
+ The data to compress.
30
+
31
+ Returns
32
+ -------
33
+ compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
34
+ The compressed data with the same type as the input data.
35
+ If no improved compression is found for a :class:`BinaryCIFData` array,
36
+ the input data is kept.
37
+ Hence, the return value is not a deep copy of the input data.
38
+ float_tolerance : float, optional
39
+ The relative error that is accepted when compressing floating point numbers.
40
+
41
+ Examples
42
+ --------
43
+
44
+ >>> from io import BytesIO
45
+ >>> pdbx_file = BinaryCIFFile()
46
+ >>> set_structure(pdbx_file, atom_array_stack)
47
+ >>> # Write uncompressed file
48
+ >>> uncompressed_file = BytesIO()
49
+ >>> pdbx_file.write(uncompressed_file)
50
+ >>> _ = uncompressed_file.seek(0)
51
+ >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
52
+ 927 KB
53
+ >>> # Write compressed file
54
+ >>> pdbx_file = compress(pdbx_file)
55
+ >>> compressed_file = BytesIO()
56
+ >>> pdbx_file.write(compressed_file)
57
+ >>> _ = compressed_file.seek(0)
58
+ >>> print(f"{len(compressed_file.read()) // 1000} KB")
59
+ 111 KB
60
+ """
61
+ match type(data):
62
+ case bcif.BinaryCIFFile:
63
+ return _compress_file(data, float_tolerance)
64
+ case bcif.BinaryCIFBlock:
65
+ return _compress_block(data, float_tolerance)
66
+ case bcif.BinaryCIFCategory:
67
+ return _compress_category(data, float_tolerance)
68
+ case bcif.BinaryCIFColumn:
69
+ return _compress_column(data, float_tolerance)
70
+ case bcif.BinaryCIFData:
71
+ return _compress_data(data, float_tolerance)
72
+ case _:
73
+ raise TypeError(f"Unsupported type {type(data).__name__}")
74
+
75
+
76
+ def _compress_file(bcif_file, float_tolerance):
77
+ compressed_file = bcif.BinaryCIFFile()
78
+ for block_name, bcif_block in bcif_file.items():
79
+ compressed_block = _compress_block(bcif_block, float_tolerance)
80
+ compressed_file[block_name] = compressed_block
81
+ return compressed_file
82
+
83
+
84
+ def _compress_block(bcif_block, float_tolerance):
85
+ compressed_block = bcif.BinaryCIFBlock()
86
+ for category_name, bcif_category in bcif_block.items():
87
+ compressed_category = _compress_category(bcif_category, float_tolerance)
88
+ compressed_block[category_name] = compressed_category
89
+ return compressed_block
90
+
91
+
92
+ def _compress_category(bcif_category, float_tolerance):
93
+ compressed_category = bcif.BinaryCIFCategory()
94
+ for column_name, bcif_column in bcif_category.items():
95
+ compressed_column = _compress_column(bcif_column, float_tolerance)
96
+ compressed_category[column_name] = compressed_column
97
+ return compressed_category
98
+
99
+
100
+ def _compress_column(bcif_column, float_tolerance):
101
+ data = _compress_data(bcif_column.data, float_tolerance)
102
+ if bcif_column.mask is not None:
103
+ mask = _compress_data(bcif_column.mask, float_tolerance)
104
+ else:
105
+ mask = None
106
+ return bcif.BinaryCIFColumn(data, mask)
107
+
108
+
109
+ def _compress_data(bcif_data, float_tolerance):
110
+ array = bcif_data.array
111
+ if len(array) == 1:
112
+ # No need to compress a single value -> Use default uncompressed encoding
113
+ return bcif.BinaryCIFData(array)
114
+
115
+ if np.issubdtype(array.dtype, np.str_):
116
+ # Leave encoding empty for now, as it is explicitly set later
117
+ encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
118
+ # Run encode to initialize the data and offset arrays
119
+ indices = encoding.encode(array)
120
+ offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
121
+ encoding.data_encoding, _ = _find_best_integer_compression(indices)
122
+ encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
123
+ return bcif.BinaryCIFData(array, [encoding])
124
+
125
+ elif np.issubdtype(array.dtype, np.floating):
126
+ to_integer_encoding = FixedPointEncoding(
127
+ 10 ** _get_decimal_places(array, float_tolerance)
128
+ )
129
+ integer_array = to_integer_encoding.encode(array)
130
+ best_encoding, size_compressed = _find_best_integer_compression(integer_array)
131
+ if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
132
+ return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
133
+ else:
134
+ # The float array is smaller -> encode it directly as bytes
135
+ return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
136
+
137
+ elif np.issubdtype(array.dtype, np.integer):
138
+ array = _to_smallest_integer_type(array)
139
+ encodings, _ = _find_best_integer_compression(array)
140
+ return bcif.BinaryCIFData(array, encodings)
141
+
142
+ else:
143
+ raise TypeError(f"Unsupported data type {array.dtype}")
144
+
145
+
146
+ def _find_best_integer_compression(array):
147
+ """
148
+ Try different data encodings on an integer array and return the one that results in
149
+ the smallest size.
150
+ """
151
+ best_encoding_sequence = None
152
+ smallest_size = np.inf
153
+
154
+ for use_delta in [False, True]:
155
+ if use_delta:
156
+ encoding = DeltaEncoding()
157
+ array_after_delta = encoding.encode(array)
158
+ encodings_after_delta = [encoding]
159
+ else:
160
+ encodings_after_delta = []
161
+ array_after_delta = array
162
+ for use_run_length in [False, True]:
163
+ # Use encoded data from previous step to save time
164
+ if use_run_length:
165
+ encoding = RunLengthEncoding()
166
+ array_after_rle = encoding.encode(array_after_delta)
167
+ encodings_after_rle = encodings_after_delta + [encoding]
168
+ else:
169
+ encodings_after_rle = encodings_after_delta
170
+ array_after_rle = array_after_delta
171
+ for packed_byte_count in [None, 1, 2]:
172
+ if packed_byte_count is not None:
173
+ # Quickly check this heuristic
174
+ # to avoid computing an exploding packed data array
175
+ if (
176
+ _estimate_packed_length(array_after_rle, packed_byte_count)
177
+ >= array_after_rle.nbytes
178
+ ):
179
+ # Packing would not reduce the size
180
+ continue
181
+ encoding = IntegerPackingEncoding(packed_byte_count)
182
+ array_after_packing = encoding.encode(array_after_rle)
183
+ encodings_after_packing = encodings_after_rle + [encoding]
184
+ else:
185
+ encodings_after_packing = encodings_after_rle
186
+ array_after_packing = array_after_rle
187
+ encoding = ByteArrayEncoding()
188
+ encoded_array = encoding.encode(array_after_packing)
189
+ encodings = encodings_after_packing + [encoding]
190
+ # Pack data directly instead of using the BinaryCIFData class
191
+ # to avoid the unnecessary re-encoding of the array,
192
+ # as it is already available in 'encoded_array'
193
+ serialized_encoding = [enc.serialize() for enc in encodings]
194
+ serialized_data = {
195
+ "data": encoded_array,
196
+ "encoding": serialized_encoding,
197
+ }
198
+ size = _data_size_in_file(serialized_data)
199
+ if size < smallest_size:
200
+ best_encoding_sequence = encodings
201
+ smallest_size = size
202
+ return best_encoding_sequence, smallest_size
203
+
204
+
205
+ def _estimate_packed_length(array, packed_byte_count):
206
+ """
207
+ Estimate the length of an integer array after packing it with a given number of
208
+ bytes.
209
+
210
+ Parameters
211
+ ----------
212
+ array : numpy.ndarray
213
+ The array to pack.
214
+ packed_byte_count : int
215
+ The number of bytes used for packing.
216
+
217
+ Returns
218
+ -------
219
+ length : int
220
+ The estimated length of the packed array.
221
+ """
222
+ # Use int64 to avoid integer overflow in the following line
223
+ max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
224
+ n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
225
+ return np.sum(n_bytes_per_element, dtype=np.int64)
226
+
227
+
228
+ def _to_smallest_integer_type(array):
229
+ """
230
+ Convert an integer array to the smallest possible integer type, that is still able
231
+ to represent all values in the array.
232
+
233
+ Parameters
234
+ ----------
235
+ array : numpy.ndarray
236
+ The array to convert.
237
+
238
+ Returns
239
+ -------
240
+ array : numpy.ndarray
241
+ The converted array.
242
+ """
243
+ if array.min() >= 0:
244
+ for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
245
+ if np.all(array <= np.iinfo(dtype).max):
246
+ return array.astype(dtype)
247
+ for dtype in [np.int8, np.int16, np.int32, np.int64]:
248
+ if np.all(array >= np.iinfo(dtype).min) and np.all(
249
+ array <= np.iinfo(dtype).max
250
+ ):
251
+ return array.astype(dtype)
252
+ raise ValueError("Array is out of bounds for all integer types")
253
+
254
+
255
+ def _data_size_in_file(data):
256
+ """
257
+ Get the size the data would have when written into a *BinaryCIF* file.
258
+
259
+ Parameters
260
+ ----------
261
+ data : BinaryCIFData or dict
262
+ The data array whose size is measured.
263
+ Can be either a :class:`BinaryCIFData` object or already serialized data.
264
+
265
+ Returns
266
+ -------
267
+ size : int
268
+ The size of the data array in the file in bytes.
269
+ """
270
+ if isinstance(data, bcif.BinaryCIFData):
271
+ data = data.serialize()
272
+ bytes_in_file = msgpack.packb(data, use_bin_type=True, default=encode_numpy)
273
+ return len(bytes_in_file)
274
+
275
+
276
+ def _get_decimal_places(array, tol):
277
+ """
278
+ Get the number of decimal places in a floating point array.
279
+
280
+ Parameters
281
+ ----------
282
+ array : numpy.ndarray
283
+ The array to analyze.
284
+ tol : float
285
+ The relative tolerance allowed when the values are cut off after the returned
286
+ number of decimal places.
287
+
288
+ Returns
289
+ -------
290
+ decimals : int
291
+ The number of decimal places.
292
+ """
293
+ # Decimals of NaN or infinite values do not make sense
294
+ # and 0 would give NaN when rounding on decimals
295
+ array = array[np.isfinite(array) & (array != 0)]
296
+ for decimals in itertools.count(start=-_order_magnitude(array)):
297
+ error = np.abs(np.round(array, decimals) - array)
298
+ if np.all(error < tol * np.abs(array)):
299
+ return decimals
300
+
301
+
302
+ def _order_magnitude(array):
303
+ """
304
+ Get the order of magnitude of floating point values.
305
+
306
+ Parameters
307
+ ----------
308
+ array : ndarray, dtype=float
309
+ The array to analyze.
310
+
311
+ Returns
312
+ -------
313
+ magnitude : int
314
+ The order of magnitude, i.e. the maximum exponent a number in the array would
315
+ have in scientific notation, if only one digit is left of the decimal point.
316
+ """
317
+ array = array[array != 0]
318
+ if len(array) == 0:
319
+ # No non-zero values -> define order of magnitude as 0
320
+ return 0
321
+ return int(np.max(np.floor(np.log10(np.abs(array)))).item())