biotite 1.0.1__cp311-cp311-macosx_11_0_arm64.whl → 1.2.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177)
  1. biotite/application/application.py +3 -3
  2. biotite/application/autodock/app.py +1 -1
  3. biotite/application/blast/webapp.py +1 -1
  4. biotite/application/clustalo/app.py +1 -1
  5. biotite/application/dssp/app.py +13 -3
  6. biotite/application/localapp.py +36 -2
  7. biotite/application/msaapp.py +10 -10
  8. biotite/application/muscle/app3.py +5 -18
  9. biotite/application/muscle/app5.py +5 -5
  10. biotite/application/sra/app.py +0 -5
  11. biotite/application/util.py +22 -2
  12. biotite/application/viennarna/rnaalifold.py +8 -8
  13. biotite/application/viennarna/rnaplot.py +9 -3
  14. biotite/application/viennarna/util.py +1 -1
  15. biotite/application/webapp.py +1 -1
  16. biotite/database/afdb/__init__.py +12 -0
  17. biotite/database/afdb/download.py +191 -0
  18. biotite/database/entrez/dbnames.py +10 -0
  19. biotite/database/entrez/download.py +9 -10
  20. biotite/database/entrez/key.py +1 -1
  21. biotite/database/entrez/query.py +5 -4
  22. biotite/database/pubchem/download.py +6 -6
  23. biotite/database/pubchem/error.py +10 -0
  24. biotite/database/pubchem/query.py +12 -23
  25. biotite/database/rcsb/download.py +3 -2
  26. biotite/database/rcsb/query.py +8 -9
  27. biotite/database/uniprot/check.py +22 -17
  28. biotite/database/uniprot/download.py +3 -6
  29. biotite/database/uniprot/query.py +4 -5
  30. biotite/file.py +14 -2
  31. biotite/interface/__init__.py +19 -0
  32. biotite/interface/openmm/__init__.py +16 -0
  33. biotite/interface/openmm/state.py +93 -0
  34. biotite/interface/openmm/system.py +227 -0
  35. biotite/interface/pymol/__init__.py +198 -0
  36. biotite/interface/pymol/cgo.py +346 -0
  37. biotite/interface/pymol/convert.py +185 -0
  38. biotite/interface/pymol/display.py +267 -0
  39. biotite/interface/pymol/object.py +1226 -0
  40. biotite/interface/pymol/shapes.py +178 -0
  41. biotite/interface/pymol/startup.py +169 -0
  42. biotite/interface/rdkit/__init__.py +15 -0
  43. biotite/interface/rdkit/mol.py +490 -0
  44. biotite/interface/version.py +71 -0
  45. biotite/interface/warning.py +19 -0
  46. biotite/sequence/align/__init__.py +0 -4
  47. biotite/sequence/align/alignment.py +49 -14
  48. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  49. biotite/sequence/align/banded.pyx +26 -26
  50. biotite/sequence/align/cigar.py +2 -2
  51. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  52. biotite/sequence/align/kmeralphabet.pyx +19 -2
  53. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  54. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  55. biotite/sequence/align/kmertable.pyx +58 -48
  56. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  57. biotite/sequence/align/localgapped.pyx +47 -47
  58. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  59. biotite/sequence/align/localungapped.pyx +10 -10
  60. biotite/sequence/align/matrix.py +284 -57
  61. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  62. biotite/sequence/align/matrix_data/PB.license +21 -0
  63. biotite/sequence/align/matrix_data/PB.mat +18 -0
  64. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  65. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  66. biotite/sequence/align/pairwise.pyx +35 -35
  67. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  68. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  69. biotite/sequence/align/selector.pyx +2 -2
  70. biotite/sequence/align/statistics.py +1 -1
  71. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  72. biotite/sequence/alphabet.py +5 -2
  73. biotite/sequence/annotation.py +19 -13
  74. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  75. biotite/sequence/codon.py +1 -2
  76. biotite/sequence/graphics/alignment.py +25 -39
  77. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  78. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  79. biotite/sequence/graphics/colorschemes.py +44 -11
  80. biotite/sequence/graphics/dendrogram.py +4 -2
  81. biotite/sequence/graphics/features.py +2 -2
  82. biotite/sequence/graphics/logo.py +10 -12
  83. biotite/sequence/io/fasta/convert.py +1 -2
  84. biotite/sequence/io/fasta/file.py +1 -1
  85. biotite/sequence/io/fastq/file.py +3 -3
  86. biotite/sequence/io/genbank/file.py +3 -3
  87. biotite/sequence/io/genbank/sequence.py +2 -0
  88. biotite/sequence/io/gff/convert.py +1 -1
  89. biotite/sequence/io/gff/file.py +1 -2
  90. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  91. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  92. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  93. biotite/sequence/profile.py +105 -29
  94. biotite/sequence/search.py +0 -1
  95. biotite/sequence/seqtypes.py +136 -8
  96. biotite/sequence/sequence.py +1 -2
  97. biotite/setup_ccd.py +197 -0
  98. biotite/structure/__init__.py +6 -3
  99. biotite/structure/alphabet/__init__.py +25 -0
  100. biotite/structure/alphabet/encoder.py +332 -0
  101. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  102. biotite/structure/alphabet/i3d.py +109 -0
  103. biotite/structure/alphabet/layers.py +86 -0
  104. biotite/structure/alphabet/pb.license +21 -0
  105. biotite/structure/alphabet/pb.py +170 -0
  106. biotite/structure/alphabet/unkerasify.py +128 -0
  107. biotite/structure/atoms.py +163 -66
  108. biotite/structure/basepairs.py +26 -26
  109. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  110. biotite/structure/bonds.pyx +79 -25
  111. biotite/structure/box.py +19 -21
  112. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  113. biotite/structure/celllist.pyx +83 -67
  114. biotite/structure/chains.py +5 -37
  115. biotite/structure/charges.cpython-311-darwin.so +0 -0
  116. biotite/structure/compare.py +420 -13
  117. biotite/structure/density.py +1 -1
  118. biotite/structure/dotbracket.py +27 -28
  119. biotite/structure/filter.py +8 -8
  120. biotite/structure/geometry.py +74 -127
  121. biotite/structure/hbond.py +17 -19
  122. biotite/structure/info/__init__.py +1 -0
  123. biotite/structure/info/atoms.py +24 -15
  124. biotite/structure/info/bonds.py +12 -6
  125. biotite/structure/info/ccd.py +125 -34
  126. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  127. biotite/structure/info/groups.py +62 -19
  128. biotite/structure/info/masses.py +9 -6
  129. biotite/structure/info/misc.py +15 -22
  130. biotite/structure/info/radii.py +92 -22
  131. biotite/structure/info/standardize.py +4 -4
  132. biotite/structure/integrity.py +4 -6
  133. biotite/structure/io/general.py +2 -2
  134. biotite/structure/io/gro/file.py +8 -9
  135. biotite/structure/io/mol/convert.py +1 -1
  136. biotite/structure/io/mol/ctab.py +33 -28
  137. biotite/structure/io/mol/mol.py +1 -1
  138. biotite/structure/io/mol/sdf.py +80 -53
  139. biotite/structure/io/pdb/convert.py +4 -3
  140. biotite/structure/io/pdb/file.py +85 -25
  141. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  142. biotite/structure/io/pdbqt/file.py +36 -36
  143. biotite/structure/io/pdbx/__init__.py +1 -0
  144. biotite/structure/io/pdbx/bcif.py +54 -15
  145. biotite/structure/io/pdbx/cif.py +92 -66
  146. biotite/structure/io/pdbx/component.py +15 -4
  147. biotite/structure/io/pdbx/compress.py +321 -0
  148. biotite/structure/io/pdbx/convert.py +410 -75
  149. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  150. biotite/structure/io/pdbx/encoding.pyx +98 -17
  151. biotite/structure/io/trajfile.py +9 -6
  152. biotite/structure/io/util.py +38 -0
  153. biotite/structure/mechanics.py +0 -1
  154. biotite/structure/molecules.py +141 -156
  155. biotite/structure/pseudoknots.py +7 -13
  156. biotite/structure/repair.py +2 -4
  157. biotite/structure/residues.py +13 -24
  158. biotite/structure/rings.py +335 -0
  159. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  160. biotite/structure/sasa.pyx +2 -1
  161. biotite/structure/segments.py +69 -11
  162. biotite/structure/sequence.py +0 -1
  163. biotite/structure/sse.py +0 -2
  164. biotite/structure/superimpose.py +74 -62
  165. biotite/structure/tm.py +581 -0
  166. biotite/structure/transform.py +12 -25
  167. biotite/structure/util.py +76 -4
  168. biotite/version.py +9 -4
  169. biotite/visualize.py +111 -1
  170. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/METADATA +6 -2
  171. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/RECORD +173 -143
  172. biotite/structure/info/ccd/README.rst +0 -8
  173. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  174. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  175. biotite/structure/info/ccd/nucleotides.txt +0 -798
  176. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/WHEEL +0 -0
  177. {biotite-1.0.1.dist-info → biotite-1.2.0.dist-info}/licenses/LICENSE.rst +0 -0
biotite/structure/io/pdbx/cif.py

@@ -7,7 +7,6 @@ __author__ = "Patrick Kunzmann"
  __all__ = ["CIFFile", "CIFBlock", "CIFCategory", "CIFColumn", "CIFData"]

  import itertools
- import re
  from collections.abc import MutableMapping, Sequence
  import numpy as np
  from biotite.file import (
@@ -150,7 +149,7 @@ class CIFColumn:
  mask = CIFData(mask, np.uint8)
  if len(mask) != len(data):
  raise IndexError(
- f"Data has length {len(data)}, " f"but mask has length {len(mask)}"
+ f"Data has length {len(data)}, but mask has length {len(mask)}"
  )
  self._data = data
  self._mask = mask
@@ -216,6 +215,11 @@ class CIFColumn:
  ``MaskValue.INAPPLICABLE`` or ``MaskValue.MISSING``.
  By default, masked elements are converted to ``'.'`` or
  ``'?'`` depending on the :class:`MaskValue`.
+
+ Returns
+ -------
+ array : ndarray
+ The column data as array.
  """
  if self._mask is None:
  return self._data.array.astype(dtype, copy=False)
@@ -357,7 +361,7 @@ class CIFCategory(_Component, MutableMapping):
  return CIFBlock

  @staticmethod
- def deserialize(text, expect_whitespace=True):
+ def deserialize(text):
  lines = [line.strip() for line in text.splitlines() if not _is_empty(line)]

  if _is_loop_start(lines[0]):
@@ -372,7 +376,7 @@ class CIFCategory(_Component, MutableMapping):

  lines = _to_single(lines)
  if is_looped:
- category_dict = CIFCategory._deserialize_looped(lines, expect_whitespace)
+ category_dict = CIFCategory._deserialize_looped(lines)
  else:
  category_dict = CIFCategory._deserialize_single(lines)
  return CIFCategory(category_dict, category_name)
@@ -416,6 +420,9 @@ class CIFCategory(_Component, MutableMapping):
  raise ValueError("At least one column must remain")
  del self._columns[key]

+ def __contains__(self, key):
+ return key in self._columns
+
  def __iter__(self):
  return iter(self._columns)

@@ -442,7 +449,7 @@ class CIFCategory(_Component, MutableMapping):
  line_i = 0
  while line_i < len(lines):
  line = lines[line_i]
- parts = _split_one_line(line)
+ parts = list(_split_one_line(line))
  if len(parts) == 2:
  # Standard case -> name and value in one line
  name_part, value_part = parts
@@ -450,7 +457,7 @@ class CIFCategory(_Component, MutableMapping):
  elif len(parts) == 1:
  # Value is a multiline value on the next line
  name_part = parts[0]
- parts = _split_one_line(lines[line_i + 1])
+ parts = list(_split_one_line(lines[line_i + 1]))
  if len(parts) == 1:
  value_part = parts[0]
  else:
@@ -464,7 +471,7 @@ class CIFCategory(_Component, MutableMapping):
  return category_dict

  @staticmethod
- def _deserialize_looped(lines, expect_whitespace):
+ def _deserialize_looped(lines):
  """
  Process a category where each field has multiple values
  (category is a table).
@@ -487,20 +494,7 @@ class CIFCategory(_Component, MutableMapping):
  # row-line-alignment at all and simply cycle through columns
  column_indices = itertools.cycle(range(len(column_names)))
  for data_line in data_lines:
- # If whitespace is expected in quote protected values,
- # use regex-based _split_one_line() to split
- # Otherwise use much more faster whitespace split
- # and quote removal if applicable.
- if expect_whitespace:
- values = _split_one_line(data_line)
- else:
- values = data_line.split()
- for k in range(len(values)):
- # Remove quotes
- if (values[k][0] == '"' and values[k][-1] == '"') or (
- values[k][0] == "'" and values[k][-1] == "'"
- ):
- values[k] = values[k][1:-1]
+ values = _split_one_line(data_line)
  for val in values:
  column_index = next(column_indices)
  column_name = column_names[column_index]
@@ -569,6 +563,17 @@ class CIFBlock(_Component, MutableMapping):
  The keys are the category names and the values are the
  :class:`CIFCategory` objects.
  By default, an empty block is created.
+ name : str, optional
+ The name of the block.
+ This is only used for serialization and is automatically set,
+ when the :class:`CIFBlock` is added to a :class:`CIFFile`.
+ It only needs to be set manually, when the block is directly
+ serialized.
+
+ Attributes
+ ----------
+ name : str
+ The name of the block.

  Notes
  -----
@@ -580,13 +585,15 @@ class CIFBlock(_Component, MutableMapping):
  --------

  >>> # Add category on creation
- >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})})
+ >>> block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
  >>> # Add category later on
  >>> block["bar"] = CIFCategory({"another_column": [2, 3]})
  >>> # Access a column
  >>> print(block["bar"]["another_column"].as_array())
  ['2' '3']
  >>> print(block.serialize())
+ data_baz
+ #
  _foo.some_column 1
  #
  loop_
@@ -596,11 +603,20 @@ class CIFBlock(_Component, MutableMapping):
  #
  """

- def __init__(self, categories=None):
+ def __init__(self, categories=None, name=None):
+ self._name = name
  if categories is None:
  categories = {}
  self._categories = categories

+ @property
+ def name(self):
+ return self._name
+
+ @name.setter
+ def name(self, name):
+ self._name = name
+
  @staticmethod
  def subcomponent_class():
  return CIFCategory
@@ -634,7 +650,10 @@ class CIFBlock(_Component, MutableMapping):
  return CIFBlock(_create_element_dict(lines, category_names, category_starts))

  def serialize(self):
- text_blocks = []
+ if self._name is None:
+ raise SerializationError("Block name is required")
+ # The block starts with the black name line followed by a comment line
+ text_blocks = ["data_" + self._name + "\n#\n"]
  for category_name, category in self._categories.items():
  if isinstance(category, str):
  # Category is already stored as lines
@@ -657,15 +676,7 @@ class CIFBlock(_Component, MutableMapping):
  # Element is stored in serialized form
  # -> must be deserialized first
  try:
- # Special optimization for "atom_site":
- # Even if the values are quote protected,
- # no whitespace is expected in escaped values
- # Therefore slow regex-based _split_one_line() call is not necessary
- if key == "atom_site":
- expect_whitespace = False
- else:
- expect_whitespace = True
- category = CIFCategory.deserialize(category, expect_whitespace)
+ category = CIFCategory.deserialize(category)
  except Exception:
  raise DeserializationError(f"Failed to deserialize category '{key}'")
  # Update with deserialized object
@@ -683,6 +694,9 @@ class CIFBlock(_Component, MutableMapping):
  def __delitem__(self, key):
  del self._categories[key]

+ def __contains__(self, key):
+ return key in self._categories
+
  def __iter__(self):
  return iter(self._categories)

@@ -712,6 +726,19 @@ class CIFFile(_Component, File, MutableMapping):
  use the high-level :func:`get_structure()` or
  :func:`set_structure()` function respectively.

+ Parameters
+ ----------
+ blocks : dict (str -> CIFBlock), optional
+ The initial blocks of the file.
+ Maps the block names to the corresponding :class:`CIFBlock` objects.
+ By default no initial blocks are added.
+
+ Attributes
+ ----------
+ block : CIFBlock
+ The sole block of the file.
+ If the file contains multiple blocks, an exception is raised.
+
  Notes
  -----
  The content of CIF files are lazily deserialized:
@@ -722,12 +749,6 @@ class CIFFile(_Component, File, MutableMapping):
  The deserialized :class:`CIFBlock`/:class:`CIFCategory` objects
  are cached for subsequent accesses.

- Attributes
- ----------
- block : CIFBlock
- The sole block of the file.
- If the file contains multiple blocks, an exception is raised.
-
  Examples
  --------
  Read a CIF file and access its content:
@@ -806,14 +827,12 @@ class CIFFile(_Component, File, MutableMapping):
  def serialize(self):
  text_blocks = []
  for block_name, block in self._blocks.items():
- text_blocks.append("data_" + block_name + "\n")
- # A comment line is set after the block indicator
- text_blocks.append("#\n")
  if isinstance(block, str):
  # Block is already stored as text
  text_blocks.append(block)
  else:
  try:
+ block.name = block_name
  text_blocks.append(block.serialize())
  except Exception:
  raise SerializationError(
@@ -877,6 +896,7 @@ class CIFFile(_Component, File, MutableMapping):
  block = CIFBlock.deserialize(block)
  except Exception:
  raise DeserializationError(f"Failed to deserialize block '{key}'")
+ block.name = key
  # Update with deserialized object
  self._blocks[key] = block
  return block
@@ -884,11 +904,15 @@ class CIFFile(_Component, File, MutableMapping):
  def __setitem__(self, key, block):
  if not isinstance(block, CIFBlock):
  raise TypeError(f"Expected 'CIFBlock', but got '{type(block).__name__}'")
+ block.name = key
  self._blocks[key] = block

  def __delitem__(self, key):
  del self._blocks[key]

+ def __contains__(self, key):
+ return key in self._blocks
+
  def __iter__(self):
  return iter(self._blocks)

@@ -921,7 +945,7 @@ def _create_element_dict(lines, element_names, element_starts):
  # Lazy deserialization
  # -> keep as text for now and deserialize later if needed
  return {
- element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]])
+ element_name: "\n".join(lines[element_starts[i] : element_starts[i + 1]]) + "\n"
  for i, element_name in enumerate(element_names)
  }

@@ -1029,29 +1053,31 @@ def _split_one_line(line):
  """
  # Special case of multiline value, where the line starts with ';'
  if line[0] == ";":
- return [line[1:]]
-
- # Define the patterns for different types of fields
- single_quote_pattern = r"('(?:'(?! )|[^'])*')(?:\s|$)"
- double_quote_pattern = r'("(?:"(?! )|[^"])*")(?:\s|$)'
- unquoted_pattern = r"([^\s]+)"
-
- # Combine the patterns using alternation
- combined_pattern = (
- f"{single_quote_pattern}|{double_quote_pattern}|{unquoted_pattern}"
- )
-
- # Find all matches
- matches = re.findall(combined_pattern, line)
-
- # Extract non-empty groups from the matches
- fields = []
- for match in matches:
- field = next(group for group in match if group)
- if field[0] == field[-1] == "'" or field[0] == field[-1] == '"':
- field = field[1:-1]
- fields.append(field)
- return fields
+ yield line[1:]
+ elif "'" in line or '"' in line:
+ # Quoted values in the line
+ while line:
+ # Strip leading whitespace(s)
+ stripped_line = line.lstrip()
+ # Split the line on whitespace
+ word, _, line = stripped_line.partition(" ")
+ # Handle the case where the word start with a quote
+ if word.startswith(("'", '"')):
+ # Set the separator to the quote found
+ separator = word[0]
+ # Handle the case of a quoted word without space
+ if word.endswith(separator) and len(word) > 1:
+ # Yield the word without the opening and closing quotes
+ yield word[1:-1]
+ continue
+ # split the word on the separator
+ word, _, line = stripped_line[1:].partition(separator)
+
+ yield word
+ else:
+ # No quoted values in the line -> simple whitespace split
+ for line in line.split():
+ yield line


  def _arrayfy(data):
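
Taken together, the changes to cif.py above are visible at the API level: a CIFBlock now carries its own name (assigned automatically when the block is added to a CIFFile or when a file is deserialized), and the new __contains__() implementations allow membership tests without touching the underlying data. A minimal sketch, assuming only the classes and behavior shown in the hunks above:

import biotite
from biotite.structure.io.pdbx import CIFBlock, CIFCategory, CIFFile

# A standalone block can now be serialized directly, if a name is provided
block = CIFBlock({"foo": CIFCategory({"some_column": 1})}, name="baz")
print(block.serialize())  # output starts with "data_baz" followed by a "#" line

# When a block is assigned to a file, __setitem__() sets the name from the key
cif_file = CIFFile()
cif_file["1abc"] = CIFBlock({"foo": CIFCategory({"some_column": 1})})
print(cif_file["1abc"].name)  # "1abc"

# Membership tests use the new __contains__() implementations
print("1abc" in cif_file)          # True
print("foo" in cif_file["1abc"])   # True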
biotite/structure/io/pdbx/component.py

@@ -120,6 +120,12 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
  A component is only deserialized from the serialized data, if it
  is accessed.
  The deserialized component is then cached in the container.
+
+ Parameters
+ ----------
+ elements : dict, optional
+ The initial elements of the container.
+ By default no initial elements are added.
  """

  def __init__(self, elements=None):
@@ -171,10 +177,10 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
  Parameters
  ----------
  store_key_in: str, optional
- If given, the key of each element is stored as value in the
- serialized element.
- This is basically the reverse operation of `take_key_from` in
- :meth:`_deserialize_elements()`.
+ If given, the key of each element is stored as value in the
+ serialized element.
+ This is basically the reverse operation of `take_key_from` in
+ :meth:`_deserialize_elements()`.
  """
  serialized_elements = []
  for key, element in self._elements.items():
@@ -223,6 +229,11 @@ class _HierarchicalContainer(_Component, MutableMapping, metaclass=ABCMeta):
  def __delitem__(self, key):
  del self._elements[key]

+ # Implement `__contains__()` explicitly,
+ # because the mixin method unnecessarily deserializes the value, if available
+ def __contains__(self, key):
+ return key in self._elements
+
  def __iter__(self):
  return iter(self._elements)

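The comment above the new __contains__() spells out the motivation: the default MutableMapping mixin implements the in operator via __getitem__(), which would deserialize an element just to test for its presence. _HierarchicalContainer is the shared base of the lazily deserialized containers in this subpackage (the BinaryCIF classes in bcif.py follow the same pattern as the CIF classes above), so the cheap check is available there as well. A short sketch, with "structure.bcif" standing in as a hypothetical local file:

from biotite.structure.io.pdbx import BinaryCIFFile

bcif_file = BinaryCIFFile.read("structure.bcif")  # hypothetical example file
block = bcif_file.block

# Cheap check: the category stays in its serialized form
if "atom_site" in block:
    # Item access deserializes the category and caches it for later use
    atom_site = block["atom_site"]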
biotite/structure/io/pdbx/compress.py

@@ -0,0 +1,321 @@
+ __all__ = ["compress"]
+ __name__ = "biotite.structure.io.pdbx"
+ __author__ = "Patrick Kunzmann"
+
+ import itertools
+ import msgpack
+ import numpy as np
+ import biotite.structure.io.pdbx.bcif as bcif
+ from biotite.structure.io.pdbx.bcif import _encode_numpy as encode_numpy
+ from biotite.structure.io.pdbx.encoding import (
+ ByteArrayEncoding,
+ DeltaEncoding,
+ FixedPointEncoding,
+ IntegerPackingEncoding,
+ RunLengthEncoding,
+ StringArrayEncoding,
+ )
+
+
+ def compress(data, float_tolerance=1e-6):
+ """
+ Try to reduce the size of a *BinaryCIF* file (or block, category, etc.) by testing
+ different data encodings for each data array and selecting the one, which results in
+ the smallest size.
+
+ Parameters
+ ----------
+ data : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
+ The data to compress.
+ float_tolerance : float, optional
+ The relative error that is accepted when compressing floating point numbers.
+
+ Returns
+ -------
+ compressed_file : BinaryCIFFile or BinaryCIFBlock or BinaryCIFCategory or BinaryCIFColumn or BinaryCIFData
+ The compressed data with the same type as the input data.
+ If no improved compression is found for a :class:`BinaryCIFData` array,
+ the input data is kept.
+ Hence, the return value is no deep copy of the input data.
+
+ Examples
+ --------
+
+ >>> from io import BytesIO
+ >>> pdbx_file = BinaryCIFFile()
+ >>> set_structure(pdbx_file, atom_array_stack)
+ >>> # Write uncompressed file
+ >>> uncompressed_file = BytesIO()
+ >>> pdbx_file.write(uncompressed_file)
+ >>> _ = uncompressed_file.seek(0)
+ >>> print(f"{len(uncompressed_file.read()) // 1000} KB")
+ 927 KB
+ >>> # Write compressed file
+ >>> pdbx_file = compress(pdbx_file)
+ >>> compressed_file = BytesIO()
+ >>> pdbx_file.write(compressed_file)
+ >>> _ = compressed_file.seek(0)
+ >>> print(f"{len(compressed_file.read()) // 1000} KB")
+ 111 KB
+ """
+ match type(data):
+ case bcif.BinaryCIFFile:
+ return _compress_file(data, float_tolerance)
+ case bcif.BinaryCIFBlock:
+ return _compress_block(data, float_tolerance)
+ case bcif.BinaryCIFCategory:
+ return _compress_category(data, float_tolerance)
+ case bcif.BinaryCIFColumn:
+ return _compress_column(data, float_tolerance)
+ case bcif.BinaryCIFData:
+ return _compress_data(data, float_tolerance)
+ case _:
+ raise TypeError(f"Unsupported type {type(data).__name__}")
+
+
+ def _compress_file(bcif_file, float_tolerance):
+ compressed_file = bcif.BinaryCIFFile()
+ for block_name, bcif_block in bcif_file.items():
+ compressed_block = _compress_block(bcif_block, float_tolerance)
+ compressed_file[block_name] = compressed_block
+ return compressed_file
+
+
+ def _compress_block(bcif_block, float_tolerance):
+ compressed_block = bcif.BinaryCIFBlock()
+ for category_name, bcif_category in bcif_block.items():
+ compressed_category = _compress_category(bcif_category, float_tolerance)
+ compressed_block[category_name] = compressed_category
+ return compressed_block
+
+
+ def _compress_category(bcif_category, float_tolerance):
+ compressed_category = bcif.BinaryCIFCategory()
+ for column_name, bcif_column in bcif_category.items():
+ compressed_column = _compress_column(bcif_column, float_tolerance)
+ compressed_category[column_name] = compressed_column
+ return compressed_category
+
+
+ def _compress_column(bcif_column, float_tolerance):
+ data = _compress_data(bcif_column.data, float_tolerance)
+ if bcif_column.mask is not None:
+ mask = _compress_data(bcif_column.mask, float_tolerance)
+ else:
+ mask = None
+ return bcif.BinaryCIFColumn(data, mask)
+
+
+ def _compress_data(bcif_data, float_tolerance):
+ array = bcif_data.array
+ if len(array) == 1:
+ # No need to compress a single value -> Use default uncompressed encoding
+ return bcif.BinaryCIFData(array)
+
+ if np.issubdtype(array.dtype, np.str_):
+ # Leave encoding empty for now, as it is explicitly set later
+ encoding = StringArrayEncoding(data_encoding=[], offset_encoding=[])
+ # Run encode to initialize the data and offset arrays
+ indices = encoding.encode(array)
+ offsets = np.cumsum([0] + [len(s) for s in encoding.strings])
+ encoding.data_encoding, _ = _find_best_integer_compression(indices)
+ encoding.offset_encoding, _ = _find_best_integer_compression(offsets)
+ return bcif.BinaryCIFData(array, [encoding])
+
+ elif np.issubdtype(array.dtype, np.floating):
+ to_integer_encoding = FixedPointEncoding(
+ 10 ** _get_decimal_places(array, float_tolerance)
+ )
+ integer_array = to_integer_encoding.encode(array)
+ best_encoding, size_compressed = _find_best_integer_compression(integer_array)
+ if size_compressed < _data_size_in_file(bcif.BinaryCIFData(array)):
+ return bcif.BinaryCIFData(array, [to_integer_encoding] + best_encoding)
+ else:
+ # The float array is smaller -> encode it directly as bytes
+ return bcif.BinaryCIFData(array, [ByteArrayEncoding()])
+
+ elif np.issubdtype(array.dtype, np.integer):
+ array = _to_smallest_integer_type(array)
+ encodings, _ = _find_best_integer_compression(array)
+ return bcif.BinaryCIFData(array, encodings)
+
+ else:
+ raise TypeError(f"Unsupported data type {array.dtype}")
+
+
+ def _find_best_integer_compression(array):
+ """
+ Try different data encodings on an integer array and return the one that results in
+ the smallest size.
+ """
+ best_encoding_sequence = None
+ smallest_size = np.inf
+
+ for use_delta in [False, True]:
+ if use_delta:
+ encoding = DeltaEncoding()
+ array_after_delta = encoding.encode(array)
+ encodings_after_delta = [encoding]
+ else:
+ encodings_after_delta = []
+ array_after_delta = array
+ for use_run_length in [False, True]:
+ # Use encoded data from previous step to save time
+ if use_run_length:
+ encoding = RunLengthEncoding()
+ array_after_rle = encoding.encode(array_after_delta)
+ encodings_after_rle = encodings_after_delta + [encoding]
+ else:
+ encodings_after_rle = encodings_after_delta
+ array_after_rle = array_after_delta
+ for packed_byte_count in [None, 1, 2]:
+ if packed_byte_count is not None:
+ # Quickly check this heuristic
+ # to avoid computing an exploding packed data array
+ if (
+ _estimate_packed_length(array_after_rle, packed_byte_count)
+ >= array_after_rle.nbytes
+ ):
+ # Packing would not reduce the size
+ continue
+ encoding = IntegerPackingEncoding(packed_byte_count)
+ array_after_packing = encoding.encode(array_after_rle)
+ encodings_after_packing = encodings_after_rle + [encoding]
+ else:
+ encodings_after_packing = encodings_after_rle
+ array_after_packing = array_after_rle
+ encoding = ByteArrayEncoding()
+ encoded_array = encoding.encode(array_after_packing)
+ encodings = encodings_after_packing + [encoding]
+ # Pack data directly instead of using the BinaryCIFData class
+ # to avoid the unnecessary re-encoding of the array,
+ # as it is already available in 'encoded_array'
+ serialized_encoding = [enc.serialize() for enc in encodings]
+ serialized_data = {
+ "data": encoded_array,
+ "encoding": serialized_encoding,
+ }
+ size = _data_size_in_file(serialized_data)
+ if size < smallest_size:
+ best_encoding_sequence = encodings
+ smallest_size = size
+ return best_encoding_sequence, smallest_size
+
+
+ def _estimate_packed_length(array, packed_byte_count):
+ """
+ Estimate the length of an integer array after packing it with a given number of
+ bytes.
+
+ Parameters
+ ----------
+ array : numpy.ndarray
+ The array to pack.
+ packed_byte_count : int
+ The number of bytes used for packing.
+
+ Returns
+ -------
+ length : int
+ The estimated length of the packed array.
+ """
+ # Use int64 to avoid integer overflow in the following line
+ max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
+ n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
+ return np.sum(n_bytes_per_element, dtype=np.int64)
+
+
+ def _to_smallest_integer_type(array):
+ """
+ Convert an integer array to the smallest possible integer type, that is still able
+ to represent all values in the array.
+
+ Parameters
+ ----------
+ array : numpy.ndarray
+ The array to convert.
+
+ Returns
+ -------
+ array : numpy.ndarray
+ The converted array.
+ """
+ if array.min() >= 0:
+ for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
+ if np.all(array <= np.iinfo(dtype).max):
+ return array.astype(dtype)
+ for dtype in [np.int8, np.int16, np.int32, np.int64]:
+ if np.all(array >= np.iinfo(dtype).min) and np.all(
+ array <= np.iinfo(dtype).max
+ ):
+ return array.astype(dtype)
+ raise ValueError("Array is out of bounds for all integer types")
+
+
+ def _data_size_in_file(data):
+ """
+ Get the size of the data, it would have when written into a *BinaryCIF* file.
+
+ Parameters
+ ----------
+ data : BinaryCIFData or dict
+ The data array whose size is measured.
+ Can be either a :class:`BinaryCIFData` object or already serialized data.
+
+ Returns
+ -------
+ size : int
+ The size of the data array in the file in bytes.
+ """
+ if isinstance(data, bcif.BinaryCIFData):
+ data = data.serialize()
+ bytes_in_file = msgpack.packb(data, use_bin_type=True, default=encode_numpy)
+ return len(bytes_in_file)
+
+
+ def _get_decimal_places(array, tol):
+ """
+ Get the number of decimal places in a floating point array.
+
+ Parameters
+ ----------
+ array : numpy.ndarray
+ The array to analyze.
+ tol : float, optional
+ The relative tolerance allowed when the values are cut off after the returned
+ number of decimal places.
+
+ Returns
+ -------
+ decimals : int
+ The number of decimal places.
+ """
+ # Decimals of NaN or infinite values do not make sense
+ # and 0 would give NaN when rounding on decimals
+ array = array[np.isfinite(array) & (array != 0)]
+ for decimals in itertools.count(start=-_order_magnitude(array)):
+ error = np.abs(np.round(array, decimals) - array)
+ if np.all(error < tol * np.abs(array)):
+ return decimals
+
+
+ def _order_magnitude(array):
+ """
+ Get the order of magnitude of floating point values.
+
+ Parameters
+ ----------
+ array : ndarray, dtype=float
+ The value to analyze.
+
+ Returns
+ -------
+ magnitude : int
+ The order of magnitude, i.e. the maximum exponent a number in the array would
+ have in scientific notation, if only one digit is left of the decimal point.
+ """
+ array = array[array != 0]
+ if len(array) == 0:
+ # No non-zero values -> define order of magnitude as 0
+ return 0
+ return int(np.max(np.floor(np.log10(np.abs(array)))).item())
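
The new compress() function simply tries the encoding chains enumerated in _find_best_integer_compression() and keeps whichever serializes smallest; besides whole files it also accepts a single BinaryCIFData array, which makes the effect easy to inspect. A small sketch along those lines (the .encoding attribute used to look at the chosen chain is an assumption about BinaryCIFData that is not shown in this diff):

import numpy as np
from biotite.structure.io.pdbx import BinaryCIFData, compress

# Residue IDs are a typical best case: delta encoding turns the monotonically
# increasing values into small constant differences, which run-length encoding
# then collapses almost entirely
res_ids = np.repeat(np.arange(1, 1001), 8)

data = BinaryCIFData(res_ids)
compressed = compress(data, float_tolerance=1e-6)

# Inspect which encoding chain the size search selected
# (assumed attribute: BinaryCIFData.encoding)
print([type(enc).__name__ for enc in compressed.encoding])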