biotite 1.0.0__cp311-cp311-macosx_11_0_arm64.whl → 1.1.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (92) hide show
  1. biotite/application/dssp/app.py +13 -3
  2. biotite/application/localapp.py +34 -0
  3. biotite/application/muscle/app3.py +2 -15
  4. biotite/application/muscle/app5.py +2 -2
  5. biotite/application/util.py +1 -1
  6. biotite/application/viennarna/rnaplot.py +6 -2
  7. biotite/database/rcsb/query.py +6 -6
  8. biotite/database/uniprot/check.py +20 -15
  9. biotite/database/uniprot/download.py +1 -1
  10. biotite/database/uniprot/query.py +1 -1
  11. biotite/sequence/align/alignment.py +16 -3
  12. biotite/sequence/align/banded.cpython-311-darwin.so +0 -0
  13. biotite/sequence/align/banded.pyx +5 -5
  14. biotite/sequence/align/kmeralphabet.cpython-311-darwin.so +0 -0
  15. biotite/sequence/align/kmeralphabet.pyx +17 -0
  16. biotite/sequence/align/kmersimilarity.cpython-311-darwin.so +0 -0
  17. biotite/sequence/align/kmertable.cpython-311-darwin.so +0 -0
  18. biotite/sequence/align/kmertable.pyx +52 -42
  19. biotite/sequence/align/localgapped.cpython-311-darwin.so +0 -0
  20. biotite/sequence/align/localungapped.cpython-311-darwin.so +0 -0
  21. biotite/sequence/align/matrix.py +273 -55
  22. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  23. biotite/sequence/align/matrix_data/PB.license +21 -0
  24. biotite/sequence/align/matrix_data/PB.mat +18 -0
  25. biotite/sequence/align/multiple.cpython-311-darwin.so +0 -0
  26. biotite/sequence/align/pairwise.cpython-311-darwin.so +0 -0
  27. biotite/sequence/align/permutation.cpython-311-darwin.so +0 -0
  28. biotite/sequence/align/selector.cpython-311-darwin.so +0 -0
  29. biotite/sequence/align/tracetable.cpython-311-darwin.so +0 -0
  30. biotite/sequence/alphabet.py +3 -0
  31. biotite/sequence/codec.cpython-311-darwin.so +0 -0
  32. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  33. biotite/sequence/graphics/color_schemes/pb_flower.json +2 -1
  34. biotite/sequence/graphics/colorschemes.py +44 -11
  35. biotite/sequence/phylo/nj.cpython-311-darwin.so +0 -0
  36. biotite/sequence/phylo/tree.cpython-311-darwin.so +0 -0
  37. biotite/sequence/phylo/upgma.cpython-311-darwin.so +0 -0
  38. biotite/sequence/profile.py +86 -4
  39. biotite/sequence/seqtypes.py +124 -3
  40. biotite/setup_ccd.py +197 -0
  41. biotite/structure/__init__.py +4 -3
  42. biotite/structure/alphabet/__init__.py +25 -0
  43. biotite/structure/alphabet/encoder.py +332 -0
  44. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  45. biotite/structure/alphabet/i3d.py +110 -0
  46. biotite/structure/alphabet/layers.py +86 -0
  47. biotite/structure/alphabet/pb.license +21 -0
  48. biotite/structure/alphabet/pb.py +171 -0
  49. biotite/structure/alphabet/unkerasify.py +122 -0
  50. biotite/structure/atoms.py +156 -43
  51. biotite/structure/bonds.cpython-311-darwin.so +0 -0
  52. biotite/structure/bonds.pyx +72 -21
  53. biotite/structure/celllist.cpython-311-darwin.so +0 -0
  54. biotite/structure/charges.cpython-311-darwin.so +0 -0
  55. biotite/structure/filter.py +1 -1
  56. biotite/structure/geometry.py +60 -113
  57. biotite/structure/info/__init__.py +1 -0
  58. biotite/structure/info/atoms.py +13 -13
  59. biotite/structure/info/bonds.py +12 -6
  60. biotite/structure/info/ccd.py +125 -32
  61. biotite/structure/info/{ccd/components.bcif → components.bcif} +0 -0
  62. biotite/structure/info/groups.py +63 -17
  63. biotite/structure/info/masses.py +9 -6
  64. biotite/structure/info/misc.py +15 -21
  65. biotite/structure/info/standardize.py +3 -2
  66. biotite/structure/io/mol/sdf.py +41 -40
  67. biotite/structure/io/pdb/convert.py +2 -0
  68. biotite/structure/io/pdb/file.py +74 -3
  69. biotite/structure/io/pdb/hybrid36.cpython-311-darwin.so +0 -0
  70. biotite/structure/io/pdbqt/file.py +32 -32
  71. biotite/structure/io/pdbx/__init__.py +1 -0
  72. biotite/structure/io/pdbx/bcif.py +32 -8
  73. biotite/structure/io/pdbx/cif.py +148 -107
  74. biotite/structure/io/pdbx/component.py +9 -4
  75. biotite/structure/io/pdbx/compress.py +321 -0
  76. biotite/structure/io/pdbx/convert.py +227 -68
  77. biotite/structure/io/pdbx/encoding.cpython-311-darwin.so +0 -0
  78. biotite/structure/io/pdbx/encoding.pyx +98 -17
  79. biotite/structure/io/trajfile.py +16 -16
  80. biotite/structure/molecules.py +141 -141
  81. biotite/structure/sasa.cpython-311-darwin.so +0 -0
  82. biotite/structure/segments.py +1 -2
  83. biotite/structure/util.py +73 -1
  84. biotite/version.py +2 -2
  85. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/METADATA +4 -1
  86. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/RECORD +88 -78
  87. biotite/structure/info/ccd/README.rst +0 -8
  88. biotite/structure/info/ccd/amino_acids.txt +0 -1663
  89. biotite/structure/info/ccd/carbohydrates.txt +0 -1135
  90. biotite/structure/info/ccd/nucleotides.txt +0 -798
  91. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/WHEEL +0 -0
  92. {biotite-1.0.0.dist-info → biotite-1.1.0.dist-info}/licenses/LICENSE.rst +0 -0
@@ -6,10 +6,11 @@ __name__ = "biotite.application.dssp"
6
6
  __author__ = "Patrick Kunzmann"
7
7
  __all__ = ["DsspApp"]
8
8
 
9
+ from subprocess import SubprocessError
9
10
  from tempfile import NamedTemporaryFile
10
11
  import numpy as np
11
12
  from biotite.application.application import AppState, requires_state
12
- from biotite.application.localapp import LocalApp, cleanup_tempfile
13
+ from biotite.application.localapp import LocalApp, cleanup_tempfile, get_version
13
14
  from biotite.structure.io.pdbx.cif import CIFFile
14
15
  from biotite.structure.io.pdbx.convert import set_structure
15
16
 
@@ -72,7 +73,13 @@ class DsspApp(LocalApp):
72
73
  self._array.set_annotation(
73
74
  "occupancy", np.ones(self._array.array_length(), dtype=float)
74
75
  )
75
-
76
+ try:
77
+ # The parameters have changed in version 4
78
+ self._new_cli = get_version(bin_path)[0] >= 4
79
+ except SubprocessError:
80
+ # In older versions, no version is returned with `--version`
81
+ # -> a SubprocessError is raised
82
+ self._new_cli = False
76
83
  self._in_file = NamedTemporaryFile("w", suffix=".cif", delete=False)
77
84
  self._out_file = NamedTemporaryFile("r", suffix=".dssp", delete=False)
78
85
 
@@ -81,7 +88,10 @@ class DsspApp(LocalApp):
81
88
  set_structure(in_file, self._array)
82
89
  in_file.write(self._in_file)
83
90
  self._in_file.flush()
84
- self.set_arguments(["-i", self._in_file.name, "-o", self._out_file.name])
91
+ if self._new_cli:
92
+ self.set_arguments([self._in_file.name, self._out_file.name])
93
+ else:
94
+ self.set_arguments(["-i", self._in_file.name, "-o", self._out_file.name])
85
95
  super().run()
86
96
 
87
97
  def evaluate(self):
@@ -8,7 +8,10 @@ __all__ = ["LocalApp"]
8
8
 
9
9
  import abc
10
10
  import copy
11
+ import re
12
+ import subprocess
11
13
  from os import chdir, getcwd, remove
14
+ from pathlib import Path
12
15
  from subprocess import PIPE, Popen, SubprocessError, TimeoutExpired
13
16
  from biotite.application.application import (
14
17
  Application,
@@ -306,3 +309,34 @@ def cleanup_tempfile(temp_file):
306
309
  except FileNotFoundError:
307
310
  # File was already deleted, e.g. due to `TemporaryFile(delete=True)`
308
311
  pass
312
+
313
+
314
+ def get_version(bin_path, version_option="--version"):
315
+ """
316
+ Get the version of a locally installed application.
317
+
318
+ Parameters
319
+ ----------
320
+ bin_path : str or Path
321
+ Path of the application.
322
+ version_option : str, optional
323
+ The command line option to get the version.
324
+
325
+ Returns
326
+ -------
327
+ major, minor : int
328
+ The major and minor version number.
329
+ """
330
+ output = subprocess.run(
331
+ [bin_path, version_option], capture_output=True, text=True
332
+ ).stdout
333
+ # Find matches for version string containing major and minor version
334
+ match = re.search(r"\d+\.\d+", output)
335
+ if match is None:
336
+ raise subprocess.SubprocessError(
337
+ f"Could not determine '{Path(bin_path).name}' version "
338
+ f"from the string '{output}'"
339
+ )
340
+ version_string = match.group(0)
341
+ splitted = version_string.split(".")
342
+ return int(splitted[0]), int(splitted[1])
@@ -7,13 +7,11 @@ __author__ = "Patrick Kunzmann"
7
7
  __all__ = ["MuscleApp"]
8
8
 
9
9
  import numbers
10
- import re
11
- import subprocess
12
10
  import warnings
13
11
  from collections.abc import Sequence
14
12
  from tempfile import NamedTemporaryFile
15
13
  from biotite.application.application import AppState, VersionError, requires_state
16
- from biotite.application.localapp import cleanup_tempfile
14
+ from biotite.application.localapp import cleanup_tempfile, get_version
17
15
  from biotite.application.msaapp import MSAApp
18
16
  from biotite.sequence.phylo.tree import Tree
19
17
 
@@ -54,7 +52,7 @@ class MuscleApp(MSAApp):
54
52
  """
55
53
 
56
54
  def __init__(self, sequences, bin_path="muscle", matrix=None):
57
- major_version = get_version(bin_path)[0]
55
+ major_version = get_version(bin_path, "-version")[0]
58
56
  if major_version != 3:
59
57
  raise VersionError(f"Muscle 3 is required, got version {major_version}")
60
58
 
@@ -227,14 +225,3 @@ class MuscleApp(MSAApp):
227
225
  app.start()
228
226
  app.join()
229
227
  return app.get_alignment()
230
-
231
-
232
- def get_version(bin_path="muscle"):
233
- output = subprocess.run([bin_path, "-version"], capture_output=True, text=True)
234
- # Find matches for version string containing major and minor version
235
- match = re.search(r"\d+\.\d+", output.stdout)
236
- if match is None:
237
- raise subprocess.SubprocessError("Could not determine Muscle version")
238
- version_string = match.group(0)
239
- splitted = version_string.split(".")
240
- return int(splitted[0]), int(splitted[1])
@@ -7,8 +7,8 @@ __author__ = "Patrick Kunzmann"
7
7
  __all__ = ["Muscle5App"]
8
8
 
9
9
  from biotite.application.application import AppState, VersionError, requires_state
10
+ from biotite.application.localapp import get_version
10
11
  from biotite.application.msaapp import MSAApp
11
- from biotite.application.muscle.app3 import get_version
12
12
 
13
13
 
14
14
  class Muscle5App(MSAApp):
@@ -49,7 +49,7 @@ class Muscle5App(MSAApp):
49
49
  """
50
50
 
51
51
  def __init__(self, sequences, bin_path="muscle"):
52
- major_version = get_version(bin_path)[0]
52
+ major_version = get_version(bin_path, "-version")[0]
53
53
  if major_version < 5:
54
54
  raise VersionError(
55
55
  f"At least Muscle 5 is required, got version {major_version}"
@@ -50,7 +50,7 @@ def map_matrix(matrix):
50
50
  # All trailing symbols are filled with zeros
51
51
  old_length = len(matrix.get_alphabet1())
52
52
  new_length = len(ProteinSequence.alphabet)
53
- new_score_matrix = np.zeros((new_length, new_length))
53
+ new_score_matrix = np.zeros((new_length, new_length), dtype=np.int32)
54
54
  new_score_matrix[:old_length, :old_length] = matrix.score_matrix()
55
55
  return SubstitutionMatrix(
56
56
  ProteinSequence.alphabet, ProteinSequence.alphabet, new_score_matrix
@@ -99,8 +99,12 @@ class RNAplotApp(LocalApp):
99
99
  self._in_file.write(self._dot_bracket)
100
100
  self._in_file.flush()
101
101
  self.set_arguments(
102
- ["-i", self._in_file.name, "-o", "xrna", "-t", self._layout_type]
103
- )
102
+ [
103
+ "-i", self._in_file.name,
104
+ "--output-format", "xrna",
105
+ "-t", self._layout_type,
106
+ ]
107
+ ) # fmt: skip
104
108
  super().run()
105
109
 
106
110
  def evaluate(self):
@@ -146,9 +146,9 @@ class BasicQuery(SingleQuery):
146
146
  Examples
147
147
  --------
148
148
 
149
- >>> query = BasicQuery("tc5b")
149
+ >>> query = BasicQuery("Miniprotein Construct")
150
150
  >>> print(sorted(search(query)))
151
- ['1L2Y', '8ANG', '8ANH', '8ANI', '8ANM', '8QWW']
151
+ ['1L2Y']
152
152
  """
153
153
 
154
154
  def __init__(self, term):
@@ -346,9 +346,9 @@ class SequenceQuery(SingleQuery):
346
346
  --------
347
347
 
348
348
  >>> sequence = "NLYIQWLKDGGPSSGRPPPS"
349
- >>> query = SequenceQuery(sequence, scope="protein", min_identity=0.8)
349
+ >>> query = SequenceQuery(sequence, scope="protein", min_identity=0.95)
350
350
  >>> print(sorted(search(query)))
351
- ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2LL5', '2MJ9', '3UC7', '3UC8']
351
+ ['1L2Y', '2LDJ', '9G22', '9G2N', '9G2O', '9G31', '9G32', '9GDL', '9GDN', '9GDT', '9GDU', '9GE1']
352
352
  """
353
353
 
354
354
  def __init__(self, sequence, scope, min_identity=0.0, max_expect_value=10000000.0):
@@ -441,7 +441,7 @@ class StructureQuery(SingleQuery):
441
441
 
442
442
  >>> query = StructureQuery("1L2Y", chain="A")
443
443
  >>> print(sorted(search(query)))
444
- ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS']
444
+ ['1L2Y', '1RIJ', '2JOF', '2LDJ', '2M7D', '7MQS', '9DPF']
445
445
  """
446
446
 
447
447
  def __init__(self, pdb_id, chain=None, assembly=None, strict=True):
@@ -868,7 +868,7 @@ def search(
868
868
  ... query, return_type="polymer_entity", return_groups=True,
869
869
  ... group_by=UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"),
870
870
  ... ))
871
- {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['1EJG_1', '3NIR_1']}
871
+ {'P24297': ['5NW3_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']}
872
872
  """
873
873
  query_dict = _initialize_query_dict(query, return_type, group_by, content_types)
874
874
 
@@ -10,7 +10,7 @@ from biotite.database.error import RequestError
10
10
 
11
11
 
12
12
  # Taken from https://www.uniprot.org/help/api_retrieve_entries
13
- def assert_valid_response(response_status_code):
13
+ def assert_valid_response(response):
14
14
  """
15
15
  Checks whether the response is valid.
16
16
 
@@ -19,17 +19,22 @@ def assert_valid_response(response_status_code):
19
19
  response: requests.Response
20
20
  Response object returned by requests.get.
21
21
  """
22
- if response_status_code == 400:
23
- raise RequestError("Bad request. There is a problem with your input.")
24
- elif response_status_code == 404:
25
- raise RequestError("Not found. The resource you requested doesn't exist.")
26
- elif response_status_code == 410:
27
- raise RequestError("Gone. The resource you requested was removed.")
28
- elif response_status_code == 500:
29
- raise RequestError(
30
- "Internal server error. Most likely a temporary problem, but if the problem persists please contact UniProt team."
31
- )
32
- elif response_status_code == 503:
33
- raise RequestError(
34
- "Service not available. The server is being updated, try again later."
35
- )
22
+ if len(response.content) == 0:
23
+ raise RequestError("No content returned")
24
+ match response.status_code:
25
+ case 400:
26
+ raise RequestError("Bad request. There is a problem with your input.")
27
+ case 404:
28
+ raise RequestError("Not found. The resource you requested doesn't exist.")
29
+ case 410:
30
+ raise RequestError("Gone. The resource you requested was removed.")
31
+ case 500:
32
+ raise RequestError(
33
+ "Internal server error. "
34
+ "Most likely a temporary problem, "
35
+ "but if the problem persists please contact UniProt team."
36
+ )
37
+ case 503:
38
+ raise RequestError(
39
+ "Service not available. The server is being updated, try again later."
40
+ )
@@ -111,7 +111,7 @@ def fetch(ids, format, target_path=None, overwrite=False, verbose=False):
111
111
  if format in ["fasta", "gff", "txt", "xml", "rdf", "tab"]:
112
112
  r = requests.get(_fetch_url + db_name + "/" + id + "." + format)
113
113
  content = r.text
114
- assert_valid_response(r.status_code)
114
+ assert_valid_response(r)
115
115
  else:
116
116
  raise ValueError(f"Format '{format}' is not supported")
117
117
  if file is None:
@@ -289,5 +289,5 @@ def search(query, number=500):
289
289
  params = {"query": str(query), "format": "list", "size": str(number)}
290
290
  r = requests.get(_base_url, params=params)
291
291
  content = r.text
292
- assert_valid_response(r.status_code)
292
+ assert_valid_response(r)
293
293
  return content.split("\n")[:-1]
@@ -9,7 +9,6 @@ import numbers
9
9
  import textwrap
10
10
  from collections.abc import Sequence
11
11
  import numpy as np
12
- from biotite.sequence.alphabet import LetterAlphabet
13
12
 
14
13
  __all__ = [
15
14
  "Alignment",
@@ -111,7 +110,7 @@ class Alignment(object):
111
110
  for i in range(len(self.trace)):
112
111
  j = self.trace[i][seq_index]
113
112
  if j != -1:
114
- seq_str += self.sequences[seq_index][j]
113
+ seq_str += str(self.sequences[seq_index][j])
115
114
  else:
116
115
  seq_str += "-"
117
116
  return seq_str
@@ -133,7 +132,7 @@ class Alignment(object):
133
132
  # has a non-single letter alphabet
134
133
  all_single_letter = True
135
134
  for seq in self.sequences:
136
- if not isinstance(seq.get_alphabet(), LetterAlphabet):
135
+ if not _is_single_letter(seq.alphabet):
137
136
  all_single_letter = False
138
137
  if all_single_letter:
139
138
  # First dimension: sequence number,
@@ -665,3 +664,17 @@ def remove_terminal_gaps(alignment):
665
664
  "no overlap and the resulting alignment would be empty"
666
665
  )
667
666
  return alignment[start:stop]
667
+
668
+
669
+ def _is_single_letter(alphabet):
670
+ """
671
+ More relaxed version of :func:`biotite.sequence.alphabet.is_letter_alphabet()`:
672
+ It is sufficient that the string representation of each symbol is only
673
+ a single character.
674
+ """
675
+ if alphabet.is_letter_alphabet():
676
+ return True
677
+ for symbol in alphabet:
678
+ if len(str(symbol)) != 1:
679
+ return False
680
+ return True
@@ -214,9 +214,6 @@ def align_banded(seq1, seq2, matrix, band, gap_penalty=-10, local=False,
214
214
  else:
215
215
  is_swapped = False
216
216
  lower_diag, upper_diag = min(band), max(band)
217
- band_width = upper_diag - lower_diag + 1
218
- if band_width < 1:
219
- raise ValueError("The width of the band is 0")
220
217
  if len(seq1) + upper_diag <= 0 or lower_diag >= len(seq2):
221
218
  raise ValueError(
222
219
  "Alignment band is out of range, the band allows no overlap "
@@ -226,6 +223,9 @@ def align_banded(seq1, seq2, matrix, band, gap_penalty=-10, local=False,
226
223
  # covers the search space of an unbanded alignment
227
224
  lower_diag = max(lower_diag, -len(seq1)+1)
228
225
  upper_diag = min(upper_diag, len(seq2)-1)
226
+ band_width = upper_diag - lower_diag + 1
227
+ if band_width < 1:
228
+ raise ValueError("The width of the band is 0")
229
229
 
230
230
  # This implementation uses transposed tables in comparison
231
231
  # to the common visualization
@@ -249,12 +249,12 @@ def align_banded(seq1, seq2, matrix, band, gap_penalty=-10, local=False,
249
249
  ###############
250
250
 
251
251
  # A score value that signals that the respective direction in the
252
- # dynamic programming matrix should not be used since, it would be
252
+ # dynamic programming matrix should not be used, since it would be
253
253
  # outside the band
254
254
  # It is the 'worst' score available, so the trace table will never
255
255
  # include such a direction
256
256
  neg_inf = np.iinfo(np.int32).min
257
- # Correct the 'negative infinity' integer, by making it more positve
257
+ # Correct the 'negative infinity' integer, by making it more positive
258
258
  # This prevents an integer underflow when the gap penalty or
259
259
  # match score is added to this value
260
260
  neg_inf -= min(gap_penalty) if affine_penalty else gap_penalty
@@ -568,6 +568,23 @@ class KmerAlphabet(Alphabet):
568
568
  return int(len(self._base_alph) ** self._k)
569
569
 
570
570
 
571
+ def __iter__(self):
572
+ # Creating all symbols is expensive
573
+ # -> Use a generator instead
574
+ if isinstance(self._base_alph, LetterAlphabet):
575
+ return ("".join(self.decode(code)) for code in range(len(self)))
576
+ else:
577
+ return (list(self.decode(code)) for code in range(len(self)))
578
+
579
+
580
+ def __contains__(self, symbol):
581
+ try:
582
+ self.fuse(self._base_alph.encode_multiple(symbol))
583
+ return True
584
+ except AlphabetError:
585
+ return False
586
+
587
+
571
588
  def _to_array_form(model_string):
572
589
  """
573
590
  Convert the common string representation of a *k-mer* spacing
@@ -1384,8 +1384,7 @@ cdef class KmerTable:
1384
1384
 
1385
1385
 
1386
1386
  def __getstate__(self):
1387
- relevant_kmers = self.get_kmers()
1388
- return _pickle_c_arrays(self._ptr_array, relevant_kmers)
1387
+ return _pickle_c_arrays(self._ptr_array)
1389
1388
 
1390
1389
 
1391
1390
  def __setstate__(self, state):
@@ -2836,12 +2835,7 @@ cdef class BucketKmerTable:
2836
2835
 
2837
2836
 
2838
2837
  def __getstate__(self):
2839
- cdef int64[:] relevant_buckets = np.where(
2840
- np.asarray(self._ptr_array) != 0
2841
- )[0]
2842
- return _pickle_c_arrays(self._ptr_array, relevant_buckets)
2843
-
2844
-
2838
+ return _pickle_c_arrays(self._ptr_array)
2845
2839
 
2846
2840
  def __setstate__(self, state):
2847
2841
  _unpickle_c_arrays(self._ptr_array, state)
@@ -3097,27 +3091,44 @@ def _append_entries(ptr[:] trg_ptr_array, ptr[:] src_ptr_array):
3097
3091
 
3098
3092
  @cython.boundscheck(False)
3099
3093
  @cython.wraparound(False)
3100
- def _pickle_c_arrays(ptr[:] ptr_array, int64[:] relevant_buckets):
3094
+ def _pickle_c_arrays(ptr[:] ptr_array):
3101
3095
  """
3102
- Pickle the `relevant_buckets` (i.e. the buckets that actualy point
3103
- to an array) of the `ptr_array` into a list of bytes.
3096
+ Pickle the C arrays into a single concatenated :class:`ndarray`.
3097
+ The length of each C-array in this concatenated array is saved as well.
3104
3098
  """
3105
- cdef int64 i
3106
- cdef int64 bucket
3099
+ cdef int64 pointer_i, bucket_i, concat_i
3107
3100
  cdef int64 length
3108
3101
  cdef uint32* bucket_ptr
3109
3102
 
3110
- cdef list pickled_arrays = [b""] * relevant_buckets.shape[0]
3111
-
3112
- for i in range(relevant_buckets.shape[0]):
3113
- bucket = relevant_buckets[i]
3114
- bucket_ptr = <uint32*>ptr_array[bucket]
3115
- length = (<int64*>bucket_ptr)[0]
3116
- # Get directly the bytes coding for each C-array
3117
- pickled_arrays[i] \
3118
- = <bytes>(<char*>bucket_ptr)[:sizeof(uint32) * length]
3103
+ # First pass: Count the total concatenated size
3104
+ cdef int64 total_length = 0
3105
+ for pointer_i in range(ptr_array.shape[0]):
3106
+ bucket_ptr = <uint32*>ptr_array[pointer_i]
3107
+ if bucket_ptr != NULL:
3108
+ # The first element of the C-array is the length
3109
+ # of the array
3110
+ total_length += (<int64*>bucket_ptr)[0]
3111
+
3112
+ # Second pass: Copy the C-arrays into a single concatenated array
3113
+ # and track the start position of each C-array
3114
+ cdef uint32[:] concatenated_array = np.empty(total_length, dtype=np.uint32)
3115
+ cdef int64[:] lengths = np.empty(ptr_array.shape[0], dtype=np.int64)
3116
+ concat_i = 0
3117
+ for pointer_i in range(ptr_array.shape[0]):
3118
+ bucket_ptr = <uint32*>ptr_array[pointer_i]
3119
+ if bucket_ptr != NULL:
3120
+ length = (<int64*>bucket_ptr)[0]
3121
+ lengths[pointer_i] = length
3122
+ memcpy(
3123
+ &concatenated_array[concat_i],
3124
+ bucket_ptr,
3125
+ length * sizeof(uint32),
3126
+ )
3127
+ concat_i += length
3128
+ else:
3129
+ lengths[pointer_i] = 0
3119
3130
 
3120
- return np.asarray(relevant_buckets), pickled_arrays
3131
+ return np.asarray(concatenated_array), np.asarray(lengths)
3121
3132
 
3122
3133
 
3123
3134
  @cython.boundscheck(False)
@@ -3126,28 +3137,27 @@ def _unpickle_c_arrays(ptr[:] ptr_array, state):
3126
3137
  """
3127
3138
  Unpickle the pickled `state` into the given `ptr_array`.
3128
3139
  """
3129
- cdef int64 i
3130
- cdef int64 bucket
3131
- cdef int64 byte_length
3140
+ cdef int64 pointer_i, concat_i
3141
+ cdef int64 length
3132
3142
  cdef uint32* bucket_ptr
3133
- cdef bytes pickled_bytes
3134
-
3135
- cdef int64[:] relevant_buckets = state[0]
3136
- cdef list pickled_pointers = state[1]
3137
-
3138
- for i in range(relevant_buckets.shape[0]):
3139
- bucket = relevant_buckets[i]
3140
- if bucket < 0 or bucket >= ptr_array.shape[0]:
3141
- raise ValueError("Invalid bucket found while unpickling")
3142
- pickled_bytes = pickled_pointers[i]
3143
- byte_length = len(pickled_bytes)
3144
- if byte_length != 0:
3145
- bucket_ptr = <uint32*>malloc(byte_length)
3143
+
3144
+ cdef uint32[:] concatenated_array = state[0]
3145
+ cdef int64[:] lengths = state[1]
3146
+
3147
+ concat_i = 0
3148
+ for pointer_i in range(ptr_array.shape[0]):
3149
+ length = lengths[pointer_i]
3150
+ if length != 0:
3151
+ bucket_ptr = <uint32*>malloc(length * sizeof(uint32))
3146
3152
  if not bucket_ptr:
3147
3153
  raise MemoryError
3148
- # Convert bytes back into C-array
3149
- memcpy(bucket_ptr, <char*>pickled_bytes, byte_length)
3150
- ptr_array[bucket] = <ptr>bucket_ptr
3154
+ memcpy(
3155
+ bucket_ptr,
3156
+ &concatenated_array[concat_i],
3157
+ length * sizeof(uint32),
3158
+ )
3159
+ concat_i += length
3160
+ ptr_array[pointer_i] = <ptr>bucket_ptr
3151
3161
 
3152
3162
 
3153
3163
  cdef inline void _deallocate_ptrs(ptr[:] ptrs):