rechunkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+ junit/
50
+ junit.xml
51
+ test.db
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+
61
+ # Flask stuff:
62
+ instance/
63
+ .webassets-cache
64
+
65
+ # Scrapy stuff:
66
+ .scrapy
67
+
68
+ # Sphinx documentation
69
+ docs/_build/
70
+
71
+ # PyBuilder
72
+ target/
73
+
74
+ # Jupyter Notebook
75
+ .ipynb_checkpoints
76
+
77
+ # pyenv
78
+ .python-version
79
+
80
+ # celery beat schedule file
81
+ celerybeat-schedule
82
+
83
+ # SageMath parsed files
84
+ *.sage.py
85
+
86
+ # dotenv
87
+ .env
88
+
89
+ # virtualenv
90
+ .venv
91
+ venv/
92
+ ENV/
93
+ .ruff*
94
+
95
+ # Spyder project settings
96
+ .spyderproject
97
+ .spyproject
98
+
99
+ # Rope project settings
100
+ .ropeproject
101
+
102
+ # mkdocs documentation
103
+ /site
104
+
105
+ # mypy
106
+ .mypy_cache/
107
+
108
+ # .vscode
109
+ .vscode/
110
+
111
+ # OS files
112
+ .DS_Store
113
+
114
+ # Temp data
115
+ data/*
116
+
117
+ # Test config files
118
+ /rechunkit/tests/*.toml
119
+ /rechunkit/tests/*.yml
@@ -0,0 +1,16 @@
1
+ Apache Software License 2.0
2
+
3
+ Copyright (c) 2025, Mike Kittridge
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: rechunkit
3
+ Version: 0.1.0
4
+ Summary: Functions to efficiently rechunk multidimensional arrays
5
+ Project-URL: Documentation, https://mullenkamp.github.io/rechunkit/
6
+ Project-URL: Source, https://github.com/mullenkamp/rechunkit
7
+ Author-email: mullenkamp <mullenkamp1@gmail.com>
8
+ License-File: LICENSE
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Requires-Python: >=3.9
11
+ Requires-Dist: numpy>=1.26
12
+ Description-Content-Type: text/markdown
13
+
14
+ # rechunkit
15
+
16
+ <p align="center">
17
+ <em>Functions to efficiently rechunk multidimensional arrays</em>
18
+ </p>
19
+
20
+ [![build](https://github.com/mullenkamp/rechunkit/workflows/Build/badge.svg)](https://github.com/mullenkamp/rechunkit/actions)
21
+ [![codecov](https://codecov.io/gh/mullenkamp/rechunkit/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/rechunkit)
22
+ [![PyPI version](https://badge.fury.io/py/rechunkit.svg)](https://badge.fury.io/py/rechunkit)
23
+
24
+ ---
25
+
26
+ **Source Code**: <a href="https://github.com/mullenkamp/rechunkit" target="_blank">https://github.com/mullenkamp/rechunkit</a>
27
+
28
+ ---
29
+ ## Introduction
30
+ Rechunkit is a set of functions to allow efficient rechunking of multidimensional arrays that have been stored as chunks of numpy ndarrays. It allows for rechunking on-the-fly via python generators instead of requiring the user to save the full target array. It also contains several other handy tools for assisting the user as part of the rechunking process (e.g. estimating an optimal or ideal chunking size, iterating over chunks with a range-type function, etc).
31
+
32
+
33
+ ## Installation
34
+ ```
35
+ pip install rechunkit
36
+ ```
37
+ I can add it to conda-forge if there is demand.
38
+
39
+ ## Usage
40
+ Import the necessary modules and assign some parameters for the examples:
41
+
42
+ ```python
43
+ import numpy as np
+ from math import prod
+ from rechunkit import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
44
+
45
+ source_shape = (31, 31, 31)
46
+ shape = source_shape
47
+
48
+ sel = (slice(3, 21), slice(11, 27), slice(7, 17))
49
+
50
+ source_chunk_shape = (5, 2, 4)
51
+ target_chunk_shape = (4, 5, 3)
52
+ max_mem = 2000 # smaller than the ideal chunk size
53
+
54
+ dtype = np.dtype('int32')
55
+ ```
56
+
57
+ ### Preprocessing tools
58
+ We have defined our target_chunk_shape above, but rechunkit has a function to guess a good chunk shape given a user-defined amount of memory per chunk:
59
+
60
+ ```python
61
+ new_chunk_shape = guess_chunk_shape(source_shape, dtype, 400)
62
+ ```
63
+
64
+ Chunks will be assigned to the highest composite number within the target_chunk_size. Using composite numbers will benefit the rechunking process as there is a very high likelihood that the least common multiple (LCM) of two composite numbers will be significantly lower than the product of those two numbers. The LCM is used to determine the ideal chunk size for the rechunking process.
65
+
66
+ Speaking of the ideal chunk size, we can determine the ideal chunk shape and size via a couple functions:
67
+
68
+ ```python
69
+ ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape) # (20, 10, 12)
70
+
71
+ ideal_read_chunk_size = calc_ideal_read_chunk_mem(ideal_read_chunk_shape, dtype.itemsize) # 9600 bytes
72
+ ```
73
+
74
+ If the ideal_read_chunk_size can comfortably fit in your memory, then you should use this value. Using the ideal chunk size will mean that you will only need to read all chunks in the source once. If the chunk size (called max_mem in the functions) is less than the ideal, then some chunks will need to be read multiple times.
75
+
76
+ To see how many reads are required if no optimization is performed during rechunking (i.e. every target chunk must iterate over every associated source chunk), you can use the calc_n_reads_simple function and compare it to the total number of chunks in the source:
77
+
78
+ ```python
79
+ n_chunks_source = calc_n_chunks(source_shape, source_chunk_shape) # 896
80
+ n_chunks_target = calc_n_chunks(source_shape, target_chunk_shape) # 616
81
+
82
+ n_reads_simple = calc_n_reads_simple(source_shape, source_chunk_shape, target_chunk_shape) # 3952
83
+ ```
84
+
85
+ Using the simple brute force method requires one chunk to be read 4.4 times on average.
86
+
87
+ There's also a function to check the number of reads (and writes) using the optimized algorithm:
88
+
89
+ ```python
90
+ n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem) # 2044, 616
91
+ ```
92
+
93
+ In this case, we only require one chunk to be read 2.28 times on average. The more max_mem you give to the rechunker, the fewer reads per chunk are required (to a minimum of 1 in the ideal case).
94
+
95
+
96
+ ### Rechunking
97
+ We need a source dataset to get data from. Rechunkit requires that the source input is a function/method that has a single parameter input of a tuple of slices. The slices contain the start and stop of the chunk to be read in the source.
98
+
99
+ For example, we can simply use a numpy array and its `__getitem__` method as the source:
100
+
101
+ ```python
102
+ source_data = np.arange(1, prod(source_shape) + 1, dtype=dtype).reshape(source_shape)
103
+ source = source_data.__getitem__
104
+ ```
105
+
106
+ And again as a simple example, we can use a numpy array as the target:
107
+
108
+ ```python
109
+ target = np.zeros(source_shape, dtype=dtype)
110
+ ```
111
+
112
+ We don't necessarily need the target as an array to be filled, because the rechunker function returns a generator that can be iterated over. The generator returns a tuple of slices (representing the target chunk) and the associated numpy array data:
113
+
114
+ ```python
115
+ for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem):
116
+ target[write_chunk] = data
117
+
118
+ assert np.all(source(()) == target) # Should pass!
119
+ ```
120
+
121
+ #### Subsets of the source
122
+ There are many use-cases where you don't want the entire dataset. Rather you want a subset of the dataset, but you also want the subset rechunked. The rechunker function has a `sel` parameter which needs to be a tuple of slices of the number of dimensions.
123
+
124
+ ```python
125
+ n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel) # 288, 80
126
+
127
+ target = np.zeros(source_shape, dtype=dtype)[sel]
128
+
129
+ for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel):
130
+ target[write_chunk] = data
131
+
132
+ assert np.all(source(sel) == target) # Should pass!
133
+ ```
134
+
135
+
136
+ ## License
137
+
138
+ This project is licensed under the terms of the Apache Software License 2.0.
@@ -0,0 +1,125 @@
1
+ # rechunkit
2
+
3
+ <p align="center">
4
+ <em>Functions to efficiently rechunk multidimensional arrays</em>
5
+ </p>
6
+
7
+ [![build](https://github.com/mullenkamp/rechunkit/workflows/Build/badge.svg)](https://github.com/mullenkamp/rechunkit/actions)
8
+ [![codecov](https://codecov.io/gh/mullenkamp/rechunkit/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/rechunkit)
9
+ [![PyPI version](https://badge.fury.io/py/rechunkit.svg)](https://badge.fury.io/py/rechunkit)
10
+
11
+ ---
12
+
13
+ **Source Code**: <a href="https://github.com/mullenkamp/rechunkit" target="_blank">https://github.com/mullenkamp/rechunkit</a>
14
+
15
+ ---
16
+ ## Introduction
17
+ Rechunkit is a set of functions to allow efficient rechunking of multidimensional arrays that have been stored as chunks of numpy ndarrays. It allows for rechunking on-the-fly via python generators instead of requiring the user to save the full target array. It also contains several other handy tools for assisting the user as part of the rechunking process (e.g. estimating an optimal or ideal chunking size, iterating over chunks with a range-type function, etc).
18
+
19
+
20
+ ## Installation
21
+ ```
22
+ pip install rechunkit
23
+ ```
24
+ I can add it to conda-forge if there is demand.
25
+
26
+ ## Usage
27
+ Import the necessary modules and assign some parameters for the examples:
28
+
29
+ ```python
30
+ import numpy as np
+ from math import prod
+ from rechunkit import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
31
+
32
+ source_shape = (31, 31, 31)
33
+ shape = source_shape
34
+
35
+ sel = (slice(3, 21), slice(11, 27), slice(7, 17))
36
+
37
+ source_chunk_shape = (5, 2, 4)
38
+ target_chunk_shape = (4, 5, 3)
39
+ max_mem = 2000 # smaller than the ideal chunk size
40
+
41
+ dtype = np.dtype('int32')
42
+ ```
43
+
44
+ ### Preprocessing tools
45
+ We have defined our target_chunk_shape above, but rechunkit has a function to guess a good chunk shape given a user-defined amount of memory per chunk:
46
+
47
+ ```python
48
+ new_chunk_shape = guess_chunk_shape(source_shape, dtype, 400)
49
+ ```
50
+
51
+ Chunks will be assigned to the highest composite number within the target_chunk_size. Using composite numbers will benefit the rechunking process as there is a very high likelihood that the least common multiple (LCM) of two composite numbers will be significantly lower than the product of those two numbers. The LCM is used to determine the ideal chunk size for the rechunking process.
52
+
53
+ Speaking of the ideal chunk size, we can determine the ideal chunk shape and size via a couple functions:
54
+
55
+ ```python
56
+ ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape) # (20, 10, 12)
57
+
58
+ ideal_read_chunk_size = calc_ideal_read_chunk_mem(ideal_read_chunk_shape, dtype.itemsize) # 9600 bytes
59
+ ```
60
+
61
+ If the ideal_read_chunk_size can comfortably fit in your memory, then you should use this value. Using the ideal chunk size will mean that you will only need to read all chunks in the source once. If the chunk size (called max_mem in the functions) is less than the ideal, then some chunks will need to be read multiple times.
62
+
63
+ To see how many reads are required if no optimization is performed during rechunking (i.e. every target chunk must iterate over every associated source chunk), you can use the calc_n_reads_simple function and compare it to the total number of chunks in the source:
64
+
65
+ ```python
66
+ n_chunks_source = calc_n_chunks(source_shape, source_chunk_shape) # 896
67
+ n_chunks_target = calc_n_chunks(source_shape, target_chunk_shape) # 616
68
+
69
+ n_reads_simple = calc_n_reads_simple(source_shape, source_chunk_shape, target_chunk_shape) # 3952
70
+ ```
71
+
72
+ Using the simple brute force method requires one chunk to be read 4.4 times on average.
73
+
74
+ There's also a function to check the number of reads (and writes) using the optimized algorithm:
75
+
76
+ ```python
77
+ n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem) # 2044, 616
78
+ ```
79
+
80
+ In this case, we only require one chunk to be read 2.28 times on average. The more max_mem you give to the rechunker, the less reads per chunk is required (to a minium of 1 in the ideal case).
81
+
82
+
83
+ ### Rechunking
84
+ We need a source dataset to get data from. Rechunkit requires that the source input is a function/method that has a single parameter input of a tuple of slices. The slices contain the start and stop of the chunk to be read in the source.
85
+
86
+ For example, we can simply use a numpy array and its `__getitem__` method as the source:
87
+
88
+ ```python
89
+ source_data = np.arange(1, prod(source_shape) + 1, dtype=dtype).reshape(source_shape)
90
+ source = source_data.__getitem__
91
+ ```
92
+
93
+ And again as a simple example, we can use a numpy array as the target:
94
+
95
+ ```python
96
+ target = np.zeros(source_shape, dtype=dtype)
97
+ ```
98
+
99
+ We don't necessarily need the target as an array to be filled, because the rechunker function returns a generator that can be iterated over. The generator returns a tuple of slices (representing the target chunk) and the associated numpy array data:
100
+
101
+ ```python
102
+ for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem):
103
+ target[write_chunk] = data
104
+
105
+ assert np.all(source(()) == target) # Should pass!
106
+ ```
107
+
108
+ #### Subsets of the source
109
+ There are many use-cases where you don't want the entire dataset. Rather you want a subset of the dataset, but you also want the subset rechunked. The rechunker function has a `sel` parameter which needs to be a tuple of slices of the number of dimensions.
110
+
111
+ ```python
112
+ n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel) # 288, 80
113
+
114
+ target = np.zeros(source_shape, dtype=dtype)[sel]
115
+
116
+ for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel):
117
+ target[write_chunk] = data
118
+
119
+ assert np.all(source(sel) == target) # Should pass!
120
+ ```
121
+
122
+
123
+ ## License
124
+
125
+ This project is licensed under the terms of the Apache Software License 2.0.
@@ -0,0 +1,187 @@
1
+ [project]
2
+ name = "rechunkit"
3
+ authors = [
4
+ { name = "mullenkamp", email = "mullenkamp1@gmail.com" }
5
+ ]
6
+ description = "Functions to efficiently rechunk multidimensional arrays"
7
+ readme = "README.md"
8
+ dynamic = ["version"]
9
+ classifiers = [
10
+ "Programming Language :: Python :: 3 :: Only",
11
+ ]
12
+ requires-python = ">=3.9"
13
+ dependencies = [
14
+ 'numpy>=1.26'
15
+ ]
16
+
17
+ [dependency-groups]
18
+ dev = [
19
+ "spyder-kernels==2.5.2",
20
+ "black",
21
+ "mypy",
22
+ "ruff",
23
+ "pytest",
24
+ "pytest-cov",
25
+ ]
26
+
27
+ [tool.hatch]
28
+
29
+ [tool.hatch.metadata]
30
+ allow-direct-references = true
31
+
32
+ # [tool.hatch.version]
33
+ # source = "regex_commit"
34
+ # commit_extra_args = ["-e"]
35
+ # path = "rechunkit/__init__.py"
36
+
37
+ [tool.hatch.envs.default]
38
+ python = "3.11"
39
+ dependencies = [
40
+ "spyder-kernels==2.5.2",
41
+ "black",
42
+ "mypy",
43
+ "ruff",
44
+ "pytest",
45
+ "pytest-cov",
46
+ "mkdocs-material",
47
+ "mkdocstrings[python]",
48
+
49
+ ]
50
+
51
+ [[tool.hatch.envs.all.matrix]]
52
+ python = ['3.10', '3.11', '3.12']
53
+
54
+ [tool.hatch.envs.lint]
55
+ detached = true
56
+ dependencies = [
57
+ "black>=23.1.0",
58
+ "mypy>=1.0.0",
59
+ "ruff>=0.0.243",
60
+ ]
61
+ [tool.hatch.envs.lint.scripts]
62
+ typing = "mypy --install-types --non-interactive {args:src/rechunkit tests}"
63
+ style = [
64
+ "ruff {args:.}",
65
+ "black --check --diff {args:.}",
66
+ ]
67
+ fmt = [
68
+ "black {args:.}",
69
+ "ruff --fix {args:.}",
70
+ "style",
71
+ ]
72
+ all = [
73
+ "style",
74
+ "typing",
75
+ ]
76
+
77
+ [tool.hatch.envs.default.scripts]
78
+ test = "pytest {args:tests}"
79
+ test-cov = "coverage run -m pytest {args:tests}"
80
+ cov-report = [
81
+ "- coverage combine",
82
+ "coverage report",
83
+ ]
84
+ cov = [
85
+ "test-cov",
86
+ "cov-report",
87
+ ]
88
+ docs-serve = "mkdocs serve"
89
+ docs-build = "mkdocs build"
90
+
91
+ [tool.black]
92
+ target-version = ["py37"]
93
+ line-length = 120
94
+ skip-string-normalization = true
95
+
96
+ [tool.ruff]
97
+ target-version = "py311"
98
+ line-length = 120
99
+ select = [
100
+ "A",
101
+ "ARG",
102
+ "B",
103
+ "C",
104
+ "DTZ",
105
+ "E",
106
+ "EM",
107
+ "F",
108
+ "FBT",
109
+ "I",
110
+ "ICN",
111
+ "ISC",
112
+ "N",
113
+ "PLC",
114
+ "PLE",
115
+ "PLR",
116
+ "PLW",
117
+ "Q",
118
+ "RUF",
119
+ "S",
120
+ "T",
121
+ "TID",
122
+ "UP",
123
+ "W",
124
+ "YTT",
125
+ ]
126
+ ignore = [
127
+ # Allow non-abstract empty methods in abstract base classes
128
+ "B027",
129
+ # Allow boolean positional values in function calls, like `dict.get(... True)`
130
+ "FBT003",
131
+ # Ignore checks for possible passwords
132
+ "S105", "S106", "S107",
133
+ # Ignore complexity
134
+ "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
135
+ ]
136
+ unfixable = [
137
+ # Don't touch unused imports
138
+ "F401",
139
+ ]
140
+
141
+ [tool.ruff.isort]
142
+ known-first-party = ["rechunkit"]
143
+
144
+ [tool.ruff.flake8-tidy-imports]
145
+ ban-relative-imports = "all"
146
+
147
+ [tool.ruff.per-file-ignores]
148
+ # Tests can use magic values, assertions, and relative imports
149
+ "tests/**/*" = ["PLR2004", "S101", "TID252"]
150
+
151
+ [tool.coverage.run]
152
+ source_pkgs = ["rechunkit", "tests"]
153
+ branch = true
154
+ parallel = true
155
+ omit = [
156
+ "src/rechunkit/__about__.py",
157
+ ]
158
+
159
+ [tool.coverage.paths]
160
+ rechunkit = ["src/rechunkit", "*/rechunkit/src/rechunkit"]
161
+ tests = ["tests", "*/rechunkit/tests"]
162
+
163
+ [tool.coverage.report]
164
+ exclude_lines = [
165
+ "no cov",
166
+ "if __name__ == .__main__.:",
167
+ "if TYPE_CHECKING:",
168
+ ]
169
+
170
+ [build-system]
171
+ requires = ["hatchling>=1.26.1"]
172
+ build-backend = "hatchling.build"
173
+
174
+ [project.urls]
175
+ Documentation = "https://mullenkamp.github.io/rechunkit/"
176
+ Source = "https://github.com/mullenkamp/rechunkit"
177
+
178
+ [tool.hatch.build.targets.sdist]
179
+ include = [
180
+ "/rechunkit",
181
+ ]
182
+ exclude = [
183
+ "/rechunkit/tests/*",
184
+ ]
185
+
186
+ [tool.hatch.version]
187
+ path = "rechunkit/__init__.py"
@@ -0,0 +1,5 @@
1
+ """Functions to efficiently rechunk multidimensional arrays"""
2
+ from rechunkit.main import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
3
+
4
+
5
+ __version__ = '0.1.0'
@@ -0,0 +1,557 @@
1
+ """Core rechunking algorithm stuff."""
2
+ # import copy
3
+ from typing import List, Optional, Sequence, Tuple, Iterator, Generator
4
+ import numpy as np
5
+ import itertools
6
+ # from time import time
7
+ from math import prod, lcm, ceil
8
+ from collections import Counter, deque
9
+ from itertools import count
10
+ from bisect import bisect
11
+
12
########################################################
### Parameters

# Ascending table of composite numbers used to snap chunk dimensions.
# Composite chunk sizes keep the least common multiple (LCM) of a source and
# target chunk dimension small, which shrinks the ideal read chunk used by
# the rechunking algorithm.
composite_numbers = (1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280, 720720, 1081080, 1441440, 2162160)

########################################################
### Functions


def guess_chunk_shape(shape: Tuple[int, ...], dtype: np.dtype, target_chunk_size: int = 2**21) -> Tuple[int, ...]:
    """
    Guess an appropriate chunk layout for a dataset, given its shape and
    the size of each element in bytes. Will allocate chunks only as large
    as target_chunk_size. Chunks will be assigned to the highest composite
    number within the target_chunk_size. Using composite numbers will benefit
    the rechunking process as there is a very high likelihood that the least
    common multiple of two composite numbers will be significantly lower than
    the product of those two numbers.

    Parameters
    ----------
    shape: tuple of ints
        Shape of the array.
    dtype: np.dtype or str
        The dtype of the array.
    target_chunk_size: int
        The maximum size per chunk in bytes.

    Returns
    -------
    tuple of ints
        Shape of the chunk. Returns None for a 0-dimensional (empty) shape.

    Raises
    ------
    TypeError
        If any value in shape is not an int.
    ValueError
        If shape contains non-finite values.
    """
    ndims = len(shape)

    if ndims == 0:
        # Preserved behavior: no dimensions means no chunk shape.
        return None

    if not all(isinstance(v, int) for v in shape):
        raise TypeError('All values in the shape must be ints.')

    chunks = np.array(shape, dtype='=f8')
    if not np.all(np.isfinite(chunks)):
        raise ValueError("Illegal value in chunk tuple")

    typesize = np.dtype(dtype).itemsize

    # Halve one dimension at a time (round-robin) until the chunk byte size
    # drops below target_chunk_size, or lands within 50% of it.
    idx = 0
    while True:
        chunk_bytes = prod(chunks) * typesize

        if (chunk_bytes < target_chunk_size or
                abs(chunk_bytes - target_chunk_size) / target_chunk_size < 0.5):
            break

        if prod(chunks) == 1:
            break  # Cannot shrink any further.

        chunks[idx % ndims] = ceil(chunks[idx % ndims] / 2.0)
        idx += 1

    # Snap each dimension down to the largest composite number that fits.
    # max(..., 0) fixes an index-underflow bug: a zero-length dimension made
    # bisect(...) - 1 == -1, which wrapped around to composite_numbers[-1]
    # and produced an enormous chunk dimension.
    return tuple(
        composite_numbers[max(bisect(composite_numbers, int(c)) - 1, 0)]
        for c in chunks
    )
72
+
73
+
74
def get_slice_min_max(read_slices, write_slices):
    """
    Intersect two tuples of slices dimension by dimension: keep the larger
    of the two starts and the smaller of the two stops.
    """
    clipped = []
    for rd, wr in zip(read_slices, write_slices):
        clipped.append(slice(max(rd.start, wr.start), min(rd.stop, wr.stop)))

    return tuple(clipped)
81
+
82
+
83
def chunk_range(
    chunk_start: Tuple[int, ...], chunk_stop: Tuple[int, ...], chunk_step: Tuple[int, ...], include_partial_chunks=True, clip_ends=True,
) -> Iterator[Tuple[slice, ...]]:
    """
    Generator like the Python range function, but for multiple dimensions;
    yields tuples of slices.

    Parameters
    ----------
    chunk_start: tuple of int
        The start positions of the chunks. A non-tuple value is treated as
        all zeros.
    chunk_stop: tuple of int
        The stop positions of the chunks.
    chunk_step: tuple of int
        The chunking step.
    include_partial_chunks: bool
        Should partial chunks be included? True by default.
    clip_ends: bool
        Only applies when include_partial_chunks == True. Should the chunks
        be clipped to the overall extents? True by default.

    Returns
    -------
    Generator with tuples of slices
    """
    if not isinstance(chunk_start, tuple):
        chunk_start = (0,) * len(chunk_stop)

    # First chunk boundary per dimension: the boundary at/below the start
    # when partial chunks are wanted, otherwise the first boundary at/above.
    if include_partial_chunks:
        first_edges = [step * (start // step) for step, start in zip(chunk_step, chunk_start)]
    else:
        first_edges = [step * ((start - 1) // step + 1) for step, start in zip(chunk_step, chunk_start)]

    axis_ranges = [range(edge, stop, step) for stop, step, edge in zip(chunk_stop, chunk_step, first_edges)]

    for origin in itertools.product(*axis_ranges):
        is_full = True
        slices = []
        for pos, stop_limit, step, start_limit in zip(origin, chunk_stop, chunk_step, chunk_start):
            begin = pos
            end = pos + step

            if end > stop_limit:
                # Chunk extends past the overall stop -> partial chunk.
                if clip_ends:
                    end = stop_limit
                is_full = False

            if begin < start_limit:
                # Chunk begins before the overall start -> partial chunk.
                if clip_ends:
                    begin = start_limit
                is_full = False

            slices.append(slice(begin, end))

        if is_full or include_partial_chunks:
            yield tuple(slices)
137
+
138
+
139
def calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape):
    """
    Calculates the minimum ideal read chunk shape between a source and target:
    the per-dimension least common multiple of the two chunk shapes.
    """
    shape = []
    for src, tgt in zip(source_chunk_shape, target_chunk_shape):
        shape.append(lcm(src, tgt))

    return tuple(shape)
144
+
145
+
146
def calc_ideal_read_chunk_mem(ideal_read_chunk_shape, itemsize):
    """
    Calculates the minimum ideal read chunk memory (in bytes) between a
    source and target: number of cells times the itemsize.
    """
    n_cells = prod(ideal_read_chunk_shape)

    return int(n_cells * itemsize)
151
+
152
+
153
def calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem):
    """
    Calculates the optimum read chunk shape given a maximum amount of available memory.

    Parameters
    ----------
    source_chunk_shape: tuple of int
        The source chunk shape
    target_chunk_shape: tuple of int
        The target chunk shape
    itemsize: int
        The byte length of the data type.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes.

    Returns
    -------
    optimal chunk shape: tuple of ints
    """
    max_cells = max_mem // itemsize
    ndims = len(source_chunk_shape)

    if ndims != len(target_chunk_shape):
        raise ValueError('The source_chunk_shape and target_chunk_shape do not have the same number of dims.')

    source_cells = prod(source_chunk_shape)
    if source_cells >= max_cells:
        # One source chunk already saturates the memory budget.
        return source_chunk_shape

    # Start from the ideal (per-dim LCM) shape and shave whole source chunks
    # off one dimension at a time, round-robin, until it fits in max_cells.
    read_shape = [lcm(src, tgt) for src, tgt in zip(source_chunk_shape, target_chunk_shape)]

    dim = 0
    while prod(read_shape) > max_cells:
        if read_shape[dim] > source_chunk_shape[dim]:
            read_shape[dim] -= source_chunk_shape[dim]

            if prod(read_shape) == source_cells:
                # Shrunk all the way down to a single source chunk.
                return source_chunk_shape

        dim = (dim + 1) % ndims

    # Shrink further, per dimension, as long as the number of whole target
    # chunks that fit in that dimension does not change.
    n_target_chunks = tuple(rs // tc for rs, tc in zip(read_shape, target_chunk_shape))
    for dim in range(ndims):
        while True:
            candidate = read_shape[dim] - source_chunk_shape[dim]
            if candidate // target_chunk_shape[dim] == n_target_chunks[dim]:
                read_shape[dim] = candidate
            else:
                break

    return tuple(read_shape)
219
+
220
+
221
def calc_n_chunks_per_read(source_chunk_shape, source_read_chunk_shape):
    """
    Number of source chunks contained in a single read chunk (assumes each
    read chunk dimension is a multiple of the source chunk dimension).
    """
    per_dim = (read // src for read, src in zip(source_read_chunk_shape, source_chunk_shape))

    return prod(per_dim)
226
+
227
+
228
def calc_n_chunks(shape, chunk_shape):
    """
    Count the total number of chunks needed to cover an array of the given
    shape, partial edge chunks included.
    """
    origin = (0,) * len(shape)

    n_chunks = 0
    for _ in chunk_range(origin, shape, chunk_shape):
        n_chunks += 1

    return n_chunks
239
+
240
+
241
def calc_n_reads_simple(shape, source_chunk_shape, target_chunk_shape):
    """
    Brute force chunking read count. Every target chunk must iterate over every associated source chunk. This should be considered the maximum number of reads between a source and target (most inefficient). The number of writes is the total number of chunks in the target.

    Parameters
    ----------
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.

    Returns
    -------
    int
        Count of the number of reads
    """
    origin = (0,) * len(shape)

    n_reads = 0
    for write_chunk in chunk_range(origin, shape, target_chunk_shape):
        # Count every source chunk overlapped by this target chunk.
        starts = tuple(s.start for s in write_chunk)
        stops = tuple(s.stop for s in write_chunk)
        n_reads += sum(1 for _ in chunk_range(starts, stops, source_chunk_shape))

    return n_reads
271
+
272
+
273
def calc_n_reads_rechunker(shape: Tuple[int, ...], dtype: np.dtype, source_chunk_shape: Tuple[int, ...], target_chunk_shape: Tuple[int, ...], max_mem: int, sel=None) -> Tuple[int, int]:
    """
    Calculate the total number of chunk reads and writes that the optimized
    rechunking algorithm would perform, without moving any data. The algorithm
    optimises rechunking by staging data through an in-memory numpy ndarray
    whose size is bounded by ``max_mem``.

    Parameters
    ----------
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset.
    dtype: np.dtype
        The numpy data type (or anything coercible to one) of the source/target.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes.
    sel: tuple of slices
        A subset selection of the source in the form of a tuple of slices. The starts and stops must be within the shape of the source.

    Returns
    -------
    tuple
        number of reads, number of writes
    """
    # Coerce dtype-likes (e.g. 'int32') as well as np.dtype instances.
    itemsize = np.dtype(dtype).itemsize

    ## Calc the optimum read_chunk_shape given the memory budget
    source_read_chunk_shape = calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem)

    ## Calc ideal read chunking shape (the shape that makes rechunking trivial)
    ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape)

    chunk_start = tuple(0 for _ in range(len(shape)))

    if sel is None:
        target_shape = shape
    else:
        # The selection must lie fully inside the source shape.
        for s, sh in zip(sel, shape):
            if s.start < 0 or s.stop > sh:
                raise ValueError('The selection must be a subset of the source.')

        target_shape = tuple(s.stop - s.start for s in sel)

    ## Counters
    n_reads = 0
    n_writes = 0

    ## If the read chunking is set to the ideal chunking case, then use the
    ## simple implementation. Otherwise, use the more complicated one.
    if source_read_chunk_shape == ideal_read_chunk_shape:
        # Simple case: every read group maps cleanly onto whole target chunks.
        for read_chunk_grp in chunk_range(chunk_start, target_shape, source_read_chunk_shape):
            read_chunk_grp_start = tuple(s.start for s in read_chunk_grp)
            read_chunk_grp_stop = tuple(s.stop for s in read_chunk_grp)
            n_reads += sum(1 for _ in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, source_chunk_shape))
            n_writes += sum(1 for _ in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, target_chunk_shape))

    else:
        written_chunks = set()  # Need to keep track of the bulk writes

        for write_chunk in chunk_range(chunk_start, target_shape, target_chunk_shape):
            write_chunk_start = tuple(s.start for s in write_chunk)
            if write_chunk_start not in written_chunks:
                write_chunk_stop = tuple(s.stop for s in write_chunk)

                # Align the read window down to a source-chunk boundary, then
                # extend it to cover at least this write chunk (clipped to shape).
                read_chunk_start = tuple(rc * (wc // rc) for wc, rc in zip(write_chunk_start, source_chunk_shape))
                read_chunk_stop = tuple(min(max(rcs + rc, wc), sh) for rcs, rc, wc, sh in zip(read_chunk_start, source_read_chunk_shape, write_chunk_stop, shape))
                read_chunks_iter = chunk_range(read_chunk_start, read_chunk_stop, source_chunk_shape, True, False)

                if all(stop - start <= rcs for start, stop, rcs in zip(read_chunk_start, read_chunk_stop, source_read_chunk_shape)):
                    # The whole read window fits in the memory buffer: one read
                    # per source chunk, then bulk-write every target chunk that
                    # is fully covered by the buffer.
                    n_reads += sum(1 for _ in read_chunks_iter)

                    is_end_chunk = any(wc.stop == ts for wc, ts in zip(write_chunk, target_shape))
                    for write_chunk1 in chunk_range(write_chunk_start, read_chunk_stop, target_chunk_shape, include_partial_chunks=is_end_chunk, clip_ends=False):
                        write_chunk2 = tuple(slice(wc.start, min(wc.stop, s)) for wc, s in zip(write_chunk1, target_shape))
                        # Only count target chunks fully contained in the buffer
                        # and non-empty after clipping to the target shape.
                        if all(all((wc.stop - wcs <= src, wc.start < wc.stop)) for wcs, wc, src in zip(read_chunk_start, write_chunk2, source_read_chunk_shape)):
                            write_chunk1_start = tuple(s.start for s in write_chunk2)
                            if write_chunk1_start not in written_chunks:
                                n_writes += 1
                                written_chunks.add(write_chunk1_start)
                else:
                    # Read window exceeds the buffer: read every overlapping
                    # source chunk and perform a single write for this chunk.
                    n_reads += sum(1 for _ in read_chunks_iter)
                    n_writes += 1
                    written_chunks.add(write_chunk_start)

    return n_reads, n_writes
372
+
373
+
374
def rechunker(source: object, shape: Tuple[int, ...], dtype: np.dtype, source_chunk_shape: Tuple[int, ...], target_chunk_shape: Tuple[int, ...], max_mem: int, sel=None) -> Iterator[Tuple[Tuple[slice, ...], np.ndarray]]:
    """
    This function takes a source dataset function with a specific chunk_shape and returns a generator that converts to a new chunk_shape. It optimises the rechunking by using an in-memory numpy ndarray with a size defined by the max_mem provided by the user.

    Parameters
    ----------
    source: array-like
        The source function to read the dataset/array. The function must have a single parameter input as a tuple of slices to retrieve an array chunk of data.
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset unless sel is passed.
    dtype: np.dtype
        The numpy data type of the source/target.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes. This will only be as large as necessary for an optimum size chunk for the rechunking.
    sel: tuple of slices
        A subset selection of the source in the form of a tuple of slices. The starts and stops must be within the shape of the source.

    Returns
    -------
    Generator
        tuple of the target slices to the np.ndarray of data
    """
    itemsize = dtype.itemsize

    ## Calc the optimum read_chunk_shape (bounded by max_mem)
    source_read_chunk_shape = calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem)

    # Single reusable staging buffer; all reads land here before being yielded.
    mem_arr1 = np.zeros(source_read_chunk_shape, dtype=dtype)

    ## Calc ideal read chunking shape (the shape that makes rechunking trivial)
    ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape)

    chunk_start = tuple(0 for i in range(len(shape)))

    if sel is None:
        # No selection: read at the origin, target covers the full source.
        chunk_read_offset = chunk_start
        target_shape = shape
    else:
        # The selection must lie fully inside the source shape.
        for s, sh in zip(sel, shape):
            if s.start < 0 or s.stop > sh:
                raise ValueError('The selection must be a subset of the source.')

        # Reads are offset by the selection start; target shape is the selection extent.
        chunk_read_offset = tuple(s.start for s in sel)
        target_shape = tuple(s.stop - s.start for s in sel)

    ## If the read chunking is set to the ideal chunking case, then use the simple implementation. Otherwise, use the more complicated one.
    if source_read_chunk_shape == ideal_read_chunk_shape:
        # Simple case: each read group aligns exactly with whole target chunks.
        read_chunk_iter = chunk_range(chunk_start, target_shape, source_read_chunk_shape)
        for read_chunk_grp in read_chunk_iter:
            read_chunk_grp_start = tuple(s.start for s in read_chunk_grp)
            read_chunk_grp_stop = tuple(s.stop for s in read_chunk_grp)
            # Fill the buffer from every source chunk overlapping this group.
            for read_chunk in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, source_chunk_shape):
                # Buffer-local slices (group origin maps to buffer index 0).
                offset_slices = tuple(slice(rc.start - wcs, rc.stop - wcs) for wcs, rc in zip(read_chunk_grp_start, read_chunk))
                # Source-space slices shifted by the selection offset.
                read_chunk1 = tuple(slice(rc.start + cro, rc.stop + cro) for rc, cro in zip(read_chunk, chunk_read_offset))
                mem_arr1[offset_slices] = source(read_chunk1)

            # Emit each target chunk covered by this group from the buffer.
            for write_chunk1 in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, target_chunk_shape):
                offset_slices = tuple(slice(wc.start - wcs, wc.stop - wcs) for wcs, wc in zip(read_chunk_grp_start, write_chunk1))

                yield write_chunk1, mem_arr1[offset_slices]

    else:
        writen_chunks = set()  # Need to keep track of the bulk writes

        write_chunk_iter = chunk_range(chunk_start, target_shape, target_chunk_shape)

        for write_chunk in write_chunk_iter:
            write_chunk_start = tuple(s.start for s in write_chunk)
            if write_chunk_start not in writen_chunks:
                write_chunk_stop = tuple(s.stop for s in write_chunk)

                # Align the read window down to a source-chunk boundary, then
                # extend it to cover at least this write chunk (clipped to shape).
                read_chunk_start = tuple(rc * (wc//rc) for wc, rc in zip(write_chunk_start, source_chunk_shape))
                read_chunk_stop = tuple(min(max(rcs + rc, wc), sh) for rcs, rc, wc, sh in zip(read_chunk_start, source_read_chunk_shape, write_chunk_stop, shape))
                read_chunks_iter = chunk_range(read_chunk_start, read_chunk_stop, source_chunk_shape, True, False)

                if all(stop - start <= rcs for start, stop, rcs in zip(read_chunk_start, read_chunk_stop, source_read_chunk_shape)):
                    # The whole read window fits in the buffer: read it all in,
                    # then bulk-yield every target chunk fully covered by it.
                    for read_chunk in read_chunks_iter:
                        read_chunk1 = tuple(slice(rc.start + cro, min(rc.stop + cro, s)) for rc, cro, s in zip(read_chunk, chunk_read_offset, shape))
                        offset_slices = tuple(slice(rc.start - rcs - rco, rc.stop - rcs - rco) for rcs, rco, rc in zip(read_chunk_start, chunk_read_offset, read_chunk1))

                        mem_arr1[offset_slices] = source(read_chunk1)

                    is_end_chunk = any(wc.stop == ts for wc, ts in zip(write_chunk, target_shape))
                    for write_chunk1 in chunk_range(write_chunk_start, read_chunk_stop, target_chunk_shape, include_partial_chunks=is_end_chunk, clip_ends=False):
                        write_chunk2 = tuple(slice(wc.start, min(wc.stop, s)) for wc, s in zip(write_chunk1, target_shape))
                        # Only yield target chunks fully contained in the buffer
                        # and non-empty after clipping to the target shape.
                        if all(all((wc.stop - wcs <= src, wc.start < wc.stop)) for wcs, wc, src in zip(read_chunk_start, write_chunk2, source_read_chunk_shape)):
                            write_chunk1_start = tuple(s.start for s in write_chunk2)
                            if write_chunk1_start not in writen_chunks:
                                offset_slices = tuple(slice(wc.start - rcs, wc.stop - rcs) for rcs, wc in zip(read_chunk_start, write_chunk2))

                                yield write_chunk2, mem_arr1[offset_slices]

                                writen_chunks.add(write_chunk1_start)
                else:
                    # Read window exceeds the buffer: assemble just this one
                    # write chunk from the pieces of each overlapping read.
                    mem_read_chunk_slice = tuple(slice(0, wc.stop - wc.start) for wc in write_chunk)
                    for read_chunk in read_chunks_iter:
                        read_chunk1 = tuple(slice(rc.start + cro, rc.stop + cro) for rc, cro in zip(read_chunk, chunk_read_offset))
                        # Clip the read to the write chunk's extent, then map
                        # that overlap into read-local and buffer-local slices.
                        clip_read_chunk = get_slice_min_max(read_chunk, write_chunk)
                        read_slice = tuple(slice(cc.start - rc.start, cc.stop - rc.start) for cc, rc in zip(clip_read_chunk, read_chunk))
                        write_slice = tuple(slice(cc.start - rc.start, cc.stop - rc.start) for cc, rc in zip(clip_read_chunk, write_chunk))

                        mem_arr1[write_slice] = source(read_chunk1)[read_slice]

                    yield write_chunk, mem_arr1[mem_read_chunk_slice]

                    writen_chunks.add(write_chunk_start)
498
+
499
+
500
+
501
+
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
533
+
534
+
535
+
536
+
537
+
538
+
539
+
540
+
541
+
542
+
543
+
544
+
545
+
546
+
547
+
548
+
549
+
550
+
551
+
552
+
553
+
554
+
555
+
556
+
557
+