rechunkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rechunkit-0.1.0/.gitignore +119 -0
- rechunkit-0.1.0/LICENSE +16 -0
- rechunkit-0.1.0/PKG-INFO +138 -0
- rechunkit-0.1.0/README.md +125 -0
- rechunkit-0.1.0/pyproject.toml +187 -0
- rechunkit-0.1.0/rechunkit/__init__.py +5 -0
- rechunkit-0.1.0/rechunkit/main.py +557 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
env/
|
|
12
|
+
build/
|
|
13
|
+
develop-eggs/
|
|
14
|
+
dist/
|
|
15
|
+
downloads/
|
|
16
|
+
eggs/
|
|
17
|
+
.eggs/
|
|
18
|
+
lib/
|
|
19
|
+
lib64/
|
|
20
|
+
parts/
|
|
21
|
+
sdist/
|
|
22
|
+
var/
|
|
23
|
+
wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
|
|
28
|
+
# PyInstaller
|
|
29
|
+
# Usually these files are written by a python script from a template
|
|
30
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
31
|
+
*.manifest
|
|
32
|
+
*.spec
|
|
33
|
+
|
|
34
|
+
# Installer logs
|
|
35
|
+
pip-log.txt
|
|
36
|
+
pip-delete-this-directory.txt
|
|
37
|
+
|
|
38
|
+
# Unit test / coverage reports
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.coverage
|
|
42
|
+
.coverage.*
|
|
43
|
+
.cache
|
|
44
|
+
nosetests.xml
|
|
45
|
+
coverage.xml
|
|
46
|
+
*.cover
|
|
47
|
+
.hypothesis/
|
|
48
|
+
.pytest_cache/
|
|
49
|
+
junit/
|
|
50
|
+
junit.xml
|
|
51
|
+
test.db
|
|
52
|
+
|
|
53
|
+
# Translations
|
|
54
|
+
*.mo
|
|
55
|
+
*.pot
|
|
56
|
+
|
|
57
|
+
# Django stuff:
|
|
58
|
+
*.log
|
|
59
|
+
local_settings.py
|
|
60
|
+
|
|
61
|
+
# Flask stuff:
|
|
62
|
+
instance/
|
|
63
|
+
.webassets-cache
|
|
64
|
+
|
|
65
|
+
# Scrapy stuff:
|
|
66
|
+
.scrapy
|
|
67
|
+
|
|
68
|
+
# Sphinx documentation
|
|
69
|
+
docs/_build/
|
|
70
|
+
|
|
71
|
+
# PyBuilder
|
|
72
|
+
target/
|
|
73
|
+
|
|
74
|
+
# Jupyter Notebook
|
|
75
|
+
.ipynb_checkpoints
|
|
76
|
+
|
|
77
|
+
# pyenv
|
|
78
|
+
.python-version
|
|
79
|
+
|
|
80
|
+
# celery beat schedule file
|
|
81
|
+
celerybeat-schedule
|
|
82
|
+
|
|
83
|
+
# SageMath parsed files
|
|
84
|
+
*.sage.py
|
|
85
|
+
|
|
86
|
+
# dotenv
|
|
87
|
+
.env
|
|
88
|
+
|
|
89
|
+
# virtualenv
|
|
90
|
+
.venv
|
|
91
|
+
venv/
|
|
92
|
+
ENV/
|
|
93
|
+
.ruff*
|
|
94
|
+
|
|
95
|
+
# Spyder project settings
|
|
96
|
+
.spyderproject
|
|
97
|
+
.spyproject
|
|
98
|
+
|
|
99
|
+
# Rope project settings
|
|
100
|
+
.ropeproject
|
|
101
|
+
|
|
102
|
+
# mkdocs documentation
|
|
103
|
+
/site
|
|
104
|
+
|
|
105
|
+
# mypy
|
|
106
|
+
.mypy_cache/
|
|
107
|
+
|
|
108
|
+
# .vscode
|
|
109
|
+
.vscode/
|
|
110
|
+
|
|
111
|
+
# OS files
|
|
112
|
+
.DS_Store
|
|
113
|
+
|
|
114
|
+
# Temp data
|
|
115
|
+
data/*
|
|
116
|
+
|
|
117
|
+
# Test config files
|
|
118
|
+
/rechunkit/tests/*.toml
|
|
119
|
+
/rechunkit/tests/*.yml
|
rechunkit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Apache Software License 2.0
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Mike Kittridge
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
16
|
+
|
rechunkit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rechunkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Functions to efficiently rechunk multidimensional arrays
|
|
5
|
+
Project-URL: Documentation, https://mullenkamp.github.io/rechunkit/
|
|
6
|
+
Project-URL: Source, https://github.com/mullenkamp/rechunkit
|
|
7
|
+
Author-email: mullenkamp <mullenkamp1@gmail.com>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: numpy>=1.26
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# rechunkit
|
|
15
|
+
|
|
16
|
+
<p align="center">
|
|
17
|
+
<em>Functions to efficiently rechunk multidimensional arrays</em>
|
|
18
|
+
</p>
|
|
19
|
+
|
|
20
|
+
[](https://github.com/mullenkamp/rechunkit/actions)
|
|
21
|
+
[](https://codecov.io/gh/mullenkamp/rechunkit)
|
|
22
|
+
[](https://badge.fury.io/py/rechunkit)
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
**Source Code**: <a href="https://github.com/mullenkamp/rechunkit" target="_blank">https://github.com/mullenkamp/rechunkit</a>
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
## Introduction
|
|
30
|
+
Rechunkit is a set of functions to allow efficient rechunking of multidimensional arrays that have been stored as chunks of numpy ndarrays. It allows for rechunking on-the-fly via python generators instead of requiring the user to save the full target array. It also contains several other handy tools for assisting the user as part of the rechunking process (e.g. estimating an optimal or ideal chunking size, iterating over chunks with a range-type function, etc).
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
```
|
|
35
|
+
pip install rechunkit
|
|
36
|
+
```
|
|
37
|
+
I can add it to conda-forge if there is demand.
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
Import the necessary modules and assign some parameters for the examples:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from rechunkit import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
|
|
44
|
+
|
|
45
|
+
source_shape = (31, 31, 31)
|
|
46
|
+
shape = source_shape
|
|
47
|
+
|
|
48
|
+
sel = (slice(3, 21), slice(11, 27), slice(7, 17))
|
|
49
|
+
|
|
50
|
+
source_chunk_shape = (5, 2, 4)
|
|
51
|
+
target_chunk_shape = (4, 5, 3)
|
|
52
|
+
max_mem = 2000 # smaller than the ideal chunk size
|
|
53
|
+
|
|
54
|
+
dtype = np.dtype('int32')
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Preprocessing tools
|
|
58
|
+
We have defined our target_chunk_shape above, but rechunkit has a function to guess a good chunk shape given a user-defined amount of memory per chunk:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
new_chunk_shape = guess_chunk_shape(source_shape, dtype, 400)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Chunks will be assigned to the highest composite number within the target_chunk_size. Using composite numbers will benefit the rechunking process as there is a very high likelihood that the least common multiple (LCM) of two composite numbers will be significantly lower than the product of those two numbers. The LCM is used to determine the ideal chunk size for the rechunking process.
|
|
65
|
+
|
|
66
|
+
Speaking of the ideal chunk size, we can determine the ideal chunk shape and size via a couple functions:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape) # (20, 10, 12)
|
|
70
|
+
|
|
71
|
+
ideal_read_chunk_size = calc_ideal_read_chunk_mem(ideal_read_chunk_shape, dtype.itemsize) # 9600 bytes
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
If the ideal_read_chunk_size can comfortably fit in your memory, then you should use this value. Using the ideal chunk size will mean that you will only need to read all chunks in the source once. If the chunk size (called max_mem in the functions) is less than the ideal, then some chunks will need to be read multiple times.
|
|
75
|
+
|
|
76
|
+
To see how many reads are required if no optimization is performed during rechunking (i.e. every target chunk must iterate over every associated source chunk), you can use the calc_n_reads_simple function and compare it to the total number of chunks in the source:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
n_chunks_source = calc_n_chunks(source_shape, source_chunk_shape) # 896
|
|
80
|
+
n_chunks_target = calc_n_chunks(source_shape, target_chunk_shape) # 616
|
|
81
|
+
|
|
82
|
+
n_reads_simple = calc_n_reads_simple(source_shape, source_chunk_shape, target_chunk_shape) # 3952
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Using the simple brute force method requires one chunk to be read 4.4 times on average.
|
|
86
|
+
|
|
87
|
+
There's also a function to check the number of reads (and writes) using the optimized algorithm:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem) # 2044, 616
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
In this case, we only require one chunk to be read 2.28 times on average. The more max_mem you give to the rechunker, the fewer reads per chunk are required (to a minimum of 1 in the ideal case).
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
### Rechunking
|
|
97
|
+
We need a source dataset to get data from. Rechunkit requires that the source input is a function/method that has a single parameter input of a tuple of slices. The slices contain the start and stop of the chunk to be read in the source.
|
|
98
|
+
|
|
99
|
+
For example, we can simply use a numpy array and its `__getitem__` method as the source:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
source_data = np.arange(1, prod(source_shape) + 1, dtype=dtype).reshape(source_shape)
|
|
103
|
+
source = source_data.__getitem__
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
And again as a simple example, we can use a numpy array as the target:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
target = np.zeros(source_shape, dtype=dtype)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
We don't necessarily need the target as an array to be filled, because the rechunker function returns a generator that can be iterated over. The generator returns a tuple of slices (representing the target chunk) and the associated numpy array data:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem):
|
|
116
|
+
target[write_chunk] = data
|
|
117
|
+
|
|
118
|
+
assert np.all(source(()) == target) # Should pass!
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
#### Subsets of the source
|
|
122
|
+
There are many use-cases where you don't want the entire dataset. Rather you want a subset of the dataset, but you also want the subset rechunked. The rechunker function has a `sel` parameter which needs to be a tuple of slices of the number of dimensions.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel) # 288, 80
|
|
126
|
+
|
|
127
|
+
target = np.zeros(source_shape, dtype=dtype)[sel]
|
|
128
|
+
|
|
129
|
+
for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel):
|
|
130
|
+
target[write_chunk] = data
|
|
131
|
+
|
|
132
|
+
assert np.all(source(sel) == target) # Should pass!
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
This project is licensed under the terms of the Apache Software License 2.0.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# rechunkit
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<em>Functions to efficiently rechunk multidimensional arrays</em>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
[](https://github.com/mullenkamp/rechunkit/actions)
|
|
8
|
+
[](https://codecov.io/gh/mullenkamp/rechunkit)
|
|
9
|
+
[](https://badge.fury.io/py/rechunkit)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
**Source Code**: <a href="https://github.com/mullenkamp/rechunkit" target="_blank">https://github.com/mullenkamp/rechunkit</a>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
## Introduction
|
|
17
|
+
Rechunkit is a set of functions to allow efficient rechunking of multidimensional arrays that have been stored as chunks of numpy ndarrays. It allows for rechunking on-the-fly via python generators instead of requiring the user to save the full target array. It also contains several other handy tools for assisting the user as part of the rechunking process (e.g. estimating an optimal or ideal chunking size, iterating over chunks with a range-type function, etc).
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
```
|
|
22
|
+
pip install rechunkit
|
|
23
|
+
```
|
|
24
|
+
I can add it to conda-forge if there is demand.
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
Import the necessary modules and assign some parameters for the examples:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from rechunkit import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
|
|
31
|
+
|
|
32
|
+
source_shape = (31, 31, 31)
|
|
33
|
+
shape = source_shape
|
|
34
|
+
|
|
35
|
+
sel = (slice(3, 21), slice(11, 27), slice(7, 17))
|
|
36
|
+
|
|
37
|
+
source_chunk_shape = (5, 2, 4)
|
|
38
|
+
target_chunk_shape = (4, 5, 3)
|
|
39
|
+
max_mem = 2000 # smaller than the ideal chunk size
|
|
40
|
+
|
|
41
|
+
dtype = np.dtype('int32')
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Preprocessing tools
|
|
45
|
+
We have defined our target_chunk_shape above, but rechunkit has a function to guess a good chunk shape given a user-defined amount of memory per chunk:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
new_chunk_shape = guess_chunk_shape(source_shape, dtype, 400)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Chunks will be assigned to the highest composite number within the target_chunk_size. Using composite numbers will benefit the rechunking process as there is a very high likelihood that the least common multiple (LCM) of two composite numbers will be significantly lower than the product of those two numbers. The LCM is used to determine the ideal chunk size for the rechunking process.
|
|
52
|
+
|
|
53
|
+
Speaking of the ideal chunk size, we can determine the ideal chunk shape and size via a couple functions:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape) # (20, 10, 12)
|
|
57
|
+
|
|
58
|
+
ideal_read_chunk_size = calc_ideal_read_chunk_mem(ideal_read_chunk_shape, dtype.itemsize) # 9600 bytes
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
If the ideal_read_chunk_size can comfortably fit in your memory, then you should use this value. Using the ideal chunk size will mean that you will only need to read all chunks in the source once. If the chunk size (called max_mem in the functions) is less than the ideal, then some chunks will need to be read multiple times.
|
|
62
|
+
|
|
63
|
+
To see how many reads are required if no optimization is performed during rechunking (i.e. every target chunk must iterate over every associated source chunk), you can use the calc_n_reads_simple function and compare it to the total number of chunks in the source:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
n_chunks_source = calc_n_chunks(source_shape, source_chunk_shape) # 896
|
|
67
|
+
n_chunks_target = calc_n_chunks(source_shape, target_chunk_shape) # 616
|
|
68
|
+
|
|
69
|
+
n_reads_simple = calc_n_reads_simple(source_shape, source_chunk_shape, target_chunk_shape) # 3952
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Using the simple brute force method requires one chunk to be read 4.4 times on average.
|
|
73
|
+
|
|
74
|
+
There's also a function to check the number of reads (and writes) using the optimized algorithm:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem) # 2044, 616
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
In this case, we only require one chunk to be read 2.28 times on average. The more max_mem you give to the rechunker, the fewer reads per chunk are required (to a minimum of 1 in the ideal case).
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
### Rechunking
|
|
84
|
+
We need a source dataset to get data from. Rechunkit requires that the source input is a function/method that has a single parameter input of a tuple of slices. The slices contain the start and stop of the chunk to be read in the source.
|
|
85
|
+
|
|
86
|
+
For example, we can simply use a numpy array and its `__getitem__` method as the source:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
source_data = np.arange(1, prod(source_shape) + 1, dtype=dtype).reshape(source_shape)
|
|
90
|
+
source = source_data.__getitem__
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
And again as a simple example, we can use a numpy array as the target:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
target = np.zeros(source_shape, dtype=dtype)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
We don't necessarily need the target as an array to be filled, because the rechunker function returns a generator that can be iterated over. The generator returns a tuple of slices (representing the target chunk) and the associated numpy array data:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem):
|
|
103
|
+
target[write_chunk] = data
|
|
104
|
+
|
|
105
|
+
assert np.all(source(()) == target) # Should pass!
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
#### Subsets of the source
|
|
109
|
+
There are many use-cases where you don't want the entire dataset. Rather you want a subset of the dataset, but you also want the subset rechunked. The rechunker function has a `sel` parameter which needs to be a tuple of slices of the number of dimensions.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
n_reads, n_writes = calc_n_reads_rechunker(source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel) # 288, 80
|
|
113
|
+
|
|
114
|
+
target = np.zeros(source_shape, dtype=dtype)[sel]
|
|
115
|
+
|
|
116
|
+
for write_chunk, data in rechunker(source, source_shape, dtype, source_chunk_shape, target_chunk_shape, max_mem, sel):
|
|
117
|
+
target[write_chunk] = data
|
|
118
|
+
|
|
119
|
+
assert np.all(source(sel) == target) # Should pass!
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
This project is licensed under the terms of the Apache Software License 2.0.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "rechunkit"
|
|
3
|
+
authors = [
|
|
4
|
+
{ name = "mullenkamp", email = "mullenkamp1@gmail.com" }
|
|
5
|
+
]
|
|
6
|
+
description = "Functions to efficiently rechunk multidimensional arrays"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
dynamic = ["version"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
11
|
+
]
|
|
12
|
+
requires-python = ">=3.9"
|
|
13
|
+
dependencies = [
|
|
14
|
+
'numpy>=1.26'
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[dependency-groups]
|
|
18
|
+
dev = [
|
|
19
|
+
"spyder-kernels==2.5.2",
|
|
20
|
+
"black",
|
|
21
|
+
"mypy",
|
|
22
|
+
"ruff",
|
|
23
|
+
"pytest",
|
|
24
|
+
"pytest-cov",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[tool.hatch]
|
|
28
|
+
|
|
29
|
+
[tool.hatch.metadata]
|
|
30
|
+
allow-direct-references = true
|
|
31
|
+
|
|
32
|
+
# [tool.hatch.version]
|
|
33
|
+
# source = "regex_commit"
|
|
34
|
+
# commit_extra_args = ["-e"]
|
|
35
|
+
# path = "rechunkit/__init__.py"
|
|
36
|
+
|
|
37
|
+
[tool.hatch.envs.default]
|
|
38
|
+
python = "3.11"
|
|
39
|
+
dependencies = [
|
|
40
|
+
"spyder-kernels==2.5.2",
|
|
41
|
+
"black",
|
|
42
|
+
"mypy",
|
|
43
|
+
"ruff",
|
|
44
|
+
"pytest",
|
|
45
|
+
"pytest-cov",
|
|
46
|
+
"mkdocs-material",
|
|
47
|
+
"mkdocstrings[python]",
|
|
48
|
+
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[[tool.hatch.envs.all.matrix]]
|
|
52
|
+
python = ['3.10', '3.11', '3.12']
|
|
53
|
+
|
|
54
|
+
[tool.hatch.envs.lint]
|
|
55
|
+
detached = true
|
|
56
|
+
dependencies = [
|
|
57
|
+
"black>=23.1.0",
|
|
58
|
+
"mypy>=1.0.0",
|
|
59
|
+
"ruff>=0.0.243",
|
|
60
|
+
]
|
|
61
|
+
[tool.hatch.envs.lint.scripts]
|
|
62
|
+
typing = "mypy --install-types --non-interactive {args:src/rechunkit tests}"
|
|
63
|
+
style = [
|
|
64
|
+
"ruff {args:.}",
|
|
65
|
+
"black --check --diff {args:.}",
|
|
66
|
+
]
|
|
67
|
+
fmt = [
|
|
68
|
+
"black {args:.}",
|
|
69
|
+
"ruff --fix {args:.}",
|
|
70
|
+
"style",
|
|
71
|
+
]
|
|
72
|
+
all = [
|
|
73
|
+
"style",
|
|
74
|
+
"typing",
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
[tool.hatch.envs.default.scripts]
|
|
78
|
+
test = "pytest {args:tests}"
|
|
79
|
+
test-cov = "coverage run -m pytest {args:tests}"
|
|
80
|
+
cov-report = [
|
|
81
|
+
"- coverage combine",
|
|
82
|
+
"coverage report",
|
|
83
|
+
]
|
|
84
|
+
cov = [
|
|
85
|
+
"test-cov",
|
|
86
|
+
"cov-report",
|
|
87
|
+
]
|
|
88
|
+
docs-serve = "mkdocs serve"
|
|
89
|
+
docs-build = "mkdocs build"
|
|
90
|
+
|
|
91
|
+
[tool.black]
|
|
92
|
+
target-version = ["py37"]
|
|
93
|
+
line-length = 120
|
|
94
|
+
skip-string-normalization = true
|
|
95
|
+
|
|
96
|
+
[tool.ruff]
|
|
97
|
+
target-version = "py311"
|
|
98
|
+
line-length = 120
|
|
99
|
+
select = [
|
|
100
|
+
"A",
|
|
101
|
+
"ARG",
|
|
102
|
+
"B",
|
|
103
|
+
"C",
|
|
104
|
+
"DTZ",
|
|
105
|
+
"E",
|
|
106
|
+
"EM",
|
|
107
|
+
"F",
|
|
108
|
+
"FBT",
|
|
109
|
+
"I",
|
|
110
|
+
"ICN",
|
|
111
|
+
"ISC",
|
|
112
|
+
"N",
|
|
113
|
+
"PLC",
|
|
114
|
+
"PLE",
|
|
115
|
+
"PLR",
|
|
116
|
+
"PLW",
|
|
117
|
+
"Q",
|
|
118
|
+
"RUF",
|
|
119
|
+
"S",
|
|
120
|
+
"T",
|
|
121
|
+
"TID",
|
|
122
|
+
"UP",
|
|
123
|
+
"W",
|
|
124
|
+
"YTT",
|
|
125
|
+
]
|
|
126
|
+
ignore = [
|
|
127
|
+
# Allow non-abstract empty methods in abstract base classes
|
|
128
|
+
"B027",
|
|
129
|
+
# Allow boolean positional values in function calls, like `dict.get(... True)`
|
|
130
|
+
"FBT003",
|
|
131
|
+
# Ignore checks for possible passwords
|
|
132
|
+
"S105", "S106", "S107",
|
|
133
|
+
# Ignore complexity
|
|
134
|
+
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
|
|
135
|
+
]
|
|
136
|
+
unfixable = [
|
|
137
|
+
# Don't touch unused imports
|
|
138
|
+
"F401",
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
[tool.ruff.isort]
|
|
142
|
+
known-first-party = ["rechunkit"]
|
|
143
|
+
|
|
144
|
+
[tool.ruff.flake8-tidy-imports]
|
|
145
|
+
ban-relative-imports = "all"
|
|
146
|
+
|
|
147
|
+
[tool.ruff.per-file-ignores]
|
|
148
|
+
# Tests can use magic values, assertions, and relative imports
|
|
149
|
+
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
150
|
+
|
|
151
|
+
[tool.coverage.run]
|
|
152
|
+
source_pkgs = ["rechunkit", "tests"]
|
|
153
|
+
branch = true
|
|
154
|
+
parallel = true
|
|
155
|
+
omit = [
|
|
156
|
+
"src/rechunkit/__about__.py",
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
[tool.coverage.paths]
|
|
160
|
+
rechunkit = ["src/rechunkit", "*/rechunkit/src/rechunkit"]
|
|
161
|
+
tests = ["tests", "*/rechunkit/tests"]
|
|
162
|
+
|
|
163
|
+
[tool.coverage.report]
|
|
164
|
+
exclude_lines = [
|
|
165
|
+
"no cov",
|
|
166
|
+
"if __name__ == .__main__.:",
|
|
167
|
+
"if TYPE_CHECKING:",
|
|
168
|
+
]
|
|
169
|
+
|
|
170
|
+
[build-system]
|
|
171
|
+
requires = ["hatchling>=1.26.1"]
|
|
172
|
+
build-backend = "hatchling.build"
|
|
173
|
+
|
|
174
|
+
[project.urls]
|
|
175
|
+
Documentation = "https://mullenkamp.github.io/rechunkit/"
|
|
176
|
+
Source = "https://github.com/mullenkamp/rechunkit"
|
|
177
|
+
|
|
178
|
+
[tool.hatch.build.targets.sdist]
|
|
179
|
+
include = [
|
|
180
|
+
"/rechunkit",
|
|
181
|
+
]
|
|
182
|
+
exclude = [
|
|
183
|
+
"/rechunkit/tests/*",
|
|
184
|
+
]
|
|
185
|
+
|
|
186
|
+
[tool.hatch.version]
|
|
187
|
+
path = "rechunkit/__init__.py"
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
"""Functions to efficiently rechunk multidimensional arrays"""
|
|
2
|
+
from rechunkit.main import guess_chunk_shape, chunk_range, calc_ideal_read_chunk_shape, calc_ideal_read_chunk_mem, calc_source_read_chunk_shape, calc_n_chunks, calc_n_reads_simple, calc_n_reads_rechunker, rechunker
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
__version__ = '0.1.0'
|
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
"""Core rechunking algorithm stuff."""
|
|
2
|
+
# import copy
|
|
3
|
+
from typing import List, Optional, Sequence, Tuple, Iterator, Generator
|
|
4
|
+
import numpy as np
|
|
5
|
+
import itertools
|
|
6
|
+
# from time import time
|
|
7
|
+
from math import prod, lcm, ceil
|
|
8
|
+
from collections import Counter, deque
|
|
9
|
+
from itertools import count
|
|
10
|
+
from bisect import bisect
|
|
11
|
+
|
|
12
|
+
########################################################
|
|
13
|
+
### Parameters
|
|
14
|
+
|
|
15
|
+
composite_numbers = (1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680, 2520, 5040, 7560, 10080, 15120, 20160, 25200, 27720, 45360, 50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280, 720720, 1081080, 1441440, 2162160)
|
|
16
|
+
|
|
17
|
+
########################################################
|
|
18
|
+
### Functions
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def guess_chunk_shape(shape: Tuple[int, ...], dtype: np.dtype, target_chunk_size: int = 2**21) -> Tuple[int, ...]:
|
|
22
|
+
"""
|
|
23
|
+
Guess an appropriate chunk layout for a dataset, given its shape and
|
|
24
|
+
the size of each element in bytes. Will allocate chunks only as large
|
|
25
|
+
as target_chunk_size. Chunks will be assigned to the highest composite number within the target_chunk_size. Using composite numbers will benefit the rehunking process as there is a very high likelihood that the least common multiple of two composite numbers will be significantly lower than the product of those two numbers.
|
|
26
|
+
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
shape: tuple of ints
|
|
30
|
+
Shape of the array.
|
|
31
|
+
dtype: np.dtype or str
|
|
32
|
+
The dtype of the array.
|
|
33
|
+
target_chunk_size: int
|
|
34
|
+
The maximum size per chunk in bytes.
|
|
35
|
+
|
|
36
|
+
Returns
|
|
37
|
+
-------
|
|
38
|
+
tuple of ints
|
|
39
|
+
shape of the chunk
|
|
40
|
+
"""
|
|
41
|
+
ndims = len(shape)
|
|
42
|
+
|
|
43
|
+
if ndims > 0:
|
|
44
|
+
|
|
45
|
+
if not all(isinstance(v, int) for v in shape):
|
|
46
|
+
raise TypeError('All values in the shape must be ints.')
|
|
47
|
+
|
|
48
|
+
chunks = np.array(shape, dtype='=f8')
|
|
49
|
+
if not np.all(np.isfinite(chunks)):
|
|
50
|
+
raise ValueError("Illegal value in chunk tuple")
|
|
51
|
+
|
|
52
|
+
dtype = np.dtype(dtype)
|
|
53
|
+
typesize = dtype.itemsize
|
|
54
|
+
|
|
55
|
+
idx = 0
|
|
56
|
+
while True:
|
|
57
|
+
chunk_bytes = prod(chunks)*typesize
|
|
58
|
+
|
|
59
|
+
if (chunk_bytes < target_chunk_size or \
|
|
60
|
+
abs(chunk_bytes - target_chunk_size)/target_chunk_size < 0.5):
|
|
61
|
+
break
|
|
62
|
+
|
|
63
|
+
if prod(chunks) == 1:
|
|
64
|
+
break
|
|
65
|
+
|
|
66
|
+
chunks[idx%ndims] = ceil(chunks[idx%ndims] / 2.0)
|
|
67
|
+
idx += 1
|
|
68
|
+
|
|
69
|
+
return tuple(composite_numbers[bisect(composite_numbers, int(x)) - 1] for x in chunks)
|
|
70
|
+
else:
|
|
71
|
+
return None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_slice_min_max(read_slices, write_slices):
    """
    Intersect two tuples of slices element-wise.

    For each corresponding pair, take the later of the two starts and the
    earlier of the two stops, yielding the overlapping region.
    """
    paired = zip(read_slices, write_slices)
    return tuple(
        slice(max(r.start, w.start), min(r.stop, w.stop)) for r, w in paired
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def chunk_range(
    chunk_start: Tuple[int, ...], chunk_stop: Tuple[int, ...], chunk_step: Tuple[int, ...], include_partial_chunks=True, clip_ends=True,
) -> Iterator[Tuple[slice, ...]]:
    """
    Generator like the Python range function, but over multiple dimensions,
    yielding tuples of slices — one slice per dimension per chunk.

    Parameters
    ----------
    chunk_start: tuple of int
        The start positions of the chunks. Anything that is not a tuple is
        treated as the origin (all zeros).
    chunk_stop: tuple of int
        The stop positions of the chunks.
    chunk_step: tuple of int
        The chunking step.
    include_partial_chunks: bool
        Should partial chunks (those extending past the start/stop extents)
        be included? True by default.
    clip_ends: bool
        Only applies when include_partial_chunks == True. Should partial
        chunks be clipped to the overall extents? True by default.

    Returns
    -------
    Generator with tuples of slices
    """
    if not isinstance(chunk_start, tuple):
        chunk_start = tuple(0 for _ in range(len(chunk_stop)))

    # Align each dimension's first chunk boundary to the chunking grid:
    # floor-align when partial chunks are wanted, ceil-align otherwise.
    if include_partial_chunks:
        aligned_starts = [step * (begin // step) for step, begin in zip(chunk_step, chunk_start)]
    else:
        aligned_starts = [step * (((begin - 1) // step) + 1) for step, begin in zip(chunk_step, chunk_start)]

    dim_ranges = [
        range(first, limit, step)
        for limit, step, first in zip(chunk_stop, chunk_step, aligned_starts)
    ]

    for corner in itertools.product(*dim_ranges):
        is_full = True
        chunk_slices = []
        for begin, limit, step, lower in zip(corner, chunk_stop, chunk_step, chunk_start):
            end = begin + step
            if end > limit:
                # Chunk overhangs the stop extent.
                if clip_ends:
                    end = limit
                is_full = False

            if begin < lower:
                # Chunk overhangs the start extent.
                if clip_ends:
                    begin = lower
                is_full = False

            chunk_slices.append(slice(begin, end))

        if is_full or include_partial_chunks:
            yield tuple(chunk_slices)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape):
    """
    Calculates the minimum ideal read chunk shape between a source and target.

    This is the per-dimension least common multiple of the two chunk shapes:
    the smallest chunk that tiles both chunkings exactly.
    """
    pairs = zip(source_chunk_shape, target_chunk_shape)
    return tuple(lcm(src, tgt) for src, tgt in pairs)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def calc_ideal_read_chunk_mem(ideal_read_chunk_shape, itemsize):
    """
    Calculates the minimum ideal read chunk memory (in bytes) between a
    source and target: number of cells in the chunk times the byte width
    of a single element.
    """
    n_cells = prod(ideal_read_chunk_shape)
    return int(n_cells * itemsize)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem):
    """
    Calculates the optimum read chunk shape given a maximum amount of available memory.

    Parameters
    ----------
    source_chunk_shape: tuple of int
        The source chunk shape
    target_chunk_shape: tuple of int
        The target chunk shape
    itemsize: int
        The byte length of the data type.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes.

    Returns
    -------
    optimal chunk shape: tuple of ints

    Raises
    ------
    ValueError
        If the two chunk shapes do not have the same number of dimensions.
    """
    max_cells = max_mem // itemsize
    n_dims = len(source_chunk_shape)

    if n_dims != len(target_chunk_shape):
        raise ValueError('The source_chunk_shape and target_chunk_shape do not have the same number of dims.')

    tot_source = prod(source_chunk_shape)
    # A single source chunk already fills the memory budget; reading less
    # than one source chunk at a time is not possible, so stop here.
    if tot_source >= max_cells:
        return source_chunk_shape

    # Start from the ideal read shape: the per-dim lcm of source and target
    # chunk lengths (same values as calc_ideal_read_chunk_shape).
    new_chunks = [lcm(s, t) for s, t in zip(source_chunk_shape, target_chunk_shape)]

    ## Max mem: shrink dims round-robin, one whole source chunk at a time,
    ## until the read chunk fits in the memory budget.
    tot_target = prod(new_chunks)
    pos = 0
    while tot_target > max_cells:
        if new_chunks[pos] > source_chunk_shape[pos]:
            new_chunks[pos] = new_chunks[pos] - source_chunk_shape[pos]

        tot_target = prod(new_chunks)

        if tot_target == tot_source:
            # Shrunk all the way back down to a single source chunk.
            return source_chunk_shape

        pos = 0 if pos + 1 == n_dims else pos + 1

    ## Min mem: shrink each dim further while the number of whole target
    ## chunks covered per read stays the same (i.e. the extra length was
    ## never used for a full target write anyway).
    n_chunks_write = tuple(s // target_chunk_shape[i] for i, s in enumerate(new_chunks))
    for i in range(n_dims):
        while True:
            new_chunk = new_chunks[i] - source_chunk_shape[i]
            # The new_chunk > 0 guard fixes a bug: when a dim covered zero
            # whole target chunks (n_chunks_write[i] == 0), the old code
            # accepted new_chunk == 0 (0 // target == 0) and returned a
            # chunk shape containing a zero-length dimension.
            if new_chunk > 0 and new_chunk // target_chunk_shape[i] == n_chunks_write[i]:
                new_chunks[i] = new_chunk
            else:
                break

    return tuple(new_chunks)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def calc_n_chunks_per_read(source_chunk_shape, source_read_chunk_shape):
    """
    Return how many source chunks are covered by a single read chunk.

    Divides the read chunk length by the source chunk length in each
    dimension and multiplies the per-dimension counts together.
    """
    per_dim_counts = (read_len // src_len
                      for src_len, read_len in zip(source_chunk_shape, source_read_chunk_shape))

    return prod(per_dim_counts)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def calc_n_chunks(shape, chunk_shape):
    """
    Return the total number of chunks of chunk_shape in an array of shape.

    Counts the chunk slices generated by chunk_range starting at the
    origin (partial edge chunks included).
    """
    origin = (0,) * len(shape)

    return sum(1 for _ in chunk_range(origin, shape, chunk_shape))
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def calc_n_reads_simple(shape, source_chunk_shape, target_chunk_shape):
    """
    Brute force chunking read count. Every target chunk must iterate over every associated source chunk. This should be considered the maximum number of reads between a source and target (most inefficient). The number of writes is the total number of chunks in the target.

    Parameters
    ----------
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.

    Returns
    -------
    int
        Count of the number of reads
    """
    # Note: the original docstring documented a dtype parameter that this
    # function has never taken; it was removed from the docs above.
    chunk_start = tuple(0 for _ in range(len(shape)))
    n_reads = 0

    for write_chunk in chunk_range(chunk_start, shape, target_chunk_shape):
        write_chunk_start = tuple(rc.start for rc in write_chunk)
        write_chunk_stop = tuple(rc.stop for rc in write_chunk)
        # Each target (write) chunk must read every overlapping source chunk.
        n_reads += sum(1 for _ in chunk_range(write_chunk_start, write_chunk_stop, source_chunk_shape))

    return n_reads
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def calc_n_reads_rechunker(shape: Tuple[int, ...], dtype: np.dtype, source_chunk_shape: Tuple[int, ...], target_chunk_shape: Tuple[int, ...], max_mem: int, sel=None) -> Tuple[int, int]:
    """
    This function calculates the total number of reads (and writes) using the more optimized rechunking algorithm. It optimises the rechunking by using an in-memory numpy ndarray with a size defined by the max_mem provided by the user.

    This is a dry run of ``rechunker``: it walks the same chunk iteration
    structure but only counts operations instead of moving data.

    Parameters
    ----------
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset.
    dtype: np.dtype
        The numpy data type of the source/target.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes.
    sel: tuple of slices
        A subset selection of the source in the form of a tuple of slices. The starts and stops must be within the shape of the source.

    Returns
    -------
    tuple
        number of reads, number of writes
    """
    itemsize = dtype.itemsize

    ## Calc the optimum read_chunk_shape (largest read buffer that fits in max_mem)
    source_read_chunk_shape = calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem)

    ## Calc ideal read chunking shape (per-dim lcm of source and target chunks)
    ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape)

    chunk_start = tuple(0 for i in range(len(shape)))

    if sel is None:
        target_shape = shape
    else:
        # Checks: the selection must lie entirely within the source shape
        for s, sh in zip(sel, shape):
            if s.start < 0 or s.stop > sh:
                raise ValueError('The selection must be a subset of the source.')

        target_shape = tuple(s.stop - s.start for s in sel)

    ## Counters (itertools.count: after k next() calls, next() returns k)
    read_counter = count()
    write_counter = count()

    ## If the read chunking is set to the ideal chunking case, then use the simple implementation. Otherwise, use the more complicated one.
    if source_read_chunk_shape == ideal_read_chunk_shape:
        # Simple case: each read group covers whole source AND target chunks,
        # so reads and writes can be counted group by group.
        read_chunk_iter = chunk_range(chunk_start, target_shape, source_read_chunk_shape)
        for read_chunk_grp in read_chunk_iter:
            read_chunk_grp_start = tuple(s.start for s in read_chunk_grp)
            read_chunk_grp_stop = tuple(s.stop for s in read_chunk_grp)
            # One read per source chunk inside the group
            for read_chunk in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, source_chunk_shape):
                next(read_counter)

            # One write per target chunk inside the group
            for write_chunk1 in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, target_chunk_shape):
                next(write_counter)

    else:
        writen_chunks = set() # Need to keep track of the bulk writes

        write_chunk_iter = chunk_range(chunk_start, target_shape, target_chunk_shape)

        for write_chunk in write_chunk_iter:
            write_chunk_start = tuple(s.start for s in write_chunk)
            if write_chunk_start not in writen_chunks:
                write_chunk_stop = tuple(s.stop for s in write_chunk)

                # Align the read window to source chunk boundaries, then extend
                # it to the read buffer size (clipped to the overall shape).
                read_chunk_start = tuple(rc * (wc//rc) for wc, rc in zip(write_chunk_start, source_chunk_shape))
                read_chunk_stop = tuple(min(max(rcs + rc, wc), sh) for rcs, rc, wc, sh in zip(read_chunk_start, source_read_chunk_shape, write_chunk_stop, shape))
                read_chunks_iter = chunk_range(read_chunk_start, read_chunk_stop, source_chunk_shape, True, False)

                # Does the whole read window fit in the in-memory buffer?
                if all(stop - start <= rcs for start, stop, rcs in zip(read_chunk_start, read_chunk_stop, source_read_chunk_shape)):
                    for read_chunk in read_chunks_iter:
                        next(read_counter)

                    # Partial target chunks are only flushed at the array edge
                    is_end_chunk = any(wc.stop == ts for wc, ts in zip(write_chunk, target_shape))
                    for write_chunk1 in chunk_range(write_chunk_start, read_chunk_stop, target_chunk_shape, include_partial_chunks=is_end_chunk, clip_ends=False):
                        write_chunk2 = tuple(slice(wc.start, min(wc.stop, s)) for wc, s in zip(write_chunk1, target_shape))
                        # Only count target chunks fully contained in the read buffer
                        if all(all((wc.stop - wcs <= src, wc.start < wc.stop)) for wcs, wc, src in zip(read_chunk_start, write_chunk2, source_read_chunk_shape)):
                            write_chunk1_start = tuple(s.start for s in write_chunk2)
                            if write_chunk1_start not in writen_chunks:
                                next(write_counter)

                                writen_chunks.add(write_chunk1_start)
                else:
                    # Read window larger than the buffer: each source chunk is
                    # read (and clipped) for this single target chunk write.
                    for read_chunk in read_chunks_iter:
                        next(read_counter)

                    next(write_counter)

                    writen_chunks.add(write_chunk_start)

    return next(read_counter), next(write_counter)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def rechunker(source: object, shape: Tuple[int, ...], dtype: np.dtype, source_chunk_shape: Tuple[int, ...], target_chunk_shape: Tuple[int, ...], max_mem: int, sel=None) -> Iterator[Tuple[Tuple[slice, ...], np.ndarray]]:
    """
    This function takes a source dataset function with a specific chunk_shape and returns a generator that converts to a new chunk_shape. It optimises the rechunking by using an in-memory numpy ndarray with a size defined by the max_mem provided by the user.

    Parameters
    ----------
    source: array-like
        The source function to read the dataset/array. The function must have a single parameter input as a tuple of slices to retrieve an array chunk of data.
    shape: tuple of ints
        The shape of the source dataset, which will also be the shape of the target dataset unless sel is passed.
    dtype: np.dtype
        The numpy data type of the source/target.
    source_chunk_shape: tuple of ints
        The chunk_shape of the source.
    target_chunk_shape: tuple of ints
        The chunk_shape of the target.
    max_mem: int
        The max allocated memory to perform the chunking operation in bytes. This will only be as large as necessary for an optimum size chunk for the rechunking.
    sel: tuple of slices
        A subset selection of the source in the form of a tuple of slices. The starts and stops must be within the shape of the source.

    Returns
    -------
    Generator
        tuple of the target slices to the np.ndarray of data
    """
    itemsize = dtype.itemsize

    ## Calc the optimum read_chunk_shape (largest read buffer that fits in max_mem)
    source_read_chunk_shape = calc_source_read_chunk_shape(source_chunk_shape, target_chunk_shape, itemsize, max_mem)

    # Reusable in-memory staging buffer for all read/write regrouping
    mem_arr1 = np.zeros(source_read_chunk_shape, dtype=dtype)

    ## Calc ideal read chunking shape (per-dim lcm of source and target chunks)
    ideal_read_chunk_shape = calc_ideal_read_chunk_shape(source_chunk_shape, target_chunk_shape)

    chunk_start = tuple(0 for i in range(len(shape)))

    if sel is None:
        chunk_read_offset = chunk_start
        target_shape = shape
    else:
        # Checks: the selection must lie entirely within the source shape
        for s, sh in zip(sel, shape):
            if s.start < 0 or s.stop > sh:
                raise ValueError('The selection must be a subset of the source.')

        # Reads are offset by the selection start; yielded target slices are
        # relative to the selection (0-based), not the source.
        chunk_read_offset = tuple(s.start for s in sel)
        target_shape = tuple(s.stop - s.start for s in sel)

    ## If the read chunking is set to the ideal chunking case, then use the simple implementation. Otherwise, use the more complicated one.
    if source_read_chunk_shape == ideal_read_chunk_shape:
        # Simple case: each read group covers whole source AND target chunks.
        read_chunk_iter = chunk_range(chunk_start, target_shape, source_read_chunk_shape)
        for read_chunk_grp in read_chunk_iter:
            read_chunk_grp_start = tuple(s.start for s in read_chunk_grp)
            read_chunk_grp_stop = tuple(s.stop for s in read_chunk_grp)
            # Fill the buffer with every source chunk in the group
            for read_chunk in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, source_chunk_shape):
                # offset_slices: position of this source chunk inside the buffer
                offset_slices = tuple(slice(rc.start - wcs, rc.stop - wcs) for wcs, rc in zip(read_chunk_grp_start, read_chunk))
                # read_chunk1: the same region shifted into source coordinates
                read_chunk1 = tuple(slice(rc.start + cro, rc.stop + cro) for rc, cro in zip(read_chunk, chunk_read_offset))
                mem_arr1[offset_slices] = source(read_chunk1)

            # Emit every target chunk contained in the buffer
            for write_chunk1 in chunk_range(read_chunk_grp_start, read_chunk_grp_stop, target_chunk_shape):
                offset_slices = tuple(slice(wc.start - wcs, wc.stop - wcs) for wcs, wc in zip(read_chunk_grp_start, write_chunk1))

                yield write_chunk1, mem_arr1[offset_slices]

    else:
        writen_chunks = set() # Need to keep track of the bulk writes

        write_chunk_iter = chunk_range(chunk_start, target_shape, target_chunk_shape)

        for write_chunk in write_chunk_iter:
            write_chunk_start = tuple(s.start for s in write_chunk)
            if write_chunk_start not in writen_chunks:
                write_chunk_stop = tuple(s.stop for s in write_chunk)

                # Align the read window to source chunk boundaries, then extend
                # it to the read buffer size (clipped to the overall shape).
                read_chunk_start = tuple(rc * (wc//rc) for wc, rc in zip(write_chunk_start, source_chunk_shape))
                read_chunk_stop = tuple(min(max(rcs + rc, wc), sh) for rcs, rc, wc, sh in zip(read_chunk_start, source_read_chunk_shape, write_chunk_stop, shape))
                read_chunks_iter = chunk_range(read_chunk_start, read_chunk_stop, source_chunk_shape, True, False)

                # Does the whole read window fit in the in-memory buffer?
                if all(stop - start <= rcs for start, stop, rcs in zip(read_chunk_start, read_chunk_stop, source_read_chunk_shape)):
                    for read_chunk in read_chunks_iter:
                        # Source coordinates, clipped to the source shape
                        read_chunk1 = tuple(slice(rc.start + cro, min(rc.stop + cro, s)) for rc, cro, s in zip(read_chunk, chunk_read_offset, shape))
                        # Position of this source chunk inside the buffer
                        offset_slices = tuple(slice(rc.start - rcs - rco, rc.stop - rcs - rco) for rcs, rco, rc in zip(read_chunk_start, chunk_read_offset, read_chunk1))

                        mem_arr1[offset_slices] = source(read_chunk1)

                    # Partial target chunks are only flushed at the array edge
                    is_end_chunk = any(wc.stop == ts for wc, ts in zip(write_chunk, target_shape))
                    for write_chunk1 in chunk_range(write_chunk_start, read_chunk_stop, target_chunk_shape, include_partial_chunks=is_end_chunk, clip_ends=False):
                        write_chunk2 = tuple(slice(wc.start, min(wc.stop, s)) for wc, s in zip(write_chunk1, target_shape))
                        # Only emit target chunks fully contained in the read buffer
                        if all(all((wc.stop - wcs <= src, wc.start < wc.stop)) for wcs, wc, src in zip(read_chunk_start, write_chunk2, source_read_chunk_shape)):
                            write_chunk1_start = tuple(s.start for s in write_chunk2)
                            if write_chunk1_start not in writen_chunks:
                                offset_slices = tuple(slice(wc.start - rcs, wc.stop - rcs) for rcs, wc in zip(read_chunk_start, write_chunk2))

                                yield write_chunk2, mem_arr1[offset_slices]

                                writen_chunks.add(write_chunk1_start)
                else:
                    # Read window larger than the buffer: assemble this single
                    # target chunk by clipping each overlapping source chunk.
                    mem_read_chunk_slice = tuple(slice(0, wc.stop - wc.start) for wc in write_chunk)
                    for read_chunk in read_chunks_iter:
                        read_chunk1 = tuple(slice(rc.start + cro, rc.stop + cro) for rc, cro in zip(read_chunk, chunk_read_offset))
                        # Intersection of the source chunk and the target chunk
                        clip_read_chunk = get_slice_min_max(read_chunk, write_chunk)
                        read_slice = tuple(slice(cc.start - rc.start, cc.stop - rc.start) for cc, rc in zip(clip_read_chunk, read_chunk))
                        write_slice = tuple(slice(cc.start - rc.start, cc.stop - rc.start) for cc, rc in zip(clip_read_chunk, write_chunk))

                        mem_arr1[write_slice] = source(read_chunk1)[read_slice]

                    yield write_chunk, mem_arr1[mem_read_chunk_slice]

                    writen_chunks.add(write_chunk_start)
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
|