bsplyne 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ import numpy as np
2
+ import scipy.sparse as sps
3
+ import numba as nb
4
+
5
+
6
@nb.njit(cache=True)
def _wide_product_max_nnz(
    a_indptr: np.ndarray, b_indptr: np.ndarray, height: int
) -> int:
    """
    Count the nonzeros the row-wise wide product will contain.

    For each row, the product of the two row nonzero counts is the number of
    entries that row contributes to the result.

    Parameters
    ----------
    a_indptr : np.ndarray
        CSR pointer array of matrix A.
    b_indptr : np.ndarray
        CSR pointer array of matrix B.
    height : int
        Number of rows (shared by A and B).

    Returns
    -------
    int
        Sum over all rows of nnz(A[i, :]) * nnz(B[i, :]).
    """
    total = 0
    for row in range(height):
        total += (a_indptr[row + 1] - a_indptr[row]) * (
            b_indptr[row + 1] - b_indptr[row]
        )
    return total
33
+
34
+
35
@nb.njit(cache=True)
def _wide_product_row(
    a_data: np.ndarray,
    a_indices: np.ndarray,
    b_data: np.ndarray,
    b_indices: np.ndarray,
    b_width: int,
    out_data: np.ndarray,
    out_indices: np.ndarray,
) -> int:
    """
    Fill one row of the wide product into preallocated output arrays.

    For every pair (i, j) of nonzeros from the A row and the B row:
        out_indices = a_indices[i] * b_width + b_indices[j]
        out_data    = a_data[i] * b_data[j]

    Parameters
    ----------
    a_data : np.ndarray
        Nonzero values of the A row.
    a_indices : np.ndarray
        Column indices of the A row.
    b_data : np.ndarray
        Nonzero values of the B row.
    b_indices : np.ndarray
        Column indices of the B row.
    b_width : int
        Number of columns in B.
    out_data : np.ndarray
        Preallocated buffer receiving the row's values.
    out_indices : np.ndarray
        Preallocated buffer receiving the row's column indices.

    Returns
    -------
    int
        Number of entries written for this row.
    """
    count = 0
    for ia in range(a_data.shape[0]):
        # Column offset and value of this A entry are invariant over the inner loop.
        base = a_indices[ia] * b_width
        val_a = a_data[ia]
        for ib in range(b_data.shape[0]):
            out_indices[count] = base + b_indices[ib]
            out_data[count] = val_a * b_data[ib]
            count += 1
    return count
81
+
82
+
83
@nb.njit(cache=True)
def _wide_product_numba(
    height: int,
    a_data: np.ndarray,
    a_indices: np.ndarray,
    a_indptr: np.ndarray,
    a_width: int,
    b_data: np.ndarray,
    b_indices: np.ndarray,
    b_indptr: np.ndarray,
    b_width: int,
):
    """
    Row-wise wide (Khatri-Rao) product of two CSR matrices.

    Row i of the result is kron(A[i, :], B[i, :]), i.e. the Kronecker product
    of the i-th rows of A and B.

    Parameters
    ----------
    height : int
        Number of rows in A and B.
    a_data : np.ndarray
        CSR data array of matrix A.
    a_indices : np.ndarray
        CSR indices array of matrix A.
    a_indptr : np.ndarray
        CSR index pointer array of matrix A.
    a_width : int
        Number of columns in A (kept for signature symmetry; not used here).
    b_data : np.ndarray
        CSR data array of matrix B.
    b_indices : np.ndarray
        CSR indices array of matrix B.
    b_indptr : np.ndarray
        CSR index pointer array of matrix B.
    b_width : int
        Number of columns in B.

    Returns
    -------
    out_data : np.ndarray
        Data array of the resulting CSR matrix.
    out_indices : np.ndarray
        Indices array of the resulting CSR matrix.
    out_indptr : np.ndarray
        Index pointer array of the resulting CSR matrix.
    total_nnz : int
        Total number of nonzero entries written.
    """
    capacity = _wide_product_max_nnz(a_indptr, b_indptr, height)
    out_data = np.empty(capacity, dtype=a_data.dtype)
    out_indices = np.empty(capacity, dtype=a_indices.dtype)
    out_indptr = np.empty(height + 1, dtype=a_indptr.dtype)

    cursor = 0
    for row in range(height):
        out_indptr[row] = cursor
        a_lo, a_hi = a_indptr[row], a_indptr[row + 1]
        b_lo, b_hi = b_indptr[row], b_indptr[row + 1]
        cursor += _wide_product_row(
            a_data[a_lo:a_hi],
            a_indices[a_lo:a_hi],
            b_data[b_lo:b_hi],
            b_indices[b_lo:b_hi],
            b_width,
            out_data[cursor:],
            out_indices[cursor:],
        )
    out_indptr[height] = cursor
    return out_data[:cursor], out_indices[:cursor], out_indptr, cursor
158
+
159
+
160
def my_wide_product(A: sps.spmatrix, B: sps.spmatrix) -> sps.csr_matrix:
    """
    Row-by-row "1D" Kronecker product of two sparse matrices.

    The result C satisfies C[i, :] = kron(A[i, :], B[i, :]) for every row i,
    so A and B must have the same number of rows.

    Parameters
    ----------
    A : scipy.sparse.spmatrix
        First input sparse matrix.
    B : scipy.sparse.spmatrix
        Second input sparse matrix.

    Returns
    -------
    C : scipy.sparse.csr_matrix
        Sparse matrix of shape (A.shape[0], A.shape[1] * B.shape[1]).
    """
    if A.shape[0] != B.shape[0]:
        raise ValueError("A and B must have the same number of rows")

    # CSR layout gives contiguous per-row data/indices slices for the kernel.
    A = A if sps.isspmatrix_csr(A) else A.tocsr()
    B = B if sps.isspmatrix_csr(B) else B.tocsr()

    height, a_width = A.shape
    b_width = B.shape[1]

    data, indices, indptr, _ = _wide_product_numba(
        height,
        A.data,
        A.indices,
        A.indptr,
        a_width,
        B.data,
        B.indices,
        B.indptr,
        b_width,
    )

    # Assemble the CSR matrix directly from the computed arrays.
    return sps.csr_matrix((data, indices, indptr), shape=(height, a_width * b_width))
@@ -0,0 +1,378 @@
1
+ import multiprocessing as mp
2
+ from multiprocessing import shared_memory
3
+ import threading
4
+ import queue
5
+ from typing import Iterable, Union, Callable
6
+ from tqdm import tqdm
7
+ import numpy as np
8
+ import tempfile, os
9
+ import time
10
+ import gc
11
+
12
+
13
def _save_worker(save_queue: queue.Queue):
    """
    Background worker that drains a queue and saves each result to disk.

    Runs in a dedicated thread: repeatedly pulls `(fname, result)` pairs from
    `save_queue` and writes each result with `np.save`, stopping when a `None`
    sentinel is received.

    Parameters
    ----------
    save_queue : queue.Queue
        Thread-safe queue of `(fname, result)` pairs; a `None` item stops the
        worker.

    Notes
    -----
    - Each result is wrapped in a length-1 `numpy.ndarray` with `dtype=object`
      so arbitrary Python objects can be pickled via `np.save`.
    - The queue may be bounded by its creator to cap memory usage.
    """
    # iter() with a sentinel keeps calling get() until it returns None.
    for item in iter(save_queue.get, None):
        fname, result = item
        boxed = np.empty(1, dtype=object)
        boxed[0] = result
        np.save(fname, boxed, allow_pickle=True)
        save_queue.task_done()
41
+
42
+
43
def _worker(
    func: Iterable[Callable],
    block_args: Iterable[tuple],
    idx: int,
    temp_dir: str,
    verbose: bool,
    jupyter: bool,
    pbar_title_prefix: str,
    shared_mem: Union[tuple[str, tuple[int, ...], type], None],
) -> str:
    """
    Apply functions to argument tuples, saving results asynchronously.

    Iterates over `block_args`, applying `func[i]` to each tuple, and delegates
    disk writes to a background thread so computation does not wait on I/O.

    Parameters
    ----------
    func : Iterable[Callable]
        Functions to apply; `func[i]` is called with the unpacked i-th tuple.
    block_args : Iterable[tuple]
        Argument tuples, one per call.
    idx : int
        Index of this block; used for progress-bar position and folder naming.
    temp_dir : str
        Directory in which the block's result folder is created.
    verbose : bool
        If True, show the progress bar.
    jupyter : bool
        If True, pin the progress bar at position 0 (Jupyter compatibility);
        otherwise use `idx` as the position.
    pbar_title_prefix : str
        Prefix for the progress-bar description.
    shared_mem : tuple[str, tuple[int, ...], type] or None
        Optional `(name, shape, dtype)` of an existing shared-memory block.
        When given, the reconstructed array is appended as the last positional
        argument of every call.

    Returns
    -------
    str
        Path of the folder containing this block's `res_{i}.npy` files.

    Notes
    -----
    - Results are written by `_save_worker` through a bounded queue, limiting
      memory usage when saving is slower than computation.
    - Each file holds one `numpy.ndarray` with `dtype=object` and one element.
    - The writer thread is always signalled and joined, even if a task raises,
      so the process cannot hang on the non-daemon thread.
    """
    position = 0 if jupyter else idx
    block_folder = os.path.join(temp_dir, f"block_{idx}")
    os.makedirs(block_folder, exist_ok=True)

    # Bounded queue so computation cannot run arbitrarily far ahead of disk writes.
    save_queue = queue.Queue(maxsize=10)
    writer_thread = threading.Thread(target=_save_worker, args=(save_queue,))
    writer_thread.start()

    try:
        with tqdm(
            block_args,
            desc=f"{pbar_title_prefix}: Block {idx}",
            disable=not verbose,
            position=position,
        ) as pbar:
            if shared_mem is not None:
                shm_name, shape, dtype = shared_mem
                shm = shared_memory.SharedMemory(name=shm_name)
                try:
                    array = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
                    for i, args in enumerate(pbar):
                        result = func[i](*args, array)
                        fname = os.path.join(block_folder, f"res_{i}.npy")
                        save_queue.put((fname, result))  # blocks if the queue is full
                finally:
                    shm.close()
            else:
                for i, args in enumerate(pbar):
                    result = func[i](*args)
                    fname = os.path.join(block_folder, f"res_{i}.npy")
                    save_queue.put((fname, result))  # blocks if the queue is full
    finally:
        # Always stop and join the writer thread, even when a task raised;
        # otherwise the process would hang forever on this non-daemon thread.
        save_queue.put(None)  # stop sentinel
        writer_thread.join()  # wait until everything is written
    gc.collect()
    return block_folder
129
+
130
+
131
def parallel_blocks_inner(
    funcs: Iterable[Callable],
    all_args: Iterable[tuple],
    num_blocks: int,
    verbose: bool,
    pbar_title: str,
    disable_parallel: bool,
    shared_mem_last_arg: Union[np.ndarray, None],
) -> list:
    """
    Execute functions with their argument tuples, optionally in parallel blocks.

    Runs tasks either sequentially or split into `num_blocks` groups, each
    handled by a separate worker process. Workers save intermediate results to
    disk as `.npy` files (limiting memory), which are reloaded in the original
    order after all processes complete.

    Parameters
    ----------
    funcs : Iterable[Callable]
        Functions to execute; same length as `all_args`. Each is called as
        `func(*args)` with its corresponding tuple.
    all_args : Iterable[tuple]
        Argument tuples, one per function.
    num_blocks : int
        Number of worker processes used when `disable_parallel` is False.
    verbose : bool
        If True, show progress bars for block processing and result gathering.
    pbar_title : str
        Prefix for progress-bar descriptions.
    disable_parallel : bool
        If True, run everything sequentially in the current process.
    shared_mem_last_arg : Union[np.ndarray, None]
        Optional array placed in shared memory and appended as the last
        argument of each task; lets workers read large data without copies.

    Returns
    -------
    list
        Results of each call, preserving the original task order.

    Notes
    -----
    - In parallel mode, each worker writes its results into a temporary
      subfolder; files and folders are deleted after gathering.
    - The shared-memory segment is released even if a worker fails.
    - Intended for internal use by `parallel_blocks()`.
    """
    n_tasks = len(all_args)

    # Detect Jupyter so per-block progress bars do not stack incorrectly.
    try:
        from IPython import get_ipython

        jupyter = get_ipython().__class__.__name__ == "ZMQInteractiveShell"
    except Exception:  # IPython absent, or no interactive shell available
        jupyter = False

    # Sequential execution path.
    if disable_parallel:
        if shared_mem_last_arg is not None:
            all_args = [list(args) + [shared_mem_last_arg] for args in all_args]
        results = []
        for func, args in tqdm(
            zip(funcs, all_args),
            total=n_tasks,
            desc=pbar_title,
            disable=not verbose,
            position=0,
        ):
            results.append(func(*args))
        return results

    # Copy the shared array into a shared-memory segment once, if requested.
    if shared_mem_last_arg is not None:
        shm = shared_memory.SharedMemory(create=True, size=shared_mem_last_arg.nbytes)
        np.ndarray(
            shared_mem_last_arg.shape, dtype=shared_mem_last_arg.dtype, buffer=shm.buf
        )[:] = shared_mem_last_arg
        shared_mem = (shm.name, shared_mem_last_arg.shape, shared_mem_last_arg.dtype)
    else:
        shm = None
        shared_mem = None

    # Split functions and arguments into blocks of near-equal size.
    nb_each, extras = divmod(n_tasks, num_blocks)
    sizes = extras * [nb_each + 1] + (num_blocks - extras) * [nb_each]
    blocks = []
    funcs_blocks = []
    start = 0
    for size in sizes:
        end = start + size
        blocks.append(all_args[start:end])
        funcs_blocks.append(funcs[start:end])
        start = end

    temp_dir = tempfile.mkdtemp(prefix="parallel_blocks_")
    worker_args = [
        (func_block, block, i, temp_dir, verbose, jupyter, pbar_title, shared_mem)
        for i, (func_block, block) in enumerate(zip(funcs_blocks, blocks))
    ]

    # Start the worker processes.
    try:
        with mp.Pool(num_blocks) as pool:
            files = pool.starmap(_worker, worker_args)
    finally:
        # Release the shared-memory segment even if a worker failed.
        if shm is not None:
            shm.close()
            shm.unlink()

    # Reload results in the original order, deleting temp files as we go.
    with tqdm(total=n_tasks, desc=f"{pbar_title}: Gather", disable=not verbose) as pbar:
        results = []
        for idx, block_folder in enumerate(files):
            for i in range(len(blocks[idx])):
                fpath = os.path.join(block_folder, f"res_{i}.npy")
                results.append(np.load(fpath, allow_pickle=True)[0])
                os.remove(fpath)
                pbar.update(1)
            os.rmdir(block_folder)
    os.rmdir(temp_dir)
    return results
270
+
271
+
272
def parallel_blocks(
    funcs: Union[Callable, Iterable[Callable]],
    all_args: Union[Iterable[tuple], None] = None,
    num_blocks: Union[int, None] = None,
    verbose: bool = True,
    pbar_title: str = "Processing blocks",
    disable_parallel: bool = False,
    est_proc_cost: float = 0.5,
    shared_mem_last_arg: Union[np.ndarray, None] = None,
) -> list:
    """
    Run independent tasks sequentially or in parallel based on estimated cost.

    The first task is timed to decide whether parallelization is worth the
    process-creation overhead. If so, the remaining tasks are distributed
    across parallel blocks; otherwise everything runs sequentially. Useful
    when task runtimes are short or variable compared to spawning costs.

    Parameters
    ----------
    funcs : Union[Callable, Iterable[Callable]]
        A single function applied to every tuple in `all_args`, or a list of
        functions (one per task, same length as `all_args`).
    all_args : Union[Iterable[tuple], None], optional
        Positional-argument tuples, one per task. If the functions take no
        arguments, leave as `None` (defaults to empty tuples).
    num_blocks : Union[int, None], optional
        Number of worker processes. Defaults to half the CPU cores; a value
        of 1 forces sequential execution.
    verbose : bool, optional
        If True, print timing info and show progress bars. Default True.
    pbar_title : str, optional
        Progress-bar title prefix. Default "Processing blocks".
    disable_parallel : bool, optional
        If True, force sequential execution. Default False.
    est_proc_cost : float, optional
        Estimated process-creation cost in seconds, used to decide whether
        parallelism yields a net speedup. Default 0.5.
    shared_mem_last_arg : Union[np.ndarray, None], optional
        Array placed in shared memory and appended as the last argument of
        every task, avoiding per-process copies of large read-only data.

    Returns
    -------
    list
        One result per task, preserving the input order.
    """
    if num_blocks is None:
        num_blocks = max(1, os.cpu_count() // 2)

    # Normalize funcs/all_args into two equal-length lists.
    if callable(funcs):
        assert (
            all_args is not None
        ), "If 'funcs' is a single callable, 'all_args' must be provided as a list of argument tuples."
        funcs = [funcs] * len(all_args)
    else:
        if all_args is None:
            all_args = [()] * len(funcs)
        assert len(funcs) == len(
            all_args
        ), "If 'funcs' is an iterable of callables, its length must match the number of argument tuples in 'all_args'."

    n_tasks = len(all_args)
    # Trivial workloads and explicit requests always run sequentially.
    if disable_parallel or num_blocks == 1 or n_tasks <= 1:
        return parallel_blocks_inner(
            funcs, all_args, num_blocks, verbose, pbar_title, True, shared_mem_last_arg
        )

    # Time the first task to estimate per-task cost.
    t0 = time.time()
    first_result = (
        funcs[0](*all_args[0])
        if shared_mem_last_arg is None
        else funcs[0](*all_args[0], shared_mem_last_arg)
    )
    t_first = time.time() - t0
    # Parallelism pays off only when the per-task time exceeds the
    # process-creation cost amortized over the remaining tasks.
    t_thresh = (num_blocks / (num_blocks - 1)) * (num_blocks / n_tasks) * est_proc_cost
    disable_parallel = t_first <= t_thresh
    if verbose:
        print(
            f"First task time: {t_first:.3f}s, threshold: {t_thresh:.3f}s -> "
            f"{'Sequential' if disable_parallel else 'Parallel'}"
        )
    results_rest = parallel_blocks_inner(
        funcs[1:],
        all_args[1:],
        num_blocks,
        verbose,
        pbar_title,
        disable_parallel,
        shared_mem_last_arg,
    )
    return [first_result] + results_rest