integrate_module 0.99.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1836 @@
1
+ """
2
+ Rejection Sampling Module for INTEGRATE
3
+
4
+ This module contains functions for Bayesian inversion using rejection sampling methodology.
5
+ It includes the main rejection sampling algorithm, likelihood calculations, and parallel
6
+ processing support for efficient posterior sampling.
7
+
8
+ Key Functions:
9
+ - integrate_rejection(): Main rejection sampling function
10
+ - integrate_rejection_range(): Core rejection sampling for data point ranges
11
+ - likelihood_*(): Various likelihood calculation functions
12
+ - Shared memory functions for parallel processing
13
+ """
14
+
15
+ import numpy as np
16
+ import h5py
17
+ import os
18
+ import time
19
+ import multiprocessing
20
+ from multiprocessing import shared_memory
21
+ from datetime import datetime
22
+ from tqdm import tqdm
23
+ import logging
24
+
25
+ # Set up logging
26
+ logger = logging.getLogger(__name__)
27
+
28
def integrate_rejection(f_prior_h5='prior.h5',
                        f_data_h5='DAUGAAD_AVG_inout.h5',
                        f_post_h5='',
                        N_use=100000000000,
                        id_use=[],
                        ip_range=[],
                        nr=1000,
                        autoT=1,
                        T_base = 1,
                        Nchunks=0,
                        Ncpu=0,
                        parallel=True,
                        use_N_best=0,
                        T_N_above=None,
                        T_P_acc_level=None,
                        progress_callback=None,
                        console_progress=None,
                        backend='numpy',
                        **kwargs):
    """
    Perform probabilistic inversion using rejection sampling.

    Loads observed data and prior (forward modeled) data, runs rejection
    sampling for every requested data point through one of three execution
    paths (``backend='jax'``, parallel NumPy, or serial NumPy), writes the
    result arrays to an HDF5 file and optionally computes posterior statistics.

    Parameters
    ----------
    f_prior_h5 : str, optional
        Path to HDF5 file containing prior model and data samples.
        Default is 'prior.h5'.
    f_data_h5 : str, optional
        Path to HDF5 file containing observed data for inversion.
        Default is 'DAUGAAD_AVG_inout.h5'.
    f_post_h5 : str, optional
        Output path for posterior samples. If empty, the name
        ``POST_<prior basename>_Nu<N_use>_aT<autoT>.h5`` is generated inside
        ``post_dir`` (see ``**kwargs``). ``N_use`` is inserted as passed, i.e.
        before it is clipped to the number of available prior samples.
        Default is empty string.
    N_use : int, optional
        Maximum number of prior samples to use for inversion.
        Default is 100000000000.
    id_use : list, optional
        List of data identifiers to use for inversion. A non-list value is
        wrapped in a list. If empty, ``1..Ndt_total`` is used.
        Default is empty list (never mutated).
    ip_range : list, optional
        List of data point indices to invert. If empty, inverts all data points.
        Default is empty list (never mutated).
    nr : int, optional
        Number of posterior samples to retain per data point.
        Default is 1000.
    autoT : int, optional
        Automatic temperature estimation method (1=enabled, 0=disabled).
        Default is 1.
    T_base : float, optional
        Base temperature for rejection sampling when autoT=0.
        Default is 1.
    Nchunks : int, optional
        Number of chunks for parallel processing. If 0, auto-determined:
        ``Ncpu`` chunks, or ``min(len(ip_range), 4*Ncpu)`` when a progress
        callback is supplied. Default is 0.
    Ncpu : int, optional
        Number of CPU cores to use. If smaller than 1, auto-determined from
        the system. ``Ncpu == 1`` disables the parallel path.
        Default is 0.
    parallel : bool, optional
        Enable parallel processing.
        Default is True.
    use_N_best : int, optional
        Use only the N best-fitting samples (0=disabled).
        Default is 0.
    T_N_above : int, optional
        Forwarded to the sampling routines as ``T_N_above``.
        Default is None, which is resolved to 10.
    T_P_acc_level : float, optional
        Forwarded to the sampling routines as ``T_P_acc_level``.
        Default is None, which is resolved to 0.2.
    progress_callback : callable, optional
        Callback function for progress updates. Called as
        progress_callback(current, total, info_dict).
        Default is None (no callback).
    console_progress : bool, optional
        Whether to show console TQDM progress bar. If None, it is enabled only
        when no ``progress_callback`` is given.
        Default is None.
    backend : str, optional
        ``'jax'`` dispatches to
        ``integrate.integrate_rejection_jax.integrate_rejection_range_jax``
        (``Nbatch`` is taken from ``**kwargs``, default 64). Any other value
        uses the NumPy implementation in this module. Default is 'numpy'.
    **kwargs : dict
        Recognised here: ``showInfo`` (default 0), ``updatePostStat``
        (default True), ``post_dir`` (default: current working directory),
        ``Nbatch``, and the aliases ``Nproc`` / ``N_cpu`` (override ``Ncpu``)
        and ``N_chunks`` (overrides ``Nchunks``). The remaining content is
        forwarded to the serial/jax sampling routine and to
        ``ig.integrate_posterior_stats``.

    Returns
    -------
    str or None
        Path to the output HDF5 file. ``None`` is returned, and nothing is
        done, when the function is invoked from a process whose name is not
        ``'MainProcess'``.

    Examples
    --------
    >>> import integrate as ig
    >>> f_post = ig.integrate_rejection('prior.h5', 'data.h5', N_use=10000)
    >>> print(f"Results saved to: {f_post}")
    """
    # Safety guard: if somehow called from a worker process, do nothing.
    if multiprocessing.current_process().name != 'MainProcess':
        return None

    import integrate as ig

    # get optional arguments
    showInfo = kwargs.get('showInfo', 0)
    updatePostStat = kwargs.get('updatePostStat', True)
    # If set, Nproc will be used as the number of processors
    Ncpu = kwargs.get('Nproc', Ncpu)
    Ncpu = kwargs.get('N_cpu', Ncpu)  # Allow using N_cpu instead of Ncpu
    Nchunks = kwargs.get('N_chunks', Nchunks)  # Allow using N_chunks instead of Nchunks
    posterior_output_path = kwargs.get('post_dir', os.getcwd())

    # Setup progress callback functionality
    if console_progress is None:
        # Auto-detect: disable console if callback provided
        console_progress = (progress_callback is None)

    def update_progress(current, total, extra_info=None):
        """Forward a progress update to the user callback, never raising."""
        if not progress_callback:
            return
        try:
            info = {
                'data_point': current,
                'total_points': total,
                'phase': extra_info.get('phase', 'processing') if extra_info else 'processing',
                'status': extra_info.get('status', '') if extra_info else ''
            }
            if extra_info:
                info.update(extra_info)
            progress_callback(current, total, info)
        except Exception as e:
            # Don't break main process on callback error
            if showInfo > 0:
                print(f"Progress callback error: {e}")
                import traceback
                traceback.print_exc()

    # Note: TQDM disabling is handled in individual tqdm() calls via disable parameter

    if Ncpu < 1:
        Ncpu = int(multiprocessing.cpu_count())
        # Set Ncpu to be min of Ncpu and 8
        # as no gain is expected from using more than 8 processors
        # NOTE(review): indentation was lost in the reviewed source; the cap is
        # applied to the auto-detected value only, so an explicit Ncpu is
        # respected -- confirm against the upstream file.
        Ncpu = min(Ncpu, 8)

    # Initial progress update - starting process
    if progress_callback:
        update_progress(0, 1, {'phase': 'initializing', 'status': 'Starting rejection sampling'})

    # Set default f_post_h5 filename if not set
    if len(f_post_h5) == 0:
        # Extract the base name of f_prior_h5 without its path or extension
        f_prior_basename = os.path.splitext(os.path.basename(f_prior_h5))[0]

        # Construct the new filename
        f_post_h5 = os.path.join(posterior_output_path, "POST_%s_Nu%d_aT%d.h5" % (f_prior_basename, N_use, autoT))

    # Check whether f_post_h5 already exists, and warn the user
    if os.path.isfile(f_post_h5):
        if showInfo > 0:
            print('File %s allready exists' % f_post_h5)
            print('Overwriting...')

    # Only the number of data sets is needed here; the data itself is loaded
    # below once the set of data types to use is known.
    Ndt_total = ig.get_number_of_datasets(f_data_h5)

    # if id_use is not a list, convert it to a list
    if not isinstance(id_use, list):
        id_use = [id_use]

    if len(id_use) == 0:
        id_use = np.arange(1, Ndt_total + 1).tolist()

    Ndt = len(id_use)  # Number of data types used

    if showInfo > 1:
        print('-- Number of data types: %d' % Ndt_total)
        print('-- Using these data types: %s' % str(id_use))

    # Load the observed data from the h5 files
    DATA = ig.load_data(f_data_h5, id_arr=id_use, showInfo=showInfo)

    # Load the prior data from the h5 files
    id_data_prior = DATA['id_prior']
    D, idx = ig.load_prior_data(f_prior_h5, id_use=id_data_prior, N_use=N_use, Randomize=True, showInfo=showInfo)

    if showInfo > 1:
        for i in range(len(D)):
            print('Memory size of /D%d: %s' % (id_use[i], str(np.array(D[i]).nbytes)))

        # Print information about DATA and D, and make sure the same data types are used in both D and DATA
        # NOTE(review): indentation was lost in the reviewed source; these
        # diagnostics are assumed to belong to the showInfo>1 block -- confirm.
        print(' Number of data types in DATA: %d' % len(DATA['d_obs']))
        print(' Number of data types in D: %d' % len(D))
        for i in range(len(id_use)):
            print(' Size of data in DATA:/D%d: (%d, %d)' % (id_use[i], DATA['d_obs'][i].shape[0], DATA['d_obs'][i].shape[1]))
            print(' Size of prior data PRIOR:/D%d: (%d, %d)' % (id_use[i], D[i].shape[0], D[i].shape[1]))

    # Get sample size N from the loaded prior data and clip N_use to it
    N = D[0].shape[0]
    if N_use > N:
        N_use = N

    # Get number of data points from f_data_h5
    Ndp = DATA['d_obs'][0].shape[0]

    # if ip_range is empty then use all data points
    if len(ip_range) == 0:
        ip_range = np.arange(Ndp)
    Ndp_invert = len(ip_range)

    # Store the ip_range for later use in integrate_posterior_stats
    # (ip_range is rebound by the serial/jax paths below).
    ip_range_for_stats = np.copy(ip_range)

    if Ncpu == 1:
        parallel = False

    if showInfo > 0:
        print('<--INTEGRATE_REJECTION-->')
        print('f_prior_h5=%s, f_data_h5=%s\nf_post_h5=%s' % (f_prior_h5, f_data_h5, f_post_h5))

    if showInfo > 1:
        print('Number of data points: %d (available), %d (used). Nchunks=%s, Ncpu=%d,use_N_best=%d' % (Ndp,Ndp_invert,Nchunks,Ncpu,use_N_best))
        print('N_use = %d' % (N_use))
        print('Ndp to invert = %d, ip_range=%s' % (len(ip_range),str(ip_range)))
        print('use_N_best=%d' % use_N_best)
        print('Number of data types: %d' % Ndt)
        print('Using these data types: %s' % str(id_use))
        print('Loaded these prior data model types:', str(id_data_prior))

    # Result containers covering ALL data points. Rows of data points that are
    # not inverted keep their initial content: random indices in i_use_all and
    # NaN everywhere else.
    i_use_all = np.random.randint(0, N, (Ndp, nr))
    N_UNIQUE_all = np.zeros(Ndp)*np.nan
    T_all = np.zeros(Ndp)*np.nan
    EV_all = np.zeros(Ndp)*np.nan
    # 'posterior' evidence - mean posterior likelihood TODO
    EV_post_all = np.zeros(Ndp)*np.nan
    EV_post_all_mean = np.zeros(Ndp)*np.nan
    CHI2_all = np.zeros((Ndp, Ndt))*np.nan

    def _store_range_results(ips, i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE):
        """Scatter per-range results (row i) into the full arrays (row ips[i])."""
        for i, ip in enumerate(ips):
            i_use_all[ip] = i_use[i]
            T_all[ip] = T[i]
            EV_all[ip] = EV[i]
            EV_post_all[ip] = EV_post[i]
            EV_post_all_mean[ip] = EV_post_mean[i]
            CHI2_all[ip, :] = CHI2[i, :]
            N_UNIQUE_all[ip] = N_UNIQUE[i]

    t_start = datetime.now()
    date_start = str(t_start)

    # Resolve None to logl_T_est defaults before forwarding into parallel/serial paths
    _T_N_above = T_N_above if T_N_above is not None else 10
    _T_P_acc_level = T_P_acc_level if T_P_acc_level is not None else 0.2

    # Three execution paths operate directly on the loaded data:
    # backend='jax', parallel NumPy, and serial NumPy.
    if backend == 'jax':
        from integrate.integrate_rejection_jax import integrate_rejection_range_jax
        # Nbatch is removed from kwargs so it is not passed twice below.
        Nbatch = kwargs.pop('Nbatch', 64)
        i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE, ip_range = integrate_rejection_range_jax(
            D=D,
            DATA=DATA,
            idx=idx,
            N_use=N_use,
            id_use=id_use,
            ip_range=ip_range,
            autoT=autoT,
            T_base=T_base,
            nr=nr,
            T_N_above=_T_N_above,
            T_P_acc_level=_T_P_acc_level,
            progress_callback=progress_callback,
            Nbatch=Nbatch,
            **kwargs,
        )
        _store_range_results(ip_range, i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE)

    elif parallel:
        # Split the ip_range into Nchunks
        if Nchunks == 0:
            if progress_callback is not None:
                # Finer chunking gives smoother live progress updates
                Nchunks = max(1, min(len(ip_range), Ncpu * 4))
            else:
                Nchunks = Ncpu
        # Shuffle a copy so the caller's ip_range is left untouched.
        ip_range_shuffled = ip_range.copy()
        np.random.shuffle(ip_range_shuffled)
        ip_chunks = np.array_split(ip_range_shuffled, Nchunks)

        if showInfo > 1:
            print('Ncpu = %d\nNchunks=%d' % (Ncpu, Nchunks))

        i_use_all, T_all, EV_all, EV_post_all, EV_post_all_mean, CHI2_all, N_UNIQUE_all = integrate_posterior_main(
            ip_chunks=ip_chunks,
            D=D,
            DATA=DATA,
            idx=idx,
            N_use=N_use,
            id_use=id_use,
            autoT=autoT,
            T_base=T_base,
            nr=nr,
            Ncpu=Ncpu,
            use_N_best=use_N_best,
            T_N_above=_T_N_above,
            T_P_acc_level=_T_P_acc_level,
            progress_callback=update_progress if progress_callback else None,
        )

    else:
        i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE, ip_range = integrate_rejection_range(
            D=D,
            DATA=DATA,
            idx=idx,
            N_use=N_use,
            id_use=id_use,
            ip_range=ip_range,
            autoT=autoT,
            T_base=T_base,
            nr=nr,
            use_N_best=use_N_best,
            T_N_above=_T_N_above,
            T_P_acc_level=_T_P_acc_level,
            progress_callback=progress_callback,
            console_progress=console_progress,
            **kwargs
        )
        _store_range_results(ip_range, i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE)

    # Where T_all / EV_all is Inf, set it to NaN
    T_all[T_all == np.inf] = np.nan
    EV_all[EV_all == np.inf] = np.nan

    t_end = datetime.now()
    date_end = str(t_end)
    t_elapsed = (t_end - t_start).total_seconds()
    # Guard both ratios: an empty range or a zero-length timing interval must
    # not abort the run right before the results are written.
    t_per_sounding = t_elapsed / Ndp_invert if Ndp_invert > 0 else np.nan
    soundings_per_s = Ndp_invert / t_elapsed if t_elapsed > 0 else np.inf
    if showInfo > -1:
        print('integrate_rejection: Time=%5.1fs/%d soundings, %4.1fms/sounding, %3.1fit/s. ' % (t_elapsed,Ndp_invert,t_per_sounding*1000,soundings_per_s), end='')
        print('T_av=%3.1f, EV_av=%3.1f' % (np.nanmean(T_all), np.nanmean(EV_all)))

    # SAVE THE RESULTS to f_post_h5 (an existing file is overwritten)
    with h5py.File(f_post_h5, 'w') as f_post:
        f_post.create_dataset('i_use', data=i_use_all)
        f_post.create_dataset('T', data=T_all)
        f_post.create_dataset('EV', data=EV_all)
        f_post.create_dataset('EV_post', data=EV_post_all)
        f_post.create_dataset('EV_post_mean', data=EV_post_all_mean)
        f_post.create_dataset('CHI2', data=CHI2_all)
        f_post.create_dataset('N_UNIQUE', data=N_UNIQUE_all)
        f_post.attrs['date_start'] = date_start
        f_post.attrs['date_end'] = date_end
        f_post.attrs['inv_time'] = t_elapsed
        f_post.attrs['f5_prior'] = f_prior_h5
        f_post.attrs['f5_data'] = f_data_h5
        f_post.attrs['N_use'] = N_use

    # Update progress - saving results
    if progress_callback:
        update_progress(len(ip_range), len(ip_range), {'phase': 'saving', 'status': 'Results saved to HDF5 file'})

    if updatePostStat:
        if progress_callback:
            update_progress(len(ip_range), len(ip_range), {'phase': 'post_processing', 'status': 'Computing posterior statistics'})
        ig.integrate_posterior_stats(f_post_h5, ip_range=ip_range_for_stats, **kwargs)

    # Final progress update - completion
    if progress_callback:
        update_progress(len(ip_range), len(ip_range), {'phase': 'completed', 'status': 'Integration completed successfully'})

    return f_post_h5
447
+
448
+
449
+
450
def integrate_rejection_range(D,
                              DATA,
                              idx = [],
                              N_use=None,
                              id_use=[],
                              ip_range=[],
                              nr=1000,
                              autoT=1,
                              T_base = 1,
                              T_N_above=10,
                              T_P_acc_level=0.2,
                              progress_callback=None,
                              **kwargs):
    """
    Perform rejection sampling for a specific range of data points.

    For every data point in ``ip_range`` the log-likelihood of all prior
    samples is evaluated per data type, summed over data types, tempered with
    an annealing temperature, and ``nr`` sample indices are drawn with
    probability proportional to the tempered likelihood.

    Parameters
    ----------
    D : list
        List of forward modeled data arrays, one per data type. The list is
        shallow-copied internally, so the caller's list is not modified when
        multinomial data are converted from class ids to class indices.
    DATA : dict
        Observed data. The keys read here are 'd_obs', 'd_std', 'Cd',
        'noise_model' and 'i_use', each indexed by data type.
    idx : array_like, optional
        Maps row positions in ``D`` to the indices stored in the output
        (applied when ``useRandomData`` is true). If empty,
        ``np.arange(N_use)`` is used. Default is empty list (never mutated).
    N_use : int, optional
        Number of prior samples; only used to build the default ``idx``.
        Clipped to the number of rows in ``D[0]``.
        Default is None (use all rows of ``D[0]``).
    id_use : list, optional
        Data identifiers; only its length (the number of data types) is used.
        Default is [] which uses all data types in ``DATA['d_obs']``.
    ip_range : list, optional
        Data point indices to process. If empty, processes all data points.
        Default is empty list (never mutated).
    nr : int, optional
        Number of posterior samples to retain per data point.
        Default is 1000.
    autoT : int, optional
        If 1 the temperature is estimated with ``ig.logl_T_est``; otherwise
        ``T_base`` is used. Default is 1.
    T_base : float, optional
        Temperature used when ``autoT != 1``. Default is 1.
    T_N_above : int, optional
        Passed as ``N_above`` to ``ig.logl_T_est``. Default is 10.
    T_P_acc_level : float, optional
        Passed as ``P_acc_lev`` to ``ig.logl_T_est``. Default is 0.2.
    progress_callback : callable, optional
        Called as ``progress_callback(current, total, info_dict)`` roughly 100
        times over the range; exceptions raised by it are ignored.
        Default is None (no callbacks).
    **kwargs : dict
        ``use_N_best`` (default 0), ``showInfo`` (default 0),
        ``console_progress`` (default True) and ``useRandomData``
        (default True). Other entries are ignored.

    Returns
    -------
    i_use_all : ndarray of int32, shape (nump, nr)
        Indices of the drawn posterior samples for each data point.
    T_all : ndarray, shape (nump,)
        Temperature used for each data point.
    EV_all : ndarray, shape (nump,)
        Log-evidence (log of the mean likelihood) for each data point.
    EV_post_all : ndarray, shape (nump,)
        Posterior evidence; currently always NaN (not implemented).
    EV_post_all_mean : ndarray, shape (nump,)
        Posterior evidence per datum; currently always NaN.
    CHI2_all : ndarray, shape (nump, Ndt)
        Mean of ``-2*logL`` over the drawn samples divided by the number of
        non-NaN observations, per data type (NaN where no data were used).
    N_UNIQUE_all : ndarray, shape (nump,)
        Number of unique drawn indices for each data point.
    ip_range : ndarray or list
        The data point indices that were processed.

    Raises
    ------
    ValueError
        If a gaussian data type that is in use has neither ``Cd`` nor
        ``d_std`` available.
    """

    import integrate as ig

    # get optional arguments
    use_N_best = kwargs.get('use_N_best', 0)
    showInfo = kwargs.get('showInfo', 0)
    console_progress = kwargs.get('console_progress', True)
    disableTqdm = True if showInfo < 0 else (not console_progress)
    useRandomData = kwargs.get('useRandomData', True)

    # Shallow copy: entries of multinomial data types are replaced below, and
    # that replacement must not leak into the caller's list (a second call with
    # the same list would otherwise convert already-converted data).
    D = list(D)

    # Get number of data points
    Ndp = DATA['d_obs'][0].shape[0]
    # if ip_range is empty then use all data points
    if ip_range is None or len(ip_range) == 0:
        ip_range = np.arange(Ndp)
    nump = len(ip_range)

    # Number of data types used - needed for array initialization
    if id_use is None or len(id_use) == 0:
        id_use = np.arange(len(DATA['d_obs']))
    Ndt = len(id_use)

    i_use_all = np.zeros((nump, nr), dtype=np.int32)
    T_all = np.zeros(nump)*np.nan
    EV_all = np.zeros(nump)*np.nan
    EV_post_all = np.zeros(nump)*np.nan
    EV_post_all_mean = np.zeros(nump)*np.nan
    CHI2_all = np.zeros((nump, Ndt))*np.nan
    N_UNIQUE_all = np.zeros(nump)*np.nan

    # Get the lookup sample size
    N = D[0].shape[0]
    if N_use is None or N_use > N:
        N_use = N

    if idx is None or len(idx) == 0:
        idx = np.arange(N_use)
    else:
        # idx is fancy-indexed with an integer array below, which a plain
        # Python list does not support.
        idx = np.asarray(idx)

    noise_model = DATA['noise_model']
    i_use_data = DATA['i_use']

    if showInfo > 1:
        print('Number of data points to invert: %d' % nump)
        print('Number of data type(s) used, Ndt=%d' % Ndt)
        print('Noise model(s):', noise_model)

    # Convert class id to index for multinomial data before the inversion.
    # The class_id_list could/should be loaded from prior_h5:/M1/class_id.
    # class_is_idx = True is MUCH faster than converting inside the likelihood.
    class_is_idx = True

    class_id_list = []
    for i in range(Ndt):
        if noise_model[i] == 'multinomial':
            Di, class_id, class_id_out = ig.class_id_to_idx(D[i])
            if class_is_idx:
                D[i] = Di
                if showInfo > 1:
                    print('Updated prior id %d' % i)
                class_id_list.append(class_id_out)
            else:
                class_id_list.append(class_id)
        else:
            class_id_list.append([])

    if showInfo > 2:
        print('class_id_list',class_id_list)
        print('len(class_id_list)',len(class_id_list))

    # Throttle callback to ~100 updates so it does not dominate runtime
    progress_step = max(1, nump // 100)
    for j in tqdm(range(nump), disable=disableTqdm, desc='Rejection Sampling', leave=False):
        if progress_callback and ((j + 1) % progress_step == 0 or j + 1 == nump):
            try:
                progress_callback(j + 1, nump,
                                  {'phase': 'sampling',
                                   'status': 'Rejection sampling (%d/%d data points)' % (j + 1, nump)})
            except Exception:
                # Progress reporting is best-effort and must never stop sampling.
                pass
        ip = ip_range[j]  # This is the index of the data point to invert

        t = []  # wall-clock timings, only reported when showInfo>2
        # Row i holds the log-likelihood of every prior sample for data type i.
        # Rows of data types that are not used stay zero (uninformative).
        L_per_type = np.zeros((Ndt, N))

        if showInfo > 3:
            print('Ndt=%d, ip=%d/%d, N=%d' % (Ndt, ip, nump, N))

        total_n_data_non_nan = 0          # total count over all data types
        n_data_per_type = np.zeros(Ndt)   # count per data type

        # Loop over the number of data types Ndt
        for i in range(Ndt):
            use_data_point = i_use_data[i][ip]
            if showInfo > 3:
                print("-i=%d, Using data type %d" % (i,Ndt))
                print("len(D)",len(D))

            if use_data_point != 1:
                # Row already zero: this data type does not constrain ip.
                continue

            if showInfo > 3:
                print('j=%4d Using data %d --> %d' % (j,i,use_data_point))

            t0 = time.time()
            if noise_model[i] == 'gaussian':
                d_obs = DATA['d_obs'][i][ip]
                n_data_non_nan = np.sum(~np.isnan(d_obs))
                total_n_data_non_nan += n_data_non_nan
                n_data_per_type[i] = n_data_non_nan

                # Noise description of THIS data type (index i, consistent
                # with d_obs above).
                Cd_i = DATA['Cd'][i]
                d_std_i = DATA['d_std'][i]
                if Cd_i is not None:
                    # A 3D Cd holds one covariance per data point.
                    if len(Cd_i.shape) == 3:
                        Cd = Cd_i[ip]
                    else:
                        Cd = Cd_i[:]
                    L_type = likelihood_gaussian_full(D[i], d_obs, Cd, N_app = use_N_best)
                elif d_std_i is not None:
                    d_std = d_std_i[ip]
                    L_type = likelihood_gaussian_diagonal(D[i], d_obs, d_std, use_N_best)
                else:
                    raise ValueError('No d_std or Cd available for gaussian data type index %d' % i)

                L_per_type[i] = L_type
                t.append(time.time()-t0)

            elif noise_model[i] == 'multinomial':
                d_obs = DATA['d_obs'][i][ip]
                n_data_non_nan = np.sum(~np.isnan(d_obs))
                total_n_data_non_nan += n_data_non_nan
                n_data_per_type[i] = n_data_non_nan

                if showInfo > 3:
                    print(D[i])

                class_id = class_id_list[i]
                L_type = likelihood_multinomial(D[i],d_obs, np.array(class_id), class_is_idx=class_is_idx)
                L_per_type[i] = L_type
                t.append(time.time()-t0)

            # Any other noise model is not recognized; its row stays zero.

        # Combine the likelihoods of all data types into one. With a single
        # data type the (1, N) array is used as is.
        L = L_per_type
        if Ndt > 1:
            L = np.sum(L_per_type, axis=0)

        # Automatic annealing temperature estimation, if autoT=1, else use T=T_base
        # T_base = 1 indicates no annealing
        t0 = time.time()
        if autoT == 1:
            T = ig.logl_T_est(L, N_above=T_N_above, P_acc_lev=T_P_acc_level)
        else:
            T = T_base
        t.append(time.time()-t0)

        # Tempered acceptance probability relative to the best-fitting sample
        P_acc = np.exp((1/T) * (L - np.nanmax(L)))
        P_acc[np.isnan(P_acc)] = 0

        # Draw nr indices with probability proportional to P_acc
        t0 = time.time()
        try:
            if P_acc.shape[0] == 1:
                # Single data type: L was kept as a (1, N) array
                P_acc = P_acc.flatten()
            p = P_acc/np.sum(P_acc)
            i_use = np.random.choice(N, nr, p=p)
        except (ValueError, FloatingPointError):
            # np.random.choice rejects p containing NaN or not summing to 1
            # (e.g. all likelihoods NaN); fall back to uniform sampling.
            print('####################################################################')
            print('####################################################################')
            print('Error in np.random.choice for ip=%d' % ip)
            print('####################################################################')
            print('####################################################################')
            i_use = np.random.choice(N, nr)

        # Reduced chi-squared per data type, computed BEFORE i_use is mapped
        # through idx because L_per_type is indexed by row position in D:
        # CHI2 = mean(-2 * log-likelihood) / n_data
        CHI2_current = np.zeros(Ndt) * np.nan
        for i in range(Ndt):
            if n_data_per_type[i] > 0:
                chi2_samples = -2.0 * L_per_type[i, i_use]
                CHI2_current[i] = np.nanmean(chi2_samples) / n_data_per_type[i]

        if useRandomData:
            # get the correct index of the subset used
            i_use = idx[i_use]

        t.append(time.time()-t0)

        # Compute the evidence: numerically stable log-mean-exp
        max_L = np.nanmax(L)
        EV = max_L + np.log(np.nanmean(np.exp(L - max_L)))

        # TODO: the log 'posterior evidence' (mean posterior log-likelihood) is
        # not implemented; NaN is stored for it and for its per-datum version.
        EV_post = np.nan
        if total_n_data_non_nan > 0:
            EV_post_mean = EV_post / total_n_data_non_nan
        else:
            EV_post_mean = np.nan

        t.append(time.time()-t0)

        i_use_all[j] = i_use
        T_all[j] = T
        EV_all[j] = EV
        EV_post_all[j] = EV_post
        EV_post_all_mean[j] = EV_post_mean
        CHI2_all[j, :] = CHI2_current
        # find the number of unique indexes
        N_UNIQUE_all[j] = len(np.unique(i_use))

        if showInfo > 2:
            for i in range(len(t)):
                if i < Ndt:
                    print(' Time id%d: %f - %s' % (i,t[i],noise_model[i]))
                else:
                    print(' Time id%d, sampling: %f' % (i,t[i]))
            print('Time total: %f' % np.sum(t))

    return i_use_all, T_all, EV_all, EV_post_all, EV_post_all_mean, CHI2_all, N_UNIQUE_all, ip_range
855
+
856
+
857
+
858
def integrate_posterior_main(ip_chunks, D, DATA, idx, N_use, id_use, autoT, T_base, nr, Ncpu, use_N_best, T_N_above=10, T_P_acc_level=0.2, progress_callback=None):
    """
    Coordinate parallel processing of posterior sampling across multiple chunks.

    The forward-modeled data arrays in ``D`` are copied once into shared memory,
    a process pool with ``Ncpu`` workers runs ``integrate_posterior_chunk`` on
    every chunk in ``ip_chunks``, and the per-chunk results are scattered into
    full-size output arrays using the ``ip_range`` returned with each chunk.

    Parameters
    ----------
    ip_chunks : list
        List of data point index chunks for parallel processing.
    D : list
        List of forward modeled data arrays shared across processes.
    DATA : dict
        Dictionary containing observed data structures. ``DATA['d_obs'][0]``
        is used to determine the number of data points.
    idx : list
        Indices of prior samples to use for inversion.
    N_use : int
        Maximum number of prior samples per chunk.
    id_use : list
        List of data identifiers for likelihood calculation.
    autoT : int
        Automatic temperature estimation flag.
    T_base : float
        Base temperature for rejection sampling.
    nr : int
        Number of posterior samples to retain per data point.
    Ncpu : int
        Number of worker processes in the pool.
    use_N_best : int
        Flag to use only the N best-fitting samples.
    T_N_above : int, optional
        Forwarded unchanged to every worker. Default is 10.
    T_P_acc_level : float, optional
        Forwarded unchanged to every worker. Default is 0.2.
    progress_callback : callable, optional
        If given, chunks are consumed with ``imap_unordered`` and the callback
        is invoked after every finished chunk as
        ``progress_callback(done_dp, total_dp, info)``, where ``info`` is a
        dict with the keys ``'phase'`` and ``'status'``. Default is None.

    Returns
    -------
    i_use_all : ndarray, shape (Ndp, nr)
        Indices of accepted posterior samples for all data points. Rows of
        data points not covered by any chunk keep their random initial values.
    T_all : ndarray, shape (Ndp,)
        Temperature values used for all data points (NaN where not computed).
    EV_all : ndarray, shape (Ndp,)
        Evidence values for all data points (NaN where not computed).
    EV_post_all : ndarray, shape (Ndp,)
        Posterior evidence values for all data points (NaN where not computed).
    EV_post_all_mean : ndarray, shape (Ndp,)
        Per-chunk ``EV_post_mean`` values scattered to all data points
        (NaN where not computed).
    CHI2_all : ndarray, shape (Ndp, Ndt)
        Per-chunk ``CHI2`` rows, one column per entry of ``id_use``
        (NaN where not computed).
    N_UNIQUE_all : ndarray, shape (Ndp,)
        Number of unique samples for all data points (NaN where not computed).

    Notes
    -----
    Shared memory is released exactly once, in a ``finally`` block, so it is
    cleaned up even when a worker raises.
    """
    import multiprocessing
    import os
    import sys
    import types as _types

    # Use spawn on Windows and macOS (required for correctness);
    # fork on Linux for performance
    if os.name == 'nt' or (os.name == 'posix' and os.uname().sysname == 'Darwin'):
        ctx = multiprocessing.get_context('spawn')
    else:
        ctx = multiprocessing.get_context('fork')

    shared_memory_refs, shm_objects = create_shared_memory(D)

    # Prevent spawned workers from re-executing the user's __main__ script.
    # See prior_data_gaaem for a full explanation of this mechanism.
    _main_module = sys.modules.get('__main__')
    _spec_patched = _main_module is not None and getattr(_main_module, '__spec__', None) is None
    if _spec_patched:
        _main_module.__spec__ = _types.SimpleNamespace(name='__main__')

    try:
        with ctx.Pool(Ncpu) as p:
            chunk_args = [(i_chunk, ip_chunks, DATA, idx, N_use, id_use, shared_memory_refs, autoT, T_base, nr, use_N_best, T_N_above, T_P_acc_level) for i_chunk in range(len(ip_chunks))]
            if progress_callback is None:
                results = p.map(integrate_posterior_chunk, chunk_args)
            else:
                # imap_unordered yields per finished chunk, enabling live
                # progress. Result order does not matter: each result carries
                # its own ip_range used for aggregation below.
                results = []
                total_dp = sum(len(c) for c in ip_chunks)
                done_dp = 0
                for res in p.imap_unordered(integrate_posterior_chunk, chunk_args):
                    results.append(res)
                    done_dp += len(res[-1])  # last element is the chunk's ip_range
                    progress_callback(done_dp, total_dp,
                                      {'phase': 'sampling',
                                       'status': 'Rejection sampling (%d/%d data points)' % (done_dp, total_dp)})
    finally:
        # Always clean up shared memory (single cleanup point)
        if shm_objects:
            cleanup_shared_memory(shm_objects)
        if _spec_patched:
            _main_module.__spec__ = None

    # Number of prior samples, data points and data types
    N = D[0].shape[0]
    Ndp = DATA['d_obs'][0].shape[0]
    Ndt = len(id_use)

    i_use_all = np.random.randint(0, N, (Ndp, nr))
    T_all = np.zeros(Ndp)*np.nan
    EV_all = np.zeros(Ndp)*np.nan
    EV_post_all = np.zeros(Ndp)*np.nan
    EV_post_all_mean = np.zeros(Ndp)*np.nan
    CHI2_all = np.zeros((Ndp, Ndt))*np.nan
    N_UNIQUE_all = np.zeros(Ndp)*np.nan

    # Scatter every chunk's rows to the global data point positions.
    for i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE, ip_range in results:
        for k, ip in enumerate(ip_range):
            i_use_all[ip] = i_use[k]
            T_all[ip] = T[k]
            EV_all[ip] = EV[k]
            EV_post_all[ip] = EV_post[k]
            EV_post_all_mean[ip] = EV_post_mean[k]
            CHI2_all[ip, :] = CHI2[k, :]
            N_UNIQUE_all[ip] = N_UNIQUE[k]

    return i_use_all, T_all, EV_all, EV_post_all, EV_post_all_mean, CHI2_all, N_UNIQUE_all
1004
+
1005
+
1006
+
1007
def integrate_posterior_chunk(args):
    """
    Run rejection sampling for one chunk of data points inside a pool worker.

    The packed argument tuple is unpacked, the forward-modeled data arrays are
    re-attached from shared memory, and ``integrate_rejection_range`` is called
    for the data point indices ``ip_chunks[i_chunk]``. The worker's shared
    memory handles are closed afterwards, whether or not sampling succeeded.

    Parameters
    ----------
    args : tuple
        Packed arguments, in this order: (i_chunk, ip_chunks, DATA, idx, N_use,
        id_use, shared_memory_refs, autoT, T_base, nr, use_N_best, T_N_above,
        T_P_acc_level). All but ``i_chunk``, ``ip_chunks`` and
        ``shared_memory_refs`` are forwarded to ``integrate_rejection_range``.

    Returns
    -------
    i_use : ndarray
        Indices of accepted posterior samples for the chunk.
    T : ndarray
        Temperature values used for the chunk.
    EV : ndarray
        Evidence values for the chunk.
    EV_post : ndarray
        Posterior evidence values for the chunk.
    EV_post_mean : ndarray
        ``EV_post_mean`` values as returned by ``integrate_rejection_range``.
    CHI2 : ndarray
        ``CHI2`` values as returned by ``integrate_rejection_range``.
    N_UNIQUE : ndarray
        Number of unique samples for the chunk.
    ip_range : sequence
        Data point indices that were processed, as returned by
        ``integrate_rejection_range``.

    Notes
    -----
    Errors raised while closing a shared memory handle are logged at debug
    level and otherwise ignored.
    """
    (i_chunk, ip_chunks, DATA, idx, N_use, id_use, shared_memory_refs,
     autoT, T_base, nr, use_N_best, T_N_above, T_P_acc_level) = args

    # Attach to the shared arrays without copying; the handles must be closed below.
    D, worker_shm_objects = reconstruct_shared_arrays(shared_memory_refs)

    try:
        sampling_options = {
            'N_use': N_use,
            'id_use': id_use,
            'ip_range': ip_chunks[i_chunk],
            'autoT': autoT,
            'T_base': T_base,
            'nr': nr,
            'use_N_best': use_N_best,
            'T_N_above': T_N_above,
            'T_P_acc_level': T_P_acc_level,
        }
        i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE, ip_range = integrate_rejection_range(
            D, DATA, idx, **sampling_options
        )
        return i_use, T, EV, EV_post, EV_post_mean, CHI2, N_UNIQUE, ip_range
    finally:
        # Release this worker's handles on the shared segments
        for shm in worker_shm_objects:
            try:
                shm.close()
            except Exception as e:
                logger.debug(f"Error closing shared memory in worker: {e}")
1090
+
1091
+
1092
def select_subset_for_inversion(dd, N_app):
    """
    Select the rows of `dd` with the smallest sum of squared residuals.

    Parameters
    ----------
    dd : numpy.ndarray
        A 2D array of residuals, one row per candidate sample.
    N_app : int
        The number of row indices to select.

    Returns
    -------
    idx : numpy.ndarray
        Indices of the `N_app` rows with the smallest sum of squares, in no
        particular order. If `N_app` is greater than or equal to the number
        of rows, all row indices are returned.

    Notes
    -----
    NaN entries are ignored when summing (``np.nansum``). Without this, a
    column that is NaN for every row (e.g. a missing observation) would make
    every row norm NaN and the selection meaningless.

    ``np.argpartition`` is used so the selection does not require a full sort.
    """
    norms = np.nansum(dd**2, axis=1)
    n_rows = norms.shape[0]

    # argpartition requires kth < n_rows; asking for everything is trivial.
    if N_app >= n_rows:
        return np.arange(n_rows)

    idx = np.argpartition(norms, N_app)[:N_app]
    return idx
1121
+
1122
+
1123
def likelihood_gaussian_diagonal(D, d_obs, d_std, N_app=0):
    """
    Compute the Gaussian log-likelihood for a diagonal covariance matrix.

    Parameters
    ----------
    D : ndarray, shape (n_samples, n_features)
        Predicted data array containing forward model predictions.
    d_obs : ndarray, shape (n_features,)
        Observed data array containing measured values.
    d_std : ndarray, shape (n_features,)
        Standard deviation array containing measurement uncertainties.
    N_app : int, optional
        Number of samples to evaluate when using the approximation. If it is
        not positive, or not smaller than the number of samples, all samples
        are evaluated. Default is 0.

    Returns
    -------
    ndarray, shape (n_samples,)
        Log-likelihood values for each sample, computed as:
        L[i] = -0.5 * sum((D[i] - d_obs)**2 / d_std**2)
        with NaN terms ignored in the sum.

    Notes
    -----
    When ``0 < N_app < n_samples``, only the ``N_app`` samples with the
    smallest squared residuals (see ``select_subset_for_inversion``) are
    evaluated; all remaining samples are assigned a very low log-likelihood
    (-1e15).
    """
    dd = D - d_obs
    n_samples = D.shape[0]

    if 0 < N_app < n_samples:
        # Approximation: evaluate only the best-fitting candidates.
        L = np.full(n_samples, -1e+15)
        idx = select_subset_for_inversion(dd, N_app)
        # Reuse the residuals already computed instead of recomputing D[idx] - d_obs.
        L[idx] = -0.5 * np.nansum((dd[idx] / d_std)**2, axis=1)
    else:
        # Either no approximation was requested or the subset would be everything.
        L = -0.5 * np.nansum((dd / d_std)**2, axis=1)

    return L
1174
+
1175
+
1176
def likelihood_gaussian_diagonal_old(D, d_obs, d_std, N_app=0):
    """
    Gaussian log-likelihood for a diagonal covariance matrix (original version).

    Retained for reference and backwards compatibility alongside
    ``likelihood_gaussian_diagonal``.

    Parameters
    ----------
    D : ndarray, shape (n_samples, n_features)
        Predicted data array containing forward model predictions.
    d_obs : ndarray, shape (n_features,)
        Observed data array containing measured values.
    d_std : ndarray, shape (n_features,)
        Standard deviation array containing measurement uncertainties.
    N_app : int, optional
        If positive, number of best-fitting samples to evaluate; otherwise
        every sample is evaluated. Default is 0.

    Returns
    -------
    ndarray, shape (n_samples,)
        Log-likelihood values for each sample, computed as:
        L[i] = -0.5 * sum((D[i] - d_obs)**2 / d_std**2)
        with NaN terms ignored in the sum.

    Notes
    -----
    With ``N_app > 0`` the samples picked by ``select_subset_for_inversion``
    are evaluated and every other sample receives -1e15.
    """
    residuals = D - d_obs

    # Exact evaluation: every sample contributes.
    if not N_app > 0:
        return -0.5 * np.nansum((residuals / d_std)**2, axis=1)

    # Approximation: start from a very low likelihood and fill in the subset.
    logL = np.ones(D.shape[0])*-1e+15
    keep = select_subset_for_inversion(residuals, N_app)
    logL[keep] = likelihood_gaussian_diagonal_old(D[keep], d_obs, d_std, 0)
    return logL
1221
+
1222
def likelihood_gaussian_full(D, d_obs, Cd, N_app=0, checkNaN=True, useVectorized=True):
    """
    Calculate the Gaussian log-likelihood with a full covariance matrix.

    Parameters
    ----------
    D : ndarray, shape (n_samples, n_features)
        Model predictions containing forward model results.
    d_obs : ndarray, shape (n_features,)
        Observed data containing measured values.
    Cd : ndarray, shape (n_features, n_features)
        Full covariance matrix of observed data uncertainties.
    N_app : int, optional
        Number of samples to evaluate when using the approximation. If it is
        not positive, or not smaller than the number of samples, all samples
        are evaluated. Default is 0.
    checkNaN : bool, optional
        If True, a feature is dropped when ``d_obs`` is NaN there, when the
        corresponding column sum of ``Cd`` is NaN, or when any sample in ``D``
        is NaN for that feature. Default is True.
    useVectorized : bool, optional
        If True, evaluates all samples at once with ``einsum``; otherwise
        loops over samples. Default is True.

    Returns
    -------
    ndarray, shape (n_samples,)
        Log-likelihood values for each sample, computed as:
        L[i] = -0.5 * (D[i] - d_obs)^T * Cd^(-1) * (D[i] - d_obs)

    Notes
    -----
    When ``0 < N_app < n_samples``, only the ``N_app`` samples with the
    smallest squared residuals (see ``select_subset_for_inversion``) are
    evaluated; all remaining samples are assigned -1e15.

    The looped path wraps each quadratic form in ``np.nansum``, so a NaN
    result becomes 0 there, whereas the vectorized path propagates NaN.

    TODO: Check that this works when D has NaN values and determine why they occur.
    """
    if checkNaN:
        # Keep only features that are usable in d_obs, Cd and every sample of D.
        # Dropping a feature because a single sample is NaN is probably not ideal.
        usable = (~np.isnan(d_obs)
                  & ~np.isnan(np.sum(Cd, axis=0))
                  & ~np.isnan(np.sum(D, axis=0)))
        ind = np.where(usable)[0]
        dd = D[:, ind] - d_obs[ind]
        iCd = np.linalg.inv(Cd[np.ix_(ind, ind)])
    else:
        dd = D - d_obs
        iCd = np.linalg.inv(Cd)

    n_samples = D.shape[0]
    # The approximation is only meaningful for a proper subset of the samples.
    use_subset = 0 < N_app < n_samples
    if use_subset:
        idx = select_subset_for_inversion(dd, N_app)
        dd_eval = dd[idx]
    else:
        dd_eval = dd

    if useVectorized:
        # Row-wise quadratic form dd_i^T iCd dd_i
        L_eval = -.5 * np.einsum('ij,ij->i', dd_eval @ iCd, dd_eval)
    else:
        L_eval = np.zeros(dd_eval.shape[0])
        for i in range(dd_eval.shape[0]):
            L_eval[i] = -.5 * np.nansum(dd_eval[i].T @ iCd @ dd_eval[i])

    if not use_subset:
        return L_eval

    L = np.full(n_samples, -1e+15)
    L[idx] = L_eval
    return L
1301
+
1302
+
1303
+
1304
+
1305
def likelihood_multinomial(D, P_obs, class_id=None, class_is_idx=False, entropyFilter=False, entropyThreshold=0.99):
    """
    Calculate log-likelihood of multinomial distribution for discrete data.

    Parameters
    ----------
    D : ndarray, shape (N, n_features)
        Matrix of discrete data, where each element represents a class ID
        (or a row index into P_obs when ``class_is_idx`` is True).
    P_obs : ndarray, shape (n_classes, n_features)
        Matrix of probabilities, where each column is the probability
        distribution over classes for one feature.
    class_id : ndarray, optional
        Array of class IDs corresponding to rows in P_obs.
        If None, extracted from the unique values in D.
        Default is None.
    class_is_idx : bool, optional
        If True, the values in D are used directly as row indices into P_obs.
        If False, they are mapped to row indices through ``class_id``.
        Default is False.
    entropyFilter : bool, optional
        If True, only features whose entropy is below ``entropyThreshold``
        are used. If no feature qualifies, the first feature is used.
        Default is False.
    entropyThreshold : float, optional
        Threshold for entropy filtering, compared against the value returned
        by ``scipy.stats.entropy`` for each feature column. Default is 0.99.

    Returns
    -------
    ndarray, shape (N,)
        Log-likelihood values for each sample, computed using natural logarithm.
        For each sample i: logL[i] = sum(log(p[i,j])) over all used features j.
        If every feature column of P_obs contains NaN, an array of NaN is returned.

    Raises
    ------
    ValueError
        If ``class_is_idx`` is False and D contains a value (in a used feature)
        that is not present in ``class_id``.

    Notes
    -----
    Feature columns of P_obs containing NaN are removed before anything else.

    The entropy is computed per feature, i.e. over the class axis of P_obs
    (``entropy(P_obs)``, which reduces along axis 0).

    The class-ID-to-row lookup table is offset by the smallest class ID, so
    negative class IDs are mapped correctly.

    Zero probabilities yield ``-inf`` log-likelihood; the accompanying NumPy
    divide-by-zero warning is suppressed.

    Examples
    --------
    >>> D = np.array([[1, 2], [2, 1]])  # Sample data with class IDs
    >>> P_obs = np.array([[0.3, 0.7], [0.7, 0.3]])  # Class probabilities
    >>> logL = likelihood_multinomial(D, P_obs)
    """
    if class_id is None:
        class_id = np.unique(D).astype(int)

    D = np.atleast_2d(D)
    P_obs = np.asarray(P_obs)

    # Filter out columns with NaN values in P_obs before any processing
    valid_features = ~np.any(np.isnan(P_obs), axis=0)

    if not np.any(valid_features):
        # If all features have NaN, return array of NaN
        return np.full(D.shape[0], np.nan)

    D = D[:, valid_features]
    P_obs = P_obs[:, valid_features]

    if entropyFilter:
        from scipy.stats import entropy

        # Entropy of each feature's class distribution (reduces over axis 0 = classes)
        H = entropy(P_obs)
        used = np.where(H < entropyThreshold)[0]
        if len(used) == 0:
            used = np.arange(1)
        D = D[:, used]
        P_obs = P_obs[:, used]

    N, nm = D.shape

    if class_is_idx:
        # D already contains row indices into P_obs
        indices = D.astype(int)
    else:
        class_id = np.asarray(class_id).astype(int).ravel()
        min_class_id = int(np.min(class_id))
        max_class_id = int(np.max(class_id))

        # Lookup table from (class ID - min_class_id) to row index; -1 marks unknown IDs
        lookup = np.full(max_class_id - min_class_id + 1, -1, dtype=int)
        lookup[class_id - min_class_id] = np.arange(len(class_id))

        D_int = D.astype(int)
        in_range = (D_int >= min_class_id) & (D_int <= max_class_id)
        indices = np.full(D_int.shape, -1, dtype=int)
        indices[in_range] = lookup[D_int[in_range] - min_class_id]

        # A -1 index would silently read the last row of P_obs, so fail loudly instead.
        unknown = indices < 0
        if np.any(unknown):
            raise ValueError(
                "D contains class IDs not present in class_id: %s"
                % np.unique(D_int[unknown]).tolist()
            )

    # indices has shape (N, nm); broadcasting against arange(nm) picks, for each
    # sample and feature j, the probability P_obs[indices[i, j], j].
    col_indices = np.arange(nm)
    probs = P_obs[indices, col_indices]

    # log(0) = -inf is the correct value for zero-probability events
    with np.errstate(divide='ignore'):
        logL = np.sum(np.log(probs), axis=1)

    return logL
1425
+
1426
+
1427
def likelihood_multinomial_old(D, P_obs, class_id=None, class_is_idx=False, entropyFilter=False, entropyThreshold=0.99):
    """
    Calculate log-likelihood of multinomial distribution for discrete data (loop-based version).

    This is the original loop-based implementation kept for reference and
    backwards compatibility alongside ``likelihood_multinomial``.

    Parameters
    ----------
    D : ndarray, shape (N, n_features)
        Matrix of discrete data, where each element represents a class ID
        (or a row index into P_obs when ``class_is_idx`` is True).
    P_obs : ndarray, shape (n_classes, n_features)
        Matrix of probabilities, where each column is the probability
        distribution over classes for one feature.
    class_id : ndarray, optional
        Array of class IDs corresponding to rows in P_obs.
        If None, extracted from the unique values in D.
        Default is None.
    class_is_idx : bool, optional
        If True, the values in D are used directly as row indices into P_obs.
        If False, they are mapped to row indices through ``class_id``.
        Default is False.
    entropyFilter : bool, optional
        If True, only features whose entropy is below ``entropyThreshold``
        are used. If no feature qualifies, the first feature is used.
        Default is False.
    entropyThreshold : float, optional
        Threshold for entropy filtering, compared against the value returned
        by ``scipy.stats.entropy`` for each feature column. Default is 0.99.

    Returns
    -------
    ndarray, shape (N,)
        Log-likelihood values for each sample, computed using natural logarithm.
        For each sample i: logL[i] = sum(log(p[i,j])) over all used features j.
        If every feature column of P_obs contains NaN, an array of NaN is returned.

    Raises
    ------
    KeyError
        If ``class_is_idx`` is False and D contains a value (in a used feature)
        that is not present in ``class_id``.

    Notes
    -----
    The entropy is computed per feature, i.e. over the class axis of P_obs
    (``entropy(P_obs)``, which reduces along axis 0).

    Zero probabilities yield ``-inf`` log-likelihood; the accompanying NumPy
    divide-by-zero warning is suppressed.

    Examples
    --------
    >>> D = np.array([[1, 2], [2, 1]])  # Sample data with class IDs
    >>> P_obs = np.array([[0.3, 0.7], [0.7, 0.3]])  # Class probabilities
    >>> logL = likelihood_multinomial_old(D, P_obs)
    """
    if class_id is None:
        class_id = np.unique(D).astype(int)

    D = np.atleast_2d(D)

    # Filter out feature columns with NaN values in P_obs before any processing
    valid_features = ~np.any(np.isnan(P_obs), axis=0)

    if not np.any(valid_features):
        # If all features have NaN, return array of NaN
        return np.full(D.shape[0], np.nan)

    D = D[:, valid_features]
    P_obs = P_obs[:, valid_features]

    if entropyFilter:
        from scipy.stats import entropy

        # Entropy of each feature's class distribution (reduces over axis 0 = classes)
        H = entropy(P_obs)
        used = np.where(H < entropyThreshold)[0]
        if len(used) == 0:
            used = np.arange(1)
        D = D[:, used]
        P_obs = P_obs[:, used]

    N, nm = D.shape
    logL = np.zeros(N)
    class_id = class_id.astype(int)

    # Mapping from class ID to row index in P_obs
    class_to_idx = {cid: row for row, cid in enumerate(class_id)}
    feature_cols = np.arange(nm)

    for i, sample in enumerate(D):
        if class_is_idx:
            i_test = sample.astype(int)
        else:
            i_test = np.array([class_to_idx[cls] for cls in sample])

        # One probability per feature: P_obs[i_test[j], j]
        p = P_obs[i_test, feature_cols]
        # log(0) = -inf is the correct value for zero-probability events
        with np.errstate(divide='ignore'):
            logL[i] = np.sum(np.log(p))

    return logL
1521
+
1522
+
1523
def create_shared_memory(arrays):
    """
    Create shared memory segments for arrays.

    Each input array is copied into a newly created shared memory segment so
    that other processes can attach to it by name instead of receiving a copy.

    Parameters
    ----------
    arrays : list
        List of numpy arrays (or array-likes) to place in shared memory.

    Returns
    -------
    shared_memories : list
        List of ``(name, shape, dtype)`` tuples identifying shared memory segments.
    shm_objects : list
        List of SharedMemory objects for cleanup.

    Raises
    ------
    Exception
        Any exception raised while creating or filling a segment is logged
        and re-raised after the segments created so far have been released.

    Notes
    -----
    The returned shm_objects must be cleaned up using cleanup_shared_memory()
    to prevent memory leaks. This should be done in a finally block.

    Empty arrays are backed by a 1-byte segment because a shared memory
    segment cannot have size zero.
    """
    shared_memories = []
    shm_objects = []

    try:
        for array in arrays:
            array = np.asarray(array)
            # A zero-sized segment cannot be created, so reserve at least one byte.
            shm = shared_memory.SharedMemory(create=True, size=max(array.nbytes, 1))
            # Track the segment right away so a failure below cannot leak it.
            shm_objects.append(shm)
            shared_array = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf)
            # Ellipsis assignment also works for 0-d arrays, unlike [:]
            shared_array[...] = array
            shared_memories.append((shm.name, array.shape, array.dtype))
            logger.debug(f"Created shared memory: {shm.name}")
        return shared_memories, shm_objects
    except Exception as e:
        logger.error(f"Error creating shared memory: {e}")
        # Drop the local view before releasing the buffers it may point into.
        shared_array = None
        # Release every created segment; one failure must not stop the rest.
        for shm in shm_objects:
            for release in (shm.close, shm.unlink):
                try:
                    release()
                except Exception as cleanup_error:
                    logger.debug(f"Error releasing shared memory: {cleanup_error}")
        raise
1570
+
1571
def reconstruct_shared_arrays(shared_memory_refs):
    """
    Reconstruct arrays from shared memory references.

    For every ``(name, shape, dtype)`` reference, the named shared memory
    segment is opened and wrapped in a numpy array of the given shape and
    dtype, without copying the data.

    Parameters
    ----------
    shared_memory_refs : list
        List of (name, shape, dtype) tuples identifying shared memory segments.

    Returns
    -------
    reconstructed_arrays : list
        List of numpy arrays backed by the shared memory segments.
    shm_objects : list
        List of shared memory objects that must be closed after use.

    Raises
    ------
    Exception
        Any exception raised while opening a segment or building an array is
        logged and re-raised after all segments opened so far, including the
        one being processed, have been closed.

    Warnings
    --------
    The reconstructed arrays are views into shared memory. Modifications
    will affect the shared data across all processes. Do NOT modify these arrays.
    The shared memory objects must be closed after use to prevent leaks.
    """
    reconstructed_arrays = []
    shm_objects = []
    try:
        for shm_name, shape, dtype in shared_memory_refs:
            shm = shared_memory.SharedMemory(name=shm_name)
            # Track the handle before building the array so that a failing
            # np.ndarray call (e.g. buffer too small) still gets it closed.
            shm_objects.append(shm)
            reconstructed_arrays.append(np.ndarray(shape, dtype=dtype, buffer=shm.buf))
    except Exception as e:
        logger.error(f"Error reconstructing array: {e}")
        # Drop the views before closing the buffers they point into.
        reconstructed_arrays.clear()
        for opened_shm in shm_objects:
            try:
                opened_shm.close()
            except Exception as close_error:
                logger.debug(f"Error closing shared memory: {close_error}")
        raise
    return reconstructed_arrays, shm_objects
1617
+
1618
def cleanup_shared_memory(shm_objects):
    """
    Clean up shared memory segments.

    Every shared memory object is closed (releasing the local mapping) and
    unlinked (removing the segment from the system). The two steps are
    attempted independently, so a failing ``close()`` does not prevent the
    segment from being unlinked.

    Parameters
    ----------
    shm_objects : list
        List of shared memory objects to clean up. May be empty or None.

    Returns
    -------
    None

    Notes
    -----
    This function should always be called in a finally block or similar
    error-safe context to ensure cleanup occurs even if exceptions are raised.

    The function never raises: a segment that has already been unlinked is
    skipped silently, and any other error is logged at debug level. Calling
    it repeatedly on the same objects is therefore safe.

    Examples
    --------
    >>> shared_memories, shm_objects = create_shared_memory(arrays)
    >>> try:
    ...     # Use shared memory
    ...     pass
    ... finally:
    ...     cleanup_shared_memory(shm_objects)
    """
    if not shm_objects:
        return

    for shm in shm_objects:
        try:
            shm.close()
        except Exception as e:
            # e.g. a view of the buffer is still alive; still try to unlink below
            logger.debug(f"Error closing shared memory: {e}")

        try:
            shm.unlink()
        except FileNotFoundError:
            # Already unlinked by an earlier cleanup call
            continue
        except Exception as e:
            logger.debug(f"Error unlinking shared memory: {e}")
            continue

        logger.debug(f"Cleaned up shared memory: {shm.name}")
1666
+
1667
+
1668
def compute_hypothesis_probability(f_post_h5_list, **kwargs):
    """
    Compute posterior hypothesis probabilities from per-file log-evidence.

    Each posterior HDF5 file represents one hypothesis (prior model). The
    '/EV' dataset of every file is read, the values are stacked so that each
    hypothesis becomes one column, and the columns are normalised per data
    point into probabilities. All hypotheses are given equal prior
    probability, so the result is a softmax of the log-evidence values.

    Parameters
    ----------
    f_post_h5_list : iterable of str or path-like
        Paths to the posterior HDF5 files, one per hypothesis. At least two
        are required. Any iterable is accepted (it is materialised into a
        list). A single bare path is treated as a one-element list, which
        then fails the "at least two" check with a clear message instead of
        being iterated character by character.
        Each file must contain an '/EV' dataset; its values are interpreted
        as natural-log evidence (they are passed through ``np.exp``).
    showInfo : int, optional
        Verbosity, passed as a keyword argument. 0 (default) prints nothing,
        1 prints a summary, values above 1 add per-file and per-hypothesis
        details.

    Returns
    -------
    P : ndarray, shape (n_data_points, n_hypotheses)
        P[i, j] is the probability of hypothesis j at data point i.
        Rows with finite evidence sum to 1.0 within numerical precision.
    mode : ndarray, shape (n_data_points,)
        0-based index of the most probable hypothesis per data point
        (``np.argmax`` over the columns of P).
    entropy_values : ndarray
        Result of ``integrate.entropy(P, base=n_hypotheses)``.
        NOTE(review): with base equal to the number of hypotheses the values
        are presumably normalised to [0, 1] (0 = one hypothesis is certain,
        1 = all equally likely) -- confirm against ``integrate.entropy``.

    Raises
    ------
    ValueError
        If fewer than two files are given, or if the '/EV' datasets do not
        all have the same length.
    KeyError
        If a file has no '/EV' dataset.
    FileNotFoundError
        If a posterior file does not exist (chained to the original error).

    Notes
    -----
    With equal prior probabilities, Bayes' theorem for model comparison
    reduces to

        P(hypothesis_i | data) = exp(EV_i) / sum_j(exp(EV_j))

    which is evaluated in a numerically stable way as

        P(hypothesis_i | data) = exp(EV_i - log_sum_exp(all EVs))

    where log_sum_exp is ``np.logaddexp.reduce`` over the hypothesis axis.

    If a row contains NaN evidence, its log-sum and therefore all of its
    probabilities become NaN. ``np.argmax`` then reports the position of the
    first NaN, so ``mode`` is not meaningful for such rows. The same applies
    to rows where every evidence value is -inf.

    Examples
    --------
    >>> import integrate as ig
    >>> # Three posterior files created from different prior models
    >>> f_post_list = ['post_valley.h5', 'post_standard.h5', 'post_merged.h5']
    >>> P, mode, entropy = ig.compute_hypothesis_probability(f_post_list)
    >>> print(P.shape)       # (n_data_points, 3)
    >>> print(np.sum(P[0]))  # 1.0 for a data point with finite evidence
    >>> print(mode[0])       # index (0-based) of the most probable hypothesis

    >>> # Two hypotheses, with progress output
    >>> f_post_list = ['post_valley.h5', 'post_standard.h5']
    >>> P, mode, entropy = ig.compute_hypothesis_probability(f_post_list, showInfo=1)
    >>> P_valley = P[:, 0]
    >>> P_standard = P[:, 1]

    See Also
    --------
    integrate_rejection : Main rejection sampling function that creates posterior files
    integrate_rejection_range : Core rejection sampling that computes EV values
    """
    showInfo = kwargs.get('showInfo', 0)

    # A bare path is itself iterable (one character at a time); wrap it so it
    # is counted as a single file. Any other iterable is materialised so that
    # len() works and the input can be traversed safely.
    if isinstance(f_post_h5_list, (str, bytes, os.PathLike)):
        f_post_h5_list = [f_post_h5_list]
    else:
        f_post_h5_list = list(f_post_h5_list)

    n_hypotheses = len(f_post_h5_list)

    if n_hypotheses < 2:
        raise ValueError("At least two posterior files are required for hypothesis comparison. "
                         f"Received {n_hypotheses} file(s).")

    # Imported only after the arguments have been validated, and kept local
    # to the function. NOTE(review): presumably local to avoid a circular
    # import with the `integrate` package -- confirm before hoisting.
    import integrate as ig

    # Read EV from all files
    EV_list = []
    n_data_points = None

    for i, f_post_h5 in enumerate(f_post_h5_list):
        # Only the file access sits inside the try block; the consistency
        # check and the optional printing cannot raise FileNotFoundError.
        try:
            with h5py.File(f_post_h5, 'r') as f:
                if '/EV' not in f:
                    raise KeyError(f"'/EV' dataset not found in {f_post_h5}")
                EV = f['/EV'][:]
        except FileNotFoundError as err:
            raise FileNotFoundError(f"Posterior file not found: {f_post_h5}") from err

        # The first file fixes the expected number of data points.
        if n_data_points is None:
            n_data_points = len(EV)
        elif len(EV) != n_data_points:
            raise ValueError(f"Inconsistent number of data points: file '{f_post_h5}' "
                             f"has {len(EV)} data points, expected {n_data_points}")

        EV_list.append(EV)

        if showInfo > 0:
            print(f"Hypothesis {i+1}: Loaded EV from {os.path.basename(f_post_h5)}")
            if showInfo > 1:
                print(f" - Data points: {len(EV)}")
                print(f" - EV range: [{np.nanmin(EV):.2f}, {np.nanmax(EV):.2f}]")

    # Stack EV values: shape (n_data_points, n_hypotheses)
    EV_all = np.stack(EV_list, axis=1)

    if showInfo > 1:
        print(f"\nCombined EV array shape: {EV_all.shape}")
        print(f"Overall EV range: [{np.nanmin(EV_all):.2f}, {np.nanmax(EV_all):.2f}]")

    # Log-sum-exp across hypotheses for each data point; keepdims=True keeps
    # a trailing axis of length 1 so the subtraction below broadcasts per row.
    log_sum = np.logaddexp.reduce(EV_all, axis=1, keepdims=True)

    # P(hypothesis_i | data) = exp(EV_i - log_sum_exp(all EVs))
    P = np.exp(EV_all - log_sum)

    # Most probable hypothesis index for each data point
    mode = np.argmax(P, axis=1)

    # Entropy per data point, using the entropy function from integrate
    entropy_values = ig.entropy(P, base=n_hypotheses)

    if showInfo > 0:
        print("\nComputed hypothesis probabilities:")
        print(f" - Output shape: {P.shape} (n_data_points × n_hypotheses)")
        print(f" - Probability range: [{np.nanmin(P):.4f}, {np.nanmax(P):.4f}]")
        row_sums = np.sum(P, axis=1)
        print(f" - Row sums (should be 1.0): mean={np.nanmean(row_sums):.6f}, "
              f"std={np.nanstd(row_sums):.2e}")

        # Mode and entropy statistics
        print("\nMode and entropy statistics:")
        mode_counts = np.bincount(mode, minlength=n_hypotheses)
        print(" - Mode distribution:")
        for i, count in enumerate(mode_counts):
            percentage = (count / n_data_points) * 100
            print(f" Hypothesis {i+1}: {count} data points ({percentage:.1f}%)")
        print(f" - Entropy range: [{np.nanmin(entropy_values):.4f}, {np.nanmax(entropy_values):.4f}]")
        print(f" - Mean entropy: {np.nanmean(entropy_values):.4f}, Std entropy: {np.nanstd(entropy_values):.4f}")

        if showInfo > 1:
            # Summary statistics for each hypothesis
            for i in range(n_hypotheses):
                print(f" - Hypothesis {i+1}: mean P = {np.nanmean(P[:, i]):.4f}, "
                      f"median P = {np.nanmedian(P[:, i]):.4f}")

    return P, mode, entropy_values