integrate_module 0.99.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
integrate/integrate.py ADDED
@@ -0,0 +1,4063 @@
1
+ """
2
+ INTEGRATE Core Module - Probabilistic Geophysical Data Integration
3
+
4
+ This module implements rejection sampling algorithms for Bayesian inversion and
5
+ probabilistic data integration in geophysics, with particular focus on electromagnetic
6
+ (EM) data analysis. The module provides comprehensive tools for prior model generation,
7
+ forward modeling, likelihood computation, and posterior sampling.
8
+
9
+ Key Features:
10
+ - Rejection sampling for Bayesian inversion
11
+ - Parallel processing with shared memory optimization
12
+ - Temperature annealing for improved sampling efficiency
13
+ - Support for multiple data types (TDEM, multinomial, etc.)
14
+ - Integration with GA-AEM electromagnetic forward modeling
15
+ - Automatic temperature estimation and adaptive sampling
16
+
17
+ Main Functions:
18
+ - integrate_rejection(): Main rejection sampling workflow (now in integrate_rejection module)
19
+ - prior_data(): Integration of forward modeling with prior structure
20
+ - forward_gaaem(): Electromagnetic forward modeling interface
21
+ - likelihood_*(): Various likelihood calculation functions (now in integrate_rejection module)
22
+ - posterior_*(): Posterior analysis and statistics
23
+
24
+ Author: Thomas Mejer Hansen
25
+ Email: tmeha@geo.au.dk
26
+ """
27
+
28
+ import h5py
29
+ import numpy as np
30
+ import os.path
31
+ import subprocess
32
+ from sys import exit
33
+ import sys
34
+ import types
35
+ import multiprocessing
36
+ from multiprocessing import Pool
37
+ from multiprocessing import shared_memory
38
+ from multiprocessing import get_context
39
+ from functools import partial
40
+ import time
41
+
42
# %% Set up logging. Used to test creation and use of shared memory.
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)  # For production
# logger.setLevel(logging.DEBUG)  # For debugging
if len(logger.handlers) == 0:
    # Add the stream handler only when this logger has no handler yet.
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
53
+
54
def is_notebook():
    """
    Report whether the code runs inside a Jupyter notebook kernel.

    Returns
    -------
    bool
        True when ``get_ipython()`` exists, its shell class is
        'ZMQInteractiveShell' and no module named 'vscode' is loaded.
        False in every other case, including plain Python where
        ``get_ipython`` is not defined.
    """
    try:
        # get_ipython is injected by IPython; it is undefined in plain Python.
        shell_name = get_ipython().__class__.__name__
    except NameError:
        return False

    if shell_name != 'ZMQInteractiveShell':
        return False

    # Additional check for VS Code
    import sys
    return 'vscode' not in sys.modules
78
+
79
+
80
def use_parallel(**kwargs):
    """
    Report whether parallel processing can be used in this environment.

    The function only detects the environment (notebook, macOS, other POSIX,
    or anything else, which is reported as Windows) in order to choose an
    informational message; the returned value is always True.

    Parameters
    ----------
    showInfo : int, optional
        If greater than 0, print which environment was detected.
        Default is 0.

    Returns
    -------
    bool
        Always True.
    """
    import os
    showInfo = kwargs.get('showInfo', 0)

    # Pick the message first, print it (optionally) in one place.
    if is_notebook():
        message = 'Notebook detected. Parallel processing is OK.'
    elif os.name != 'posix':
        message = 'Windows detected. Parallel processing is OK (using spawn).'
    elif os.uname().sysname == 'Darwin':
        message = 'macOS detected. Parallel processing is OK (using spawn).'
    else:
        message = 'Linux/POSIX detected. Parallel processing is OK (using fork).'

    if showInfo > 0:
        print(message)

    return True
119
+
120
+
121
+
122
+
123
def logl_T_est(logL, N_above=10, P_acc_lev=0.2):
    """
    Estimate an annealing temperature from a set of log-likelihood values.

    The log-likelihoods are shifted so that the maximum is 0, NaN values are
    discarded and the remaining values are sorted in ascending order. The
    (N_above+1)-th largest value, ``logL_lev``, defines the temperature
    ``T_est = logL_lev / log(P_acc_lev)``, which is never allowed to drop
    below 1.

    Parameters
    ----------
    logL : array-like
        Log-likelihood values. NaN entries are ignored.
    N_above : int, optional
        Number of values that should lie above the reference level in the
        sorted array. Default is 10.
    P_acc_lev : float, optional
        Acceptance level used in the temperature calculation. Default is 0.2.

    Returns
    -------
    float
        The estimated temperature: a value >= 1, or ``np.inf`` when ``logL``
        holds no usable (non-NaN) value.

    Notes
    -----
    When fewer than ``N_above + 1`` usable values are available, the smallest
    usable value is used as the reference level instead of raising an
    IndexError.
    """
    logL = np.asarray(logL, dtype=float)
    valid = logL[~np.isnan(logL)]
    if valid.size == 0:
        # Nothing to estimate from (empty or all-NaN input).
        return np.inf

    sorted_logL = np.sort(valid - valid.max())
    # The shift itself can create NaN (e.g. -inf - -inf); drop those as well.
    sorted_logL = sorted_logL[~np.isnan(sorted_logL)]
    if sorted_logL.size == 0:
        return np.inf

    if N_above < sorted_logL.size:
        logL_lev = sorted_logL[-N_above - 1]
    else:
        # Too few values: fall back to the lowest available level.
        logL_lev = sorted_logL[0]

    T_est = logL_lev / np.log(P_acc_lev)
    return np.nanmax([1, T_est])
162
+
163
+
164
def lu_post_sample_logl(logL, ns=1, T=1):
    """
    Draw sample indices with probability proportional to the tempered likelihood.

    The acceptance probability of each entry is
    ``exp((logL - max(logL)) / T)`` (NaN entries get probability 0). Indices
    are then drawn by inverting the normalised cumulative sum of these
    probabilities with uniform random numbers from ``np.random``.

    Parameters
    ----------
    logL : array-like
        Array of log-likelihood values.
    ns : int, optional
        Number of samples to generate. Defaults to 1.
    T : float, optional
        Temperature parameter. Defaults to 1.

    Returns
    -------
    i_use_all : numpy.ndarray
        Integer array of length ``ns`` with the indices of the selected samples.
    P_acc : numpy.ndarray
        Array of acceptance probabilities, one per entry of ``logL``.

    Raises
    ------
    ValueError
        If ``logL`` is empty or contains no usable (non-NaN) value.
    """
    logL = np.asarray(logL, dtype=float)
    if logL.size == 0:
        raise ValueError('lu_post_sample_logl: logL is empty')

    P_acc = np.exp((1 / T) * (logL - np.nanmax(logL)))
    P_acc[np.isnan(P_acc)] = 0

    Cum_P = np.cumsum(P_acc)
    total = np.nanmax(Cum_P)
    if not np.isfinite(total) or total <= 0:
        raise ValueError('lu_post_sample_logl: no valid log-likelihood values to sample from')
    Cum_P = Cum_P / total

    # Cum_P is non-decreasing, so the first index with Cum_P > r is found by a
    # binary search; all ns draws are handled in one vectorised call.
    r = np.random.rand(ns)
    i_use_all = np.searchsorted(Cum_P, r, side='right').astype(int)

    return i_use_all, P_acc
204
+
205
def integrate_update_prior_attributes(f_prior_h5, **kwargs):
    """
    Add missing descriptive attributes to the model datasets of a prior HDF5 file.

    The function iterates over all top-level entries of the file. For every
    entry whose name starts with 'M' (case-insensitive) it makes sure the
    following attributes exist:

    - ``x``: copied from the ``z`` attribute if present, otherwise
      ``np.arange(dataset.shape[1])``.
    - ``is_discrete``: if missing, the first 1000 rows are inspected; fewer
      than 20 unique values means discrete (1), otherwise continuous (0).
    - ``class_id`` and ``class_name`` (discrete datasets only): the unique
      values of the first 1000 rows, and their string representation.

    Existing ``x``, ``is_discrete`` and ``class_name`` attributes are never
    overwritten.

    Parameters
    ----------
    f_prior_h5 : str
        The path to the HDF5 file to process. The file is opened in append
        mode and modified in place.
    showInfo : int, optional
        Level of verbosity for output (default is 0).

    Raises
    ------
    FileNotFoundError
        If ``f_prior_h5`` does not exist. (Opening a missing path in append
        mode would otherwise silently create an empty file.)
    """
    showInfo = kwargs.get('showInfo', 0)

    # Check that the HDF5 file exists
    if not os.path.isfile(f_prior_h5):
        raise FileNotFoundError(
            'integrate_update_prior_attributes: File %s does not exist' % f_prior_h5
        )

    def _class_names(class_id):
        # h5py cannot store NumPy 'U' (unicode) arrays; use its
        # variable-length string dtype instead.
        return np.array([str(c) for c in class_id], dtype=h5py.string_dtype())

    with h5py.File(f_prior_h5, 'a') as f:  # open file in append mode
        for name, dataset in f.items():
            if showInfo > 0:
                print("integrate_update_prior_attributes: Checking %s" % (name))
            if not name.upper().startswith('M'):
                continue

            attrs = dataset.attrs

            # Make sure an 'x' attribute exists.
            if 'x' not in attrs:
                if 'z' in attrs:
                    attrs['x'] = attrs['z']
                else:
                    attrs['x'] = np.arange(dataset.shape[1])
                if showInfo > 0:
                    print('integrate_update_prior_attributes: Setting %s/x' % name)

            # Make sure 'is_discrete' exists.
            if 'is_discrete' in attrs:
                if showInfo > 0:
                    print('%s: %s.is_discrete=%d' % (f_prior_h5, name, attrs['is_discrete']))
            else:
                # Guess from the first 1000 rows: few unique values -> discrete.
                class_id = np.unique(dataset[:1000])
                if len(class_id) < 20:
                    is_discrete = 1
                    attrs['class_id'] = class_id
                else:
                    is_discrete = 0

                if showInfo > 0:
                    print(f'Setting is_discrete={is_discrete}, for {name}')
                attrs['is_discrete'] = is_discrete

            # Discrete datasets additionally need 'class_id' and 'class_name'.
            if attrs['is_discrete'] == 1:
                if 'class_id' not in attrs:
                    attrs['class_id'] = np.unique(dataset[:1000])
                if 'class_name' not in attrs:
                    attrs['class_name'] = _class_names(attrs['class_id'])
287
+
288
+
289
+
290
def integrate_posterior_stats(f_post_h5='POST.h5', ip_range=None, **kwargs):
    """
    Compute posterior statistics for all model parameters in a POST HDF5 file.

    Reads posterior sample indices (i_use) and the corresponding prior model
    realizations, then computes per-location statistics and writes them back
    into the same POST.h5 file.

    Parameters
    ----------
    f_post_h5 : str, optional
        Path to the POST HDF5 file. Default is 'POST.h5'.
    ip_range : array-like or None, optional
        Indices of data locations to process. If None (or empty), all
        locations are processed. Locations not in ip_range receive NaN values.
        Default is None.
    **kwargs : dict
        showInfo : int, optional
            Verbosity level. Negative values disable the progress bars;
            values above 0 print additional progress messages. Default is 0.
        usePrior : bool, optional
            If True, use randomly drawn prior samples instead of i_use indices
            (useful for computing prior statistics as a baseline). Default is False.
        updateGeometryFromData : bool, optional
            Copy UTMX, UTMY, LINE, ELEVATION from the DATA.h5 file into POST.h5
            if not already present. Default is True.
        computeKL : bool, optional
            Shorthand to enable KL divergence computation for all parameter types.
            If True, both ``computeKL_continuous`` and ``computeKL_discrete`` are
            set to True. Default is False.
        computeKL_continuous : bool, optional
            Compute KL divergence D_KL(posterior || prior) for continuous model
            parameters using log10-space histograms (50 bins). Result is in bits
            (log base 2). Default is False.
        computeKL_discrete : bool, optional
            Compute KL divergence D_KL(posterior || prior) for discrete model
            parameters, using log base = number of classes. Default is False.

    Writes to POST.h5
    -----------------
    Always written:

    - ``/N_UNIQUE`` [Np] Number of unique prior realizations used per location.
    - ``/UTMX``, ``/UTMY``, ``/LINE``, ``/ELEVATION`` [Np]
      Geometry copied from DATA.h5 (if ``updateGeometryFromData=True``).

    For each **continuous** model parameter ``/Mx``:

    - ``/Mx/Mean`` [Np, Nm] Arithmetic mean of posterior realizations.
    - ``/Mx/LogMean`` [Np, Nm] Geometric mean (exp of mean of log values).
    - ``/Mx/Median`` [Np, Nm] Median of posterior realizations.
    - ``/Mx/Std`` [Np, Nm] Standard deviation of posterior realizations.
    - ``/Mx/LogStd`` [Np, Nm] Standard deviation of log10(posterior realizations).
    - ``/Mx/HarmonicMean`` [Np, Nm] Trimmed harmonic mean: reciprocal values
      are trimmed 10% each tail, averaged, then inverted back.
    - ``/Mx/KL`` [Np, Nm] KL divergence in bits. Only written when
      ``computeKL_continuous=True``.

    For each **discrete** model parameter ``/Mx``:

    - ``/Mx/Mode`` [Np, Nm] Most probable class at each location/depth.
    - ``/Mx/Entropy`` [Np, Nm] Shannon entropy normalised by log(n_classes).
    - ``/Mx/P`` [Np, Nclass, Nm] Posterior probability of each class.
    - ``/Mx/KL`` [Np, Nm] KL divergence. Only written when
      ``computeKL_discrete=True``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the 'f5_prior' attribute is missing from ``f_post_h5``, if the
        'f5_data' attribute is missing while ``showInfo >= 1``, or if
        ``ip_range`` holds indices outside the valid range.
    KeyError
        If ``f_post_h5`` holds no 'i_use' dataset.
    """
    import h5py
    import numpy as np
    import scipy.stats  # import the submodule explicitly; `import scipy` alone need not expose it
    from tqdm import tqdm

    showInfo = kwargs.get('showInfo', 0)
    disableTqdm = showInfo < 0
    usePrior = kwargs.get('usePrior', False)
    updateGeometryFromData = kwargs.get('updateGeometryFromData', True)
    computeKL = kwargs.get('computeKL', False)
    computeKL_continuous = kwargs.get('computeKL_continuous', False) or computeKL
    computeKL_discrete = kwargs.get('computeKL_discrete', False) or computeKL

    def _ensure_dataset(h5file, dset, shape, message=None):
        # Create `dset` with the given shape unless it already exists.
        if dset not in h5file:
            if message is not None and showInfo > 0:
                print(message)
            h5file.create_dataset(dset, shape)

    # Read the file references and the posterior indices stored in POST.h5
    with h5py.File(f_post_h5, 'r') as f:
        f_prior_h5 = f.attrs['f5_prior'] if 'f5_prior' in f.attrs else None
        f_data_h5 = f.attrs['f5_data'] if 'f5_data' in f.attrs else None
        i_use = f['i_use'][:] if 'i_use' in f else None

    # The prior file is always needed below, so its absence is always an error.
    if f_prior_h5 is None:
        raise ValueError(f"'f5_prior' attribute does not exist in {f_post_h5}")

    # NOTE(review): the data file is only used for the optional geometry copy,
    # yet its absence raises when showInfo >= 1 (behavior kept as-is) --
    # confirm whether a warning would be more appropriate.
    if f_data_h5 is None and showInfo >= 1:
        raise ValueError(f"'f5_data' attribute does not exist in {f_post_h5}")

    # update Geometry from f_data_h5
    if updateGeometryFromData and f_data_h5 is not None:
        with h5py.File(f_data_h5, 'r') as f_data, h5py.File(f_post_h5, 'a') as f_post:
            for key in ('/UTMX', '/UTMY', '/LINE', '/ELEVATION'):
                if key in f_data and key not in f_post:
                    f_data.copy(key, f_post)

    if i_use is None:
        raise KeyError(f"Could not read 'i_use' from {f_post_h5}")

    if usePrior:
        with h5py.File(f_prior_h5, 'r') as f_prior:
            N = f_prior['/M1'].shape[0]
        nd, nr = i_use.shape[0], i_use.shape[1]
        # Replace i_use by (nd, nr) random integers between 0 and N-1
        i_use = np.random.randint(0, N, (nd, nr))

    # Handle ip_range parameter
    nsounding = i_use.shape[0]
    if ip_range is None or len(ip_range) == 0:
        ip_range = np.arange(nsounding)
        if showInfo > 0:
            print(f'Computing posterior statistics for all {nsounding} data points')
    else:
        ip_range = np.asarray(ip_range)
        if showInfo > 0:
            print(f'Computing posterior statistics for {len(ip_range)} of {nsounding} data points')
        # Validate ip_range
        if np.any(ip_range < 0) or np.any(ip_range >= nsounding):
            raise ValueError(f"ip_range contains indices outside valid range [0, {nsounding-1}]")

    # Compute number of unique realizations for each data location
    if showInfo > 0:
        print('Computing number of unique realizations (N_UNIQUE)')

    N_UNIQUE = np.full(nsounding, np.nan)
    for iid in tqdm(ip_range, mininterval=1, disable=disableTqdm, desc='N_UNIQUE', leave=False):
        N_UNIQUE[iid] = len(np.unique(i_use[iid, :]))

    # Save N_UNIQUE to the HDF5 file
    with h5py.File(f_post_h5, 'a') as f_post:
        if '/N_UNIQUE' not in f_post:
            if showInfo > 0:
                print('Creating /N_UNIQUE in %s' % f_post_h5)
            f_post.create_dataset('/N_UNIQUE', data=N_UNIQUE)
        else:
            if showInfo > 0:
                print('Updating /N_UNIQUE in %s' % f_post_h5)
            f_post['/N_UNIQUE'][:] = N_UNIQUE

    # Process each dataset in f_prior_h5
    with h5py.File(f_prior_h5, 'r') as f_prior, h5py.File(f_post_h5, 'a') as f_post:
        for name, dataset in f_prior.items():

            is_model = name.upper().startswith('M') and 'is_discrete' in dataset.attrs

            if is_model and dataset.attrs['is_discrete'] == 0:
                if showInfo > 2:
                    print('%s: CONTINUOUS' % name)

                nm = dataset.shape[1]
                nsounding, nr = i_use.shape

                # Initialize with NaN for all data points
                M_logmean = np.full((nsounding, nm), np.nan)
                M_mean = np.full((nsounding, nm), np.nan)
                M_std = np.full((nsounding, nm), np.nan)
                M_logstd = np.full((nsounding, nm), np.nan)
                M_median = np.full((nsounding, nm), np.nan)
                M_harmonicmean = np.full((nsounding, nm), np.nan)
                if computeKL_continuous:
                    M_KL = np.full((nsounding, nm), np.nan)

                # Load all prior data into memory
                M_all = dataset[:]

                # Precompute log10 prior histograms once -- the prior is the
                # same for all soundings. Histograms are used instead of KDE
                # for speed.
                if computeKL_continuous:
                    n_bins_kl = 50
                    n_prior = M_all.shape[0]
                    # Sampling without replacement is impossible when more
                    # samples are requested than the prior holds.
                    idx_kl = np.random.choice(n_prior, nr, replace=nr > n_prior)
                    M_prior_kl = np.log10(np.maximum(M_all[idx_kl, :], 1e-10))
                    kl_bins = []
                    kl_prior_hist = []
                    for _i in range(nm):
                        col = M_prior_kl[:, _i]
                        bins = np.linspace(col.min(), col.max(), n_bins_kl + 1)
                        h, _ = np.histogram(col, bins=bins)
                        # Small offset keeps every bin strictly positive.
                        h = (h + 1e-10) / (h.sum() + n_bins_kl * 1e-10)
                        kl_bins.append(bins)
                        kl_prior_hist.append(h)

                # Sequential processing, one data location at a time
                for iid in tqdm(ip_range, mininterval=1, disable=disableTqdm, desc='%s-continuous' % name, leave=False):
                    ir = np.int64(i_use[iid, :])
                    m_post = M_all[ir, :]

                    M_logmean[iid, :] = np.exp(np.mean(np.log(m_post), axis=0))
                    M_mean[iid, :] = np.mean(m_post, axis=0)
                    M_median[iid, :] = np.median(m_post, axis=0)
                    with np.errstate(invalid='ignore', divide='ignore'):
                        M_logstd[iid, :] = np.std(np.log10(np.maximum(m_post, 1e-10)), axis=0)
                    M_std[iid, :] = np.std(m_post, axis=0)

                    # Trimmed harmonic mean: average the reciprocals with 10%
                    # removed from each tail, then invert back.
                    _c = 1.0 / np.maximum(m_post, 1e-10)
                    _k = int(np.floor(0.10 * _c.shape[0]))
                    _cs = np.sort(_c, axis=0)
                    M_harmonicmean[iid, :] = 1.0 / np.mean(_cs[_k:_c.shape[0]-_k, :], axis=0)

                    if computeKL_continuous:
                        m_post_log = np.log10(np.maximum(m_post, 1e-10))
                        for _i in range(nm):
                            h_q, _ = np.histogram(m_post_log[:, _i], bins=kl_bins[_i])
                            h_q = (h_q + 1e-10) / (h_q.sum() + n_bins_kl * 1e-10)
                            M_KL[iid, _i] = np.sum(h_q * np.log2(h_q / kl_prior_hist[_i]))

                # Create datasets
                for stat in ['Mean', 'Median', 'Std', 'LogStd', 'LogMean', 'HarmonicMean']:
                    dset = '/%s/%s' % (name, stat)
                    _ensure_dataset(f_post, dset, (nsounding, nm),
                                    'Creating %s in %s' % (dset, f_post_h5))

                f_post['/%s/%s' % (name, 'LogMean')][:] = M_logmean
                f_post['/%s/%s' % (name, 'Mean')][:] = M_mean
                f_post['/%s/%s' % (name, 'Median')][:] = M_median
                f_post['/%s/%s' % (name, 'Std')][:] = M_std
                f_post['/%s/%s' % (name, 'LogStd')][:] = M_logstd
                f_post['/%s/%s' % (name, 'HarmonicMean')][:] = M_harmonicmean
                if computeKL_continuous:
                    dset = '/%s/KL' % name
                    _ensure_dataset(f_post, dset, (nsounding, nm))
                    f_post[dset][:] = M_KL

            elif is_model and dataset.attrs['is_discrete'] == 1:
                if showInfo > 2:
                    print('%s: DISCRETE' % name)

                nm = dataset.shape[1]
                nsounding, nr = i_use.shape
                # Get number of classes for name
                class_id = f_prior[name].attrs['class_id']
                n_classes = len(class_id)

                # Determine log_base for KL: use n_classes from class_name if available
                if 'class_name' in f_prior[name].attrs:
                    log_base_kl = len(f_prior[name].attrs['class_name'])
                else:
                    log_base_kl = n_classes if n_classes > 1 else 2
                # A base of 1 would divide the entropy by log(1) = 0.
                entropy_base = n_classes if n_classes > 1 else 2

                if showInfo > 1:
                    print('%s: DISCRETE, N_classes =%d' % (name, n_classes))

                # Initialize with NaN for all data points
                M_mode = np.full((nsounding, nm), np.nan)
                M_entropy = np.full((nsounding, nm), np.nan)
                M_P = np.full((nsounding, n_classes, nm), np.nan)
                if computeKL_discrete:
                    M_KL = np.full((nsounding, nm), np.nan)

                # Create datasets in h5 file
                for stat in ['Mode', 'Entropy']:
                    dset = '/%s/%s' % (name, stat)
                    _ensure_dataset(f_post, dset, (nsounding, nm),
                                    'Creating %s in %s' % (dset, f_post_h5))
                dset = '/%s/%s' % (name, 'P')
                _ensure_dataset(f_post, dset, (nsounding, n_classes, nm),
                                'Creating %s' % dset)

                # Load all prior data into memory
                M_all = dataset[:]

                # Subsample prior once for KL (prior is fixed across soundings)
                if computeKL_discrete:
                    n_prior = M_all.shape[0]
                    idx_kl = np.random.choice(n_prior, nr, replace=nr > n_prior)
                    M_prior_kl = M_all[idx_kl, :]

                # Sequential processing, one data location at a time
                for iid in tqdm(ip_range, mininterval=1, disable=disableTqdm, desc='%s-discrete' % name, leave=False):
                    ir = np.int64(i_use[iid, :])
                    m_post = M_all[ir, :]

                    # Compute class probabilities
                    n_count = np.zeros((n_classes, nm))
                    for ic in range(n_classes):
                        n_count[ic, :] = np.sum(class_id[ic] == m_post, axis=0) / nr
                    M_P[iid, :, :] = n_count

                    # Compute mode
                    M_mode[iid, :] = class_id[np.argmax(n_count, axis=0)]

                    # Compute entropy
                    M_entropy[iid, :] = scipy.stats.entropy(n_count, base=entropy_base)
                    if computeKL_discrete:
                        M_KL[iid, :] = kl_divergence(m_post, M_prior_kl, is_discrete=True, log_base=log_base_kl)

                f_post['/%s/%s' % (name, 'Mode')][:] = M_mode
                f_post['/%s/%s' % (name, 'Entropy')][:] = M_entropy
                f_post['/%s/%s' % (name, 'P')][:] = M_P
                if computeKL_discrete:
                    dset = '/%s/KL' % name
                    _ensure_dataset(f_post, dset, (nsounding, nm))
                    f_post[dset][:] = M_KL

            else:
                if showInfo > 1:
                    print('%s: NOT RECOGNIZED' % name.upper())

    return None
691
+
692
+
693
def sample_from_posterior(is_, d_sim, f_data_h5='tTEM-Djursland.h5', N_use=1000000, autoT=1, ns=400):
    """
    Draw posterior sample indices for one data location.

    The observed data and standard deviations of row ``is_`` are read from
    ``/D1/d_obs`` and ``/D1/d_std``. Entries that are NaN or exactly zero in
    the observation are ignored. A Gaussian log-likelihood with independent
    noise is evaluated for every row of ``d_sim`` and ``ns`` indices are drawn
    with ``lu_post_sample_logl``.

    Parameters
    ----------
    is_ : int
        Row index of the data location in ``f_data_h5``.
    d_sim : ndarray
        Simulated data, one row per realization.
    f_data_h5 : str, optional
        Filepath of the data file. Default is 'tTEM-Djursland.h5'.
    N_use : int, optional
        Accepted for compatibility; not used by this function.
        Default is 1000000.
    autoT : int, optional
        If 1, the annealing temperature is estimated with ``logl_T_est``;
        otherwise a temperature of 1 is used. Default is 1.
    ns : int, optional
        Number of samples to draw from the posterior. Default is 400.

    Returns
    -------
    tuple
        A tuple containing the following elements:

        i_use : ndarray
            Indices (rows of ``d_sim``) of the drawn realizations.
        T : float
            Annealing temperature used.
        EV : float
            ``max(logL) + log(mean(exp(logL - max(logL))))``, i.e. the
            logarithm of the mean likelihood.
        is_index : int
            The input ``is_``, returned unchanged.
    """
    with h5py.File(f_data_h5, 'r') as h5_data:
        obs_row = h5_data['/D1/d_obs'][is_, :]
        std_row = h5_data['/D1/d_std'][is_, :]

    # Use only data entries that are not NaN and not exactly zero.
    keep = np.where(~np.isnan(obs_row) & (np.abs(obs_row) > 0))[0]
    obs = obs_row[keep]
    variance = std_row[keep]**2

    # Log-likelihood of every simulated data set
    sq_misfit = (d_sim[:, keep] - obs)**2
    logL = np.sum(-0.5 * sq_misfit / variance, axis=1)

    # Annealing temperature
    T = logl_T_est(logL) if autoT == 1 else 1
    logL_max = np.nanmax(logL)

    # Draw ns realizations using the log-likelihood values and the temperature
    i_post, _ = lu_post_sample_logl(logL, ns, T)

    # Evidence, evaluated relative to the maximum for numerical stability
    EV = logL_max + np.log(np.nansum(np.exp(logL - logL_max))/len(logL))
    return i_post, T, EV, is_
752
+
753
+
754
+
755
+
756
+ #def sample_from_posterior_chunk(is_,d_sim,f_data_h5, N_use,autoT,ns):
757
+ # return sample_from_posterior(is_,d_sim,f_data_h5, N_use,autoT,ns)
758
+
759
+ #%% integrate_prior_data: updates PRIOR structure with DATA
760
def prior_data(f_prior_in_h5, f_forward_h5, id=1, im=1, doMakePriorCopy=0, parallel=True):
    """
    Update prior structure with forward modeled data.

    The data type is read from the 'type' attribute of the forward file
    ('TDEM' when the attribute is missing) and the matching integration
    routine is called: ``integrate_prior_data_gaaem`` for TDEM with method
    'ga-aem', ``integrate_prior_data_identity`` for type 'identity'.

    Parameters
    ----------
    f_prior_in_h5 : str
        Path to input prior HDF5 file containing prior models.
    f_forward_h5 : str
        Path to forward modeling HDF5 file.
    id : int, optional
        Data identifier for the prior structure. Default is 1.
    im : int, optional
        Model identifier for the prior structure. Default is 1.
    doMakePriorCopy : int, optional
        Passed on to the integration routine. Default is 0.
    parallel : bool, optional
        Accepted for compatibility; not used by this function.
        Default is True.

    Returns
    -------
    str or None
        Path to the updated prior HDF5 file. An empty string is returned when
        one of the two file arguments is None, and None when the data type or
        method is not supported (a message is printed in both cases).
    """
    # Check if at least two inputs are provided
    if f_prior_in_h5 is None or f_forward_h5 is None:
        print(f'{__name__}: Use at least two inputs to')
        help(__name__)
        return ''

    # Read data type and (optional) method from the forward file in one pass
    with h5py.File(f_forward_h5, 'r') as f:
        data_type = f.attrs['type'] if 'type' in f.attrs else 'TDEM'
        method = f.attrs['method'] if 'method' in f.attrs else None

    data_type_lc = data_type.lower()
    if data_type_lc == 'tdem':
        if method is None:
            # No method name is available here, so it must not be formatted
            # into the message.
            print(f'{__name__}: "TDEM" forward file {f_forward_h5} has no "method" attribute')
            return
        if method.lower() != 'ga-aem':
            print(f'{__name__}: "TDEM/{method}" not supported')
            return
        f_prior_h5, id, im = integrate_prior_data_gaaem(f_prior_in_h5, f_forward_h5, id, im, doMakePriorCopy)
    elif data_type_lc == 'identity':
        f_prior_h5, id, im = integrate_prior_data_identity(f_prior_in_h5, f_forward_h5, id, im, doMakePriorCopy)
    else:
        print(f'{__name__}: "{data_type}" not supported')
        return

    # update prior data with an attribute defining the prior
    # NOTE(review): the literal string 'f5_forward' is stored, not the path in
    # f_forward_h5 -- confirm this is intended.
    with h5py.File(f_prior_h5, 'a') as f:
        f.attrs[f'/D{id}'] = 'f5_forward'

    integrate_update_prior_attributes(f_prior_h5)

    return f_prior_h5
837
+
838
+
839
+ '''
840
+ Forward simulation
841
+ '''
842
+
843
def _compress_layers(conductivity, thickness):
    """
    Merge runs of consecutive layers that share the same conductivity.

    Parameters
    ----------
    conductivity : numpy.ndarray
        1D array with one conductivity value per layer.
    thickness : numpy.ndarray
        1D array with the thickness of every layer except the last one.

    Returns
    -------
    tuple of numpy.ndarray
        ``(conductivity_compress, thickness_compress)`` holding one
        conductivity per merged layer and the summed thickness of every
        merged layer except the last one.
    """
    conductivity = np.asarray(conductivity)
    # Index of the first sample of every new layer (layer 0 starts at 0)
    i_change = np.where(np.diff(conductivity) != 0)[0] + 1
    edges = np.concatenate(([0], i_change)).astype(int)
    conductivity_compress = conductivity[edges].astype(float)
    thickness_compress = np.array(
        [np.sum(thickness[i1:i2]) for i1, i2 in zip(edges[:-1], edges[1:])],
        dtype=float)
    return conductivity_compress, thickness_compress


def forward_gaaem(C=np.array(()),
                  thickness=np.array(()),
                  stmfiles=None,
                  tx_height=np.array(()),
                  txrx_dx=-13,
                  txrx_dy=0,
                  txrx_dz=.1,
                  GEX=None,
                  file_gex=None,
                  showtime=False,
                  **kwargs):
    """
    Perform forward modeling using the GA-AEM method.

    Parameters
    ----------
    C : numpy.ndarray, optional
        Conductivity array, either 1D (one model, nl layers) or 2D
        (nd models, nl layers). Default is np.array(()).
    thickness : numpy.ndarray, optional
        Thickness array with nl-1 entries. Default is np.array(()).
    stmfiles : list, optional
        List of STM files. Only the first two are used. Default is None.
    tx_height : numpy.ndarray, optional
        Transmitter height array. One value is used for all models, several
        values are used per model. Default is np.array(()).
    txrx_dx : float, optional
        X-distance between transmitter and receiver. Default is -13.
    txrx_dy : float, optional
        Y-distance between transmitter and receiver. Default is 0.
    txrx_dz : float, optional
        Z-distance between transmitter and receiver. Default is 0.1.
    GEX : dict, optional
        GEX dictionary. Default is None, which is treated as an empty dict.
    file_gex : str, optional
        Path to GEX file. Default is None.
    showtime : bool, optional
        Flag to display execution time. Default is False.
    showInfo : int, optional
        Level of verbosity for output. Negative values disable the
        progress bar.
    doCompress : bool, optional
        Flag to enable layer compression. Default is True.
    progress_callback : callable, optional
        Called as progress_callback(current, total, info_dict) roughly
        100 times during the forward computation.

    Returns
    -------
    numpy.ndarray or int
        Forward modeled data array of shape (nd, ng), or -1 when neither
        STM files, a GEX file nor a GEX dictionary is provided.

    Raises
    ------
    ValueError
        If the number of thickness values is not the number of layers
        minus one.

    Notes
    -----
    All keyword arguments are forwarded unchanged to ``gex_to_stm`` and
    ``write_stm_files`` when STM files have to be generated.
    """
    showInfo = kwargs.get('showInfo', 0)
    progress_callback = kwargs.get('progress_callback', None)
    disableTqdm = showInfo < 0
    doCompress = kwargs.get('doCompress', True)

    # Handle None defaults
    if stmfiles is None:
        stmfiles = []
    if file_gex is None:
        file_gex = ''
    if GEX is None:
        GEX = {}

    have_stm = len(stmfiles) > 0
    have_gex = len(GEX) > 0
    have_file = file_gex != ''

    # Validate the system description before loading the forward-modeling DLL
    if not (have_stm or have_gex or have_file):
        print('Error: No GEX or STM files provided')
        return -1

    from gatdaem1d import Earth
    from gatdaem1d import Geometry
    # Next should probably only be loaded if the DLL is not already loaded!
    from gatdaem1d import TDAEMSystem  # loads the DLL
    import integrate as ig
    import time
    from tqdm import tqdm

    def _read_gex_any():
        # Try legacy read_gex first, fall back to read_gex_workbench if needed
        try:
            return ig.read_gex(file_gex)
        except (ValueError, KeyError) as e:
            if showInfo > 0:
                print(f"Legacy read_gex() failed ({type(e).__name__}), trying read_gex_workbench()...")
            return ig.read_gex_workbench(file_gex, showInfo=showInfo)

    # Resolve STM files and GEX information. Only these combinations can
    # occur once the "nothing provided" case above has been ruled out.
    gex_fresh_from_file = False
    if have_stm:
        if have_file and not have_gex:
            # GEX FILE and STM FILES
            if showInfo > 1:
                print('Using submitted GEX file (%s)' % (file_gex))
            GEX = _read_gex_any()
            gex_fresh_from_file = True
        elif showInfo > 1:
            # USING STM FILES (optionally together with a GEX dictionary)
            print('Using submitted STM files (%s)' % (stmfiles,))
    elif have_file and not have_gex:
        # ONLY GEX FILE
        stmfiles, GEX = ig.gex_to_stm(file_gex, **kwargs)
    else:
        # GEX dictionary without STM files: write the STM files
        print('Using submitted GEX and writing STM files')
        stmfiles = ig.write_stm_files(GEX, **kwargs)

    if showInfo > 0:
        print('Using STM files : ')
        print(stmfiles)

    if showInfo > 1 and 'filename' in GEX:
        print('Using GEX file: ', GEX['filename'])

    nstm = len(stmfiles)
    if showInfo > 0:
        for stmfile in stmfiles:
            print('Using MOMENT:', stmfile)

    if C.ndim == 1:
        nd = 1
        nl = C.shape[0]
    else:
        nd, nl = C.shape

    nt = thickness.shape[0]
    if nt != (nl-1):
        raise ValueError('Error: thickness array (nt=%d) does not match the number of layers minus 1(nl=%d)' % (nt,nl))

    if showInfo > 0:
        print('nd=%s, nl=%d, nstm=%d' %(nd,nl,nstm))

    # Setting up the systems (low moment, and high moment if available)
    t1 = time.time()
    S = [TDAEMSystem(stmfiles[0])]
    if nstm > 1:
        S.append(TDAEMSystem(stmfiles[1]))
    t_system = 1000*(time.time()-t1)
    if showtime:
        print("Time, Setting up systems = %4.1fms" % t_system)

    # Heights may arrive as a scalar, a list or an (N, 1) array; use a flat
    # float vector so len() and float(tx_height[i]) are always well defined.
    tx_height = np.asarray(tx_height, dtype=float).ravel()

    # Setting up geometry
    if len(GEX) > 0:
        # Re-read the GEX file when one is given. Without a file the
        # supplied GEX dictionary is used as is, and a GEX that was read
        # from file_gex just above is not read a second time.
        if file_gex != '' and not gex_fresh_from_file:
            GEX = _read_gex_any()
        general = GEX['General']
        rx_pos = general['RxCoilPosition1']
        if 'TxCoilPosition1' in general:
            # Typical for tTEM system
            tx_pos = general['TxCoilPosition1']
            txrx_dx = float(rx_pos[0])-float(tx_pos[0])
            txrx_dy = float(rx_pos[1])-float(tx_pos[1])
            txrx_dz = float(rx_pos[2])-float(tx_pos[2])
            if len(tx_height) == 0:
                tx_height = np.array([-float(tx_pos[2])])
        else:
            # Typical for SkyTEM system
            txrx_dx = float(rx_pos[0])
            txrx_dy = float(rx_pos[1])
            txrx_dz = float(rx_pos[2])
            if len(tx_height) == 0:
                tx_height = np.array([40.0])

        # Set geometry once, if tx_height has one value
        if len(tx_height) == 1:
            if showInfo > 1:
                print('Using tx_height=%f' % tx_height[0])
            G = Geometry(tx_height=float(tx_height[0]), txrx_dx = txrx_dx, txrx_dy = txrx_dy, txrx_dz = txrx_dz)
        if showInfo > 1:
            print('tx_height=%f, txrx_dx=%f, txrx_dy=%f, txrx_dz=%f' % (tx_height[0], txrx_dx, txrx_dy, txrx_dz))

        # Handle both scalar and array values for NumPy 2.x compatibility
        no_gates_ch1 = np.atleast_1d(GEX['Channel1']['NoGates'])[0]
        remove_gates_ch1 = np.atleast_1d(GEX['Channel1']['RemoveInitialGates'])[0]
        ng0 = no_gates_ch1 - remove_gates_ch1
        if nstm > 1:
            no_gates_ch2 = np.atleast_1d(GEX['Channel2']['NoGates'])[0]
            remove_gates_ch2 = np.atleast_1d(GEX['Channel2']['RemoveInitialGates'])[0]
            ng1 = no_gates_ch2 - remove_gates_ch2
        else:
            ng1 = 0
        ng = int(ng0+ng1)
        ng_is_guess = False
    else:
        if len(tx_height) == 0:
            tx_height = np.array([0.0])
        G = Geometry(tx_height=float(tx_height[0]), txrx_dx = txrx_dx, txrx_dy = txrx_dy, txrx_dz = txrx_dz)
        # The number of gates is not parsed from the STM files; 41 is an
        # initial guess that is corrected from the first computed response.
        ng = 41
        ng_is_guess = True

    if showInfo > 0:
        print('txrx_dx=%f, txrx_dy=%f, txrx_dz=%f' % (txrx_dx, txrx_dy, txrx_dz))
        print('ng=%d' % ng)

    D = np.zeros((nd,ng))

    # Compute forward data
    t1 = time.time()
    # Throttle callback to ~100 updates so it does not dominate runtime
    progress_step = max(1, nd // 100)

    for i in tqdm(range(nd), mininterval=1, disable=disableTqdm, desc='gatdaem1d', leave=False):
        if progress_callback and ((i + 1) % progress_step == 0 or i + 1 == nd):
            _report_progress(progress_callback, i + 1, nd,
                             'computing', 'Forward modeling (%d/%d soundings)' % (i + 1, nd))

        # Only one model when C is 1D
        conductivity = C if C.ndim == 1 else C[i]

        # Update geometry, tx_height is changing!
        if len(tx_height) > 1:
            if showInfo > 1:
                print('Using tx_height=%f' % tx_height[i])
            G = Geometry(tx_height=float(tx_height[i]), txrx_dx = txrx_dx, txrx_dy = txrx_dy, txrx_dz = txrx_dz)

        if doCompress:
            E = Earth(*_compress_layers(conductivity, thickness))
        else:
            E = Earth(conductivity,thickness)

        d = -S[0].forwardmodel(G,E).SZ
        if nstm > 1:
            d = np.concatenate((d, -S[1].forwardmodel(G,E).SZ))

        if ng_is_guess and i == 0 and np.size(d) != ng:
            # Without GEX information the gate count is only known once the
            # first response has been computed.
            ng = int(np.size(d))
            D = np.zeros((nd,ng))

        D[i] = d

    t2 = time.time()
    if showtime:
        print("Time = %4.1fms per model and %d model tests" % (1000*(t2-t1)/max(nd, 1), nd))

    return D
1131
+
1132
def forward_gaaem_chunk(C_chunk, tx_height_chunk, thickness, stmfiles, file_gex, Nhank, Nfreq, **kwargs):
    """
    Run the GA-AEM forward model sequentially on one chunk of models.

    This is the unit of work handed to each worker process; it simply
    forwards its arguments to forward_gaaem() with parallel=False.

    Parameters
    ----------
    C_chunk : numpy.ndarray
        Conductivity models of this chunk.
    tx_height_chunk : numpy.ndarray
        Transmitter heights belonging to this chunk.
    thickness : numpy.ndarray
        Layer thicknesses, passed on unchanged.
    stmfiles : list
        STM files describing the system.
    file_gex : str
        Path to the GEX file.
    Nhank : int
        Passed on to forward_gaaem() as Nhank.
    Nfreq : int
        Passed on to forward_gaaem() as Nfreq.
    **kwargs : dict
        Any further keyword arguments for forward_gaaem().

    Returns
    -------
    numpy.ndarray
        Whatever forward_gaaem() returns for this chunk.
    """
    chunk_options = dict(
        C=C_chunk,
        thickness=thickness,
        tx_height=tx_height_chunk,
        stmfiles=stmfiles,
        file_gex=file_gex,
        Nhank=Nhank,
        Nfreq=Nfreq,
        parallel=False,
    )
    # Double-star expansion of both dicts keeps the duplicate-keyword
    # TypeError of a plain keyword call.
    return forward_gaaem(**chunk_options, **kwargs)
1169
+
1170
+ # %% PRIOR DATA GENERATORS
1171
+
1172
+ # Add this function to check current handle count (Windows only)
1173
def get_process_handle_count():
    """
    Report how many handles the current process holds (Windows only).

    Returns
    -------
    int
        Handle count reported by psutil for the current process id.
    """
    import os
    import psutil

    this_process = psutil.Process(os.getpid())
    return this_process.num_handles()
1185
+
1186
def prior_data_gaaem(f_prior_h5, file_gex=None, stmfiles=None, N=0, doMakePriorCopy=True, im=1, id=1, im_height=0, Nhank=280, Nfreq=12, is_log=False, parallel=True, force_replace=False, **kwargs):
    """
    Generate prior data for the GA-AEM method.

    Parameters
    ----------
    f_prior_h5 : str
        Path to the prior data file in HDF5 format.
    file_gex : str, optional
        Path to the file containing geophysical exploration data (.gex format).
    stmfiles : list of str, optional
        List of STM files for system configuration. If not provided, will be
        generated from file_gex.
    N : int, optional
        Number of soundings to consider. Default is 0 (use all).
    doMakePriorCopy : bool, optional
        Flag indicating whether to make a copy of the prior file. Default is True.
    im : int, optional
        Index of the model. Default is 1.
    id : int, optional
        Index of the data. Default is 1.
    im_height : int, optional
        Index of the model for height. Default is 0 (no height model).
    Nhank : int, optional
        Number of Hankel transform quadrature points. Default is 280.
    Nfreq : int, optional
        Number of frequencies. Default is 12.
    is_log : bool, optional
        Flag to apply logarithmic scaling (log10) to data. Default is False.
    parallel : bool, optional
        Flag indicating whether multiprocessing is used. Default is True.
        When True, forward modeling is parallelized across available CPUs.
    force_replace : bool, optional
        If True, delete an existing /D{id} dataset before writing.
        If False (default), print a warning and return early if the
        dataset already exists. The check is made before any forward
        modeling is started.
    **kwargs : dict
        Additional keyword arguments:

        Ncpu : int, optional
            Number of CPUs to use for parallel processing. Default is 0, which
            uses all available CPUs. Only used when parallel=True.
        Nproc : int, optional
            Alias for Ncpu; takes precedence when both are given.
        showInfo : int, optional
            Level of verbosity for output (0=silent, 1=normal, 2=verbose).
        progress_callback : callable, optional
            Called as progress_callback(current, total, info_dict). It is
            removed from kwargs so it is never pickled to worker processes.

    Returns
    -------
    str or None
        Filename of the HDF5 file containing the updated prior data, or
        None when called from a process that is not the main process.

    Raises
    ------
    KeyError
        If the model dataset /M{im} is missing from the prior data file.

    Notes
    -----
    This function computes forward-modeled electromagnetic responses for prior
    model realizations using the GA-AEM forward modeling code. The forward
    modeling can be parallelized for faster computation on multi-core systems.

    Examples
    --------
    >>> # Basic usage with all CPUs
    >>> f_prior_data = prior_data_gaaem(f_prior_h5, file_gex)

    >>> # Use specific number of CPUs
    >>> f_prior_data = prior_data_gaaem(f_prior_h5, file_gex, Ncpu=4)

    >>> # Sequential processing (no parallelization)
    >>> f_prior_data = prior_data_gaaem(f_prior_h5, file_gex, parallel=False)
    """
    import os
    import time
    from functools import partial

    import integrate as ig

    # Safety guard: if somehow called from a worker process, do nothing.
    if multiprocessing.current_process().name != 'MainProcess':
        return None

    data_type = 'TDEM'
    method = 'ga-aem'
    showInfo = kwargs.get('showInfo', 0)
    # 'Nproc' overrides 'Ncpu' when both are set in kwargs
    Ncpu = kwargs.get('Nproc', kwargs.get('Ncpu', 0))
    # Pop (not get): the callback must never be pickled to worker processes
    progress_callback = kwargs.pop('progress_callback', None)

    if showInfo>0:
        print('prior_data_gaaem: %s/%s -- starting' % (data_type, method))

    # Force open/close of hdf5 file
    if showInfo>0:
        print('Forcing open and close of %s' % (f_prior_h5))
    with h5py.File(f_prior_h5, 'r') as f:
        # open and close
        pass

    with h5py.File(f_prior_h5, 'r') as f:
        N_in = f['M1'].shape[0]
    if N == 0 or N > N_in:
        N = N_in

    # Missing input files are reported but not treated as fatal here
    if (file_gex is not None) and not os.path.isfile(file_gex):
        print("ERRROR: file_gex=%s does not exist in the current folder." % file_gex)

    if stmfiles is not None:
        for i, stmfile in enumerate(stmfiles):
            if not os.path.isfile(stmfile):
                print("ERRROR: stmfiles[%d]=%s does not exist in the current folder." % (i,stmfile))

    if doMakePriorCopy:
        # Derive the name of the copy from the GEX file, else the first STM file
        if (file_gex is not None) and os.path.isfile(file_gex):
            file_basename = os.path.splitext(os.path.basename(file_gex))[0]
        elif (stmfiles is not None) and (len(stmfiles)>0):
            file_basename = os.path.splitext(os.path.basename(stmfiles[0]))[0]
        else:
            file_basename = 'GAAEM'

        print('Using file_basename=%s' % file_basename)

        if N < N_in:
            f_prior_data_h5 = '%s_%s_N%d_Nh%d_Nf%d.h5' % (os.path.splitext(f_prior_h5)[0], os.path.splitext(file_basename)[0], N, Nhank, Nfreq)
        else:
            f_prior_data_h5 = '%s_%s_Nh%d_Nf%d.h5' % (os.path.splitext(f_prior_h5)[0], os.path.splitext(file_basename)[0], Nhank, Nfreq)

        if (showInfo>0):
            print("Creating a copy of %s" % (f_prior_h5))
            print("                as %s" % (f_prior_data_h5))
        if (showInfo>1):
            print('  using N=%d of N_in=%d data' % (N,N_in))

        # make a copy of the prior file
        ig.copy_hdf5_file(f_prior_h5, f_prior_data_h5,N,showInfo=showInfo)
    else:
        f_prior_data_h5 = f_prior_h5

    Mname = '/M%d' % im
    Mheight = '/M%d' % im_height
    Dname = '/D%d' % id

    # An empty height array makes forward_gaaem() pick its default height
    tx_height = np.array(())

    with h5py.File(f_prior_data_h5, 'r') as f_prior_r:
        # Refuse to overwrite BEFORE the forward computation, so no work
        # is done whose result would be discarded anyway.
        if (Dname in f_prior_r) and not force_replace:
            print("Key '%s' already exists in %s. Use force_replace=True to overwrite." % (Dname, f_prior_data_h5))
            return f_prior_data_h5

        if Mname not in f_prior_r:
            raise KeyError('Could not load %s from %s' % (Mname, f_prior_data_h5))

        if im_height>0:
            if (showInfo>1):
                print('Using M%d for height' % im_height)
            tx_height = f_prior_r[Mheight][:]

        # Get thickness from the depth axis ('x' preferred, else 'z')
        model_attrs = f_prior_r[Mname].attrs
        z = model_attrs['x'] if 'x' in model_attrs else model_attrs['z']
        thickness = np.diff(z)

        # Get conductivity as the reciprocal of the stored model values
        C = 1 / f_prior_r[Mname][:]

        N = f_prior_r[Mname].shape[0]

    t1 = time.time()
    if not parallel:
        if (showInfo>-1):
            print("prior_data_gaaem: Using 1 thread /(sequential).")
        if im_height>0 and showInfo>0:
            print('Using tx_height')
        D = ig.forward_gaaem(C=C,
                             thickness=thickness,
                             tx_height=tx_height,
                             file_gex=file_gex,
                             stmfiles=stmfiles,
                             Nhank=Nhank,
                             Nfreq=Nfreq,
                             parallel=parallel,
                             progress_callback=progress_callback, **kwargs)
    else:
        # Make sure STM files are only written once (needed for multiprocessing)
        if stmfiles is None or len(stmfiles)==0:
            stmfiles, _ = ig.gex_to_stm(file_gex, Nhank=Nhank, Nfreq=Nfreq, **kwargs)

        if Ncpu < 1:
            Ncpu = int(multiprocessing.cpu_count())
        if (showInfo>-1):
            print("prior_data_gaaem: Using %d parallel threads." % (Ncpu))

        # Create chunks. Finer chunking gives smoother live progress updates.
        # Never create more chunks than models, so no worker gets an empty chunk.
        n_wanted = Ncpu if progress_callback is None else Ncpu * 4
        n_chunks = max(1, min(C.shape[0], n_wanted))
        C_chunks = np.array_split(C, n_chunks)

        if im_height>0:
            tx_height_chunks = np.array_split(tx_height, n_chunks)
        else:
            # one empty height array per chunk
            tx_height_chunks = [np.array(())]*n_chunks

        forward_gaaem_chunk_partial = partial(forward_gaaem_chunk, thickness=thickness, stmfiles=stmfiles, file_gex=file_gex, Nhank=Nhank, Nfreq=Nfreq, **kwargs)

        # On Windows and macOS, multiprocessing uses 'spawn' which normally
        # re-executes the user's __main__ script in every worker process.
        # We prevent this by setting __main__.__spec__ = SimpleNamespace(name='__main__')
        # before creating the Pool. The spawn bootstrap then calls
        # _fixup_main_from_name('__main__'), which immediately returns because the
        # worker's bootstrap module already has __name__ == '__main__' — so the
        # user's script is never re-run in workers. No if __name__=='__main__' guard
        # is needed in user scripts on any platform.
        use_spawn = os.name == 'nt' or sys.platform == 'darwin'
        main_module = sys.modules.get('__main__')
        spec_patched = main_module is not None and getattr(main_module, '__spec__', None) is None
        if spec_patched:
            main_module.__spec__ = types.SimpleNamespace(name='__main__')
        try:
            if use_spawn:
                if os.name == 'nt':
                    Ncpu = min(Ncpu, 60)  # Windows handle limit
                ctx = multiprocessing.get_context('spawn')
            else:
                ctx = multiprocessing.get_context('fork')
            with ctx.Pool(processes=min(Ncpu, n_chunks)) as p:
                if progress_callback is None:
                    D_chunks = p.starmap(forward_gaaem_chunk_partial, zip(C_chunks, tx_height_chunks))
                else:
                    # apply_async + ordered get() keeps chunk order for the
                    # concatenate below while reporting per finished chunk
                    async_results = [p.apply_async(forward_gaaem_chunk_partial, args=(Cc, th))
                                     for Cc, th in zip(C_chunks, tx_height_chunks)]
                    D_chunks = []
                    n_total = C.shape[0]
                    n_done = 0
                    for r in async_results:
                        D_chunk = r.get()
                        D_chunks.append(D_chunk)
                        n_done += D_chunk.shape[0]
                        _report_progress(progress_callback, n_done, n_total,
                                         'computing', 'Forward modeling (%d/%d soundings)' % (n_done, n_total))
        finally:
            # Always undo the __main__.__spec__ patch
            if spec_patched:
                main_module.__spec__ = None

        D = np.concatenate(D_chunks)

    if is_log:
        D = np.log10(D)

    t_elapsed = time.time() - t1
    if (showInfo>-1):
        # Guard the rates against N == 0 and a zero-length timing interval
        ms_per_sounding = 1000*t_elapsed/max(N, 1)
        soundings_per_s = N/t_elapsed if t_elapsed > 0 else float('inf')
        print('prior_data_gaaem: Time=%5.1fs/%d soundings. %4.1fms/sounding, %3.1fit/s' % (t_elapsed, N, ms_per_sounding, soundings_per_s))

    _report_progress(progress_callback, N, N,
                     'saving', 'Saving forward data to %s' % f_prior_data_h5)

    # Write D to f_prior[Dname]
    with h5py.File(f_prior_data_h5, 'a') as f_prior:
        if Dname in f_prior:
            if force_replace:
                del f_prior[Dname]
            else:
                # Only reachable if the dataset appeared while computing
                print("Key '%s' already exists in %s. Use force_replace=True to overwrite." % (Dname, f_prior_data_h5))
                return f_prior_data_h5
        f_prior[Dname] = D

        # Add method, type, im, Nhank and Nfreq as attributes to the dataset
        f_prior[Dname].attrs['method'] = method
        f_prior[Dname].attrs['type'] = data_type
        f_prior[Dname].attrs['im'] = im
        f_prior[Dname].attrs['Nhank'] = Nhank
        f_prior[Dname].attrs['Nfreq'] = Nfreq

    integrate_update_prior_attributes(f_prior_data_h5)

    _report_progress(progress_callback, N, N,
                     'completed', 'Forward data saved to %s' % f_prior_data_h5)

    return f_prior_data_h5
1505
+
1506
+
1507
def prior_data_identity(f_prior_h5, id=0, im=1, N=0, doMakePriorCopy=False, **kwargs):
    """
    Generate data D{id} from model M{im} in the prior file f_prior_h5 as an identity of M{im}.

    Parameters
    ----------
    f_prior_h5 : str
        Path to the prior data file in HDF5 format.
    id : int, optional
        Index of the data. If id=0, the next available data id is used. Default is 0.
    im : int, optional
        Index of the model. Default is 1.
    N : int, optional
        Number of soundings to consider. Default is 0 (use all).
    doMakePriorCopy : bool, optional
        Flag indicating whether to make a copy of the prior file. Default is False.
    showInfo : int, optional
        Level of verbosity for output.
    forceDeleteExisting : bool, optional
        Flag to force deletion of existing data. Default is True.

    Returns
    -------
    tuple
        ``(f_prior_data_h5, id)``: path to the HDF5 file containing the
        updated prior data and the data id that was used. The same tuple
        is returned when an existing dataset is left untouched.
    """
    import integrate as ig

    showInfo = kwargs.get('showInfo', 0)
    forceDeleteExisting = kwargs.get('forceDeleteExisting', True)

    # Nothing is written in this step, so the file is opened read-only;
    # this also avoids creating an empty file when the path is wrong.
    with h5py.File(f_prior_h5, 'r') as f_prior:
        if id==0:
            # Use one more than the highest existing data id of 'D1', 'D2', ...
            id = 1
            for id_test in range(999):
                key = '/D%d' % id_test
                if key in f_prior:
                    if showInfo>1:
                        print('Checking key EXISTS: %s' % key)
                    id = id_test+1
            if showInfo>0:
                print('using id = %d' % id)

        N_in = f_prior['M1'].shape[0]

    if N == 0 or N > N_in:
        N = N_in

    if showInfo>2:
        print('N=%d, N_in=%d' % (N,N_in))

    if doMakePriorCopy:
        if N < N_in:
            f_prior_data_h5 = '%s_N%s_IDEN_im%d_id%d.h5' % (os.path.splitext(f_prior_h5)[0], N, im, id)
        else:
            f_prior_data_h5 = '%s_IDEN_im%d_id%d.h5' % (os.path.splitext(f_prior_h5)[0], im, id)
        if (showInfo>0):
            print("Creating a copy of %s as %s" % (f_prior_h5, f_prior_data_h5))
        ig.copy_hdf5_file(f_prior_h5, f_prior_data_h5,N)
    else:
        f_prior_data_h5 = f_prior_h5

    Mname = '/M%d' % im
    Dname = '/D%d' % id

    # copy f_prior[Mname] to Dname
    if showInfo>0:
        print('Copying %s to %s in filename=%s' % (Mname, Dname, f_prior_data_h5))

    with h5py.File(f_prior_data_h5, 'a') as f:
        # Look up the model first so a missing model fails before anything is deleted
        source = f[Mname]
        if Dname in f:
            if not forceDeleteExisting:
                print('Key %s allready exists - doing nothing' % Dname)
                # Same (filename, id) shape as the normal return below
                return f_prior_data_h5, id
            print('Key %s allready exists -- DELETING !!!!' % Dname)
            del f[Dname]

        dataset = f.create_dataset(Dname, data=source[...])
        dataset.attrs['description'] = 'Identiy of %s' % Mname
        dataset.attrs['f5_forward'] = 'none'
        dataset.attrs['with_noise'] = 0

    return f_prior_data_h5, id
1605
+
1606
+ # %% PRIOR MODEL GENERATORS
1607
+ def _report_progress(progress_callback, current, total, phase, status):
1608
+ """Invoke an optional progress callback as progress_callback(current, total, info_dict).
1609
+
1610
+ Follows the same convention as integrate_rejection(): info_dict carries
1611
+ 'phase' and 'status' keys, and errors raised by the callback are ignored
1612
+ so they can never break the computation.
1613
+ """
1614
+ if progress_callback is None:
1615
+ return
1616
+ try:
1617
+ progress_callback(current, total, {'phase': phase, 'status': status})
1618
+ except Exception:
1619
+ pass
1620
+
1621
+
1622
+ def prior_model_layered(lay_dist='uniform', dz = 1, z_max = 90,
1623
+ NLAY_min=3, NLAY_max=6, NLAY_deg=6,
1624
+ RHO_dist='log-uniform', RHO_min=0.1, RHO_max=5000, RHO_mean=100, RHO_std=80,
1625
+ N=100000, save_sparse=True, RHO_threshold=0.001, **kwargs):
1626
+ """
1627
+ Generate a prior model with layered structure.
1628
+
1629
+ This optimized implementation uses vectorized NumPy operations for improved
1630
+ performance, providing ~2x speedup for large N compared to the original
1631
+ loop-based implementation.
1632
+
1633
+ Parameters
1634
+ ----------
1635
+ lay_dist : str, optional
1636
+ Distribution of the number of layers. Options are 'chi2' and 'uniform'.
1637
+ Default is 'uniform'.
1638
+ dz : float, optional
1639
+ Depth discretization step. Default is 1.
1640
+ z_max : float, optional
1641
+ Maximum depth in m. Default is 90.
1642
+ NLAY_min : int, optional
1643
+ Minimum number of layers. Default is 3.
1644
+ NLAY_max : int, optional
1645
+ Maximum number of layers. Default is 6.
1646
+ NLAY_deg : int, optional
1647
+ Degrees of freedom for chi-square distribution. Only applicable if
1648
+ lay_dist is 'chi2'. Default is 6.
1649
+ RHO_dist : str, optional
1650
+ Distribution of resistivity within each layer. Options are 'log-uniform',
1651
+ 'uniform', 'normal', and 'lognormal'. Default is 'log-uniform'.
1652
+ RHO_min : float, optional
1653
+ Minimum resistivity value. Default is 0.1.
1654
+ RHO_max : float, optional
1655
+ Maximum resistivity value. Default is 5000.
1656
+ RHO_mean : float, optional
1657
+ Mean resistivity value. Only applicable if RHO_dist is 'normal' or
1658
+ 'lognormal'. Default is 100.
1659
+ RHO_std : float, optional
1660
+ Standard deviation of resistivity value. Only applicable if RHO_dist is
1661
+ 'normal' or 'lognormal'. Default is 80.
1662
+ N : int, optional
1663
+ Number of prior models to generate. Default is 100000.
1664
+ save_sparse : bool, optional
1665
+ Whether to save the sparse representation (M2: depth-resistivity pairs)
1666
+ to the HDF5 file. Setting to False can reduce file size and processing
1667
+ time for large priors. Default is True.
1668
+ RHO_threshold : float, optional
1669
+ Minimum physical resistivity threshold in Ohm·m. Any generated resistivity
1670
+ values below this threshold (including zero or negative values from 'normal'
1671
+ distribution) will be clamped to this value. Ensures physically realistic
1672
+ positive resistivity values. Default is 0.001 Ohm·m.
1673
+ f_prior_h5 : str, optional
1674
+ Path to the prior model file in HDF5 format. Default is ''.
1675
+ showInfo : int, optional
1676
+ Level of verbosity for output.
1677
+
1678
+ Returns
1679
+ -------
1680
+ str
1681
+ Filepath of the saved prior model.
1682
+
1683
+ Notes
1684
+ -----
1685
+ This implementation pre-generates all random values using vectorized NumPy
1686
+ operations, significantly improving performance for large N (e.g., N=50000).
1687
+ """
1688
+
1689
+ # Safety guard: if somehow called from a worker process, do nothing.
1690
+ if multiprocessing.current_process().name != 'MainProcess':
1691
+ return None
1692
+
1693
+ import integrate as ig
1694
+
1695
+ showInfo = kwargs.get('showInfo', 0)
1696
+ f_prior_h5 = kwargs.get('f_prior_h5', '')
1697
+ progress_callback = kwargs.get('progress_callback', None)
1698
+
1699
+ if NLAY_max < NLAY_min:
1700
+ NLAY_max = NLAY_min
1701
+
1702
+ if NLAY_min < 1:
1703
+ NLAY_min = 1
1704
+
1705
+ # Generate number of layers for all models at once
1706
+ if lay_dist == 'uniform':
1707
+ NLAY = np.random.randint(NLAY_min, NLAY_max+1, N)
1708
+ if len(f_prior_h5)<1:
1709
+ f_prior_h5 = 'PRIOR_UNIFORM_NL_%d-%d_%s_N%d.h5' % (NLAY_min, NLAY_max, RHO_dist, N)
1710
+
1711
+ elif lay_dist == 'chi2':
1712
+ NLAY = np.random.chisquare(NLAY_deg, N)
1713
+ NLAY = np.ceil(NLAY).astype(int)
1714
+ if len(f_prior_h5)<1:
1715
+ f_prior_h5 = 'PRIOR_CHI2_NF_%d_%s_N%d.h5' % (NLAY_deg, RHO_dist, N)
1716
+ NLAY_max = np.max(NLAY)
1717
+
1718
+ # Setup depth discretization
1719
+ z_min = 0
1720
+ nz = int(np.ceil((z_max - z_min) / dz)) + 1
1721
+ z = np.linspace(z_min, z_max, nz)
1722
+
1723
+ # Pre-allocate output arrays
1724
+ M_rho = np.zeros((N, nz), dtype=np.float32)
1725
+ nm_sparse = NLAY_max + NLAY_max - 1
1726
+
1727
+ if save_sparse:
1728
+ M_rho_sparse = np.ones((N, nm_sparse), dtype=np.float32) * np.nan
1729
+ else:
1730
+ M_rho_sparse = None
1731
+
1732
+ # Generate all layer boundaries at once
1733
+ max_boundaries = NLAY_max - 1
1734
+
1735
+ if max_boundaries > 0:
1736
+ # Generate random boundaries for all models at once
1737
+ # NOTE: Do NOT sort! Boundaries are at random locations
1738
+ z_boundaries_all = np.random.random((N, max_boundaries)) * z_max
1739
+ # Convert to indices
1740
+ i_boundaries_all = np.searchsorted(z, z_boundaries_all)
1741
+ i_boundaries_all = np.minimum(i_boundaries_all, nz - 1)
1742
+ else:
1743
+ z_boundaries_all = np.zeros((N, 0))
1744
+ i_boundaries_all = np.zeros((N, 0), dtype=int)
1745
+
1746
+ # Generate all resistivity values at once
1747
+ if RHO_dist == 'log-normal':
1748
+ rho_all = np.random.lognormal(mean=np.log(RHO_mean), sigma=np.log(RHO_std), size=(N, NLAY_max))
1749
+ elif RHO_dist == 'normal':
1750
+ rho_all = np.random.normal(loc=RHO_mean, scale=RHO_std, size=(N, NLAY_max))
1751
+ elif RHO_dist == 'log-uniform':
1752
+ rho_all = np.exp(np.random.uniform(np.log(RHO_min), np.log(RHO_max), (N, NLAY_max)))
1753
+ elif RHO_dist == 'uniform':
1754
+ rho_all = np.random.uniform(RHO_min, RHO_max, (N, NLAY_max))
1755
+
1756
+ # Ensure physical resistivity values (must be positive)
1757
+ # First clamp to threshold to handle zero/negative values
1758
+ rho_all = np.maximum(rho_all, RHO_threshold)
1759
+
1760
+ # Then clip to user-specified bounds, ensuring RHO_min is at least threshold
1761
+ effective_rho_min = max(RHO_min, RHO_threshold)
1762
+ rho_all = np.clip(rho_all, effective_rho_min, RHO_max)
1763
+
1764
+ # Assign resistivity values to depth profiles
1765
+ if showInfo > 0:
1766
+ from tqdm import tqdm
1767
+ iterator = tqdm(range(N), mininterval=1, desc='prior_layered', leave=False)
1768
+ else:
1769
+ iterator = range(N)
1770
+
1771
+ # Throttle callback to ~100 updates so it does not dominate runtime
1772
+ progress_step = max(1, N // 100)
1773
+
1774
+ for i in iterator:
1775
+ if progress_callback and ((i + 1) % progress_step == 0 or i + 1 == N):
1776
+ _report_progress(progress_callback, i + 1, N,
1777
+ 'generating', 'Generating prior realizations')
1778
+ n_lay = NLAY[i]
1779
+ n_boundaries = n_lay - 1
1780
+
1781
+ # Start with first layer resistivity
1782
+ M_rho[i, :] = rho_all[i, 0]
1783
+
1784
+ # Apply boundaries if any exist
1785
+ if n_boundaries > 0:
1786
+ boundaries = np.sort(i_boundaries_all[i, :n_boundaries])
1787
+ for j in range(n_boundaries):
1788
+ M_rho[i, boundaries[j]:] = rho_all[i, j + 1]
1789
+
1790
+ # Save sparse representation if requested
1791
+ if save_sparse:
1792
+ if n_boundaries > 0:
1793
+ z_sorted = np.sort(z_boundaries_all[i, :n_boundaries])
1794
+ m_current = np.concatenate((z_sorted, rho_all[i, :n_lay]))
1795
+ else:
1796
+ m_current = rho_all[i, :n_lay]
1797
+ M_rho_sparse[i, 0:len(m_current)] = m_current
1798
+
1799
+ if showInfo > 0:
1800
+ print("prior_model_layered: Saving prior model to %s" % f_prior_h5)
1801
+ _report_progress(progress_callback, N, N,
1802
+ 'saving', 'Saving prior model to %s' % f_prior_h5)
1803
+
1804
+ # Save to HDF5 file
1805
+ im = 0
1806
+
1807
+ # Extract compression parameters from kwargs if provided
1808
+ # Build a dict of save_prior_model kwargs
1809
+ save_kwargs = {}
1810
+ if 'compression' in kwargs:
1811
+ save_kwargs['compression'] = kwargs['compression']
1812
+ if 'compression_opts' in kwargs:
1813
+ save_kwargs['compression_opts'] = kwargs['compression_opts']
1814
+
1815
+ if showInfo > 1:
1816
+ print("Saving '/M1' prior model %s" % f_prior_h5)
1817
+ im = im + 1
1818
+ ig.save_prior_model(f_prior_h5, M_rho,
1819
+ im=im,
1820
+ name='resistivity',
1821
+ is_discrete=0,
1822
+ x=z,
1823
+ z=z,
1824
+ delete_if_exist=True,
1825
+ force_replace=True,
1826
+ showInfo=showInfo,
1827
+ **save_kwargs,
1828
+ )
1829
+
1830
+ if save_sparse:
1831
+ if showInfo > 1:
1832
+ print("Saving '/M2' prior model %s" % f_prior_h5)
1833
+ im = im + 1
1834
+ ig.save_prior_model(f_prior_h5, M_rho_sparse,
1835
+ im=im,
1836
+ name='sparse - depth-resistivity',
1837
+ is_discrete=0,
1838
+ x=np.arange(0, nm_sparse),
1839
+ z=np.arange(0, nm_sparse),
1840
+ force_replace=True,
1841
+ showInfo=showInfo,
1842
+ **save_kwargs,
1843
+ )
1844
+
1845
+ if showInfo > 1:
1846
+ print("Saving '/M%d' prior model %s" % (im,f_prior_h5))
1847
+ im = im + 1
1848
+ NLAY_2d = NLAY[:, np.newaxis] if NLAY.ndim == 1 else NLAY
1849
+ ig.save_prior_model(f_prior_h5, NLAY_2d.astype(np.float32),
1850
+ im=im,
1851
+ name='Number of layers',
1852
+ is_discrete=0,
1853
+ x=np.array([0]),
1854
+ z=np.array([0]),
1855
+ force_replace=True,
1856
+ showInfo=showInfo,
1857
+ **save_kwargs,
1858
+ )
1859
+
1860
+ _report_progress(progress_callback, N, N,
1861
+ 'completed', 'Prior model saved to %s' % f_prior_h5)
1862
+
1863
+ return f_prior_h5
1864
+
1865
def prior_model_workbench_direct(N=100000, RHO_dist='log-uniform', z1=0, z_max=100,
                                 nlayers=0, p=2, NLAY_min=3, NLAY_max=6,
                                 RHO_min=1, RHO_max=300, RHO_mean=180, RHO_std=80, chi2_deg=100,
                                 RHO_threshold=0.001, **kwargs):
    """
    Generate a prior model with increasingly thick layers.

    All models have the same number of layers! See also: prior_model_workbench.

    The depth nodes are ``z1 + (z_max - z1) * linspace(0, 1, nlayers) ** p`` and
    one resistivity value is drawn independently for every node of every model.
    The result is written as model parameter ``M1`` through
    ``integrate.save_prior_model``.

    Parameters
    ----------
    N : int, optional
        Number of prior models to generate. Default is 100000.
    RHO_dist : str, optional
        Distribution of resistivity within each layer. Options are 'log-uniform',
        'uniform', 'normal', 'log-normal' (alias 'lognormal'), and 'chi2'.
        Default is 'log-uniform'.
    z1 : float, optional
        Minimum depth value. Default is 0.
    z_max : float, optional
        Maximum depth value. Default is 100.
    nlayers : int, optional
        Number of layers. Default is 0 (uses 30 if less than 1).
    p : int, optional
        Power parameter for thickness increase. Default is 2.
    NLAY_min : int, optional
        Not used by this function; accepted so that the signature mirrors
        prior_model_workbench. Default is 3.
    NLAY_max : int, optional
        Not used by this function; accepted so that the signature mirrors
        prior_model_workbench. Default is 6.
    RHO_min : float, optional
        Minimum resistivity value. Default is 1.
    RHO_max : float, optional
        Maximum resistivity value. Default is 300.
    RHO_mean : float, optional
        Mean resistivity value. Only applicable if RHO_dist is 'normal' or
        'log-normal'. Default is 180.
    RHO_std : float, optional
        Standard deviation of resistivity value. Only applicable if RHO_dist is
        'normal' or 'log-normal' (where ``RHO_std / RHO_mean`` is used as the
        sigma of the underlying normal distribution). Default is 80.
    chi2_deg : int, optional
        Degrees of freedom for chi2 distribution. Only applicable if RHO_dist is
        'chi2'. Default is 100.
    RHO_threshold : float, optional
        Minimum physical resistivity threshold in Ohm·m. Any generated resistivity
        values below this threshold (including zero or negative values from 'normal'
        distribution) will be clamped to this value. Ensures physically realistic
        positive resistivity values. Default is 0.001 Ohm·m.
    f_prior_h5 : str, optional
        Path to the prior model file in HDF5 format. Default is '' (a name is
        then derived from the distribution parameters).
    showInfo : int, optional
        Level of verbosity for output.
    progress_callback : callable, optional
        Forwarded to ``_report_progress`` at the start, before saving and on
        completion.

    Returns
    -------
    str or None
        Filepath of the saved prior model, or None when called from a process
        other than the main process.

    Raises
    ------
    ValueError
        If RHO_dist is not one of the supported distributions.
    """
    # Safety guard: if somehow called from a worker process, do nothing.
    if multiprocessing.current_process().name != 'MainProcess':
        return None

    import integrate as ig

    showInfo = kwargs.get('showInfo', 0)
    f_prior_h5 = kwargs.get('f_prior_h5', '')
    progress_callback = kwargs.get('progress_callback', None)

    # Fully vectorized: only coarse phase updates are possible
    _report_progress(progress_callback, 0, 100,
                     'generating', 'Generating prior realizations')

    if nlayers < 1:
        nlayers = 30

    # Power-law spaced depth nodes between z1 and z_max
    z = z1 + (z_max - z1) * np.linspace(0, 1, nlayers) ** p
    nz = len(z)
    shape = (N, nz)

    # NOTE(review): the default file names below have no descriptive prefix
    # (e.g. '_R1_300.h5'); they are kept as-is so existing outputs keep their
    # names -- confirm whether a 'PRIOR_...' prefix was intended.
    if RHO_dist == 'uniform':
        M_rho = np.random.uniform(low=RHO_min, high=RHO_max, size=shape)
        default_name = '_R%g_%g.h5' % (RHO_min, RHO_max)
    elif RHO_dist == 'log-uniform':
        M_rho = np.exp(np.random.uniform(low=np.log(RHO_min), high=np.log(RHO_max), size=shape))
        default_name = '_R%g_%g.h5' % (RHO_min, RHO_max)
    elif RHO_dist == 'normal':
        M_rho = np.random.normal(loc=RHO_mean, scale=RHO_std, size=shape)
        default_name = '_R%g_%g.h5' % (RHO_mean, RHO_std)
    elif RHO_dist in ('log-normal', 'lognormal'):
        # 'lognormal' is the spelling used in the documentation and accepted by
        # prior_model_workbench, so both spellings are supported here.
        M_rho = np.random.lognormal(mean=np.log(RHO_mean), sigma=RHO_std / RHO_mean, size=shape)
        default_name = '_R%g_%g.h5' % (RHO_mean, RHO_std)
    elif RHO_dist == 'chi2':
        M_rho = np.random.chisquare(df=chi2_deg, size=shape)
        default_name = '_deg%d.h5' % chi2_deg
    else:
        raise ValueError('RHO_dist=%s not supported' % RHO_dist)

    if len(f_prior_h5) < 1:
        f_prior_h5 = default_name

    # Ensure physical resistivity values (must be positive)
    # First clamp to threshold to handle zero/negative values
    M_rho = np.maximum(M_rho, RHO_threshold)

    # Then clip to user-specified bounds, ensuring RHO_min is at least threshold
    effective_rho_min = max(RHO_min, RHO_threshold)
    M_rho = np.clip(M_rho, effective_rho_min, RHO_max)

    if showInfo > 0:
        print("prior_model_workbench_direct: Saving prior model to %s" % f_prior_h5)
    _report_progress(progress_callback, 80, 100,
                     'saving', 'Saving prior model to %s' % f_prior_h5)

    if showInfo > 1:
        print("Saving '/M1' prior model %s" % f_prior_h5)
    ig.save_prior_model(f_prior_h5, M_rho.astype(np.float32),
                        im=1,
                        name='Resistivity',
                        is_discrete=0,
                        x=z,
                        z=z,
                        delete_if_exist=True,
                        force_replace=True,
                        showInfo=showInfo,
                        )

    _report_progress(progress_callback, 100, 100,
                     'completed', 'Prior model saved to %s' % f_prior_h5)

    return f_prior_h5
1997
+
1998
+
1999
def prior_model_workbench(N=100000, p=2, z1=0, z_max=100, dz=1,
                          lay_dist='uniform', nlayers=0, NLAY_min=3, NLAY_max=6, NLAY_deg=5,
                          RHO_dist='log-uniform',
                          RHO_min=1, RHO_max=300, RHO_mean=180, RHO_std=80, chi2_deg=100,
                          RHO_threshold=0.001, **kwargs):
    """
    Generate a prior model with increasingly thick layers.

    For every realization a number of layers is drawn, the layer tops are placed
    at ``z1 + (z_max - z1) * linspace(0, 1, n) ** p`` and one resistivity is
    drawn per layer. Three model parameters are written through
    ``integrate.save_prior_model``:

    - ``M1``: resistivity sampled on a regular depth grid from 0 to z_max.
    - ``M2``: sparse representation ``[layer tops (all but the last), layer
      resistivities]``, padded with NaN up to the largest layer count.
    - ``M3``: number of layers of each realization.

    Parameters
    ----------
    N : int, optional
        Number of prior models to generate. Default is 100000.
    p : int, optional
        Power parameter for thickness increase. Default is 2.
    z1 : float, optional
        Minimum depth value. Default is 0.
    z_max : float, optional
        Maximum depth value. Default is 100.
    dz : float, optional
        Depth discretization step. Default is 1.
    lay_dist : str, optional
        Distribution of the number of layers. Options are 'chi2' and 'uniform'.
        Default is 'uniform'.
    nlayers : int, optional
        Number of layers. If greater than 0, sets both NLAY_min and NLAY_max
        to this value. Default is 0.
    NLAY_min : int, optional
        Minimum number of layers (values below 1 are raised to 1). Default is 3.
    NLAY_max : int, optional
        Maximum number of layers (raised to NLAY_min if smaller). Default is 6.
    NLAY_deg : int, optional
        Degrees of freedom for chi-square distribution. Only applicable if
        lay_dist is 'chi2'. Default is 5.
    RHO_dist : str, optional
        Distribution of resistivity within each layer. Options are 'log-uniform',
        'uniform', 'normal', 'log-normal' (alias 'lognormal'), and 'chi2'.
        Any other value falls back to 'log-uniform'. Default is 'log-uniform'.
    RHO_min : float, optional
        Minimum resistivity value. Default is 1.
    RHO_max : float, optional
        Maximum resistivity value. Default is 300.
    RHO_mean : float, optional
        Mean resistivity value. Only applicable if RHO_dist is 'normal' or
        'log-normal'. Default is 180.
    RHO_std : float, optional
        Standard deviation of resistivity value. Only applicable if RHO_dist is
        'normal' or 'log-normal'. Default is 80.
    chi2_deg : int, optional
        Degrees of freedom for chi2 distribution. Only applicable if RHO_dist is
        'chi2'. Default is 100.
    RHO_threshold : float, optional
        Minimum physical resistivity threshold in Ohm·m. Any generated resistivity
        values below this threshold (including zero or negative values from 'normal'
        distribution) will be clamped to this value. Ensures physically realistic
        positive resistivity values. Default is 0.001 Ohm·m.
    f_prior_h5 : str, optional
        Path to the prior model file in HDF5 format. Default is '' (a name is
        then derived from the layer and resistivity settings).
    showInfo : int, optional
        Level of verbosity for output. A negative value disables the progress bar.
    progress_callback : callable, optional
        Forwarded to ``_report_progress`` roughly 100 times during generation,
        before saving and on completion.

    Returns
    -------
    str or None
        Filepath of the saved prior model, or None when called from a process
        other than the main process.

    Raises
    ------
    ValueError
        If lay_dist is neither 'chi2' nor 'uniform'.
    """
    # Safety guard: if somehow called from a worker process, do nothing.
    if multiprocessing.current_process().name != 'MainProcess':
        return None

    from tqdm import tqdm
    import integrate as ig

    f_prior_h5 = kwargs.get('f_prior_h5', '')
    showInfo = kwargs.get('showInfo', 0)
    progress_callback = kwargs.get('progress_callback', None)

    if nlayers > 0:
        NLAY_min = nlayers
        NLAY_max = nlayers

    # Clamp the lower bound first so that the upper bound is compared against
    # the value that is actually used (otherwise e.g. NLAY_min=0, NLAY_max=-1
    # ends up as the empty range [1, 0]).
    if NLAY_min < 1:
        NLAY_min = 1
    if NLAY_max < NLAY_min:
        NLAY_max = NLAY_min

    if lay_dist == 'chi2':
        NLAY = np.ceil(np.random.chisquare(NLAY_deg, N)).astype(int)
        # A draw of exactly 0.0 would give a model without layers
        NLAY = np.maximum(NLAY, 1)
        if len(f_prior_h5) < 1:
            f_prior_h5 = 'PRIOR_WB_CHI2_NF_%d_%s_N%d.h5' % (NLAY_deg, RHO_dist, N)
        NLAY_max = np.max(NLAY)  # Update NLAY_max to accommodate chi2 distribution
    elif lay_dist == 'uniform':
        NLAY = np.random.randint(NLAY_min, NLAY_max + 1, N)
        # NOTE(review): unlike the chi2 case these default names carry no
        # '.h5' extension; left unchanged -- confirm whether that is intended.
        if len(f_prior_h5) < 1:
            if NLAY_min == NLAY_max:
                f_prior_h5 = 'PRIOR_WB_UNIFORM_%d_N%d_%s' % (NLAY_min, N, RHO_dist)
            else:
                f_prior_h5 = 'PRIOR_WB_UNIFORM_%d-%d_N%d_%s' % (NLAY_min, NLAY_max, N, RHO_dist)
    else:
        raise ValueError("lay_dist=%s not supported (use 'uniform' or 'chi2')" % lay_dist)

    # Force NLAY to be a 2 dimensional numpy array (for when exporting to HDF5)
    NLAY = NLAY[:, np.newaxis]

    z_min = 0
    # Ensure z_max is included in the array
    nz = int(np.ceil((z_max - z_min) / dz)) + 1
    z = np.linspace(z_min, z_max, nz)

    if showInfo > 1:
        print('z_min, z_max, dz, nz = %g, %g, %g, %d' % (z_min, z_max, dz, nz))
    M_rho = np.zeros((N, nz))

    # Sparse row layout: (n-1) layer tops followed by n resistivities
    nm_sparse = NLAY_max + NLAY_max - 1
    if showInfo > 1:
        print("nm_sparse", nm_sparse)
    M_rho_sparse = np.ones((N, nm_sparse)) * np.nan

    # Throttle callback to ~100 updates so it does not dominate runtime
    progress_step = max(1, N // 100)

    # Lower clipping bound is the same for every realization
    effective_rho_min = max(RHO_min, RHO_threshold)

    for i in tqdm(range(N), mininterval=1, disable=(showInfo < 0), desc='prior_workbench', leave=False):
        if progress_callback and ((i + 1) % progress_step == 0 or i + 1 == N):
            _report_progress(progress_callback, i + 1, N,
                             'generating', 'Generating prior realizations')

        n_lay = NLAY[i][0]
        # Layer tops for this realization (power-law spacing)
        z_single = z1 + (z_max - z1) * np.linspace(0, 1, n_lay) ** p

        if RHO_dist == 'uniform':
            M_rho_single = np.random.uniform(low=RHO_min, high=RHO_max, size=(1, n_lay))
        elif RHO_dist == 'log-uniform':
            M_rho_single = np.exp(np.random.uniform(low=np.log(RHO_min), high=np.log(RHO_max), size=(1, n_lay)))
        elif RHO_dist == 'normal':
            M_rho_single = np.random.normal(loc=RHO_mean, scale=RHO_std, size=(1, n_lay))
        elif RHO_dist in ('log-normal', 'lognormal'):
            M_rho_single = np.random.lognormal(mean=np.log(RHO_mean), sigma=RHO_std / RHO_mean, size=(1, n_lay))
        elif RHO_dist == 'chi2':
            M_rho_single = np.random.chisquare(df=chi2_deg, size=(1, n_lay))
        else:
            # Default to log-uniform if RHO_dist is not recognized
            M_rho_single = np.exp(np.random.uniform(low=np.log(RHO_min), high=np.log(RHO_max), size=(1, n_lay)))

        # Ensure physical resistivity values (must be positive)
        # First clamp to threshold to handle zero/negative values
        M_rho_single = np.maximum(M_rho_single, RHO_threshold)
        # Then clip to user-specified bounds, ensuring RHO_min is at least threshold
        M_rho_single = np.clip(M_rho_single, effective_rho_min, RHO_max)

        # Paint layers top-down: every layer overwrites the grid from its top
        # to the bottom, so deeper layers replace shallower ones below their top.
        for j in range(n_lay):
            M_rho[i, z >= z_single[j]] = M_rho_single[0, j]

        m_current = np.concatenate((z_single[0:-1].flatten(), M_rho_single.flatten()))
        M_rho_sparse[i, 0:len(m_current)] = m_current

    if showInfo > 0:
        print("prior_model_workbench: Saving prior model to %s" % f_prior_h5)
    _report_progress(progress_callback, N, N,
                     'saving', 'Saving prior model to %s' % f_prior_h5)

    if showInfo > 1:
        print("Saving '/M1' prior model %s" % f_prior_h5)
    ig.save_prior_model(f_prior_h5, M_rho.astype(np.float32),
                        im=1,
                        name='Resistivity',
                        is_discrete=0,
                        x=z,
                        z=z,
                        delete_if_exist=True,
                        force_replace=True,
                        showInfo=showInfo,
                        )

    if showInfo > 1:
        print("Saving '/M2' prior model %s" % f_prior_h5)
    ig.save_prior_model(f_prior_h5, M_rho_sparse.astype(np.float32),
                        im=2,
                        name='sparse - depth-resistivity',
                        is_discrete=0,
                        x=np.arange(0, nm_sparse),
                        z=np.arange(0, nm_sparse),
                        force_replace=True,
                        showInfo=showInfo,
                        )

    if showInfo > 1:
        print("Saving '/M3' prior model %s" % f_prior_h5)
    ig.save_prior_model(f_prior_h5, NLAY.astype(np.float32),
                        im=3,
                        name='Number of layers',
                        is_discrete=0,
                        x=np.array([0]),
                        z=np.array([0]),
                        force_replace=True,
                        showInfo=showInfo,
                        )

    _report_progress(progress_callback, N, N,
                     'completed', 'Prior model saved to %s' % f_prior_h5)

    # return the full filepath to f_prior_h5
    return f_prior_h5
2214
+
2215
+
2216
+
2217
def posterior_cumulative_thickness(f_post_h5, im=2, icat=[0], usePrior=False, **kwargs):
    """
    Calculate the posterior cumulative thickness based on the given inputs.

    For every data location the selected realizations of the discrete model
    parameter ``M[im]`` are read from the prior file referenced by the posterior
    file. For each realization the thickness of all depth intervals whose class
    matches one of the requested categories is summed, and the mean, median and
    standard deviation over the realizations are returned.

    Parameters
    ----------
    f_post_h5 : str
        Path to the input h5 file. Its root attributes 'f5_prior' and 'f5_data'
        name the prior and data files, and its '/i_use' dataset holds the
        selected realization indices per data location.
    im : int, optional
        Index of model parameter number, M[im]. Default is 2.
    icat : int or list, optional
        Category indices, i.e. positions into the 'class_id' attribute of
        M[im]. Default is [0].
    usePrior : bool, optional
        If True, ignore '/i_use' and use the first ``nr`` prior realizations
        for every location (``nr`` being the number of columns of '/i_use').
        Default is False.
    **kwargs : dict
        Additional keyword arguments. Accepted for backward compatibility;
        currently unused.

    Returns
    -------
    tuple or int
        -1 if M[im] is missing from the prior file, is not discrete, or has no
        'class_id' attribute. Otherwise a tuple containing the following
        elements:

        thick_mean : ndarray
            Array of mean cumulative thickness.
        thick_median : ndarray
            Array of median cumulative thickness.
        thick_std : ndarray
            Array of standard deviation of cumulative thickness.
        class_out : ndarray
            Class names of the requested categories (empty if the prior has no
            'class_name' attribute).
        X : ndarray
            Array of X values.
        Y : ndarray
            Array of Y values.
    """

    import h5py
    import integrate as ig

    # Accept a plain or numpy integer as well as any sequence of integers
    icat = np.atleast_1d(icat).astype(int)

    with h5py.File(f_post_h5, 'r') as f_post:
        f_prior_h5 = f_post['/'].attrs['f5_prior']
        f_data_h5 = f_post['/'].attrs['f5_data']

    X, Y, _LINE, _ELEVATION = ig.get_geometry(f_data_h5)

    Mstr = '/M%d' % im
    with h5py.File(f_prior_h5, 'r') as f_prior:
        if Mstr not in f_prior:
            print('No %s found in %s' % (Mstr, f_prior_h5))
            return -1
        attrs = f_prior[Mstr].attrs
        if not attrs['is_discrete']:
            print('M%d is not discrete' % im)
            return -1

        # Depth axis: prefer 'z', fall back to 'x' when 'z' is missing or
        # cannot be sliced.
        try:
            z = attrs['z'][:].flatten()
        except (KeyError, IndexError, TypeError):
            z = attrs['x'][:].flatten()

        if 'class_id' not in attrs.keys():
            # Without class ids there is nothing to match the model against
            print('No class_id found')
            return -1
        class_id = attrs['class_id'][:].flatten()

        if 'class_name' in attrs.keys():
            class_name = attrs['class_name'][:].flatten()
        else:
            class_name = np.array([])

        M_prior = f_prior[Mstr][:]

    nz = M_prior.shape[1]

    with h5py.File(f_post_h5, 'r') as f_post:
        i_use = f_post['/i_use'][:]

    ns, nr = i_use.shape

    # Thickness of each depth interval; the last depth node has no interval
    thick = np.diff(z)

    def _cumulative_thickness(rows):
        """Return, per realization in rows, the summed thickness of the selected classes."""
        m_sample = M_prior[rows, :][:, 0:nz - 1]
        cum_thick = np.zeros(len(rows))
        for ic in icat:
            i_match = (m_sample == class_id[ic]).astype(int)
            cum_thick = cum_thick + np.sum(i_match * thick, axis=1)
        return cum_thick

    thick_mean = np.zeros(ns)
    thick_median = np.zeros(ns)
    thick_std = np.zeros(ns)

    if usePrior:
        # Same realizations for every location, so evaluate once. The rows are
        # addressed directly (0..nr-1) instead of going through the "-1" offset
        # applied to stored indices, which would wrap index -1 to the last model.
        cum_thick = _cumulative_thickness(np.arange(nr))
        thick_mean[:] = np.mean(cum_thick)
        thick_median[:] = np.median(cum_thick)
        thick_std[:] = np.std(cum_thick)
    else:
        for i in range(ns):
            # NOTE(review): stored indices are treated as 1-based here -- confirm
            # against the code that writes '/i_use'.
            jj = i_use[i, :].astype(int) - 1
            cum_thick = _cumulative_thickness(jj)
            thick_mean[i] = np.mean(cum_thick)
            thick_median[i] = np.median(cum_thick)
            thick_std[i] = np.std(cum_thick)

    if len(class_name) > 0:
        class_out = class_name[icat]
    else:
        class_out = class_name

    return thick_mean, thick_median, thick_std, class_out, X, Y
2353
+
2354
+
2355
+ # # Import rejection sampling functions from separate module
2356
+ # # Note: These imports work when the package is properly installed
2357
+ # # For development, you may need to modify paths or use try/except
2358
+ # try:
2359
+ # from integrate.integrate_rejection import (
2360
+ # integrate_rejection,
2361
+ # integrate_rejection_range,
2362
+ # integrate_posterior_main,
2363
+ # integrate_posterior_chunk,
2364
+ # likelihood_gaussian_diagonal,
2365
+ # likelihood_gaussian_full,
2366
+ # likelihood_multinomial,
2367
+ # select_subset_for_inversion,
2368
+ # create_shared_memory,
2369
+ # reconstruct_shared_arrays,
2370
+ # cleanup_shared_memory
2371
+ # )
2372
+ # except ImportError:
2373
+ # # For development when running directly, try relative import
2374
+ # from .integrate_rejection import (
2375
+ # integrate_rejection,
2376
+ # integrate_rejection_range,
2377
+ # integrate_posterior_main,
2378
+ # integrate_posterior_chunk,
2379
+ # likelihood_gaussian_diagonal,
2380
+ # likelihood_gaussian_full,
2381
+ # likelihood_multinomial,
2382
+ # select_subset_for_inversion,
2383
+ # create_shared_memory,
2384
+ # reconstruct_shared_arrays,
2385
+ # cleanup_shared_memory
2386
+ # )
2387
+
2388
+
2389
+ # Functions moved to integrate_rejection.py
2390
+ # Functions moved to integrate_rejection.py have been removed
2391
+
2392
+ # All rejection sampling related functions have been moved to integrate_rejection.py
2393
+ # This includes:
2394
+ # - reconstruct_shared_arrays
2395
+ # - cleanup_shared_memory
2396
+ # - integrate_rejection
2397
+ # - integrate_rejection_range
2398
+ # - integrate_posterior_main
2399
+ # - integrate_posterior_chunk
2400
+ # - likelihood_gaussian_diagonal
2401
+ # - likelihood_gaussian_full
2402
+ # - likelihood_multinomial
2403
+ # - select_subset_for_inversion
2404
+ # moved to integrate_rejection.py
2405
+
2406
+
2407
+ # %% Synthetic data
2408
+
2409
def _interpolate_resistivity(rho_values, nx):
    """
    Spread control-point resistivities over a profile of nx positions.

    Parameters
    ----------
    rho_values : array_like
        Resistivity at the control points:
        - one value [v]: the whole profile gets v
        - two values [v1, v2]: linear ramp from the left end to the right end
        - several values [v1, ..., vN]: piecewise-linear through control
          points placed evenly between the first and the last position
    nx : int
        Number of positions along the profile.

    Returns
    -------
    ndarray
        Array of length nx with the resistivity at every position.
    """
    control = np.atleast_1d(rho_values)

    # A single control point means a constant profile
    if len(control) == 1:
        return np.full(nx, control[0])

    # Control points are spread evenly over the index range [0, nx-1]
    knots = np.linspace(0, nx - 1, len(control))
    positions = np.arange(nx)
    return np.interp(positions, knots, control)
2439
+
2440
+
2441
+ def synthetic_case(case='Wedge', **kwargs):
2442
+ """
2443
+ Generate synthetic geological models for different cases.
2444
+
2445
+ This function creates synthetic 2D geological models for testing and validation
2446
+ purposes. Supports 'Wedge' and '3Layer' model types with customizable parameters.
2447
+
2448
+ Parameters
2449
+ ----------
2450
+ case : str, optional
2451
+ The type of synthetic case to generate. Options are 'Wedge' and '3Layer'.
2452
+ Default is 'Wedge'.
2453
+ showInfo : int, optional
2454
+ If greater than 0, print information about the generated case. Default is 0.
2455
+ rho_1 : list or array_like, optional
2456
+ Resistivity values for layer 1 along the profile. If a single value [v],
2457
+ resistivity is constant at v. If multiple values [v1, v2, ...], resistivity
2458
+ varies from v1 (left) to v2 (middle) to vN (right) using interpolation.
2459
+ Only used when rho_1, rho_2, and rho_3 are all provided.
2460
+ rho_2 : list or array_like, optional
2461
+ Resistivity values for layer 2 along the profile. Same format as rho_1.
2462
+ rho_3 : list or array_like, optional
2463
+ Resistivity values for layer 3 along the profile. Same format as rho_1.
2464
+ x_max : int, optional
2465
+ Maximum x-dimension size. Default is 1000 for 'Wedge', 100 for '3Layer'.
2466
+ dx : float, optional
2467
+ Step size in the x-dimension.
2468
+ z_max : int, optional
2469
+ Maximum z-dimension size. Default is 90 for 'Wedge', 60 for '3Layer'.
2470
+ dz : float, optional
2471
+ Step size in the z-dimension. Default is 1.
2472
+ z1 : float, optional
2473
+ Depth at which the wedge starts ('Wedge': default z_max/10) or first layer
2474
+ ends ('3Layer': default z_max/3).
2475
+ rho : list, optional
2476
+ Density values for different layers ('Wedge' case). Default is [100, 200, 120].
2477
+ Overridden by rho_1, rho_2, rho_3 if all three are provided.
2478
+ wedge_angle : float, optional
2479
+ Angle of the wedge in degrees ('Wedge' case). Default is 1.
2480
+ x_range : float, optional
2481
+ Range in the x-dimension for the cosine function ('3Layer' case).
2482
+ Default is x_max/4.
2483
+ z_thick : float, optional
2484
+ Thickness of the second layer ('3Layer' case). Default is z_max/2.
2485
+ rho1_1 : float, optional
2486
+ Density at the start of the first layer ('3Layer'). Default is 120.
2487
+ rho1_2 : float, optional
2488
+ Density at the end of the first layer ('3Layer'). Default is 10.
2489
+ rho2_1 : float, optional
2490
+ Density at the start of the second layer ('3Layer'). Default is rho1_2.
2491
+ rho2_2 : float, optional
2492
+ Density at the end of the second layer ('3Layer'). Default is rho1_1.
2493
+ rho3 : float, optional
2494
+ Density of the third layer ('3Layer'). Default is 120.
2495
+
2496
+ Returns
2497
+ -------
2498
+ M : ndarray
2499
+ The generated synthetic resistivity model of shape (nx, nz).
2500
+ x : ndarray
2501
+ X-coordinates of the model.
2502
+ z : ndarray
2503
+ Z-coordinates (depth) of the model.
2504
+ M_ref_lith : ndarray
2505
+ Lithology/layer number for each pixel, same shape as M. Values are 1, 2, 3
2506
+ corresponding to the layer number.
2507
+ layer_depths : ndarray
2508
+ Depth to the top of layers 1, 2, and 3 for each trace, shape (nx, 3).
2509
+ Column 0: depth to top of layer 1 (always 0)
2510
+ Column 1: depth to top of layer 2
2511
+ Column 2: depth to top of layer 3
2512
+
2513
+ Examples
2514
+ --------
2515
+ >>> # Constant resistivity in each layer
2516
+ >>> M, x, z, M_lith, depths = ig.synthetic_case(case='3layer', rho_1=[10], rho_2=[80], rho_3=[10])
2517
+ >>>
2518
+ >>> # Linear variation from left to right
2519
+ >>> M, x, z, M_lith, depths = ig.synthetic_case(case='3layer', rho_1=[10, 80], rho_2=[80, 10], rho_3=[10, 10])
2520
+ >>>
2521
+ >>> # Three-point variation (left, middle, right)
2522
+ >>> M, x, z, M_lith, depths = ig.synthetic_case(case='3layer', rho_1=[10, 80, 10], rho_2=[50, 100, 50], rho_3=[10, 10, 10])
2523
+ """
2524
+
2525
+ showInfo = kwargs.get('showInfo', 0)
2526
+
2527
+ # Check if rho_1, rho_2, rho_3 are all provided
2528
+ rho_1 = kwargs.get('rho_1', None)
2529
+ rho_2 = kwargs.get('rho_2', None)
2530
+ rho_3 = kwargs.get('rho_3', None)
2531
+ use_rho_arrays = (rho_1 is not None) and (rho_2 is not None) and (rho_3 is not None)
2532
+
2533
+ if case.lower() == 'wedge':
2534
+ # Create synthetic wedge model
2535
+
2536
+ # variables
2537
+ x_max = kwargs.get('x_max', 1000)
2538
+ dx = kwargs.get('dx', 1000./x_max)
2539
+ z_max = kwargs.get('z_max', 90)
2540
+ dz = kwargs.get('dz', 1)
2541
+ z1 = kwargs.get('z1', z_max/10)
2542
+ rho = kwargs.get('rho', [100,200,120])
2543
+ wedge_angle = kwargs.get('wedge_angle', 1)
2544
+
2545
+ if showInfo>0:
2546
+ print('Creating synthetic %s case with wedge angle=%f' % (case,wedge_angle))
2547
+
2548
+ z = np.arange(0,z_max,dz)
2549
+ x = np.arange(0,x_max,dx)
2550
+
2551
+ nx = x.shape[0]
2552
+ nz = z.shape[0]
2553
+
2554
+ # Initialize M and M_ref_lith
2555
+ M = np.zeros((nx,nz))
2556
+ M_ref_lith = np.ones((nx,nz), dtype=int) # Layer 1 by default
2557
+
2558
+ # Initialize layer depths array (nx, 3)
2559
+ layer_depths = np.zeros((nx, 3))
2560
+
2561
+ if use_rho_arrays:
2562
+ # Convert to numpy arrays
2563
+ rho_1 = np.atleast_1d(rho_1)
2564
+ rho_2 = np.atleast_1d(rho_2)
2565
+ rho_3 = np.atleast_1d(rho_3)
2566
+
2567
+ # Interpolate resistivity values along the profile
2568
+ rho1_interp = _interpolate_resistivity(rho_1, nx)
2569
+ rho2_interp = _interpolate_resistivity(rho_2, nx)
2570
+ rho3_interp = _interpolate_resistivity(rho_3, nx)
2571
+ else:
2572
+ # Use constant values from rho array
2573
+ rho1_interp = np.full(nx, rho[0])
2574
+ rho2_interp = np.full(nx, rho[1])
2575
+ rho3_interp = np.full(nx, rho[2])
2576
+
2577
+ # Build the model
2578
+ for ix in range(nx):
2579
+ # Layer 1 (top layer) - always starts at depth 0
2580
+ M[ix,:] = rho1_interp[ix]
2581
+ M_ref_lith[ix,:] = 1
2582
+ layer_depths[ix, 0] = 0 # Layer 1 starts at surface
2583
+
2584
+ # Layer 2 (wedge) - starts at z1
2585
+ wedge_angle_rad = np.deg2rad(wedge_angle)
2586
+ z2 = z1 + x[ix]*np.tan(wedge_angle_rad)
2587
+ iz2 = np.where((z>=z1) & (z<=z2))[0]
2588
+ M[ix,iz2] = rho2_interp[ix]
2589
+ M_ref_lith[ix,iz2] = 2
2590
+ layer_depths[ix, 1] = z1 # Layer 2 starts at z1
2591
+
2592
+ # Layer 3 (bottom layer, below wedge)
2593
+ iz3 = np.where(z>=z1)[0]
2594
+ M[ix,iz3] = rho3_interp[ix]
2595
+ M_ref_lith[ix,iz3] = 3
2596
+ layer_depths[ix, 2] = z2 # Layer 3 starts at bottom of wedge
2597
+
2598
+ return M, x, z, M_ref_lith, layer_depths
2599
+
2600
+ elif case.lower() == '3layer':
2601
+ # Create synthetic 3 layer model
2602
+
2603
+ # variables
2604
+ x_max = kwargs.get('x_max', 100)
2605
+ x_range = kwargs.get('x_range', x_max/4)
2606
+ dx = kwargs.get('dx', 1)
2607
+ z_max = kwargs.get('z_max', 60)
2608
+ dz = kwargs.get('dz', 1)
2609
+ z1 = kwargs.get('z1', z_max/3)
2610
+ z_thick = kwargs.get('z_thick', z_max/2)
2611
+
2612
+ rho1_1 = kwargs.get('rho1_1', 120)
2613
+ rho1_2 = kwargs.get('rho1_2', 10)
2614
+ rho2_1 = kwargs.get('rho2_1', rho1_2)
2615
+ rho2_2 = kwargs.get('rho2_2', rho1_1)
2616
+ rho3 = kwargs.get('rho3', 120)
2617
+
2618
+ if showInfo>0:
2619
+ print('Creating synthetic %s case' % case)
2620
+
2621
+ z = np.arange(0,z_max,dz)
2622
+ x = np.arange(0,x_max,dx)
2623
+
2624
+ nx = x.shape[0]
2625
+ nz = z.shape[0]
2626
+
2627
+ # Initialize M and M_ref_lith
2628
+ M = np.zeros((nx,nz))
2629
+ M_ref_lith = np.zeros((nx,nz), dtype=int)
2630
+
2631
+ # Initialize layer depths array (nx, 3)
2632
+ layer_depths = np.zeros((nx, 3))
2633
+
2634
+ if use_rho_arrays:
2635
+ # Convert to numpy arrays
2636
+ rho_1 = np.atleast_1d(rho_1)
2637
+ rho_2 = np.atleast_1d(rho_2)
2638
+ rho_3 = np.atleast_1d(rho_3)
2639
+
2640
+ # Interpolate resistivity values along the profile
2641
+ rho1_interp = _interpolate_resistivity(rho_1, nx)
2642
+ rho2_interp = _interpolate_resistivity(rho_2, nx)
2643
+ rho3_interp = _interpolate_resistivity(rho_3, nx)
2644
+
2645
+ # Build model with variable resistivity
2646
+ for ix in range(nx):
2647
+ # Layer 3 (bottom layer) - default
2648
+ M[ix,:] = rho3_interp[ix]
2649
+ M_ref_lith[ix,:] = 3
2650
+
2651
+ # Layer 1 (top layer) - starts at surface
2652
+ iz1 = np.where(z<=z1)[0]
2653
+ M[ix,iz1] = rho1_interp[ix]
2654
+ M_ref_lith[ix,iz1] = 1
2655
+ layer_depths[ix, 0] = 0 # Layer 1 starts at surface
2656
+
2657
+ # Layer 2 (middle layer with varying thickness) - starts at z1
2658
+ z2 = z1 + z_thick*0.5*(1+np.cos(np.pi+x[ix]/(x_range)*np.pi))
2659
+ iz2 = np.where((z>=z1) & (z<=z2))[0]
2660
+ M[ix,iz2] = rho2_interp[ix]
2661
+ M_ref_lith[ix,iz2] = 2
2662
+ layer_depths[ix, 1] = z1 # Layer 2 starts at z1
2663
+
2664
+ # Layer 3 depth
2665
+ layer_depths[ix, 2] = z2 # Layer 3 starts at z2
2666
+ else:
2667
+ # Use original linear variation from rho1_1 to rho1_2
2668
+ for ix in range(nx):
2669
+ # Layer 3 (bottom layer) - default
2670
+ M[ix,:] = rho3
2671
+ M_ref_lith[ix,:] = 3
2672
+
2673
+ # Layer 1 (top layer) - starts at surface
2674
+ iz1 = np.where(z<=z1)[0]
2675
+ rho1 = rho1_1 + (rho1_2 - rho1_1) * x[ix]/x_max
2676
+ M[ix,iz1] = rho1
2677
+ M_ref_lith[ix,iz1] = 1
2678
+ layer_depths[ix, 0] = 0 # Layer 1 starts at surface
2679
+
2680
+ # Layer 2 (middle layer with varying thickness) - starts at z1
2681
+ z2 = z1 + z_thick*0.5*(1+np.cos(np.pi+x[ix]/(x_range)*np.pi))
2682
+ rho2 = rho2_1 + (rho2_2 - rho2_1) * x[ix]/x_max
2683
+ iz2 = np.where((z>=z1) & (z<=z2))[0]
2684
+ M[ix,iz2] = rho2
2685
+ M_ref_lith[ix,iz2] = 2
2686
+ layer_depths[ix, 1] = z1 # Layer 2 starts at z1
2687
+
2688
+ # Layer 3 depth
2689
+ layer_depths[ix, 2] = z2 # Layer 3 starts at z2
2690
+
2691
+ return M, x, z, M_ref_lith, layer_depths
2692
+
2693
+
2694
+ ####################################
2695
+ ## MISC
2696
+
2697
def comb_cprob(pA, pAgB, pAgC, tau=1.0):
    """
    Combine two conditional probabilities using permanence of updating ratios.

    Implements the combination rule described by Journel (2002): the prior
    probability and the two single-source conditional probabilities are
    converted to "distances" of the form (1 - p) / p and combined as

        x = b * (c / a) ** tau,    Prob(A|B,C) = 1 / (1 + x)

    Parameters
    ----------
    pA : array_like
        Probability of event A.
    pAgB : array_like
        Conditional probability of A given B.
    pAgC : array_like
        Conditional probability of A given C.
    tau : float, optional
        Exponent applied to the updating ratio c / a (default=1.0).

    Returns
    -------
    numpy.ndarray or numpy.float64
        Combined conditional probability Prob(A|B,C). The inputs are
        broadcast against each other following the usual NumPy rules.

    Notes
    -----
    Probabilities equal to 0 lead to divisions by zero; NumPy then emits a
    RuntimeWarning and propagates inf/nan instead of raising.

    References
    ----------
    Journel, An Alternative to Traditional Data Independence Hypotheses,
    Mathematical Geology, 2002
    """
    # Accept any array_like input (lists, tuples, scalars), as documented.
    pA = np.asarray(pA, dtype=float)
    pAgB = np.asarray(pAgB, dtype=float)
    pAgC = np.asarray(pAgC, dtype=float)

    # Distances to the event occurring: (1 - p) / p
    a = (1 - pA) / pA
    b = (1 - pAgB) / pAgB
    c = (1 - pAgC) / pAgC

    # Combined probability from the combined distance b * (c / a) ** tau
    pAgBC = 1 / (1 + b * (c / a) ** tau)

    return pAgBC
2735
+
2736
def kl_divergence(prior_sample, posterior_sample, is_discrete=False, log_base=None):
    """
    Compute KL divergence D_KL(posterior || prior) for one or multiple parameters.

    With p the prior distribution and q the posterior distribution, the value
    returned is sum(q * log(q / p)), i.e. the information gained when moving
    from the prior to the posterior.

    Parameters
    ----------
    prior_sample : array, shape (n_realizations,) or (n_realizations, n_params)
    posterior_sample : array, shape (n_realizations,) or (n_realizations, n_params)
        If 2D, KL is computed per column and a 1D array of size n_params is returned.
    is_discrete : bool, optional
        If True, treat the parameter as discrete and use empirical class
        frequencies. If False, both distributions are estimated with a
        Gaussian kernel density estimate. Default is False.
    log_base : int, float, or None
        Base of the logarithm. None uses natural log (nats).
        For discrete parameters with a uniform prior over N classes, passing
        log_base=N normalizes the result to [0, 1], where 1 means complete
        certainty.

    Returns
    -------
    float or numpy.ndarray
        KL divergence value(s).
    """
    prior_sample = np.asarray(prior_sample)
    posterior_sample = np.asarray(posterior_sample)

    if prior_sample.ndim == 2:
        # One KL value per parameter (column)
        return np.array([
            kl_divergence(prior_sample[:, i], posterior_sample[:, i],
                          is_discrete=is_discrete, log_base=log_base)
            for i in range(prior_sample.shape[1])
        ])

    # Change-of-base divisor: ln(base). If None, divisor=1 (natural log).
    log_divisor = np.log(log_base) if log_base is not None else 1.0

    # Small constant that keeps the ratios finite where a distribution is ~0
    eps = 1e-10

    if is_discrete:
        # 1. Empirical counts of every class seen in either sample
        all_vals = np.unique(np.concatenate([prior_sample, posterior_sample]))
        p = np.array([np.sum(prior_sample == v) for v in all_vals])
        q = np.array([np.sum(posterior_sample == v) for v in all_vals])

        # 2. Add small smoothing to avoid zero probabilities
        p = (p + eps) / (np.sum(p) + len(all_vals) * eps)
        q = (q + eps) / (np.sum(q) + len(all_vals) * eps)

        # 3. Discrete KL: D_KL(posterior || prior) = sum q*log(q/p)
        return np.sum(q * np.log(q / p)) / log_divisor

    # Continuous KL using KDE (scipy is only needed for this branch)
    from scipy.stats import gaussian_kde

    kde_p = gaussian_kde(prior_sample)
    kde_q = gaussian_kde(posterior_sample)

    # Evaluate both densities on a common grid spanning both samples
    grid = np.linspace(min(prior_sample.min(), posterior_sample.min()),
                       max(prior_sample.max(), posterior_sample.max()), 1000)
    p_vals = kde_p(grid)
    q_vals = kde_q(grid)

    # Normalize so both sum to one over the grid
    p_vals = p_vals / p_vals.sum()
    q_vals = q_vals / q_vals.sum()

    # Grid points where q underflowed to 0 contribute 0 (limit of q*log q);
    # skipping them avoids 0*log(0) = nan.
    support = q_vals > 0
    q_s = q_vals[support]
    p_s = p_vals[support]

    # Numerical integration: D_KL(posterior || prior) = sum q*log(q/p)
    return np.sum(q_s * np.log(q_s / (p_s + eps))) / log_divisor
2801
+
2802
+
2803
def entropy(P, base = None):
    """
    Calculate the entropy of a discrete probability distribution.

    The entropy is calculated using the formula:
    H(P) = -sum(P_i * log_b(P_i))

    Parameters
    ----------
    P : numpy.ndarray
        Probability distribution. Can be a 1D or 2D array.
        If 2D, each row represents a different distribution.
    base : int, optional
        The logarithm base to use. If None, uses the number of elements
        in the probability distribution (P.shape[1]). Default is None.

    Returns
    -------
    numpy.ndarray
        1D array with one entropy value per row of P. A 1D input is treated
        as a single row, so the result then has length 1.

    Notes
    -----
    - Input probabilities are assumed to be normalized (sum to 1)
    - Zero probabilities contribute 0 to the sum (the limit of p*log(p) as
      p -> 0), so fully certain distributions give an entropy of 0
    - For 2D input, entropy is calculated row-wise

    Examples
    --------
    >>> P = np.array([0.5, 0.5])
    >>> entropy(P)
    array([1.])

    >>> P = np.array([[0.5, 0.5], [0.1, 0.9]])
    >>> entropy(P)
    array([1.        , 0.46899559])
    """
    P = np.atleast_2d(P)
    if base is None:
        base = P.shape[1]

    # P*log(P) evaluates to 0*(-inf) = nan where P == 0. Compute it with the
    # warnings silenced and replace exactly those entries by their limit, 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(P == 0, 0.0, P*np.log(P))

    H = -np.sum(terms, axis=1)/np.log(base)
    return H
2846
+
2847
def class_id_to_idx(D, class_id=None):
    """
    Map class identifiers in an array to their positions in a class list.

    Every element of `D` that equals `class_id[k]` is replaced by the
    integer `k`. When `class_id` is not supplied, the sorted unique values
    of `D` are used as the class list. Elements of `D` that match none of
    the entries in `class_id` are left at index 0.

    Parameters
    ----------
    D : numpy.ndarray
        Array containing class identifiers.
    class_id : numpy.ndarray, optional
        Class identifiers, in the order that defines the indices. If None,
        `np.unique(D)` is used. Default is None.

    Returns
    -------
    tuple
        A tuple containing the following elements:

        D_idx : numpy.ndarray
            Integer array with the same shape as `D` holding the indices.
        class_id : numpy.ndarray
            The class identifiers that were used for the mapping.
        class_id_out : numpy.ndarray
            Sorted unique index values present in `D_idx`.
    """
    labels = np.unique(D) if class_id is None else class_id

    # Start from index 0 everywhere and overwrite class by class
    D_idx = np.zeros(D.shape, dtype=int)
    for position, label in enumerate(labels):
        D_idx[D == label] = position

    return D_idx, labels, np.unique(D_idx)
2886
+
2887
+
2888
+
2889
def get_hypothesis_probability(f_post_h5_arr, T=1):
    """
    Compute per-sample hypothesis probabilities from posterior evidence files.

    The '/EV' dataset is read from every HDF5 file in `f_post_h5_arr` (one
    file per hypothesis). For each sample the evidences are shifted by their
    maximum over the hypotheses, scaled by 1/T, exponentiated and normalized
    to sum to one.

    Parameters
    ----------
    f_post_h5_arr : list of str
        File paths to HDF5 files containing posterior evidence values.
        Each file should have an '/EV' dataset.
    T : float, optional
        Annealing temperature; the shifted evidence is multiplied by 1/T
        before exponentiation, so higher temperatures give more uniform
        distributions. Default is 1.

    Returns
    -------
    tuple
        A tuple containing the following elements:

        P : numpy.ndarray
            Normalized probabilities for each hypothesis (shape: n_hypothesis, n_samples).
        EV_all : numpy.ndarray
            Evidence values for each hypothesis and sample (shape: n_hypothesis, n_samples).
        MODE_hypothesis : numpy.ndarray
            Index of the most probable hypothesis per sample, stored as
            floats (shape: n_samples).
        ENT_hypothesis : numpy.ndarray
            Entropy of the hypothesis distribution per sample, computed with
            logarithm base n_hypothesis (shape: n_samples).

    Notes
    -----
    Subtracting the per-sample maximum evidence before exponentiating keeps
    the largest term at exp(0) = 1, which avoids numerical underflow of the
    whole column.
    """

    from scipy import stats

    n_hypothesis = len(f_post_h5_arr)

    # One evidence array per hypothesis file, stacked along the first axis
    evidence_per_file = []
    for path in f_post_h5_arr:
        with h5py.File(path, 'r') as h5:
            evidence_per_file.append(h5['/EV'][()])
    EV_all = np.array(evidence_per_file)

    # Tempered, max-shifted evidence -> unnormalized probabilities
    shifted = EV_all - np.max(EV_all, axis=0)
    P = np.exp((1/T)*shifted)
    # Normalize each column to sum to 1 using NumPy broadcasting
    P = P / np.sum(P, axis=0, keepdims=True)

    n_samples = P.shape[1]
    ENT_hypothesis = np.zeros(n_samples)
    MODE_hypothesis = np.zeros(n_samples)

    for col in range(n_samples):
        column = P[:,col]
        # entropy of the hypothesis distribution for this sample
        ENT_hypothesis[col] = stats.entropy(column, base=n_hypothesis)
        # index of the hypothesis with the maximum probability
        MODE_hypothesis[col] = np.argmax(column, axis=0)

    return P, EV_all, MODE_hypothesis, ENT_hypothesis
2956
+
2957
+
2958
+
2959
def sample_posterior_multiple_hypotheses(f_post_h5_arr, P_hypothesis=None):
    """
    Sample posterior models from multiple hypotheses.

    For every posterior file the function reads the root attributes
    'f5_prior' and 'f5_data' and the '/i_use' dataset, and loads the prior
    models with `ig.load_prior_model`. For each sounding it then draws,
    without replacement, a number of the accepted prior indices of every
    hypothesis that is proportional to that hypothesis' probability, and
    stacks the corresponding prior realizations.

    Parameters
    ----------
    f_post_h5_arr : list of str
        List of paths to HDF5 files containing posterior models for different hypotheses.
    P_hypothesis : numpy.ndarray, optional
        Array of shape (n_hypotheses, n_soundings) containing probability of each
        hypothesis for each sounding. If None, uniform probabilities are used;
        the number of soundings is then taken from the data file referenced by
        the last posterior file in `f_post_h5_arr`.
        Default is None.

    Returns
    -------
    list of numpy.ndarray
        List of posterior model arrays, one per model parameter. Each array has
        shape (n_soundings, n_samples, n_parameters), where n_samples is
        determined by the first hypothesis's number of samples.

    Notes
    -----
    - The per-hypothesis sample counts are rounded; if their sum falls short
      of the first hypothesis' sample count the deficit is added to the first
      hypothesis, and if it exceeds it the stacked samples are truncated.
    - The number of model parameters and their widths are taken from the prior
      loaded for the LAST hypothesis (`M` after the loading loop).
    - Sampling uses `np.random.choice`, so results depend on the global NumPy
      random state.
    - Failures while storing a sounding are printed and otherwise ignored
      (bare `except`), which leaves that sounding as zeros.
    """

    import numpy as np
    import h5py
    import integrate as ig

    f_prior_h5_arr = []
    M_all = []       # prior models, one entry per hypothesis
    i_use_all = []   # '/i_use' index arrays, one entry per hypothesis
    n_use_all = []   # second dimension of '/i_use' (samples per sounding), per hypothesis
    for ip in range(len(f_post_h5_arr)):
        f_post_h5 = f_post_h5_arr[ip]
        with h5py.File(f_post_h5, 'r') as f_post:
            f_prior_h5 = f_post['/'].attrs['f5_prior']
            f_data_h5 = f_post['/'].attrs['f5_data']
            f_prior_h5_arr.append(f_prior_h5)

            i_use = f_post['/i_use'][:]
            i_use_all.append(i_use)
            n_use_all.append(i_use.shape[1])

        print("loading prior model %s" % f_prior_h5)
        M, idx = ig.load_prior_model(f_prior_h5)
        M_all.append(M)
    print(f_prior_h5_arr)
    # Stacked to (n_hypotheses, n_soundings, n_use); indexed as [:, i, :] below
    i_use_all = np.array(i_use_all)
    n_use_all = np.array(n_use_all)

    if P_hypothesis is None:
        print('Using unform hypothesis probability')
        # f_data_h5 is the data file referenced by the last posterior file read above
        D = ig.load_data(f_data_h5)
        n_soundings = D['d_obs'][0].shape[0]

        # Equal weight for every hypothesis at every sounding
        P_hypothesis = np.ones((len(f_post_h5_arr), n_soundings))
        P_hypothesis = P_hypothesis/np.sum(P_hypothesis, axis=0)

    # def sample_posterior_multipleh_hypotheses(f_post_h5_arr, P_hypothesis, n_post=100):
    nsoundings = P_hypothesis.shape[1]
    # If different hypotheses have different number of realizations..
    #for i in range(nsoundings):


    M_post_arr = []

    # M is the prior model list of the last hypothesis loaded above
    for im in range(len(M)):
        print("im=%d/%d" % (im+1,len(M)))
        nm = M[im].shape[1]
        # Output for this model parameter: (sounding, sample, parameter value)
        M_post = np.zeros((nsoundings, n_use_all[0],nm))

        for i in range(nsoundings):
            # get the probability of each hypothesis for this sounding
            P_hypothesis_is = P_hypothesis[:,i]
            i_use = i_use_all[:,i,:]

            # Number of samples to draw from each hypothesis
            n_use = P_hypothesis_is*n_use_all
            n_use = np.round(n_use).astype(int)
            #print(n_use)
            n_sum = np.sum(n_use)
            # make sure that the sum of n_use is equal to the number of realizations
            delta_n = n_use_all[0]-n_sum
            if delta_n > 0:
                # Rounding left too few samples: take the rest from hypothesis 0
                n_use[0] = n_use[0] + delta_n
            elif delta_n < 0:
                # Too many samples: nothing is adjusted here, the surplus is
                # cut off when M_sounding is sliced below
                pass
                #n_use[0] = n_use[0] - np.abs(delta_n)
            n_sum = np.sum(n_use)

            M_dummy = []
            for j in range(len(n_use)):
                # use the first realizations from i_use[j]
                #i_use_single = i_use[j,:n_use[j]]
                # use n_use[j] random realizations from iuse[j]
                #print(' j=%d, n_use=%d' % (j,n_use[j]))
                if n_use[j]>0:
                    # Draw without replacement; raises ValueError if n_use[j]
                    # exceeds the number of entries in i_use[j]
                    i_use_single = np.random.choice(i_use[j], n_use[j], replace=False)
                    M_dummy.append(M_all[j][im][i_use_single,:])

            M_sounding = np.concatenate(M_dummy, axis=0)
            try:
                # take the first n_use_all[0] realizations
                M_post[i]=M_sounding[:n_use_all[0]]
            except:
                # Shape mismatch (or any other error): report and try a fallback slice
                print('i=%d, [%d], n_sum=%d, delta_n=%d, n_use_all[0]=%d' % (i,M_sounding.shape[0],n_sum,delta_n,n_use_all[0]))
                try:
                    M_post[i]=M_sounding[:,:n_use[0]]
                except:
                    # NOTE(review): errors are silently dropped; M_post[i] stays zero
                    pass

        M_post_arr.append(M_post)

    return M_post_arr
3078
+
3079
+
3080
+ # %% TIMING FUNCTIONS
3081
+ # Functions moved from integrate_timing_cli.py
3082
+
3083
def allocate_large_page():
    """
    Allocate a 2MB large page if running on Windows.

    Returns
    -------
    int or None
        Pointer to allocated memory on success, None on failure or non-Windows systems.

    Notes
    -----
    Large pages can improve performance but require specific Windows privileges.
    The Win32 documentation for VirtualAlloc states that MEM_LARGE_PAGES must
    be combined with both MEM_RESERVE and MEM_COMMIT, so all three flags are
    passed. The memory is not released by this function.
    """
    import os
    import ctypes

    if os.name != "nt":
        print("Large pages are only supported on Windows.")
        return None

    kernel32 = ctypes.windll.kernel32
    # LPVOID VirtualAlloc(LPVOID lpAddress, SIZE_T dwSize,
    #                     DWORD flAllocationType, DWORD flProtect)
    # Declaring the prototype keeps pointer and size arguments 64-bit clean.
    kernel32.VirtualAlloc.restype = ctypes.c_void_p
    kernel32.VirtualAlloc.argtypes = [ctypes.c_void_p, ctypes.c_size_t,
                                      ctypes.c_ulong, ctypes.c_ulong]

    LARGE_PAGE_SIZE = 2 * 1024 * 1024  # 2MB

    MEM_COMMIT = 0x1000
    MEM_RESERVE = 0x2000
    MEM_LARGE_PAGES = 0x20000000
    PAGE_READWRITE = 0x04

    ptr = kernel32.VirtualAlloc(None, LARGE_PAGE_SIZE,
                                MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
                                PAGE_READWRITE)

    if not ptr:
        error_code = ctypes.GetLastError()
        print(f"Failed to allocate large page. Error code: {error_code}")
        return None

    print(f"Successfully allocated {LARGE_PAGE_SIZE} bytes at address {hex(ptr)}")
    return ptr
3121
+
3122
+
3123
def timing_compute(N_arr=[], Nproc_arr=[], backend='numpy', NcpuForward=0):
    """
    Execute timing benchmark for INTEGRATE workflow components.

    This function benchmarks the performance of the complete INTEGRATE workflow including
    prior model generation, forward modeling, rejection sampling, and posterior statistics
    computation across different dataset sizes and processor counts.

    Parameters
    ----------
    N_arr : array_like, optional
        Array of dataset sizes (number of prior models) to test.
        If empty, [100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000]
        is used.
    Nproc_arr : array_like, optional
        Array of processor counts to test. If empty, powers of 2 up to the number of
        physical cores reported by psutil are used.
    backend : str, optional
        Backend name passed on to `ig.integrate_rejection` and stored in the result
        file. For 'jax' the output filename is tagged 'jax_gpu' or 'jax_cpu', based
        on the JAX_PLATFORMS environment variable or, if that is not decisive, on
        `jax.default_backend()`. Default is 'numpy'.
    NcpuForward : int, optional
        Fixed number of CPUs to use for forward modeling. When > 0, forward modeling always
        uses this many CPUs regardless of the current Nproc_arr entry. The inversion
        (rejection sampling) still varies over Nproc_arr. Default is 0 (use Nproc_arr value).

    Returns
    -------
    str
        Filename of the NPZ file containing timing results.

    Notes
    -----
    The benchmark tests four main components:
    1. Prior model generation (layered geological models)
    2. Forward modeling using GA-AEM electromagnetic simulation
    3. Rejection sampling for Bayesian inversion
    4. Posterior statistics computation

    Results are saved to an NPZ file with the arrays T_total, T_prior, T_forward,
    T_rejection, T_poststat (each of shape (len(N_arr), len(Nproc_arr))), together with
    N_arr, Nproc_arr, nobs and backend. Combinations where the processor count is below
    floor(2**(log10(N)-4)) are skipped and remain NaN in the timing arrays.
    """
    import integrate as ig
    # check if parallel computations can be performed
    # (the returned flag is not used further in this function)
    parallel = ig.use_parallel(showInfo=1)

    # NOTE: matplotlib is imported here but not used in this function
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import loglog
    import time
    import h5py
    # get name of CPU
    import psutil

    # Get hostname and number of processors
    import socket
    hostname = socket.gethostname()
    import platform
    # platform.node() overrides the socket-based hostname from above
    hostname = platform.node()
    system = platform.system()

    ## Get number of processors
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    # Only physical cores are used for the benchmark setup
    Ncpu = physical_cores

    print("# TIMING TEST")
    print("Hostname (system): %s (%s) " % (hostname, system))
    print("Number of processors: %d" % Ncpu)

    # SELECT THE CASE TO CONSIDER AND DOWNLOAD THE DATA
    files = ig.get_case_data(showInfo=-1)
    f_data_h5 = files[0]
    file_gex= ig.get_gex_file_from_data(f_data_h5)

    print("Using data file: %s" % f_data_h5)
    print("Using GEX file: %s" % file_gex)

    # Number of rows in the observed data set 'D1/d_obs', stored with the results
    with h5py.File(f_data_h5, 'r') as f:
        nobs = f['D1/d_obs'].shape[0]


    ## Setup the timing test

    #### Set the size of the data sets to test
    if len(N_arr)==0:
        N_arr = np.array([100,500,1000,5000,10000,50000,100000, 500000, 1000000, 5000000])

    # Set the number of cores to test: 1, 2, 4, ... up to 2**floor(log2(Ncpu))
    if len(Nproc_arr)==0:
        Nproc_arr=2**(np.double(np.arange(1+int(np.log2(Ncpu)))))

    n1 = len(N_arr)
    n2 = len(Nproc_arr)

    print("Testing on %d data sets of size(s):" % len(N_arr), N_arr)
    print("Testing on %d sets of core(s):" % len(Nproc_arr), Nproc_arr)
    if NcpuForward > 0:
        print("Forward modeling fixed to %d CPUs (inversion varies over Nproc_arr)" % NcpuForward)


    print("Rejection sampling backend: %s" % backend)
    # Label used in the output filename; refined below for the JAX backend
    backend_label = backend
    if backend == 'jax':
        import os
        jax_platform_env = os.environ.get('JAX_PLATFORMS', '').strip().lower()
        if jax_platform_env in ('gpu', 'cuda', 'rocm'):
            backend_label = 'jax_gpu'
        elif jax_platform_env == 'cpu':
            backend_label = 'jax_cpu'
        else:
            # Environment variable not decisive: ask JAX, fall back to CPU on any error
            try:
                import jax
                jax_device = jax.default_backend()
                backend_label = 'jax_gpu' if jax_device == 'gpu' else 'jax_cpu'
            except Exception:
                backend_label = 'jax_cpu'
        print("JAX backend label: %s (JAX_PLATFORMS='%s')" % (backend_label, jax_platform_env))
    # Ncpu is still the physical core count at this point
    file_out = 'timing_%s-%s-%dcore_Nproc%d_N%d_%s.npz' % (hostname,system,Ncpu,len(Nproc_arr), len(N_arr), backend_label)
    print("Writing results to %s " % file_out)

    ## TIMING

    showInfo = 0

    # Timing tables, rows = dataset size, columns = processor count.
    # Initialized to NaN so skipped combinations are recognizable.
    T_prior = np.zeros((n1,n2))*np.nan
    T_forward = np.zeros((n1,n2))*np.nan
    T_rejection = np.zeros((n1,n2))*np.nan
    T_poststat = np.zeros((n1,n2))*np.nan

    testRejection = True
    testPostStat = True

    for j in np.arange(n2):
        # From here on Ncpu is the processor count under test
        Ncpu = int(Nproc_arr[j])

        for i in np.arange(len(N_arr)):
            N=int(N_arr[i])
            # Minimum processor count for which this dataset size is run
            Ncpu_min = int(np.floor(2**(np.log10(N)-4)))

            print('=====================================================')
            print('TIMING: N=%d, Ncpu=%d, Ncpu_min=%d'%(N,Ncpu,Ncpu_min))
            print('=====================================================')

            RHO_min = 1
            RHO_max = 800
            z_max = 50
            useP = 1

            if (Ncpu>=Ncpu_min):

                # 1. Prior model generation
                t0_prior = time.time()
                if useP ==1:
                    ## Layered model
                    f_prior_h5 = ig.prior_model_layered(N=N,lay_dist='chi2', NLAY_deg=5, z_max = z_max, RHO_dist='log-uniform', RHO_min=RHO_min, RHO_max=RHO_max, showInfo=showInfo)
                    #f_prior_h5 = ig.prior_model_layered(N=N,lay_dist='uniform', z_max = z_max, NLAY_min=1, NLAY_max=3, rho_dist='log-uniform', RHO_min=RHO_min, RHO_max=RHO_max)
                    #f_prior_h5 = ig.prior_model_layered(N=N,lay_dist='uniform', z_max = z_max, NLAY_min=1, NLAY_max=8, rho_dist='log-uniform', RHO_min=RHO_min, RHO_max=RHO_max)
                else:
                    ## N layer model with increasing thickness
                    f_prior_h5 = ig.prior_model_workbench(N=N, z_max = 30, nlayers=20, rho_min = RHO_min, rho_max = RHO_max, showInfo=showInfo)
                #t_prior.append(time.time()-t0_prior)
                T_prior[i,j] = time.time()-t0_prior


                #ig.plot_prior_stats(f_prior_h5)
                #% A2. Compute prior DATA
                # 2. Forward modeling (optionally with a fixed CPU count)
                t0_forward = time.time()
                Ncpu_fwd = NcpuForward if NcpuForward > 0 else Ncpu
                f_prior_data_h5 = ig.prior_data_gaaem(f_prior_h5, file_gex, Ncpu=Ncpu_fwd, showInfo=showInfo)
                T_forward[i,j]=time.time()-t0_forward

                #% READY FOR INVERSION
                # 3. Rejection sampling
                N_use = 1000000
                t0_rejection = time.time()
                if testRejection:
                    f_post_h5 = ig.integrate_rejection(f_prior_data_h5, f_data_h5, N_use=N_use, parallel=1, updatePostStat=False, Ncpu=Ncpu, showInfo=showInfo, backend=backend)
                    T_rejection[i,j]=time.time()-t0_rejection

                #% Compute some generic statistic of the posterior distribution (Mean, Median, Std)
                # 4. Posterior statistics (requires the posterior file from step 3)
                t0_poststat = time.time()
                if testPostStat and testRejection:
                    ig.integrate_posterior_stats(f_post_h5,showInfo=showInfo)
                    T_poststat[i,j]=time.time()-t0_poststat

            # Sum the four stage timings and write all arrays to file_out
            T_total = T_prior + T_forward + T_rejection + T_poststat
            np.savez(file_out, T_total=T_total, T_prior=T_prior, T_forward=T_forward, T_rejection=T_rejection, T_poststat=T_poststat, N_arr=N_arr, Nproc_arr=Nproc_arr, nobs=nobs, backend=backend)


    return file_out
3307
+
3308
+
3309
+ def timing_plot(f_timing=''):
3310
+ """
3311
+ Generate comprehensive timing analysis plots from benchmark results.
3312
+
3313
+ This function creates multiple plots analyzing the performance characteristics
3314
+ of the INTEGRATE workflow across different dataset sizes and processor counts.
3315
+
3316
+ Parameters
3317
+ ----------
3318
+ f_timing : str
3319
+ Path to NPZ file containing timing benchmark results from timing_compute().
3320
+
3321
+ Returns
3322
+ -------
3323
+ None
3324
+ Saves multiple PNG files with timing analysis plots.
3325
+
3326
+ Notes
3327
+ -----
3328
+ Generated plots include:
3329
+ - Total execution time vs processors and dataset size
3330
+ - Forward modeling performance and speedup analysis
3331
+ - Rejection sampling performance and scaling
3332
+ - Posterior statistics computation performance
3333
+ - Cumulative time breakdowns for different processor counts
3334
+ - Comparisons with traditional least squares and MCMC methods
3335
+
3336
+ The function handles missing data gracefully and includes reference lines
3337
+ for linear scaling to assess parallel efficiency.
3338
+ """
3339
+ import numpy as np
3340
+ import matplotlib.pyplot as plt
3341
+
3342
+ def safe_show():
3343
+ """Show plot only if using interactive backend, otherwise do nothing."""
3344
+ backend = plt.get_backend()
3345
+ if backend.lower() != 'agg':
3346
+ safe_show()
3347
+
3348
+ if len(f_timing)==0:
3349
+ print('No timing file provided')
3350
+ return
3351
+ else:
3352
+ print('Plotting timing results from %s' % f_timing)
3353
+
3354
+ # file_out is f_timing, without file extension
3355
+ file_out = f_timing.split('.')[0]
3356
+
3357
+ data = np.load(f_timing)
3358
+ T_prior = data['T_prior']
3359
+ T_forward = data['T_forward']
3360
+ T_rejection = data['T_rejection']
3361
+ T_poststat = data['T_poststat']
3362
+
3363
+ N_arr = data['N_arr']
3364
+ Nproc_arr = data['Nproc_arr']
3365
+
3366
+ xlim_N = xlim_N = [np.min([800,np.min(N_arr)]),np.max([1.2e+6,np.max(N_arr)])]
3367
+ xlim_Nproc = np.min([.95,np.min(Nproc_arr)]),np.max([34,np.max(Nproc_arr)])
3368
+ try:
3369
+ T_total = data['T_total']
3370
+ except:
3371
+ T_total = T_prior + T_forward + T_rejection + T_poststat
3372
+
3373
+ try:
3374
+ nobs=data['nobs']
3375
+ except:
3376
+ nobs=11693
3377
+
3378
+
3379
+ # ############################################
3380
+ # TOTAL TIME
3381
+ # ############################################
3382
+
3383
+ # Plot
3384
+ # LSQ, Assumed time, in seconds, for least squares inversion of a single sounding
3385
+ t_lsq = 2.0
3386
+ # SAMPLING, Assumed time, in seconds, for an McMC inversion of a single sounding
3387
+ t_mcmc = 10.0*60.0
3388
+
3389
+ total_lsq = np.array([nobs*t_lsq, nobs*t_lsq/Nproc_arr[-1]])
3390
+ total_mcmc = np.array([nobs*t_mcmc, nobs*t_mcmc/Nproc_arr[-1]])
3391
+
3392
+ # loglog(T_total.T)
3393
+ plt.figure(figsize=(6,6))
3394
+ plt.loglog(Nproc_arr, T_total.T, 'o-', label=N_arr)
3395
+ plt.ylabel(r'Total time - $[s]$')
3396
+ plt.xlabel('Number of processors')
3397
+ plt.grid()
3398
+ total_lsq = np.array([nobs*t_lsq, nobs*t_lsq/Nproc_arr[-1]])
3399
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], total_lsq, 'k--', label='LSQ')
3400
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], total_mcmc, 'r--', label='MCMC')
3401
+ plt.legend(loc='upper right')
3402
+ plt.xticks(ticks=Nproc_arr, labels=[str(int(x)) for x in Nproc_arr])
3403
+ plt.tight_layout()
3404
+ plt.ylim(1,1e+8)
3405
+ plt.xlim(xlim_Nproc)
3406
+ plt.savefig('%s_total_sec_CPU' % file_out)
3407
+ safe_show()
3408
+ plt.close()
3409
+
3410
+ plt.figure(figsize=(6,6))
3411
+ plt.loglog(N_arr, T_total, 'o-', label=[f'{int(x)}' for x in Nproc_arr])
3412
+ plt.ylabel(r'Total time - $[s]$')
3413
+ plt.xlabel('N-prior')
3414
+ plt.grid()
3415
+ plt.tight_layout()
3416
+ plt.plot([N_arr[0], N_arr[-1]], [nobs*t_lsq, nobs*t_lsq], 'k--', label='LSQ')
3417
+ plt.plot([N_arr[0], N_arr[-1]], [nobs*t_mcmc, nobs*t_mcmc], 'r--', label='MCMC')
3418
+ plt.legend(loc='upper left')
3419
+ #plt.xticks(ticks=N_arr, labels=[str(int(x)) for x in Nproc_arr])
3420
+ plt.ylim(1,1e+8)
3421
+ #plt.xlim(np.min([1000,np.min(N_arr)]),np.max([1e+6,np.max(N_arr)]))
3422
+ plt.xlim(xlim_N)
3423
+ plt.savefig('%s_total_sec_N' % file_out)
3424
+ safe_show()
3425
+ plt.close()
3426
+
3427
+ # ############################################
3428
+ # FORWARD MODELING
3429
+ # ############################################
3430
+
3431
+ #### Plot timing results for forward modeling - GAAEM
3432
+ # Average timer per sounding
3433
+ T_forward_sounding = T_forward/N_arr[:,np.newaxis]
3434
+ T_forward_sounding_per_sec = N_arr[:,np.newaxis]/T_forward
3435
+ T_forward_sounding_per_sec_per_cpu = T_forward_sounding_per_sec/Nproc_arr[np.newaxis,:]
3436
+ T_forward_sounding_speedup = T_forward_sounding_per_sec/T_forward_sounding_per_sec[0,0]
3437
+
3438
+ ## Forward time per sounding - CPU
3439
+ plt.figure(figsize=(6,6))
3440
+ plt.loglog(Nproc_arr, T_forward.T, 'o-', label='A')
3441
+ # plot dashed line indicating linear scaling
3442
+ for i in range(len(N_arr)):
3443
+ # Find index of first non-nan value in T_forward[i,:]
3444
+ try:
3445
+ idx = np.nonzero(~np.isnan(T_forward[i,:]))[0][0]
3446
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [T_forward[i,idx]*Nproc_arr[idx]/Nproc_arr[0], T_forward[i,idx]*Nproc_arr[idx]/Nproc_arr[-1]], 'k--',
3447
+ label='Linear scaling',
3448
+ linewidth=0.5)
3449
+ except:
3450
+ pass
3451
+
3452
+ plt.ylabel(r'Forward time - $[s]$')
3453
+ plt.xlabel('Number of processors')
3454
+ #plt.title('Forward calculation')
3455
+ plt.grid()
3456
+ plt.legend(N_arr, loc='upper right')
3457
+ plt.ylim(1e-1, 1e+5)
3458
+ #plt.xlim(Nproc_arr[0], Nproc_arr[-1])
3459
+ plt.xlim(xlim_Nproc)
3460
+ plt.tight_layout()
3461
+ plt.savefig('%s_forward_sec_CPU' % file_out)
3462
+ safe_show()
3463
+ plt.close()
3464
+
3465
+ ## Forward time per sounding - Nproc
3466
+ plt.figure(figsize=(6,6))
3467
+ plt.loglog(N_arr, T_forward, 'o-', label='A')
3468
+ # plot dashed line indicating linear scaling
3469
+ for i in range(len(N_arr)):
3470
+ # Find index of first non-nan value in T_forward[i,:]
3471
+ try:
3472
+ idx = np.nonzero(~np.isnan(T_forward[i,:]))[0][0]
3473
+ ref_time = T_forward[i,idx]
3474
+ ref_N = N_arr[i]
3475
+ plt.plot([N_arr[0], N_arr[-1]], [ref_time*N_arr[0]/ref_N, ref_time*N_arr[-1]/ref_N], 'k--', label='Linear scaling', linewidth=0.5)
3476
+ except:
3477
+ pass
3478
+ plt.ylabel(r'Forward time - $[s]$')
3479
+ plt.xlabel('Number of models')
3480
+ #plt.title('Forward calculation')
3481
+ plt.grid()
3482
+ plt.legend(Nproc_arr, loc='upper left')
3483
+ plt.ylim(1e+0, 1e+5)
3484
+ #plt.xlim(Nproc_arr[0], Nproc_arr[-1])
3485
+ plt.xlim(xlim_N)
3486
+ plt.tight_layout()
3487
+ plt.savefig('%s_forward_sec_N' % file_out)
3488
+ safe_show()
3489
+ plt.close()
3490
+
3491
+
3492
+ #
3493
+ plt.figure(figsize=(6,6))
3494
+ plt.plot(Nproc_arr, T_forward_sounding_per_sec.T, 'o-')
3495
+ # plot line
3496
+ plt.ylabel(r'Forward computations per second - $[s^{-1}]$')
3497
+ plt.xlabel('Number of processors')
3498
+ #plt.title('Forward calculation')
3499
+ plt.grid()
3500
+ plt.legend(N_arr, loc='lower right')
3501
+ plt.xlim(xlim_Nproc)
3502
+ plt.ylim(10,1000)
3503
+ plt.tight_layout()
3504
+ plt.savefig('%s_forward_sounding_per_sec' % file_out)
3505
+ safe_show()
3506
+ plt.close()
3507
+
3508
+ #
3509
+ plt.figure(figsize=(6,6))
3510
+ plt.plot(Nproc_arr, T_forward_sounding_per_sec_per_cpu.T, 'o-')
3511
+ plt.ylabel('Forward computations per second per cpu')
3512
+ plt.xlabel('Number of processors')
3513
+ #plt.title('Forward calculation')
3514
+ plt.grid()
3515
+ # Make yaxis start at 0
3516
+ plt.ylim(0, 140)
3517
+ plt.xlim(Nproc_arr[0], Nproc_arr[-1])
3518
+ plt.xlim(xlim_Nproc)
3519
+ plt.legend(N_arr)
3520
+ plt.tight_layout()
3521
+ plt.savefig('%s_forward_sounding_per_sec_per_cpu' % file_out)
3522
+ safe_show()
3523
+ plt.close()
3524
+ #
3525
+
3526
+ plt.figure(figsize=(6,6))
3527
+ plt.plot(Nproc_arr, T_forward_sounding_speedup.T, 'o-')
3528
+ # plot a line from 0,0 tp Nproc_arr[-1], Nproc_arr[-1]
3529
+ plt.plot([0, Nproc_arr[-1]], [0, Nproc_arr[-1]], 'k--')
3530
+ # set xlim to 1, Nproc_arr[-1]
3531
+ plt.xlim(.8, Nproc_arr[-1])
3532
+ plt.ylim(.8, Nproc_arr[-1])
3533
+ plt.ylabel('gatdaem - speedup compared to 1 processor')
3534
+ plt.xlabel('Number of processors')
3535
+ plt.grid()
3536
+ plt.legend(N_arr)
3537
+ plt.xlim(xlim_Nproc)
3538
+ plt.ylim(0.5, 30)
3539
+ plt.tight_layout()
3540
+ plt.savefig('%s_forward_speedup' % file_out)
3541
+ safe_show()
3542
+ plt.close()
3543
+
3544
+ # ############################################
3545
+ # REJECTION SAMPLING
3546
+ # ############################################
3547
+
3548
+ # Average timer per sounding
3549
+ T_rejection_sounding = T_rejection/N_arr[:,np.newaxis]
3550
+ T_rejection_sounding_per_sec = N_arr[:,np.newaxis]/T_rejection
3551
+ T_rejection_sounding_per_sec_per_cpu = T_rejection_sounding_per_sec/Nproc_arr[np.newaxis,:]
3552
+ T_rejection_sounding_speedup = T_rejection_sounding_per_sec/T_rejection_sounding_per_sec[0,0]
3553
+ T_rejection_sounding_speedup = T_rejection_sounding_per_sec*0
3554
+
3555
+ T_rejection_per_data = nobs/T_rejection
3556
+
3557
+ for i in range(len(N_arr)):
3558
+ # find index of first value in T_rejection_sounding_per_sec[i,:] that is not nan
3559
+ try:
3560
+ idx = np.where(~np.isnan(T_rejection_sounding_per_sec[i,:]))[0][0]
3561
+ T_rejection_sounding_speedup[i,:] = T_rejection_sounding_per_sec[i,:]/(T_rejection_sounding_per_sec[i,idx]/Nproc_arr[idx])
3562
+ except:
3563
+ T_rejection_sounding_speedup[i,:] = T_rejection_sounding_per_sec[i,:]*0
3564
+
3565
+
3566
+ ## Rejection total sec - per CPU
3567
+ plt.figure(figsize=(6,6))
3568
+ plt.loglog(Nproc_arr, T_rejection.T, 'o-')
3569
+ for i in range(len(N_arr)):
3570
+ # Find index of first non-nan value in T_forward[i,:]
3571
+ try:
3572
+ idx = np.nonzero(~np.isnan(T_rejection[i,:]))[0][0]
3573
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [T_rejection[i,idx]*Nproc_arr[idx]/Nproc_arr[0], T_rejection[i,idx]*Nproc_arr[idx]/Nproc_arr[-1]], 'k--',
3574
+ label='Linear scaling',
3575
+ linewidth=0.5)
3576
+ except:
3577
+ pass
3578
+ plt.ylabel('Rejection sampling - time $[s]$')
3579
+ plt.xlabel('Number of processors')
3580
+ plt.grid()
3581
+ plt.legend(N_arr)
3582
+ plt.tight_layout()
3583
+ plt.ylim(1e-1, 2e+3)
3584
+ plt.xlim(xlim_Nproc)
3585
+ plt.savefig('%s_rejection_sec_CPU' % file_out)
3586
+ safe_show()
3587
+ plt.close()
3588
+
3589
+
3590
+ ## Rejection total sec - per process
3591
+ plt.figure(figsize=(6,6))
3592
+ plt.loglog(N_arr, T_rejection, 'o-')
3593
+ for i in range(len(Nproc_arr)):
3594
+ # Find index of first non-nan value in T_forward[i,:]
3595
+ try:
3596
+ idx = np.nonzero(~np.isnan(T_rejection[:,i]))[0][0]
3597
+ ref_time = np.abs(T_rejection[idx,i])
3598
+ plt.plot([N_arr[0], N_arr[-1]], [ref_time*N_arr[0]/N_arr[idx], ref_time*N_arr[-1]/N_arr[idx]], 'k--',
3599
+ label='Linear scaling',
3600
+ linewidth=0.5)
3601
+ except:
3602
+ pass
3603
+ plt.ylabel('Rejection sampling - time $[s]$')
3604
+ plt.xlabel('Lookup table size')
3605
+ plt.grid()
3606
+ plt.legend(Nproc_arr)
3607
+ plt.ylim(1e-1, 2e+3)
3608
+ plt.xlim(xlim_N)
3609
+ plt.tight_layout()
3610
+ plt.savefig('%s_rejection_sec_N' % file_out)
3611
+ safe_show()
3612
+ plt.close()
3613
+
3614
+
3615
+ ## Rejection speedup
3616
+ plt.figure(figsize=(6,6))
3617
+ plt.plot(Nproc_arr, T_rejection_sounding_speedup.T, 'o-')
3618
+ # plot a line from 0,0 tp Nproc_arr[-1], Nproc_arr[-1]
3619
+ plt.plot([0, Nproc_arr[-1]], [0, Nproc_arr[-1]], 'k--')
3620
+ # set xlim to 1, Nproc_arr[-1]
3621
+ plt.xlim(.8, Nproc_arr[-1])
3622
+ plt.ylim(.8, Nproc_arr[-1])
3623
+ plt.ylabel('Rejection sampling - speedup compared to 1 processor')
3624
+ plt.xlabel('Number of processors')
3625
+ plt.grid()
3626
+ plt.xlim(xlim_Nproc)
3627
+ plt.legend(N_arr)
3628
+ plt.savefig('%s_rejection_speedup' % file_out)
3629
+ safe_show()
3630
+ plt.close()
3631
+
3632
+
3633
+ ## Rejection sound per sec
3634
+ plt.figure(figsize=(6,6))
3635
+ plt.loglog(Nproc_arr, T_rejection_per_data.T, 'o-', label=N_arr)
3636
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [1./t_lsq, 1./t_lsq], 'k--', label='LSQ')
3637
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [1./t_mcmc, 1./t_mcmc], 'r--', label='MCMC')
3638
+ plt.ylabel('Rejection sampling - number of soundings per second - $s^{-1}$')
3639
+ plt.xlabel('Number of processors')
3640
+ plt.grid()
3641
+ plt.legend(loc='lower left')
3642
+ plt.ylim(1e-3, 1e+5)
3643
+ plt.xlim(xlim_Nproc)
3644
+ plt.tight_layout()
3645
+ plt.savefig('%s_rejection_sounding_per_sec' % file_out)
3646
+ safe_show()
3647
+ plt.close()
3648
+
3649
+ ## Rejection sec per sounding
3650
+ plt.figure(figsize=(6,6))
3651
+ plt.semilogy(Nproc_arr, 1./T_rejection_per_data.T, 'o-', label=N_arr)
3652
+ #plt.plot(Nproc_arr, 1./T_rejection_per_data.T, 'o-', label=N_arr)
3653
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [t_lsq, t_lsq], 'k--', label='LSQ')
3654
+ plt.plot([Nproc_arr[0], Nproc_arr[-1]], [t_mcmc, t_mcmc], 'r--', label='MCMC')
3655
+ plt.ylabel('Rejection sampling - seconds per sounding - $s$')
3656
+ plt.xlabel('Number of processors')
3657
+ plt.grid()
3658
+ plt.legend(loc='upper right')
3659
+ plt.ylim(1e-5, 1e+3)
3660
+ plt.xlim(xlim_Nproc)
3661
+ plt.tight_layout()
3662
+ plt.savefig('%s_rejection_sec_per_sound' % file_out)
3663
+ safe_show()
3664
+ plt.close()
3665
+
3666
+ ## Rejection sound per sec - N
3667
+ plt.figure(figsize=(6,6))
3668
+ plt.loglog(N_arr, T_rejection_sounding_per_sec, 'o-')
3669
+ #plt.ylim(0, 8000)
3670
+ plt.ylabel('Rejection sampling - Soundings per second')
3671
+ plt.xlabel('Lookup table size')
3672
+ plt.grid()
3673
+ plt.legend(Nproc_arr)
3674
+ plt.xlim(xlim_N)
3675
+ plt.tight_layout()
3676
+ plt.savefig('%s_rejection_sounding_per_sec_N' % file_out)
3677
+ safe_show()
3678
+ plt.close()
3679
+
3680
+ ## Rejection sound per sec - per CPU
3681
+ plt.figure(figsize=(6,6))
3682
+ plt.loglog(Nproc_arr, T_rejection_sounding_per_sec.T, 'o-')
3683
+ #plt.ylim(0, 8000)
3684
+ plt.ylabel('Rejection sampling - Soundings per second')
3685
+ plt.xlabel('Number of processors')
3686
+ plt.grid()
3687
+ plt.legend(N_arr)
3688
+ plt.xlim(xlim_Nproc)
3689
+ plt.tight_layout()
3690
+ plt.savefig('%s_rejection_sounding_per_sec_CPU' % file_out)
3691
+ safe_show()
3692
+ plt.close()
3693
+
3694
+ ## Sound per sec per CPU - N
3695
+ plt.figure(figsize=(6,6))
3696
+ plt.loglog(N_arr, T_rejection_sounding_per_sec_per_cpu, 'o-')
3697
+ plt.plot([0, Nproc_arr[-1]], [0, Nproc_arr[-1]], 'k--')
3698
+ plt.xlim(90, 5000000*1.1)
3699
+ #plt.ylim(0, 8000)
3700
+ plt.ylabel('Rejection sampling - Soundings per second per cpu')
3701
+ plt.xlabel('Lookup table size')
3702
+ plt.grid()
3703
+ plt.legend(Nproc_arr)
3704
+ plt.xlim(xlim_N)
3705
+ plt.tight_layout()
3706
+ plt.savefig('%s_rejection_sounding_per_sec_per_cpu_N' % file_out)
3707
+ safe_show()
3708
+ plt.close()
3709
+
3710
+
3711
+ ## Sound per sec per CPU - CPU
3712
+ plt.figure(figsize=(6,6))
3713
+ plt.semilogx(Nproc_arr, T_rejection_sounding_per_sec_per_cpu.T, 'o-')
3714
+ plt.ylim([0, np.nanmax(T_rejection_sounding_per_sec_per_cpu.T)*1.1])
3715
+ plt.ylabel('Rejection sampling - Soundings per second per cpu')
3716
+ plt.xlabel('Number of processors')
3717
+ plt.grid()
3718
+ plt.legend(N_arr)
3719
+ plt.xlim(xlim_Nproc)
3720
+ plt.tight_layout()
3721
+ plt.savefig('%s_rejection_sounding_per_sec_per_cpu_CPU' % file_out)
3722
+ safe_show()
3723
+ plt.close()
3724
+
3725
+
3726
+ # ############################################
3727
+ # POSTERIOR STATISTICS
3728
+ # ############################################
3729
+
3730
+ # Average timer per sounding
3731
+ T_poststat_sounding = T_poststat/N_arr[:,np.newaxis]
3732
+ T_poststat_sounding_per_sec = N_arr[:,np.newaxis]/T_poststat
3733
+ T_poststat_sounding_per_sec_per_cpu = T_poststat_sounding_per_sec/Nproc_arr[np.newaxis,:]
3734
+ T_poststat_sounding_speedup = T_poststat_sounding_per_sec/T_poststat_sounding_per_sec[0,0]
3735
+
3736
+ plt.figure(figsize=(6,6))
3737
+ plt.plot(Nproc_arr, T_poststat_sounding_per_sec.T, 'o-')
3738
+ plt.ylabel('Posterior statistics - Soundings per second - $[s^{-1}]$')
3739
+ plt.xlabel('Number of processors')
3740
+ plt.grid()
3741
+ plt.legend(N_arr)
3742
+ plt.xlim(xlim_Nproc)
3743
+ plt.tight_layout()
3744
+ plt.savefig('%s_poststat_sounding_per_sec' % file_out)
3745
+ safe_show()
3746
+ plt.close()
3747
+
3748
+ # plt.figure(figsize=(6,6))
3749
+ # plt.plot(Nproc_arr, T_poststat_sounding_speedup.T, 'o-')
3750
+ # # plot a line from 0,0 tp Nproc_arr[-1], Nproc_arr[-1]
3751
+ # plt.plot([0, Nproc_arr[-1]], [0, Nproc_arr[-1]], 'k--')
3752
+ # # set xlim to 1, Nproc_arr[-1]
3753
+ # plt.xlim(.8, Nproc_arr[-1])
3754
+ # plt.ylim(.8, Nproc_arr[-1])
3755
+ # plt.ylabel('Posterior statistics - speedup compared to 1 processor')
3756
+ # plt.xlabel('Number of processors')
3757
+ # plt.grid()
3758
+ # plt.legend(N_arr)
3759
+ # plt.savefig('%s_poststat_speedup' % file_out)
3760
+
3761
+ #####
3762
+ # ## Plot Cumulative Time useage for min and max number of used cores
3763
+
3764
+ i_proc = len(Nproc_arr)-1
3765
+ #i_proc= 0
3766
+
3767
+ for i_proc in [0,len(Nproc_arr)-1]:
3768
+
3769
+ T=[T_prior[:,i_proc], T_forward[:,i_proc], T_rejection[:,i_proc], T_poststat[:,i_proc]]
3770
+
3771
+ ### %% Plor cumT as an area plot
3772
+ plt.figure(figsize=(6,6))
3773
+ plt.stackplot(N_arr, T, labels=['Prior', 'Forward', 'Rejection', 'PostStat'])
3774
+ plt.plot(N_arr, T_total[:, i_proc], 'k--')
3775
+ plt.xscale('log')
3776
+ plt.yscale('log')
3777
+ plt.xlabel('$N_{lookup}$')
3778
+ plt.ylabel('Time [$s$]')
3779
+ plt.title('Cumulative time, using %d processors' % Nproc_arr[i_proc])
3780
+ plt.legend(loc='upper left')
3781
+ plt.grid(True, which="both", ls="--")
3782
+ plt.tight_layout()
3783
+ plt.savefig('%s_Ncpu%d_cumT' % (file_out,Nproc_arr[i_proc]))
3784
+ safe_show()
3785
+ plt.close()
3786
+
3787
+ # The same as thea area plot but normalized to the total time
3788
+ plt.figure(figsize=(6,6))
3789
+ plt.stackplot(N_arr, T/np.sum(T, axis=0), labels=['Prior', 'Forward', 'Rejection', 'PostStat'])
3790
+ plt.xscale('log')
3791
+ plt.xlabel('$N_{lookup}$')
3792
+ plt.ylabel('Normalized time')
3793
+ plt.legend(loc='upper left')
3794
+ plt.grid(True, which="both", ls="--")
3795
+ plt.tight_layout()
3796
+ plt.title('Normalized time, using %d processors' % Nproc_arr[i_proc])
3797
+ plt.savefig('%s_Ncpu%d_cumT_norm' % (file_out,Nproc_arr[i_proc]))
3798
+ safe_show()
3799
+ plt.close()
3800
+
3801
+ # Working with well data
3802
+
3803
def compute_P_obs_from_log(depth_top, depth_bottom, lithology_obs, z, class_id, P_single=0.8, P_prior=None):
    """
    Compute discrete observation probability matrix from depth intervals and lithology observations.

    This function creates a probability matrix where each depth point is assigned
    probabilities based on observed lithology classes within specified depth intervals.

    Parameters
    ----------
    depth_top : array-like
        Array of top depths for each observation interval.
    depth_bottom : array-like
        Array of bottom depths for each observation interval.
    lithology_obs : array-like
        Array of observed lithology class IDs for each interval.
    z : array-like
        Array of depth/position values where probabilities are computed.
    class_id : array-like
        Array of unique class identifiers (e.g., [0, 1, 2] for 3 lithology types).
    P_single : float, optional
        Probability assigned to the observed class. Default is 0.8.
    P_prior : array-like, optional
        Prior probability matrix of shape (nclass, nm). It is copied (and converted
        to floating point if needed), never modified in place. If None, depths not
        covered by observations are filled with NaN. Default is None.

    Returns
    -------
    P_obs : ndarray
        Probability matrix of shape (nclass, nm) where nclass is the number of classes
        and nm is the number of depth points. For each depth point covered by observations,
        the observed class gets probability P_single and other classes share (1-P_single).
        Depths not covered by any observation contain NaN or prior probabilities if provided.

    Notes
    -----
    - Intervals are half-open: a depth ``z`` belongs to interval ``i`` when
      ``depth_top[i] <= z < depth_bottom[i]``.
    - Intervals are applied in order, so where intervals overlap the last one wins.
    - If an observed lithology is not present in ``class_id``, every class at the
      covered depths receives the non-hit probability.
    - With a single class there is no other class to share ``1 - P_single``; the
      non-hit probability is then taken as 0 instead of dividing by zero.

    Examples
    --------
    >>> depth_top = [0, 10, 20]
    >>> depth_bottom = [10, 20, 30]
    >>> lithology_obs = [1, 2, 1]  # clay, sand, clay
    >>> z = np.arange(30)
    >>> class_id = [0, 1, 2]  # gravel, clay, sand
    >>> P_obs = compute_P_obs_from_log(depth_top, depth_bottom, lithology_obs, z, class_id)
    >>> print(P_obs.shape)  # (3, 30)
    """
    import numpy as np

    z = np.asarray(z)
    class_id = np.asarray(class_id)
    nm = len(z)
    nclass = len(class_id)

    # Probability given to each class that was NOT observed. Guard the
    # single-class case, where (nclass - 1) would be zero.
    P_nohit = (1 - P_single) / (nclass - 1) if nclass > 1 else 0.0

    # Start from the prior (as a float copy so fractional probabilities are not
    # truncated) or from NaN when no prior is supplied.
    if P_prior is not None:
        P_obs = np.array(P_prior)
        if not np.issubdtype(P_obs.dtype, np.floating):
            P_obs = P_obs.astype(float)
    else:
        P_obs = np.full((nclass, nm), np.nan)

    # One vectorised assignment per interval; later intervals overwrite
    # earlier ones at overlapping depths.
    for top, bottom, lith in zip(depth_top, depth_bottom, lithology_obs):
        covered = (z >= top) & (z < bottom)
        if not covered.any():
            continue
        column = np.where(class_id == lith, P_single, P_nohit)
        P_obs[:, covered] = column[:, np.newaxis]

    return P_obs
3874
+
3875
def rescale_P_obs_temperature(P_obs, T=1.0):
    """
    Rescale discrete observation probabilities by temperature and renormalize.

    This function applies temperature annealing to probability distributions by raising
    each probability to the power (1/T), then renormalizing each column (depth point)
    so that probabilities sum to 1. Higher temperatures (T > 1) flatten the distribution,
    while lower temperatures (T < 1) sharpen it.

    Parameters
    ----------
    P_obs : array-like
        Probability matrix of shape (nclass, nm) where nclass is the number of classes
        and nm is the number of model parameters (e.g., depth points).
        Each column should represent a probability distribution over classes.
        The input is copied (and converted to floating point if needed); it is
        never modified in place.
    T : float, optional
        Temperature parameter for annealing. Must be positive. Default is 1.0.
        - T = 1.0: No power scaling (columns are still renormalized)
        - T > 1.0: Flattens distribution (less certain)
        - T < 1.0: Sharpens distribution (more certain)
        - T → ∞: Approaches uniform distribution
        - T → 0: Approaches one-hot distribution

    Returns
    -------
    P_obs_scaled : ndarray
        Temperature-scaled and renormalized probability matrix of shape (nclass, nm).
        Each column with a positive sum is rescaled to sum to 1.0. NaN values in
        input are preserved in output; columns whose NaN-ignoring sum is not
        positive are left unnormalized.

    Raises
    ------
    ValueError
        If ``T`` is not positive, or if ``P_obs`` is not two-dimensional.

    Examples
    --------
    >>> P_obs = np.array([[0.8, 0.6, 0.5],
    ...                   [0.1, 0.2, 0.3],
    ...                   [0.1, 0.2, 0.2]])
    >>> P_scaled = rescale_P_obs_temperature(P_obs, T=2.0)
    >>> print(P_scaled)  # More uniform distribution
    >>> P_scaled = rescale_P_obs_temperature(P_obs, T=0.5)
    >>> print(P_scaled)  # Sharper distribution

    Notes
    -----
    The temperature scaling follows the Boltzmann distribution:
        P_new(c) ∝ P_old(c)^(1/T)

    After scaling, each column (depth point) is renormalized:
        P_new(c) = P_new(c) / sum_c(P_new(c))

    This is commonly used in simulated annealing and rejection sampling to control
    the strength of discrete observations during Bayesian inference.
    """
    import numpy as np

    # 1/T is undefined for T == 0 and a negative T would invert the ordering
    # of the probabilities, so reject both up front.
    if T <= 0:
        raise ValueError("Temperature T must be positive, got %r" % (T,))

    # Work on a floating-point copy: integer input would otherwise truncate
    # the normalized values on assignment.
    P_obs_scaled = np.array(P_obs)
    if not np.issubdtype(P_obs_scaled.dtype, np.floating):
        P_obs_scaled = P_obs_scaled.astype(float)
    if P_obs_scaled.ndim != 2:
        raise ValueError("P_obs must have shape (nclass, nm), got %r" % (P_obs_scaled.shape,))

    # Apply temperature scaling: p^(1/T). Skipped for T == 1 (identity).
    if T != 1.0:
        P_obs_scaled = np.power(P_obs_scaled, 1.0 / T)

    # Renormalize every column at once. nansum treats NaN as 0, so all-NaN
    # columns get a sum of 0 and are left untouched (NaN preserved).
    col_sum = np.nansum(P_obs_scaled, axis=0)
    valid = col_sum > 0
    P_obs_scaled[:, valid] /= col_sum[valid]

    return P_obs_scaled
3947
+
3948
+ # def Pobs_to_datagrid(P_obs, X, Y, f_data_h5, r_data=10, r_dis=100, doPlot=False):
3949
+ # """
3950
+ # Convert point-based discrete probability observations to gridded data with distance-based weighting.
3951
+
3952
+ # This function distributes discrete probability observations (e.g., from a borehole) across
3953
+ # a spatial grid using distance-based weighting. Observations at location (X, Y) are applied
3954
+ # to nearby grid points with decreasing influence based on distance. Temperature annealing
3955
+ # is used to reduce the strength of observations far from the source point.
3956
+
3957
+ # Parameters
3958
+ # ----------
3959
+ # P_obs : ndarray
3960
+ # Probability matrix of shape (nclass, nm) where nclass is the number of classes
3961
+ # and nm is the number of model parameters (e.g., depth points).
3962
+ # Each column represents a probability distribution over discrete classes.
3963
+ # X : float
3964
+ # X coordinate (e.g., UTM Easting) of the observation point.
3965
+ # Y : float
3966
+ # Y coordinate (e.g., UTM Northing) of the observation point.
3967
+ # f_data_h5 : str
3968
+ # Path to HDF5 data file containing survey geometry (X, Y coordinates).
3969
+ # r_data : float, optional
3970
+ # Inner radius in meters within which observations have full strength.
3971
+ # Default is 10 meters.
3972
+ # r_dis : float, optional
3973
+ # Outer radius in meters for distance-based weighting. Beyond this distance,
3974
+ # observations are fully attenuated (temperature → ∞). Default is 100 meters.
3975
+ # doPlot : bool, optional
3976
+ # If True, creates diagnostic plots showing weight distributions.
3977
+ # Default is False.
3978
+
3979
+ # Returns
3980
+ # -------
3981
+ # d_obs : ndarray
3982
+ # Gridded observation data of shape (nd, nclass, nm) where nd is the number
3983
+ # of spatial locations in the survey. Each location gets temperature-scaled
3984
+ # probabilities based on distance from (X, Y).
3985
+ # i_use : ndarray
3986
+ # Binary mask of shape (nd, 1) indicating which grid points should be used
3987
+ # (1) or ignored (0) in the inversion. Points with temperature < 100 are used.
3988
+ # T_all : ndarray
3989
+ # Array of temperature values of shape (nd,) for each grid point, indicating
3990
+ # the strength of observation influence based on distance.
3991
+
3992
+ # Notes
3993
+ # -----
3994
+ # The function uses distance-based temperature annealing:
3995
+ # 1. Computes distance-based weights using `get_weight_from_position()`
3996
+ # 2. Converts distance weight to temperature: T = 1 / w_dis
3997
+ # 3. Caps maximum temperature at 100 (very weak influence)
3998
+ # 4. For each grid point:
3999
+ # - If T < 100: include point (i_use=1) and apply temperature scaling
4000
+ # - If T ≥ 100: exclude point (i_use=0) and set observations to NaN
4001
+
4002
+ # Temperature scaling reduces probability certainty with distance:
4003
+ # - T = 1 (close to observation): Original probabilities preserved
4004
+ # - T > 1 (far from observation): Probabilities become more uniform
4005
+ # - T ≥ 100 (very far): Observations effectively ignored
4006
+
4007
+ # Examples
4008
+ # --------
4009
+ # >>> # Borehole observation at specific location
4010
+ # >>> P_obs = compute_P_obs_from_log(depth_top, depth_bottom, lithology, z, class_id)
4011
+ # >>> X_well, Y_well = 543000.0, 6175800.0
4012
+ # >>> d_obs, i_use, T_all = Pobs_to_datagrid(P_obs, X_well, Y_well, 'survey_data.h5',
4013
+ # ... r_data=10, r_dis=100)
4014
+ # >>> # Write to data file
4015
+ # >>> ig.write_data_multinomial(d_obs, i_use=i_use, id=2, f_data_h5='survey_data.h5')
4016
+
4017
+ # See Also
4018
+ # --------
4019
+ # rescale_P_obs_temperature : Temperature scaling function
4020
+ # compute_P_obs_from_log : Create P_obs from depth intervals
4021
+ # get_weight_from_position : Distance-based weighting function
4022
+ # """
4023
+ # import numpy as np
4024
+ # import integrate as ig
4025
+
4026
+ # # Get grid dimensions from data file
4027
+ # X_grid, Y_grid, _, _ = ig.get_geometry(f_data_h5)
4028
+ # nd = len(X_grid)
4029
+ # nclass, nm = P_obs.shape
4030
+
4031
+ # # Initialize output arrays
4032
+ # i_use = np.zeros((nd, 1))
4033
+ # d_obs = np.zeros((nd, nclass, nm)) * np.nan
4034
+
4035
+ # # Compute distance-based weights for all grid points
4036
+ # w_combined, w_dis, w_data, i_use_from_func = ig.get_weight_from_position(
4037
+ # f_data_h5, X, Y, r_data=r_data, r_dis=r_dis, doPlot=doPlot
4038
+ # )
4039
+
4040
+ # # Convert distance weight to temperature
4041
+ # # w_dis is 1 at observation point, decreases with distance
4042
+ # # T = 1/w_dis means T increases with distance (weaker influence)
4043
+ # T_all = 1 / w_combined
4044
+ # #T_all = 1 / w_dis
4045
+ # #T_all = 1 / w_data
4046
+
4047
+ # # Cap maximum temperature at 100 (beyond this, observation has negligible effect)
4048
+ # T_all[T_all > 100] = 100
4049
+
4050
+ # # Apply temperature scaling to each grid point
4051
+ # for ip in np.arange(nd):
4052
+ # T = T_all[ip]
4053
+
4054
+ # # Only use points where temperature is reasonable (< 100)
4055
+ # if T < 100:
4056
+ # i_use[ip] = 1
4057
+ # # Scale probabilities based on distance (higher T = more uniform distribution)
4058
+ # P_obs_local = rescale_P_obs_temperature(P_obs, T=T)
4059
+ # d_obs[ip, :, :] = P_obs_local
4060
+ # # else: i_use[ip] = 0 and d_obs[ip] stays NaN
4061
+
4062
+ # return d_obs, i_use, T_all
4063
+