PyPI - integrate-module - Versions diffs - 0.96.0__tar.gz → 0.97.0__tar.gz - Mend

integrate-module 0.96.0.tar.gz → 0.97.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) [hide / show]

{integrate_module-0.96.0/integrate_module.egg-info → integrate_module-0.97.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: integrate_module
-Version: 0.96.0
+Version: 0.97.0
 Summary: Localized probabilistic data integration
 Author-email: Thomas Mejer Hansen <tmeha@geo.au.dk>
 License: MIT

{integrate_module-0.96.0 → integrate_module-0.97.0}/integrate/integrate.py RENAMED Viewed

@@ -1173,7 +1173,7 @@ def get_process_handle_count():
     import os
     return psutil.Process(os.getpid()).num_handles()
-def prior_data_gaaem(f_prior_h5, file_gex=None, stmfiles=None, N=0, doMakePriorCopy=True, im=1, id=1, im_height=0, Nhank=280, Nfreq=12, is_log=False, parallel=True, **kwargs):
+def prior_data_gaaem(f_prior_h5, file_gex=None, stmfiles=None, N=0, doMakePriorCopy=True, im=1, id=1, im_height=0, Nhank=280, Nfreq=12, is_log=False, parallel=True, force_replace=False, **kwargs):
     """
     Generate prior data for the GA-AEM method.
@@ -1211,6 +1211,10 @@ def prior_data_gaaem(f_prior_h5, file_gex=None, stmfiles=None, N=0, doMakePriorC
         Ncpu : int, optional
             Number of CPUs to use for parallel processing. Default is 0, which
             uses all available CPUs. Only used when parallel=True.
+        force_replace : bool, optional
+            If True, delete an existing /D{id} dataset before writing.
+            If False (default), print a warning and return early if the
+            dataset already exists.
         showInfo : int, optional
             Level of verbosity for output (0=silent, 1=normal, 2=verbose).
@@ -1440,6 +1444,12 @@ def prior_data_gaaem(f_prior_h5, file_gex=None, stmfiles=None, N=0, doMakePriorC
     # Write D to f_prior['/D1']
     with h5py.File(f_prior_data_h5, 'a') as f_prior:
+        if Dname in f_prior:
+            if force_replace:
+                del f_prior[Dname]
+            else:
+                print("Key '%s' already exists in %s. Use force_replace=True to overwrite." % (Dname, f_prior_data_h5))
+                return f_prior_data_h5
         f_prior[Dname] = D
         # Add method, type, file_ex, and im as attributes to '/D1'

{integrate_module-0.96.0 → integrate_module-0.97.0}/integrate/integrate_io.py RENAMED Viewed

@@ -538,6 +538,10 @@ def save_prior_data(f_prior_h5, D_new, id=None, force_delete=False,
                 print("Key '%s' already exists. Use force_delete=True to overwrite." % key)
                 return False
+        # Ensure 2D array: reshape 1D (N,) to (N, 1)
+        if D_new.ndim == 1:
+            D_new = D_new.reshape(-1, 1)
         # Write the new data
         # Convert to 32-bit float for better memory efficiency if the data is floating point
         if np.issubdtype(D_new.dtype, np.floating):
@@ -2415,6 +2419,93 @@ def copy_prior(input_filename, output_filename, idx=None, N_use=None, loadtomem=
     return output_filename
+def filter_prior(f_prior_h5, type='nonnegative_data', id=1,
+                 f_prior_filtered_h5='', **kwargs):
+    """
+    Filter prior realizations and write the result to a new HDF5 file.
+    Removes rows (realizations) from all M and D datasets in a prior file
+    based on a criterion evaluated on a chosen D dataset. The filtered file
+    is a complete, self-consistent prior that can be used directly in place
+    of the original.
+    Parameters
+    ----------
+    f_prior_h5 : str
+        Path to the input prior HDF5 file.
+    type : str, optional
+        Filter criterion to apply. Supported values:
+        ``'nonnegative_data'``
+            Keep only realizations where every value in ``/D{id}`` is >= 0.
+            Useful after forward modelling to remove unphysical responses.
+        Default is ``'nonnegative_data'``.
+    id : int, optional
+        Index of the D dataset used for filtering (e.g. ``id=1`` uses ``/D1``).
+        Default is 1.
+    f_prior_filtered_h5 : str, optional
+        Output filename. If empty, auto-generates as
+        ``<stem>_filtered_<type>.h5``. Default is ``''``.
+    **kwargs
+        showInfo : int, optional
+            Verbosity level (default 0). Passed through to ``copy_prior``.
+    Returns
+    -------
+    str
+        Path to the filtered output HDF5 file.
+    Raises
+    ------
+    KeyError
+        If ``/D{id}`` is not found in the input file.
+    ValueError
+        If an unknown ``type`` is specified.
+    Examples
+    --------
+    >>> f_prior_filtered = ig.filter_prior(f_prior_h5, type='nonnegative_data', id=1)
+    Notes
+    -----
+    Filtering is delegated to ``copy_prior``, which preserves all dataset
+    attributes and applies compression. New filter types can be added by
+    extending the ``if/elif`` block that computes ``idx``.
+    """
+    import numpy as np
+    import os
+    showInfo = kwargs.get('showInfo', 0)
+    if not f_prior_filtered_h5:
+        stem = os.path.splitext(f_prior_h5)[0]
+        f_prior_filtered_h5 = '%s_filtered_%s.h5' % (stem, type)
+    Dname = '/D%d' % id
+    with h5py.File(f_prior_h5, 'r') as f:
+        if Dname not in f:
+            raise KeyError("Dataset '%s' not found in %s" % (Dname, f_prior_h5))
+        D = f[Dname][:]
+    if type == 'nonnegative_data':
+        idx = np.where(np.all(D >= 0, axis=1))[0]
+    else:
+        raise ValueError("Unknown filter type: '%s'" % type)
+    N_in = D.shape[0]
+    N_out = len(idx)
+    if showInfo >= 0:
+        print("filter_prior [%s on %s]: keeping %d / %d realizations (%.1f%%)"
+              % (type, Dname, N_out, N_in, 100.0 * N_out / N_in))
+    copy_prior(f_prior_h5, f_prior_filtered_h5, idx=idx, **kwargs)
+    return f_prior_filtered_h5
 def hdf5_scan(file_path):
     """
     Scans an HDF5 file and prints information about datasets (including their size) and attributes.
@@ -2499,13 +2590,15 @@ def download_file(url, download_dir, use_checksum=False, **kwargs):
         return
     # Download and save the file
-    print(f'Downloading {file_name}')
+    if showInfo>0:
+        print(f'Downloading {file_name}')
     response = requests.get(url)
     response.raise_for_status()  # Check if the request was successful
     with open(file_path, 'wb') as file:
         file.write(response.content)
-    print(f'Downloaded {file_name}')
+    if showInfo>-1:
+        print(f'Downloaded {file_name}')
     # Check if checksum verification is enabled
     if use_checksum:
@@ -2787,6 +2880,31 @@ def get_case_data(case='DAUGAARD', loadAll=False, loadType='', filelist=None, **
         if loadAll:
             filelist.append('haderup_N1000000_dmax90_dz1.h5')
+    elif case=='SOENDER_FELDING':
+        filelist.append('README_SOENDER_FELDING')
+        filelist.append('TX07_20240802_2x4_RC20-39.gex')
+        filelist.append('TX07_20240802_2x4_RC20-39_eksternGPS.gex')
+        filelist.append('TX07_20240912_2x4_RC20-39_eksterngps.gex')
+        filelist.append('TX07_20241014_2x4_RC20_33_and_57_EksternGPS.gex')
+        filelist.append('TX07_20241202_2x4_RC20_57.gex')
+        filelist.append('TX07_20241202_2x4_RC20_57_EksternGPS.gex')
+        filelist.append('20240819_AVG_export.xyz')
+        filelist.append('20240820_AVG_export.xyz')
+        filelist.append('20240821_AVG_export.xyz')
+        filelist.append('20240911_AVG_export.xyz')
+        filelist.append('20240924_AVG_export.xyz')
+        filelist.append('20240924_test_AVG_export.xyz')
+        filelist.append('20241007_AVG_export.xyz')
+        filelist.append('20241008_AVG_export.xyz')
+        filelist.append('20241029_AVG_export.xyz')
+        filelist.append('20240911_eksterngps_AVG_export.xyz')
+        filelist.append('20241210_AVG_export.xyz')
+        filelist.append('20241210_InternGPS_AVG_export.xyz')
+        filelist.append('Sdr_Felding_prior_standard_N1000000_dmax90_20260417_0929.h5')
+        filelist.append('SdrFelding_boreholes.json')
     else:
         filelist = []
@@ -2798,10 +2916,9 @@ def get_case_data(case='DAUGAARD', loadAll=False, loadType='', filelist=None, **
     urlErda = 'https://anon.erda.au.dk/share_redirect/dxOLKDtoul'
     urlErdaCase = '%s/%s' % (urlErda,case)
-    for remotefile in filelist:
-        #print(remotefile)
+    from tqdm import tqdm
+    for remotefile in tqdm(filelist, desc='Downloading %s' % case):
         remoteurl = '%s/%s' % (urlErdaCase,remotefile)
-        #remoteurl = 'https://anon.erda.au.dk/share_redirect/dxOLKDtoul/%s/%s' % (case,remotefile)
         download_file(remoteurl,'.',showInfo=showInfo)
     if showInfo>-1:
         print('--> Got data for case: %s' % case)
@@ -3051,7 +3168,7 @@ def save_data_gaussian(D_obs, D_std = [], d_std=[], Cd=[], id=1, id_prior=None,
     return f_data_h5
-def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None, nan_value=None, showInfo=0, disregardFullNan=True):
+def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None, nan_value=None, showInfo=0, disregardFullNan=True, data_obs=None, data_std=None):
     """
     Convert Aarhus Workbench XYZ export file(s) to an INTEGRATE HDF5 data file.
@@ -3094,6 +3211,15 @@ def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None
     disregardFullNan : bool, optional
         If True (default), soundings where all gates are NaN are excluded
         from the output HDF5 file.
+    data_obs : list of str, optional
+        Flightlines column names (case-insensitive) to write as additional
+        data blocks.  The first entry becomes ``/D2/d_obs``, the second
+        ``/D3/d_obs``, and so on.  Example: ``['RX_ALTITUDE', 'TX_ALTITUDE']``.
+    data_std : list of str or None, optional
+        Flightlines column names for the corresponding standard deviations,
+        same length as ``data_obs``.  Use ``None`` for an individual entry to
+        fall back to ``0.05 * |d_obs|`` for that column.  If the whole
+        parameter is omitted, all columns default to ``0.05 * |d_obs|``.
     Returns
     -------
@@ -3182,6 +3308,10 @@ def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None
     ld = {k: pd.concat([xyz.layer_data[k] for xyz in xyz_list], ignore_index=True)
           for k in xyz_list[0].layer_data}
+    # Handle XYZ files that use 'x'/'y' instead of 'utmx'/'utmy'
+    if 'utmx' not in fl.columns and 'x' in fl.columns:
+        fl = fl.rename(columns={'x': 'utmx', 'y': 'utmy'})
     # Determine dummy/missing value: explicit arg > XYZ header > fallback 9999
     if nan_value is None:
         nan_value = xyz_list[0].model_info.get('dummy', 9999)
@@ -3245,17 +3375,17 @@ def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None
                 d_std[:, n_lm + j_arr[0]] = d_std_high
     # --- exclude all-NaN soundings ---
-    if disregardFullNan:
-        keep = ~np.all(np.isnan(d_obs), axis=1)
+    keep = ~np.all(np.isnan(d_obs), axis=1) if disregardFullNan else np.ones(len(d_obs), dtype=bool)
+    if disregardFullNan and showInfo >= 1:
         n_removed = np.sum(~keep)
-        if showInfo >= 1 and n_removed > 0:
+        if n_removed > 0:
             print('Removed %d all-NaN soundings (%d remaining)' % (n_removed, np.sum(keep)))
-        d_obs     = d_obs[keep]
-        d_std     = d_std[keep]
-        UTMX      = UTMX[keep]
-        UTMY      = UTMY[keep]
-        LINE      = LINE[keep]
-        ELEVATION = ELEVATION[keep]
+    d_obs     = d_obs[keep]
+    d_std     = d_std[keep]
+    UTMX      = UTMX[keep]
+    UTMY      = UTMY[keep]
+    LINE      = LINE[keep]
+    ELEVATION = ELEVATION[keep]
     # --- write HDF5 ---
     save_data_gaussian(
@@ -3273,10 +3403,29 @@ def xyz_to_h5(file_xyz, file_gex, f_data_h5=None, i_lm_skip=None, i_hm_skip=None
         if n_channels >= 2:
             hf.create_dataset('/D1/i_hm', data=np.arange(i_hm_start, i_hm_end))
+    # --- write additional data columns as D2, D3, ... ---
+    if data_obs is not None:
+        _data_std = data_std if data_std is not None else [None] * len(data_obs)
+        for i, col_obs in enumerate(data_obs):
+            obs = fl[col_obs.lower()].values[ch1_pos][keep].reshape(-1, 1).astype(float)
+            std_col = _data_std[i]
+            if std_col is not None:
+                std = fl[std_col.lower()].values[ch1_pos][keep].reshape(-1, 1).astype(float)
+            else:
+                std = 0.05 * np.abs(obs)
+            save_data_gaussian(
+                obs, D_std=std,
+                f_data_h5=f_data_h5,
+                id=i + 2,
+                name=col_obs,
+                delete_if_exist=False,
+                showInfo=showInfo,
+            )
     return f_data_h5
-def save_data_multinomial(D_obs, i_use=None, id=[],  id_prior=None, f_data_h5='data.h5', compression=None, compression_opts=None, **kwargs):
+def save_data_multinomial(D_obs, i_use=None, id=[],  id_prior=None, f_data_h5='data.h5', name=None, compression=None, compression_opts=None, **kwargs):
     """
     Save observed data to an HDF5 file in a specified group with a multinomial noise model.
@@ -3288,6 +3437,9 @@ def save_data_multinomial(D_obs, i_use=None, id=[],  id_prior=None, f_data_h5='d
     :type id_prior: int, optional
     :param f_data_h5: The path to the HDF5 file where the data will be written. Default is 'data.h5'.
     :type f_data_h5: str, optional
+    :param name: Optional human-readable name for this dataset (e.g. 'Lithology'). Stored as
+        the ``name`` attribute on the HDF5 group and used by plotting routines for titles.
+    :type name: str, optional
     :param kwargs: Additional keyword arguments.
     :return: The path to the HDF5 file where the data was written.
     :rtype: str
@@ -3374,7 +3526,9 @@ def save_data_multinomial(D_obs, i_use=None, id=[],  id_prior=None, f_data_h5='d
         # write attribute noise_model as 'multinomial'
         f['/%s/' % D_str].attrs['noise_model'] = 'multinomial'
+        if name is not None:
+            f['/%s/' % D_str].attrs['name'] = name
     return id, f_data_h5
@@ -3561,20 +3715,30 @@ def merge_data(f_data, f_gex='', delta_line=0, f_data_merged_h5='', **kwargs):
         X, Y, LINE, ELEVATION = ig.get_geometry(f_data_h5)
         D = ig.load_data(f_data_h5, showInfo=showInfo)
-        # append data
-        Xc = np.append(Xc, X)
-        Yc = np.append(Yc, Y)
-        LINEc = np.append(LINEc, LINE+i*delta_line)
-        ELEVATIONc = np.append(ELEVATIONc, ELEVATION)
+        # attempt data merge before touching geometry
+        merge_ok = True
+        d_obs_new = list(d_obs_c)
+        d_std_new = list(d_std_c)
         for id in range(len(d_obs_c)):
-            #print(id)
             try:
-                d_obs_c[id] = np.vstack((d_obs_c[id], np.atleast_2d(D['d_obs'][id])))
-                d_std_c[id] = np.vstack((d_std_c[id], np.atleast_2d(D['d_std'][id])))
+                d_obs_new[id] = np.vstack((d_obs_c[id], np.atleast_2d(D['d_obs'][id])))
+                d_std_new[id] = np.vstack((d_std_c[id], np.atleast_2d(D['d_std'][id])))
             except:
+                merge_ok = False
                 if showInfo>-1:
                     print("!!!!! Could not merge %s" % f_data_h5)
+                break
+        if not merge_ok:
+            continue
+        # only append geometry when data merged successfully
+        d_obs_c = d_obs_new
+        d_std_c = d_std_new
+        Xc = np.append(Xc, X)
+        Yc = np.append(Yc, Y)
+        LINEc = np.append(LINEc, LINE+i*delta_line)
+        ELEVATIONc = np.append(ELEVATIONc, ELEVATION)
     Xc = np.atleast_2d(Xc).T
     Yc = np.atleast_2d(Yc).T

{integrate_module-0.96.0 → integrate_module-0.97.0}/integrate/integrate_plot.py RENAMED Viewed

@@ -2765,6 +2765,8 @@ def plot_data(f_data_h5, i_plot=[], Dkey=[], plType='imshow', uselog=True, **kwa
     import matplotlib
     import h5py
+    showInfo = kwargs.get('showInfo', -1)
     # Check if the data file f_data_h5 exists
     if not os.path.exists(f_data_h5):
         print("plot_data: File %s does not exist" % f_data_h5)
@@ -2778,11 +2780,13 @@ def plot_data(f_data_h5, i_plot=[], Dkey=[], plType='imshow', uselog=True, **kwa
             Dkeys = []
             for key in f_data.keys():
                 if key[0]=='D':
-                    print("plot_data: Found data set %s" % key)
+                    if showInfo>0:
+                        print("plot_data: Found data set %s" % key)
                     Dkeys.append(key)
                 nd += 1
             Dkey=Dkeys[0]
-            print("plot_data: Using data set %s" % Dkey)
+            if showInfo>0:
+                print("plot_data: Using data set %s" % Dkey)
         noise_model = f_data['/%s' % Dkey].attrs['noise_model']
@@ -2810,9 +2814,6 @@ def plot_data(f_data_h5, i_plot=[], Dkey=[], plType='imshow', uselog=True, **kwa
             # remove all values in i_plot that are smaller than 0
             i_plot = i_plot[i_plot>=0]
-            # reaplce values larger than 1 with nan in d_std
-            d_std[d_std>1] = np.nan
             # find number of nan values on d_obs
             non_nan = np.sum(~np.isnan(d_obs), axis=1)
@@ -2981,6 +2982,12 @@ def plot_data_prior(f_prior_data_h5,
         obs_data = None
         is_1d = False
+        # Read name attribute from observed data group
+        dh5_str_name = 'D%d' % id_data
+        name_attr = f_data[dh5_str_name].attrs.get('name', None) if dh5_str_name in f_data else None
+        if isinstance(name_attr, bytes):
+            name_attr = name_attr.decode('utf-8')
         # Load prior data
         dh5_str_prior = 'D%d' % (id)
         if dh5_str_prior in f_prior_data:
@@ -3031,7 +3038,8 @@ def plot_data_prior(f_prior_data_h5,
         plt.xlabel('Data Value')
         plt.ylabel('Probability Density')
         plt.legend()
-        plt.title('Prior data vs Observed data (1D Histogram)')
+        name_suffix = ': %s' % name_attr if name_attr else ''
+        plt.title('D%d%s: Prior vs Observed (1D Histogram)' % (id_data, name_suffix))
     else:
         # Original 2D line plot
         if prior_data is not None:
@@ -3044,7 +3052,8 @@ def plot_data_prior(f_prior_data_h5,
         plt.xlabel('Data #')
         plt.ylabel('Data Value')
-        plt.title('Prior data (black) and observed data (red)')
+        name_suffix = ': %s' % name_attr if name_attr else ''
+        plt.title('D%d%s: Prior (black) vs Observed (red)' % (id_data, name_suffix))
     if ylim is not None:
         if is_1d:
@@ -4763,6 +4772,5 @@ def plot_boreholes(W, f_prior_h5=None, Mstr='/M2', hardcopy=False, **kwargs):
         if showInfo >= 0:
             print(f'plot_boreholes: saved {out}')
-    plt.show()
     return fig

{integrate_module-0.96.0 → integrate_module-0.97.0}/integrate/integrate_rejection.py RENAMED Viewed

@@ -31,7 +31,7 @@ def integrate_rejection(f_prior_h5='prior.h5',
                               N_use=100000000000,
                               id_use=[],
                               ip_range=[],
-                              nr=400,
+                              nr=1000,
                               autoT=1,
                               T_base = 1,
                               Nchunks=0,
@@ -73,7 +73,7 @@ def integrate_rejection(f_prior_h5='prior.h5',
         Default is empty list.
     nr : int, optional
         Number of posterior samples to retain per data point.
-        Default is 400.
+        Default is 1000.
     autoT : int, optional
         Automatic temperature estimation method (1=enabled, 0=disabled).
         Default is 1.
@@ -413,7 +413,7 @@ def integrate_rejection_range(D,
                               N_use=None,
                               id_use=[],
                               ip_range=[],
-                              nr=400,
+                              nr=1000,
                               autoT=1,
                               T_base = 1,
                               T_N_above=10,
@@ -448,7 +448,7 @@ def integrate_rejection_range(D,
         Default is empty list.
     nr : int, optional
         Number of posterior samples to retain per data point.
-        Default is 400.
+        Default is 1000.
     autoT : int, optional
         Automatic temperature estimation method (1=enabled, 0=disabled).
         Default is 1.
@@ -1769,4 +1769,4 @@ def compute_hypothesis_probability(f_post_h5_list, **kwargs):
                 print(f"  - Hypothesis {i+1}: mean P = {np.nanmean(P[:, i]):.4f}, "
                       f"median P = {np.nanmedian(P[:, i]):.4f}")
-    return P, mode, entropy_values
+    return P, mode, entropy_values

{integrate_module-0.96.0 → integrate_module-0.97.0/integrate_module.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: integrate_module
-Version: 0.96.0
+Version: 0.97.0
 Summary: Localized probabilistic data integration
 Author-email: Thomas Mejer Hansen <tmeha@geo.au.dk>
 License: MIT

{integrate_module-0.96.0 → integrate_module-0.97.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "integrate_module"
-version = "0.96.0"
+version = "0.97.0"
 description = "Localized probabilistic data integration"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"