ONE-api 3.0b0.tar.gz → 3.0b3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {one_api-3.0b0 → one_api-3.0b3/ONE_api.egg-info}/PKG-INFO +2 -2
- {one_api-3.0b0 → one_api-3.0b3}/ONE_api.egg-info/requires.txt +1 -1
- {one_api-3.0b0/ONE_api.egg-info → one_api-3.0b3}/PKG-INFO +2 -2
- {one_api-3.0b0 → one_api-3.0b3}/one/__init__.py +1 -1
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/__init__.py +1 -1
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/cache.py +164 -17
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/exceptions.py +15 -5
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/io.py +44 -42
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/path.py +58 -25
- {one_api-3.0b0 → one_api-3.0b3}/one/alf/spec.py +43 -20
- {one_api-3.0b0 → one_api-3.0b3}/one/api.py +178 -304
- {one_api-3.0b0 → one_api-3.0b3}/one/converters.py +43 -34
- {one_api-3.0b0 → one_api-3.0b3}/one/params.py +29 -19
- {one_api-3.0b0 → one_api-3.0b3}/one/registration.py +29 -28
- {one_api-3.0b0 → one_api-3.0b3}/one/remote/aws.py +21 -19
- {one_api-3.0b0 → one_api-3.0b3}/one/remote/base.py +9 -6
- {one_api-3.0b0 → one_api-3.0b3}/one/remote/globus.py +57 -55
- {one_api-3.0b0 → one_api-3.0b3}/one/util.py +36 -39
- {one_api-3.0b0 → one_api-3.0b3}/one/webclient.py +102 -70
- {one_api-3.0b0 → one_api-3.0b3}/pyproject.toml +11 -0
- {one_api-3.0b0 → one_api-3.0b3}/requirements.txt +1 -1
- {one_api-3.0b0 → one_api-3.0b3}/LICENSE +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/MANIFEST.in +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/ONE_api.egg-info/SOURCES.txt +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/ONE_api.egg-info/dependency_links.txt +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/ONE_api.egg-info/top_level.txt +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/README.md +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/remote/__init__.py +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/datasets.pqt +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/params/.caches +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/params/.test.alyx.internationalbrainlab.org +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/1f187d80fd59677b395fcdb18e68e4401bfa1cc9 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/3f51aa2e0baa42438467906f56a457c91a221898 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/47893cf67c985e6361cdee009334963f49fb0746 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/535d0e9a1e2c1efbdeba0d673b131e00361a2edb +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/5618bea3484a52cd893616f07903f0e49e023ba1 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/6dc96f7e9bcc6ac2e7581489b9580a6cd3f28293 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/db1731fb8df0208944ae85f76718430813a8bf50 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/dcce48259bb929661f60a02a48563f70aa6185b3 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/rest_responses/f530d6022f61cdc9e38cc66beb3cb71f3003c9a1 +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/sessions.pqt +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/test_dbs.json +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/one/tests/fixtures/test_img.png +0 -0
- {one_api-3.0b0 → one_api-3.0b3}/setup.cfg +0 -0
{one_api-3.0b0 → one_api-3.0b3/ONE_api.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: ONE-api
-Version: 3.0b0
+Version: 3.0b3
 Summary: Open Neurophysiology Environment
 Author: IBL Staff
 License: MIT
@@ -12,7 +12,7 @@ Project-URL: Changelog, https://github.com/int-brain-lab/ONE/blob/main/CHANGELOG
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
+Requires-Dist: ruff
 Requires-Dist: numpy>=1.18
 Requires-Dist: pandas>=1.5.0
 Requires-Dist: tqdm>=4.32.1

{one_api-3.0b0/ONE_api.egg-info → one_api-3.0b3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: ONE-api
-Version: 3.0b0
+Version: 3.0b3
 Summary: Open Neurophysiology Environment
 Author: IBL Staff
 License: MIT
@@ -12,7 +12,7 @@ Project-URL: Changelog, https://github.com/int-brain-lab/ONE/blob/main/CHANGELOG
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
+Requires-Dist: ruff
 Requires-Dist: numpy>=1.18
 Requires-Dist: pandas>=1.5.0
 Requires-Dist: tqdm>=4.32.1
{one_api-3.0b0 → one_api-3.0b3}/one/__init__.py

@@ -1,2 +1,2 @@
 """The Open Neurophysiology Environment (ONE) API."""
-__version__ = '3.0b0'
+__version__ = '3.0b3'
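The package version string lives in one/__init__.py, so a one-line check confirms which release is installed; this is only an illustrative snippet, assuming the package is importable as `one`:

    import one

    # The hunk above bumps the version string to '3.0b3'
    print(one.__version__)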
{one_api-3.0b0 → one_api-3.0b3}/one/alf/__init__.py

@@ -1 +1 @@
-"""Constructing, parsing, validating and loading ALyx Files (ALF)"""
+"""Constructing, parsing, validating and loading ALyx Files (ALF)."""
{one_api-3.0b0 → one_api-3.0b3}/one/alf/cache.py

@@ -9,6 +9,7 @@ Examples
 >>> cache_dir = 'path/to/data'
 >>> make_parquet_db(cache_dir)
 >>> one = One(cache_dir=cache_dir)
+
 """

 # -------------------------------------------------------------------------------------------------
@@ -25,6 +26,7 @@ import logging
 import pandas as pd
 import numpy as np
 from packaging import version
+from iblutil.util import Bunch
 from iblutil.io import parquet
 from iblutil.io.hashfile import md5

@@ -32,8 +34,9 @@ from one.alf.spec import QC, is_uuid_string
 from one.alf.io import iter_sessions
 from one.alf.path import session_path_parts, get_alf_path

-__all__ = [
-
+__all__ = [
+    'make_parquet_db', 'patch_tables', 'merge_tables', 'QC_TYPE', 'remove_table_files',
+    'remove_missing_datasets', 'load_tables', 'EMPTY_DATASETS_FRAME', 'EMPTY_SESSIONS_FRAME']
 _logger = logging.getLogger(__name__)

 # -------------------------------------------------------------------------------------------------
@@ -107,6 +110,7 @@ def _get_session_info(rel_ses_path):
         The task protocol (empty str).
     str
         The associated project (empty str).
+
     """
     lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
     eid = _ses_str_id(rel_ses_path)
@@ -142,6 +146,7 @@ def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
         Whether the file exists.
     str
         The QC value for the dataset ('NOT_SET').
+
     """
     rel_dset_path = get_alf_path(dset_path.relative_to_session())
     ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
@@ -188,13 +193,13 @@ def _ids_to_uuid(df_ses, df_dsets):
 # -------------------------------------------------------------------------------------------------

 def _metadata(origin):
-    """
-    Metadata dictionary for Parquet files.
+    """Metadata dictionary for Parquet files.

     Parameters
     ----------
     origin : str, pathlib.Path
         Path to full directory, or computer name / db name.
+
     """
     return {
         'date_created': datetime.datetime.now().isoformat(sep=' ', timespec='minutes'),
@@ -203,8 +208,7 @@ def _metadata(origin):


 def _make_sessions_df(root_dir) -> pd.DataFrame:
-    """
-    Given a root directory, recursively finds all sessions and returns a sessions DataFrame.
+    """Given a root directory, recursively finds all sessions and returns a sessions DataFrame.

     Parameters
     ----------
@@ -215,6 +219,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:
     -------
     pandas.DataFrame
         A pandas DataFrame of session info.
+
     """
     rows = []
     for full_path in iter_sessions(root_dir):
@@ -229,8 +234,7 @@ def _make_sessions_df(root_dir) -> pd.DataFrame:


 def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
-    """
-    Given a root directory, recursively finds all datasets and returns a datasets DataFrame.
+    """Given a root directory, recursively finds all datasets and returns a datasets DataFrame.

     Parameters
     ----------
@@ -243,6 +247,7 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
     -------
     pandas.DataFrame
         A pandas DataFrame of dataset info.
+
     """
     # Go through sessions and append datasets
     rows = []
@@ -255,8 +260,7 @@ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:


 def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
-    """
-    Given a data directory, index the ALF datasets and save the generated cache tables.
+    """Given a data directory, index the ALF datasets and save the generated cache tables.

     Parameters
     ----------
@@ -281,6 +285,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab
         The full path of the saved sessions parquet table.
     pathlib.Path
         The full path of the saved datasets parquet table.
+
     """
     root_dir = Path(root_dir).resolve()

@@ -323,8 +328,7 @@ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab


 def cast_index_object(df: pd.DataFrame, dtype: type = uuid.UUID) -> pd.Index:
-    """
-    Cast the index object to the specified dtype.
+    """Cast the index object to the specified dtype.

     NB: The data frame index will remain as 'object', however the underlying object type will be
     modified.
@@ -340,6 +344,7 @@ def cast_index_object(df: pd.DataFrame, dtype: type = uuid.UUID) -> pd.Index:
     -------
     pandas.DataFrame
         An updated data frame with a new index data type.
+
     """
     if isinstance(df.index, pd.MultiIndex):
         # df.index = df.index.map(lambda x: tuple(map(UUID, x)))
@@ -353,9 +358,148 @@ def cast_index_object(df: pd.DataFrame, dtype: type = uuid.UUID) -> pd.Index:
     return df


-def
+def load_tables(tables_dir, glob_pattern='*.pqt'):
+    """Load parquet cache files from a local directory.
+
+    Parameters
+    ----------
+    tables_dir : str, pathlib.Path
+        The directory location of the parquet files.
+    glob_pattern : str
+        A glob pattern to match the cache files.
+
+
+    Returns
+    -------
+    Bunch
+        A Bunch object containing the loaded cache tables and associated metadata.
+
     """
-
+    meta = {
+        'expired': False,
+        'created_time': None,
+        'loaded_time': None,
+        'modified_time': None,
+        'saved_time': None,
+        'raw': {}
+    }
+    caches = Bunch({
+        'datasets': EMPTY_DATASETS_FRAME.copy(),
+        'sessions': EMPTY_SESSIONS_FRAME.copy(),
+        '_meta': meta})
+    INDEX_KEY = '.?id'
+    for cache_file in Path(tables_dir).glob(glob_pattern):
+        table = cache_file.stem
+        # we need to keep this part fast enough for transient objects
+        cache, meta['raw'][table] = parquet.load(cache_file)
+        if 'date_created' not in meta['raw'][table]:
+            _logger.warning(f"{cache_file} does not appear to be a valid table. Skipping")
+            continue
+        meta['loaded_time'] = datetime.datetime.now()
+
+        # Set the appropriate index if none already set
+        if isinstance(cache.index, pd.RangeIndex):
+            idx_columns = sorted(cache.filter(regex=INDEX_KEY).columns)
+            if len(idx_columns) == 0:
+                raise KeyError('Failed to set index')
+            cache.set_index(idx_columns, inplace=True)
+
+        # Patch older tables
+        cache = patch_tables(cache, meta['raw'][table].get('min_api_version'), table)
+
+        # Cast indices to UUID
+        cache = cast_index_object(cache, uuid.UUID)
+
+        # Check sorted
+        # Sorting makes MultiIndex indexing O(N) -> O(1)
+        if not cache.index.is_monotonic_increasing:
+            cache.sort_index(inplace=True)
+
+        caches[table] = cache
+
+    created = [datetime.datetime.fromisoformat(x['date_created'])
+               for x in meta['raw'].values() if 'date_created' in x]
+    if created:
+        meta['created_time'] = min(created)
+    return caches
+
+
+def merge_tables(cache, strict=False, **kwargs):
+    """Update the cache tables with new records.
+
+    Parameters
+    ----------
+    dict
+        A map of cache tables to update.
+    strict : bool
+        If not True, the columns don't need to match. Extra columns in input tables are
+        dropped and missing columns are added and filled with np.nan.
+    kwargs
+        pandas.DataFrame or pandas.Series to insert/update for each table.
+
+    Returns
+    -------
+    datetime.datetime:
+        A timestamp of when the cache was updated.
+
+    Example
+    -------
+    >>> session, datasets = ses2records(self.get_details(eid, full=True))
+    ... self._update_cache_from_records(sessions=session, datasets=datasets)
+
+    Raises
+    ------
+    AssertionError
+        When strict is True the input columns must exactly match those oo the cache table,
+        including the order.
+    KeyError
+        One or more of the keyword arguments does not match a table in cache.
+
+    """
+    updated = None
+    for table, records in kwargs.items():
+        if records is None or records.empty:
+            continue
+        if table not in cache:
+            raise KeyError(f'Table "{table}" not in cache')
+        if isinstance(records, pd.Series):
+            records = pd.DataFrame([records])
+            records.index.set_names(cache[table].index.names, inplace=True)
+        # Drop duplicate indices
+        records = records[~records.index.duplicated(keep='first')]
+        if not strict:
+            # Deal with case where there are extra columns in the cache
+            extra_columns = list(set(cache[table].columns) - set(records.columns))
+            # Convert these columns to nullable, if required
+            cache_columns = cache[table][extra_columns]
+            cache[table][extra_columns] = cache_columns.convert_dtypes()
+            column_ids = map(list(cache[table].columns).index, extra_columns)
+            for col, n in sorted(zip(extra_columns, column_ids), key=lambda x: x[1]):
+                dtype = cache[table][col].dtype
+                nan = getattr(dtype, 'na_value', np.nan)
+                val = records.get('exists', True) if col.startswith('exists_') else nan
+                records.insert(n, col, val)
+            # Drop any extra columns in the records that aren't in cache table
+            to_drop = set(records.columns) - set(cache[table].columns)
+            records = records.drop(to_drop, axis=1)
+            records = records.reindex(columns=cache[table].columns)
+        assert set(cache[table].columns) == set(records.columns)
+        records = records.astype(cache[table].dtypes)
+        # Update existing rows
+        to_update = records.index.isin(cache[table].index)
+        cache[table].loc[records.index[to_update], :] = records[to_update]
+        # Assign new rows
+        to_assign = records[~to_update]
+        frames = [cache[table], to_assign]
+        # Concatenate and sort
+        cache[table] = pd.concat(frames).sort_index()
+        updated = datetime.datetime.now()
+    cache['_meta']['modified_time'] = updated
+    return updated
+
+
+def remove_missing_datasets(cache_dir, tables=None, remove_empty_sessions=True, dry=True):
+    """Remove dataset files and session folders that are not in the provided cache.

     NB: This *does not* remove entries from the cache tables that are missing on disk.
     Non-ALF files are not removed. Empty sessions that exist in the sessions table are not removed.
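The hunk above introduces two new public helpers in one.alf.cache: load_tables, which reads the parquet cache files from a directory into a Bunch alongside a '_meta' entry, and merge_tables, which upserts records into those tables. The following is only a rough usage sketch based on the signatures and docstrings shown in this diff (the cache directory path is hypothetical); the remaining one/alf/cache.py hunks continue below.

    from pathlib import Path
    from one.alf.cache import load_tables, merge_tables

    # Hypothetical local cache directory containing sessions.pqt and datasets.pqt
    cache_dir = Path.home() / 'Downloads' / 'ONE' / 'example_lab'
    tables = load_tables(cache_dir)  # Bunch with 'sessions', 'datasets' and '_meta'
    print(len(tables['sessions']), 'sessions; created', tables['_meta']['created_time'])

    # Upsert records into the sessions table; with strict=False, extra or missing
    # columns are reconciled against the existing cache table
    new_sessions = tables['sessions'].iloc[:1].copy()
    modified = merge_tables(tables, strict=False, sessions=new_sessions)
    print('cache modified at', modified)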
{one_api-3.0b0 → one_api-3.0b3}/one/alf/cache.py

@@ -374,13 +518,14 @@ remove_missing_datasets(cache_dir, tables=None, remove_empty_sessions=True,
     -------
     list
         A sorted list of paths to be removed.
+
     """
     cache_dir = Path(cache_dir)
     if tables is None:
         tables = {}
         for name in ('datasets', 'sessions'):
             table, m = parquet.load(cache_dir / f'{name}.pqt')
-            tables[name] =
+            tables[name] = patch_tables(table, m.get('min_api_version'), name)

     INDEX_KEY = '.?id'
     for name in tables:
@@ -429,7 +574,7 @@ remove_missing_datasets(cache_dir, tables=None, remove_empty_sessions=True,
     return sorted(to_delete)


-def remove_cache_table_files(folder, tables=('sessions', 'datasets')):
+def remove_table_files(folder, tables=('sessions', 'datasets')):
     """Delete cache tables on disk.

     Parameters
@@ -444,6 +589,7 @@ def remove_cache_table_files(folder, tables=('sessions', 'datasets')):
     -------
     list of pathlib.Path
         A list of the removed files.
+
     """
     filenames = ('cache_info.json', *(f'{t}.pqt' for t in tables))
     removed = []
@@ -478,7 +624,7 @@ def _cache_int2str(table: pd.DataFrame) -> pd.DataFrame:
     return table


-def patch_cache(table: pd.DataFrame, min_api_version=None, name=None) -> pd.DataFrame:
+def patch_tables(table: pd.DataFrame, min_api_version=None, name=None) -> pd.DataFrame:
     """Reformat older cache tables to comply with this version of ONE.

     Currently this function will 1. convert integer UUIDs to string UUIDs; 2. rename the 'project'
@@ -492,6 +638,7 @@ def patch_cache(table: pd.DataFrame, min_api_version=None, name=None) -> pd.Data
         The minimum API version supported by this cache table.
     name : {'dataset', 'session'} str
         The name of the table.
+
     """
     min_version = version.parse(min_api_version or '0.0.0')
     table = _cache_int2str(table)
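These hunks rename remove_cache_table_files to remove_table_files and patch_cache to patch_tables. A minimal sketch of the new remove_table_files name follows, using a throwaway directory with placeholder files as a stand-in for a real cache folder (not an example from the package itself):

    import tempfile
    from pathlib import Path
    from one.alf.cache import remove_table_files

    # Throwaway directory with empty placeholder files standing in for real cache tables
    folder = Path(tempfile.mkdtemp())
    for filename in ('cache_info.json', 'sessions.pqt', 'datasets.pqt'):
        (folder / filename).touch()

    removed = remove_table_files(folder)  # deletes cache_info.json plus the listed .pqt tables
    print([f.name for f in removed])

Note that patch_tables is applied automatically when older tables are read, as shown by the patch_tables call inside the load_tables hunk above.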
{one_api-3.0b0 → one_api-3.0b3}/one/alf/exceptions.py

@@ -13,7 +13,9 @@ class ALFError(Exception):
     explanation : str
         An optional, verbose but general explanation of the error class. All errors will display
         the same explanation.
+
     """
+
     explanation = ''

     def __init__(self, *args, terse=False):
@@ -33,6 +35,7 @@ class ALFError(Exception):

         >>> raise ALFError('invalid/path/one', 'invalid/path/two')
         one.alf.exceptions.ALFError: "invalid/path/one", "invalid/path/two"
+
         """
         if args:
             if len(args) == 1 and isinstance(args[0], str):
@@ -50,19 +53,22 @@ class ALFError(Exception):


 class AlyxSubjectNotFound(ALFError):
-    """'Subject not found' error"""
+    """'Subject not found' error."""
+
     explanation = 'The subject was not found in Alyx database'


 class ALFObjectNotFound(ALFError):
-    """'Object not found' error"""
+    """'Object not found' error."""
+
     explanation = ('The ALF object was not found. This may occur if the object or namespace or '
                    'incorrectly formatted e.g. the object "_ibl_trials.intervals.npy" would be '
                    'found with the filters `object="trials", namespace="ibl"`')


 class ALFMultipleObjectsFound(ALFError):
-    """'Multiple objects found' error"""
+    """'Multiple objects found' error."""
+
     explanation = ('Dataset files belonging to more than one object found. '
                    'ALF names have the pattern '
                    '(_namespace_)object.attribute(_timescale).extension, e.g. for the file '
@@ -70,7 +76,8 @@ class ALFMultipleObjectsFound(ALFError):


 class ALFMultipleCollectionsFound(ALFError):
-    """'Multiple collections found' error"""
+    """'Multiple collections found' error."""
+
     explanation = ('The matching object/file(s) belong to more than one collection. '
                    'ALF names have the pattern '
                    'collection/(_namespace_)object.attribute(_timescale).extension, e.g. for the '
@@ -78,7 +85,8 @@ class ALFMultipleCollectionsFound(ALFError):


 class ALFMultipleRevisionsFound(ALFError):
-    """'Multiple objects found' error"""
+    """'Multiple objects found' error."""
+
     explanation = ('The matching object/file(s) belong to more than one revision. '
                    'Multiple datasets in different revision folders were found with no default '
                    'specified.')
@@ -86,10 +94,12 @@ class ALFMultipleRevisionsFound(ALFError):

 class ALFWarning(Warning):
     """Cautions when loading ALF datasets."""
+
     pass


 class ALFInvalid(ALFError, ValueError):
     """ALF path invalid."""
+
     explanation = ('The file path provided is does not match the ALF path specification defined '
                    'in `one.alf.spec`.')