anemoi-datasets 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anemoi/datasets/__init__.py +3 -2
- anemoi/datasets/_version.py +2 -2
- anemoi/datasets/commands/copy.py +35 -23
- anemoi/datasets/create/__init__.py +12 -9
- anemoi/datasets/create/functions/sources/accumulations.py +9 -1
- anemoi/datasets/create/functions/sources/mars.py +74 -0
- anemoi/datasets/create/loaders.py +96 -86
- anemoi/datasets/create/statistics/__init__.py +6 -139
- anemoi/datasets/data/misc.py +3 -1
- anemoi/datasets/data/select.py +8 -4
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/METADATA +1 -1
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/RECORD +16 -16
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/WHEEL +1 -1
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/LICENSE +0 -0
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/entry_points.txt +0 -0
- {anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/top_level.txt +0 -0
anemoi/datasets/__init__.py
CHANGED

@@ -13,9 +13,10 @@ from .data import list_dataset_names
 from .data import open_dataset
 
 __all__ = [
-    "open_dataset",
-    "MissingDateError",
+    "__version__",
     "add_dataset_path",
    "add_named_dataset",
     "list_dataset_names",
+    "MissingDateError",
+    "open_dataset",
 ]
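
With this change, `__version__` becomes part of the package root's public API. A quick check, assuming an installed wheel:

import anemoi.datasets

print(anemoi.datasets.__version__)  # e.g. "0.3.9"
print(anemoi.datasets.__all__)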
anemoi/datasets/_version.py
CHANGED
anemoi/datasets/commands/copy.py
CHANGED

@@ -7,6 +7,7 @@
 
 import logging
 import os
+import shutil
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
@@ -26,54 +27,61 @@ except AttributeError:
 
 
 class S3Downloader:
-    def __init__(self, source, target, transfers, overwrite, resume, progress, **kwargs):
+    def __init__(self, source, target, transfers, overwrite, resume, verbosity, **kwargs):
         self.source = source
         self.target = target
         self.transfers = transfers
         self.overwrite = overwrite
         self.resume = resume
-        self.progress = progress
+        self.verbosity = verbosity
 
     def run(self):
+        if self.target == ".":
+            self.target = os.path.basename(self.source)
+
+        if self.overwrite and os.path.exists(self.target):
+            LOG.info(f"Deleting {self.target}")
+            shutil.rmtree(self.target)
+
         download(
             self.source + "/" if not self.source.endswith("/") else self.source,
             self.target,
             overwrite=self.overwrite,
-            resume=self.resume,
+            resume=self.resume,
+            verbosity=self.verbosity,
             threads=self.transfers,
-            show_progress=self.progress,
         )
 
 
 class S3Uploader:
-    def __init__(self, source, target, transfers, overwrite, resume, progress, **kwargs):
+    def __init__(self, source, target, transfers, overwrite, resume, verbosity, **kwargs):
         self.source = source
         self.target = target
         self.transfers = transfers
         self.overwrite = overwrite
         self.resume = resume
-        self.progress = progress
+        self.verbosity = verbosity
 
     def run(self):
         upload(
             self.source,
             self.target,
             overwrite=self.overwrite,
-            resume=self.resume,
+            resume=self.resume,
+            verbosity=self.verbosity,
             threads=self.transfers,
-            show_progress=self.progress,
         )
 
 
 class DefaultCopier:
-    def __init__(self, source, target, transfers, block_size, overwrite, resume, progress, nested, rechunk, **kwargs):
+    def __init__(self, source, target, transfers, block_size, overwrite, resume, verbosity, nested, rechunk, **kwargs):
         self.source = source
         self.target = target
         self.transfers = transfers
         self.block_size = block_size
         self.overwrite = overwrite
         self.resume = resume
-        self.progress = progress
+        self.verbosity = verbosity
         self.nested = nested
         self.rechunk = rechunk
 
@@ -86,7 +94,7 @@ class DefaultCopier:
             return zarr.storage.NestedDirectoryStore(path)
         return path
 
-    def copy_chunk(self, n, m, source, target, _copy, progress):
+    def copy_chunk(self, n, m, source, target, _copy, verbosity):
         if _copy[n:m].all():
             LOG.info(f"Skipping {n} to {m}")
             return None
@@ -106,7 +114,7 @@
             range(n, m),
             desc=f"Copying {n} to {m}",
             leave=False,
-            disable=not isatty and not progress,
+            disable=not isatty and not verbosity,
         ):
             target[i] = source[i]
 
@@ -131,7 +139,7 @@
         # raise NotImplementedError("Rechunking with multiple transfers is not implemented")
         return chunks
 
-    def copy_data(self, source, target, _copy, progress):
+    def copy_data(self, source, target, _copy, verbosity):
         LOG.info("Copying data")
         source_data = source["data"]
 
@@ -145,6 +153,7 @@
                 shape=source_data.shape,
                 chunks=self.data_chunks,
                 dtype=source_data.dtype,
+                fill_value=source_data.fill_value,
             )
         )
 
@@ -160,7 +169,7 @@
                     source_data,
                     target_data,
                     _copy,
-                    progress,
+                    verbosity,
                 )
             )
             n += self.block_size
@@ -175,7 +184,7 @@
 
         LOG.info("Copied data")
 
-    def copy_array(self, name, source, target, _copy, progress):
+    def copy_array(self, name, source, target, _copy, verbosity):
         for k, v in source.attrs.items():
             target.attrs[k] = v
 
@@ -183,14 +192,14 @@
             return
 
         if name == "data":
-            self.copy_data(source, target, _copy, progress)
+            self.copy_data(source, target, _copy, verbosity)
             return
 
         LOG.info(f"Copying {name}")
         target[name] = source[name]
         LOG.info(f"Copied {name}")
 
-    def copy_group(self, source, target, _copy, progress):
+    def copy_group(self, source, target, _copy, verbosity):
         import zarr
 
         for k, v in source.attrs.items():
@@ -203,7 +212,7 @@
                     source[name],
                     group,
                     _copy,
-                    progress,
+                    verbosity,
                 )
             else:
                 self.copy_array(
@@ -211,10 +220,10 @@
                     source,
                     target,
                     _copy,
-                    progress,
+                    verbosity,
                 )
 
-    def copy(self, source, target, progress):
+    def copy(self, source, target, verbosity):
         import zarr
 
         if "_copy" not in target:
@@ -225,7 +234,7 @@
         _copy = target["_copy"]
         _copy_np = _copy[:]
 
-        self.copy_group(source, target, _copy_np, progress)
+        self.copy_group(source, target, _copy_np, verbosity)
         del target["_copy"]
 
     def run(self):
@@ -284,7 +293,7 @@
         assert target is not None, target
 
         source = zarr.open(self._store(self.source), mode="r")
-        self.copy(source, target, self.progress)
+        self.copy(source, target, self.verbosity)
 
 
 class CopyMixin:
@@ -303,7 +312,10 @@ class CopyMixin:
         )
         command_parser.add_argument("--transfers", type=int, default=8, help="Number of parallel transfers.")
         command_parser.add_argument(
-            "--progress", action="store_true", help="Show progress bar."
+            "--verbosity",
+            type=int,
+            help="Verbosity level. 0 is silent, 1 is normal, 2 is verbose.",
+            default=1,
         )
         command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
         command_parser.add_argument(
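
The boolean progress flag is replaced throughout by an integer `verbosity` that is threaded through every copier class. A sketch of driving the copier directly, following the new `DefaultCopier` signature above; the paths, `block_size` and `rechunk` values are hypothetical, not taken from this diff:

from anemoi.datasets.commands.copy import DefaultCopier

copier = DefaultCopier(
    source="input.zarr",  # hypothetical paths
    target="output.zarr",
    transfers=8,
    block_size=100,  # assumed value; the CLI default is not shown in this diff
    overwrite=False,
    resume=True,
    verbosity=2,  # 0 is silent, 1 is normal, 2 is verbose (per the new --verbosity help)
    nested=False,
    rechunk=None,  # assumed "keep source chunking" value
)
copier.run()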
anemoi/datasets/create/__init__.py
CHANGED

@@ -97,13 +97,14 @@ class Creator:
 
         apply_patch(self.path, **kwargs)
 
-    def init_additions(self, delta=[1, 3, 6, 12, 24]):
+    def init_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
-        a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
-        a.initialise()
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.initialise()
 
         for d in delta:
             try:
@@ -112,13 +113,14 @@
             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
                 self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
-    def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24]):
+    def run_additions(self, parts=None, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
-        a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
-        a.run(parts)
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.run(parts)
 
         for d in delta:
             try:
@@ -127,13 +129,14 @@
             except TendenciesStatisticsDeltaNotMultipleOfFrequency:
                 self.print(f"Skipping delta={d} as it is not a multiple of the frequency.")
 
-    def finalise_additions(self, delta=[1, 3, 6, 12, 24]):
+    def finalise_additions(self, delta=[1, 3, 6, 12, 24], statistics=True):
         from .loaders import StatisticsAddition
         from .loaders import TendenciesStatisticsAddition
         from .loaders import TendenciesStatisticsDeltaNotMultipleOfFrequency
 
-        a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
-        a.finalise()
+        if statistics:
+            a = StatisticsAddition.from_dataset(path=self.path, print=self.print)
+            a.finalise()
 
         for d in delta:
             try:
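
The new `statistics` keyword makes it possible to (re)run only the tendency (delta) additions without recomputing the plain statistics. A hedged sketch, assuming `Creator` takes the dataset path as a `path` keyword (its constructor is not shown in this diff):

from anemoi.datasets.create import Creator

c = Creator(path="dataset.zarr")  # hypothetical
c.init_additions(delta=[6, 12], statistics=False)  # tendencies only
c.run_additions(delta=[6, 12], statistics=False)
c.finalise_additions(delta=[6, 12], statistics=False)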
anemoi/datasets/create/functions/sources/accumulations.py
CHANGED

@@ -19,6 +19,8 @@ from climetlab.utils.availability import Availability
 
 from anemoi.datasets.create.utils import to_datetime_list
 
+from .mars import use_grib_paramid
+
 LOG = logging.getLogger(__name__)
 
 
@@ -85,6 +87,7 @@ class Accumulation:
             stepType="accum",
             startStep=self.startStep,
             endStep=self.endStep,
+            check_nans=True,
         )
         self.values = None
         self.done = True
@@ -230,6 +233,7 @@ def identity(x):
 
 
 def compute_accumulations(
+    context,
     dates,
     request,
     user_accumulation_period=6,
@@ -306,7 +310,10 @@ def compute_accumulations(
     ds = cml.load_source("empty")
     for r in compressed.iterate():
         request.update(r)
+        if context.use_grib_paramid and "param" in request:
+            request = use_grib_paramid(request)
         print("🌧️", request)
+
         ds = ds + cml.load_source("mars", **request)
 
     accumulations = {}
@@ -395,7 +402,7 @@ def accumulations(context, dates, **request):
     class_ = request.get("class", "od")
     stream = request.get("stream", "oper")
 
-    user_accumulation_period = request.get("accumulation_period", 6)
+    user_accumulation_period = request.pop("accumulation_period", 6)
 
     KWARGS = {
         ("od", "oper"): dict(patch=scda),
@@ -409,6 +416,7 @@
     context.trace("🌧️", f"accumulations {request} {user_accumulation_period} {kwargs}")
 
     return compute_accumulations(
+        context,
         dates,
         request,
         user_accumulation_period=user_accumulation_period,
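
When the context requests it, `use_grib_paramid` (imported from the sibling `mars` module) rewrites the `param` entry of the request from variable names to numeric GRIB paramIds before the MARS call. Illustration only; the output shape is an assumption based on the call site above:

request = {"class": "od", "levtype": "sfc", "param": ["2t", "10u"], "date": "20240101"}

# After `request = use_grib_paramid(request)`, the request would look like:
# {"class": "od", "levtype": "sfc", "param": [167, 165], "date": "20240101"}
# (167 and 165 are the GRIB paramIds of 2t and 10u)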
anemoi/datasets/create/functions/sources/mars.py
CHANGED

@@ -9,6 +9,7 @@
 import datetime
 from copy import deepcopy
 
+from anemoi.utils.humanize import did_you_mean
 from climetlab import load_source
 from climetlab.utils.availability import Availability
 
@@ -102,6 +103,74 @@ def use_grib_paramid(r):
     return r
 
 
+MARS_KEYS = [
+    "accuracy",
+    "activity",
+    "anoffset",
+    "area",
+    "bitmap",
+    "channel",
+    "class",
+    "database",
+    "dataset",
+    "date",
+    "diagnostic",
+    "direction",
+    "domain",
+    "expect",
+    "experiment",
+    "expver",
+    "fcmonth",
+    "fcperiod",
+    "fieldset",
+    "filter",
+    "format",
+    "frame",
+    "frequency",
+    "gaussian",
+    "generation",
+    "grid",
+    "hdate",
+    "ident",
+    "instrument",
+    "interpolation",
+    "intgrid",
+    "iteration",
+    "level",
+    "levelist",
+    "levtype",
+    "method",
+    "model",
+    "month",
+    "number",
+    "obsgroup",
+    "obstype",
+    "offsetdate",
+    "offsettime",
+    "optimise",
+    "origin",
+    "packing",
+    "padding",
+    "param",
+    "quantile",
+    "realization",
+    "reference",
+    "reportype",
+    "repres",
+    "resol",
+    "resolution",
+    "rotation",
+    "step",
+    "stream",
+    "system",
+    "target",
+    "time",
+    "truncation",
+    "type",
+    "year",
+]
+
+
 def mars(context, dates, *requests, date_key="date", **kwargs):
     if not requests:
         requests = [kwargs]
@@ -117,6 +186,11 @@ def mars(context, dates, *requests, date_key="date", **kwargs):
         if DEBUG:
             context.trace("✅", f"load_source(mars, {r}")
 
+        for k, v in r.items():
+            if k not in MARS_KEYS:
+                raise ValueError(
+                    f"⚠️ Unknown key {k}={v} in MARS request. Did you mean '{did_you_mean(k, MARS_KEYS)}' ?"
+                )
         ds = ds + load_source("mars", **r)
     return ds
 
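
The new guard fails fast on a misspelt request key and suggests the closest valid one. A self-contained sketch of the same idea, using difflib as a stand-in for anemoi.utils.humanize.did_you_mean:

import difflib

MARS_KEYS_ABRIDGED = ["class", "date", "expver", "levtype", "param", "step", "stream", "time", "type"]

def check_request(r, keys=MARS_KEYS_ABRIDGED):
    for k, v in r.items():
        if k not in keys:
            guess = difflib.get_close_matches(k, keys, n=1)
            hint = f" Did you mean '{guess[0]}' ?" if guess else ""
            raise ValueError(f"Unknown key {k}={v} in MARS request.{hint}")

check_request({"class": "od", "levtype": "sfc"})  # passes silently
# check_request({"clas": "od"})  # raises: Unknown key clas=od ... Did you mean 'class' ?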
anemoi/datasets/create/loaders.py
CHANGED

@@ -46,8 +46,44 @@ LOG = logging.getLogger(__name__)
 VERSION = "0.20"
 
 
+def set_to_test_mode(cfg):
+    NUMBER_OF_DATES = 4
+
+    dates = cfg.dates
+    LOG.warn(f"Running in test mode. Changing the list of dates to use only {NUMBER_OF_DATES}.")
+    groups = Groups(**cfg.dates)
+    dates = groups.dates
+    cfg.dates = dict(
+        start=dates[0],
+        end=dates[NUMBER_OF_DATES - 1],
+        frequency=dates.frequency,
+        group_by=NUMBER_OF_DATES,
+    )
+
+    def set_element_to_test(obj):
+        if isinstance(obj, (list, tuple)):
+            for v in obj:
+                set_element_to_test(v)
+            return
+        if isinstance(obj, (dict, DictObj)):
+            if "grid" in obj:
+                previous = obj["grid"]
+                obj["grid"] = "20./20."
+                LOG.warn(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
+            if "number" in obj:
+                if isinstance(obj["number"], (list, tuple)):
+                    previous = obj["number"]
+                    obj["number"] = previous[0:3]
+                    LOG.warn(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
+            for k, v in obj.items():
+                set_element_to_test(v)
+
+    set_element_to_test(cfg)
+
+
 class GenericDatasetHandler:
     def __init__(self, *, path, print=print, **kwargs):
+
         # Catch all floating point errors, including overflow, sqrt(<0), etc
         np.seterr(all="raise", under="warn")
 
@@ -61,12 +97,15 @@ class GenericDatasetHandler:
 
     @classmethod
     def from_config(cls, *, config, path, print=print, **kwargs):
-
+        """Config is the path to the config file or a dict with the config"""
+
         assert isinstance(config, dict) or isinstance(config, str), config
         return cls(config=config, path=path, print=print, **kwargs)
 
     @classmethod
     def from_dataset_config(cls, *, path, print=print, **kwargs):
+        """Read the config saved inside the zarr dataset and instantiate the class for this config."""
+
         assert os.path.exists(path), f"Path {path} does not exist."
         z = zarr.open(path, mode="r")
         config = z.attrs["_create_yaml_config"]
@@ -75,6 +114,8 @@
 
     @classmethod
     def from_dataset(cls, *, path, **kwargs):
+        """Instanciate the class from the path to the zarr dataset, without config."""
+
         assert os.path.exists(path), f"Path {path} does not exist."
         return cls(path=path, **kwargs)
 
@@ -156,68 +197,50 @@ class Loader(DatasetHandlerWithStatistics):
 class InitialiserLoader(Loader):
     def __init__(self, config, **kwargs):
         super().__init__(**kwargs)
-        self.main_config = loader_config(config)
-
-        self.tmp_statistics.delete()
 
+        self.main_config = loader_config(config)
         if self.test:
-
-            def test_dates(cfg, n=4):
-                LOG.warn("Running in test mode. Changing the list of dates to use only 4.")
-                groups = Groups(**cfg)
-                dates = groups.dates
-                return dict(start=dates[0], end=dates[n - 1], frequency=dates.frequency, group_by=n)
-
-            self.main_config.dates = test_dates(self.main_config.dates)
-
-            def set_to_test_mode(obj):
-                if isinstance(obj, (list, tuple)):
-                    for v in obj:
-                        set_to_test_mode(v)
-                    return
-                if isinstance(obj, (dict, DictObj)):
-                    if "grid" in obj:
-                        previous = obj["grid"]
-                        obj["grid"] = "20./20."
-                        LOG.warn(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
-                    if "number" in obj:
-                        if isinstance(obj["number"], (list, tuple)):
-                            previous = obj["number"]
-                            obj["number"] = previous[0:3]
-                            LOG.warn(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
-                    for k, v in obj.items():
-                        set_to_test_mode(v)
-
             set_to_test_mode(self.main_config)
 
         LOG.info(self.main_config.dates)
 
+        self.tmp_statistics.delete()
+
         self.groups = Groups(**self.main_config.dates)
+        LOG.info(self.groups)
 
         self.output = build_output(self.main_config.output, parent=self)
         self.input = self.build_input()
-
         LOG.info(self.input)
-        all_dates = self.groups.dates
-        self.minimal_input = self.input.select([all_dates[0]])
 
-
-
+        first_date = self.groups.dates[0]
+        self.minimal_input = self.input.select([first_date])
+        LOG.info("Minimal input (using only the first date) :")
         LOG.info(self.minimal_input)
 
     def build_statistics_dates(self, start, end):
+        """Compute the start and end dates for the statistics, based on :
+        - The start and end dates in the config
+        - The default statistics dates convention
+
+        Then adapt according to the actual dates in the dataset.
+        """
+
         ds = open_dataset(self.path)
         dates = ds.dates
 
+        # if not specified, use the default statistics dates
         default_start, default_end = default_statistics_dates(dates)
         if start is None:
             start = default_start
         if end is None:
             end = default_end
 
+        # in any case, adapt to the actual dates in the dataset
         start = as_first_date(start, dates)
         end = as_last_date(end, dates)
 
+        # and convert to datetime to isoformat
         start = start.astype(datetime.datetime)
         end = end.astype(datetime.datetime)
         return (start.isoformat(), end.isoformat())
@@ -227,7 +250,10 @@ class InitialiserLoader(Loader):
         z.create_group("_build")
 
     def initialise(self, check_name=True):
-        """Create empty dataset"""
+        """Create an empty dataset of the right final shape
+
+        Read a small part of the data to get the shape of the data and the resolution and more metadata.
+        """
 
         self.print("Config loaded ok:")
         LOG.info(self.main_config)
@@ -276,11 +302,10 @@
         metadata["_create_yaml_config"] = self.main_config.get_serialisable_dict()
 
         metadata["description"] = self.main_config.description
-        metadata["version"] = VERSION
+        metadata["licence"] = self.main_config["licence"]
+        metadata["attribution"] = self.main_config["attribution"]
 
-        metadata["data_request"] = self.minimal_input.data_request
         metadata["remapping"] = self.output.remapping
-
         metadata["order_by"] = self.output.order_by_as_list
         metadata["flatten_grid"] = self.output.flatten_grid
 
@@ -288,26 +313,21 @@
         metadata["variables"] = variables
         metadata["variables_with_nans"] = variables_with_nans
         metadata["resolution"] = resolution
+
+        metadata["data_request"] = self.minimal_input.data_request
         metadata["field_shape"] = self.minimal_input.field_shape
         metadata["proj_string"] = self.minimal_input.proj_string
 
-        metadata["licence"] = self.main_config["licence"]
-        metadata["attribution"] = self.main_config["attribution"]
-
-        metadata["frequency"] = frequency
         metadata["start_date"] = dates[0].isoformat()
         metadata["end_date"] = dates[-1].isoformat()
+        metadata["frequency"] = frequency
         metadata["missing_dates"] = [_.isoformat() for _ in dates.missing]
 
+        metadata["version"] = VERSION
+
         if check_name:
             basename, ext = os.path.splitext(os.path.basename(self.path))  # noqa: F841
-            ds_name = DatasetName(
-                basename,
-                resolution,
-                dates[0],
-                dates[-1],
-                frequency,
-            )
+            ds_name = DatasetName(basename, resolution, dates[0], dates[-1], frequency)
             ds_name.raise_if_not_valid(print=self.print)
 
         if len(dates) != total_shape[0]:
@@ -316,17 +336,12 @@
                 f"does not match data shape {total_shape[0]}. {total_shape=}"
             )
 
-        dates = normalize_and_check_dates(
-            dates,
-            metadata["start_date"],
-            metadata["end_date"],
-            metadata["frequency"],
-        )
+        dates = normalize_and_check_dates(dates, metadata["start_date"], metadata["end_date"], metadata["frequency"])
 
         metadata.update(self.main_config.get("force_metadata", {}))
 
         ###############################################################
-        # write
+        # write metadata
         ###############################################################
 
        self.initialise_dataset_backend()
@@ -346,10 +361,7 @@
             self.main_config.statistics.get("start"),
             self.main_config.statistics.get("end"),
         )
-        self.update_metadata(
-            statistics_start_date=statistics_start,
-            statistics_end_date=statistics_end,
-        )
+        self.update_metadata(statistics_start_date=statistics_start, statistics_end_date=statistics_end)
         LOG.info(f"Will compute statistics from {statistics_start} to {statistics_end}")
 
         self.registry.add_to_history("init finished")
@@ -586,37 +598,22 @@ class GenericAdditions(GenericDatasetHandler):
 
     @property
     def tmp_storage_path(self):
-        raise NotImplementedError()
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()
 
     @property
     def final_storage_path(self):
-        raise NotImplementedError()
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()
 
     def initialise(self):
         self.tmp_storage.delete()
         self.tmp_storage.create()
         LOG.info(f"Dataset {self.path} additions initialized.")
 
-    @cached_property
-    def _variables_with_nans(self):
-        z = zarr.open(self.path, mode="r")
-        if "variables_with_nans" in z.attrs:
-            return z.attrs["variables_with_nans"]
-        return None
-
-    def allow_nan(self, name):
-        if self._variables_with_nans is not None:
-            return name in self._variables_with_nans
-        warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, Assuming nans allowed for {name}.")
-        return True
-
-    @classmethod
-    def _check_type_equal(cls, a, b):
-        a = list(a)
-        b = list(b)
-        a = a[0] if a else None
-        b = b[0] if b else None
-        assert type(a) is type(b), (type(a), type(b))
+    def run(self, parts):
+        """This should be implemented in the subclass."""
+        raise NotImplementedError()
 
     def finalise(self):
         shape = (len(self.dates), len(self.variables))
@@ -696,7 +693,7 @@
             variables_names=self.variables,
             has_nans=has_nans,
         )
-        LOG.info(f"Dataset {self.path} additions finalized.")
+        LOG.info(f"Dataset {self.path} additions finalised.")
         self.check_statistics()
         self._write(self.summary)
         self.tmp_storage.delete()
@@ -711,6 +708,19 @@
     def check_statistics(self):
         pass
 
+    @cached_property
+    def _variables_with_nans(self):
+        z = zarr.open(self.path, mode="r")
+        if "variables_with_nans" in z.attrs:
+            return z.attrs["variables_with_nans"]
+        return None
+
+    def allow_nan(self, name):
+        if self._variables_with_nans is not None:
+            return name in self._variables_with_nans
+        warnings.warn(f"❗Cannot find 'variables_with_nans' in {self.path}, Assuming nans allowed for {name}.")
+        return True
+
 
 class StatisticsAddition(GenericAdditions):
     def __init__(self, **kwargs):
@@ -798,7 +808,7 @@ class TendenciesStatisticsAddition(GenericAdditions):
         start = z.attrs["statistics_start_date"]
         end = z.attrs["statistics_end_date"]
         start = datetime.datetime.fromisoformat(start)
-        ds = open_dataset(self.path, start=start)
+        ds = open_dataset(self.path, start=start, end=end)
         self.dates = ds.dates
         self.total = len(self.dates)
 
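
The relocated `allow_nan` helper looks up the `variables_with_nans` attribute that the loader stores in the dataset; when the attribute is missing (datasets built by older versions), NaNs are allowed permissively with a warning. A minimal standalone sketch of that lookup:

import warnings
import zarr

def allow_nan(path, name):
    z = zarr.open(path, mode="r")
    with_nans = z.attrs.get("variables_with_nans")  # None for older datasets
    if with_nans is not None:
        return name in with_nans
    warnings.warn(f"Cannot find 'variables_with_nans' in {path}, assuming NaNs allowed for {name}.")
    return True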
anemoi/datasets/create/statistics/__init__.py
CHANGED

@@ -98,6 +98,8 @@ def check_variance(x, variables_names, minimum, maximum, mean, count, sums, squares):
 
 
 def compute_statistics(array, check_variables_names=None, allow_nan=False):
+    """Compute statistics for a given array, provides minimum, maximum, sum, squares, count and has_nans as a dictionary."""
+
     nvars = array.shape[1]
 
     LOG.info(f"Stats {nvars}, {array.shape}, {check_variables_names}")
@@ -242,10 +244,7 @@ class StatAggregator:
         offset = 0
         for _, _dates, stats in self.owner._gather_data():
             assert isinstance(stats, dict), stats
-            assert stats["minimum"].shape[0] == len(_dates), (
-                stats["minimum"].shape,
-                len(_dates),
-            )
+            assert stats["minimum"].shape[0] == len(_dates), (stats["minimum"].shape, len(_dates))
             assert stats["minimum"].shape[1] == len(self.variables_names), (
                 stats["minimum"].shape,
                 len(self.variables_names),
@@ -270,19 +269,13 @@
             for k in self.NAMES:
                 stats[k] = stats[k][bitmap]
 
-            assert stats["minimum"].shape[0] == len(dates), (
-                stats["minimum"].shape,
-                len(dates),
-            )
+            assert stats["minimum"].shape[0] == len(dates), (stats["minimum"].shape, len(dates))
 
             # store data in self
             found |= set(dates)
             for name in self.NAMES:
                 array = getattr(self, name)
-                assert stats[name].shape[0] == len(dates), (
-                    stats[name].shape,
-                    len(dates),
-                )
+                assert stats[name].shape[0] == len(dates), (stats[name].shape, len(dates))
                 array[offset : offset + len(dates)] = stats[name]
                 offset += len(dates)
 
@@ -310,133 +303,7 @@
         stdev = np.sqrt(x)
 
         for j, name in enumerate(self.variables_names):
-            check_data_values(
-                np.array(
-                    [
-                        mean[j],
-                    ]
-                ),
-                name=name,
-                allow_nan=False,
-            )
-
-        return Summary(
-            minimum=minimum,
-            maximum=maximum,
-            mean=mean,
-            count=count,
-            sums=sums,
-            squares=squares,
-            stdev=stdev,
-            variables_names=self.variables_names,
-            has_nans=has_nans,
-        )
-
-
-class SummaryAggregator:
-    NAMES = ["minimum", "maximum", "sums", "squares", "count", "has_nans"]
-
-    def __init__(self, owner, dates, variables_names, allow_nan):
-        dates = sorted(dates)
-        dates = to_datetimes(dates)
-        assert dates, "No dates selected"
-        self.owner = owner
-        self.dates = dates
-        self.variables_names = variables_names
-        self.allow_nan = allow_nan
-
-        self.shape = (len(self.dates), len(self.variables_names))
-        LOG.info(f"Aggregating statistics on shape={self.shape}. Variables : {self.variables_names}")
-
-        self.minimum = np.full(self.shape, np.nan, dtype=np.float64)
-        self.maximum = np.full(self.shape, np.nan, dtype=np.float64)
-        self.sums = np.full(self.shape, np.nan, dtype=np.float64)
-        self.squares = np.full(self.shape, np.nan, dtype=np.float64)
-        self.count = np.full(self.shape, -1, dtype=np.int64)
-        self.has_nans = np.full(self.shape, False, dtype=np.bool_)
-
-        self._read()
-
-    def _read(self):
-        def check_type(a, b):
-            a = list(a)
-            b = list(b)
-            a = a[0] if a else None
-            b = b[0] if b else None
-            assert type(a) is type(b), (type(a), type(b))
-
-        found = set()
-        offset = 0
-        for _, _dates, stats in self.owner._gather_data():
-            for n in self.NAMES:
-                assert n in stats, (n, list(stats.keys()))
-            _dates = to_datetimes(_dates)
-            check_type(_dates, self.dates)
-            if found:
-                check_type(found, self.dates)
-                assert found.isdisjoint(_dates), "Duplicate dates found in precomputed statistics"
-
-            # filter dates
-            dates = set(_dates) & set(self.dates)
-
-            if not dates:
-                # dates have been completely filtered for this chunk
-                continue
-
-            # filter data
-            bitmap = np.isin(_dates, self.dates)
-            for k in self.NAMES:
-                stats[k] = stats[k][bitmap]
-
-            assert stats["minimum"].shape[0] == len(dates), (
-                stats["minimum"].shape,
-                len(dates),
-            )
-
-            # store data in self
-            found |= set(dates)
-            for name in self.NAMES:
-                array = getattr(self, name)
-                assert stats[name].shape[0] == len(dates), (
-                    stats[name].shape,
-                    len(dates),
-                )
-                array[offset : offset + len(dates)] = stats[name]
-                offset += len(dates)
-
-        for d in self.dates:
-            assert d in found, f"Statistics for date {d} not precomputed."
-        assert len(self.dates) == len(found), "Not all dates found in precomputed statistics"
-        assert len(self.dates) == offset, "Not all dates found in precomputed statistics."
-        LOG.info(f"Statistics for {len(found)} dates found.")
-
-    def aggregate(self):
-        minimum = np.nanmin(self.minimum, axis=0)
-        maximum = np.nanmax(self.maximum, axis=0)
-        sums = np.nansum(self.sums, axis=0)
-        squares = np.nansum(self.squares, axis=0)
-        count = np.nansum(self.count, axis=0)
-        has_nans = np.any(self.has_nans, axis=0)
-        mean = sums / count
-
-        assert sums.shape == count.shape == squares.shape == mean.shape == minimum.shape == maximum.shape
-
-        x = squares / count - mean * mean
-        # remove negative variance due to numerical errors
-        # x[- 1e-15 < (x / (np.sqrt(squares / count) + np.abs(mean))) < 0] = 0
-        check_variance(x, self.variables_names, minimum, maximum, mean, count, sums, squares)
-        stdev = np.sqrt(x)
-
-        for j, name in enumerate(self.variables_names):
-            check_data_values(
-                np.array(
-                    [
-                        mean[j],
-                    ]
-                ),
-                name=name,
-                allow_nan=False,
-            )
+            check_data_values(np.array([mean[j]]), name=name, allow_nan=False)
 
         return Summary(
             minimum=minimum,
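
The surviving `StatAggregator.aggregate` path reduces per-chunk partials with mean = sums/count and var = squares/count - mean*mean, then takes the square root for the standard deviation. A standalone numpy sketch of that reduction for one variable split over three chunks (values 4,6 | 5,7 | 3,5):

import numpy as np

sums = np.array([10.0, 12.0, 8.0])     # per-chunk sum of values
squares = np.array([52.0, 74.0, 34.0]) # per-chunk sum of squared values
count = np.array([2, 2, 2])            # per-chunk number of values

n = count.sum()
mean = sums.sum() / n                  # 5.0
var = squares.sum() / n - mean * mean  # 26.666... - 25 = 1.666...
var = max(var, 0.0)                    # guard against tiny negative variance from rounding
stdev = np.sqrt(var)                   # ~1.291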
anemoi/datasets/data/misc.py
CHANGED

@@ -49,7 +49,9 @@ def load_config():
     if CONFIG is not None:
         return CONFIG
 
-    conf = os.path.expanduser("~/.anemoi.toml")
+    conf = os.path.expanduser("~/.config/anemoi/settings.toml")
+    if not os.path.exists(conf):
+        conf = os.path.expanduser("~/.anemoi.toml")
 
     if os.path.exists(conf):
 
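
The settings file is now looked up in the XDG-style location first, with the legacy dotfile as a fallback. The same order as a standalone sketch:

import os

def find_config():
    conf = os.path.expanduser("~/.config/anemoi/settings.toml")
    if not os.path.exists(conf):
        conf = os.path.expanduser("~/.anemoi.toml")  # legacy fallback
    return conf if os.path.exists(conf) else None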
anemoi/datasets/data/select.py
CHANGED

@@ -23,15 +23,19 @@ LOG = logging.getLogger(__name__)
 class Select(Forwards):
     """Select a subset of the variables."""
 
-    def __init__(self, dataset, indices,
+    def __init__(self, dataset, indices, reason):
+
+        reason = reason.copy()
+
         while isinstance(dataset, Select):
             indices = [dataset.indices[i] for i in indices]
+            reason.update(dataset.reason)
             dataset = dataset.dataset
 
         self.dataset = dataset
         self.indices = list(indices)
         assert len(self.indices) > 0
-        self.
+        self.reason = reason or {"indices": self.indices}
 
         # Forward other properties to the main dataset
         super().__init__(dataset)
@@ -86,11 +90,11 @@ class Select(Forwards):
         return Source(self, index, self.dataset.source(self.indices[index]))
 
     def tree(self):
-        return Node(self, [self.dataset.tree()], **self.
+        return Node(self, [self.dataset.tree()], **self.reason)
 
     def subclass_metadata_specific(self):
         # return dict(indices=self.indices)
-        return
+        return dict(reason=self.reason)
 
 
 class Rename(Forwards):
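
When a `Select` wraps another `Select`, the constructor composes the two index mappings so the outer object indexes the base dataset directly, and now also merges the wrapped `reason` dictionaries. The index composition in isolation:

inner = [4, 7, 9]  # base-dataset columns kept by the inner Select
outer = [0, 2]     # positions kept by the outer Select, relative to inner
flattened = [inner[i] for i in outer]
print(flattened)   # [4, 9], i.e. columns of the base dataset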
{anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: anemoi-datasets
-Version: 0.3.7
+Version: 0.3.9
 Summary: A package to hold various functions to support training of ML models on ECMWF data.
 Author-email: "European Centre for Medium-Range Weather Forecasts (ECMWF)" <software.support@ecmwf.int>
 License: Apache License
{anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/RECORD
CHANGED

@@ -1,21 +1,21 @@
-anemoi/datasets/__init__.py,sha256=
+anemoi/datasets/__init__.py,sha256=Z1gqZWhecLcT0RZQqYBLlz01MUlUZd0kWEj_RavbITM,782
 anemoi/datasets/__main__.py,sha256=cLA2PidDTOUHaDGzd0_E5iioKYNe-PSTv567Y2fuwQk,723
-anemoi/datasets/_version.py,sha256=
+anemoi/datasets/_version.py,sha256=nV2HEiFwTdaOZoFEyVxxG_D8Oq_nlSmX2vHL4jK4h6w,411
 anemoi/datasets/grids.py,sha256=3YBMMJodgYhavarXPAlMZHaMtDT9v2IbTmAXZTqf8Qo,8481
 anemoi/datasets/commands/__init__.py,sha256=qAybFZPBBQs0dyx7dZ3X5JsLpE90pwrqt1vSV7cqEIw,706
 anemoi/datasets/commands/compare.py,sha256=p2jQOAC3JhScCLF0GjTCO8goYLWLN8p7vzy_gf5fFcI,1473
-anemoi/datasets/commands/copy.py,sha256=
+anemoi/datasets/commands/copy.py,sha256=SxAeN51owyN5gwtwpt30xhJSIJRlJb9YOUt_4K4m-D8,11780
 anemoi/datasets/commands/create.py,sha256=POdOsVDlvRrHFFkI3SNXNgNIbSxkVUUPMoo660x7Ma0,987
 anemoi/datasets/commands/inspect.py,sha256=G3fzcgiLaU8jln7GKvgamN7Y06-qC_JnFw2SbNn1_E4,18646
 anemoi/datasets/commands/scan.py,sha256=HxsLdCgBMSdEXjlJfPq5M_9LxXHHQIoZ1ZEHO_AoPgA,2881
 anemoi/datasets/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 anemoi/datasets/compute/recentre.py,sha256=GRxI6rY_KyXJnZGPxU_UO9YDb-rY_raK70Fiwv1mjhs,4792
-anemoi/datasets/create/__init__.py,sha256=
+anemoi/datasets/create/__init__.py,sha256=Q8uXUdbE-SRYYaZd5cPQ2RVbSoHnGX7-eKdOJHYVhDk,5951
 anemoi/datasets/create/check.py,sha256=DLjw-eyaCNxPhoKFsP4Yn_l3SIr57YHdyPR-tE5vx80,5791
 anemoi/datasets/create/chunks.py,sha256=YEDcr0K2KiiceSTiBuZzj0TbRbzZ9J546XO7rrrTFQw,2441
 anemoi/datasets/create/config.py,sha256=uLIp1WHg3hbqwwMV9EepMwJQsXJAGImkbo0okBeEVd4,7683
 anemoi/datasets/create/input.py,sha256=3G7sqdn7R1pLBeeswXwwi8VRAHrBnjq1PdRYHJBe594,27741
-anemoi/datasets/create/loaders.py,sha256=
+anemoi/datasets/create/loaders.py,sha256=-fJ9qKjsCd8Wvnobn34WsQpE9uAjon5M4REgCpW5q_w,30594
 anemoi/datasets/create/patch.py,sha256=xjCLhvIQKRqmypsKInRU1CvFh1uoaB3YGSQP1UVZZik,3682
 anemoi/datasets/create/persistent.py,sha256=nT8gvhVPdI1H3zW_F7uViGKIlQQ94jCDrMSWTmhQ2_A,4290
 anemoi/datasets/create/size.py,sha256=A1w6RkaL0L9IlwIdmYsCTJTecmY_QtvbkGf__jvQle0,1068
@@ -31,19 +31,19 @@ anemoi/datasets/create/functions/filters/rename.py,sha256=cGoHr-IS-PhYEtZvXDpH03
 anemoi/datasets/create/functions/filters/rotate_winds.py,sha256=fUdh8ILcMzMzckGlvwzdgG-c7w5R9NnWfaijp28Bf5M,4092
 anemoi/datasets/create/functions/filters/unrotate_winds.py,sha256=nsa3EHly8ppWd2WH4ROoMczM8WFu5qKaIhO_UFcL9TY,3502
 anemoi/datasets/create/functions/sources/__init__.py,sha256=Xe9G54CKvCI3ji-7k0R5l0WZZdhlydRgawsXuBcX_hg,379
-anemoi/datasets/create/functions/sources/accumulations.py,sha256=
+anemoi/datasets/create/functions/sources/accumulations.py,sha256=klbp-akoZlOk9jByDFsgPfHRCdfLvpatTLMxDPZaNZc,12943
 anemoi/datasets/create/functions/sources/constants.py,sha256=aqquu6HDc8t-zsF9KRFLaj0eV4S0UPZ59BVna8E3bU8,785
 anemoi/datasets/create/functions/sources/empty.py,sha256=SBuAfC33imbfcRnFnnOR44y8Q3KSQcqx3juIcXfCa3c,481
 anemoi/datasets/create/functions/sources/forcings.py,sha256=EVcdu8puMSW451qj3LKCWWXaSf2LlmF8YXVs8hSMxkU,643
 anemoi/datasets/create/functions/sources/grib.py,sha256=YQNuGnlh2EYb2NIHYpzlipwUTmOhrmyQtP3zgk8MAUU,1661
 anemoi/datasets/create/functions/sources/hindcasts.py,sha256=0Psnsx2J0cRLMpJuNN-gESm1xJFC1gmQzI8sdnXCoYE,13042
-anemoi/datasets/create/functions/sources/mars.py,sha256=
+anemoi/datasets/create/functions/sources/mars.py,sha256=JWsbzyoXF95HPk2VWzmX53f_SJwXhKkaJvXtXJMGLig,5285
 anemoi/datasets/create/functions/sources/netcdf.py,sha256=kic6PH7SAK3gseXChD38IDXw6Zcg2zhF4SeDXB2LQ8Q,2084
 anemoi/datasets/create/functions/sources/opendap.py,sha256=T0CPinscfafrVLaye5ue-PbiCNbcNqf_3m6pphN9rCU,543
 anemoi/datasets/create/functions/sources/recentre.py,sha256=t07LIXG3Hp9gmPkPriILVt86TxubsHyS1EL1lzwgtXY,1810
 anemoi/datasets/create/functions/sources/source.py,sha256=hPQnV_6UIxFw97uRKcTA8TplcgG1kC8NlFHoEaaLet4,1418
 anemoi/datasets/create/functions/sources/tendencies.py,sha256=kwS_GZt8R9kpfs5RrvxPb0Gj-5nDP0sgJgfSRCAwwww,4057
-anemoi/datasets/create/statistics/__init__.py,sha256=
+anemoi/datasets/create/statistics/__init__.py,sha256=eXyOdlgXBt6QdVWM7ZVyUWdFMv6iNsFefkjvOVvZAlQ,11010
 anemoi/datasets/create/statistics/summary.py,sha256=sgmhA24y3VRyjmDUgTnPIqcHSlWBbFA0qynx6gJ9Xw8,3370
 anemoi/datasets/data/__init__.py,sha256=to9L_RZVQ4OgyHUpX6lcvt4GqJdZjBa5HCTaWx1aGKo,1046
 anemoi/datasets/data/concat.py,sha256=AkpyOs16OjW7X0cdyYFQfWSCV6dteXBp-x9WlokO-DI,3550
@@ -56,8 +56,8 @@ anemoi/datasets/data/grids.py,sha256=rooOeR6rvjl4U8B4LO3N23fcgxvGE7ZUmhVryk1QS4M
 anemoi/datasets/data/indexing.py,sha256=625m__JG5m_tDMrkz1hB6Vydenwt0oHuyAlc-o3Zwos,4799
 anemoi/datasets/data/join.py,sha256=dtCBbMTicqrRPxfBULi3RwEcQBLhQpIcvCjdN5A3XUU,4892
 anemoi/datasets/data/masked.py,sha256=czAv1ZfZ9q6Wr4RqI2Xj8SEm7yoCgJrwMl-CPDs_wSI,3857
-anemoi/datasets/data/misc.py,sha256=
-anemoi/datasets/data/select.py,sha256=
+anemoi/datasets/data/misc.py,sha256=tuNsUY06nWh3Raf_RCi8bzCXsMB4t2hOuIkNGV4epj8,10501
+anemoi/datasets/data/select.py,sha256=Oje3KG1shRawjuBy2-GM8s_Nk_68l-uujvx5SGW0tUM,3781
 anemoi/datasets/data/statistics.py,sha256=lZCcKw9s7ttMBEp6ANyxtbXoZZvchhE7SClq-D4AUR8,1645
 anemoi/datasets/data/stores.py,sha256=yy914zMHIYKm5q6mHOqGeK0dC_26VFeqKLXyb7x9NXE,11190
 anemoi/datasets/data/subset.py,sha256=9urVTXdnwCgqn0_BRYquMi8oiXn4ubAf0n4586hWfKw,3814
@@ -65,9 +65,9 @@ anemoi/datasets/data/unchecked.py,sha256=xhdMg-ToI1UfBWHNsWyn1y2meZWngZtHx-33L0K
 anemoi/datasets/dates/__init__.py,sha256=4ItowfLLh90T8L_JOjtv98lE6M7gAaWt7dV3niUrFvk,4473
 anemoi/datasets/dates/groups.py,sha256=iq310Pi7ullglOhcNblv14MmcT8FPgYCD5s45qAfV_s,3383
 anemoi/datasets/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-anemoi_datasets-0.3.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-anemoi_datasets-0.3.7.dist-info/METADATA,sha256=
-anemoi_datasets-0.3.7.dist-info/WHEEL,sha256=
-anemoi_datasets-0.3.7.dist-info/entry_points.txt,sha256=yR-o-4uiPEA_GLBL81SkMYnUoxq3CAV3hHulQiRtGG0,66
-anemoi_datasets-0.3.7.dist-info/top_level.txt,sha256=DYn8VPs-fNwr7fNH9XIBqeXIwiYYd2E2k5-dUFFqUz0,7
-anemoi_datasets-0.3.7.dist-info/RECORD,,
+anemoi_datasets-0.3.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+anemoi_datasets-0.3.9.dist-info/METADATA,sha256=Gmz0Y5ihJDyIqDTTMbO-1s3QYBmbe1vsKvSqt0mh6_0,16019
+anemoi_datasets-0.3.9.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
+anemoi_datasets-0.3.9.dist-info/entry_points.txt,sha256=yR-o-4uiPEA_GLBL81SkMYnUoxq3CAV3hHulQiRtGG0,66
+anemoi_datasets-0.3.9.dist-info/top_level.txt,sha256=DYn8VPs-fNwr7fNH9XIBqeXIwiYYd2E2k5-dUFFqUz0,7
+anemoi_datasets-0.3.9.dist-info/RECORD,,
{anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/LICENSE
File without changes

{anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/entry_points.txt
File without changes

{anemoi_datasets-0.3.7.dist-info → anemoi_datasets-0.3.9.dist-info}/top_level.txt
File without changes