PyPI - anemoi-datasets - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

anemoi-datasets 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

anemoi/datasets/__main__.py +7 -51
anemoi/datasets/_version.py +2 -2
anemoi/datasets/commands/__init__.py +5 -59
anemoi/datasets/commands/copy.py +141 -83
anemoi/datasets/commands/create.py +14 -3
anemoi/datasets/commands/inspect/__init__.py +1 -5
anemoi/datasets/compute/{perturbations.py → recentre.py} +24 -23
anemoi/datasets/create/__init__.py +3 -0
anemoi/datasets/create/config.py +7 -1
anemoi/datasets/create/functions/sources/accumulations.py +7 -3
anemoi/datasets/create/functions/sources/hindcasts.py +437 -0
anemoi/datasets/create/functions/sources/mars.py +13 -7
anemoi/datasets/create/functions/sources/{perturbations.py → recentre.py} +5 -5
anemoi/datasets/create/input.py +0 -5
anemoi/datasets/create/loaders.py +36 -0
anemoi/datasets/create/persistent.py +1 -3
anemoi/datasets/create/statistics/__init__.py +7 -17
anemoi/datasets/create/statistics/summary.py +1 -4
anemoi/datasets/create/writer.py +4 -3
anemoi/datasets/data/indexing.py +1 -3
anemoi/datasets/data/stores.py +2 -6
anemoi/datasets/data/unchecked.py +1 -6
anemoi/datasets/grids.py +2 -2
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/METADATA +30 -21
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/RECORD +29 -28
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/LICENSE +0 -0
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/WHEEL +0 -0
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/entry_points.txt +0 -0
{anemoi_datasets-0.2.0.dist-info → anemoi_datasets-0.3.0.dist-info}/top_level.txt +0 -0

anemoi/datasets/__main__.py CHANGED Viewed

@@ -8,64 +8,20 @@
 # nor does it submit to any jurisdiction.
 #
-import argparse
-import logging
-import sys
-import traceback
+from anemoi.utils.cli import cli_main
+from anemoi.utils.cli import make_parser
 from . import __version__
 from .commands import COMMANDS
-LOG = logging.getLogger(__name__)
-def main():
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument(
-        "--version",
-        "-V",
-        action="store_true",
-        help="show the version and exit",
-    )
-    parser.add_argument(
-        "--debug",
-        "-d",
-        action="store_true",
-        help="Debug mode",
-    )
-    subparsers = parser.add_subparsers(help="commands:", dest="command")
-    for name, command in COMMANDS.items():
-        command_parser = subparsers.add_parser(name, help=command.__doc__)
-        command.add_arguments(command_parser)
+# For read-the-docs
+def create_parser():
+    return make_parser(__doc__, COMMANDS)
-    args = parser.parse_args()
-    if args.version:
-        print(__version__)
-        return
-    if args.command is None:
-        parser.print_help()
-        return
-    cmd = COMMANDS[args.command]
-    logging.basicConfig(
-        format="%(asctime)s %(levelname)s %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-        level=logging.DEBUG if args.debug else logging.INFO,
-    )
-    try:
-        cmd.run(args)
-    except ValueError as e:
-        traceback.print_exc()
-        LOG.error("\n💣 %s", str(e).lstrip())
-        LOG.error("💣 Exiting")
-        sys.exit(1)
+def main():
+    cli_main(__version__, __doc__, COMMANDS)
 if __name__ == "__main__":

anemoi/datasets/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.3.0'
+__version_tuple__ = version_tuple = (0, 3, 0)

anemoi/datasets/commands/__init__.py CHANGED Viewed

@@ -8,69 +8,15 @@
 # nor does it submit to any jurisdiction.
 #
-import argparse
-import importlib
-import logging
 import os
-import sys
-LOG = logging.getLogger(__name__)
+from anemoi.utils.cli import Command
+from anemoi.utils.cli import Failed
+from anemoi.utils.cli import register_commands
+__all__ = ["Command"]
-def register(here, package, select, fail=None):
-    result = {}
-    not_available = {}
-    for p in os.listdir(here):
-        full = os.path.join(here, p)
-        if p.startswith("_"):
-            continue
-        if not (p.endswith(".py") or (os.path.isdir(full) and os.path.exists(os.path.join(full, "__init__.py")))):
-            continue
-        name, _ = os.path.splitext(p)
-        try:
-            imported = importlib.import_module(
-                f".{name}",
-                package=package,
-            )
-        except ImportError as e:
-            not_available[name] = e
-            continue
-        obj = select(imported)
-        if obj is not None:
-            result[name] = obj
-    for name, e in not_available.items():
-        if fail is None:
-            pass
-        if callable(fail):
-            result[name] = fail(name, e)
-    return result
-class Command:
-    def run(self, args):
-        raise NotImplementedError(f"Command not implemented: {args.command}")
-class Failed(Command):
-    def __init__(self, name, error):
-        self.name = name
-        self.error = error
-    def add_arguments(self, command_parser):
-        command_parser.add_argument("x", nargs=argparse.REMAINDER)
-    def run(self, args):
-        print(f"Command '{self.name}' not available: {self.error}")
-        sys.exit(1)
-COMMANDS = register(
+COMMANDS = register_commands(
     os.path.dirname(__file__),
     __name__,
     lambda x: x.command(),

anemoi/datasets/commands/copy.py CHANGED Viewed

@@ -41,24 +41,19 @@ zinfo https://object-store.os-api.cci1.ecmwf.int/
 """
-class CopyMixin:
-    internal = True
-    timestamp = True
-    def add_arguments(self, command_parser):
-        command_parser.add_argument("--transfers", type=int, default=8)
-        command_parser.add_argument("--block-size", type=int, default=100)
-        command_parser.add_argument("--overwrite", action="store_true")
-        command_parser.add_argument("--progress", action="store_true")
-        command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
-        command_parser.add_argument(
-            "--rechunk",
-            nargs="+",
-            help="Rechunk given array.",
-            metavar="array=i,j,k,l",
-        )
-        command_parser.add_argument("source")
-        command_parser.add_argument("target")
+class Copier:
+    def __init__(self, source, target, transfers, block_size, overwrite, resume, progress, nested, rechunk, **kwargs):
+        self.source = source
+        self.target = target
+        self.transfers = transfers
+        self.block_size = block_size
+        self.overwrite = overwrite
+        self.resume = resume
+        self.progress = progress
+        self.nested = nested
+        self.rechunk = rechunk
+        self.rechunking = rechunk.split(",") if rechunk else []
     def _store(self, path, nested=False):
         if nested:
@@ -67,30 +62,56 @@ class CopyMixin:
             return zarr.storage.NestedDirectoryStore(path)
         return path
-    def copy_chunk(self, n, m, source, target, block_size, _copy, progress):
+    def copy_chunk(self, n, m, source, target, _copy, progress):
         if _copy[n:m].all():
             LOG.info(f"Skipping {n} to {m}")
             return None
-        for i in tqdm.tqdm(
-            range(n, m),
-            desc=f"Copying {n} to {m}",
-            leave=False,
-            disable=not isatty and not progress,
-        ):
-            target[i] = source[i]
+        if self.block_size % self.data_chunks[0] == 0:
+            target[slice(n, m)] = source[slice(n, m)]
+        else:
+            LOG.warning(
+                f"Block size ({self.block_size}) is not a multiple of target chunk size ({self.data_chunks[0]}). Slow copy expected."
+            )
+            if self.transfers > 1:
+                # race condition, different threads might copy the same data to the same chunk
+                raise NotImplementedError(
+                    "Block size is not a multiple of target chunk size. Parallel copy not supported."
+                )
+            for i in tqdm.tqdm(
+                range(n, m),
+                desc=f"Copying {n} to {m}",
+                leave=False,
+                disable=not isatty and not progress,
+            ):
+                target[i] = source[i]
         return slice(n, m)
-    def copy_data(self, source, target, transfers, block_size, _copy, progress, rechunking):
+    def parse_rechunking(self, rechunking, source_data):
+        shape = source_data.shape
+        chunks = list(source_data.chunks)
+        for i, c in enumerate(rechunking):
+            if not c:
+                continue
+            elif c == "full":
+                chunks[i] = shape[i]
+            c = int(c)
+            c = min(c, shape[i])
+            chunks[i] = c
+        chunks = tuple(chunks)
+        if chunks != source_data.chunks:
+            LOG.info(f"Rechunking data from {source_data.chunks} to {chunks}")
+            # if self.transfers > 1:
+            #    raise NotImplementedError("Rechunking with multiple transfers is not implemented")
+        return chunks
+    def copy_data(self, source, target, _copy, progress):
         LOG.info("Copying data")
         source_data = source["data"]
-        chunks = list(source_data.chunks)
-        if "data" in rechunking:
-            assert len(chunks) == len(rechunking["data"]), (chunks, rechunking["data"])
-            for i, c in enumerate(rechunking["data"]):
-                if c != -1:
-                    chunks[i] = c
+        self.data_chunks = self.parse_rechunking(self.rechunking, source_data)
         target_data = (
             target["data"]
@@ -98,12 +119,12 @@ class CopyMixin:
             else target.create_dataset(
                 "data",
                 shape=source_data.shape,
-                chunks=chunks,
+                chunks=self.data_chunks,
                 dtype=source_data.dtype,
             )
         )
-        executor = ThreadPoolExecutor(max_workers=transfers)
+        executor = ThreadPoolExecutor(max_workers=self.transfers)
         tasks = []
         n = 0
         while n < target_data.shape[0]:
@@ -111,15 +132,14 @@ class CopyMixin:
                 executor.submit(
                     self.copy_chunk,
                     n,
-                    min(n + block_size, target_data.shape[0]),
+                    min(n + self.block_size, target_data.shape[0]),
                     source_data,
                     target_data,
-                    block_size,
                     _copy,
                     progress,
                 )
             )
-            n += block_size
+            n += self.block_size
         for future in tqdm.tqdm(as_completed(tasks), total=len(tasks), smoothing=0):
             copied = future.result()
@@ -131,7 +151,7 @@ class CopyMixin:
         LOG.info("Copied data")
-    def copy_array(self, name, source, target, transfers, block_size, _copy, progress, rechunking):
+    def copy_array(self, name, source, target, _copy, progress):
         for k, v in source.attrs.items():
             target.attrs[k] = v
@@ -139,14 +159,14 @@ class CopyMixin:
             return
         if name == "data":
-            self.copy_data(source, target, transfers, block_size, _copy, progress, rechunking)
+            self.copy_data(source, target, _copy, progress)
             return
         LOG.info(f"Copying {name}")
         target[name] = source[name]
         LOG.info(f"Copied {name}")
-    def copy_group(self, source, target, transfers, block_size, _copy, progress, rechunking):
+    def copy_group(self, source, target, _copy, progress):
         import zarr
         for k, v in source.attrs.items():
@@ -158,25 +178,19 @@ class CopyMixin:
                 self.copy_group(
                     source[name],
                     group,
-                    transfers,
-                    block_size,
                     _copy,
                     progress,
-                    rechunking,
                 )
             else:
                 self.copy_array(
                     name,
                     source,
                     target,
-                    transfers,
-                    block_size,
                     _copy,
                     progress,
-                    rechunking,
                 )
-    def copy(self, source, target, transfers, block_size, progress, rechunking):
+    def copy(self, source, target, progress):
         import zarr
         if "_copy" not in target:
@@ -187,32 +201,26 @@ class CopyMixin:
         _copy = target["_copy"]
         _copy_np = _copy[:]
-        self.copy_group(source, target, transfers, block_size, _copy_np, progress, rechunking)
+        self.copy_group(source, target, _copy_np, progress)
         del target["_copy"]
-    def run(self, args):
+    def run(self):
         import zarr
         # base, ext = os.path.splitext(os.path.basename(args.source))
         # assert ext == ".zarr", ext
         # assert "." not in base, base
-        LOG.info(f"Copying {args.source} to {args.target}")
-        rechunking = {}
-        if args.rechunk:
-            for r in args.rechunk:
-                k, v = r.split("=")
-                if k != "data":
-                    raise ValueError(f"Only rechunking data is supported: {k}")
-                values = v.split(",")
-                values = [-1 if x == "" else x for x in values]
-                values = tuple(int(x) for x in values)
-                rechunking[k] = values
-            for k, v in rechunking.items():
-                LOG.info(f"Rechunking {k} to {v}")
-        try:
-            target = zarr.open(self._store(args.target, args.nested), mode="r")
+        LOG.info(f"Copying {self.source} to {self.target}")
+        def target_exists():
+            try:
+                zarr.open(self._store(self.target), mode="r")
+                return True
+            except ValueError:
+                return False
+        def target_finished():
+            target = zarr.open(self._store(self.target), mode="r")
             if "_copy" in target:
                 done = sum(1 if x else 0 for x in target["_copy"])
                 todo = len(target["_copy"])
@@ -222,26 +230,76 @@ class CopyMixin:
                     todo,
                     int(done / todo * 100 + 0.5),
                 )
+                return False
             elif "sums" in target and "data" in target:  # sums is copied last
-                LOG.error("Target already exists")
-                return
-        except ValueError as e:
-            LOG.info(f"Target does not exist: {e}")
-            pass
-        source = zarr.open(self._store(args.source), mode="r")
-        if args.overwrite:
-            target = zarr.open(self._store(args.target, args.nested), mode="w")
-        else:
-            try:
-                target = zarr.open(self._store(args.target, args.nested), mode="w+")
-            except ValueError:
-                target = zarr.open(self._store(args.target, args.nested), mode="w")
-        self.copy(source, target, args.transfers, args.block_size, args.progress, rechunking)
+                return True
+            return False
+        def open_target():
+            if not target_exists():
+                return zarr.open(self._store(self.target, self.nested), mode="w")
+            if self.overwrite:
+                LOG.error("Target already exists, overwriting.")
+                return zarr.open(self._store(self.target, self.nested), mode="w")
+            if self.resume:
+                if target_finished():
+                    LOG.error("Target already exists and is finished.")
+                    sys.exit(0)
+                LOG.error("Target already exists, resuming copy.")
+                return zarr.open(self._store(self.target, self.nested), mode="w+")
+            LOG.error("Target already exists, use either --overwrite or --resume.")
+            sys.exit(1)
+        target = open_target()
+        assert target is not None, target
+        source = zarr.open(self._store(self.source), mode="r")
+        self.copy(source, target, self.progress)
+class CopyMixin:
+    internal = True
+    timestamp = True
+    def add_arguments(self, command_parser):
+        group = command_parser.add_mutually_exclusive_group()
+        group.add_argument(
+            "--overwrite",
+            action="store_true",
+            help="Overwrite existing dataset. This will delete the target dataset if it already exists. Cannot be used with --resume.",
+        )
+        group.add_argument(
+            "--resume", action="store_true", help="Resume copying an existing dataset. Cannot be used with --overwrite."
+        )
+        command_parser.add_argument("--transfers", type=int, default=8, help="Number of parallel transfers.")
+        command_parser.add_argument(
+            "--progress", action="store_true", help="Force show progress bar, even if not in an interactive shell."
+        )
+        command_parser.add_argument("--nested", action="store_true", help="Use ZARR's nested directpry backend.")
+        command_parser.add_argument(
+            "--rechunk", help="Rechunk the target data array. Rechunk size should be a diviser of the block size."
+        )
+        command_parser.add_argument(
+            "--block-size",
+            type=int,
+            default=100,
+            help="For optimisation purposes, data is transfered by blocks. Default is 100.",
+        )
+        command_parser.add_argument("source", help="Source location.")
+        command_parser.add_argument("target", help="Target location.")
+    def run(self, args):
+        Copier(**vars(args)).run()
 class Copy(CopyMixin, Command):
-    pass
+    """Copy a dataset from one location to another."""
 command = Copy

anemoi/datasets/commands/create.py CHANGED Viewed

@@ -4,13 +4,24 @@ from . import Command
 class Create(Command):
+    """Create a dataset."""
     internal = True
     timestamp = True
     def add_arguments(self, command_parser):
-        command_parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
-        command_parser.add_argument("config", help="Configuration file")
-        command_parser.add_argument("path", help="Path to store the created data")
+        command_parser.add_argument(
+            "--overwrite",
+            action="store_true",
+            help="Overwrite existing files. This will delete the target dataset if it already exists.",
+        )
+        command_parser.add_argument(
+            "--test",
+            action="store_true",
+            help="Build a small dataset, using only the first dates. And, when possible, using low resolution and less ensemble members.",
+        )
+        command_parser.add_argument("config", help="Configuration yaml file defining the recipe to create the dataset.")
+        command_parser.add_argument("path", help="Path to store the created data.")
     def run(self, args):
         kwargs = vars(args)

anemoi/datasets/commands/inspect/__init__.py CHANGED Viewed

@@ -11,16 +11,12 @@ import os
 from .. import Command
 from .zarr import InspectZarr
-# from .checkpoint import InspectCheckpoint
 class Inspect(Command, InspectZarr):
-    # class Inspect(Command, InspectCheckpoint, InspectZarr):
-    """Inspect a checkpoint or zarr file."""
+    """Inspect a zarr dataset."""
     def add_arguments(self, command_parser):
         # g = command_parser.add_mutually_exclusive_group()
-        # g.add_argument("--inspect", action="store_true", help="Inspect weights")
         command_parser.add_argument("path", metavar="PATH", nargs="+")
         command_parser.add_argument("--detailed", action="store_true")
         # command_parser.add_argument("--probe", action="store_true")

anemoi/datasets/compute/{perturbations.py → recentre.py} RENAMED Viewed

@@ -32,7 +32,7 @@ CLIP_VARIABLES = (
 SKIP = ("class", "stream", "type", "number", "expver", "_leg_number", "anoffset")
-def check_compatible(f1, f2, center_field_as_mars, ensemble_field_as_mars):
+def check_compatible(f1, f2, centre_field_as_mars, ensemble_field_as_mars):
     assert f1.mars_grid == f2.mars_grid, (f1.mars_grid, f2.mars_grid)
     assert f1.mars_area == f2.mars_area, (f1.mars_area, f2.mars_area)
     assert f1.shape == f2.shape, (f1.shape, f2.shape)
@@ -43,21 +43,22 @@ def check_compatible(f1, f2, center_field_as_mars, ensemble_field_as_mars):
         f2.metadata("valid_datetime"),
     )
-    for k in set(center_field_as_mars.keys()) | set(ensemble_field_as_mars.keys()):
+    for k in set(centre_field_as_mars.keys()) | set(ensemble_field_as_mars.keys()):
         if k in SKIP:
             continue
-        assert center_field_as_mars[k] == ensemble_field_as_mars[k], (
+        assert centre_field_as_mars[k] == ensemble_field_as_mars[k], (
             k,
-            center_field_as_mars[k],
+            centre_field_as_mars[k],
             ensemble_field_as_mars[k],
         )
-def perturbations(
+def recentre(
     *,
     members,
-    center,
+    centre,
     clip_variables=CLIP_VARIABLES,
+    alpha=1.0,
     output=None,
 ):
@@ -70,16 +71,16 @@ def perturbations(
     LOG.info("Ordering fields")
     members = members.order_by(*keys)
-    center = center.order_by(*keys)
+    centre = centre.order_by(*keys)
     LOG.info("Done")
-    if len(center) * n_numbers != len(members):
-        LOG.error("%s %s %s", len(center), n_numbers, len(members))
+    if len(centre) * n_numbers != len(members):
+        LOG.error("%s %s %s", len(centre), n_numbers, len(members))
         for f in members:
             LOG.error("Member: %r", f)
-        for f in center:
-            LOG.error("Center: %r", f)
-        raise ValueError(f"Inconsistent number of fields: {len(center)} * {n_numbers} != {len(members)}")
+        for f in centre:
+            LOG.error("centre: %r", f)
+        raise ValueError(f"Inconsistent number of fields: {len(centre)} * {n_numbers} != {len(members)}")
     if output is None:
         # prepare output tmp file so we can read it back
@@ -93,32 +94,32 @@ def perturbations(
     seen = set()
-    for i, center_field in enumerate(center):
-        param = center_field.metadata("param")
-        center_field_as_mars = center_field.as_mars()
+    for i, centre_field in enumerate(centre):
+        param = centre_field.metadata("param")
+        centre_field_as_mars = centre_field.as_mars()
-        # load the center field
-        center_np = center_field.to_numpy()
+        # load the centre field
+        centre_np = centre_field.to_numpy()
         # load the ensemble fields and compute the mean
-        members_np = np.zeros((n_numbers, *center_np.shape))
+        members_np = np.zeros((n_numbers, *centre_np.shape))
         for j in range(n_numbers):
             ensemble_field = members[i * n_numbers + j]
             ensemble_field_as_mars = ensemble_field.as_mars()
-            check_compatible(center_field, ensemble_field, center_field_as_mars, ensemble_field_as_mars)
+            check_compatible(centre_field, ensemble_field, centre_field_as_mars, ensemble_field_as_mars)
             members_np[j] = ensemble_field.to_numpy()
             ensemble_field_as_mars = tuple(sorted(ensemble_field_as_mars.items()))
             assert ensemble_field_as_mars not in seen, ensemble_field_as_mars
             seen.add(ensemble_field_as_mars)
-        # cmin=np.amin(center_np)
+        # cmin=np.amin(centre_np)
         # emin=np.amin(members_np)
         # if cmin < 0 and emin >= 0:
         #     LOG.warning(f"Negative values in {param} cmin={cmin} emin={emin}")
-        #     LOG.warning(f"Center: {center_field_as_mars}")
+        #     LOG.warning(f"centre: {centre_field_as_mars}")
         mean_np = members_np.mean(axis=0)
@@ -126,11 +127,11 @@ def perturbations(
             template = members[i * n_numbers + j]
             e = members_np[j]
             m = mean_np
-            c = center_np
+            c = centre_np
             assert e.shape == c.shape == m.shape, (e.shape, c.shape, m.shape)
-            x = c - m + e
+            x = c + (e - m) * alpha
             if param in clip_variables:
                 # LOG.warning(f"Clipping {param} to be positive")

anemoi/datasets/create/__init__.py CHANGED Viewed

@@ -19,6 +19,7 @@ class Creator:
         print=print,
         statistics_tmp=None,
         overwrite=False,
+        test=None,
         **kwargs,
     ):
         self.path = path  # Output path
@@ -27,6 +28,7 @@ class Creator:
         self.print = print
         self.statistics_tmp = statistics_tmp
         self.overwrite = overwrite
+        self.test = test
     def init(self, check_name=False):
         # check path
@@ -43,6 +45,7 @@ class Creator:
                 config=self.config,
                 statistics_tmp=self.statistics_tmp,
                 print=self.print,
+                test=self.test,
             )
             obj.initialise(check_name=check_name)

anemoi-datasets 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

anemoi-datasets 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl