anemoi-datasets 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (33)
  1. anemoi/datasets/_version.py +2 -2
  2. anemoi/datasets/commands/create.py +3 -2
  3. anemoi/datasets/create/__init__.py +30 -32
  4. anemoi/datasets/create/config.py +4 -3
  5. anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py +57 -0
  6. anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py +57 -0
  7. anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py +54 -0
  8. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py +59 -0
  9. anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py +115 -0
  10. anemoi/datasets/create/functions/filters/single_level_specific_humidity_to_relative_humidity.py +390 -0
  11. anemoi/datasets/create/functions/filters/speeddir_to_uv.py +77 -0
  12. anemoi/datasets/create/functions/filters/uv_to_speeddir.py +55 -0
  13. anemoi/datasets/create/functions/sources/grib.py +86 -1
  14. anemoi/datasets/create/functions/sources/hindcasts.py +14 -73
  15. anemoi/datasets/create/functions/sources/mars.py +9 -3
  16. anemoi/datasets/create/functions/sources/xarray/field.py +7 -1
  17. anemoi/datasets/create/functions/sources/xarray/metadata.py +13 -11
  18. anemoi/datasets/create/input.py +39 -17
  19. anemoi/datasets/create/persistent.py +1 -1
  20. anemoi/datasets/create/utils.py +3 -0
  21. anemoi/datasets/data/dataset.py +11 -1
  22. anemoi/datasets/data/debug.py +5 -1
  23. anemoi/datasets/data/masked.py +2 -2
  24. anemoi/datasets/data/rescale.py +147 -0
  25. anemoi/datasets/data/stores.py +20 -7
  26. anemoi/datasets/dates/__init__.py +112 -30
  27. anemoi/datasets/dates/groups.py +84 -19
  28. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/METADATA +10 -19
  29. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/RECORD +33 -24
  30. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/WHEEL +1 -1
  31. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/LICENSE +0 -0
  32. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/entry_points.txt +0 -0
  33. {anemoi_datasets-0.4.5.dist-info → anemoi_datasets-0.5.0.dist-info}/top_level.txt +0 -0
anemoi/datasets/_version.py
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.4.5'
-__version_tuple__ = version_tuple = (0, 4, 5)
+__version__ = version = '0.5.0'
+__version_tuple__ = version_tuple = (0, 5, 0)
anemoi/datasets/commands/create.py
@@ -19,7 +19,7 @@ def task(what, options, *args, **kwargs):
     """
 
     now = datetime.datetime.now()
-    LOG.info(f"Task {what}({args},{kwargs}) starting")
+    LOG.info(f"🎬 Task {what}({args},{kwargs}) starting")
 
     from anemoi.datasets.create import creator_factory
 
@@ -28,7 +28,7 @@ def task(what, options, *args, **kwargs):
     c = creator_factory(what.replace("-", "_"), **options)
     result = c.run()
 
-    LOG.debug(f"Task {what}({args},{kwargs}) completed ({datetime.datetime.now()-now})")
+    LOG.info(f"🏁 Task {what}({args},{kwargs}) completed ({datetime.datetime.now()-now})")
     return result
 
 
@@ -57,6 +57,7 @@ class Create(Command):
         command_parser.add_argument("--trace", action="store_true")
 
     def run(self, args):
+
         now = time.time()
         if args.threads + args.processes:
             self.parallel_create(args)
anemoi/datasets/create/__init__.py
@@ -132,7 +132,7 @@ class Dataset:
                 v = v.isoformat()
             z.attrs[k] = json.loads(json.dumps(v, default=json_tidy))
 
-    @property
+    @cached_property
    def anemoi_dataset(self):
        return open_dataset(self.path)
 
@@ -245,9 +245,9 @@ class Actor: # TODO: rename to Creator
             missing_dates = z.attrs.get("missing_dates", [])
             missing_dates = sorted([np.datetime64(d) for d in missing_dates])
             if missing_dates != expected:
-                LOG.warn("Missing dates given in recipe do not match the actual missing dates in the dataset.")
-                LOG.warn(f"Missing dates in recipe: {sorted(str(x) for x in missing_dates)}")
-                LOG.warn(f"Missing dates in dataset: {sorted(str(x) for x in expected)}")
+                LOG.warning("Missing dates given in recipe do not match the actual missing dates in the dataset.")
+                LOG.warning(f"Missing dates in recipe: {sorted(str(x) for x in missing_dates)}")
+                LOG.warning(f"Missing dates in dataset: {sorted(str(x) for x in expected)}")
                 raise ValueError("Missing dates given in recipe do not match the actual missing dates in the dataset.")
 
         check_missing_dates(self.missing_dates)
@@ -327,7 +327,7 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
     dataset_class = NewDataset
     def __init__(self, path, config, check_name=False, overwrite=False, use_threads=False, statistics_temp_dir=None, progress=None, test=False, cache=None, **kwargs): # fmt: skip
         if _path_readable(path) and not overwrite:
-            raise Exception(f"{self.path} already exists. Use overwrite=True to overwrite.")
+            raise Exception(f"{path} already exists. Use overwrite=True to overwrite.")
 
         super().__init__(path, cache=cache)
         self.config = config
@@ -345,9 +345,12 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
         assert isinstance(self.main_config.output.order_by, dict), self.main_config.output.order_by
         self.create_elements(self.main_config)
 
-        first_date = self.groups.dates[0]
-        self.minimal_input = self.input.select([first_date])
-        LOG.info("Minimal input for 'init' step (using only the first date) :")
+        LOG.info(f"Groups: {self.groups}")
+
+        one_date = self.groups.one_date()
+        # assert False, (type(one_date), type(self.groups))
+        self.minimal_input = self.input.select(one_date)
+        LOG.info(f"Minimal input for 'init' step (using only the first date) : {one_date}")
         LOG.info(self.minimal_input)
 
     def run(self):
@@ -363,13 +366,15 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
         LOG.info("Config loaded ok:")
         # LOG.info(self.main_config)
 
-        dates = self.groups.dates
-        frequency = dates.frequency
+        dates = self.groups.provider.values
+        frequency = self.groups.provider.frequency
+        missing = self.groups.provider.missing
+
         assert isinstance(frequency, datetime.timedelta), frequency
 
         LOG.info(f"Found {len(dates)} datetimes.")
         LOG.info(f"Dates: Found {len(dates)} datetimes, in {len(self.groups)} groups: ")
-        LOG.info(f"Missing dates: {len(dates.missing)}")
+        LOG.info(f"Missing dates: {len(missing)}")
         lengths = tuple(len(g) for g in self.groups)
 
         variables = self.minimal_input.variables
@@ -426,7 +431,7 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
         metadata["start_date"] = dates[0].isoformat()
         metadata["end_date"] = dates[-1].isoformat()
         metadata["frequency"] = frequency
-        metadata["missing_dates"] = [_.isoformat() for _ in dates.missing]
+        metadata["missing_dates"] = [_.isoformat() for _ in missing]
 
         metadata["version"] = VERSION
 
@@ -481,17 +486,6 @@ class Init(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
481
486
 
482
487
  assert chunks == self.dataset.get_zarr_chunks(), (chunks, self.dataset.get_zarr_chunks())
483
488
 
484
- def sanity_check_config(a, b):
485
- a = json.dumps(a, sort_keys=True, default=str)
486
- b = json.dumps(b, sort_keys=True, default=str)
487
- b = b.replace("T", " ") # dates are expected to be different because
488
- if a != b:
489
- print("❌❌❌ FIXME: Config serialisation to be checked")
490
- print(a)
491
- print(b)
492
-
493
- sanity_check_config(self.main_config, self.dataset.get_main_config())
494
-
495
489
  # Return the number of groups to process, so we can show a nice progress bar
496
490
  return len(lengths)
497
491
 
@@ -527,11 +521,11 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
                 LOG.info(f" -> Skipping {igroup} total={len(self.groups)} (already done)")
                 continue
 
-            assert isinstance(group[0], datetime.datetime), group
+            # assert isinstance(group[0], datetime.datetime), type(group[0])
             LOG.debug(f"Building data for group {igroup}/{self.n_groups}")
 
             result = self.input.select(dates=group)
-            assert result.dates == group, (len(result.dates), len(group))
+            assert result.group_of_dates == group, (len(result.group_of_dates), len(group), group)
 
             # There are several groups.
             # There is one result to load for each group.
@@ -545,7 +539,7 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
545
539
 
546
540
  def load_result(self, result):
547
541
  # There is one cube to load for each result.
548
- dates = result.dates
542
+ dates = list(result.group_of_dates)
549
543
 
550
544
  cube = result.get_cube()
551
545
  shape = cube.extended_user_shape
@@ -555,7 +549,9 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
555
549
 
556
550
  def check_shape(cube, dates, dates_in_data):
557
551
  if cube.extended_user_shape[0] != len(dates):
558
- print(f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}")
552
+ print(
553
+ f"Cube shape does not match the number of dates got {cube.extended_user_shape[0]}, expected {len(dates)}"
554
+ )
559
555
  print("Requested dates", compress_dates(dates))
560
556
  print("Cube dates", compress_dates(dates_in_data))
561
557
 
@@ -566,7 +562,7 @@ class Load(Actor, HasRegistryMixin, HasStatisticTempMixin, HasElementForDataMixi
566
562
  print("Extra dates", compress_dates(b - a))
567
563
 
568
564
  raise ValueError(
569
- f"Cube shape does not match the number of dates {cube.extended_user_shape[0]}, {len(dates)}"
565
+ f"Cube shape does not match the number of dates got {cube.extended_user_shape[0]}, expected {len(dates)}"
570
566
  )
571
567
 
572
568
  check_shape(cube, dates, dates_in_data)
@@ -846,7 +842,7 @@ class _FinaliseAdditions(Actor, HasRegistryMixin, AdditionsMixin):
846
842
  )
847
843
 
848
844
  if len(ifound) < 2:
849
- LOG.warn(f"Not enough data found in {self.path} to compute {self.__class__.__name__}. Skipped.")
845
+ LOG.warning(f"Not enough data found in {self.path} to compute {self.__class__.__name__}. Skipped.")
850
846
  self.tmp_storage.delete()
851
847
  return
852
848
 
@@ -919,7 +915,7 @@ def multi_addition(cls):
919
915
  self.actors.append(cls(*args, delta=k, **kwargs))
920
916
 
921
917
  if not self.actors:
922
- LOG.warning("No delta found in kwargs, no addtions will be computed.")
918
+ LOG.warning("No delta found in kwargs, no additions will be computed.")
923
919
 
924
920
  def run(self):
925
921
  for actor in self.actors:
@@ -947,7 +943,9 @@ class Statistics(Actor, HasStatisticTempMixin, HasRegistryMixin):
947
943
  )
948
944
  start, end = np.datetime64(start), np.datetime64(end)
949
945
  dates = self.dataset.anemoi_dataset.dates
950
- assert type(dates[0]) == type(start), (type(dates[0]), type(start)) # noqa
946
+
947
+ assert type(dates[0]) is type(start), (type(dates[0]), type(start))
948
+
951
949
  dates = [d for d in dates if d >= start and d <= end]
952
950
  dates = [d for i, d in enumerate(dates) if i not in self.dataset.anemoi_dataset.missing]
953
951
  variables = self.dataset.anemoi_dataset.variables
@@ -956,7 +954,7 @@ class Statistics(Actor, HasStatisticTempMixin, HasRegistryMixin):
956
954
  LOG.info(stats)
957
955
 
958
956
  if not all(self.registry.get_flags(sync=False)):
959
- raise Exception(f"❗Zarr {self.path} is not fully built, not writting statistics into dataset.")
957
+ raise Exception(f"❗Zarr {self.path} is not fully built, not writing statistics into dataset.")
960
958
 
961
959
  for k in ["mean", "stdev", "minimum", "maximum", "sums", "squares", "count", "has_nans"]:
962
960
  self.dataset.add_dataset(name=k, array=stats[k], dimensions=("variable",))
@@ -215,8 +215,9 @@ def set_to_test_mode(cfg):
215
215
  NUMBER_OF_DATES = 4
216
216
 
217
217
  dates = cfg["dates"]
218
- LOG.warn(f"Running in test mode. Changing the list of dates to use only {NUMBER_OF_DATES}.")
218
+ LOG.warning(f"Running in test mode. Changing the list of dates to use only {NUMBER_OF_DATES}.")
219
219
  groups = Groups(**LoadersConfig(cfg).dates)
220
+
220
221
  dates = groups.dates
221
222
  cfg["dates"] = dict(
222
223
  start=dates[0],
@@ -234,12 +235,12 @@ def set_to_test_mode(cfg):
234
235
  if "grid" in obj:
235
236
  previous = obj["grid"]
236
237
  obj["grid"] = "20./20."
237
- LOG.warn(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
238
+ LOG.warning(f"Running in test mode. Setting grid to {obj['grid']} instead of {previous}")
238
239
  if "number" in obj:
239
240
  if isinstance(obj["number"], (list, tuple)):
240
241
  previous = obj["number"]
241
242
  obj["number"] = previous[0:3]
242
- LOG.warn(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
243
+ LOG.warning(f"Running in test mode. Setting number to {obj['number']} instead of {previous}")
243
244
  for k, v in obj.items():
244
245
  set_element_to_test(v)
245
246
  if "constants" in obj:
anemoi/datasets/create/functions/filters/pressure_level_relative_humidity_to_specific_humidity.py
@@ -0,0 +1,57 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+from collections import defaultdict
+
+from earthkit.data.indexing.fieldlist import FieldArray
+from earthkit.meteo import thermo
+
+from .single_level_specific_humidity_to_relative_humidity import NewDataField
+
+
+def execute(context, input, t, rh, q="q"):
+    """Convert relative humidity on pressure levels to specific humidity"""
+    result = FieldArray()
+
+    params = (t, rh)
+    pairs = defaultdict(dict)
+
+    # Gather all necessary fields
+    for f in input:
+        key = f.metadata(namespace="mars")
+        param = key.pop("param")
+        if param in params:
+            key = tuple(key.items())
+
+            if param in pairs[key]:
+                raise ValueError(f"Duplicate field {param} for {key}")
+
+            pairs[key][param] = f
+            if param == t:
+                result.append(f)
+        # all other parameters
+        else:
+            result.append(f)
+
+    for keys, values in pairs.items():
+        # some checks
+
+        if len(values) != 2:
+            raise ValueError("Missing fields")
+
+        t_pl = values[t].to_numpy(flatten=True)
+        rh_pl = values[rh].to_numpy(flatten=True)
+        pressure = keys[4][1] * 100  # TODO: REMOVE HARDCODED INDICES
+        # print(f"Handling fields for pressure level {pressure}...")
+
+        # actual conversion from rh --> q_v
+        q_pl = thermo.specific_humidity_from_relative_humidity(t_pl, rh_pl, pressure)
+        result.append(NewDataField(values[rh], q_pl, q))
+
+    return result
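
Note: the conversion at the heart of this new filter is a single earthkit.meteo call. The sketch below is illustrative only: the sample values are invented, and the unit conventions (temperature in K, relative humidity in %, pressure in Pa, as implied by the hPa-to-Pa scaling of `keys[4][1] * 100` above) are assumptions about earthkit.meteo, not something this diff states.

    import numpy as np
    from earthkit.meteo import thermo

    # Invented sample values: temperature [K] and relative humidity [%]
    # for three grid points on the 850 hPa pressure level.
    t_pl = np.array([285.0, 290.0, 275.0])
    rh_pl = np.array([60.0, 80.0, 95.0])
    pressure = 850 * 100  # hPa -> Pa, as in the filter above

    # Same call the filter makes; returns specific humidity [kg/kg].
    q_pl = thermo.specific_humidity_from_relative_humidity(t_pl, rh_pl, pressure)
    print(q_pl)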
anemoi/datasets/create/functions/filters/pressure_level_specific_humidity_to_relative_humidity.py
@@ -0,0 +1,57 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+from collections import defaultdict
+
+from earthkit.data.indexing.fieldlist import FieldArray
+from earthkit.meteo import thermo
+
+from .single_level_specific_humidity_to_relative_humidity import NewDataField
+
+
+def execute(context, input, t, q, rh="r"):
+    """Convert specific humidity on pressure levels to relative humidity"""
+    result = FieldArray()
+
+    params = (t, q)
+    pairs = defaultdict(dict)
+
+    # Gather all necessary fields
+    for f in input:
+        key = f.metadata(namespace="mars")
+        param = key.pop("param")
+        if param in params:
+            key = tuple(key.items())
+
+            if param in pairs[key]:
+                raise ValueError(f"Duplicate field {param} for {key}")
+
+            pairs[key][param] = f
+            if param == t:
+                result.append(f)
+        # all other parameters
+        else:
+            result.append(f)
+
+    for keys, values in pairs.items():
+        # some checks
+
+        if len(values) != 2:
+            raise ValueError("Missing fields")
+
+        t_pl = values[t].to_numpy(flatten=True)
+        q_pl = values[q].to_numpy(flatten=True)
+        pressure = keys[4][1] * 100  # TODO: REMOVE HARDCODED INDICES
+        # print(f"Handling fields for pressure level {pressure}...")
+
+        # actual conversion from q --> rh
+        rh_pl = thermo.relative_humidity_from_specific_humidity(t_pl, q_pl, pressure)
+        result.append(NewDataField(values[q], rh_pl, rh))
+
+    return result
anemoi/datasets/create/functions/filters/single_level_dewpoint_to_relative_humidity.py
@@ -0,0 +1,54 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+from collections import defaultdict
+
+from earthkit.data.indexing.fieldlist import FieldArray
+from earthkit.meteo import thermo
+
+from .single_level_specific_humidity_to_relative_humidity import NewDataField
+
+
+def execute(context, input, t, td, rh="d"):
+    """Convert dewpoint on single levels to relative humidity"""
+    result = FieldArray()
+
+    params = (t, td)
+    pairs = defaultdict(dict)
+
+    # Gather all necessary fields
+    for f in input:
+        key = f.metadata(namespace="mars")
+        param = key.pop("param")
+        if param in params:
+            key = tuple(key.items())
+
+            if param in pairs[key]:
+                raise ValueError(f"Duplicate field {param} for {key}")
+
+            pairs[key][param] = f
+            if param == t:
+                result.append(f)
+        # all other parameters
+        else:
+            result.append(f)
+
+    for keys, values in pairs.items():
+        # some checks
+
+        if len(values) != 2:
+            raise ValueError("Missing fields")
+
+        t_values = values[t].to_numpy(flatten=True)
+        td_values = values[td].to_numpy(flatten=True)
+        # actual conversion from td --> rh
+        rh_values = thermo.relative_humidity_from_dewpoint(t=t_values, td=td_values)
+        result.append(NewDataField(values[td], rh_values, rh))
+
+    return result
anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_dewpoint.py
@@ -0,0 +1,59 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+from collections import defaultdict
+
+from earthkit.data.indexing.fieldlist import FieldArray
+from earthkit.meteo import thermo
+
+from .single_level_specific_humidity_to_relative_humidity import NewDataField
+
+EPS = 1.0e-4
+
+
+def execute(context, input, t, rh, td="d"):
+    """Convert relative humidity on single levels to dewpoint"""
+    result = FieldArray()
+
+    params = (t, rh)
+    pairs = defaultdict(dict)
+
+    # Gather all necessary fields
+    for f in input:
+        key = f.metadata(namespace="mars")
+        param = key.pop("param")
+        if param in params:
+            key = tuple(key.items())
+
+            if param in pairs[key]:
+                raise ValueError(f"Duplicate field {param} for {key}")
+
+            pairs[key][param] = f
+            if param == t:
+                result.append(f)
+        # all other parameters
+        else:
+            result.append(f)
+
+    for keys, values in pairs.items():
+        # some checks
+
+        if len(values) != 2:
+            raise ValueError("Missing fields")
+
+        t_values = values[t].to_numpy(flatten=True)
+        rh_values = values[rh].to_numpy(flatten=True)
+        # Prevent 0 % Relative humidity which cannot be converted to dewpoint
+        # Seems to happen over Egypt in the CERRA dataset
+        rh_values[rh_values == 0] = EPS
+        # actual conversion from rh --> td
+        td_values = thermo.dewpoint_from_relative_humidity(t=t_values, r=rh_values)
+        result.append(NewDataField(values[rh], td_values, td))
+
+    return result
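
Note: the `rh_values[rh_values == 0] = EPS` guard above exists because a relative humidity of exactly 0 % has no finite dewpoint. A minimal sketch of the guarded conversion, with invented sample values (the % unit is an assumption about earthkit.meteo):

    import numpy as np
    from earthkit.meteo import thermo

    EPS = 1.0e-4  # same floor as the filter above

    t = np.array([300.0, 310.0])  # temperature [K]
    rh = np.array([45.0, 0.0])    # 0 % occurs e.g. over Egypt in CERRA

    rh = np.where(rh == 0.0, EPS, rh)  # avoid an undefined dewpoint at 0 %
    td = thermo.dewpoint_from_relative_humidity(t=t, r=rh)  # dewpoint [K]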
anemoi/datasets/create/functions/filters/single_level_relative_humidity_to_specific_humidity.py
@@ -0,0 +1,115 @@
+# (C) Copyright 2024 ECMWF.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+#
+
+
+import numpy as np
+from earthkit.data.indexing.fieldlist import FieldArray
+from earthkit.meteo import thermo
+
+from .single_level_specific_humidity_to_relative_humidity import AutoDict
+from .single_level_specific_humidity_to_relative_humidity import NewDataField
+from .single_level_specific_humidity_to_relative_humidity import pressure_at_height_level
+
+
+def execute(context, input, height, t, rh, sp, new_name="2q", **kwargs):
+    """Convert the single (height) level relative humidity to specific humidity"""
+    result = FieldArray()
+
+    MANDATORY_KEYS = ["A", "B"]
+    OPTIONAL_KEYS = ["t_ml", "q_ml"]
+    MISSING_KEYS = []
+    DEFAULTS = dict(t_ml="t", q_ml="q")
+
+    for key in OPTIONAL_KEYS:
+        if key not in kwargs:
+            print(f"key {key} not found in yaml-file, using default key: {DEFAULTS[key]}")
+            kwargs[key] = DEFAULTS[key]
+
+    for key in MANDATORY_KEYS:
+        if key not in kwargs:
+            MISSING_KEYS.append(key)
+
+    if MISSING_KEYS:
+        raise KeyError(f"Following keys are missing: {', '.join(MISSING_KEYS)}")
+
+    single_level_params = (t, rh, sp)
+    model_level_params = (kwargs["t_ml"], kwargs["q_ml"])
+
+    needed_fields = AutoDict()
+
+    # Gather all necessary fields
+    for f in input:
+        key = f.metadata(namespace="mars")
+        param = key.pop("param")
+        # check single level parameters
+        if param in single_level_params:
+            levtype = key.pop("levtype")
+            key = tuple(key.items())
+
+            if param in needed_fields[key][levtype]:
+                raise ValueError(f"Duplicate single level field {param} for {key}")
+
+            needed_fields[key][levtype][param] = f
+            if param == rh:
+                if kwargs.get("keep_rh", False):
+                    result.append(f)
+            else:
+                result.append(f)
+
+        # check model level parameters
+        elif param in model_level_params:
+            levtype = key.pop("levtype")
+            levelist = key.pop("levelist")
+            key = tuple(key.items())
+
+            if param in needed_fields[key][levtype][levelist]:
+                raise ValueError(f"Duplicate model level field {param} for {key} at level {levelist}")
+
+            needed_fields[key][levtype][levelist][param] = f
+
+        # all other parameters
+        else:
+            result.append(f)
+
+    for _, values in needed_fields.items():
+        # some checks
+        if len(values["sfc"]) != 3:
+            raise ValueError("Missing surface fields")
+
+        rh_sl = values["sfc"][rh].to_numpy(flatten=True)
+        t_sl = values["sfc"][t].to_numpy(flatten=True)
+        sp_sl = values["sfc"][sp].to_numpy(flatten=True)
+
+        nlevels = len(kwargs["A"]) - 1
+        if len(values["ml"]) != nlevels:
+            raise ValueError("Missing model levels")
+
+        for key in values["ml"].keys():
+            if len(values["ml"][key]) != 2:
+                raise ValueError(f"Missing field on level {key}")
+
+        # create 3D arrays for upper air fields
+        levels = list(values["ml"].keys())
+        levels.sort()
+        t_ml = []
+        q_ml = []
+        for level in levels:
+            t_ml.append(values["ml"][level][kwargs["t_ml"]].to_numpy(flatten=True))
+            q_ml.append(values["ml"][level][kwargs["q_ml"]].to_numpy(flatten=True))
+
+        t_ml = np.stack(t_ml)
+        q_ml = np.stack(q_ml)
+
+        # actual conversion from rh --> q_v
+        p_sl = pressure_at_height_level(height, q_ml, t_ml, sp_sl, np.array(kwargs["A"]), np.array(kwargs["B"]))
+        q_sl = thermo.specific_humidity_from_relative_humidity(t_sl, rh_sl, p_sl)
+
+        result.append(NewDataField(values["sfc"][rh], q_sl, new_name))
+
+    return result
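
Note: the `needed_fields[key][levtype][levelist][param]` assignments above rely on AutoDict creating intermediate nesting levels on first access. AutoDict itself is defined in single_level_specific_humidity_to_relative_humidity.py, which is not shown in this diff; the sketch below is the usual auto-vivification recipe it presumably follows, not its actual source.

    class AutoDict(dict):
        """A dict that auto-creates nested dicts on missing keys."""

        def __missing__(self, key):
            value = self[key] = type(self)()
            return value

    fields = AutoDict()
    fields["key"]["sfc"]["2t"] = "field"  # no KeyError at intermediate levels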