PyPI - ocf-data-sampler - Versions diffs - 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl - Mend

ocf-data-sampler 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ocf-data-sampler might be problematic. Click here for more details.

Files changed (7) hide show

ocf_data_sampler/config/model.py CHANGED Viewed

@@ -90,7 +90,7 @@ class DropoutMixin(Base):
         "negative or zero.",
     )
-    dropout_fraction: float|list[float] = Field(
+    dropout_fraction: float | list[float] = Field(
         default=0,
         description="Either a float(Chance of dropout being applied to each sample) or a list of "
         "floats (probability that dropout of the corresponding timedelta is applied)",
@@ -106,31 +106,22 @@ class DropoutMixin(Base):
     @field_validator("dropout_fraction")
-    def dropout_fractions(cls, dropout_frac: float|list[float]) -> float|list[float]:
+    def dropout_fractions(cls, dropout_frac: float | list[float]) -> float | list[float]:
         """Validate 'dropout_frac'."""
-        from math import isclose
-        if isinstance(dropout_frac, float):
-            if not (dropout_frac <= 1):
-                raise ValueError("Input should be less than or equal to 1")
-            elif not (dropout_frac >= 0):
-                raise ValueError("Input should be greater than or equal to 0")
+        if isinstance(dropout_frac, float | int):
+            if not (0<= dropout_frac <= 1):
+                raise ValueError("Dropout fractions must be in range [0, 1]")
         elif isinstance(dropout_frac, list):
             if not dropout_frac:
                 raise ValueError("List cannot be empty")
-            if not all(isinstance(i, float) for i in dropout_frac):
-                raise ValueError("All elements in the list must be floats")
             if not all(0 <= i <= 1 for i in dropout_frac):
-                raise ValueError("Each float in the list must be between 0 and 1")
-            if not isclose(sum(dropout_frac), 1.0, rel_tol=1e-9):
-                raise ValueError("Sum of all floats in the list must be 1.0")
+                raise ValueError("All dropout fractions must be in range [0, 1]")
+            if not (0 <= sum(dropout_frac) <= 1):
+                raise ValueError("The sum of dropout fractions must be in range [0, 1]")
-        else:
-            raise TypeError("Must be either a float or a list of floats")
         return dropout_frac

ocf_data_sampler/select/dropout.py CHANGED Viewed

@@ -9,53 +9,51 @@ import pandas as pd
 import xarray as xr
-def apply_sampled_dropout_time(
+def apply_history_dropout(
     t0: pd.Timestamp,
     dropout_timedeltas: list[pd.Timedelta],
-    dropout_frac: float|list[float],
+    dropout_frac: float | list[float],
     da: xr.DataArray,
 ) -> xr.DataArray:
-    """Randomly pick a dropout time from a list of timedeltas and apply dropout time to the data.
+    """Apply randomly sampled dropout to the historical part of some sequence data.
+    Dropped out data is replaced with NaNs
     Args:
-        t0: The forecast init-time
+        t0: The forecast init-time.
         dropout_timedeltas: List of timedeltas relative to t0 to pick from
-        dropout_frac: Either a probability that dropout will be applied.
-            This should be between 0 and 1 inclusive.
-            Or a list of probabilities for each of the corresponding timedeltas
+        dropout_frac: The probabilit(ies) that each dropout timedelta will be applied. This should
+            be between 0 and 1 inclusive.
         da: Xarray DataArray with 'time_utc' coordinate
     """
-    if  isinstance(dropout_frac, list):
-        # checking if len match
-        if len(dropout_frac) != len(dropout_timedeltas):
-            raise ValueError("Lengths of dropout_frac and dropout_timedeltas should match")
+    if len(dropout_timedeltas)==0:
+        return da
+    if isinstance(dropout_frac, float | int):
+        if not (0<=dropout_frac<=1):
+            raise ValueError("`dropout_frac` must be in range [0, 1]")
+        # Create list with equal chance for all dropout timedeltas
+        n = len(dropout_timedeltas)
+        dropout_frac = [dropout_frac/n for _ in range(n)]
+    else:
+        if not 0<=sum(dropout_frac)<=1:
+            raise ValueError("The sum of `dropout_frac` must be in range [0, 1]")
+        if len(dropout_timedeltas)!=len(dropout_frac):
+            raise ValueError("`dropout_timedeltas` and `dropout_frac` must have the same length")
-        dropout_time = t0 + np.random.choice(dropout_timedeltas,p=dropout_frac)
+        dropout_frac = [*dropout_frac] # Make copy of the list so we can append to it
-        return da.where(da.time_utc <= dropout_time)
+    dropout_timedeltas = [*dropout_timedeltas] # Make copy of the list so we can append to it
+    # Add chance of no dropout
+    dropout_frac.append(1-sum(dropout_frac))
+    dropout_timedeltas.append(None)
+    timedelta_choice = np.random.choice(dropout_timedeltas, p=dropout_frac)
-    # old logic
+    if timedelta_choice is None:
+        return da
     else:
-        # sample dropout time
-        if dropout_frac > 0 and len(dropout_timedeltas) == 0:
-            raise ValueError("To apply dropout, dropout_timedeltas must be provided")
-        if not (0 <= dropout_frac <= 1):
-            raise ValueError("dropout_frac must be between 0 and 1 inclusive")
-        if (len(dropout_timedeltas) == 0) or (np.random.uniform() >= dropout_frac):
-            dropout_time = None
-        else:
-            dropout_time = t0 + np.random.choice(dropout_timedeltas)
-        # apply dropout time
-        if dropout_time is None:
-            return da
-        # This replaces the times after the dropout with NaNs
-        return da.where(da.time_utc <= dropout_time)
+        return da.where((da.time_utc <= timedelta_choice + t0) | (da.time_utc> t0))

ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py CHANGED Viewed

@@ -1,10 +1,9 @@
 """Slice datasets by time."""
 import pandas as pd
-import xarray as xr
 from ocf_data_sampler.config import Configuration
-from ocf_data_sampler.select.dropout import apply_sampled_dropout_time
+from ocf_data_sampler.select.dropout import apply_history_dropout
 from ocf_data_sampler.select.select_time_slice import select_time_slice, select_time_slice_nwp
 from ocf_data_sampler.utils import minutes
@@ -52,7 +51,7 @@ def slice_datasets_by_time(
         )
         # Apply the randomly sampled dropout
-        sliced_datasets_dict["sat"] = apply_sampled_dropout_time(
+        sliced_datasets_dict["sat"] = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(sat_config.dropout_timedeltas_minutes),
             dropout_frac=sat_config.dropout_fraction,
@@ -62,59 +61,44 @@ def slice_datasets_by_time(
     if "gsp" in datasets_dict:
         gsp_config = config.input_data.gsp
-        da_gsp_past = select_time_slice(
+        da_gsp = select_time_slice(
             datasets_dict["gsp"],
             t0,
             time_resolution=minutes(gsp_config.time_resolution_minutes),
             interval_start=minutes(gsp_config.interval_start_minutes),
-            interval_end=minutes(0),
+            interval_end=minutes(gsp_config.interval_end_minutes),
         )
         # Dropout on the past GSP, but not the future GSP
-        da_gsp_past = apply_sampled_dropout_time(
+        da_gsp = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(gsp_config.dropout_timedeltas_minutes),
             dropout_frac=gsp_config.dropout_fraction,
-            da=da_gsp_past,
-        )
-        da_gsp_future = select_time_slice(
-            datasets_dict["gsp"],
-            t0,
-            time_resolution=minutes(gsp_config.time_resolution_minutes),
-            interval_start=minutes(gsp_config.time_resolution_minutes),
-            interval_end=minutes(gsp_config.interval_end_minutes),
+            da=da_gsp,
         )
-        sliced_datasets_dict["gsp"] = xr.concat([da_gsp_past, da_gsp_future], dim="time_utc")
+        sliced_datasets_dict["gsp"] = da_gsp
     if "site" in datasets_dict:
         site_config = config.input_data.site
-        da_site_past = select_time_slice(
+        da_site = select_time_slice(
             datasets_dict["site"],
             t0,
             time_resolution=minutes(site_config.time_resolution_minutes),
             interval_start=minutes(site_config.interval_start_minutes),
-            interval_end=minutes(0),
+            interval_end=minutes(site_config.interval_end_minutes),
         )
         # Apply the randomly sampled dropout on the past site not the future
-        da_site_past = apply_sampled_dropout_time(
+        da_site = apply_history_dropout(
             t0,
             dropout_timedeltas=minutes(site_config.dropout_timedeltas_minutes),
             dropout_frac=site_config.dropout_fraction,
-            da=da_site_past,
+            da=da_site,
         )
-        da_site_future = select_time_slice(
-            datasets_dict["site"],
-            t0,
-            time_resolution=minutes(site_config.time_resolution_minutes),
-            interval_start=minutes(site_config.time_resolution_minutes),
-            interval_end=minutes(site_config.interval_end_minutes),
-        )
+        sliced_datasets_dict["site"] = da_site
-        sliced_datasets_dict["site"] = xr.concat([da_site_past, da_site_future], dim="time_utc")
     return sliced_datasets_dict

{ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocf-data-sampler
-Version: 0.5.14
+Version: 0.5.15
 Author: James Fulton, Peter Dudfield
 Author-email: Open Climate Fix team <info@openclimatefix.org>
 License: MIT License

{ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ ocf_data_sampler/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,
 ocf_data_sampler/utils.py,sha256=CTJf9bjHjO8vOJebUtXiMpvgwUpF7gEOjjaoE77fhTk,1177
 ocf_data_sampler/config/__init__.py,sha256=O29mbH0XG2gIY1g3BaveGCnpBO2SFqdu-qzJ7a6evl0,223
 ocf_data_sampler/config/load.py,sha256=LL-7wemI8o4KPkx35j-wQ3HjsMvDgqXr7G46IcASfnU,632
-ocf_data_sampler/config/model.py,sha256=ucddp09yM4HGZKVuW-0N8vLGqLVAo_S4mPT89N_iG-0,11881
+ocf_data_sampler/config/model.py,sha256=5ou8BZgQ9h-xyJEqHdspPKZgZO9Vr6opjSphUys7yE8,11505
 ocf_data_sampler/config/save.py,sha256=m8SPw5rXjkMm1rByjh3pK5StdBi4e8ysnn3jQopdRaI,1064
 ocf_data_sampler/data/uk_gsp_locations_20220314.csv,sha256=RSh7DRh55E3n8lVAaWXGTaXXHevZZtI58td4d4DhGos,10415772
 ocf_data_sampler/data/uk_gsp_locations_20250109.csv,sha256=XZISFatnbpO9j8LwaxNKFzQSjs6hcHFsV8a9uDDpy2E,9055334
@@ -32,7 +32,7 @@ ocf_data_sampler/numpy_sample/satellite.py,sha256=RaYzYIcB1AmDrKeiqSpn4QVfBH-QMe
 ocf_data_sampler/numpy_sample/site.py,sha256=4S19bzCN5lswVUrmWRfwpVsBPUE7bi0OIdxsD9wgvhU,982
 ocf_data_sampler/numpy_sample/sun_position.py,sha256=5tt-zNm6aRuZMsxZPaAxyg7HeikswfZCeHWXTHuO2K0,1555
 ocf_data_sampler/select/__init__.py,sha256=mK7Wu_-j9IXGTYrOuDf5yDDuU5a306b0iGKTAooNg_s,210
-ocf_data_sampler/select/dropout.py,sha256=BYpv8L771faPOyN7SdIJ5cwkpDve-ohClj95jjsHmjg,1973
+ocf_data_sampler/select/dropout.py,sha256=i5NDP6oQnZBkQRJW-aXVrPXawktVKQz5VMexe5Ww51g,2021
 ocf_data_sampler/select/fill_time_periods.py,sha256=TlGxp1xiAqnhdWfLy0pv3FuZc00dtimjWdLzr4JoTGA,865
 ocf_data_sampler/select/find_contiguous_time_periods.py,sha256=etkr6LuB7zxkfzWJ6SgHiULdRuFzFlq5bOUNd257Qx4,11545
 ocf_data_sampler/select/geospatial.py,sha256=rvMy_e--3tm-KAy9pU6b9-UMBQqH2sXykr3N_4SHYy4,6528
@@ -51,12 +51,12 @@ ocf_data_sampler/torch_datasets/utils/add_alterate_coordinate_projections.py,sha
 ocf_data_sampler/torch_datasets/utils/config_normalization_values_to_dicts.py,sha256=SGt1H2nXcaj44ND14-gHzvA7dkLfgjTacCq7rOkRGwg,1991
 ocf_data_sampler/torch_datasets/utils/merge_and_fill_utils.py,sha256=we7BTxRH7B7jKayDT7YfNyfI3zZClz2Bk-HXKQIokgU,956
 ocf_data_sampler/torch_datasets/utils/spatial_slice_for_dataset.py,sha256=Hvz0wHSWMYYamf2oHNiGlzJcM4cAH6pL_7ZEvIBL2dE,1882
-ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=8E4a5v9dqr-sZOyBruuO-tjLPBbjtpYtdFY5z23aqnU,4365
+ocf_data_sampler/torch_datasets/utils/time_slice_for_dataset.py,sha256=Q_-kCTtUieyEDpSElY1xwJct7Vsw0LAn5MbYSg2O6vg,3621
 ocf_data_sampler/torch_datasets/utils/valid_time_periods.py,sha256=xcy75cVxl0WrglnX5YUAFjXXlO2GwEBHWyqo8TDuiOA,4714
 ocf_data_sampler/torch_datasets/utils/validation_utils.py,sha256=YqmT-lExWlI8_ul3l0EP73Ik002fStr_bhsZh9mQqEU,4735
 scripts/download_gsp_location_data.py,sha256=rRDXMoqX-RYY4jPdxhdlxJGhWdl6r245F5UARgKV6P4,3121
 scripts/refactor_site.py,sha256=skzvsPP0Cn9yTKndzkilyNcGz4DZ88ctvCJ0XrBdc2A,3135
-ocf_data_sampler-0.5.14.dist-info/METADATA,sha256=OgS9xvqBfhmlWym0DYBBbT-IwZ3tRz_EKo2wEdErmCA,12817
-ocf_data_sampler-0.5.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ocf_data_sampler-0.5.14.dist-info/top_level.txt,sha256=deUxqmsONNAGZDNbsntbXH7BRA1MqWaUeAJrCo6q_xA,25
-ocf_data_sampler-0.5.14.dist-info/RECORD,,
+ocf_data_sampler-0.5.15.dist-info/METADATA,sha256=AcLJpUOG6smk3WDSZkj3K8cjhvSg9z0lPoEKM16B6q8,12817
+ocf_data_sampler-0.5.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ocf_data_sampler-0.5.15.dist-info/top_level.txt,sha256=deUxqmsONNAGZDNbsntbXH7BRA1MqWaUeAJrCo6q_xA,25
+ocf_data_sampler-0.5.15.dist-info/RECORD,,

{ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/WHEEL RENAMED Viewed

File without changes

{ocf_data_sampler-0.5.14.dist-info → ocf_data_sampler-0.5.15.dist-info}/top_level.txt RENAMED Viewed

File without changes

ocf-data-sampler 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl

Potentially problematic release.

ocf-data-sampler 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl