domainiac 9.3.1.tar.gz → 10.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {domainiac-9.3.1 → domainiac-10.0.0}/PKG-INFO +2 -1
  2. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/__init__.py +1 -2
  3. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/__init__.py +0 -2
  4. domainiac-10.0.0/domainiac/managers/masterdata_manager.py +229 -0
  5. domainiac-10.0.0/domainiac/managers/schemas/masterdata_manager.json +40 -0
  6. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/modeling/plant.py +4 -4
  7. {domainiac-9.3.1 → domainiac-10.0.0}/pyproject.toml +2 -1
  8. domainiac-9.3.1/domainiac/managers/masterdata_manager.py +0 -79
  9. domainiac-9.3.1/domainiac/managers/plant_manager.py +0 -160
  10. domainiac-9.3.1/domainiac/managers/unit_manager.py +0 -42
  11. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/__init__.py +0 -0
  12. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/conversions.py +0 -0
  13. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/interpolation.py +0 -0
  14. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/solar.py +0 -0
  15. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/temperature.py +0 -0
  16. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/typing.py +0 -0
  17. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/functions/wind.py +0 -0
  18. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/availability_manager.py +0 -0
  19. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/metering_manager.py +0 -0
  20. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/nwp_manager.py +0 -0
  21. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/outage_manager.py +0 -0
  22. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/resource_manager.py +0 -0
  23. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/modeling/__init__.py +0 -0
  24. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/modeling/nwp.py +0 -0
  25. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/wrappers/__init__.py +0 -0
  26. {domainiac-9.3.1 → domainiac-10.0.0}/domainiac/wrappers/cache_wrapper.py +0 -0
{domainiac-9.3.1 → domainiac-10.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: domainiac
-Version: 9.3.1
+Version: 10.0.0
 Summary: Package for working with Energinet data, but with specialized functions used for Enigma.
 Author: Team Enigma
 Author-email: enigma@energinet.dk
@@ -17,3 +17,4 @@ Requires-Dist: pvlib (>=0.13.1)
 Requires-Dist: scikit-learn (>=1.3.0)
 Requires-Dist: scipy (>=1.15.3)
 Requires-Dist: typeguard (>=4.2.1)
+Requires-Dist: utm (>=0.8.1)
{domainiac-9.3.1 → domainiac-10.0.0}/domainiac/__init__.py
@@ -1,9 +1,8 @@
 from .managers import (
     AvailabilityManager,
+    MasterdataManager,
     MeteringManager,
     NWPManager,
-    PlantManager,
     ResourceManager,
-    UnitManager,
 )
 from .modeling import Coordinate, Group, Neighborhood, NWPParameter, NWPProvider, Plant
{domainiac-9.3.1 → domainiac-10.0.0}/domainiac/managers/__init__.py
@@ -3,6 +3,4 @@ from .masterdata_manager import MasterdataManager
 from .metering_manager import MeteringManager
 from .nwp_manager import NWPManager
 from .outage_manager import OutageManager
-from .plant_manager import PlantManager
 from .resource_manager import ResourceManager
-from .unit_manager import UnitManager
domainiac-10.0.0/domainiac/managers/masterdata_manager.py
@@ -0,0 +1,229 @@
+import json
+from pathlib import Path
+
+import datamazing.pandas as pdz
+import pandas as pd
+import utm
+
+CONNECTION_POINT_BEHIND_THE_METER = "installationstilsluttet (I)"
+
+
+class MasterdataManager:
+
+    SCHEMA = json.loads(
+        (Path(__file__).parent / "schemas/masterdata_manager.json").read_bytes()
+    )
+
+    def __init__(
+        self,
+        db: pdz.Database,
+        time_interval: pdz.TimeInterval = None,
+        as_of_time: pd.Timestamp = None,
+    ) -> None:
+        self.db = db
+        self.time_interval = time_interval
+        self.as_of_time = as_of_time
+
+        if as_of_time is not None and as_of_time.utcoffset().total_seconds() != 0:
+            raise ValueError("'as_of_time' must be UTC")
+
+        if as_of_time is not None and time_interval is not None:
+            raise ValueError("Cannot provide both 'as_of_time' and 'time_interval'")
+        if as_of_time is None and time_interval is None:
+            raise ValueError(
+                "Either 'as_of_time' must be provided, or 'time_interval' must be "
+                "provided"
+            )
+
+    @property
+    def start_time(self) -> pd.Timestamp:
+        if self.as_of_time:
+            return self.as_of_time
+        else:
+            return self.time_interval.left
+
+    @property
+    def end_time(self) -> pd.Timestamp:
+        if self.as_of_time:
+            return self.as_of_time + pdz.get_epsilon(dtype=pd.DatetimeTZDtype(tz="UTC"))
+        else:
+            return self.time_interval.right
+
+    @staticmethod
+    def _intersect_time_and_comission_intervals(df: pd.DataFrame) -> pd.DataFrame:
+        df = df.copy()
+        # take max of start time and commission time (if the latter is not null)
+        df["start_time_utc"] = df[["start_time_utc", "commission_time_utc"]].max(axis=1)
+        # take min of end time and decommission time (if the latter is not null)
+        df["end_time_utc"] = df[["end_time_utc", "decommission_time_utc"]].min(axis=1)
+        # filter out rows where start time is after end time
+        df = df[df["start_time_utc"] <= df["end_time_utc"]]
+
+        df = df.drop(columns=["commission_time_utc", "decommission_time_utc"])
+        return df
+
+    def _filter_interval(self, df: pd.DataFrame) -> pd.DataFrame:
+        df["start_time_utc"] = df["start_time_utc"].clip(lower=self.start_time)
+        df["end_time_utc"] = df["end_time_utc"].clip(upper=self.end_time)
+        df = df[df["start_time_utc"] < df["end_time_utc"]]
+        return df
+
+    def _get_table(self, table_name: str) -> pd.DataFrame:
+        df = self.db.query(table_name)
+        df = df.drop(columns=["emda_version", "created_time_utc"], errors="ignore")
+        return df
+
+    def get_plant_masterdata(self) -> pd.DataFrame:
+        df_plant = self._get_table("masterdata_emda_plants")
+        df_unit = self._get_table("masterdata_emda_units")
+        df_market_participant = self._get_table("masterdata_emda_market_participants")
+
+        df_plant = self._intersect_time_and_comission_intervals(df_plant)
+        df_unit = self._intersect_time_and_comission_intervals(df_unit)
+
+        df_plant = self._filter_interval(df_plant)
+        df_unit = self._filter_interval(df_unit)
+        df_market_participant = self._filter_interval(df_market_participant)
+
+        df_unit = df_unit.sort_values(by="capacity_max_MW", ascending=False)
+
+        df_unit_summary = pdz.group_interval(
+            df_unit,
+            by=["plant_id"],
+            interval=("start_time_utc", "end_time_utc"),
+        ).agg(
+            {
+                "power_system_resource_type": "first",  # from largest unit
+                "asset_type": "first",  # from largest unit
+                "power_system_resource": "first",  # from largest unit
+                "coordinate_x_utm": "mean",
+                "coordinate_y_utm": "mean",
+                "hub_height_m": "mean",
+                "c11": "first",  # assumed unique per group
+            }
+        )
+
+        df_plant = pdz.merge_interval_interval(
+            df_plant,
+            df_unit_summary,
+            on=["plant_id"],
+            interval=("start_time_utc", "end_time_utc"),
+            how="left",
+        )
+
+        df_plant = pdz.merge_interval_interval(
+            df_plant,
+            df_market_participant,
+            on=["market_participant_id"],
+            interval=("start_time_utc", "end_time_utc"),
+            how="left",
+        )
+
+        df_address = self._get_table("masterdata_address")
+        df_address = df_address.rename(
+            columns={
+                "street_name": "address_street_name",
+                "house_number": "address_house_number",
+                "postal_code": "address_postal_code",
+                "latitude": "address_latitude",
+                "longitude": "address_longitude",
+            }
+        )
+
+        df_plant = pdz.merge(
+            df_plant,
+            df_address,
+            on=["address_street_name", "address_house_number", "address_postal_code"],
+            how="left",
+        )
+
+        df_plant["is_household"] = is_household(df_plant)
+
+        df_plant["latitude"], df_plant["longitude"] = coordinates(df_plant)
+        df_plant = df_plant.drop(columns=["address_latitude", "address_longitude"])
+
+        return df_plant
+
+
+def is_household(df: pd.DataFrame) -> pd.Series:
+    """
+    Determine if a plant corresponds to a household installation.
+    """
+    # due to the inadequate quality of masterdata,
+    # we apply several filters to estimate whether
+    # an installation is a household or not,
+    # based on the description found in
+    # https://ens.dk/sites/ens.dk/files/Stoette_vedvarende_energi/energistyrelsens_vejledning_om_beregning_af_nettoafregning_og_opgoerelse_.pdf
+
+    # household installations will be behind-the-meter
+    # ("installationstilsluttet")
+    is_behind_the_meter = df["connection_point"] == CONNECTION_POINT_BEHIND_THE_METER
+
+    # household installations will be in the yearly
+    # settlement group 6 ("årsbaseret nettoafregning")
+    is_settlement_group_6 = df["settlement_group"] == 6
+
+    # also remove installations with installed power
+    # below 200 kW, as these will also most likely
+    # be behind-the-meter installations (this should
+    # already be captured by the filters above, but
+    # masterdata is not fully reliable)
+    # an example of a 200 kW plant can be found at coordinates 55.691, 9.397
+    is_small = df["capacity_max_MW"] <= 0.2
+
+    # if the installation is connected to the TSO,
+    # it is definitely not a household installation
+    is_tso_connected = df["operation_type"] == "Tso"
+
+    return (is_behind_the_meter | is_settlement_group_6 | is_small) & ~is_tso_connected
+
+
+def coordinates(
+    df: pd.DataFrame,
+) -> tuple[pd.Series, pd.Series]:
+    # translate UTM coordinates to lat/lon where they exist
+    latitude, longitude = utm_to_latlon(
+        df["coordinate_x_utm"],
+        df["coordinate_y_utm"],
+        df["price_area"],
+    )
+
+    # prefer translated UTM coordinates, fall back to address coordinates
+    latitude = latitude.combine_first(df["address_latitude"])
+    longitude = longitude.combine_first(df["address_longitude"])
+
+    return latitude, longitude
+
+
+def utm_to_latlon(
+    x: pd.Series, y: pd.Series, price_area: pd.Series
+) -> tuple[pd.Series, pd.Series]:
+    """
+    Convert UTM coordinates to latitude and longitude, given the price area.
+    Ideally, we would use the zone number, but this is currently not available
+    in masterdata. Instead, we use a workaround based on the price area, which
+    should be sufficient for now. It follows this logic:
+    - If the price area is DK1, use UTM zone 32.
+    - If the price area is DK2, use UTM zone 33, unless the resulting longitude
+      is above 16.0 (about 50 km east of Bornholm): in that case, we assume
+      the real zone is actually 32, since no entities should be placed there.
+    - If the price area is undefined, use UTM zone 32, since this covers the
+      majority of the area of Denmark.
+    """
+    if x.empty or y.empty or x.isnull().all() or y.isnull().all():
+        return pd.Series(dtype=float), pd.Series(dtype=float)
+
+    lat_32, lon_32 = utm.to_latlon(x, y, zone_number=32, northern=True, strict=False)
+
+    lat_33, lon_33 = utm.to_latlon(x, y, zone_number=33, northern=True, strict=False)
+
+    in_DK1 = price_area == "DK1"
+    in_DK2 = price_area == "DK2"
+    undefined = price_area.isnull()
+
+    is_zone_32 = in_DK1 | undefined | (in_DK2 & (lon_33 >= 16.0))
+
+    lat = lat_32.where(is_zone_32, lat_33)
+    lon = lon_32.where(is_zone_32, lon_33)
+
+    return lat, lon
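
For orientation (not part of the diff): a minimal usage sketch of the new MasterdataManager, assuming a configured datamazing database handle. Per the constructor above, exactly one of 'as_of_time' (a tz-aware UTC timestamp) or 'time_interval' must be given; passing both or neither raises a ValueError.

    import datamazing.pandas as pdz
    import pandas as pd

    from domainiac import MasterdataManager

    def load_plants_as_of(db: pdz.Database) -> pd.DataFrame:
        # 'as_of_time' must be tz-aware UTC, otherwise __init__ raises ValueError
        manager = MasterdataManager(db=db, as_of_time=pd.Timestamp("2024-01-01T00:00:00Z"))
        # one consolidated frame: plants joined with unit summaries,
        # market participants, addresses, plus derived lat/lon and is_household
        return manager.get_plant_masterdata()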
domainiac-10.0.0/domainiac/managers/schemas/masterdata_manager.json
@@ -0,0 +1,40 @@
+{
+    "address_city": "object",
+    "address_country": "object",
+    "address_house_number": "object",
+    "address_municipality": "object",
+    "address_postal_code": "object",
+    "address_street_name": "object",
+    "asset_type": "object",
+    "c11": "object",
+    "capacity_max_MW": "float64",
+    "capacity_min_MW": "float64",
+    "connection_point": "object",
+    "coordinate_x_utm": "float64",
+    "coordinate_y_utm": "float64",
+    "datahub_gsrn_e17": "object",
+    "datahub_gsrn_e18": "object",
+    "end_time_utc": "datetime64[ns, UTC]",
+    "hub_height_m": "float64",
+    "market_participant_ecp_eic": "object",
+    "market_participant_eic": "object",
+    "market_participant_gln": "object",
+    "market_participant_id": "object",
+    "market_participant_name": "object",
+    "market_participant_short_name": "object",
+    "operation_type": "object",
+    "plant_gsrn": "object",
+    "plant_id": "object",
+    "plant_name": "object",
+    "plant_short_name": "object",
+    "plant_type": "object",
+    "power_system_resource": "object",
+    "power_system_resource_type": "object",
+    "price_area": "object",
+    "settlement_group": "object",
+    "start_time_utc": "datetime64[ns, UTC]",
+    "substation_id": "object",
+    "is_household": "bool",
+    "latitude": "float64",
+    "longitude": "float64"
+}
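
The schema file maps each column of the consolidated plant frame to a pandas dtype string and is loaded into MasterdataManager.SCHEMA at import time. The diff does not show SCHEMA being used anywhere else; one plausible use (an assumption, not the package's documented API) is dtype coercion of the returned frame:

    import pandas as pd

    from domainiac.managers.masterdata_manager import MasterdataManager

    def coerce_to_schema(df: pd.DataFrame) -> pd.DataFrame:
        # cast the columns present in the frame to their declared dtypes;
        # illustrative helper only, not part of the package
        dtypes = {
            col: dtype
            for col, dtype in MasterdataManager.SCHEMA.items()
            if col in df.columns
        }
        return df.astype(dtypes)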
{domainiac-9.3.1 → domainiac-10.0.0}/domainiac/modeling/plant.py
@@ -12,7 +12,7 @@ class Plant:
     datahub_gsrn_e18: str
     price_area: str
     coordinate: Coordinate
-    installed_power_MW: float
+    capacity_max_MW: float
     power_system_resource_type: str

     @classmethod
@@ -40,7 +40,7 @@ class Plant:
                 longitude=row["longitude"],
                 altitude=altitude,
             ),
-            installed_power_MW=row["installed_power_MW"],
+            capacity_max_MW=row["capacity_max_MW"],
             power_system_resource_type=psrt,
         )
         plants.append(plant)
@@ -50,7 +50,7 @@
 @dataclass(frozen=True)
 class Group:
     coordinate: Coordinate
-    installed_power_MW: float
+    capacity_max_MW: float
     identifiers: dict[str, str]

     @classmethod
@@ -65,7 +65,7 @@ class Group:
         identifiers = {identifier: row[identifier] for identifier in identifiers}
         group = cls(
             identifiers=identifiers,
-            installed_power_MW=row["installed_power_MW"],
+            capacity_max_MW=row["capacity_max_MW"],
             coordinate=Coordinate(
                 latitude=row["latitude"],
                 longitude=row["longitude"],
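
The rename from installed_power_MW to capacity_max_MW is a breaking change for any caller that constructs Plant or Group directly or reads the old attribute, consistent with the major version bump. A hypothetical downstream migration:

    def plant_capacity_MW(plant) -> float:
        # 9.3.1:  return plant.installed_power_MW
        # 10.0.0: the dataclass field and the source column are both capacity_max_MW
        return plant.capacity_max_MW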
{domainiac-9.3.1 → domainiac-10.0.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "domainiac"
-version = "9.3.1"
+version = "10.0.0"
 description = "Package for working with Energinet data, but with specialized functions used for Enigma."
 authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
 requires-python = ">=3.10"
@@ -16,6 +16,7 @@ typeguard = ">=4.2.1"
 scikit-learn = ">=1.3.0"
 scipy = ">=1.15.3"
 pvlib = ">=0.13.1"
+utm = ">=0.8.1"

 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=2.20.0"
domainiac-9.3.1/domainiac/managers/masterdata_manager.py
@@ -1,79 +0,0 @@
-import datamazing.pandas as pdz
-import pandas as pd
-from typeguard import typechecked
-
-from ..wrappers import cache_decorator
-
-
-class MasterdataManager:
-    """
-    Manager which simplifies the process of getting units from masterdata.
-    """
-
-    def __init__(
-        self,
-        db: pdz.Database,
-        time_interval: pdz.TimeInterval,
-        resolution: pd.Timedelta,
-        cache_masterdata: bool = False,
-    ) -> None:
-        self.db = db
-        self.time_interval = time_interval
-        self.resolution = resolution
-        self.cache_masterdata = cache_masterdata
-
-    masterdata_cache = {}
-
-    @typechecked
-    def _get_operational_entities(self, table: str) -> pd.DataFrame:
-        filters = {"standing_entity_state": "InOperation"}
-        df = self.db.query(table, filters=filters)
-        df = df[df["decommission_date_utc"].isna()].reset_index(drop=True)
-        return df
-
-    @typechecked
-    def get_operational_entities(self, table: str) -> pd.DataFrame:
-        """Gets the operational data for a given table."""
-
-        if self.cache_masterdata:
-            cached_query = cache_decorator(self.masterdata_cache)(
-                self._get_operational_entities
-            )
-            df = cached_query(table)
-        else:
-            df = self._get_operational_entities(table)
-
-        return df
-
-    @typechecked
-    def get_data(
-        self,
-        table: str,
-        filters: dict = {},
-        columns: list = [],
-    ) -> pd.DataFrame:
-        """Gets the data for a given table.
-        Filters for rows valid at the end of time interval.
-        """
-        # Get operational entities
-        df = self.get_operational_entities(table)
-
-        # Apply the filters
-        for column, value in filters.items():
-            if isinstance(value, list):
-                df = df[df[column].isin(value)].reset_index()
-            else:
-                df = df[df[column] == value].reset_index()
-
-        for column in columns:
-            if column not in df.columns:
-                raise KeyError(f"Column {column} not found in {table}")
-
-        df = pdz.as_of_time(
-            df=df,
-            period=("valid_from_date_utc", "valid_to_date_utc"),
-            at=self.time_interval.right,
-        )
-        df = df.filter(columns)
-
-        return df
domainiac-9.3.1/domainiac/managers/plant_manager.py
@@ -1,160 +0,0 @@
-import datamazing.pandas as pdz
-import pandas as pd
-
-from .masterdata_manager import MasterdataManager
-from .unit_manager import UnitManager
-
-
-class PlantManager(MasterdataManager):
-    """
-    Manager which simplifies the process of getting plants from masterdata.
-    """
-
-    def __init__(
-        self,
-        db: pdz.Database,
-        time_interval: pdz.TimeInterval,
-        resolution: pd.Timedelta,
-        cache_masterdata: bool = False,
-    ) -> None:
-        self.db = db
-        self.time_interval = time_interval
-        self.resolution = resolution
-        self.cache_masterdata = cache_masterdata
-        self.unit_manager = UnitManager(db, time_interval, resolution, cache_masterdata)
-
-    def get_plants(
-        self,
-        filters: dict = {},
-        columns: list | None = None,
-    ) -> pd.DataFrame:
-        """Gets the plants for a given plant type.
-        Filters for plants valid at the end of time interval.
-        Filters by default for plants in operation.
-        """
-        default_columns = [
-            "plant_id",
-            "masterdata_gsrn",
-            "datahub_gsrn_e18",
-            "installed_power_MW",
-            "price_area",
-            "is_tso_connected",
-            "valid_from_date_utc",
-            "valid_to_date_utc",
-            "primary_net_component_id",
-        ]
-        if not columns:
-            columns = default_columns
-
-        # TODO: masterdata_plant table doesn't have net_component_id column
-        # Find a better way to do this in future.
-        plant_columns = [col for col in columns if col != "primary_net_component_id"]
-        df_plant = self.get_data(
-            "masterdataPlant", filters=filters, columns=plant_columns
-        )
-        df_psr = self._get_power_system_resource()
-        df = df_plant.merge(
-            df_psr, on=["plant_id"], how="left", validate="m:1"
-        ).drop_duplicates()
-
-        df = df[columns]
-
-        return df
-
-    def get_installed_power_timeseries(self, gsrn: str) -> pd.DataFrame:
-        """Gets the installed power timeseries for a plant."""
-
-        df_times = self.time_interval.to_range(self.resolution).to_frame(
-            index=False, name="time_utc"
-        )
-
-        # explode plant to time series
-        df_plant = self.get_operational_entities("masterdataPlant")
-        df_plant = df_plant.query(f"masterdata_gsrn == '{gsrn}'")
-
-        df_plant = pdz.merge_point_interval(
-            df_times,
-            df_plant,
-            left_point="time_utc",
-            right_interval=("valid_from_date_utc", "valid_to_date_utc"),
-        )
-
-        return df_plant.filter(["time_utc", "installed_power_MW"]).reset_index(
-            drop=True
-        )
-
-    def _get_corrected_installed_power(
-        self, gsrn: str, df_invalid_periods: pd.DataFrame
-    ):
-        df_times = self.time_interval.to_range(self.resolution).to_frame(
-            index=False, name="time_utc"
-        )
-        df = self.get_installed_power_timeseries(gsrn=gsrn)
-
-        # explode invalid periods to time series
-        df_invalid_periods = df_invalid_periods.query(f"masterdata_gsrn == '{gsrn}'")
-        df_invalid_periods = pdz.merge(
-            df_times,
-            df_invalid_periods,
-            left_time="time_utc",
-            right_period=("start_date_utc", "end_date_utc"),
-        )
-
-        df = pdz.merge(
-            df,
-            df_invalid_periods,
-            on="time_utc",
-            how="left",
-        )
-
-        # correct installed power for invalid periods
-        df["installed_power_MW"] = df["installed_power_MW"].where(
-            df["corrected_installed_power_MW"].isnull(),
-            df["corrected_installed_power_MW"],
-        )
-
-        df = df[["time_utc", "installed_power_MW"]]
-
-        return df
-
-    def _get_power_system_resource(self) -> pd.DataFrame:
-
-        df_unit = self.unit_manager.get_units(
-            columns=["masterdata_gsrn", "capacity_min_MW", "capacity_max_MW"]
-        )
-
-        df_psr_mapping = self.db.query("masterdataAggregatedUnit")[
-            ["unit_gsrn", "net_component_id"]
-        ]
-
-        df = pd.merge(
-            df_psr_mapping,
-            df_unit,
-            left_on="unit_gsrn",
-            right_on="masterdata_gsrn",
-            how="left",
-            validate="1:m",
-        )
-
-        # for a small number of plants, the underlying unit can
-        # be associated with different net components. To avoid
-        # this issue, we choose for each plant the net component
-        # for which the underlying units amount to the largest
-        # capacity
-        df = pdz.group(df, by=["net_component_id", "plant_id"]).agg(
-            {"capacity_min_MW": "sum", "capacity_max_MW": "sum"}
-        )
-
-        df["capacity_range_MW"] = df["capacity_max_MW"] - df["capacity_min_MW"]
-
-        df = df.sort_values(
-            ["plant_id", "capacity_range_MW"], ascending=False
-        ).drop_duplicates(subset=["plant_id"], keep="first")
-
-        df = df.rename(
-            columns={
-                "net_component_id": "primary_net_component_id",
-            }
-        )
-
-        return df
domainiac-9.3.1/domainiac/managers/unit_manager.py
@@ -1,42 +0,0 @@
-import datamazing.pandas as pdz
-import pandas as pd
-
-from .masterdata_manager import MasterdataManager
-
-
-class UnitManager(MasterdataManager):
-    """
-    Manager which simplifies the process of getting units from masterdata.
-    """
-
-    def __init__(
-        self,
-        db: pdz.Database,
-        time_interval: pdz.TimeInterval,
-        resolution: pd.Timedelta,
-        cache_masterdata: bool = False,
-    ) -> None:
-        self.db = db
-        self.time_interval = time_interval
-        self.resolution = resolution
-        self.cache_masterdata = cache_masterdata
-
-    def get_units(
-        self,
-        filters: dict = {},
-        columns: list | None = None,
-    ) -> pd.DataFrame:
-        """Gets the units for a given unit type.
-        Filters for units valid at the end of time interval.
-        Filters by default for units in operation.
-        """
-        default_columns = [
-            "masterdata_gsrn",
-            "plant_id",
-            "power_system_resource_type",
-        ]
-        if not columns:
-            columns = default_columns
-        else:
-            columns = list(set(default_columns + columns))
-        return self.get_data("masterdataUnit", filters=filters, columns=columns)
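
In summary, 10.0.0 removes PlantManager and UnitManager together with their filter/column-based accessors; the nearest replacement in this diff is the rewritten MasterdataManager. A hedged migration sketch, assuming a time-interval-based workflow (note that the old 'resolution' and 'cache_masterdata' arguments have no counterpart in the new constructor):

    import datamazing.pandas as pdz
    import pandas as pd

    from domainiac import MasterdataManager

    def get_plants_v10(db: pdz.Database, time_interval: pdz.TimeInterval) -> pd.DataFrame:
        # 9.3.1:  PlantManager(db, time_interval, resolution).get_plants(filters=...)
        # 10.0.0: one consolidated masterdata frame; filter the result instead
        manager = MasterdataManager(db=db, time_interval=time_interval)
        df = manager.get_plant_masterdata()
        return df[df["price_area"] == "DK1"]  # example filter, replaces the filters= kwarg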