PyPI - hydraflow - Versions diffs - 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

hydraflow 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

hydraflow/core/collection.py +541 -0
hydraflow/core/group_by.py +205 -0
hydraflow/core/run.py +42 -61
hydraflow/core/run_collection.py +37 -494
hydraflow/core/run_info.py +0 -9
{hydraflow-0.16.2.dist-info → hydraflow-0.17.0.dist-info}/METADATA +1 -1
{hydraflow-0.16.2.dist-info → hydraflow-0.17.0.dist-info}/RECORD +10 -8
{hydraflow-0.16.2.dist-info → hydraflow-0.17.0.dist-info}/WHEEL +0 -0
{hydraflow-0.16.2.dist-info → hydraflow-0.17.0.dist-info}/entry_points.txt +0 -0
{hydraflow-0.16.2.dist-info → hydraflow-0.17.0.dist-info}/licenses/LICENSE +0 -0

hydraflow/core/group_by.py ADDED Viewed

@@ -0,0 +1,205 @@
+"""GroupBy module for organizing and aggregating collections of items.
+This module provides the GroupBy class, which represents the result of a
+group_by operation on a Collection. It organizes items into groups based on
+specified keys and enables aggregation operations across those groups.
+The GroupBy class implements a dictionary-like interface, allowing access to
+individual groups through key lookup, iteration, and standard dictionary
+methods like keys(), values(), and items().
+Example:
+    ```python
+    # Group runs by model type
+    grouped = runs.group_by("model.type")
+    # Access a specific group
+    transformer_runs = grouped["transformer"]
+    # Iterate through groups
+    for model_type, group in grouped.items():
+        print(f"Model: {model_type}, Runs: {len(group)}")
+    # Perform aggregations
+    stats = grouped.agg(
+        "accuracy",
+        "loss",
+        avg_time=lambda g: sum(r.get("runtime") for r in g) / len(g)
+    )
+    ```
+The GroupBy class supports aggregation through the agg() method, which can
+compute both predefined metrics from the grouped items and custom aggregations
+specified as callables.
+"""
+from __future__ import annotations
+from dataclasses import MISSING
+from typing import TYPE_CHECKING, Any
+from polars import DataFrame, Series
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        ItemsView,
+        Iterator,
+        KeysView,
+        Sequence,
+        ValuesView,
+    )
+    from .collection import Collection
+class GroupBy[C: Collection[Any], I]:
+    """Represents the result of a group_by operation on a Collection.
+    The GroupBy class organizes items from a Collection into groups based on
+    specified keys. It provides a dictionary-like interface for accessing the
+    groups and methods for aggregating data across the groups.
+    Attributes:
+        by: The keys used for grouping.
+        groups: A dictionary mapping group keys to Collection instances.
+    """
+    by: tuple[str, ...]
+    groups: dict[Any, C]
+    def __init__(self, by: tuple[str, ...], groups: dict[Any, C]) -> None:
+        """Initialize a GroupBy instance.
+        Args:
+            by: The keys used for grouping.
+            groups: A dictionary mapping group keys to Collection instances.
+        """
+        self.by = by
+        self.groups = groups
+    def __getitem__(self, key: Any) -> C:
+        """Get a group by its key.
+        Args:
+            key: The group key to look up.
+        Returns:
+            The Collection corresponding to the key.
+        Raises:
+            KeyError: If the key is not found in the groups.
+        """
+        return self.groups[key]
+    def __iter__(self) -> Iterator[Any]:
+        """Iterate over group keys.
+        Returns:
+            An iterator over the group keys.
+        """
+        return iter(self.groups)
+    def __len__(self) -> int:
+        """Get the number of groups.
+        Returns:
+            The number of groups.
+        """
+        return len(self.groups)
+    def __contains__(self, key: Any) -> bool:
+        """Check if a key is in the groups.
+        Args:
+            key: The key to check for.
+        Returns:
+            True if the key is in the groups, False otherwise.
+        """
+        return key in self.groups
+    def keys(self) -> KeysView[Any]:
+        """Get the keys of the groups.
+        Returns:
+            A view of the group keys.
+        """
+        return self.groups.keys()
+    def values(self) -> ValuesView[C]:
+        """Get the values (Collections) of the groups.
+        Returns:
+            A view of the group values.
+        """
+        return self.groups.values()
+    def items(self) -> ItemsView[Any, C]:
+        """Get the (key, value) pairs of the groups.
+        Returns:
+            A view of the (key, value) pairs.
+        """
+        return self.groups.items()
+    def agg(
+        self,
+        *aggs: str,
+        **named_aggs: Callable[[C | Sequence[I]], Any],
+    ) -> DataFrame:
+        """Aggregate data across groups.
+        This method computes aggregations for each group and returns the results
+        as a DataFrame. There are two ways to specify aggregations:
+        1. String keys: These are interpreted as attributes to extract from each
+           item in the group.
+        2. Callables: Functions that take a Collection or Sequence of items and
+           return an aggregated value.
+        Args:
+            *aggs: String keys to aggregate.
+            **named_aggs: Named aggregation functions.
+        Returns:
+            A DataFrame with group keys and aggregated values.
+        Example:
+            ```python
+            # Aggregate by accuracy and loss, and compute average runtime
+            stats = grouped.agg(
+                "accuracy",
+                "loss",
+                avg_runtime=lambda g: sum(r.get("runtime") for r in g) / len(g)
+            )
+            ```
+        """
+        gp = self.groups
+        if len(self.by) == 1:
+            df = DataFrame({self.by[0]: list(gp)})
+        else:
+            df = DataFrame(dict(zip(self.by, k, strict=True)) for k in gp)
+        columns = []
+        for agg in aggs:
+            values = [[c._get(i, agg, MISSING) for i in c] for c in gp.values()]  # noqa: SLF001
+            columns.append(Series(agg, values))
+        for k, v in named_aggs.items():
+            columns.append(Series(k, [v(r) for r in gp.values()]))
+        return df.with_columns(columns)

hydraflow/core/run.py CHANGED Viewed

@@ -29,7 +29,7 @@ from functools import cached_property
 from pathlib import Path
 from typing import TYPE_CHECKING, cast, overload
-from omegaconf import DictConfig, ListConfig, OmegaConf
+from omegaconf import DictConfig, OmegaConf
 from .run_info import RunInfo
@@ -54,6 +54,7 @@ class Run[C, I = None]:
     """Factory function to create the implementation instance.
     This can be a callable that accepts either:
     - A single Path parameter: the artifacts directory
     - Both a Path and a config parameter: the artifacts directory and
       the configuration instance
@@ -65,10 +66,10 @@ class Run[C, I = None]:
     def __init__(
         self,
         run_dir: Path,
-        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,
+        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
     ) -> None:
         self.info = RunInfo(run_dir)
-        self.impl_factory = impl_factory
+        self.impl_factory = impl_factory or (lambda _: None)  # type: ignore
     def __repr__(self) -> str:
         """Return a string representation of the Run."""
@@ -132,7 +133,7 @@ class Run[C, I = None]:
         impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,  # type: ignore
         *,
         n_jobs: int = 0,
-    ) -> RunCollection[Self]: ...
+    ) -> RunCollection[Self, I]: ...
     @classmethod
     def load(
@@ -141,7 +142,7 @@ class Run[C, I = None]:
         impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,  # type: ignore
         *,
         n_jobs: int = 0,
-    ) -> Self | RunCollection[Self]:
+    ) -> Self | RunCollection[Self, I]:
         """Load a Run from a run directory.
         Args:
@@ -167,13 +168,14 @@ class Run[C, I = None]:
         from .run_collection import RunCollection
         if n_jobs == 0:
-            return RunCollection(cls(Path(r), impl_factory) for r in run_dir)
+            runs = (cls(Path(r), impl_factory) for r in run_dir)
+            return RunCollection(runs, cls.get)  # type: ignore
         from joblib import Parallel, delayed
         parallel = Parallel(backend="threading", n_jobs=n_jobs)
         runs = parallel(delayed(cls)(Path(r), impl_factory) for r in run_dir)
-        return RunCollection(runs)  # type: ignore
+        return RunCollection(runs, cls.get)  # type: ignore
     @overload
     def update(
@@ -211,7 +213,9 @@ class Run[C, I = None]:
                 (can use dot notation like "section.subsection.param"),
                 or a tuple of strings to set multiple related configuration
                 values at once.
-            value: The value to set. This can be:
+            value: The value to set.
+                This can be:
                 - For string keys: Any value, or a callable that returns
                   a value
                 - For tuple keys: An iterable with the same length as the
@@ -258,6 +262,12 @@ class Run[C, I = None]:
         Args:
             key: The key to look for. Can use dot notation for
                 nested keys in configuration.
+                Special keys:
+                - "cfg": Returns the configuration object
+                - "impl": Returns the implementation object
+                - "info": Returns the run information object
             default: Value to return if the key is not found.
                 If a callable, it will be called with the Run instance
                 and the value returned will be used as the default.
@@ -272,6 +282,13 @@ class Run[C, I = None]:
             AttributeError: If the key is not found and
                 no default is provided.
+        Note:
+            The search order for keys is:
+            1. Configuration (cfg)
+            2. Implementation (impl)
+            3. Run information (info)
+            4. Run object itself (self)
         """
         key = key.replace("__", ".")
@@ -279,12 +296,10 @@ class Run[C, I = None]:
         if value is not MISSING:
             return value
-        if self.impl and hasattr(self.impl, key):
-            return getattr(self.impl, key)
-        info = self.info.to_dict()
-        if key in info:
-            return info[key]
+        for attr in [self.impl, self.info, self]:
+            value = getattr(attr, key, MISSING)
+            if value is not MISSING:
+                return value
         if default is not MISSING:
             if callable(default):
@@ -295,71 +310,37 @@ class Run[C, I = None]:
         msg = f"No such key: {key}"
         raise AttributeError(msg)
-    def predicate(self, key: str, value: Any) -> bool:
-        """Check if a value satisfies a condition for filtering.
-        This method retrieves the attribute specified by the key
-        using the get method, and then compares it with the given
-        value according to the following rules:
-        - If value is callable: Call it with the attribute and return
-          the boolean result
-        - If value is a list or set: Check if the attribute is in the list/set
-        - If value is a tuple of length 2: Check if the attribute is
-          in the range [value[0], value[1]]. Both sides are inclusive
-        - Otherwise: Check if the attribute equals the value
+    def to_dict(self, flatten: bool = True) -> dict[str, Any]:
+        """Convert the Run to a dictionary.
         Args:
-            key: The key to get the attribute from.
-            value: The value to compare with, or a callable that takes
-                the attribute and returns a boolean.
+            flatten (bool, optional): If True, flattens nested dictionaries.
+                Defaults to True.
         Returns:
-            bool: True if the attribute satisfies the condition, False otherwise.
+            dict[str, Any]: A dictionary representation of the Run's configuration.
         """
-        attr = self.get(key)
-        return _predicate(attr, value)
-    def to_dict(self) -> dict[str, Any]:
-        """Convert the Run to a dictionary."""
-        info = self.info.to_dict()
         cfg = OmegaConf.to_container(self.cfg)
-        return info | _flatten_dict(cfg)  # type: ignore
+        if not isinstance(cfg, dict):
+            raise TypeError("Configuration must be a dictionary")
+        standard_dict: dict[str, Any] = {str(k): v for k, v in cfg.items()}
-def _predicate(attr: Any, value: Any) -> bool:
-    if callable(value):
-        return bool(value(attr))
+        if flatten:
+            return _flatten_dict(standard_dict)
-    if isinstance(value, ListConfig):
-        value = list(value)
-    if isinstance(value, list | set) and not _is_iterable(attr):
-        return attr in value
-    if isinstance(value, tuple) and len(value) == 2 and not _is_iterable(attr):
-        return value[0] <= attr <= value[1]
-    if _is_iterable(value):
-        value = list(value)
-    if _is_iterable(attr):
-        attr = list(attr)
-    return attr == value
-def _is_iterable(value: Any) -> bool:
-    return isinstance(value, Iterable) and not isinstance(value, str)
+        return standard_dict
 def _flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
     items = []
     for k, v in d.items():
         key = f"{parent_key}.{k}" if parent_key else k
         if isinstance(v, dict):
             items.extend(_flatten_dict(v, key).items())
         else:
             items.append((key, v))
     return dict(items)

hydraflow 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

hydraflow 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl