PyPI - hydraflow - Versions diffs - 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl - Mend

hydraflow 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

hydraflow/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Integrate Hydra and MLflow to manage and track machine learning experiments."""
+from hydraflow.core.collection import Collection
 from hydraflow.core.context import chdir_artifact, log_run, start_run
 from hydraflow.core.io import (
     get_artifact_dir,
@@ -14,6 +15,7 @@ from hydraflow.core.run import Run
 from hydraflow.core.run_collection import RunCollection
 __all__ = [
+    "Collection",
     "Run",
     "RunCollection",
     "chdir_artifact",

hydraflow/core/collection.py CHANGED Viewed

@@ -4,9 +4,10 @@ from __future__ import annotations
 from collections.abc import Hashable, Iterable, Sequence
 from dataclasses import MISSING
-from typing import TYPE_CHECKING, overload
+from typing import TYPE_CHECKING, Concatenate, overload
 import numpy as np
+from joblib.parallel import Parallel, delayed
 from omegaconf import ListConfig, OmegaConf
 from polars import DataFrame, Series
@@ -378,6 +379,77 @@ class Collection[I](Sequence[I]):
         return self[index]
+    def map[**P, R](
+        self,
+        function: Callable[Concatenate[I, P], R],
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> Iterator[R]:
+        """Apply a function to each item and return an iterator of results.
+        This is a memory-efficient mapping operation that lazily evaluates results.
+        Ideal for large collections where memory usage is a concern.
+        Args:
+            function: Function to apply to each item. The item is passed
+                as the first argument.
+            *args: Additional positional arguments to pass to the function.
+            **kwargs: Additional keyword arguments to pass to the function.
+        Returns:
+            Iterator[R]: An iterator of the function's results.
+        Examples:
+            ```python
+            # Process results one at a time
+            for result in collection.map(process_item, additional_arg):
+                handle_result(result)
+            # Convert to list if needed
+            results = list(collection.map(transform_item))
+            ```
+        """
+        yield from (function(i, *args, **kwargs) for i in self)
+    def pmap[**P, R](
+        self,
+        function: Callable[Concatenate[I, P], R],
+        n_jobs: int = -1,
+        backend: str = "multiprocessing",
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> list[R]:
+        """Apply a function to each item in parallel and return a list of results.
+        This method processes items concurrently for improved performance on
+        CPU-bound or I/O-bound operations, depending on the backend.
+        Args:
+            function: Function to apply to each item. The item is passed
+                as the first argument.
+            n_jobs (int): Number of jobs to run in parallel. -1 means using all
+                processors.
+            backend (str): Parallelization backend.
+            *args: Additional positional arguments to pass to the function.
+            **kwargs: Additional keyword arguments to pass to the function.
+        Returns:
+            list[R]: A list containing all results of the function applications.
+        Examples:
+            ```python
+            # Process all items in parallel using all cores
+            results = collection.pmap(heavy_computation)
+            # Specify number of parallel jobs and backend
+            results = collection.pmap(process_files, n_jobs=4, backend="threading")
+            ```
+        """
+        parallel = Parallel(n_jobs=n_jobs, backend=backend, return_as="list")
+        return parallel(delayed(function)(i, *args, **kwargs) for i in self)  # type: ignore
     def to_frame(
         self,
         *keys: str,
@@ -409,7 +481,7 @@ class Collection[I](Sequence[I]):
         if not kwargs:
             return df
-        columns = [Series(k, [v(r) for r in self]) for k, v in kwargs.items()]
+        columns = [Series(k, self.map(v)) for k, v in kwargs.items()]
         return df.with_columns(*columns)
     def group_by(self, *by: str) -> GroupBy[Self, I]:

hydraflow/core/context.py CHANGED Viewed

@@ -128,13 +128,12 @@ def chdir_artifact(run: Run) -> Iterator[Path]:
         run (Run | None): The run to get the artifact directory from.
     """
-    curdir = Path.cwd()
+    current_dir = Path.cwd()
     artifact_dir = get_artifact_dir(run)
-    os.chdir(artifact_dir)
     try:
+        os.chdir(artifact_dir)
         yield artifact_dir
     finally:
-        os.chdir(curdir)
+        os.chdir(current_dir)

hydraflow/core/run.py CHANGED Viewed

@@ -3,7 +3,7 @@
 This module provides the Run class, which represents an MLflow
 Run in HydraFlow. A Run contains three main components:
-1. info: Information about the run, such as run directory,
+1. info: Information about the run, which includes the run directory,
    run ID, and job name.
 2. cfg: Configuration loaded from the Hydra configuration file.
 3. impl: Implementation instance created by the provided
@@ -23,7 +23,9 @@ behavior based on the run's configuration.
 from __future__ import annotations
 import inspect
+import os
 from collections.abc import Callable, Iterable
+from contextlib import contextmanager
 from dataclasses import MISSING
 from functools import cached_property
 from pathlib import Path
@@ -34,6 +36,7 @@ from omegaconf import DictConfig, OmegaConf
 from .run_info import RunInfo
 if TYPE_CHECKING:
+    from collections.abc import Iterator
     from typing import Any, Self
     from .run_collection import RunCollection
@@ -122,7 +125,7 @@ class Run[C, I = None]:
     def load(  # type: ignore
         cls,
         run_dir: str | Path,
-        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,  # type: ignore
+        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
     ) -> Self: ...
     @overload
@@ -130,7 +133,7 @@ class Run[C, I = None]:
     def load(
         cls,
         run_dir: Iterable[str | Path],
-        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,  # type: ignore
+        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
         *,
         n_jobs: int = 0,
     ) -> RunCollection[Self, I]: ...
@@ -139,7 +142,7 @@ class Run[C, I = None]:
     def load(
         cls,
         run_dir: str | Path | Iterable[str | Path],
-        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None,  # type: ignore
+        impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
         *,
         n_jobs: int = 0,
     ) -> Self | RunCollection[Self, I]:
@@ -149,11 +152,11 @@ class Run[C, I = None]:
             run_dir (str | Path | Iterable[str | Path]): The directory where the
                 MLflow runs are stored, either as a string, a Path instance,
                 or an iterable of them.
-            impl_factory (Callable[[Path], I] | Callable[[Path, C], I]): A factory
-                function that creates the implementation instance. It can accept
-                either just the artifacts directory path, or both the path and
-                the configuration instance. Defaults to a function that returns
-                None.
+            impl_factory (Callable[[Path], I] | Callable[[Path, C], I] | None):
+                A factory function that creates the implementation instance. It
+                can accept either just the artifacts directory path, or both the
+                path and the configuration instance. Defaults to None, in which
+                case a function that returns None is used.
             n_jobs (int): The number of parallel jobs. If 0 (default), runs
                 sequentially. If -1, uses all available CPU cores.
@@ -284,10 +287,11 @@ class Run[C, I = None]:
         Note:
             The search order for keys is:
-            1. Configuration (cfg)
-            2. Implementation (impl)
-            3. Run information (info)
-            4. Run object itself (self)
+            1. Configuration (`cfg`)
+            2. Implementation (`impl`)
+            3. Run information (`info`)
+            4. Run object itself (`self`)
         """
         key = key.replace("__", ".")
@@ -298,7 +302,7 @@ class Run[C, I = None]:
         for attr in [self.impl, self.info, self]:
             value = getattr(attr, key, MISSING)
-            if value is not MISSING:
+            if value is not MISSING and not callable(value):
                 return value
         if default is not MISSING:
@@ -332,6 +336,70 @@ class Run[C, I = None]:
         return standard_dict
+    @contextmanager
+    def chdir(self, relative_dir: str = "") -> Iterator[Path]:
+        """Change the current working directory to the artifact directory.
+        This context manager changes the current working directory
+        to the artifact directory of the run.
+        It ensures that the directory is changed back
+        to the original directory after the context is exited.
+        Args:
+            relative_dir (str): The relative directory to the artifact
+                directory. Defaults to an empty string.
+        Yields:
+            Path: The artifact directory of the run.
+        """
+        artifacts_dir = self.info.run_dir / "artifacts" / relative_dir
+        current_dir = Path.cwd()
+        try:
+            os.chdir(artifacts_dir)
+            yield artifacts_dir
+        finally:
+            os.chdir(current_dir)
+    def path(self, relative_path: str = "") -> Path:
+        """Return the path relative to the artifact directory.
+        Args:
+            relative_path (str): The relative path to the artifact directory.
+        Returns:
+            Path: The path relative to the artifact directory.
+        """
+        return self.info.run_dir / "artifacts" / relative_path
+    def iterdir(self, relative_dir: str = "") -> Iterator[Path]:
+        """Iterate over the artifact directories for the run.
+        Args:
+            relative_dir (str): The relative directory to iterate over.
+        Yields:
+            Path: The artifact directory for the run.
+        """
+        yield from self.path(relative_dir).iterdir()
+    def glob(self, pattern: str, relative_dir: str = "") -> Iterator[Path]:
+        """Glob the artifact directories for the run.
+        Args:
+            pattern (str): The pattern to glob.
+            relative_dir (str): The relative directory to glob.
+        Yields:
+            Path: The existing artifact paths that match the pattern.
+        """
+        yield from self.path(relative_dir).glob(pattern)
 def _flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
     items = []

hydraflow/core/run_collection.py CHANGED Viewed

@@ -20,10 +20,13 @@ Example:
     # Sort runs by specific keys
     sorted_runs = runs.sort("metrics.accuracy", reverse=True)
-    # Group runs by model type and compute aggregates
-    grouped = runs.group_by("model.type",
-                           avg_acc=lambda rc: sum(r.get("metrics.accuracy")
-                                                 for r in rc) / len(rc))
+    # Group runs by model type
+    grouped = runs.group_by("model.type")
+    # Compute aggregates on grouped data
+    metrics_df = grouped.agg(
+        avg_acc=lambda rc: sum(r.get("metrics.accuracy") for r in rc) / len(rc)
+    )
     # Convert runs to a DataFrame for analysis
     df = runs.to_frame("run_id", "model.type", "metrics.accuracy")
@@ -44,7 +47,8 @@ from .collection import Collection
 from .run import Run
 if TYPE_CHECKING:
-    from collections.abc import Callable, Iterable
+    from collections.abc import Callable, Iterable, Iterator
+    from pathlib import Path
     from typing import Any, Self
@@ -166,10 +170,46 @@ class RunCollection[R: Run[Any, Any], I = None](Collection[R]):
     @cached_property
     def impls(self) -> Collection[I]:
-        """Get the implementation object for all runs in the collection.
+        """Get the implementation objects for all runs in the collection.
         Returns:
-            Collection[Any]: A collection of implementation objects for all runs.
+            Collection[I]: A collection of implementation objects for all runs.
         """
         return Collection(run.impl for run in self)
+    def iterdir(self, relative_dir: str = "") -> Iterator[Path]:
+        """Iterate over the artifact directories for all runs in the collection.
+        This method yields all files and directories in the specified
+        relative directory for each run in the collection.
+        Args:
+            relative_dir (str): The relative directory within the artifacts
+                directory to iterate over.
+        Yields:
+            Path: Each path in the specified directory for each run
+            in the collection.
+        """
+        for run in self:
+            yield from run.path(relative_dir).iterdir()
+    def glob(self, pattern: str, relative_dir: str = "") -> Iterator[Path]:
+        """Glob the artifact directories for all runs in the collection.
+        This method yields all paths matching the specified pattern
+        in the relative directory for each run in the collection.
+        Args:
+            pattern (str): The glob pattern to match files or directories.
+            relative_dir (str): The relative directory within the artifacts
+                directory to search in.
+        Yields:
+            Path: Each path matching the pattern for each run in the collection.
+        """
+        for run in self:
+            yield from run.path(relative_dir).glob(pattern)

{hydraflow-0.17.0.dist-info → hydraflow-0.17.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hydraflow
-Version: 0.17.0
+Version: 0.17.2
 Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
 Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
 Project-URL: Source, https://github.com/daizutabi/hydraflow

{hydraflow-0.17.0.dist-info → hydraflow-0.17.2.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,14 @@
-hydraflow/__init__.py,sha256=8UraqH00Qp0In301ZUmQBRTIGbV1L5zSZACOUlIRPn8,727
+hydraflow/__init__.py,sha256=_cLLokEv0pUlwvG8RMnjOwCTtDQBs0-RgGbtDk5m_Xg,794
 hydraflow/cli.py,sha256=3rGr___wwp8KazjLGQ7JO_IgAMqLyMlcVSs_QJK7g0Y,3135
 hydraflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydraflow/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-hydraflow/core/collection.py,sha256=tUdjV_v4vzUHSNET-Z7a_8k5oXoH6nkZ_0OxZ-u8_nI,16791
-hydraflow/core/context.py,sha256=igE17oQESGjH-sBnICI8HkZbngY_crkHTgx2E-YkmEo,4155
+hydraflow/core/collection.py,sha256=RSYgS4VsGjSm0Inrz4GAng_jmm-Ct_VSDmZ9rvKFQQw,19472
+hydraflow/core/context.py,sha256=6vpwe0Xfl6mzh2hHLE-4uB9Hjew-CK4pA0KFihQ80U8,4168
 hydraflow/core/group_by.py,sha256=Pnw-oA5aXHeRG9lMLz-bKc8drqQ8LIRsWzvVn153iyQ,5488
 hydraflow/core/io.py,sha256=B3-jPuJWttRgpbIpy_XA-Z2qpXzNF1ATwyYEwA7Pv3w,5172
 hydraflow/core/main.py,sha256=pgr2b9A4VoZuwbApE71NElmV64MFJv8UKda05q4uCqk,6010
-hydraflow/core/run.py,sha256=VQfS3DkAR2GBWdltmlD0XMStiOUo1YZiRONm-mPW2x4,11948
-hydraflow/core/run_collection.py,sha256=4YjnAmB4lpGxTnlHzZOIwEXNfdI5yU5cj3PRiCW6vuA,5439
+hydraflow/core/run.py,sha256=Kbq4s47f6KDNeyNUwrUpW55FrWlf5CCpmdgVCMakU2g,14046
+hydraflow/core/run_collection.py,sha256=sdbkjs01ougaqXlp88gGC10TmO_7s-UEQozLl0jMI4Y,6771
 hydraflow/core/run_info.py,sha256=SMOTZXEa7OBV_XjTyctk5gJGrggmYwhePvRF8CLF1kU,1616
 hydraflow/executor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hydraflow/executor/aio.py,sha256=xXsmBPIPdBlopv_1h0FdtOvoKUcuW7PQeKCV2d_lN9I,2122
@@ -16,8 +16,8 @@ hydraflow/executor/conf.py,sha256=8Xq4UAenRKJIl1NBgNbSfv6VUTJhdwPLayZIEAsiBR0,41
 hydraflow/executor/io.py,sha256=18wnHpCMQRGYL-oN2841h9W2aSW_X2SmO68Lx-3FIbU,1043
 hydraflow/executor/job.py,sha256=6QeJ18OMeocXeM04rCYL46GgArfX1SvZs9_4HTomTgE,5436
 hydraflow/executor/parser.py,sha256=RxP8qpDaJ8VLqZ51VlPFyVitWctObhkE_3iPIsY66Cs,14610
-hydraflow-0.17.0.dist-info/METADATA,sha256=f9LHLgsZMEiTl1CusfZQHUSv6rlz8DfL78EoMfheCBA,7535
-hydraflow-0.17.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-hydraflow-0.17.0.dist-info/entry_points.txt,sha256=XI0khPbpCIUo9UPqkNEpgh-kqK3Jy8T7L2VCWOdkbSM,48
-hydraflow-0.17.0.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
-hydraflow-0.17.0.dist-info/RECORD,,
+hydraflow-0.17.2.dist-info/METADATA,sha256=zEjD1acRRed6Le0G8-KjkWtUYaxXKB6JO9T6StRNkVM,7535
+hydraflow-0.17.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+hydraflow-0.17.2.dist-info/entry_points.txt,sha256=XI0khPbpCIUo9UPqkNEpgh-kqK3Jy8T7L2VCWOdkbSM,48
+hydraflow-0.17.2.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
+hydraflow-0.17.2.dist-info/RECORD,,

{hydraflow-0.17.0.dist-info → hydraflow-0.17.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{hydraflow-0.17.0.dist-info → hydraflow-0.17.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{hydraflow-0.17.0.dist-info → hydraflow-0.17.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

hydraflow 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

hydraflow 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl