PyPI - hydraflow - Versions diffs - 0.2.6__tar.gz → 0.2.8__tar.gz - Mend

hydraflow 0.2.6__tar.gz → 0.2.8__tar.gz

Files changed (35) hide show

{hydraflow-0.2.6 → hydraflow-0.2.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: hydraflow
-Version: 0.2.6
+Version: 0.2.8
 Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
 Project-URL: Documentation, https://github.com/daizutabi/hydraflow
 Project-URL: Source, https://github.com/daizutabi/hydraflow

{hydraflow-0.2.6 → hydraflow-0.2.8}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "hydraflow"
-version = "0.2.6"
+version = "0.2.8"
 description = "Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments."
 readme = "README.md"
 license = "MIT"

{hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/__init__.py RENAMED Viewed

@@ -1,11 +1,11 @@
 from .context import chdir_artifact, log_run, start_run, watch
-from .mlflow import get_artifact_dir, get_hydra_output_dir, set_experiment
-from .runs import (
-    RunCollection,
+from .info import get_artifact_dir, get_hydra_output_dir, load_config
+from .mlflow import (
     list_runs,
-    load_config,
     search_runs,
+    set_experiment,
 )
+from .run_collection import RunCollection
 __all__ = [
     "RunCollection",

{hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/context.py RENAMED Viewed

@@ -14,10 +14,11 @@ from typing import TYPE_CHECKING
 import mlflow
 from hydra.core.hydra_config import HydraConfig
-from watchdog.events import FileModifiedEvent, FileSystemEventHandler
+from watchdog.events import FileModifiedEvent, PatternMatchingEventHandler
 from watchdog.observers import Observer
-from hydraflow.mlflow import get_artifact_dir, log_params
+from hydraflow.info import get_artifact_dir
+from hydraflow.mlflow import log_params
 if TYPE_CHECKING:
     from collections.abc import Callable, Iterator
@@ -68,7 +69,7 @@ def log_run(
         mlflow.log_artifact(local_path)
     try:
-        with watch(log_artifact, output_dir):
+        with watch(log_artifact, output_dir, ignore_log=False):
             yield
     except Exception as e:
@@ -140,9 +141,11 @@ def start_run(
 @contextmanager
 def watch(
-    func: Callable[[Path], None],
+    callback: Callable[[Path], None],
     dir: Path | str = "",
     timeout: int = 60,
+    ignore_patterns: list[str] | None = None,
+    ignore_log: bool = True,
 ) -> Iterator[None]:
     """
     Watch the given directory for changes and call the provided function
@@ -154,7 +157,7 @@ def watch(
     period or until the context is exited.
     Args:
-        func (Callable[[Path], None]): The function to call when a change is
+        callback (Callable[[Path], None]): The function to call when a change is
             detected. It should accept a single argument of type `Path`,
             which is the path of the modified file.
         dir (Path | str): The directory to watch. If not specified,
@@ -174,7 +177,7 @@ def watch(
     if isinstance(dir, Path):
         dir = dir.as_posix()
-    handler = Handler(func)
+    handler = Handler(callback, ignore_patterns=ignore_patterns, ignore_log=ignore_log)
     observer = Observer()
     observer.schedule(handler, dir, recursive=True)
     observer.start()
@@ -198,10 +201,23 @@ def watch(
         observer.join()
-class Handler(FileSystemEventHandler):
-    def __init__(self, func: Callable[[Path], None]) -> None:
+class Handler(PatternMatchingEventHandler):
+    def __init__(
+        self,
+        func: Callable[[Path], None],
+        ignore_patterns: list[str] | None = None,
+        ignore_log: bool = True,
+    ) -> None:
         self.func = func
+        if ignore_log:
+            if ignore_patterns:
+                ignore_patterns.append("*.log")
+            else:
+                ignore_patterns = ["*.log"]
+        super().__init__(ignore_patterns=ignore_patterns)
     def on_modified(self, event: FileModifiedEvent) -> None:
         file = Path(str(event.src_path))
         if file.is_file():

hydraflow-0.2.8/src/hydraflow/info.py ADDED Viewed

@@ -0,0 +1,116 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import TYPE_CHECKING
+import mlflow
+from hydra.core.hydra_config import HydraConfig
+from mlflow.tracking import artifact_utils
+from omegaconf import DictConfig, OmegaConf
+if TYPE_CHECKING:
+    from mlflow.entities import Run
+    from hydraflow.run_collection import RunCollection
+class RunCollectionInfo:
+    def __init__(self, runs: RunCollection):
+        self._runs = runs
+    @property
+    def run_id(self) -> list[str]:
+        return [run.info.run_id for run in self._runs]
+    @property
+    def params(self) -> list[dict[str, str]]:
+        return [run.data.params for run in self._runs]
+    @property
+    def metrics(self) -> list[dict[str, float]]:
+        return [run.data.metrics for run in self._runs]
+    @property
+    def artifact_uri(self) -> list[str | None]:
+        return [run.info.artifact_uri for run in self._runs]
+    @property
+    def artifact_dir(self) -> list[Path]:
+        return [get_artifact_dir(run) for run in self._runs]
+    @property
+    def config(self) -> list[DictConfig]:
+        return [load_config(run) for run in self._runs]
+def get_artifact_dir(run: Run | None = None) -> Path:
+    """
+    Retrieve the artifact directory for the given run.
+    This function uses MLflow to get the artifact directory for the given run.
+    Args:
+        run (Run | None): The run object. Defaults to None.
+    Returns:
+        The local path to the directory where the artifacts are downloaded.
+    """
+    if run is None:
+        uri = mlflow.get_artifact_uri()
+    else:
+        uri = artifact_utils.get_artifact_uri(run.info.run_id)
+    return Path(mlflow.artifacts.download_artifacts(uri))
+def get_hydra_output_dir(run: Run | None = None) -> Path:
+    """
+    Retrieve the Hydra output directory for the given run.
+    This function returns the Hydra output directory. If no run is provided,
+    it retrieves the output directory from the current Hydra configuration.
+    If a run is provided, it retrieves the artifact path for the run, loads
+    the Hydra configuration from the downloaded artifacts, and returns the
+    output directory specified in that configuration.
+    Args:
+        run (Run | None): The run object. Defaults to None.
+    Returns:
+        Path: The path to the Hydra output directory.
+    Raises:
+        FileNotFoundError: If the Hydra configuration file is not found
+            in the artifacts.
+    """
+    if run is None:
+        hc = HydraConfig.get()
+        return Path(hc.runtime.output_dir)
+    path = get_artifact_dir(run) / ".hydra/hydra.yaml"
+    if path.exists():
+        hc = OmegaConf.load(path)
+        return Path(hc.hydra.runtime.output_dir)
+    raise FileNotFoundError
+def load_config(run: Run) -> DictConfig:
+    """
+    Load the configuration for a given run.
+    This function loads the configuration for the provided Run instance
+    by downloading the configuration file from the MLflow artifacts and
+    loading it using OmegaConf. The file read is `.hydra/config.yaml`
+    in the run's artifact directory; this function does not itself check
+    whether that file exists, so a missing file is not turned into an
+    empty config here.
+    Args:
+        run (Run): The Run instance for which to load the configuration.
+    Returns:
+        The loaded configuration as a DictConfig object.
+    """
+    path = get_artifact_dir(run) / ".hydra/config.yaml"
+    return OmegaConf.load(path)  # type: ignore

hydraflow-0.2.8/src/hydraflow/mlflow.py ADDED Viewed

@@ -0,0 +1,175 @@
+"""
+This module provides functionality to log parameters from Hydra configuration objects
+and set up experiments using MLflow. It includes methods for managing experiments,
+searching for runs, and logging parameters and artifacts.
+Key Features:
+- **Experiment Management**: Set and manage MLflow experiments with customizable names
+  based on Hydra configuration.
+- **Run Logging**: Log parameters and metrics from Hydra configuration objects to
+  MLflow, ensuring that all relevant information is captured during experiments.
+- **Run Search**: Search for runs based on various criteria, allowing for flexible
+  retrieval of experiment results.
+- **Artifact Management**: Retrieve and log artifacts associated with runs, facilitating
+  easy access to outputs generated during experiments.
+This module is designed to integrate seamlessly with Hydra, providing a robust
+solution for tracking machine learning experiments and their associated metadata.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import TYPE_CHECKING
+import mlflow
+from hydra.core.hydra_config import HydraConfig
+from mlflow.entities import ViewType
+from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
+from hydraflow.config import iter_params
+from hydraflow.run_collection import RunCollection
+if TYPE_CHECKING:
+    from mlflow.entities.experiment import Experiment
+def set_experiment(
+    prefix: str = "",
+    suffix: str = "",
+    uri: str | Path | None = None,
+) -> Experiment:
+    """
+    Sets the experiment name and, optionally, the tracking URI.
+    This function sets the experiment name by combining the given prefix,
+    the job name from HydraConfig, and the given suffix. Optionally, it can
+    also set the tracking URI.
+    Args:
+        prefix (str): The prefix to prepend to the experiment name.
+        suffix (str): The suffix to append to the experiment name.
+        uri (str | Path | None): The tracking URI to use. Defaults to None.
+    Returns:
+        Experiment: An instance of `mlflow.entities.Experiment` representing
+        the new active experiment.
+    """
+    if uri is not None:
+        mlflow.set_tracking_uri(uri)
+    hc = HydraConfig.get()
+    name = f"{prefix}{hc.job.name}{suffix}"
+    return mlflow.set_experiment(name)
+def log_params(config: object, *, synchronous: bool | None = None) -> None:
+    """
+    Log the parameters from the given configuration object.
+    This method logs the parameters from the provided configuration object
+    using MLflow. It iterates over the parameters and logs them using the
+    `mlflow.log_param` method.
+    Args:
+        config (object): The configuration object to log the parameters from.
+        synchronous (bool | None): Whether to log the parameters synchronously.
+            Defaults to None.
+    """
+    for key, value in iter_params(config):
+        mlflow.log_param(key, value, synchronous=synchronous)
+def search_runs(
+    experiment_ids: list[str] | None = None,
+    filter_string: str = "",
+    run_view_type: int = ViewType.ACTIVE_ONLY,
+    max_results: int = SEARCH_MAX_RESULTS_PANDAS,
+    order_by: list[str] | None = None,
+    search_all_experiments: bool = False,
+    experiment_names: list[str] | None = None,
+) -> RunCollection:
+    """
+    Search for Runs that fit the specified criteria.
+    This function wraps the `mlflow.search_runs` function and returns the
+    results as a `RunCollection` object. It allows for flexible searching of
+    MLflow runs based on various criteria.
+    Note:
+        The returned runs are sorted by their start time in ascending order.
+    Args:
+        experiment_ids (list[str] | None): List of experiment IDs. Search can
+            work with experiment IDs or experiment names, but not both in the
+            same call. Values other than ``None`` or ``[]`` will result in
+            error if ``experiment_names`` is also not ``None`` or ``[]``.
+            ``None`` will default to the active experiment if ``experiment_names``
+            is ``None`` or ``[]``.
+        filter_string (str): Filter query string, defaults to searching all
+            runs.
+        run_view_type (int): one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``,
+            or ``ALL`` runs defined in :py:class:`mlflow.entities.ViewType`.
+        max_results (int): The maximum number of runs to return. Default is
+            100,000 to avoid causing out-of-memory issues on the user's
+            machine.
+        order_by (list[str] | None): List of columns to order by (e.g.,
+            "metrics.rmse"). The ``order_by`` column can contain an optional
+            ``DESC`` or ``ASC`` value. The default is ``ASC``. The default
+            ordering is to sort by ``start_time DESC``, then ``run_id``.
+        search_all_experiments (bool): Boolean specifying whether all
+            experiments should be searched. Only honored if ``experiment_ids``
+            is ``[]`` or ``None``.
+        experiment_names (list[str] | None): List of experiment names. Search
+            can work with experiment IDs or experiment names, but not both in
+            the same call. Values other than ``None`` or ``[]`` will result in
+            error if ``experiment_ids`` is also not ``None`` or ``[]``.
+            ``None`` will default to the active experiment if
+            ``experiment_ids`` is ``None`` or ``[]``.
+    Returns:
+        A `RunCollection` object containing the search results.
+    """
+    runs = mlflow.search_runs(
+        experiment_ids=experiment_ids,
+        filter_string=filter_string,
+        run_view_type=run_view_type,
+        max_results=max_results,
+        order_by=order_by,
+        output_format="list",
+        search_all_experiments=search_all_experiments,
+        experiment_names=experiment_names,
+    )
+    runs = sorted(runs, key=lambda run: run.info.start_time)  # type: ignore
+    return RunCollection(runs)  # type: ignore
+def list_runs(experiment_names: list[str] | None = None) -> RunCollection:
+    """
+    List all runs for the specified experiments.
+    This function retrieves all runs for the given list of experiment names.
+    If no experiment names are provided (None), it defaults to searching all runs
+    for the currently active experiment. If an empty list is provided, the function
+    will search all runs for all experiments except the "Default" experiment.
+    The function returns the results as a `RunCollection` object.
+    Note:
+        The returned runs are sorted by their start time in ascending order.
+    Args:
+        experiment_names (list[str] | None): List of experiment names to search
+            for runs. If None, the currently active experiment is searched.
+            If an empty list, all experiments except the "Default" experiment
+            are searched.
+    Returns:
+        A `RunCollection` object containing the runs for the specified experiments.
+    """
+    if experiment_names == []:
+        experiments = mlflow.search_experiments()
+        experiment_names = [e.name for e in experiments if e.name != "Default"]
+    return search_runs(experiment_names=experiment_names)

hydraflow-0.2.8/src/hydraflow/progress.py ADDED Viewed

@@ -0,0 +1,131 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import joblib
+from rich.progress import Progress
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from rich.progress import ProgressColumn
+def multi_task_progress(
+    iterables: Iterable[Iterable[int | tuple[int, int]]],
+    *columns: ProgressColumn | str,
+    n_jobs: int = -1,
+    description: str = "#{:0>3}",
+    main_description: str = "main",
+    transient: bool | None = None,
+    **kwargs,
+) -> None:
+    """
+    Render auto-updating progress bars for multiple tasks concurrently.
+    Args:
+        iterables (Iterable[Iterable[int | tuple[int, int]]]): A collection of
+            iterables, each representing a task. Each iterable can yield
+            integers (completed) or tuples of integers (completed, total).
+        *columns (ProgressColumn | str): Additional columns to display in the
+            progress bars.
+        n_jobs (int, optional): Number of jobs to run in parallel. Defaults to
+            -1, which means using all processors.
+        description (str, optional): Format string for describing tasks. Defaults to
+            "#{:0>3}".
+        main_description (str, optional): Description for the main task.
+            Defaults to "main".
+        transient (bool | None, optional): Whether to remove the progress bar
+            after completion. Defaults to None.
+        **kwargs: Additional keyword arguments passed to the Progress instance.
+    Returns:
+        None
+    """
+    if not columns:
+        columns = Progress.get_default_columns()
+    iterables = list(iterables)
+    with Progress(*columns, transient=transient or False, **kwargs) as progress:
+        n = len(iterables)
+        task_main = progress.add_task(main_description, total=None) if n > 1 else None
+        tasks = [
+            progress.add_task(description.format(i), start=False, total=None) for i in range(n)
+        ]
+        total = {}
+        completed = {}
+        def func(i: int) -> None:
+            completed[i] = 0
+            total[i] = None
+            progress.start_task(tasks[i])
+            for index in iterables[i]:
+                if isinstance(index, tuple):
+                    completed[i], total[i] = index[0] + 1, index[1]
+                else:
+                    completed[i] = index + 1
+                progress.update(tasks[i], total=total[i], completed=completed[i])
+                if task_main is not None:
+                    if all(t is not None for t in total.values()):
+                        t = sum(total.values())
+                    else:
+                        t = None
+                    c = sum(completed.values())
+                    progress.update(task_main, total=t, completed=c)
+            if transient or n > 1:
+                progress.remove_task(tasks[i])
+        if n > 1:
+            it = (joblib.delayed(func)(i) for i in range(n))
+            joblib.Parallel(n_jobs, prefer="threads")(it)
+        else:
+            func(0)
+if __name__ == "__main__":
+    import random
+    import time
+    from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TimeElapsedColumn
+    from hydraflow.progress import multi_task_progress
+    def task(total):
+        for i in range(total or 90):
+            if total is None:
+                yield i
+            else:
+                yield i, total
+            time.sleep(random.random() / 30)
+    def multi_task_progress_test(unknown_total: bool):
+        tasks = [task(random.randint(80, 100)) for _ in range(4)]
+        if unknown_total:
+            tasks = [task(None), *tasks, task(None)]
+        columns = [
+            SpinnerColumn(),
+            *Progress.get_default_columns(),
+            MofNCompleteColumn(),
+            TimeElapsedColumn(),
+        ]
+        kwargs = {}
+        if unknown_total:
+            kwargs["main_description"] = "unknown"
+        multi_task_progress(tasks, *columns, n_jobs=4, **kwargs)
+    multi_task_progress_test(False)
+    multi_task_progress_test(True)
+    multi_task_progress([task(100)])
+    multi_task_progress([task(None)], description="unknown")
+    multi_task_progress([task(100), task(None)], main_description="transient", transient=True)
+    multi_task_progress([task(100)], description="transient", transient=True)

hydraflow 0.2.6__tar.gz → 0.2.8__tar.gz

hydraflow 0.2.6__tar.gz → 0.2.8__tar.gz