hydraflow-0.2.7.tar.gz → hydraflow-0.2.9.tar.gz
- {hydraflow-0.2.7 → hydraflow-0.2.9}/PKG-INFO +1 -1
- {hydraflow-0.2.7 → hydraflow-0.2.9}/pyproject.toml +2 -2
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/__init__.py +7 -4
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/asyncio.py +9 -3
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/context.py +24 -8
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/info.py +57 -4
- hydraflow-0.2.9/src/hydraflow/mlflow.py +175 -0
- hydraflow-0.2.9/src/hydraflow/progress.py +202 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/run_collection.py +31 -122
- hydraflow-0.2.7/tests/scripts/log_run.py → hydraflow-0.2.9/tests/scripts/app.py +21 -3
- hydraflow-0.2.9/tests/scripts/progress.py +72 -0
- hydraflow-0.2.9/tests/test_app.py +100 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_context.py +1 -1
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_info.py +14 -1
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_log_run.py +5 -3
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_progress.py +1 -1
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_run_collection.py +5 -5
- hydraflow-0.2.7/mlruns/0/meta.yaml +0 -6
- hydraflow-0.2.7/src/hydraflow/mlflow.py +0 -119
- hydraflow-0.2.7/src/hydraflow/progress.py +0 -131
- {hydraflow-0.2.7 → hydraflow-0.2.9}/.devcontainer/devcontainer.json +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/.devcontainer/postCreate.sh +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/.devcontainer/starship.toml +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/.gitattributes +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/.gitignore +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/LICENSE +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/README.md +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/src/hydraflow/config.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/scripts/__init__.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/scripts/watch.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_asyncio.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_config.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_mlflow.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_version.py +0 -0
- {hydraflow-0.2.7 → hydraflow-0.2.9}/tests/test_watch.py +0 -0
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: hydraflow
-Version: 0.2.7
+Version: 0.2.9
 Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
 Project-URL: Documentation, https://github.com/daizutabi/hydraflow
 Project-URL: Source, https://github.com/daizutabi/hydraflow
```
pyproject.toml (version bump and ruff line-length change):

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "hydraflow"
-version = "0.2.7"
+version = "0.2.9"
 description = "Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments."
 readme = "README.md"
 license = "MIT"
@@ -63,7 +63,7 @@ asyncio_default_fixture_loop_scope = "function"
 exclude_lines = ["no cov", "raise NotImplementedError", "if TYPE_CHECKING:"]
 
 [tool.ruff]
-line-length =
+line-length = 88
 target-version = "py312"
 
 [tool.ruff.lint]
```
src/hydraflow/__init__.py (the top-level API gains the info helpers and the new progress utilities):

```diff
@@ -1,11 +1,12 @@
 from .context import chdir_artifact, log_run, start_run, watch
-from .info import load_config
-from .mlflow import set_experiment
-from .run_collection import (
-    RunCollection,
+from .info import get_artifact_dir, get_hydra_output_dir, load_config
+from .mlflow import (
     list_runs,
     search_runs,
+    set_experiment,
 )
+from .progress import multi_tasks_progress, parallel_progress
+from .run_collection import RunCollection
 
 __all__ = [
     "RunCollection",
@@ -15,6 +16,8 @@ __all__ = [
     "list_runs",
     "load_config",
     "log_run",
+    "multi_tasks_progress",
+    "parallel_progress",
     "search_runs",
     "set_experiment",
     "start_run",
```
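Taken together, the import changes flatten the public API: the info helpers, the progress utilities, and `RunCollection` all become importable from the package root. A sketch of the resulting surface (illustrative only; the names are exactly those imported above):

```python
# Names available at the package root as of 0.2.9, per the __init__.py above.
from hydraflow import (
    RunCollection,
    get_artifact_dir,      # re-exported from hydraflow.info
    get_hydra_output_dir,  # new in 0.2.9
    list_runs,
    multi_tasks_progress,  # new in 0.2.9, from hydraflow.progress
    parallel_progress,     # new in 0.2.9, from hydraflow.progress
    search_runs,
    set_experiment,
)
```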
src/hydraflow/asyncio.py (three hunks that only re-wrap long calls to fit the new 88-character limit):

```diff
@@ -41,7 +41,9 @@ async def execute_command(
         int: The return code of the process.
     """
     try:
-        process = await asyncio.create_subprocess_exec(program, *args, stdout=PIPE, stderr=PIPE)
+        process = await asyncio.create_subprocess_exec(
+            program, *args, stdout=PIPE, stderr=PIPE
+        )
         await asyncio.gather(
             process_stream(process.stdout, stdout),
             process_stream(process.stderr, stderr),
@@ -100,7 +102,9 @@ async def monitor_file_changes(
     """
     str_paths = [str(path) for path in paths]
     try:
-        async for changes in watchfiles.awatch(*str_paths, stop_event=stop_event, **awatch_kwargs):
+        async for changes in watchfiles.awatch(
+            *str_paths, stop_event=stop_event, **awatch_kwargs
+        ):
             callback(changes)
     except Exception as e:
         logger.error(f"Error watching files: {e}")
@@ -129,7 +133,9 @@ async def run_and_monitor(
     """
     stop_event = asyncio.Event()
    run_task = asyncio.create_task(
-        execute_command(program, *args, stop_event=stop_event, stdout=stdout, stderr=stderr)
+        execute_command(
+            program, *args, stop_event=stop_event, stdout=stdout, stderr=stderr
+        )
    )
    if watch and paths:
        monitor_task = asyncio.create_task(
```
src/hydraflow/context.py (`watch` gains `ignore_patterns` and `ignore_log`, and its handler switches to watchdog's `PatternMatchingEventHandler`):

```diff
@@ -14,10 +14,11 @@ from typing import TYPE_CHECKING
 
 import mlflow
 from hydra.core.hydra_config import HydraConfig
-from watchdog.events import FileModifiedEvent, FileSystemEventHandler
+from watchdog.events import FileModifiedEvent, PatternMatchingEventHandler
 from watchdog.observers import Observer
 
-from hydraflow.mlflow import get_artifact_dir, log_params
+from hydraflow.info import get_artifact_dir
+from hydraflow.mlflow import log_params
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Iterator
@@ -68,7 +69,7 @@ def log_run(
         mlflow.log_artifact(local_path)
 
     try:
-        with watch(log_artifact, output_dir):
+        with watch(log_artifact, output_dir, ignore_log=False):
             yield
 
     except Exception as e:
@@ -140,9 +141,11 @@ def start_run(
 
 @contextmanager
 def watch(
-    func: Callable[[Path], None],
+    callback: Callable[[Path], None],
     dir: Path | str = "",
     timeout: int = 60,
+    ignore_patterns: list[str] | None = None,
+    ignore_log: bool = True,
 ) -> Iterator[None]:
     """
     Watch the given directory for changes and call the provided function
@@ -154,7 +157,7 @@ def watch(
     period or until the context is exited.
 
     Args:
-        func (Callable[[Path], None]): The function to call when a change is
+        callback (Callable[[Path], None]): The function to call when a change is
             detected. It should accept a single argument of type `Path`,
             which is the path of the modified file.
        dir (Path | str): The directory to watch. If not specified,
@@ -174,7 +177,7 @@ def watch(
     if isinstance(dir, Path):
         dir = dir.as_posix()
 
-    handler = Handler(func)
+    handler = Handler(callback, ignore_patterns=ignore_patterns, ignore_log=ignore_log)
     observer = Observer()
     observer.schedule(handler, dir, recursive=True)
     observer.start()
@@ -198,10 +201,23 @@ def watch(
     observer.join()
 
 
-class Handler(FileSystemEventHandler):
-    def __init__(self, func: Callable[[Path], None]) -> None:
+class Handler(PatternMatchingEventHandler):
+    def __init__(
+        self,
+        func: Callable[[Path], None],
+        ignore_patterns: list[str] | None = None,
+        ignore_log: bool = True,
+    ) -> None:
         self.func = func
 
+        if ignore_log:
+            if ignore_patterns:
+                ignore_patterns.append("*.log")
+            else:
+                ignore_patterns = ["*.log"]
+
+        super().__init__(ignore_patterns=ignore_patterns)
+
     def on_modified(self, event: FileModifiedEvent) -> None:
         file = Path(str(event.src_path))
         if file.is_file():
```
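The practical effect of the context.py changes: `watch` now filters events through watchdog's pattern matching, `*.log` files are ignored by default, and `log_run` opts back in with `ignore_log=False` so Hydra's log files still reach MLflow as artifacts. A hypothetical usage sketch based on the new signature (the callback and directory are placeholders):

```python
from pathlib import Path

from hydraflow import watch

def on_change(path: Path) -> None:
    # Called for each modified file that survives the ignore filters.
    print(f"modified: {path}")

# "*.log" is ignored by default (ignore_log=True); "*.tmp" is an extra,
# purely illustrative pattern.
with watch(on_change, "outputs", ignore_patterns=["*.tmp"]):
    ...  # do work that writes into outputs/
```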
src/hydraflow/info.py (`get_artifact_dir` moves here from hydraflow.mlflow; `get_hydra_output_dir` is new):

```diff
@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from pathlib import Path
 from typing import TYPE_CHECKING
 
+import mlflow
+from hydra.core.hydra_config import HydraConfig
+from mlflow.tracking import artifact_utils
 from omegaconf import DictConfig, OmegaConf
 
-from hydraflow.mlflow import get_artifact_dir
-
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from mlflow.entities import Run
 
     from hydraflow.run_collection import RunCollection
@@ -43,6 +43,59 @@ class RunCollectionInfo:
         return [load_config(run) for run in self._runs]
 
 
+def get_artifact_dir(run: Run | None = None) -> Path:
+    """
+    Retrieve the artifact directory for the given run.
+
+    This function uses MLflow to get the artifact directory for the given run.
+
+    Args:
+        run (Run | None): The run object. Defaults to None.
+
+    Returns:
+        The local path to the directory where the artifacts are downloaded.
+    """
+    if run is None:
+        uri = mlflow.get_artifact_uri()
+    else:
+        uri = artifact_utils.get_artifact_uri(run.info.run_id)
+
+    return Path(mlflow.artifacts.download_artifacts(uri))
+
+
+def get_hydra_output_dir(run: Run | None = None) -> Path:
+    """
+    Retrieve the Hydra output directory for the given run.
+
+    This function returns the Hydra output directory. If no run is provided,
+    it retrieves the output directory from the current Hydra configuration.
+    If a run is provided, it retrieves the artifact path for the run, loads
+    the Hydra configuration from the downloaded artifacts, and returns the
+    output directory specified in that configuration.
+
+    Args:
+        run (Run | None): The run object. Defaults to None.
+
+    Returns:
+        Path: The path to the Hydra output directory.
+
+    Raises:
+        FileNotFoundError: If the Hydra configuration file is not found
+            in the artifacts.
+    """
+    if run is None:
+        hc = HydraConfig.get()
+        return Path(hc.runtime.output_dir)
+
+    path = get_artifact_dir(run) / ".hydra/hydra.yaml"
+
+    if path.exists():
+        hc = OmegaConf.load(path)
+        return Path(hc.hydra.runtime.output_dir)
+
+    raise FileNotFoundError
+
+
 def load_config(run: Run) -> DictConfig:
     """
     Load the configuration for a given run.
```
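Both helpers take an optional `Run`: with no argument, `get_artifact_dir` resolves the active MLflow run and `get_hydra_output_dir` reads the current Hydra app's config. A minimal sketch for the explicit-run case, assuming the tracking store already holds at least one run:

```python
import mlflow

from hydraflow import get_artifact_dir, get_hydra_output_dir

# Any mlflow.entities.Run works; fetch one through the plain MLflow API.
run = mlflow.search_runs(output_format="list")[0]

artifact_dir = get_artifact_dir(run)    # downloads artifacts, returns the local Path
output_dir = get_hydra_output_dir(run)  # parses .hydra/hydra.yaml from those artifacts
print(artifact_dir, output_dir)
```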
src/hydraflow/mlflow.py (new file):

```diff
@@ -0,0 +1,175 @@
+"""
+This module provides functionality to log parameters from Hydra configuration objects
+and set up experiments using MLflow. It includes methods for managing experiments,
+searching for runs, and logging parameters and artifacts.
+
+Key Features:
+- **Experiment Management**: Set and manage MLflow experiments with customizable names
+  based on Hydra configuration.
+- **Run Logging**: Log parameters and metrics from Hydra configuration objects to
+  MLflow, ensuring that all relevant information is captured during experiments.
+- **Run Search**: Search for runs based on various criteria, allowing for flexible
+  retrieval of experiment results.
+- **Artifact Management**: Retrieve and log artifacts associated with runs, facilitating
+  easy access to outputs generated during experiments.
+
+This module is designed to integrate seamlessly with Hydra, providing a robust
+solution for tracking machine learning experiments and their associated metadata.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import mlflow
+from hydra.core.hydra_config import HydraConfig
+from mlflow.entities import ViewType
+from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
+
+from hydraflow.config import iter_params
+from hydraflow.run_collection import RunCollection
+
+if TYPE_CHECKING:
+    from mlflow.entities.experiment import Experiment
+
+
+def set_experiment(
+    prefix: str = "",
+    suffix: str = "",
+    uri: str | Path | None = None,
+) -> Experiment:
+    """
+    Sets the experiment name and tracking URI optionally.
+
+    This function sets the experiment name by combining the given prefix,
+    the job name from HydraConfig, and the given suffix. Optionally, it can
+    also set the tracking URI.
+
+    Args:
+        prefix (str): The prefix to prepend to the experiment name.
+        suffix (str): The suffix to append to the experiment name.
+        uri (str | Path | None): The tracking URI to use. Defaults to None.
+
+    Returns:
+        Experiment: An instance of `mlflow.entities.Experiment` representing
+            the new active experiment.
+    """
+    if uri is not None:
+        mlflow.set_tracking_uri(uri)
+
+    hc = HydraConfig.get()
+    name = f"{prefix}{hc.job.name}{suffix}"
+    return mlflow.set_experiment(name)
+
+
+def log_params(config: object, *, synchronous: bool | None = None) -> None:
+    """
+    Log the parameters from the given configuration object.
+
+    This method logs the parameters from the provided configuration object
+    using MLflow. It iterates over the parameters and logs them using the
+    `mlflow.log_param` method.
+
+    Args:
+        config (object): The configuration object to log the parameters from.
+        synchronous (bool | None): Whether to log the parameters synchronously.
+            Defaults to None.
+    """
+    for key, value in iter_params(config):
+        mlflow.log_param(key, value, synchronous=synchronous)
+
+
+def search_runs(
+    experiment_ids: list[str] | None = None,
+    filter_string: str = "",
+    run_view_type: int = ViewType.ACTIVE_ONLY,
+    max_results: int = SEARCH_MAX_RESULTS_PANDAS,
+    order_by: list[str] | None = None,
+    search_all_experiments: bool = False,
+    experiment_names: list[str] | None = None,
+) -> RunCollection:
+    """
+    Search for Runs that fit the specified criteria.
+
+    This function wraps the `mlflow.search_runs` function and returns the
+    results as a `RunCollection` object. It allows for flexible searching of
+    MLflow runs based on various criteria.
+
+    Note:
+        The returned runs are sorted by their start time in ascending order.
+
+    Args:
+        experiment_ids (list[str] | None): List of experiment IDs. Search can
+            work with experiment IDs or experiment names, but not both in the
+            same call. Values other than ``None`` or ``[]`` will result in
+            error if ``experiment_names`` is also not ``None`` or ``[]``.
+            ``None`` will default to the active experiment if ``experiment_names``
+            is ``None`` or ``[]``.
+        filter_string (str): Filter query string, defaults to searching all
+            runs.
+        run_view_type (int): one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``,
+            or ``ALL`` runs defined in :py:class:`mlflow.entities.ViewType`.
+        max_results (int): The maximum number of runs to put in the dataframe.
+            Default is 100,000 to avoid causing out-of-memory issues on the user's
+            machine.
+        order_by (list[str] | None): List of columns to order by (e.g.,
+            "metrics.rmse"). The ``order_by`` column can contain an optional
+            ``DESC`` or ``ASC`` value. The default is ``ASC``. The default
+            ordering is to sort by ``start_time DESC``, then ``run_id``.
+            ``start_time DESC``, then ``run_id``.
+        search_all_experiments (bool): Boolean specifying whether all
+            experiments should be searched. Only honored if ``experiment_ids``
+            is ``[]`` or ``None``.
+        experiment_names (list[str] | None): List of experiment names. Search
+            can work with experiment IDs or experiment names, but not both in
+            the same call. Values other than ``None`` or ``[]`` will result in
+            error if ``experiment_ids`` is also not ``None`` or ``[]``.
+            ``experiment_ids`` is also not ``None`` or ``[]``. ``None`` will
+            default to the active experiment if ``experiment_ids`` is ``None``
+            or ``[]``.
+
+    Returns:
+        A `RunCollection` object containing the search results.
+    """
+    runs = mlflow.search_runs(
+        experiment_ids=experiment_ids,
+        filter_string=filter_string,
+        run_view_type=run_view_type,
+        max_results=max_results,
+        order_by=order_by,
+        output_format="list",
+        search_all_experiments=search_all_experiments,
+        experiment_names=experiment_names,
+    )
+    runs = sorted(runs, key=lambda run: run.info.start_time)  # type: ignore
+    return RunCollection(runs)  # type: ignore
+
+
+def list_runs(experiment_names: list[str] | None = None) -> RunCollection:
+    """
+    List all runs for the specified experiments.
+
+    This function retrieves all runs for the given list of experiment names.
+    If no experiment names are provided (None), it defaults to searching all runs
+    for the currently active experiment. If an empty list is provided, the function
+    will search all runs for all experiments except the "Default" experiment.
+    The function returns the results as a `RunCollection` object.
+
+    Note:
+        The returned runs are sorted by their start time in ascending order.
+
+    Args:
+        experiment_names (list[str] | None): List of experiment names to search
+            for runs. If None or an empty list is provided, the function will
+            search the currently active experiment or all experiments except
+            the "Default" experiment.
+
+    Returns:
+        A `RunCollection` object containing the runs for the specified experiments.
+    """
+    if experiment_names == []:
+        experiments = mlflow.search_experiments()
+        experiment_names = [e.name for e in experiments if e.name != "Default"]
+
+    return search_runs(experiment_names=experiment_names)
```
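The module docstring describes the workflow; here is a minimal sketch of how the pieces compose inside a Hydra entry point. The `Config` dataclass and the job body are placeholder assumptions, not part of the package (note that `set_experiment` reads `HydraConfig`, so it only works inside a Hydra app):

```python
from dataclasses import dataclass

import hydra
import mlflow
from hydra.core.config_store import ConfigStore

from hydraflow import set_experiment
from hydraflow.mlflow import log_params

@dataclass
class Config:
    lr: float = 0.01  # placeholder parameter

cs = ConfigStore.instance()
cs.store(name="config", node=Config)

@hydra.main(version_base=None, config_name="config")
def app(cfg: Config) -> None:
    set_experiment(prefix="demo_")  # experiment name: "demo_" + Hydra job name
    with mlflow.start_run():
        log_params(cfg)  # each entry goes through mlflow.log_param

if __name__ == "__main__":
    app()
```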
src/hydraflow/progress.py (new file):

```diff
@@ -0,0 +1,202 @@
+"""
+Module for managing progress tracking in parallel processing using Joblib
+and Rich's Progress bar.
+
+Provide context managers and functions to facilitate the execution
+of tasks in parallel while displaying progress updates.
+
+The following key components are provided:
+
+- JoblibProgress: A context manager for tracking progress with Rich's Progress
+  bar.
+- parallel_progress: A function to execute a given function in parallel over
+  an iterable with progress tracking.
+- multi_tasks_progress: A function to render auto-updating progress bars for
+  multiple tasks concurrently.
+
+Usage:
+    Import the necessary functions and use them to manage progress in your
+    parallel processing tasks.
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, TypeVar
+
+import joblib
+from rich.progress import Progress
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Iterable, Iterator
+
+    from rich.progress import ProgressColumn
+
+
+# https://github.com/jonghwanhyeon/joblib-progress/blob/main/joblib_progress/__init__.py
+@contextmanager
+def JoblibProgress(
+    *columns: ProgressColumn | str,
+    description: str | None = None,
+    total: int | None = None,
+    **kwargs,
+) -> Iterator[Progress]:
+    """
+    Context manager for tracking progress using Joblib with Rich's Progress bar.
+
+    Args:
+        *columns (ProgressColumn | str): Columns to display in the progress bar.
+        description (str | None, optional): A description for the progress task.
+            Defaults to None.
+        total (int | None, optional): The total number of tasks. If None, it will
+            be determined automatically.
+        **kwargs: Additional keyword arguments passed to the Progress instance.
+
+    Yields:
+        Progress: A Progress instance for managing the progress bar.
+
+    Example:
+        with JoblibProgress("task", total=100) as progress:
+            # Your parallel processing code here
+    """
+    if not columns:
+        columns = Progress.get_default_columns()
+
+    progress = Progress(*columns, **kwargs)
+
+    if description is None:
+        description = "Processing..."
+
+    task_id = progress.add_task(description, total=total)
+    print_progress = joblib.parallel.Parallel.print_progress
+
+    def update_progress(self: joblib.parallel.Parallel):
+        progress.update(task_id, completed=self.n_completed_tasks, refresh=True)
+        return print_progress(self)
+
+    try:
+        joblib.parallel.Parallel.print_progress = update_progress
+        progress.start()
+        yield progress
+
+    finally:
+        progress.stop()
+        joblib.parallel.Parallel.print_progress = print_progress
+
+
+T = TypeVar("T")
+U = TypeVar("U")
+
+
+def parallel_progress(
+    func: Callable[[T], U],
+    iterable: Iterable[T],
+    *columns: ProgressColumn | str,
+    n_jobs: int = -1,
+    description: str | None = None,
+    **kwargs,
+) -> list[U]:
+    """
+    Execute a function in parallel over an iterable with progress tracking.
+
+    Args:
+        func (Callable[[T], U]): The function to execute on each item in the
+            iterable.
+        iterable (Iterable[T]): An iterable of items to process.
+        *columns (ProgressColumn | str): Additional columns to display in the
+            progress bar.
+        n_jobs (int, optional): The number of jobs to run in parallel.
+            Defaults to -1 (all processors).
+        description (str | None, optional): A description for the progress bar.
+            Defaults to None.
+        **kwargs: Additional keyword arguments passed to the Progress instance.
+
+    Returns:
+        list[U]: A list of results from applying the function to each item in
+            the iterable.
+    """
+    iterable = list(iterable)
+    total = len(iterable)
+
+    with JoblibProgress(*columns, description=description, total=total, **kwargs):
+        it = (joblib.delayed(func)(x) for x in iterable)
+        return joblib.Parallel(n_jobs=n_jobs)(it)  # type: ignore
+
+
+def multi_tasks_progress(
+    iterables: Iterable[Iterable[int | tuple[int, int]]],
+    *columns: ProgressColumn | str,
+    n_jobs: int = -1,
+    description: str = "#{:0>3}",
+    main_description: str = "main",
+    transient: bool | None = None,
+    **kwargs,
+) -> None:
+    """
+    Render auto-updating progress bars for multiple tasks concurrently.
+
+    Args:
+        iterables (Iterable[Iterable[int | tuple[int, int]]]): A collection of
+            iterables, each representing a task. Each iterable can yield
+            integers (completed) or tuples of integers (completed, total).
+        *columns (ProgressColumn | str): Additional columns to display in the
+            progress bars.
+        n_jobs (int, optional): Number of jobs to run in parallel. Defaults to
+            -1, which means using all processors.
+        description (str, optional): Format string for describing tasks. Defaults to
+            "#{:0>3}".
+        main_description (str, optional): Description for the main task.
+            Defaults to "main".
+        transient (bool | None, optional): Whether to remove the progress bar
+            after completion. Defaults to None.
+        **kwargs: Additional keyword arguments passed to the Progress instance.
+
+    Returns:
+        None
+    """
+    if not columns:
+        columns = Progress.get_default_columns()
+
+    iterables = list(iterables)
+
+    with Progress(*columns, transient=transient or False, **kwargs) as progress:
+        n = len(iterables)
+
+        task_main = progress.add_task(main_description, total=None) if n > 1 else None
+        tasks = [
+            progress.add_task(description.format(i), start=False, total=None)
+            for i in range(n)
+        ]
+
+        total = {}
+        completed = {}
+
+        def func(i: int) -> None:
+            completed[i] = 0
+            total[i] = None
+            progress.start_task(tasks[i])
+
+            for index in iterables[i]:
+                if isinstance(index, tuple):
+                    completed[i], total[i] = index[0] + 1, index[1]
+                else:
+                    completed[i] = index + 1
+
+                progress.update(tasks[i], total=total[i], completed=completed[i])
+                if task_main is not None:
+                    if all(t is not None for t in total.values()):
+                        t = sum(total.values())
+                    else:
+                        t = None
+                    c = sum(completed.values())
+                    progress.update(task_main, total=t, completed=c)
+
+            if transient or n > 1:
+                progress.remove_task(tasks[i])
+
+        if n > 1:
+            it = (joblib.delayed(func)(i) for i in range(n))
+            joblib.Parallel(n_jobs, prefer="threads")(it)
+
+        else:
+            func(0)
```
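A self-contained sketch of the two exported helpers; the workloads are dummy functions for illustration. `parallel_progress` drives one bar for the whole iterable, while `multi_tasks_progress` draws one bar per iterable (plus an aggregate "main" bar) and expects each iterable to yield either completed counts or `(completed, total)` tuples:

```python
import time

from hydraflow import multi_tasks_progress, parallel_progress

def slow_square(x: int) -> int:
    time.sleep(0.1)
    return x * x

# One progress bar over the whole iterable; joblib distributes the work.
results = parallel_progress(slow_square, range(20), n_jobs=4)

def task(n: int):
    # Yield (index, total) tuples so each bar knows its length up front.
    for i in range(n):
        time.sleep(0.05)
        yield i, n

# One bar per generator, plus a "main" bar aggregating all of them.
multi_tasks_progress([task(30), task(50), task(40)])
```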