hydraflow 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
hydraflow/__init__.py CHANGED
@@ -1,11 +1,12 @@
1
1
  from .context import chdir_artifact, log_run, start_run, watch
2
- from .info import load_config
3
- from .mlflow import get_artifact_dir, get_hydra_output_dir, set_experiment
4
- from .run_collection import (
5
- RunCollection,
2
+ from .info import get_artifact_dir, get_hydra_output_dir, load_config
3
+ from .mlflow import (
6
4
  list_runs,
7
5
  search_runs,
6
+ set_experiment,
8
7
  )
8
+ from .progress import multi_tasks_progress, parallel_progress
9
+ from .run_collection import RunCollection
9
10
 
10
11
  __all__ = [
11
12
  "RunCollection",
@@ -15,6 +16,8 @@ __all__ = [
15
16
  "list_runs",
16
17
  "load_config",
17
18
  "log_run",
19
+ "multi_tasks_progress",
20
+ "parallel_progress",
18
21
  "search_runs",
19
22
  "set_experiment",
20
23
  "start_run",
hydraflow/asyncio.py CHANGED
@@ -41,7 +41,9 @@ async def execute_command(
41
41
  int: The return code of the process.
42
42
  """
43
43
  try:
44
- process = await asyncio.create_subprocess_exec(program, *args, stdout=PIPE, stderr=PIPE)
44
+ process = await asyncio.create_subprocess_exec(
45
+ program, *args, stdout=PIPE, stderr=PIPE
46
+ )
45
47
  await asyncio.gather(
46
48
  process_stream(process.stdout, stdout),
47
49
  process_stream(process.stderr, stderr),
@@ -100,7 +102,9 @@ async def monitor_file_changes(
100
102
  """
101
103
  str_paths = [str(path) for path in paths]
102
104
  try:
103
- async for changes in watchfiles.awatch(*str_paths, stop_event=stop_event, **awatch_kwargs):
105
+ async for changes in watchfiles.awatch(
106
+ *str_paths, stop_event=stop_event, **awatch_kwargs
107
+ ):
104
108
  callback(changes)
105
109
  except Exception as e:
106
110
  logger.error(f"Error watching files: {e}")
@@ -129,7 +133,9 @@ async def run_and_monitor(
129
133
  """
130
134
  stop_event = asyncio.Event()
131
135
  run_task = asyncio.create_task(
132
- execute_command(program, *args, stop_event=stop_event, stdout=stdout, stderr=stderr)
136
+ execute_command(
137
+ program, *args, stop_event=stop_event, stdout=stdout, stderr=stderr
138
+ )
133
139
  )
134
140
  if watch and paths:
135
141
  monitor_task = asyncio.create_task(
hydraflow/context.py CHANGED
@@ -14,10 +14,11 @@ from typing import TYPE_CHECKING
14
14
 
15
15
  import mlflow
16
16
  from hydra.core.hydra_config import HydraConfig
17
- from watchdog.events import FileModifiedEvent, FileSystemEventHandler
17
+ from watchdog.events import FileModifiedEvent, PatternMatchingEventHandler
18
18
  from watchdog.observers import Observer
19
19
 
20
- from hydraflow.mlflow import get_artifact_dir, log_params
20
+ from hydraflow.info import get_artifact_dir
21
+ from hydraflow.mlflow import log_params
21
22
 
22
23
  if TYPE_CHECKING:
23
24
  from collections.abc import Callable, Iterator
@@ -68,7 +69,7 @@ def log_run(
68
69
  mlflow.log_artifact(local_path)
69
70
 
70
71
  try:
71
- with watch(log_artifact, output_dir):
72
+ with watch(log_artifact, output_dir, ignore_log=False):
72
73
  yield
73
74
 
74
75
  except Exception as e:
@@ -140,9 +141,11 @@ def start_run(
140
141
 
141
142
  @contextmanager
142
143
  def watch(
143
- func: Callable[[Path], None],
144
+ callback: Callable[[Path], None],
144
145
  dir: Path | str = "",
145
146
  timeout: int = 60,
147
+ ignore_patterns: list[str] | None = None,
148
+ ignore_log: bool = True,
146
149
  ) -> Iterator[None]:
147
150
  """
148
151
  Watch the given directory for changes and call the provided function
@@ -154,7 +157,7 @@ def watch(
154
157
  period or until the context is exited.
155
158
 
156
159
  Args:
157
- func (Callable[[Path], None]): The function to call when a change is
160
+ callback (Callable[[Path], None]): The function to call when a change is
158
161
  detected. It should accept a single argument of type `Path`,
159
162
  which is the path of the modified file.
160
163
  dir (Path | str): The directory to watch. If not specified,
@@ -174,7 +177,7 @@ def watch(
174
177
  if isinstance(dir, Path):
175
178
  dir = dir.as_posix()
176
179
 
177
- handler = Handler(func)
180
+ handler = Handler(callback, ignore_patterns=ignore_patterns, ignore_log=ignore_log)
178
181
  observer = Observer()
179
182
  observer.schedule(handler, dir, recursive=True)
180
183
  observer.start()
@@ -198,10 +201,23 @@ def watch(
198
201
  observer.join()
199
202
 
200
203
 
201
- class Handler(FileSystemEventHandler):
202
- def __init__(self, func: Callable[[Path], None]) -> None:
204
+ class Handler(PatternMatchingEventHandler):
205
+ def __init__(
206
+ self,
207
+ func: Callable[[Path], None],
208
+ ignore_patterns: list[str] | None = None,
209
+ ignore_log: bool = True,
210
+ ) -> None:
203
211
  self.func = func
204
212
 
213
+ if ignore_log:
214
+ if ignore_patterns:
215
+ ignore_patterns.append("*.log")
216
+ else:
217
+ ignore_patterns = ["*.log"]
218
+
219
+ super().__init__(ignore_patterns=ignore_patterns)
220
+
205
221
  def on_modified(self, event: FileModifiedEvent) -> None:
206
222
  file = Path(str(event.src_path))
207
223
  if file.is_file():
hydraflow/info.py CHANGED
@@ -1,14 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from pathlib import Path
3
4
  from typing import TYPE_CHECKING
4
5
 
6
+ import mlflow
7
+ from hydra.core.hydra_config import HydraConfig
8
+ from mlflow.tracking import artifact_utils
5
9
  from omegaconf import DictConfig, OmegaConf
6
10
 
7
- from hydraflow.mlflow import get_artifact_dir
8
-
9
11
  if TYPE_CHECKING:
10
- from pathlib import Path
11
-
12
12
  from mlflow.entities import Run
13
13
 
14
14
  from hydraflow.run_collection import RunCollection
@@ -43,6 +43,59 @@ class RunCollectionInfo:
43
43
  return [load_config(run) for run in self._runs]
44
44
 
45
45
 
46
+ def get_artifact_dir(run: Run | None = None) -> Path:
47
+ """
48
+ Retrieve the artifact directory for the given run.
49
+
50
+ This function uses MLflow to get the artifact directory for the given run.
51
+
52
+ Args:
53
+ run (Run | None): The run object. Defaults to None.
54
+
55
+ Returns:
56
+ The local path to the directory where the artifacts are downloaded.
57
+ """
58
+ if run is None:
59
+ uri = mlflow.get_artifact_uri()
60
+ else:
61
+ uri = artifact_utils.get_artifact_uri(run.info.run_id)
62
+
63
+ return Path(mlflow.artifacts.download_artifacts(uri))
64
+
65
+
66
+ def get_hydra_output_dir(run: Run | None = None) -> Path:
67
+ """
68
+ Retrieve the Hydra output directory for the given run.
69
+
70
+ This function returns the Hydra output directory. If no run is provided,
71
+ it retrieves the output directory from the current Hydra configuration.
72
+ If a run is provided, it retrieves the artifact path for the run, loads
73
+ the Hydra configuration from the downloaded artifacts, and returns the
74
+ output directory specified in that configuration.
75
+
76
+ Args:
77
+ run (Run | None): The run object. Defaults to None.
78
+
79
+ Returns:
80
+ Path: The path to the Hydra output directory.
81
+
82
+ Raises:
83
+ FileNotFoundError: If the Hydra configuration file is not found
84
+ in the artifacts.
85
+ """
86
+ if run is None:
87
+ hc = HydraConfig.get()
88
+ return Path(hc.runtime.output_dir)
89
+
90
+ path = get_artifact_dir(run) / ".hydra/hydra.yaml"
91
+
92
+ if path.exists():
93
+ hc = OmegaConf.load(path)
94
+ return Path(hc.hydra.runtime.output_dir)
95
+
96
+ raise FileNotFoundError
97
+
98
+
46
99
  def load_config(run: Run) -> DictConfig:
47
100
  """
48
101
  Load the configuration for a given run.
hydraflow/mlflow.py CHANGED
@@ -1,6 +1,20 @@
1
1
  """
2
- This module provides functionality to log parameters from Hydra
3
- configuration objects and set up experiments using MLflow.
2
+ This module provides functionality to log parameters from Hydra configuration objects
3
+ and set up experiments using MLflow. It includes methods for managing experiments,
4
+ searching for runs, and logging parameters and artifacts.
5
+
6
+ Key Features:
7
+ - **Experiment Management**: Set and manage MLflow experiments with customizable names
8
+ based on Hydra configuration.
9
+ - **Run Logging**: Log parameters and metrics from Hydra configuration objects to
10
+ MLflow, ensuring that all relevant information is captured during experiments.
11
+ - **Run Search**: Search for runs based on various criteria, allowing for flexible
12
+ retrieval of experiment results.
13
+ - **Artifact Management**: Retrieve and log artifacts associated with runs, facilitating
14
+ easy access to outputs generated during experiments.
15
+
16
+ This module is designed to integrate seamlessly with Hydra, providing a robust
17
+ solution for tracking machine learning experiments and their associated metadata.
4
18
  """
5
19
 
6
20
  from __future__ import annotations
@@ -10,14 +24,14 @@ from typing import TYPE_CHECKING
10
24
 
11
25
  import mlflow
12
26
  from hydra.core.hydra_config import HydraConfig
13
- from mlflow.tracking import artifact_utils
14
- from omegaconf import OmegaConf
27
+ from mlflow.entities import ViewType
28
+ from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
15
29
 
16
30
  from hydraflow.config import iter_params
31
+ from hydraflow.run_collection import RunCollection
17
32
 
18
33
  if TYPE_CHECKING:
19
34
  from mlflow.entities.experiment import Experiment
20
- from mlflow.entities.run import Run
21
35
 
22
36
 
23
37
  def set_experiment(
@@ -26,7 +40,7 @@ def set_experiment(
26
40
  uri: str | Path | None = None,
27
41
  ) -> Experiment:
28
42
  """
29
- Set the experiment name and tracking URI optionally.
43
+ Set the experiment name and, optionally, the tracking URI.
30
44
 
31
45
  This function sets the experiment name by combining the given prefix,
32
46
  the job name from HydraConfig, and the given suffix. Optionally, it can
@@ -66,54 +80,96 @@ def log_params(config: object, *, synchronous: bool | None = None) -> None:
66
80
  mlflow.log_param(key, value, synchronous=synchronous)
67
81
 
68
82
 
69
- def get_artifact_dir(run: Run | None = None) -> Path:
83
+ def search_runs(
84
+ experiment_ids: list[str] | None = None,
85
+ filter_string: str = "",
86
+ run_view_type: int = ViewType.ACTIVE_ONLY,
87
+ max_results: int = SEARCH_MAX_RESULTS_PANDAS,
88
+ order_by: list[str] | None = None,
89
+ search_all_experiments: bool = False,
90
+ experiment_names: list[str] | None = None,
91
+ ) -> RunCollection:
70
92
  """
71
- Retrieve the artifact directory for the given run.
93
+ Search for Runs that fit the specified criteria.
72
94
 
73
- This function uses MLflow to get the artifact directory for the given run.
95
+ This function wraps the `mlflow.search_runs` function and returns the
96
+ results as a `RunCollection` object. It allows for flexible searching of
97
+ MLflow runs based on various criteria.
98
+
99
+ Note:
100
+ The returned runs are sorted by their start time in ascending order.
74
101
 
75
102
  Args:
76
- run (Run | None): The run object. Defaults to None.
103
+ experiment_ids (list[str] | None): List of experiment IDs. Search can
104
+ work with experiment IDs or experiment names, but not both in the
105
+ same call. Values other than ``None`` or ``[]`` will result in
106
+ error if ``experiment_names`` is also not ``None`` or ``[]``.
107
+ ``None`` will default to the active experiment if ``experiment_names``
108
+ is ``None`` or ``[]``.
109
+ filter_string (str): Filter query string, defaults to searching all
110
+ runs.
111
+ run_view_type (int): one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``,
112
+ or ``ALL`` runs defined in :py:class:`mlflow.entities.ViewType`.
113
+ max_results (int): The maximum number of runs to put in the dataframe.
114
+ Default is 100,000 to avoid causing out-of-memory issues on the user's
115
+ machine.
116
+ order_by (list[str] | None): List of columns to order by (e.g.,
117
+ "metrics.rmse"). The ``order_by`` column can contain an optional
118
+ ``DESC`` or ``ASC`` value. The default is ``ASC``. The default
119
+ ordering is to sort by ``start_time DESC``, then ``run_id``.
120
+ The returned runs are then re-sorted by ``start_time`` ascending.
121
+ search_all_experiments (bool): Boolean specifying whether all
122
+ experiments should be searched. Only honored if ``experiment_ids``
123
+ is ``[]`` or ``None``.
124
+ experiment_names (list[str] | None): List of experiment names. Search
125
+ can work with experiment IDs or experiment names, but not both in
126
+ the same call. Values other than ``None`` or ``[]`` will result in
127
+ error if ``experiment_ids`` is also not ``None`` or ``[]``.
128
+ ``None`` will default to the active
129
+ experiment if ``experiment_ids`` is ``None``
130
+ or ``[]``.
77
131
 
78
132
  Returns:
79
- The local path to the directory where the artifacts are downloaded.
133
+ A `RunCollection` object containing the search results.
80
134
  """
81
- if run is None:
82
- uri = mlflow.get_artifact_uri()
83
- else:
84
- uri = artifact_utils.get_artifact_uri(run.info.run_id)
85
-
86
- return Path(mlflow.artifacts.download_artifacts(uri))
87
-
88
-
89
- def get_hydra_output_dir(*, run: Run | None = None) -> Path:
135
+ runs = mlflow.search_runs(
136
+ experiment_ids=experiment_ids,
137
+ filter_string=filter_string,
138
+ run_view_type=run_view_type,
139
+ max_results=max_results,
140
+ order_by=order_by,
141
+ output_format="list",
142
+ search_all_experiments=search_all_experiments,
143
+ experiment_names=experiment_names,
144
+ )
145
+ runs = sorted(runs, key=lambda run: run.info.start_time) # type: ignore
146
+ return RunCollection(runs) # type: ignore
147
+
148
+
149
+ def list_runs(experiment_names: list[str] | None = None) -> RunCollection:
90
150
  """
91
- Retrieve the Hydra output directory for the given run.
151
+ List all runs for the specified experiments.
92
152
 
93
- This function returns the Hydra output directory. If no run is provided,
94
- it retrieves the output directory from the current Hydra configuration.
95
- If a run is provided, it retrieves the artifact path for the run, loads
96
- the Hydra configuration from the downloaded artifacts, and returns the
97
- output directory specified in that configuration.
153
+ This function retrieves all runs for the given list of experiment names.
154
+ If no experiment names are provided (None), it defaults to searching all runs
155
+ for the currently active experiment. If an empty list is provided, the function
156
+ will search all runs for all experiments except the "Default" experiment.
157
+ The function returns the results as a `RunCollection` object.
158
+
159
+ Note:
160
+ The returned runs are sorted by their start time in ascending order.
98
161
 
99
162
  Args:
100
- run (Run | None): The run object. Defaults to None.
163
+ experiment_names (list[str] | None): List of experiment names to search
164
+ for runs. If None, the currently active experiment is searched;
165
+ if an empty list, all experiments except the "Default"
166
+ experiment are searched.
101
167
 
102
168
  Returns:
103
- Path: The path to the Hydra output directory.
104
-
105
- Raises:
106
- FileNotFoundError: If the Hydra configuration file is not found
107
- in the artifacts.
169
+ A `RunCollection` object containing the runs for the specified experiments.
108
170
  """
109
- if run is None:
110
- hc = HydraConfig.get()
111
- return Path(hc.runtime.output_dir)
112
-
113
- path = get_artifact_dir(run) / ".hydra/hydra.yaml"
114
-
115
- if path.exists():
116
- hc = OmegaConf.load(path)
117
- return Path(hc.hydra.runtime.output_dir)
171
+ if experiment_names == []:
172
+ experiments = mlflow.search_experiments()
173
+ experiment_names = [e.name for e in experiments if e.name != "Default"]
118
174
 
119
- raise FileNotFoundError
175
+ return search_runs(experiment_names=experiment_names)
hydraflow/progress.py CHANGED
@@ -1,17 +1,129 @@
1
+ """
2
+ Module for managing progress tracking in parallel processing using Joblib
3
+ and Rich's Progress bar.
4
+
5
+ Provide context managers and functions to facilitate the execution
6
+ of tasks in parallel while displaying progress updates.
7
+
8
+ The following key components are provided:
9
+
10
+ - JoblibProgress: A context manager for tracking progress with Rich's Progress
11
+ bar.
12
+ - parallel_progress: A function to execute a given function in parallel over
13
+ an iterable with progress tracking.
14
+ - multi_tasks_progress: A function to render auto-updating progress bars for
15
+ multiple tasks concurrently.
16
+
17
+ Usage:
18
+ Import the necessary functions and use them to manage progress in your
19
+ parallel processing tasks.
20
+ """
21
+
1
22
  from __future__ import annotations
2
23
 
3
- from typing import TYPE_CHECKING
24
+ from contextlib import contextmanager
25
+ from typing import TYPE_CHECKING, TypeVar
4
26
 
5
27
  import joblib
6
28
  from rich.progress import Progress
7
29
 
8
30
  if TYPE_CHECKING:
9
- from collections.abc import Iterable
31
+ from collections.abc import Callable, Iterable, Iterator
10
32
 
11
33
  from rich.progress import ProgressColumn
12
34
 
13
35
 
14
- def multi_task_progress(
36
+ # https://github.com/jonghwanhyeon/joblib-progress/blob/main/joblib_progress/__init__.py
37
+ @contextmanager
38
+ def JoblibProgress(
39
+ *columns: ProgressColumn | str,
40
+ description: str | None = None,
41
+ total: int | None = None,
42
+ **kwargs,
43
+ ) -> Iterator[Progress]:
44
+ """
45
+ Context manager for tracking progress using Joblib with Rich's Progress bar.
46
+
47
+ Args:
48
+ *columns (ProgressColumn | str): Columns to display in the progress bar.
49
+ description (str | None, optional): A description for the progress task.
50
+ Defaults to None.
51
+ total (int | None, optional): The total number of tasks. If None, the
52
+ total is unknown and the bar is rendered as indeterminate.
53
+ **kwargs: Additional keyword arguments passed to the Progress instance.
54
+
55
+ Yields:
56
+ Progress: A Progress instance for managing the progress bar.
57
+
58
+ Example:
59
+ with JoblibProgress(description="task", total=100) as progress:
60
+ # Your parallel processing code here
61
+ """
62
+ if not columns:
63
+ columns = Progress.get_default_columns()
64
+
65
+ progress = Progress(*columns, **kwargs)
66
+
67
+ if description is None:
68
+ description = "Processing..."
69
+
70
+ task_id = progress.add_task(description, total=total)
71
+ print_progress = joblib.parallel.Parallel.print_progress
72
+
73
+ def update_progress(self: joblib.parallel.Parallel):
74
+ progress.update(task_id, completed=self.n_completed_tasks, refresh=True)
75
+ return print_progress(self)
76
+
77
+ try:
78
+ joblib.parallel.Parallel.print_progress = update_progress
79
+ progress.start()
80
+ yield progress
81
+
82
+ finally:
83
+ progress.stop()
84
+ joblib.parallel.Parallel.print_progress = print_progress
85
+
86
+
87
+ T = TypeVar("T")
88
+ U = TypeVar("U")
89
+
90
+
91
+ def parallel_progress(
92
+ func: Callable[[T], U],
93
+ iterable: Iterable[T],
94
+ *columns: ProgressColumn | str,
95
+ n_jobs: int = -1,
96
+ description: str | None = None,
97
+ **kwargs,
98
+ ) -> list[U]:
99
+ """
100
+ Execute a function in parallel over an iterable with progress tracking.
101
+
102
+ Args:
103
+ func (Callable[[T], U]): The function to execute on each item in the
104
+ iterable.
105
+ iterable (Iterable[T]): An iterable of items to process.
106
+ *columns (ProgressColumn | str): Columns to display in the
107
+ progress bar.
108
+ n_jobs (int, optional): The number of jobs to run in parallel.
109
+ Defaults to -1 (all processors).
110
+ description (str | None, optional): A description for the progress bar.
111
+ Defaults to None.
112
+ **kwargs: Additional keyword arguments passed to the Progress instance.
113
+
114
+ Returns:
115
+ list[U]: A list of results from applying the function to each item in
116
+ the iterable.
117
+ """
118
+ iterable = list(iterable)
119
+ total = len(iterable)
120
+
121
+ with JoblibProgress(*columns, description=description, total=total, **kwargs):
122
+ it = (joblib.delayed(func)(x) for x in iterable)
123
+ return joblib.Parallel(n_jobs=n_jobs)(it) # type: ignore
124
+
125
+
126
+ def multi_tasks_progress(
15
127
  iterables: Iterable[Iterable[int | tuple[int, int]]],
16
128
  *columns: ProgressColumn | str,
17
129
  n_jobs: int = -1,
@@ -52,7 +164,8 @@ def multi_task_progress(
52
164
 
53
165
  task_main = progress.add_task(main_description, total=None) if n > 1 else None
54
166
  tasks = [
55
- progress.add_task(description.format(i), start=False, total=None) for i in range(n)
167
+ progress.add_task(description.format(i), start=False, total=None)
168
+ for i in range(n)
56
169
  ]
57
170
 
58
171
  total = {}
@@ -87,45 +200,3 @@ def multi_task_progress(
87
200
 
88
201
  else:
89
202
  func(0)
90
-
91
-
92
- if __name__ == "__main__":
93
- import random
94
- import time
95
-
96
- from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TimeElapsedColumn
97
-
98
- from hydraflow.progress import multi_task_progress
99
-
100
- def task(total):
101
- for i in range(total or 90):
102
- if total is None:
103
- yield i
104
- else:
105
- yield i, total
106
- time.sleep(random.random() / 30)
107
-
108
- def multi_task_progress_test(unknown_total: bool):
109
- tasks = [task(random.randint(80, 100)) for _ in range(4)]
110
- if unknown_total:
111
- tasks = [task(None), *tasks, task(None)]
112
-
113
- columns = [
114
- SpinnerColumn(),
115
- *Progress.get_default_columns(),
116
- MofNCompleteColumn(),
117
- TimeElapsedColumn(),
118
- ]
119
-
120
- kwargs = {}
121
- if unknown_total:
122
- kwargs["main_description"] = "unknown"
123
-
124
- multi_task_progress(tasks, *columns, n_jobs=4, **kwargs)
125
-
126
- multi_task_progress_test(False)
127
- multi_task_progress_test(True)
128
- multi_task_progress([task(100)])
129
- multi_task_progress([task(None)], description="unknown")
130
- multi_task_progress([task(100), task(None)], main_description="transient", transient=True)
131
- multi_task_progress([task(100)], description="transient", transient=True)
@@ -1,7 +1,24 @@
1
1
  """
2
- This module provides functionality for managing and interacting with MLflow
3
- runs. It includes the `RunCollection` class and various methods to filter
4
- runs, retrieve run information, log artifacts, and load configurations.
2
+ This module provides functionality for managing and interacting with MLflow runs.
3
+ It includes the `RunCollection` class, which serves as a container for multiple MLflow
4
+ run objects, and various methods to filter, retrieve, and manipulate these runs.
5
+
6
+ Key Features:
7
+ - **Run Management**: The `RunCollection` class allows for easy management of multiple
8
+ MLflow runs, providing methods to access, filter, and sort runs based on various
9
+ criteria.
10
+ - **Filtering**: The module supports filtering runs based on specific configurations
11
+ and parameters, enabling users to easily find runs that match certain conditions.
12
+ - **Retrieval**: Users can retrieve specific runs, including the first, last, or any
13
+ run that matches a given configuration.
14
+ - **Artifact Handling**: The module provides methods to access and manipulate the
15
+ artifacts associated with each run, including retrieving artifact URIs and directories.
16
+
17
+ The `RunCollection` class is designed to work seamlessly with the MLflow tracking
18
+ API, providing a robust solution for managing machine learning experiment runs and
19
+ their associated metadata. This module is particularly useful for data scientists and
20
+ machine learning engineers who need to track and analyze the results of their experiments
21
+ efficiently.
5
22
  """
6
23
 
7
24
  from __future__ import annotations
@@ -10,10 +27,7 @@ from dataclasses import dataclass, field
10
27
  from itertools import chain
11
28
  from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar
12
29
 
13
- import mlflow
14
- from mlflow.entities import ViewType
15
30
  from mlflow.entities.run import Run
16
- from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
17
31
 
18
32
  from hydraflow.config import iter_params
19
33
  from hydraflow.info import RunCollectionInfo
@@ -26,101 +40,6 @@ if TYPE_CHECKING:
26
40
  from omegaconf import DictConfig
27
41
 
28
42
 
29
- def search_runs(
30
- experiment_ids: list[str] | None = None,
31
- filter_string: str = "",
32
- run_view_type: int = ViewType.ACTIVE_ONLY,
33
- max_results: int = SEARCH_MAX_RESULTS_PANDAS,
34
- order_by: list[str] | None = None,
35
- search_all_experiments: bool = False,
36
- experiment_names: list[str] | None = None,
37
- ) -> RunCollection:
38
- """
39
- Search for Runs that fit the specified criteria.
40
-
41
- This function wraps the `mlflow.search_runs` function and returns the
42
- results as a `RunCollection` object. It allows for flexible searching of
43
- MLflow runs based on various criteria.
44
-
45
- Note:
46
- The returned runs are sorted by their start time in ascending order.
47
-
48
- Args:
49
- experiment_ids (list[str] | None): List of experiment IDs. Search can
50
- work with experiment IDs or experiment names, but not both in the
51
- same call. Values other than ``None`` or ``[]`` will result in
52
- error if ``experiment_names`` is also not ``None`` or ``[]``.
53
- ``None`` will default to the active experiment if ``experiment_names``
54
- is ``None`` or ``[]``.
55
- filter_string (str): Filter query string, defaults to searching all
56
- runs.
57
- run_view_type (int): one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``,
58
- or ``ALL`` runs defined in :py:class:`mlflow.entities.ViewType`.
59
- max_results (int): The maximum number of runs to put in the dataframe.
60
- Default is 100,000 to avoid causing out-of-memory issues on the user's
61
- machine.
62
- order_by (list[str] | None): List of columns to order by (e.g.,
63
- "metrics.rmse"). The ``order_by`` column can contain an optional
64
- ``DESC`` or ``ASC`` value. The default is ``ASC``. The default
65
- ordering is to sort by ``start_time DESC``, then ``run_id``.
66
- ``start_time DESC``, then ``run_id``.
67
- search_all_experiments (bool): Boolean specifying whether all
68
- experiments should be searched. Only honored if ``experiment_ids``
69
- is ``[]`` or ``None``.
70
- experiment_names (list[str] | None): List of experiment names. Search
71
- can work with experiment IDs or experiment names, but not both in
72
- the same call. Values other than ``None`` or ``[]`` will result in
73
- error if ``experiment_ids`` is also not ``None`` or ``[]``.
74
- ``experiment_ids`` is also not ``None`` or ``[]``. ``None`` will
75
- default to the active experiment if ``experiment_ids`` is ``None``
76
- or ``[]``.
77
-
78
- Returns:
79
- A `RunCollection` object containing the search results.
80
- """
81
- runs = mlflow.search_runs(
82
- experiment_ids=experiment_ids,
83
- filter_string=filter_string,
84
- run_view_type=run_view_type,
85
- max_results=max_results,
86
- order_by=order_by,
87
- output_format="list",
88
- search_all_experiments=search_all_experiments,
89
- experiment_names=experiment_names,
90
- )
91
- runs = sorted(runs, key=lambda run: run.info.start_time) # type: ignore
92
- return RunCollection(runs) # type: ignore
93
-
94
-
95
- def list_runs(experiment_names: list[str] | None = None) -> RunCollection:
96
- """
97
- List all runs for the specified experiments.
98
-
99
- This function retrieves all runs for the given list of experiment names.
100
- If no experiment names are provided (None), it defaults to searching all runs
101
- for the currently active experiment. If an empty list is provided, the function
102
- will search all runs for all experiments except the "Default" experiment.
103
- The function returns the results as a `RunCollection` object.
104
-
105
- Note:
106
- The returned runs are sorted by their start time in ascending order.
107
-
108
- Args:
109
- experiment_names (list[str] | None): List of experiment names to search
110
- for runs. If None or an empty list is provided, the function will
111
- search the currently active experiment or all experiments except
112
- the "Default" experiment.
113
-
114
- Returns:
115
- A `RunCollection` object containing the runs for the specified experiments.
116
- """
117
- if experiment_names == []:
118
- experiments = mlflow.search_experiments()
119
- experiment_names = [e.name for e in experiments if e.name != "Default"]
120
-
121
- return search_runs(experiment_names=experiment_names)
122
-
123
-
124
43
  T = TypeVar("T")
125
44
  P = ParamSpec("P")
126
45
 
@@ -132,6 +51,11 @@ class RunCollection:
132
51
 
133
52
  This class provides methods to interact with the runs, such as filtering,
134
53
  retrieving specific runs, and accessing run information.
54
+
55
+ Key Features:
56
+ - Filtering: Easily filter runs based on various criteria.
57
+ - Retrieval: Access specific runs by index or through methods.
58
+ - Metadata: Access run metadata and associated information.
135
59
  """
136
60
 
137
61
  _runs: list[Run]
@@ -544,7 +468,9 @@ class RunCollection:
544
468
  """
545
469
  return (func(dir, *args, **kwargs) for dir in self.info.artifact_dir)
546
470
 
547
- def group_by(self, *names: str | list[str]) -> dict[tuple[str | None, ...], RunCollection]:
471
+ def group_by(
472
+ self, *names: str | list[str]
473
+ ) -> dict[tuple[str | None, ...], RunCollection]:
548
474
  """
549
475
  Group runs by specified parameter names.
550
476
 
@@ -569,25 +495,6 @@ class RunCollection:
569
495
 
570
496
  return {key: RunCollection(runs) for key, runs in grouped_runs.items()}
571
497
 
572
- def group_by_values(self, *names: str | list[str]) -> list[RunCollection]:
573
- """
574
- Group runs by specified parameter names.
575
-
576
- This method groups the runs in the collection based on the values of the
577
- specified parameters. Each unique combination of parameter values will
578
- form a separate RunCollection in the returned list.
579
-
580
- Args:
581
- *names (str | list[str]): The names of the parameters to group by.
582
- This can be a single parameter name or multiple names provided
583
- as separate arguments or as a list.
584
-
585
- Returns:
586
- list[RunCollection]: A list of RunCollection objects, where each
587
- object contains runs that match the specified parameter values.
588
- """
589
- return list(self.group_by(*names).values())
590
-
591
498
 
592
499
  def _param_matches(run: Run, key: str, value: Any) -> bool:
593
500
  """
@@ -747,7 +654,9 @@ def find_last_run(runs: list[Run], config: object | None = None, **kwargs) -> Ru
747
654
  return filtered_runs[-1]
748
655
 
749
656
 
750
- def try_find_last_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
657
+ def try_find_last_run(
658
+ runs: list[Run], config: object | None = None, **kwargs
659
+ ) -> Run | None:
751
660
  """
752
661
  Find the last run based on the provided configuration.
753
662
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hydraflow
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
5
5
  Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
6
  Project-URL: Source, https://github.com/daizutabi/hydraflow
@@ -0,0 +1,12 @@
1
+ hydraflow/__init__.py,sha256=B7rWSiGP5WwWjijcb41Bv9uuo5MQ6gbBbVWGAWYtK-k,598
2
+ hydraflow/asyncio.py,sha256=jdXuEFC6f7L_Dq6beASFZPQSvCnGimVxU-PRFsNc5U0,6241
3
+ hydraflow/config.py,sha256=6TCKNQZ3sSrIEvl245T2udwFuknejyN1dMcIVmOHdrQ,2102
4
+ hydraflow/context.py,sha256=G7JMrG70sgBH2qILXl5nkGWNUoRggj518JWUq0ZiJ9E,7776
5
+ hydraflow/info.py,sha256=Vj2sT66Ric63mmaq7Yu8nDFhsGQYO3MCHrxFpapDufc,3458
6
+ hydraflow/mlflow.py,sha256=Q8RGijSURTjRkEDxzi_2Tk9KOx3QK__al5aArGQriHA,7249
7
+ hydraflow/progress.py,sha256=UIIKlweji3L0uRi4hZ_DrtRcnayHPlsMoug7hVEKq8k,6753
8
+ hydraflow/run_collection.py,sha256=V5lGdGHYgsSpBOYGaVEL1mpKJvdiEshBL0KmmZ8qeZo,29161
9
+ hydraflow-0.2.9.dist-info/METADATA,sha256=ZjJQz_4MogGkcs16dOwnsp_J0icg9ypgQdXOYxVdxJg,4181
10
+ hydraflow-0.2.9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
11
+ hydraflow-0.2.9.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
12
+ hydraflow-0.2.9.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- hydraflow/__init__.py,sha256=ObIv7fGbNsqUhZf3sst-9pbgyFsJr6jVsNV10NmMQas,483
2
- hydraflow/asyncio.py,sha256=yh851L315QHzRBwq6r-uwO2oZKgz1JawHp-fswfxT1E,6175
3
- hydraflow/config.py,sha256=6TCKNQZ3sSrIEvl245T2udwFuknejyN1dMcIVmOHdrQ,2102
4
- hydraflow/context.py,sha256=8Qn99yCSkCarDDthQ6hjgW80CBBIg0H7fnLvtw4ZXo8,7248
5
- hydraflow/info.py,sha256=LziP71wQ-tDQPMUPFV_6JExBU8r-Ja-O05F07b_RUcc,1812
6
- hydraflow/mlflow.py,sha256=USd51C5YFlk4Bjhs4F1PMakxDxjD6Nn2t4GhL6aZ6QQ,3647
7
- hydraflow/progress.py,sha256=0GJfKnnY_SAHVWpGvLdgOBsogGs8vVofjLuphuUEy2g,4296
8
- hydraflow/run_collection.py,sha256=NO_QEwIwxU0EouKCJ4HAhXd35uJrxqolI7vM5QfsNxw,33152
9
- hydraflow-0.2.7.dist-info/METADATA,sha256=_kqK5pFLntvmiFIc1UBWOzDSRMeerXDZ0ZozhlTMkSw,4181
10
- hydraflow-0.2.7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
11
- hydraflow-0.2.7.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
12
- hydraflow-0.2.7.dist-info/RECORD,,