hydraflow 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hydraflow/__init__.py CHANGED
@@ -3,15 +3,10 @@ from .mlflow import set_experiment
3
3
  from .runs import (
4
4
  Run,
5
5
  Runs,
6
- drop_unique_params,
7
6
  filter_runs,
8
- get_artifact_dir,
9
- get_artifact_path,
10
- get_artifact_uri,
11
7
  get_param_dict,
12
8
  get_param_names,
13
9
  get_run,
14
- get_run_id,
15
10
  load_config,
16
11
  )
17
12
 
@@ -20,15 +15,10 @@ __all__ = [
20
15
  "Run",
21
16
  "Runs",
22
17
  "chdir_artifact",
23
- "drop_unique_params",
24
18
  "filter_runs",
25
- "get_artifact_dir",
26
- "get_artifact_path",
27
- "get_artifact_uri",
28
19
  "get_param_dict",
29
20
  "get_param_names",
30
21
  "get_run",
31
- "get_run_id",
32
22
  "load_config",
33
23
  "log_run",
34
24
  "set_experiment",
hydraflow/config.py CHANGED
@@ -1,3 +1,8 @@
1
+ """
2
+ This module provides functionality for working with configuration
3
+ objects using the OmegaConf library.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
3
8
  from typing import TYPE_CHECKING
@@ -10,12 +15,32 @@ if TYPE_CHECKING:
10
15
 
11
16
 
12
17
  def iter_params(config: object, prefix: str = "") -> Iterator[tuple[str, Any]]:
13
- if not isinstance(config, DictConfig | ListConfig):
18
+ """
19
+ Recursively iterate over the parameters in the given configuration object.
20
+
21
+ This function traverses the configuration object and yields key-value pairs
22
+ representing the parameters. The keys are prefixed with the provided prefix.
23
+
24
+ Args:
25
+ config: The configuration object to iterate over. This can be a dictionary,
26
+ list, DictConfig, or ListConfig.
27
+ prefix: The prefix to prepend to the parameter keys.
28
+ Defaults to an empty string.
29
+
30
+ Yields:
31
+ Key-value pairs representing the parameters in the configuration object.
32
+ """
33
+ if not isinstance(config, (DictConfig, ListConfig)):
14
34
  config = OmegaConf.create(config) # type: ignore
15
35
 
16
36
  if isinstance(config, DictConfig):
17
37
  for key, value in config.items():
18
- if isinstance(value, (DictConfig, ListConfig)):
38
+ if isinstance(value, ListConfig) and not any(
39
+ isinstance(v, (DictConfig, ListConfig)) for v in value
40
+ ):
41
+ yield f"{prefix}{key}", value
42
+
43
+ elif isinstance(value, (DictConfig, ListConfig)):
19
44
  yield from iter_params(value, f"{prefix}{key}.")
20
45
 
21
46
  else:
hydraflow/context.py CHANGED
@@ -1,5 +1,11 @@
1
+ """
2
+ This module provides context managers to log parameters and manage the MLflow
3
+ run context.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
8
+ import logging
3
9
  import os
4
10
  import time
5
11
  from contextlib import contextmanager
@@ -12,15 +18,14 @@ from hydra.core.hydra_config import HydraConfig
12
18
  from watchdog.events import FileModifiedEvent, FileSystemEventHandler
13
19
  from watchdog.observers import Observer
14
20
 
15
- from hydraflow.mlflow import log_params
16
- from hydraflow.runs import get_artifact_path
17
- from hydraflow.util import uri_to_path
21
+ from hydraflow.mlflow import get_artifact_dir, log_params
18
22
 
19
23
  if TYPE_CHECKING:
20
24
  from collections.abc import Callable, Iterator
21
25
 
22
26
  from mlflow.entities.run import Run
23
- from pandas import Series
27
+
28
+ log = logging.getLogger(__name__)
24
29
 
25
30
 
26
31
  @dataclass
@@ -35,12 +40,33 @@ def log_run(
35
40
  *,
36
41
  synchronous: bool | None = None,
37
42
  ) -> Iterator[Info]:
43
+ """
44
+ Log the parameters from the given configuration object and manage the MLflow
45
+ run context.
46
+
47
+ This context manager logs the parameters from the provided configuration object
48
+ using MLflow. It also manages the MLflow run context, ensuring that artifacts
49
+ are logged and the run is properly closed.
50
+
51
+ Args:
52
+ config: The configuration object to log the parameters from.
53
+ synchronous: Whether to log the parameters synchronously.
54
+ Defaults to None.
55
+
56
+ Yields:
57
+ Info: An `Info` object containing the output directory and artifact directory
58
+ paths.
59
+
60
+ Example:
61
+ with log_run(config) as info:
62
+ # Perform operations within the MLflow run context
63
+ pass
64
+ """
38
65
  log_params(config, synchronous=synchronous)
39
66
 
40
67
  hc = HydraConfig.get()
41
68
  output_dir = Path(hc.runtime.output_dir)
42
- uri = mlflow.get_artifact_uri()
43
- info = Info(output_dir, uri_to_path(uri))
69
+ info = Info(output_dir, get_artifact_dir())
44
70
 
45
71
  # Save '.hydra' config directory first.
46
72
  output_subdir = output_dir / (hc.output_subdir or "")
@@ -54,16 +80,48 @@ def log_run(
54
80
  with watch(log_artifact, output_dir):
55
81
  yield info
56
82
 
83
+ except Exception as e:
84
+ log.error(f"Error during log_run: {e}")
85
+ raise
86
+
57
87
  finally:
58
88
  # Save output_dir including '.hydra' config directory.
59
89
  mlflow.log_artifacts(output_dir.as_posix())
60
90
 
61
91
 
62
92
  @contextmanager
63
- def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60) -> Iterator[None]:
64
- if not dir:
65
- uri = mlflow.get_artifact_uri()
66
- dir = uri_to_path(uri)
93
+ def watch(
94
+ func: Callable[[Path], None],
95
+ dir: Path | str = "",
96
+ timeout: int = 60,
97
+ ) -> Iterator[None]:
98
+ """
99
+ Watch the given directory for changes and call the provided function
100
+ when a change is detected.
101
+
102
+ This context manager sets up a file system watcher on the specified directory.
103
+ When a file modification is detected, the provided function is called with
104
+ the path of the modified file. The watcher runs for the specified timeout
105
+ period or until the context is exited.
106
+
107
+ Args:
108
+ func: The function to call when a change is
109
+ detected. It should accept a single argument of type `Path`,
110
+ which is the path of the modified file.
111
+ dir: The directory to watch. If not specified,
112
+ the current MLflow artifact directory is used. Defaults to "".
113
+ timeout: The timeout period in seconds for the watcher
114
+ to run after the context is exited. Defaults to 60.
115
+
116
+ Yields:
117
+ None
118
+
119
+ Example:
120
+ with watch(log_artifact, "/path/to/dir"):
121
+ # Perform operations while watching the directory for changes
122
+ pass
123
+ """
124
+ dir = dir or get_artifact_dir()
67
125
 
68
126
  handler = Handler(func)
69
127
  observer = Observer()
@@ -73,6 +131,10 @@ def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60)
73
131
  try:
74
132
  yield
75
133
 
134
+ except Exception as e:
135
+ log.error(f"Error during watch: {e}")
136
+ raise
137
+
76
138
  finally:
77
139
  elapsed = 0
78
140
  while not observer.event_queue.empty():
@@ -97,15 +159,30 @@ class Handler(FileSystemEventHandler):
97
159
 
98
160
  @contextmanager
99
161
  def chdir_artifact(
100
- run: Run | Series | str,
162
+ run: Run,
101
163
  artifact_path: str | None = None,
102
164
  ) -> Iterator[Path]:
165
+ """
166
+ Change the current working directory to the artifact directory of the
167
+ given run.
168
+
169
+ This context manager changes the current working directory to the artifact
170
+ directory of the given run. It ensures that the directory is changed back
171
+ to the original directory after the context is exited.
172
+
173
+ Args:
174
+ run: The run to get the artifact directory from.
175
+ artifact_path: The artifact path.
176
+ """
103
177
  curdir = Path.cwd()
178
+ path = mlflow.artifacts.download_artifacts(
179
+ run_id=run.info.run_id,
180
+ artifact_path=artifact_path,
181
+ )
104
182
 
105
- artifact_dir = get_artifact_path(run, artifact_path)
106
-
107
- os.chdir(artifact_dir)
183
+ os.chdir(path)
108
184
  try:
109
- yield artifact_dir
185
+ yield Path(path)
186
+
110
187
  finally:
111
188
  os.chdir(curdir)
hydraflow/mlflow.py CHANGED
@@ -1,5 +1,12 @@
1
+ """
2
+ This module provides functionality to log parameters from Hydra
3
+ configuration objects and set up experiments using MLflow.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
8
+ from pathlib import Path
9
+
3
10
  import mlflow
4
11
  from hydra.core.hydra_config import HydraConfig
5
12
 
@@ -7,6 +14,18 @@ from hydraflow.config import iter_params
7
14
 
8
15
 
9
16
  def set_experiment(prefix: str = "", suffix: str = "", uri: str | None = None) -> None:
17
+ """
18
+ Set the experiment name and, optionally, the tracking URI.
19
+
20
+ This function sets the experiment name by combining the given prefix,
21
+ the job name from HydraConfig, and the given suffix. Optionally, it can
22
+ also set the tracking URI.
23
+
24
+ Args:
25
+ prefix: The prefix to prepend to the experiment name.
26
+ suffix: The suffix to append to the experiment name.
27
+ uri: The tracking URI to use.
28
+ """
10
29
  if uri:
11
30
  mlflow.set_tracking_uri(uri)
12
31
 
@@ -16,5 +35,38 @@ def set_experiment(prefix: str = "", suffix: str = "", uri: str | None = None) -
16
35
 
17
36
 
18
37
  def log_params(config: object, *, synchronous: bool | None = None) -> None:
38
+ """
39
+ Log the parameters from the given configuration object.
40
+
41
+ This method logs the parameters from the provided configuration object
42
+ using MLflow. It iterates over the parameters and logs them using the
43
+ `mlflow.log_param` method.
44
+
45
+ Args:
46
+ config: The configuration object to log the parameters from.
47
+ synchronous: Whether to log the parameters synchronously.
48
+ Defaults to None.
49
+ """
19
50
  for key, value in iter_params(config):
20
51
  mlflow.log_param(key, value, synchronous=synchronous)
52
+
53
+
54
+ def get_artifact_dir(artifact_path: str | None = None) -> Path:
55
+ """
56
+ Get the artifact directory for the given artifact path.
57
+
58
+ This function retrieves the artifact URI for the specified artifact path
59
+ using MLflow, downloads the artifacts to a local directory, and returns
60
+ the path to that directory.
61
+
62
+ Args:
63
+ artifact_path: The artifact path for which to get the directory.
64
+ Defaults to None.
65
+
66
+ Returns:
67
+ The local path to the directory where the artifacts are downloaded.
68
+ """
69
+ uri = mlflow.get_artifact_uri(artifact_path)
70
+ dir = mlflow.artifacts.download_artifacts(artifact_uri=uri)
71
+
72
+ return Path(dir)
hydraflow/runs.py CHANGED
@@ -1,27 +1,95 @@
1
+ """
2
+ This module provides functionality for managing and interacting with MLflow runs.
3
+ It includes the `Runs` class and various methods to filter runs, retrieve run information,
4
+ log artifacts, and load configurations.
5
+ """
6
+
1
7
  from __future__ import annotations
2
8
 
3
9
  from dataclasses import dataclass
4
10
  from functools import cache
5
- from pathlib import Path
11
+ from itertools import chain
6
12
  from typing import TYPE_CHECKING, Any
7
13
 
8
14
  import mlflow
9
- import numpy as np
10
- from mlflow.entities.run import Run as Run_
11
- from mlflow.tracking import artifact_utils
15
+ from mlflow.entities import ViewType
16
+ from mlflow.entities.run import Run
17
+ from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
12
18
  from omegaconf import DictConfig, OmegaConf
13
- from pandas import DataFrame, Series
14
19
 
15
20
  from hydraflow.config import iter_params
16
- from hydraflow.util import uri_to_path
17
21
 
18
22
  if TYPE_CHECKING:
19
23
  from typing import Any
20
24
 
21
25
 
26
+ def search_runs(
27
+ experiment_ids: list[str] | None = None,
28
+ filter_string: str = "",
29
+ run_view_type: int = ViewType.ACTIVE_ONLY,
30
+ max_results: int = SEARCH_MAX_RESULTS_PANDAS,
31
+ order_by: list[str] | None = None,
32
+ search_all_experiments: bool = False,
33
+ experiment_names: list[str] | None = None,
34
+ ) -> Runs:
35
+ """
36
+ Search for Runs that fit the specified criteria.
37
+
38
+ This function wraps the `mlflow.search_runs` function and returns the results
39
+ as a `Runs` object. It allows for flexible searching of MLflow runs based on
40
+ various criteria.
41
+
42
+ Args:
43
+ experiment_ids: List of experiment IDs. Search can work with experiment IDs or
44
+ experiment names, but not both in the same call. Values other than
45
+ ``None`` or ``[]`` will result in error if ``experiment_names`` is
46
+ also not ``None`` or ``[]``. ``None`` will default to the active
47
+ experiment if ``experiment_names`` is ``None`` or ``[]``.
48
+ filter_string: Filter query string, defaults to searching all runs.
49
+ run_view_type: one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``, or ``ALL`` runs
50
+ defined in :py:class:`mlflow.entities.ViewType`.
51
+ max_results: The maximum number of runs to return. Default is 100,000
52
+ to avoid causing out-of-memory issues on the user's machine.
53
+ order_by: List of columns to order by (e.g., "metrics.rmse"). The ``order_by`` column
54
+ can contain an optional ``DESC`` or ``ASC`` value. The default is ``ASC``.
55
+ The default ordering is to sort by ``start_time DESC``, then ``run_id``.
56
+ output_format: Not a parameter of this function. The underlying search is
57
+ always performed with ``output_format="list"``, and the resulting list of
58
+ :py:class:`mlflow.entities.Run` is wrapped in a `Runs` object.
59
+ search_all_experiments: Boolean specifying whether all experiments should be searched.
60
+ Only honored if ``experiment_ids`` is ``[]`` or ``None``.
61
+ experiment_names: List of experiment names. Search can work with experiment IDs or
62
+ experiment names, but not both in the same call. Values other
63
+ than ``None`` or ``[]`` will result in error if ``experiment_ids``
64
+ is also not ``None`` or ``[]``. ``None`` will default to the active
65
+ experiment if ``experiment_ids`` is ``None`` or ``[]``.
66
+
67
+ Returns:
68
+ A `Runs` object containing the search results.
69
+ """
70
+ runs = mlflow.search_runs(
71
+ experiment_ids=experiment_ids,
72
+ filter_string=filter_string,
73
+ run_view_type=run_view_type,
74
+ max_results=max_results,
75
+ order_by=order_by,
76
+ output_format="list",
77
+ search_all_experiments=search_all_experiments,
78
+ experiment_names=experiment_names,
79
+ )
80
+ return Runs(runs) # type: ignore
81
+
82
+
22
83
  @dataclass
23
84
  class Runs:
24
- runs: list[Run_] | DataFrame
85
+ """
86
+ A class to represent a collection of MLflow runs.
87
+
88
+ This class provides methods to interact with the runs, such as filtering,
89
+ retrieving specific runs, and accessing run information.
90
+ """
91
+
92
+ runs: list[Run]
25
93
 
26
94
  def __repr__(self) -> str:
27
95
  return f"{self.__class__.__name__}({len(self)})"
@@ -30,161 +98,280 @@ class Runs:
30
98
  return len(self.runs)
31
99
 
32
100
  def filter(self, config: object) -> Runs:
101
+ """
102
+ Filter the runs based on the provided configuration.
103
+
104
+ This method filters the runs in the collection according to the
105
+ specified configuration object. The configuration object should
106
+ contain key-value pairs that correspond to the parameters of the
107
+ runs. Only the runs that match all the specified parameters will
108
+ be included in the returned `Runs` object.
109
+
110
+ Args:
111
+ config: The configuration object to filter the runs.
112
+
113
+ Returns:
114
+ A new `Runs` object containing the filtered runs.
115
+ """
33
116
  return Runs(filter_runs(self.runs, config))
34
117
 
35
- def get(self, config: object) -> Run:
36
- return Run(get_run(self.runs, config))
118
+ def get(self, config: object) -> Run | None:
119
+ """
120
+ Retrieve a specific run based on the provided configuration.
37
121
 
38
- def drop_unique_params(self) -> Runs:
39
- if isinstance(self.runs, DataFrame):
40
- return Runs(drop_unique_params(self.runs))
122
+ This method filters the runs in the collection according to the
123
+ specified configuration object and returns the run that matches
124
+ the provided parameters. If more than one run matches the criteria,
125
+ a `ValueError` is raised.
41
126
 
42
- raise NotImplementedError
127
+ Args:
128
+ config: The configuration object to identify the run.
43
129
 
44
- def get_param_names(self) -> list[str]:
45
- if isinstance(self.runs, DataFrame):
46
- return get_param_names(self.runs)
130
+ Returns:
131
+ Run: The run object that matches the provided configuration.
132
+ None, if no runs match the provided configuration.
47
133
 
48
- raise NotImplementedError
134
+ Raises:
135
+ ValueError: If more than one run matches the criteria.
136
+ """
137
+ return get_run(self.runs, config)
49
138
 
50
- def get_param_dict(self) -> dict[str, list[str]]:
51
- if isinstance(self.runs, DataFrame):
52
- return get_param_dict(self.runs)
139
+ def get_earliest_run(self, config: object | None = None, **kwargs) -> Run | None:
140
+ """
141
+ Get the earliest run from the list of runs based on the start time.
53
142
 
54
- raise NotImplementedError
143
+ This method filters the runs based on the configuration if provided
144
+ and returns the run with the earliest start time.
55
145
 
146
+ Args:
147
+ config: The configuration object to filter the runs.
148
+ If None, no filtering is applied.
149
+ **kwargs: Additional key-value pairs to filter the runs.
56
150
 
57
- def filter_runs(runs: list[Run_] | DataFrame, config: object) -> list[Run_] | DataFrame:
58
- if isinstance(runs, list):
59
- return filter_runs_list(runs, config)
151
+ Returns:
152
+ The run with the earliest start time, or None if no runs match the criteria.
153
+ """
154
+ return get_earliest_run(self.runs, config, **kwargs)
60
155
 
61
- return filter_runs_dataframe(runs, config)
156
+ def get_latest_run(self, config: object | None = None, **kwargs) -> Run | None:
157
+ """
158
+ Get the latest run from the list of runs based on the start time.
62
159
 
160
+ Args:
161
+ config: The configuration object to filter the runs.
162
+ If None, no filtering is applied.
163
+ **kwargs: Additional key-value pairs to filter the runs.
63
164
 
64
- def _is_equal(run: Run_, key: str, value: Any) -> bool:
65
- param = run.data.params.get(key, value)
165
+ Returns:
166
+ The run with the latest start time, or None if no runs match the criteria.
167
+ """
168
+ return get_latest_run(self.runs, config, **kwargs)
66
169
 
67
- if param is None:
68
- return False
170
+ def get_param_names(self) -> list[str]:
171
+ """
172
+ Get the parameter names from the runs.
69
173
 
70
- return type(value)(param) == value
174
+ This method extracts the unique parameter names from the provided list of runs.
175
+ It iterates through each run and collects the parameter names into a set to
176
+ ensure uniqueness.
71
177
 
178
+ Returns:
179
+ A list of unique parameter names.
180
+ """
181
+ return get_param_names(self.runs)
72
182
 
73
- def filter_runs_list(runs: list[Run_], config: object) -> list[Run_]:
74
- for key, value in iter_params(config):
183
+ def get_param_dict(self) -> dict[str, list[str]]:
184
+ """
185
+ Get the parameter dictionary from the list of runs.
186
+
187
+ This method extracts the parameter names and their corresponding values
188
+ from the provided list of runs. It iterates through each run and collects
189
+ the parameter values into a dictionary where the keys are parameter names
190
+ and the values are lists of parameter values.
191
+
192
+ Returns:
193
+ A dictionary where the keys are parameter names and the values are lists
194
+ of parameter values.
195
+ """
196
+ return get_param_dict(self.runs)
197
+
198
+
199
+ def filter_runs(runs: list[Run], config: object, **kwargs) -> list[Run]:
200
+ """
201
+ Filter the runs based on the provided configuration.
202
+
203
+ This method filters the runs in the collection according to the
204
+ specified configuration object. The configuration object should
205
+ contain key-value pairs that correspond to the parameters of the
206
+ runs. Only the runs that match all the specified parameters will
207
+ be included in the returned list of runs.
208
+
209
+ Args:
210
+ runs: The runs to filter.
211
+ config: The configuration object to filter the runs.
212
+ **kwargs: Additional key-value pairs to filter the runs.
213
+
214
+ Returns:
215
+ A filtered list of runs.
216
+ """
217
+ for key, value in chain(iter_params(config), kwargs.items()):
75
218
  runs = [run for run in runs if _is_equal(run, key, value)]
76
219
 
220
+ if len(runs) == 0:
221
+ return []
222
+
77
223
  return runs
78
224
 
79
225
 
80
- def filter_runs_dataframe(runs: DataFrame, config: object) -> DataFrame:
81
- index = np.ones(len(runs), dtype=bool)
226
+ def _is_equal(run: Run, key: str, value: Any) -> bool:
227
+ param = run.data.params.get(key, value)
82
228
 
83
- for key, value in iter_params(config):
84
- name = f"params.{key}"
229
+ if param is None:
230
+ return False
231
+
232
+ return type(value)(param) == value
233
+
234
+
235
+ def get_run(runs: list[Run], config: object, **kwargs) -> Run | None:
236
+ """
237
+ Retrieve a specific run based on the provided configuration.
85
238
 
86
- if name in runs:
87
- series = runs[name]
88
- is_value = -series.isna()
89
- param = series.fillna(value).astype(type(value))
90
- index &= is_value & (param == value)
239
+ This method filters the runs in the collection according to the
240
+ specified configuration object and returns the run that matches
241
+ the provided parameters. If more than one run matches the criteria,
242
+ a `ValueError` is raised.
91
243
 
92
- return runs[index]
244
+ Args:
245
+ runs: The runs to filter.
246
+ config: The configuration object to identify the run.
247
+ **kwargs: Additional key-value pairs to filter the runs.
93
248
 
249
+ Returns:
250
+ The run object that matches the provided configuration, or None
251
+ if no runs match the criteria.
94
252
 
95
- def get_run(runs: list[Run_] | DataFrame, config: object) -> Run_ | Series:
96
- runs = filter_runs(runs, config)
253
+ Raises:
254
+ ValueError: If more than one run matches the criteria.
255
+ """
256
+ runs = filter_runs(runs, config, **kwargs)
257
+
258
+ if len(runs) == 0:
259
+ return None
97
260
 
98
261
  if len(runs) == 1:
99
- return runs[0] if isinstance(runs, list) else runs.iloc[0]
262
+ return runs[0]
100
263
 
101
- msg = f"number of filtered runs is not 1: got {len(runs)}"
264
+ msg = f"Multiple runs were filtered. Expected number of runs is 1, but found {len(runs)} runs."
102
265
  raise ValueError(msg)
103
266
 
104
267
 
105
- def drop_unique_params(runs: DataFrame) -> DataFrame:
106
- def select(column: str) -> bool:
107
- return not column.startswith("params.") or len(runs[column].unique()) > 1
268
+ def get_earliest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
269
+ """
270
+ Get the earliest run from the list of runs based on the start time.
108
271
 
109
- columns = [select(column) for column in runs.columns]
110
- return runs.iloc[:, columns]
272
+ This method filters the runs based on the configuration if provided
273
+ and returns the run with the earliest start time.
111
274
 
275
+ Args:
276
+ runs: The list of runs.
277
+ config: The configuration object to filter the runs.
278
+ If None, no filtering is applied.
279
+ **kwargs: Additional key-value pairs to filter the runs.
112
280
 
113
- def get_param_names(runs: DataFrame) -> list[str]:
114
- def get_name(column: str) -> str:
115
- if column.startswith("params."):
116
- return column.split(".", maxsplit=1)[-1]
281
+ Returns:
282
+ The run with the earliest start time, or None if no runs match the criteria.
283
+ """
284
+ if config is not None or kwargs:
285
+ runs = filter_runs(runs, config or {}, **kwargs)
117
286
 
118
- return ""
287
+ return min(runs, key=lambda run: run.info.start_time, default=None)
119
288
 
120
- columns = [get_name(column) for column in runs.columns]
121
- return [column for column in columns if column]
122
289
 
290
+ def get_latest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
291
+ """
292
+ Get the latest run from the list of runs based on the start time.
123
293
 
124
- def get_param_dict(runs: DataFrame) -> dict[str, list[str]]:
125
- params = {}
126
- for name in get_param_names(runs):
127
- params[name] = list(runs[f"params.{name}"].unique())
294
+ This method filters the runs based on the configuration if provided
295
+ and returns the run with the latest start time.
128
296
 
129
- return params
297
+ Args:
298
+ runs: The list of runs.
299
+ config: The configuration object to filter the runs.
300
+ If None, no filtering is applied.
301
+ **kwargs: Additional key-value pairs to filter the runs.
130
302
 
303
+ Returns:
304
+ The run with the latest start time, or None if no runs match the criteria.
305
+ """
306
+ if config is not None or kwargs:
307
+ runs = filter_runs(runs, config or {}, **kwargs)
131
308
 
132
- @dataclass
133
- class Run:
134
- run: Run_ | Series | str
309
+ return max(runs, key=lambda run: run.info.start_time, default=None)
135
310
 
136
- def __repr__(self) -> str:
137
- return f"{self.__class__.__name__}({self.run_id!r})"
138
311
 
139
- @property
140
- def run_id(self) -> str:
141
- return get_run_id(self.run)
312
+ def get_param_names(runs: list[Run]) -> list[str]:
313
+ """
314
+ Get the parameter names from the runs.
142
315
 
143
- def artifact_uri(self, artifact_path: str | None = None) -> str:
144
- return get_artifact_uri(self.run, artifact_path)
316
+ This method extracts the unique parameter names from the provided list of runs.
317
+ It iterates through each run and collects the parameter names into a set to
318
+ ensure uniqueness.
145
319
 
146
- @property
147
- def artifact_dir(self) -> Path:
148
- return get_artifact_dir(self.run)
320
+ Args:
321
+ runs: The list of runs from which to extract parameter names.
149
322
 
150
- def artifact_path(self, artifact_path: str | None = None) -> Path:
151
- return get_artifact_path(self.run, artifact_path)
323
+ Returns:
324
+ A list of unique parameter names.
325
+ """
326
+ param_names = set()
152
327
 
153
- @property
154
- def config(self) -> DictConfig:
155
- return load_config(self.run)
328
+ for run in runs:
329
+ for param in run.data.params.keys():
330
+ param_names.add(param)
156
331
 
157
- def log_hydra_output_dir(self) -> None:
158
- log_hydra_output_dir(self.run)
332
+ return list(param_names)
159
333
 
160
334
 
161
- def get_run_id(run: Run_ | Series | str) -> str:
162
- if isinstance(run, str):
163
- return run
335
+ def get_param_dict(runs: list[Run]) -> dict[str, list[str]]:
336
+ """
337
+ Get the parameter dictionary from the list of runs.
164
338
 
165
- if isinstance(run, Run_):
166
- return run.info.run_id
339
+ This method extracts the parameter names and their corresponding values
340
+ from the provided list of runs. It iterates through each run and collects
341
+ the parameter values into a dictionary where the keys are parameter names
342
+ and the values are lists of parameter values.
167
343
 
168
- return run.run_id
344
+ Args:
345
+ runs: The list of runs from which to extract parameter names and values.
169
346
 
347
+ Returns:
348
+ A dictionary where the keys are parameter names and the values are lists
349
+ of parameter values.
350
+ """
351
+ params = {}
170
352
 
171
- def get_artifact_uri(run: Run_ | Series | str, artifact_path: str | None = None) -> str:
172
- run_id = get_run_id(run)
173
- return artifact_utils.get_artifact_uri(run_id, artifact_path)
353
+ for name in get_param_names(runs):
354
+ it = (run.data.params[name] for run in runs if name in run.data.params)
355
+ params[name] = sorted(set(it))
174
356
 
357
+ return params
175
358
 
176
- def get_artifact_dir(run: Run_ | Series | str) -> Path:
177
- uri = get_artifact_uri(run)
178
- return uri_to_path(uri)
179
359
 
360
+ def load_config(run: Run) -> DictConfig:
361
+ """
362
+ Load the configuration for a given run.
180
363
 
181
- def get_artifact_path(run: Run_ | Series | str, artifact_path: str | None = None) -> Path:
182
- artifact_dir = get_artifact_dir(run)
183
- return artifact_dir / artifact_path if artifact_path else artifact_dir
364
+ This function loads the configuration for the provided Run instance
365
+ by downloading the configuration file from the MLflow artifacts and
366
+ loading it using OmegaConf.
184
367
 
368
+ Args:
369
+ run: The Run instance to load the configuration for.
185
370
 
186
- def load_config(run: Run_ | Series | str) -> DictConfig:
187
- run_id = get_run_id(run)
371
+ Returns:
372
+ The loaded configuration.
373
+ """
374
+ run_id = run.info.run_id
188
375
  return _load_config(run_id)
189
376
 
190
377
 
@@ -201,17 +388,35 @@ def _load_config(run_id: str) -> DictConfig:
201
388
  return OmegaConf.load(path) # type: ignore
202
389
 
203
390
 
204
- def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
205
- path = get_artifact_dir(run) / ".hydra/hydra.yaml"
391
+ # def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
392
+ # """
393
+ # Get the Hydra output directory.
394
+
395
+ # Args:
396
+ # run: The run object.
397
+
398
+ # Returns:
399
+ # Path: The Hydra output directory.
400
+ # """
401
+ # path = get_artifact_dir(run) / ".hydra/hydra.yaml"
402
+
403
+ # if path.exists():
404
+ # hc = OmegaConf.load(path)
405
+ # return Path(hc.hydra.runtime.output_dir)
406
+
407
+ # raise FileNotFoundError
206
408
 
207
- if path.exists():
208
- hc = OmegaConf.load(path)
209
- return Path(hc.hydra.runtime.output_dir)
210
409
 
211
- raise FileNotFoundError
410
+ # def log_hydra_output_dir(run: Run_ | Series | str) -> None:
411
+ # """
412
+ # Log the Hydra output directory.
212
413
 
414
+ # Args:
415
+ # run: The run object.
213
416
 
214
- def log_hydra_output_dir(run: Run_ | Series | str) -> None:
215
- output_dir = get_hydra_output_dir(run)
216
- run_id = run if isinstance(run, str) else run.info.run_id
217
- mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
417
+ # Returns:
418
+ # None
419
+ # """
420
+ # output_dir = get_hydra_output_dir(run)
421
+ # run_id = run if isinstance(run, str) else run.info.run_id
422
+ # mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
@@ -0,0 +1,111 @@
1
+ Metadata-Version: 2.3
2
+ Name: hydraflow
3
+ Version: 0.2.0
4
+ Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
5
+ Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
+ Project-URL: Source, https://github.com/daizutabi/hydraflow
7
+ Project-URL: Issues, https://github.com/daizutabi/hydraflow/issues
8
+ Author-email: daizutabi <daizutabi@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Programming Language :: Python
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Documentation
17
+ Classifier: Topic :: Software Development :: Documentation
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: hydra-core>1.3
20
+ Requires-Dist: mlflow>2.15
21
+ Requires-Dist: setuptools
22
+ Requires-Dist: watchdog
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-clarity; extra == 'dev'
25
+ Requires-Dist: pytest-cov; extra == 'dev'
26
+ Requires-Dist: pytest-randomly; extra == 'dev'
27
+ Requires-Dist: pytest-xdist; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Hydraflow
31
+
32
+ [![PyPI Version][pypi-v-image]][pypi-v-link]
33
+ [![Python Version][python-v-image]][python-v-link]
34
+ [![Build Status][GHAction-image]][GHAction-link]
35
+ [![Coverage Status][codecov-image]][codecov-link]
36
+
37
+ <!-- Badges -->
38
+ [pypi-v-image]: https://img.shields.io/pypi/v/hydraflow.svg
39
+ [pypi-v-link]: https://pypi.org/project/hydraflow/
40
+ [python-v-image]: https://img.shields.io/pypi/pyversions/hydraflow.svg
41
+ [python-v-link]: https://pypi.org/project/hydraflow
42
+ [GHAction-image]: https://github.com/daizutabi/hydraflow/actions/workflows/ci.yml/badge.svg?branch=main&event=push
43
+ [GHAction-link]: https://github.com/daizutabi/hydraflow/actions?query=event%3Apush+branch%3Amain
44
+ [codecov-image]: https://codecov.io/github/daizutabi/hydraflow/coverage.svg?branch=main
45
+ [codecov-link]: https://codecov.io/github/daizutabi/hydraflow?branch=main
46
+
47
+ ## Overview
48
+
49
+ Hydraflow is a powerful library designed to seamlessly integrate [Hydra](https://hydra.cc/) and [MLflow](https://mlflow.org/), making it easier to manage and track machine learning experiments. By combining the flexibility of Hydra's configuration management with the robust experiment tracking capabilities of MLflow, Hydraflow provides a comprehensive solution for managing complex machine learning workflows.
50
+
51
+ ## Key Features
52
+
53
+ - **Configuration Management**: Utilize Hydra's advanced configuration management to handle complex parameter sweeps and experiment setups.
54
+ - **Experiment Tracking**: Leverage MLflow's tracking capabilities to log parameters, metrics, and artifacts for each run.
55
+ - **Artifact Management**: Automatically log and manage artifacts, such as model checkpoints and configuration files, with MLflow.
56
+ - **Seamless Integration**: Easily integrate Hydra and MLflow in your machine learning projects with minimal setup.
57
+
58
+ ## Installation
59
+
60
+ You can install Hydraflow via pip:
61
+
62
+ ```bash
63
+ pip install hydraflow
64
+ ```
65
+
66
+ ## Getting Started
67
+
68
+ Here is a simple example to get you started with Hydraflow:
69
+
70
+ ```python
71
+ import hydra
72
+ import hydraflow
73
+ import mlflow
74
+ from dataclasses import dataclass
75
+ from hydra.core.config_store import ConfigStore
76
+ from pathlib import Path
77
+
78
+ @dataclass
79
+ class MySQLConfig:
80
+ host: str = "localhost"
81
+ port: int = 3306
82
+
83
+ cs = ConfigStore.instance()
84
+ cs.store(name="config", node=MySQLConfig)
85
+
86
+ @hydra.main(version_base=None, config_name="config")
87
+ def my_app(cfg: MySQLConfig) -> None:
88
+ # Set the experiment using the Hydra job name.
89
+ hydraflow.set_experiment()
90
+
91
+ # Automatically log parameters from the Hydra config.
92
+ with mlflow.start_run(), hydraflow.log_run(cfg) as info:
93
+ # Your app code below.
94
+
95
+ # `info.output_dir` is the Hydra output directory.
96
+ # `info.artifact_dir` is the MLflow artifact directory.
97
+
98
+ with hydraflow.watch(callback):
99
+ # Watch files in the MLflow artifact directory.
100
+ # You can update metrics or log other artifacts
101
+ # based on the watched files in your callback
102
+ # function.
103
+ pass
104
+
105
+ # Your callback function here.
106
+ def callback(file: Path) -> None:
107
+ pass
108
+
109
+ if __name__ == "__main__":
110
+ my_app()
111
+ ```
@@ -0,0 +1,9 @@
1
+ hydraflow/__init__.py,sha256=PzziOG9RnGAVbl9Yz4ScvsL8nfkjsuN0alMKRvZT-_Y,442
2
+ hydraflow/config.py,sha256=BcyOYvdiqSCsmUMA_EvnWPXuW0fC5cT-Q2ilBk9-5gc,1863
3
+ hydraflow/context.py,sha256=MqkEhKEZL_N3eb3v5u9D4EqKkiSmiPyXXafhPkALRlg,5129
4
+ hydraflow/mlflow.py,sha256=_Los9E38eG8sTiN8bGwZmvjCrS0S-wSGiA4fyhQM3Zw,2251
5
+ hydraflow/runs.py,sha256=NT7IzE-Pf7T2Ey-eWEPZzQQaX4Gt_RKDKSn2pj2yzGc,14304
6
+ hydraflow-0.2.0.dist-info/METADATA,sha256=dfQ2_-Nk79yVazy5BHasYK681kiG1z-_i4VxWT8fJjg,4224
7
+ hydraflow-0.2.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
8
+ hydraflow-0.2.0.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
9
+ hydraflow-0.2.0.dist-info/RECORD,,
hydraflow/util.py DELETED
@@ -1,11 +0,0 @@
1
- import platform
2
- from pathlib import Path
3
- from urllib.parse import urlparse
4
-
5
-
6
- def uri_to_path(uri: str) -> Path:
7
- path = urlparse(uri).path
8
- if platform.system() == "Windows" and path.startswith("/"):
9
- path = path[1:]
10
-
11
- return Path(path)
@@ -1,45 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: hydraflow
3
- Version: 0.1.4
4
- Summary: Hydra with MLflow
5
- Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
- Project-URL: Source, https://github.com/daizutabi/hydraflow
7
- Project-URL: Issues, https://github.com/daizutabi/hydraflow/issues
8
- Author-email: daizutabi <daizutabi@gmail.com>
9
- License-Expression: MIT
10
- License-File: LICENSE
11
- Classifier: Development Status :: 4 - Beta
12
- Classifier: Programming Language :: Python
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Topic :: Documentation
17
- Classifier: Topic :: Software Development :: Documentation
18
- Requires-Python: >=3.10
19
- Requires-Dist: hydra-core>1.3
20
- Requires-Dist: mlflow>2.15
21
- Requires-Dist: setuptools
22
- Requires-Dist: watchdog
23
- Provides-Extra: dev
24
- Requires-Dist: pytest-clarity; extra == 'dev'
25
- Requires-Dist: pytest-cov; extra == 'dev'
26
- Requires-Dist: pytest-randomly; extra == 'dev'
27
- Requires-Dist: pytest-xdist; extra == 'dev'
28
- Description-Content-Type: text/markdown
29
-
30
- # hydraflow
31
-
32
- [![PyPI Version][pypi-v-image]][pypi-v-link]
33
- [![Python Version][python-v-image]][python-v-link]
34
- [![Build Status][GHAction-image]][GHAction-link]
35
- [![Coverage Status][codecov-image]][codecov-link]
36
-
37
- <!-- Badges -->
38
- [pypi-v-image]: https://img.shields.io/pypi/v/hydraflow.svg
39
- [pypi-v-link]: https://pypi.org/project/hydraflow/
40
- [python-v-image]: https://img.shields.io/pypi/pyversions/hydraflow.svg
41
- [python-v-link]: https://pypi.org/project/hydraflow
42
- [GHAction-image]: https://github.com/daizutabi/hydraflow/actions/workflows/ci.yml/badge.svg?branch=main&event=push
43
- [GHAction-link]: https://github.com/daizutabi/hydraflow/actions?query=event%3Apush+branch%3Amain
44
- [codecov-image]: https://codecov.io/github/daizutabi/hydraflow/coverage.svg?branch=main
45
- [codecov-link]: https://codecov.io/github/daizutabi/hydraflow?branch=main
@@ -1,10 +0,0 @@
1
- hydraflow/__init__.py,sha256=e1Q0Sskx39jaU2zkGNXjFWNC5xugEz_hDERTN_6Mzy8,666
2
- hydraflow/config.py,sha256=b3Plh_lmq94loZNw9QP2asd6thCLyTzzYSutH0cONXA,964
3
- hydraflow/context.py,sha256=3vejDbRYQBuBwlhpBpOv5aoyZ-yS8UUzpbCFK1V1uvw,2720
4
- hydraflow/mlflow.py,sha256=unBP3Y7ujTM3E_Hq_eYvRVFZoGfTA7B0h4FkOZtPPqc,566
5
- hydraflow/runs.py,sha256=127YykWzmiNUUuJSGPOCZasXmd6tcE15HU32j8x71ck,5864
6
- hydraflow/util.py,sha256=_BdOMq5tKPm8HOehb2s2ZIBpJYyVpvO_yaAIxbSj51I,253
7
- hydraflow-0.1.4.dist-info/METADATA,sha256=Xw-xcDKdzkHa7bKDZUI6MXpOKekcyFbMyBy1yANjNQs,1903
8
- hydraflow-0.1.4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
9
- hydraflow-0.1.4.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
10
- hydraflow-0.1.4.dist-info/RECORD,,