hydraflow 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
hydraflow/__init__.py CHANGED
@@ -3,15 +3,10 @@ from .mlflow import set_experiment
3
3
  from .runs import (
4
4
  Run,
5
5
  Runs,
6
- drop_unique_params,
7
6
  filter_runs,
8
- get_artifact_dir,
9
- get_artifact_path,
10
- get_artifact_uri,
11
7
  get_param_dict,
12
8
  get_param_names,
13
9
  get_run,
14
- get_run_id,
15
10
  load_config,
16
11
  )
17
12
 
@@ -20,15 +15,10 @@ __all__ = [
20
15
  "Run",
21
16
  "Runs",
22
17
  "chdir_artifact",
23
- "drop_unique_params",
24
18
  "filter_runs",
25
- "get_artifact_dir",
26
- "get_artifact_path",
27
- "get_artifact_uri",
28
19
  "get_param_dict",
29
20
  "get_param_names",
30
21
  "get_run",
31
- "get_run_id",
32
22
  "load_config",
33
23
  "log_run",
34
24
  "set_experiment",
hydraflow/config.py CHANGED
@@ -1,3 +1,8 @@
1
+ """
2
+ This module provides functionality for working with configuration
3
+ objects using the OmegaConf library.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
3
8
  from typing import TYPE_CHECKING
@@ -10,12 +15,32 @@ if TYPE_CHECKING:
10
15
 
11
16
 
12
17
  def iter_params(config: object, prefix: str = "") -> Iterator[tuple[str, Any]]:
13
- if not isinstance(config, DictConfig | ListConfig):
18
+ """
19
+ Recursively iterate over the parameters in the given configuration object.
20
+
21
+ This function traverses the configuration object and yields key-value pairs
22
+ representing the parameters. The keys are prefixed with the provided prefix.
23
+
24
+ Args:
25
+ config: The configuration object to iterate over. This can be a dictionary,
26
+ list, DictConfig, or ListConfig.
27
+ prefix: The prefix to prepend to the parameter keys.
28
+ Defaults to an empty string.
29
+
30
+ Yields:
31
+ Key-value pairs representing the parameters in the configuration object.
32
+ """
33
+ if not isinstance(config, (DictConfig, ListConfig)):
14
34
  config = OmegaConf.create(config) # type: ignore
15
35
 
16
36
  if isinstance(config, DictConfig):
17
37
  for key, value in config.items():
18
- if isinstance(value, (DictConfig, ListConfig)):
38
+ if isinstance(value, ListConfig) and not any(
39
+ isinstance(v, (DictConfig, ListConfig)) for v in value
40
+ ):
41
+ yield f"{prefix}{key}", value
42
+
43
+ elif isinstance(value, (DictConfig, ListConfig)):
19
44
  yield from iter_params(value, f"{prefix}{key}.")
20
45
 
21
46
  else:
hydraflow/context.py CHANGED
@@ -1,5 +1,11 @@
1
+ """
2
+ This module provides context managers to log parameters and manage the MLflow
3
+ run context.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
8
+ import logging
3
9
  import os
4
10
  import time
5
11
  from contextlib import contextmanager
@@ -12,15 +18,14 @@ from hydra.core.hydra_config import HydraConfig
12
18
  from watchdog.events import FileModifiedEvent, FileSystemEventHandler
13
19
  from watchdog.observers import Observer
14
20
 
15
- from hydraflow.mlflow import log_params
16
- from hydraflow.runs import get_artifact_path
17
- from hydraflow.util import uri_to_path
21
+ from hydraflow.mlflow import get_artifact_dir, log_params
18
22
 
19
23
  if TYPE_CHECKING:
20
24
  from collections.abc import Callable, Iterator
21
25
 
22
26
  from mlflow.entities.run import Run
23
- from pandas import Series
27
+
28
+ log = logging.getLogger(__name__)
24
29
 
25
30
 
26
31
  @dataclass
@@ -35,12 +40,33 @@ def log_run(
35
40
  *,
36
41
  synchronous: bool | None = None,
37
42
  ) -> Iterator[Info]:
43
+ """
44
+ Log the parameters from the given configuration object and manage the MLflow
45
+ run context.
46
+
47
+ This context manager logs the parameters from the provided configuration object
48
+ using MLflow. It also manages the MLflow run context, ensuring that artifacts
49
+ are logged and the run is properly closed.
50
+
51
+ Args:
52
+ config: The configuration object to log the parameters from.
53
+ synchronous: Whether to log the parameters synchronously.
54
+ Defaults to None.
55
+
56
+ Yields:
57
+ Info: An `Info` object containing the output directory and artifact directory
58
+ paths.
59
+
60
+ Example:
61
+ with log_run(config) as info:
62
+ # Perform operations within the MLflow run context
63
+ pass
64
+ """
38
65
  log_params(config, synchronous=synchronous)
39
66
 
40
67
  hc = HydraConfig.get()
41
68
  output_dir = Path(hc.runtime.output_dir)
42
- uri = mlflow.get_artifact_uri()
43
- info = Info(output_dir, uri_to_path(uri))
69
+ info = Info(output_dir, get_artifact_dir())
44
70
 
45
71
  # Save '.hydra' config directory first.
46
72
  output_subdir = output_dir / (hc.output_subdir or "")
@@ -54,16 +80,48 @@ def log_run(
54
80
  with watch(log_artifact, output_dir):
55
81
  yield info
56
82
 
83
+ except Exception as e:
84
+ log.error(f"Error during log_run: {e}")
85
+ raise
86
+
57
87
  finally:
58
88
  # Save output_dir including '.hydra' config directory.
59
89
  mlflow.log_artifacts(output_dir.as_posix())
60
90
 
61
91
 
62
92
  @contextmanager
63
- def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60) -> Iterator[None]:
64
- if not dir:
65
- uri = mlflow.get_artifact_uri()
66
- dir = uri_to_path(uri)
93
+ def watch(
94
+ func: Callable[[Path], None],
95
+ dir: Path | str = "",
96
+ timeout: int = 60,
97
+ ) -> Iterator[None]:
98
+ """
99
+ Watch the given directory for changes and call the provided function
100
+ when a change is detected.
101
+
102
+ This context manager sets up a file system watcher on the specified directory.
103
+ When a file modification is detected, the provided function is called with
104
+ the path of the modified file. The watcher runs for the specified timeout
105
+ period or until the context is exited.
106
+
107
+ Args:
108
+ func: The function to call when a change is
109
+ detected. It should accept a single argument of type `Path`,
110
+ which is the path of the modified file.
111
+ dir: The directory to watch. If not specified,
112
+ the current MLflow artifact URI is used. Defaults to "".
113
+ timeout: The timeout period in seconds for the watcher
114
+ to run after the context is exited. Defaults to 60.
115
+
116
+ Yields:
117
+ None
118
+
119
+ Example:
120
+ with watch(log_artifact, "/path/to/dir"):
121
+ # Perform operations while watching the directory for changes
122
+ pass
123
+ """
124
+ dir = dir or get_artifact_dir()
67
125
 
68
126
  handler = Handler(func)
69
127
  observer = Observer()
@@ -73,6 +131,10 @@ def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60)
73
131
  try:
74
132
  yield
75
133
 
134
+ except Exception as e:
135
+ log.error(f"Error during watch: {e}")
136
+ raise
137
+
76
138
  finally:
77
139
  elapsed = 0
78
140
  while not observer.event_queue.empty():
@@ -97,15 +159,30 @@ class Handler(FileSystemEventHandler):
97
159
 
98
160
  @contextmanager
99
161
  def chdir_artifact(
100
- run: Run | Series | str,
162
+ run: Run,
101
163
  artifact_path: str | None = None,
102
164
  ) -> Iterator[Path]:
165
+ """
166
+ Change the current working directory to the artifact directory of the
167
+ given run.
168
+
169
+ This context manager changes the current working directory to the artifact
170
+ directory of the given run. It ensures that the directory is changed back
171
+ to the original directory after the context is exited.
172
+
173
+ Args:
174
+ run: The run to get the artifact directory from.
175
+ artifact_path: The artifact path.
176
+ """
103
177
  curdir = Path.cwd()
178
+ path = mlflow.artifacts.download_artifacts(
179
+ run_id=run.info.run_id,
180
+ artifact_path=artifact_path,
181
+ )
104
182
 
105
- artifact_dir = get_artifact_path(run, artifact_path)
106
-
107
- os.chdir(artifact_dir)
183
+ os.chdir(path)
108
184
  try:
109
- yield artifact_dir
185
+ yield Path(path)
186
+
110
187
  finally:
111
188
  os.chdir(curdir)
hydraflow/mlflow.py CHANGED
@@ -1,5 +1,12 @@
1
+ """
2
+ This module provides functionality to log parameters from Hydra
3
+ configuration objects and set up experiments using MLflow.
4
+ """
5
+
1
6
  from __future__ import annotations
2
7
 
8
+ from pathlib import Path
9
+
3
10
  import mlflow
4
11
  from hydra.core.hydra_config import HydraConfig
5
12
 
@@ -7,6 +14,18 @@ from hydraflow.config import iter_params
7
14
 
8
15
 
9
16
  def set_experiment(prefix: str = "", suffix: str = "", uri: str | None = None) -> None:
17
+ """
18
+ Set the experiment name and, optionally, the tracking URI.
19
+
20
+ This function sets the experiment name by combining the given prefix,
21
+ the job name from HydraConfig, and the given suffix. Optionally, it can
22
+ also set the tracking URI.
23
+
24
+ Args:
25
+ prefix: The prefix to prepend to the experiment name.
26
+ suffix: The suffix to append to the experiment name.
27
+ uri: The tracking URI to use.
28
+ """
10
29
  if uri:
11
30
  mlflow.set_tracking_uri(uri)
12
31
 
@@ -16,5 +35,38 @@ def set_experiment(prefix: str = "", suffix: str = "", uri: str | None = None) -
16
35
 
17
36
 
18
37
  def log_params(config: object, *, synchronous: bool | None = None) -> None:
38
+ """
39
+ Log the parameters from the given configuration object.
40
+
41
+ This method logs the parameters from the provided configuration object
42
+ using MLflow. It iterates over the parameters and logs them using the
43
+ `mlflow.log_param` method.
44
+
45
+ Args:
46
+ config: The configuration object to log the parameters from.
47
+ synchronous: Whether to log the parameters synchronously.
48
+ Defaults to None.
49
+ """
19
50
  for key, value in iter_params(config):
20
51
  mlflow.log_param(key, value, synchronous=synchronous)
52
+
53
+
54
+ def get_artifact_dir(artifact_path: str | None = None) -> Path:
55
+ """
56
+ Get the artifact directory for the given artifact path.
57
+
58
+ This function retrieves the artifact URI for the specified artifact path
59
+ using MLflow, downloads the artifacts to a local directory, and returns
60
+ the path to that directory.
61
+
62
+ Args:
63
+ artifact_path: The artifact path for which to get the directory.
64
+ Defaults to None.
65
+
66
+ Returns:
67
+ The local path to the directory where the artifacts are downloaded.
68
+ """
69
+ uri = mlflow.get_artifact_uri(artifact_path)
70
+ dir = mlflow.artifacts.download_artifacts(artifact_uri=uri)
71
+
72
+ return Path(dir)
hydraflow/runs.py CHANGED
@@ -1,27 +1,95 @@
1
+ """
2
+ This module provides functionality for managing and interacting with MLflow runs.
3
+ It includes the `Runs` class and various methods to filter runs, retrieve run information,
4
+ log artifacts, and load configurations.
5
+ """
6
+
1
7
  from __future__ import annotations
2
8
 
3
9
  from dataclasses import dataclass
4
10
  from functools import cache
5
- from pathlib import Path
11
+ from itertools import chain
6
12
  from typing import TYPE_CHECKING, Any
7
13
 
8
14
  import mlflow
9
- import numpy as np
10
- from mlflow.entities.run import Run as Run_
11
- from mlflow.tracking import artifact_utils
15
+ from mlflow.entities import ViewType
16
+ from mlflow.entities.run import Run
17
+ from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
12
18
  from omegaconf import DictConfig, OmegaConf
13
- from pandas import DataFrame, Series
14
19
 
15
20
  from hydraflow.config import iter_params
16
- from hydraflow.util import uri_to_path
17
21
 
18
22
  if TYPE_CHECKING:
19
23
  from typing import Any
20
24
 
21
25
 
26
+ def search_runs(
27
+ experiment_ids: list[str] | None = None,
28
+ filter_string: str = "",
29
+ run_view_type: int = ViewType.ACTIVE_ONLY,
30
+ max_results: int = SEARCH_MAX_RESULTS_PANDAS,
31
+ order_by: list[str] | None = None,
32
+ search_all_experiments: bool = False,
33
+ experiment_names: list[str] | None = None,
34
+ ) -> Runs:
35
+ """
36
+ Search for Runs that fit the specified criteria.
37
+
38
+ This function wraps the `mlflow.search_runs` function and returns the results
39
+ as a `Runs` object. It allows for flexible searching of MLflow runs based on
40
+ various criteria.
41
+
42
+ Args:
43
+ experiment_ids: List of experiment IDs. Search can work with experiment IDs or
44
+ experiment names, but not both in the same call. Values other than
45
+ ``None`` or ``[]`` will result in error if ``experiment_names`` is
46
+ also not ``None`` or ``[]``. ``None`` will default to the active
47
+ experiment if ``experiment_names`` is ``None`` or ``[]``.
48
+ filter_string: Filter query string, defaults to searching all runs.
49
+ run_view_type: one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``, or ``ALL`` runs
50
+ defined in :py:class:`mlflow.entities.ViewType`.
51
+ max_results: The maximum number of runs to put in the dataframe. Default is 100,000
52
+ to avoid causing out-of-memory issues on the user's machine.
53
+ order_by: List of columns to order by (e.g., "metrics.rmse"). The ``order_by`` column
54
+ can contain an optional ``DESC`` or ``ASC`` value. The default is ``ASC``.
55
+ The default ordering is to sort by ``start_time DESC``, then ``run_id``.
56
+ output_format: Not a parameter of this wrapper; ``mlflow.search_runs`` is always
57
+ called with ``output_format="list"``, so a list of :py:class:`mlflow.entities.Run`
58
+ is obtained and wrapped in the returned `Runs` object.
59
+ search_all_experiments: Boolean specifying whether all experiments should be searched.
60
+ Only honored if ``experiment_ids`` is ``[]`` or ``None``.
61
+ experiment_names: List of experiment names. Search can work with experiment IDs or
62
+ experiment names, but not both in the same call. Values other
63
+ than ``None`` or ``[]`` will result in error if ``experiment_ids``
64
+ is also not ``None`` or ``[]``. ``None`` will default to the active
65
+ experiment if ``experiment_ids`` is ``None`` or ``[]``.
66
+
67
+ Returns:
68
+ A `Runs` object containing the search results.
69
+ """
70
+ runs = mlflow.search_runs(
71
+ experiment_ids=experiment_ids,
72
+ filter_string=filter_string,
73
+ run_view_type=run_view_type,
74
+ max_results=max_results,
75
+ order_by=order_by,
76
+ output_format="list",
77
+ search_all_experiments=search_all_experiments,
78
+ experiment_names=experiment_names,
79
+ )
80
+ return Runs(runs) # type: ignore
81
+
82
+
22
83
  @dataclass
23
84
  class Runs:
24
- runs: list[Run_] | DataFrame
85
+ """
86
+ A class to represent a collection of MLflow runs.
87
+
88
+ This class provides methods to interact with the runs, such as filtering,
89
+ retrieving specific runs, and accessing run information.
90
+ """
91
+
92
+ runs: list[Run]
25
93
 
26
94
  def __repr__(self) -> str:
27
95
  return f"{self.__class__.__name__}({len(self)})"
@@ -30,161 +98,280 @@ class Runs:
30
98
  return len(self.runs)
31
99
 
32
100
  def filter(self, config: object) -> Runs:
101
+ """
102
+ Filter the runs based on the provided configuration.
103
+
104
+ This method filters the runs in the collection according to the
105
+ specified configuration object. The configuration object should
106
+ contain key-value pairs that correspond to the parameters of the
107
+ runs. Only the runs that match all the specified parameters will
108
+ be included in the returned `Runs` object.
109
+
110
+ Args:
111
+ config: The configuration object to filter the runs.
112
+
113
+ Returns:
114
+ A new `Runs` object containing the filtered runs.
115
+ """
33
116
  return Runs(filter_runs(self.runs, config))
34
117
 
35
- def get(self, config: object) -> Run:
36
- return Run(get_run(self.runs, config))
118
+ def get(self, config: object) -> Run | None:
119
+ """
120
+ Retrieve a specific run based on the provided configuration.
37
121
 
38
- def drop_unique_params(self) -> Runs:
39
- if isinstance(self.runs, DataFrame):
40
- return Runs(drop_unique_params(self.runs))
122
+ This method filters the runs in the collection according to the
123
+ specified configuration object and returns the run that matches
124
+ the provided parameters. If more than one run matches the criteria,
125
+ a `ValueError` is raised.
41
126
 
42
- raise NotImplementedError
127
+ Args:
128
+ config: The configuration object to identify the run.
43
129
 
44
- def get_param_names(self) -> list[str]:
45
- if isinstance(self.runs, DataFrame):
46
- return get_param_names(self.runs)
130
+ Returns:
131
+ Run: The run object that matches the provided configuration.
132
+ None, if no run matches the provided configuration.
47
133
 
48
- raise NotImplementedError
134
+ Raises:
135
+ ValueError: If more than one run matches the criteria.
136
+ """
137
+ return get_run(self.runs, config)
49
138
 
50
- def get_param_dict(self) -> dict[str, list[str]]:
51
- if isinstance(self.runs, DataFrame):
52
- return get_param_dict(self.runs)
139
+ def get_earliest_run(self, config: object | None = None, **kwargs) -> Run | None:
140
+ """
141
+ Get the earliest run from the list of runs based on the start time.
53
142
 
54
- raise NotImplementedError
143
+ This method filters the runs based on the configuration if provided
144
+ and returns the run with the earliest start time.
55
145
 
146
+ Args:
147
+ config: The configuration object to filter the runs.
148
+ If None, no filtering is applied.
149
+ **kwargs: Additional key-value pairs to filter the runs.
56
150
 
57
- def filter_runs(runs: list[Run_] | DataFrame, config: object) -> list[Run_] | DataFrame:
58
- if isinstance(runs, list):
59
- return filter_runs_list(runs, config)
151
+ Returns:
152
+ The run with the earliest start time, or None if no runs match the criteria.
153
+ """
154
+ return get_earliest_run(self.runs, config, **kwargs)
60
155
 
61
- return filter_runs_dataframe(runs, config)
156
+ def get_latest_run(self, config: object | None = None, **kwargs) -> Run | None:
157
+ """
158
+ Get the latest run from the list of runs based on the start time.
62
159
 
160
+ Args:
161
+ config: The configuration object to filter the runs.
162
+ If None, no filtering is applied.
163
+ **kwargs: Additional key-value pairs to filter the runs.
63
164
 
64
- def _is_equal(run: Run_, key: str, value: Any) -> bool:
65
- param = run.data.params.get(key, value)
165
+ Returns:
166
+ The run with the latest start time, or None if no runs match the criteria.
167
+ """
168
+ return get_latest_run(self.runs, config, **kwargs)
66
169
 
67
- if param is None:
68
- return False
170
+ def get_param_names(self) -> list[str]:
171
+ """
172
+ Get the parameter names from the runs.
69
173
 
70
- return type(value)(param) == value
174
+ This method extracts the unique parameter names from the provided list of runs.
175
+ It iterates through each run and collects the parameter names into a set to
176
+ ensure uniqueness.
71
177
 
178
+ Returns:
179
+ A list of unique parameter names.
180
+ """
181
+ return get_param_names(self.runs)
72
182
 
73
- def filter_runs_list(runs: list[Run_], config: object) -> list[Run_]:
74
- for key, value in iter_params(config):
183
+ def get_param_dict(self) -> dict[str, list[str]]:
184
+ """
185
+ Get the parameter dictionary from the list of runs.
186
+
187
+ This method extracts the parameter names and their corresponding values
188
+ from the provided list of runs. It iterates through each run and collects
189
+ the parameter values into a dictionary where the keys are parameter names
190
+ and the values are lists of parameter values.
191
+
192
+ Returns:
193
+ A dictionary where the keys are parameter names and the values are lists
194
+ of parameter values.
195
+ """
196
+ return get_param_dict(self.runs)
197
+
198
+
199
+ def filter_runs(runs: list[Run], config: object, **kwargs) -> list[Run]:
200
+ """
201
+ Filter the runs based on the provided configuration.
202
+
203
+ This method filters the runs in the collection according to the
204
+ specified configuration object. The configuration object should
205
+ contain key-value pairs that correspond to the parameters of the
206
+ runs. Only the runs that match all the specified parameters will
207
+ be included in the returned list of runs.
208
+
209
+ Args:
210
+ runs: The runs to filter.
211
+ config: The configuration object to filter the runs.
212
+ **kwargs: Additional key-value pairs to filter the runs.
213
+
214
+ Returns:
215
+ A filtered list of runs.
216
+ """
217
+ for key, value in chain(iter_params(config), kwargs.items()):
75
218
  runs = [run for run in runs if _is_equal(run, key, value)]
76
219
 
220
+ if len(runs) == 0:
221
+ return []
222
+
77
223
  return runs
78
224
 
79
225
 
80
- def filter_runs_dataframe(runs: DataFrame, config: object) -> DataFrame:
81
- index = np.ones(len(runs), dtype=bool)
226
+ def _is_equal(run: Run, key: str, value: Any) -> bool:
227
+ param = run.data.params.get(key, value)
82
228
 
83
- for key, value in iter_params(config):
84
- name = f"params.{key}"
229
+ if param is None:
230
+ return False
231
+
232
+ return type(value)(param) == value
233
+
234
+
235
+ def get_run(runs: list[Run], config: object, **kwargs) -> Run | None:
236
+ """
237
+ Retrieve a specific run based on the provided configuration.
85
238
 
86
- if name in runs:
87
- series = runs[name]
88
- is_value = -series.isna()
89
- param = series.fillna(value).astype(type(value))
90
- index &= is_value & (param == value)
239
+ This method filters the runs in the collection according to the
240
+ specified configuration object and returns the run that matches
241
+ the provided parameters. If more than one run matches the criteria,
242
+ a `ValueError` is raised.
91
243
 
92
- return runs[index]
244
+ Args:
245
+ runs: The runs to filter.
246
+ config: The configuration object to identify the run.
247
+ **kwargs: Additional key-value pairs to filter the runs.
93
248
 
249
+ Returns:
250
+ The run object that matches the provided configuration, or None
251
+ if no runs match the criteria.
94
252
 
95
- def get_run(runs: list[Run_] | DataFrame, config: object) -> Run_ | Series:
96
- runs = filter_runs(runs, config)
253
+ Raises:
254
+ ValueError: If more than one run matches the criteria.
255
+ """
256
+ runs = filter_runs(runs, config, **kwargs)
257
+
258
+ if len(runs) == 0:
259
+ return None
97
260
 
98
261
  if len(runs) == 1:
99
- return runs[0] if isinstance(runs, list) else runs.iloc[0]
262
+ return runs[0]
100
263
 
101
- msg = f"number of filtered runs is not 1: got {len(runs)}"
264
+ msg = f"Multiple runs were filtered. Expected number of runs is 1, but found {len(runs)} runs."
102
265
  raise ValueError(msg)
103
266
 
104
267
 
105
- def drop_unique_params(runs: DataFrame) -> DataFrame:
106
- def select(column: str) -> bool:
107
- return not column.startswith("params.") or len(runs[column].unique()) > 1
268
+ def get_earliest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
269
+ """
270
+ Get the earliest run from the list of runs based on the start time.
108
271
 
109
- columns = [select(column) for column in runs.columns]
110
- return runs.iloc[:, columns]
272
+ This method filters the runs based on the configuration if provided
273
+ and returns the run with the earliest start time.
111
274
 
275
+ Args:
276
+ runs: The list of runs.
277
+ config: The configuration object to filter the runs.
278
+ If None, no filtering is applied.
279
+ **kwargs: Additional key-value pairs to filter the runs.
112
280
 
113
- def get_param_names(runs: DataFrame) -> list[str]:
114
- def get_name(column: str) -> str:
115
- if column.startswith("params."):
116
- return column.split(".", maxsplit=1)[-1]
281
+ Returns:
282
+ The run with the earliest start time, or None if no runs match the criteria.
283
+ """
284
+ if config is not None or kwargs:
285
+ runs = filter_runs(runs, config or {}, **kwargs)
117
286
 
118
- return ""
287
+ return min(runs, key=lambda run: run.info.start_time, default=None)
119
288
 
120
- columns = [get_name(column) for column in runs.columns]
121
- return [column for column in columns if column]
122
289
 
290
+ def get_latest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
291
+ """
292
+ Get the latest run from the list of runs based on the start time.
123
293
 
124
- def get_param_dict(runs: DataFrame) -> dict[str, list[str]]:
125
- params = {}
126
- for name in get_param_names(runs):
127
- params[name] = list(runs[f"params.{name}"].unique())
294
+ This method filters the runs based on the configuration if provided
295
+ and returns the run with the latest start time.
128
296
 
129
- return params
297
+ Args:
298
+ runs: The list of runs.
299
+ config: The configuration object to filter the runs.
300
+ If None, no filtering is applied.
301
+ **kwargs: Additional key-value pairs to filter the runs.
130
302
 
303
+ Returns:
304
+ The run with the latest start time, or None if no runs match the criteria.
305
+ """
306
+ if config is not None or kwargs:
307
+ runs = filter_runs(runs, config or {}, **kwargs)
131
308
 
132
- @dataclass
133
- class Run:
134
- run: Run_ | Series | str
309
+ return max(runs, key=lambda run: run.info.start_time, default=None)
135
310
 
136
- def __repr__(self) -> str:
137
- return f"{self.__class__.__name__}({self.run_id!r})"
138
311
 
139
- @property
140
- def run_id(self) -> str:
141
- return get_run_id(self.run)
312
+ def get_param_names(runs: list[Run]) -> list[str]:
313
+ """
314
+ Get the parameter names from the runs.
142
315
 
143
- def artifact_uri(self, artifact_path: str | None = None) -> str:
144
- return get_artifact_uri(self.run, artifact_path)
316
+ This method extracts the unique parameter names from the provided list of runs.
317
+ It iterates through each run and collects the parameter names into a set to
318
+ ensure uniqueness.
145
319
 
146
- @property
147
- def artifact_dir(self) -> Path:
148
- return get_artifact_dir(self.run)
320
+ Args:
321
+ runs: The list of runs from which to extract parameter names.
149
322
 
150
- def artifact_path(self, artifact_path: str | None = None) -> Path:
151
- return get_artifact_path(self.run, artifact_path)
323
+ Returns:
324
+ A list of unique parameter names.
325
+ """
326
+ param_names = set()
152
327
 
153
- @property
154
- def config(self) -> DictConfig:
155
- return load_config(self.run)
328
+ for run in runs:
329
+ for param in run.data.params.keys():
330
+ param_names.add(param)
156
331
 
157
- def log_hydra_output_dir(self) -> None:
158
- log_hydra_output_dir(self.run)
332
+ return list(param_names)
159
333
 
160
334
 
161
- def get_run_id(run: Run_ | Series | str) -> str:
162
- if isinstance(run, str):
163
- return run
335
+ def get_param_dict(runs: list[Run]) -> dict[str, list[str]]:
336
+ """
337
+ Get the parameter dictionary from the list of runs.
164
338
 
165
- if isinstance(run, Run_):
166
- return run.info.run_id
339
+ This method extracts the parameter names and their corresponding values
340
+ from the provided list of runs. It iterates through each run and collects
341
+ the parameter values into a dictionary where the keys are parameter names
342
+ and the values are lists of parameter values.
167
343
 
168
- return run.run_id
344
+ Args:
345
+ runs: The list of runs from which to extract parameter names and values.
169
346
 
347
+ Returns:
348
+ A dictionary where the keys are parameter names and the values are lists
349
+ of parameter values.
350
+ """
351
+ params = {}
170
352
 
171
- def get_artifact_uri(run: Run_ | Series | str, artifact_path: str | None = None) -> str:
172
- run_id = get_run_id(run)
173
- return artifact_utils.get_artifact_uri(run_id, artifact_path)
353
+ for name in get_param_names(runs):
354
+ it = (run.data.params[name] for run in runs if name in run.data.params)
355
+ params[name] = sorted(set(it))
174
356
 
357
+ return params
175
358
 
176
- def get_artifact_dir(run: Run_ | Series | str) -> Path:
177
- uri = get_artifact_uri(run)
178
- return uri_to_path(uri)
179
359
 
360
+ def load_config(run: Run) -> DictConfig:
361
+ """
362
+ Load the configuration for a given run.
180
363
 
181
- def get_artifact_path(run: Run_ | Series | str, artifact_path: str | None = None) -> Path:
182
- artifact_dir = get_artifact_dir(run)
183
- return artifact_dir / artifact_path if artifact_path else artifact_dir
364
+ This function loads the configuration for the provided Run instance
365
+ by downloading the configuration file from the MLflow artifacts and
366
+ loading it using OmegaConf.
184
367
 
368
+ Args:
369
+ run: The Run instance to load the configuration for.
185
370
 
186
- def load_config(run: Run_ | Series | str) -> DictConfig:
187
- run_id = get_run_id(run)
371
+ Returns:
372
+ The loaded configuration.
373
+ """
374
+ run_id = run.info.run_id
188
375
  return _load_config(run_id)
189
376
 
190
377
 
@@ -201,17 +388,35 @@ def _load_config(run_id: str) -> DictConfig:
201
388
  return OmegaConf.load(path) # type: ignore
202
389
 
203
390
 
204
- def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
205
- path = get_artifact_dir(run) / ".hydra/hydra.yaml"
391
+ # def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
392
+ # """
393
+ # Get the Hydra output directory.
394
+
395
+ # Args:
396
+ # run: The run object.
397
+
398
+ # Returns:
399
+ # Path: The Hydra output directory.
400
+ # """
401
+ # path = get_artifact_dir(run) / ".hydra/hydra.yaml"
402
+
403
+ # if path.exists():
404
+ # hc = OmegaConf.load(path)
405
+ # return Path(hc.hydra.runtime.output_dir)
406
+
407
+ # raise FileNotFoundError
206
408
 
207
- if path.exists():
208
- hc = OmegaConf.load(path)
209
- return Path(hc.hydra.runtime.output_dir)
210
409
 
211
- raise FileNotFoundError
410
+ # def log_hydra_output_dir(run: Run_ | Series | str) -> None:
411
+ # """
412
+ # Log the Hydra output directory.
212
413
 
414
+ # Args:
415
+ # run: The run object.
213
416
 
214
- def log_hydra_output_dir(run: Run_ | Series | str) -> None:
215
- output_dir = get_hydra_output_dir(run)
216
- run_id = run if isinstance(run, str) else run.info.run_id
217
- mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
417
+ # Returns:
418
+ # None
419
+ # """
420
+ # output_dir = get_hydra_output_dir(run)
421
+ # run_id = run if isinstance(run, str) else run.info.run_id
422
+ # mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
@@ -0,0 +1,111 @@
1
+ Metadata-Version: 2.3
2
+ Name: hydraflow
3
+ Version: 0.2.0
4
+ Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
5
+ Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
+ Project-URL: Source, https://github.com/daizutabi/hydraflow
7
+ Project-URL: Issues, https://github.com/daizutabi/hydraflow/issues
8
+ Author-email: daizutabi <daizutabi@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Programming Language :: Python
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Documentation
17
+ Classifier: Topic :: Software Development :: Documentation
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: hydra-core>1.3
20
+ Requires-Dist: mlflow>2.15
21
+ Requires-Dist: setuptools
22
+ Requires-Dist: watchdog
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-clarity; extra == 'dev'
25
+ Requires-Dist: pytest-cov; extra == 'dev'
26
+ Requires-Dist: pytest-randomly; extra == 'dev'
27
+ Requires-Dist: pytest-xdist; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Hydraflow
31
+
32
+ [![PyPI Version][pypi-v-image]][pypi-v-link]
33
+ [![Python Version][python-v-image]][python-v-link]
34
+ [![Build Status][GHAction-image]][GHAction-link]
35
+ [![Coverage Status][codecov-image]][codecov-link]
36
+
37
+ <!-- Badges -->
38
+ [pypi-v-image]: https://img.shields.io/pypi/v/hydraflow.svg
39
+ [pypi-v-link]: https://pypi.org/project/hydraflow/
40
+ [python-v-image]: https://img.shields.io/pypi/pyversions/hydraflow.svg
41
+ [python-v-link]: https://pypi.org/project/hydraflow
42
+ [GHAction-image]: https://github.com/daizutabi/hydraflow/actions/workflows/ci.yml/badge.svg?branch=main&event=push
43
+ [GHAction-link]: https://github.com/daizutabi/hydraflow/actions?query=event%3Apush+branch%3Amain
44
+ [codecov-image]: https://codecov.io/github/daizutabi/hydraflow/coverage.svg?branch=main
45
+ [codecov-link]: https://codecov.io/github/daizutabi/hydraflow?branch=main
46
+
47
+ ## Overview
48
+
49
+ Hydraflow is a powerful library designed to seamlessly integrate [Hydra](https://hydra.cc/) and [MLflow](https://mlflow.org/), making it easier to manage and track machine learning experiments. By combining the flexibility of Hydra's configuration management with the robust experiment tracking capabilities of MLflow, Hydraflow provides a comprehensive solution for managing complex machine learning workflows.
50
+
51
+ ## Key Features
52
+
53
+ - **Configuration Management**: Utilize Hydra's advanced configuration management to handle complex parameter sweeps and experiment setups.
54
+ - **Experiment Tracking**: Leverage MLflow's tracking capabilities to log parameters, metrics, and artifacts for each run.
55
+ - **Artifact Management**: Automatically log and manage artifacts, such as model checkpoints and configuration files, with MLflow.
56
+ - **Seamless Integration**: Easily integrate Hydra and MLflow in your machine learning projects with minimal setup.
57
+
58
+ ## Installation
59
+
60
+ You can install Hydraflow via pip:
61
+
62
+ ```bash
63
+ pip install hydraflow
64
+ ```
65
+
66
+ ## Getting Started
67
+
68
+ Here is a simple example to get you started with Hydraflow:
69
+
70
+ ```python
71
+ import hydra
72
+ import hydraflow
73
+ import mlflow
74
+ from dataclasses import dataclass
75
+ from hydra.core.config_store import ConfigStore
76
+ from pathlib import Path
77
+
78
+ @dataclass
79
+ class MySQLConfig:
80
+ host: str = "localhost"
81
+ port: int = 3306
82
+
83
+ cs = ConfigStore.instance()
84
+ cs.store(name="config", node=MySQLConfig)
85
+
86
+ @hydra.main(version_base=None, config_name="config")
87
+ def my_app(cfg: MySQLConfig) -> None:
88
+ # Set the experiment using the Hydra job name.
89
+ hydraflow.set_experiment()
90
+
91
+ # Automatically log parameters from the Hydra config.
92
+ with mlflow.start_run(), hydraflow.log_run(cfg) as info:
93
+ # Your app code below.
94
+
95
+ # `info.output_dir` is the Hydra output directory.
96
+ # `info.artifact_dir` is the MLflow artifact directory.
97
+
98
+ with hydraflow.watch(callback):
99
+ # Watch files in the MLflow artifact directory.
100
+ # You can update metrics or log other artifacts
101
+ # according to the watched files in your callback
102
+ # function.
103
+ pass
104
+
105
+ # Your callback function here.
106
+ def callback(file: Path) -> None:
107
+ pass
108
+
109
+ if __name__ == "__main__":
110
+ my_app()
111
+ ```
@@ -0,0 +1,9 @@
1
+ hydraflow/__init__.py,sha256=PzziOG9RnGAVbl9Yz4ScvsL8nfkjsuN0alMKRvZT-_Y,442
2
+ hydraflow/config.py,sha256=BcyOYvdiqSCsmUMA_EvnWPXuW0fC5cT-Q2ilBk9-5gc,1863
3
+ hydraflow/context.py,sha256=MqkEhKEZL_N3eb3v5u9D4EqKkiSmiPyXXafhPkALRlg,5129
4
+ hydraflow/mlflow.py,sha256=_Los9E38eG8sTiN8bGwZmvjCrS0S-wSGiA4fyhQM3Zw,2251
5
+ hydraflow/runs.py,sha256=NT7IzE-Pf7T2Ey-eWEPZzQQaX4Gt_RKDKSn2pj2yzGc,14304
6
+ hydraflow-0.2.0.dist-info/METADATA,sha256=dfQ2_-Nk79yVazy5BHasYK681kiG1z-_i4VxWT8fJjg,4224
7
+ hydraflow-0.2.0.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
8
+ hydraflow-0.2.0.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
9
+ hydraflow-0.2.0.dist-info/RECORD,,
hydraflow/util.py DELETED
@@ -1,11 +0,0 @@
1
- import platform
2
- from pathlib import Path
3
- from urllib.parse import urlparse
4
-
5
-
6
- def uri_to_path(uri: str) -> Path:
7
- path = urlparse(uri).path
8
- if platform.system() == "Windows" and path.startswith("/"):
9
- path = path[1:]
10
-
11
- return Path(path)
@@ -1,45 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: hydraflow
3
- Version: 0.1.4
4
- Summary: Hydra with MLflow
5
- Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
- Project-URL: Source, https://github.com/daizutabi/hydraflow
7
- Project-URL: Issues, https://github.com/daizutabi/hydraflow/issues
8
- Author-email: daizutabi <daizutabi@gmail.com>
9
- License-Expression: MIT
10
- License-File: LICENSE
11
- Classifier: Development Status :: 4 - Beta
12
- Classifier: Programming Language :: Python
13
- Classifier: Programming Language :: Python :: 3.10
14
- Classifier: Programming Language :: Python :: 3.11
15
- Classifier: Programming Language :: Python :: 3.12
16
- Classifier: Topic :: Documentation
17
- Classifier: Topic :: Software Development :: Documentation
18
- Requires-Python: >=3.10
19
- Requires-Dist: hydra-core>1.3
20
- Requires-Dist: mlflow>2.15
21
- Requires-Dist: setuptools
22
- Requires-Dist: watchdog
23
- Provides-Extra: dev
24
- Requires-Dist: pytest-clarity; extra == 'dev'
25
- Requires-Dist: pytest-cov; extra == 'dev'
26
- Requires-Dist: pytest-randomly; extra == 'dev'
27
- Requires-Dist: pytest-xdist; extra == 'dev'
28
- Description-Content-Type: text/markdown
29
-
30
- # hydraflow
31
-
32
- [![PyPI Version][pypi-v-image]][pypi-v-link]
33
- [![Python Version][python-v-image]][python-v-link]
34
- [![Build Status][GHAction-image]][GHAction-link]
35
- [![Coverage Status][codecov-image]][codecov-link]
36
-
37
- <!-- Badges -->
38
- [pypi-v-image]: https://img.shields.io/pypi/v/hydraflow.svg
39
- [pypi-v-link]: https://pypi.org/project/hydraflow/
40
- [python-v-image]: https://img.shields.io/pypi/pyversions/hydraflow.svg
41
- [python-v-link]: https://pypi.org/project/hydraflow
42
- [GHAction-image]: https://github.com/daizutabi/hydraflow/actions/workflows/ci.yml/badge.svg?branch=main&event=push
43
- [GHAction-link]: https://github.com/daizutabi/hydraflow/actions?query=event%3Apush+branch%3Amain
44
- [codecov-image]: https://codecov.io/github/daizutabi/hydraflow/coverage.svg?branch=main
45
- [codecov-link]: https://codecov.io/github/daizutabi/hydraflow?branch=main
@@ -1,10 +0,0 @@
1
- hydraflow/__init__.py,sha256=e1Q0Sskx39jaU2zkGNXjFWNC5xugEz_hDERTN_6Mzy8,666
2
- hydraflow/config.py,sha256=b3Plh_lmq94loZNw9QP2asd6thCLyTzzYSutH0cONXA,964
3
- hydraflow/context.py,sha256=3vejDbRYQBuBwlhpBpOv5aoyZ-yS8UUzpbCFK1V1uvw,2720
4
- hydraflow/mlflow.py,sha256=unBP3Y7ujTM3E_Hq_eYvRVFZoGfTA7B0h4FkOZtPPqc,566
5
- hydraflow/runs.py,sha256=127YykWzmiNUUuJSGPOCZasXmd6tcE15HU32j8x71ck,5864
6
- hydraflow/util.py,sha256=_BdOMq5tKPm8HOehb2s2ZIBpJYyVpvO_yaAIxbSj51I,253
7
- hydraflow-0.1.4.dist-info/METADATA,sha256=Xw-xcDKdzkHa7bKDZUI6MXpOKekcyFbMyBy1yANjNQs,1903
8
- hydraflow-0.1.4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
9
- hydraflow-0.1.4.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
10
- hydraflow-0.1.4.dist-info/RECORD,,