hydraflow 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
hydraflow/__init__.py CHANGED
@@ -3,15 +3,10 @@ from .mlflow import set_experiment
3
3
  from .runs import (
4
4
  Run,
5
5
  Runs,
6
- drop_unique_params,
7
6
  filter_runs,
8
- get_artifact_dir,
9
- get_artifact_path,
10
- get_artifact_uri,
11
7
  get_param_dict,
12
8
  get_param_names,
13
9
  get_run,
14
- get_run_id,
15
10
  load_config,
16
11
  )
17
12
 
@@ -20,15 +15,10 @@ __all__ = [
20
15
  "Run",
21
16
  "Runs",
22
17
  "chdir_artifact",
23
- "drop_unique_params",
24
18
  "filter_runs",
25
- "get_artifact_dir",
26
- "get_artifact_path",
27
- "get_artifact_uri",
28
19
  "get_param_dict",
29
20
  "get_param_names",
30
21
  "get_run",
31
- "get_run_id",
32
22
  "load_config",
33
23
  "log_run",
34
24
  "set_experiment",
hydraflow/config.py CHANGED
@@ -16,39 +16,51 @@ if TYPE_CHECKING:
16
16
 
17
17
  def iter_params(config: object, prefix: str = "") -> Iterator[tuple[str, Any]]:
18
18
  """
19
- Iterate over the parameters in the given configuration object.
19
+ Recursively iterate over the parameters in the given configuration object.
20
20
 
21
- This function recursively traverses the configuration object and yields
22
- key-value pairs representing the parameters.
21
+ This function traverses the configuration object and yields key-value pairs
22
+ representing the parameters. The keys are prefixed with the provided prefix.
23
23
 
24
24
  Args:
25
- config (object): The configuration object to iterate over.
26
- prefix (str, optional): The prefix to prepend to the parameter keys.
27
- Defaults to "".
25
+ config: The configuration object to iterate over. This can be a dictionary,
26
+ list, DictConfig, or ListConfig.
27
+ prefix: The prefix to prepend to the parameter keys.
28
+ Defaults to an empty string.
28
29
 
29
30
  Yields:
30
- Key-value pairs representing the parameters.
31
+ Key-value pairs representing the parameters in the configuration object.
31
32
  """
32
33
  if not isinstance(config, (DictConfig, ListConfig)):
33
34
  config = OmegaConf.create(config) # type: ignore
34
35
 
36
+ yield from _iter_params(config, prefix)
37
+
38
+
39
+ def _iter_params(config: object, prefix: str = "") -> Iterator[tuple[str, Any]]:
35
40
  if isinstance(config, DictConfig):
36
41
  for key, value in config.items():
37
- if isinstance(value, ListConfig) and not any(
38
- isinstance(v, (DictConfig, ListConfig)) for v in value
39
- ):
42
+ if _is_param(value):
40
43
  yield f"{prefix}{key}", value
41
44
 
42
- elif isinstance(value, (DictConfig, ListConfig)):
43
- yield from iter_params(value, f"{prefix}{key}.")
44
-
45
45
  else:
46
- yield f"{prefix}{key}", value
46
+ yield from _iter_params(value, f"{prefix}{key}.")
47
47
 
48
48
  elif isinstance(config, ListConfig):
49
49
  for index, value in enumerate(config):
50
- if isinstance(value, (DictConfig, ListConfig)):
51
- yield from iter_params(value, f"{prefix}{index}.")
50
+ if _is_param(value):
51
+ yield f"{prefix}{index}", value
52
52
 
53
53
  else:
54
- yield f"{prefix}{index}", value
54
+ yield from _iter_params(value, f"{prefix}{index}.")
55
+
56
+
57
+ def _is_param(value: object) -> bool:
58
+ """Check if the given value is a parameter."""
59
+ if isinstance(value, DictConfig):
60
+ return False
61
+
62
+ if isinstance(value, ListConfig):
63
+ if any(isinstance(v, (DictConfig, ListConfig)) for v in value):
64
+ return False
65
+
66
+ return True
hydraflow/context.py CHANGED
@@ -5,6 +5,7 @@ run context.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import logging
8
9
  import os
9
10
  import time
10
11
  from contextlib import contextmanager
@@ -17,15 +18,14 @@ from hydra.core.hydra_config import HydraConfig
17
18
  from watchdog.events import FileModifiedEvent, FileSystemEventHandler
18
19
  from watchdog.observers import Observer
19
20
 
20
- from hydraflow.mlflow import log_params
21
- from hydraflow.runs import get_artifact_path
22
- from hydraflow.util import uri_to_path
21
+ from hydraflow.mlflow import get_artifact_dir, log_params
23
22
 
24
23
  if TYPE_CHECKING:
25
24
  from collections.abc import Callable, Iterator
26
25
 
27
26
  from mlflow.entities.run import Run
28
- from pandas import Series
27
+
28
+ log = logging.getLogger(__name__)
29
29
 
30
30
 
31
31
  @dataclass
@@ -66,8 +66,7 @@ def log_run(
66
66
 
67
67
  hc = HydraConfig.get()
68
68
  output_dir = Path(hc.runtime.output_dir)
69
- uri = mlflow.get_artifact_uri()
70
- info = Info(output_dir, uri_to_path(uri))
69
+ info = Info(output_dir, get_artifact_dir())
71
70
 
72
71
  # Save '.hydra' config directory first.
73
72
  output_subdir = output_dir / (hc.output_subdir or "")
@@ -81,13 +80,21 @@ def log_run(
81
80
  with watch(log_artifact, output_dir):
82
81
  yield info
83
82
 
83
+ except Exception as e:
84
+ log.error(f"Error during log_run: {e}")
85
+ raise
86
+
84
87
  finally:
85
88
  # Save output_dir including '.hydra' config directory.
86
89
  mlflow.log_artifacts(output_dir.as_posix())
87
90
 
88
91
 
89
92
  @contextmanager
90
- def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60) -> Iterator[None]:
93
+ def watch(
94
+ func: Callable[[Path], None],
95
+ dir: Path | str = "",
96
+ timeout: int = 60,
97
+ ) -> Iterator[None]:
91
98
  """
92
99
  Watch the given directory for changes and call the provided function
93
100
  when a change is detected.
@@ -98,25 +105,23 @@ def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60)
98
105
  period or until the context is exited.
99
106
 
100
107
  Args:
101
- func (Callable[[Path], None]): The function to call when a change is
108
+ func: The function to call when a change is
102
109
  detected. It should accept a single argument of type `Path`,
103
110
  which is the path of the modified file.
104
- dir (Path | str, optional): The directory to watch. If not specified,
111
+ dir: The directory to watch. If not specified,
105
112
  the current MLflow artifact URI is used. Defaults to "".
106
- timeout (int, optional): The timeout period in seconds for the watcher
113
+ timeout: The timeout period in seconds for the watcher
107
114
  to run after the context is exited. Defaults to 60.
108
115
 
109
116
  Yields:
110
- None: This context manager does not return any value.
117
+ None
111
118
 
112
119
  Example:
113
120
  with watch(log_artifact, "/path/to/dir"):
114
121
  # Perform operations while watching the directory for changes
115
122
  pass
116
123
  """
117
- if not dir:
118
- uri = mlflow.get_artifact_uri()
119
- dir = uri_to_path(uri)
124
+ dir = dir or get_artifact_dir()
120
125
 
121
126
  handler = Handler(func)
122
127
  observer = Observer()
@@ -126,6 +131,10 @@ def watch(func: Callable[[Path], None], dir: Path | str = "", timeout: int = 60)
126
131
  try:
127
132
  yield
128
133
 
134
+ except Exception as e:
135
+ log.error(f"Error during watch: {e}")
136
+ raise
137
+
129
138
  finally:
130
139
  elapsed = 0
131
140
  while not observer.event_queue.empty():
@@ -150,7 +159,7 @@ class Handler(FileSystemEventHandler):
150
159
 
151
160
  @contextmanager
152
161
  def chdir_artifact(
153
- run: Run | Series | str,
162
+ run: Run,
154
163
  artifact_path: str | None = None,
155
164
  ) -> Iterator[Path]:
156
165
  """
@@ -166,11 +175,14 @@ def chdir_artifact(
166
175
  artifact_path: The artifact path.
167
176
  """
168
177
  curdir = Path.cwd()
178
+ path = mlflow.artifacts.download_artifacts(
179
+ run_id=run.info.run_id,
180
+ artifact_path=artifact_path,
181
+ )
169
182
 
170
- artifact_dir = get_artifact_path(run, artifact_path)
171
-
172
- os.chdir(artifact_dir)
183
+ os.chdir(path)
173
184
  try:
174
- yield artifact_dir
185
+ yield Path(path)
186
+
175
187
  finally:
176
188
  os.chdir(curdir)
hydraflow/mlflow.py CHANGED
@@ -5,6 +5,8 @@ configuration objects and set up experiments using MLflow.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ from pathlib import Path
9
+
8
10
  import mlflow
9
11
  from hydra.core.hydra_config import HydraConfig
10
12
 
@@ -47,3 +49,24 @@ def log_params(config: object, *, synchronous: bool | None = None) -> None:
47
49
  """
48
50
  for key, value in iter_params(config):
49
51
  mlflow.log_param(key, value, synchronous=synchronous)
52
+
53
+
54
+ def get_artifact_dir(artifact_path: str | None = None) -> Path:
55
+ """
56
+ Get the artifact directory for the given artifact path.
57
+
58
+ This function retrieves the artifact URI for the specified artifact path
59
+ using MLflow, downloads the artifacts to a local directory, and returns
60
+ the path to that directory.
61
+
62
+ Args:
63
+ artifact_path: The artifact path for which to get the directory.
64
+ Defaults to None.
65
+
66
+ Returns:
67
+ The local path to the directory where the artifacts are downloaded.
68
+ """
69
+ uri = mlflow.get_artifact_uri(artifact_path)
70
+ dir = mlflow.artifacts.download_artifacts(artifact_uri=uri)
71
+
72
+ return Path(dir)
hydraflow/runs.py CHANGED
@@ -1,30 +1,85 @@
1
1
  """
2
2
  This module provides functionality for managing and interacting with MLflow runs.
3
- It includes classes and functions to filter runs, retrieve run information, and
4
- log artifacts and configurations.
3
+ It includes the `Runs` class and various methods to filter runs, retrieve run information,
4
+ log artifacts, and load configurations.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from dataclasses import dataclass
10
10
  from functools import cache
11
- from pathlib import Path
11
+ from itertools import chain
12
12
  from typing import TYPE_CHECKING, Any
13
13
 
14
14
  import mlflow
15
- import numpy as np
16
- from mlflow.entities.run import Run as Run_
17
- from mlflow.tracking import artifact_utils
15
+ from mlflow.entities import ViewType
16
+ from mlflow.entities.run import Run
17
+ from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
18
18
  from omegaconf import DictConfig, OmegaConf
19
- from pandas import DataFrame, Series
20
19
 
21
20
  from hydraflow.config import iter_params
22
- from hydraflow.util import uri_to_path
23
21
 
24
22
  if TYPE_CHECKING:
25
23
  from typing import Any
26
24
 
27
25
 
26
+ def search_runs(
27
+ experiment_ids: list[str] | None = None,
28
+ filter_string: str = "",
29
+ run_view_type: int = ViewType.ACTIVE_ONLY,
30
+ max_results: int = SEARCH_MAX_RESULTS_PANDAS,
31
+ order_by: list[str] | None = None,
32
+ search_all_experiments: bool = False,
33
+ experiment_names: list[str] | None = None,
34
+ ) -> Runs:
35
+ """
36
+ Search for Runs that fit the specified criteria.
37
+
38
+ This function wraps the `mlflow.search_runs` function and returns the results
39
+ as a `Runs` object. It allows for flexible searching of MLflow runs based on
40
+ various criteria.
41
+
42
+ Args:
43
+ experiment_ids: List of experiment IDs. Search can work with experiment IDs or
44
+ experiment names, but not both in the same call. Values other than
45
+ ``None`` or ``[]`` will result in error if ``experiment_names`` is
46
+ also not ``None`` or ``[]``. ``None`` will default to the active
47
+ experiment if ``experiment_names`` is ``None`` or ``[]``.
48
+ filter_string: Filter query string, defaults to searching all runs.
49
+ run_view_type: one of enum values ``ACTIVE_ONLY``, ``DELETED_ONLY``, or ``ALL`` runs
50
+ defined in :py:class:`mlflow.entities.ViewType`.
51
+ max_results: The maximum number of runs to return. Default is 100,000
52
+ to avoid causing out-of-memory issues on the user's machine.
53
+ order_by: List of columns to order by (e.g., "metrics.rmse"). The ``order_by`` column
54
+ can contain an optional ``DESC`` or ``ASC`` value. The default is ``ASC``.
55
+ The default ordering is to sort by ``start_time DESC``, then ``run_id``.
56
+ output_format: The output format to be returned. If ``pandas``, a ``pandas.DataFrame``
57
+ is returned and, if ``list``, a list of :py:class:`mlflow.entities.Run`
58
+ is returned.
59
+ search_all_experiments: Boolean specifying whether all experiments should be searched.
60
+ Only honored if ``experiment_ids`` is ``[]`` or ``None``.
61
+ experiment_names: List of experiment names. Search can work with experiment IDs or
62
+ experiment names, but not both in the same call. Values other
63
+ than ``None`` or ``[]`` will result in error if ``experiment_ids``
64
+ is also not ``None`` or ``[]``. ``None`` will default to the active
65
+ experiment if ``experiment_ids`` is ``None`` or ``[]``.
66
+
67
+ Returns:
68
+ A `Runs` object containing the search results.
69
+ """
70
+ runs = mlflow.search_runs(
71
+ experiment_ids=experiment_ids,
72
+ filter_string=filter_string,
73
+ run_view_type=run_view_type,
74
+ max_results=max_results,
75
+ order_by=order_by,
76
+ output_format="list",
77
+ search_all_experiments=search_all_experiments,
78
+ experiment_names=experiment_names,
79
+ )
80
+ return Runs(runs) # type: ignore
81
+
82
+
28
83
  @dataclass
29
84
  class Runs:
30
85
  """
@@ -34,7 +89,7 @@ class Runs:
34
89
  retrieving specific runs, and accessing run information.
35
90
  """
36
91
 
37
- runs: list[Run_] | DataFrame
92
+ runs: list[Run]
38
93
 
39
94
  def __repr__(self) -> str:
40
95
  return f"{self.__class__.__name__}({len(self)})"
@@ -53,115 +108,95 @@ class Runs:
53
108
  be included in the returned `Runs` object.
54
109
 
55
110
  Args:
56
- config (object): The configuration object to filter the runs.
57
- This object should contain key-value pairs representing
58
- the parameters to filter by.
111
+ config: The configuration object to filter the runs.
59
112
 
60
113
  Returns:
61
- Runs: A new `Runs` object containing the filtered runs.
114
+ A new `Runs` object containing the filtered runs.
62
115
  """
63
116
  return Runs(filter_runs(self.runs, config))
64
117
 
65
- def get(self, config: object) -> Run:
118
+ def get(self, config: object) -> Run | None:
66
119
  """
67
120
  Retrieve a specific run based on the provided configuration.
68
121
 
69
122
  This method filters the runs in the collection according to the
70
123
  specified configuration object and returns the run that matches
71
124
  the provided parameters. If more than one run matches the criteria,
72
- an error is raised.
125
+ a `ValueError` is raised.
73
126
 
74
127
  Args:
75
- config (object): The configuration object to identify the run.
128
+ config: The configuration object to identify the run.
76
129
 
77
130
  Returns:
78
131
  Run: The run object that matches the provided configuration.
132
+ None: If no run matches the provided configuration.
79
133
 
80
134
  Raises:
81
135
  ValueError: If the number of filtered runs is not exactly one.
82
136
  """
83
- return Run(get_run(self.runs, config))
137
+ return get_run(self.runs, config)
84
138
 
85
- def drop_unique_params(self) -> Runs:
139
+ def get_earliest_run(self, config: object | None = None, **kwargs) -> Run | None:
86
140
  """
87
- Drop unique parameters from the runs and return a new Runs object.
141
+ Get the earliest run from the list of runs based on the start time.
88
142
 
89
- This method removes parameters that have unique values across all runs
90
- in the collection. This is useful for identifying common parameters
91
- that are shared among multiple runs.
143
+ This method filters the runs based on the configuration if provided
144
+ and returns the run with the earliest start time.
145
+
146
+ Args:
147
+ config: The configuration object to filter the runs.
148
+ If None, no filtering is applied.
149
+ **kwargs: Additional key-value pairs to filter the runs.
92
150
 
93
151
  Returns:
94
- Runs: A new `Runs` object with unique parameters dropped.
152
+ The run with the earliest start time, or None if no runs match the criteria.
153
+ """
154
+ return get_earliest_run(self.runs, config, **kwargs)
95
155
 
96
- Raises:
97
- NotImplementedError: If the runs are not in a DataFrame format.
156
+ def get_latest_run(self, config: object | None = None, **kwargs) -> Run | None:
98
157
  """
99
- if isinstance(self.runs, DataFrame):
100
- return Runs(drop_unique_params(self.runs))
158
+ Get the latest run from the list of runs based on the start time.
101
159
 
102
- raise NotImplementedError
160
+ Args:
161
+ config: The configuration object to filter the runs.
162
+ If None, no filtering is applied.
163
+ **kwargs: Additional key-value pairs to filter the runs.
164
+
165
+ Returns:
166
+ The run with the latest start time, or None if no runs match the criteria.
167
+ """
168
+ return get_latest_run(self.runs, config, **kwargs)
103
169
 
104
170
  def get_param_names(self) -> list[str]:
105
171
  """
106
172
  Get the parameter names from the runs.
107
173
 
108
- This method extracts the parameter names from the runs in the collection.
109
- If the runs are stored in a DataFrame, it retrieves the column names
110
- that correspond to the parameters.
174
+ This method extracts the unique parameter names from the provided list of runs.
175
+ It iterates through each run and collects the parameter names into a set to
176
+ ensure uniqueness.
111
177
 
112
178
  Returns:
113
- list[str]: A list of parameter names.
114
-
115
- Raises:
116
- NotImplementedError: If the runs are not in a DataFrame format.
179
+ A list of unique parameter names.
117
180
  """
118
- if isinstance(self.runs, DataFrame):
119
- return get_param_names(self.runs)
120
-
121
- raise NotImplementedError
181
+ return get_param_names(self.runs)
122
182
 
123
183
  def get_param_dict(self) -> dict[str, list[str]]:
124
184
  """
125
- Get the parameter dictionary from the runs.
185
+ Get the parameter dictionary from the list of runs.
126
186
 
127
187
  This method extracts the parameter names and their corresponding values
128
- from the runs in the collection. If the runs are stored in a DataFrame,
129
- it retrieves the unique values for each parameter.
130
-
188
+ from the provided list of runs. It iterates through each run and collects
189
+ the parameter values into a dictionary where the keys are parameter names
190
+ and the values are lists of parameter values.
131
191
 
132
192
  Returns:
133
- dict[str, list[str]]: A dictionary of parameter names and their
134
- corresponding values.
135
-
136
- Raises:
137
- NotImplementedError: If the runs are not in a DataFrame format.
193
+ A dictionary where the keys are parameter names and the values are lists
194
+ of the sorted, unique parameter values.
138
195
  """
139
- if isinstance(self.runs, DataFrame):
140
- return get_param_dict(self.runs)
141
-
142
- raise NotImplementedError
143
-
144
-
145
- def search_runs(*args, **kwargs) -> Runs:
146
- """
147
- Search for runs that match the specified criteria.
148
-
149
- This function wraps the `mlflow.search_runs` function and returns the results
150
- as a `Runs` object. It allows for flexible searching of MLflow runs based on
151
- various criteria.
152
-
153
- Args:
154
- *args: Positional arguments to pass to `mlflow.search_runs`.
155
- **kwargs: Keyword arguments to pass to `mlflow.search_runs`.
156
-
157
- Returns:
158
- Runs: A `Runs` object containing the search results.
159
- """
160
- runs = mlflow.search_runs(*args, **kwargs)
161
- return Runs(runs)
196
+ return get_param_dict(self.runs)
162
197
 
163
198
 
164
- def filter_runs(runs: list[Run_] | DataFrame, config: object) -> list[Run_] | DataFrame:
199
+ def filter_runs(runs: list[Run], config: object, **kwargs) -> list[Run]:
165
200
  """
166
201
  Filter the runs based on the provided configuration.
167
202
 
@@ -169,22 +204,26 @@ def filter_runs(runs: list[Run_] | DataFrame, config: object) -> list[Run_] | Da
169
204
  specified configuration object. The configuration object should
170
205
  contain key-value pairs that correspond to the parameters of the
171
206
  runs. Only the runs that match all the specified parameters will
172
- be included in the returned `Runs` object.
207
+ be included in the returned list of runs.
173
208
 
174
209
  Args:
175
210
  runs: The runs to filter.
176
211
  config: The configuration object to filter the runs.
212
+ **kwargs: Additional key-value pairs to filter the runs.
177
213
 
178
214
  Returns:
179
- Runs: A filtered list of runs or a DataFrame.
215
+ A filtered list of runs.
180
216
  """
181
- if isinstance(runs, list):
182
- return _filter_runs_list(runs, config)
217
+ for key, value in chain(iter_params(config), kwargs.items()):
218
+ runs = [run for run in runs if _is_equal(run, key, value)]
183
219
 
184
- return _filter_runs_dataframe(runs, config)
220
+ if len(runs) == 0:
221
+ return []
222
+
223
+ return runs
185
224
 
186
225
 
187
- def _is_equal(run: Run_, key: str, value: Any) -> bool:
226
+ def _is_equal(run: Run, key: str, value: Any) -> bool:
188
227
  param = run.data.params.get(key, value)
189
228
 
190
229
  if param is None:
@@ -193,275 +232,146 @@ def _is_equal(run: Run_, key: str, value: Any) -> bool:
193
232
  return type(value)(param) == value
194
233
 
195
234
 
196
- def _filter_runs_list(runs: list[Run_], config: object) -> list[Run_]:
197
- for key, value in iter_params(config):
198
- runs = [run for run in runs if _is_equal(run, key, value)]
199
-
200
- return runs
201
-
202
-
203
- def _filter_runs_dataframe(runs: DataFrame, config: object) -> DataFrame:
204
- index = np.ones(len(runs), dtype=bool)
205
-
206
- for key, value in iter_params(config):
207
- name = f"params.{key}"
208
-
209
- if name in runs:
210
- series = runs[name]
211
- is_value = -series.isna()
212
- param = series.fillna(value).astype(type(value))
213
- index &= is_value & (param == value)
214
-
215
- return runs[index]
216
-
217
-
218
- def get_run(runs: list[Run_] | DataFrame, config: object) -> Run_ | Series:
235
+ def get_run(runs: list[Run], config: object, **kwargs) -> Run | None:
219
236
  """
220
237
  Retrieve a specific run based on the provided configuration.
221
238
 
222
239
  This method filters the runs in the collection according to the
223
240
  specified configuration object and returns the run that matches
224
241
  the provided parameters. If more than one run matches the criteria,
225
- an error is raised.
242
+ a `ValueError` is raised.
226
243
 
227
244
  Args:
228
245
  runs: The runs to filter.
229
246
  config: The configuration object to identify the run.
247
+ **kwargs: Additional key-value pairs to filter the runs.
230
248
 
231
249
  Returns:
232
- Run: The run object that matches the provided configuration.
250
+ The run object that matches the provided configuration, or None
251
+ if no runs match the criteria.
252
+
253
+ Raises:
254
+ ValueError: If more than one run matches the criteria.
233
255
  """
234
- runs = filter_runs(runs, config)
256
+ runs = filter_runs(runs, config, **kwargs)
257
+
258
+ if len(runs) == 0:
259
+ return None
235
260
 
236
261
  if len(runs) == 1:
237
- return runs[0] if isinstance(runs, list) else runs.iloc[0]
262
+ return runs[0]
238
263
 
239
- msg = f"number of filtered runs is not 1: got {len(runs)}"
264
+ msg = f"Multiple runs were filtered. Expected number of runs is 1, but found {len(runs)} runs."
240
265
  raise ValueError(msg)
241
266
 
242
267
 
243
- def drop_unique_params(runs: DataFrame) -> DataFrame:
268
+ def get_earliest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
244
269
  """
245
- Drop unique parameters from the runs and return a new DataFrame.
270
+ Get the earliest run from the list of runs based on the start time.
246
271
 
247
- This method removes parameters that have unique values across all runs
248
- in the collection. This is useful for identifying common parameters
249
- that are shared among multiple runs.
272
+ This method filters the runs based on the configuration if provided
273
+ and returns the run with the earliest start time.
250
274
 
251
275
  Args:
252
- runs: The DataFrame containing the runs.
276
+ runs: The list of runs.
277
+ config: The configuration object to filter the runs.
278
+ If None, no filtering is applied.
279
+ **kwargs: Additional key-value pairs to filter the runs.
253
280
 
254
281
  Returns:
255
- DataFrame: A new DataFrame with unique parameters dropped.
282
+ The run with the earliest start time, or None if no runs match the criteria.
256
283
  """
284
+ if config is not None or kwargs:
285
+ runs = filter_runs(runs, config or {}, **kwargs)
257
286
 
258
- def select(column: str) -> bool:
259
- return not column.startswith("params.") or len(runs[column].unique()) > 1
287
+ return min(runs, key=lambda run: run.info.start_time, default=None)
260
288
 
261
- columns = [select(column) for column in runs.columns]
262
- return runs.iloc[:, columns]
263
289
 
264
-
265
- def get_param_names(runs: DataFrame) -> list[str]:
290
+ def get_latest_run(runs: list[Run], config: object | None = None, **kwargs) -> Run | None:
266
291
  """
267
- Get the parameter names from the runs.
292
+ Get the latest run from the list of runs based on the start time.
268
293
 
269
- This method extracts the parameter names from the runs in the collection.
270
- If the runs are stored in a DataFrame, it retrieves the column names
271
- that correspond to the parameters.
294
+ This method filters the runs based on the configuration if provided
295
+ and returns the run with the latest start time.
272
296
 
273
297
  Args:
274
- runs: The DataFrame containing the runs.
298
+ runs: The list of runs.
299
+ config: The configuration object to filter the runs.
300
+ If None, no filtering is applied.
301
+ **kwargs: Additional key-value pairs to filter the runs.
275
302
 
276
303
  Returns:
277
- list[str]: A list of parameter names.
304
+ The run with the latest start time, or None if no runs match the criteria.
278
305
  """
306
+ if config is not None or kwargs:
307
+ runs = filter_runs(runs, config or {}, **kwargs)
279
308
 
280
- def get_name(column: str) -> str:
281
- if column.startswith("params."):
282
- return column.split(".", maxsplit=1)[-1]
283
-
284
- return ""
285
-
286
- columns = [get_name(column) for column in runs.columns]
287
- return [column for column in columns if column]
309
+ return max(runs, key=lambda run: run.info.start_time, default=None)
288
310
 
289
311
 
290
- def get_param_dict(runs: DataFrame) -> dict[str, list[str]]:
312
+ def get_param_names(runs: list[Run]) -> list[str]:
291
313
  """
292
- Get the parameter dictionary from the runs.
293
-
294
- This method extracts the parameter names and their corresponding values
295
- from the runs in the collection. If the runs are stored in a DataFrame,
296
- it retrieves the unique values for each parameter.
297
-
298
- Args:
299
- runs: The DataFrame containing the runs.
300
-
301
- Returns:
302
- dict[str, list[str]]: A dictionary of parameter names and
303
- their corresponding values.
304
- """
305
- params = {}
306
- for name in get_param_names(runs):
307
- params[name] = list(runs[f"params.{name}"].unique())
308
-
309
- return params
310
-
311
-
312
- @dataclass
313
- class Run:
314
- """
315
- A class to represent a specific MLflow run.
316
-
317
- This class provides methods to interact with the run, such as retrieving
318
- the run ID, artifact URI, and configuration. It also includes properties
319
- to access the artifact directory, artifact path, and Hydra output directory.
320
- """
321
-
322
- run: Run_ | Series | str
323
-
324
- def __repr__(self) -> str:
325
- return f"{self.__class__.__name__}({self.run_id!r})"
326
-
327
- @property
328
- def run_id(self) -> str:
329
- """
330
- Get the run ID.
331
-
332
- Returns:
333
- str: The run ID.
334
- """
335
- return get_run_id(self.run)
336
-
337
- def artifact_uri(self, artifact_path: str | None = None) -> str:
338
- """
339
- Get the artifact URI.
340
-
341
- Args:
342
- artifact_path (str | None): The artifact path.
343
-
344
- Returns:
345
- str: The artifact URI.
346
- """
347
- return get_artifact_uri(self.run, artifact_path)
348
-
349
- @property
350
- def artifact_dir(self) -> Path:
351
- """
352
- Get the artifact directory.
353
-
354
- Returns:
355
- Path: The artifact directory.
356
- """
357
- return get_artifact_dir(self.run)
358
-
359
- def artifact_path(self, artifact_path: str | None = None) -> Path:
360
- """
361
- Get the artifact path.
362
-
363
- Args:
364
- artifact_path: The artifact path.
365
-
366
- Returns:
367
- Path: The artifact path.
368
- """
369
- return get_artifact_path(self.run, artifact_path)
370
-
371
- @property
372
- def config(self) -> DictConfig:
373
- """
374
- Get the configuration.
375
-
376
- Returns:
377
- DictConfig: The configuration.
378
- """
379
- return load_config(self.run)
380
-
381
- def log_hydra_output_dir(self) -> None:
382
- """
383
- Log the Hydra output directory.
384
-
385
- Returns:
386
- None
387
- """
388
- log_hydra_output_dir(self.run)
314
+ Get the parameter names from the runs.
389
315
 
390
-
391
- def get_run_id(run: Run_ | Series | str) -> str:
392
- """
393
- Get the run ID.
316
+ This method extracts the unique parameter names from the provided list of runs.
317
+ It iterates through each run and collects the parameter names into a set to
318
+ ensure uniqueness.
394
319
 
395
320
  Args:
396
- run: The run object.
321
+ runs: The list of runs from which to extract parameter names.
397
322
 
398
323
  Returns:
399
- str: The run ID.
324
+ A list of unique parameter names.
400
325
  """
401
- if isinstance(run, str):
402
- return run
403
-
404
- if isinstance(run, Run_):
405
- return run.info.run_id
326
+ param_names = set()
406
327
 
407
- return run.run_id
328
+ for run in runs:
329
+ for param in run.data.params.keys():
330
+ param_names.add(param)
408
331
 
332
+ return list(param_names)
409
333
 
410
- def get_artifact_uri(run: Run_ | Series | str, artifact_path: str | None = None) -> str:
411
- """
412
- Get the artifact URI.
413
-
414
- Args:
415
- run: The run object.
416
- artifact_path: The artifact path.
417
334
 
418
- Returns:
419
- str: The artifact URI.
335
+ def get_param_dict(runs: list[Run]) -> dict[str, list[str]]:
420
336
  """
421
- run_id = get_run_id(run)
422
- return artifact_utils.get_artifact_uri(run_id, artifact_path)
337
+ Get the parameter dictionary from the list of runs.
423
338
 
424
-
425
- def get_artifact_dir(run: Run_ | Series | str) -> Path:
426
- """
427
- Get the artifact directory.
339
+ This method extracts the parameter names and their corresponding values
340
+ from the provided list of runs. It iterates through each run and collects
341
+ the parameter values into a dictionary where the keys are parameter names
342
+ and the values are lists of parameter values.
428
343
 
429
344
  Args:
430
- run: The run object.
345
+ runs: The list of runs from which to extract parameter names and values.
431
346
 
432
347
  Returns:
433
- Path: The artifact directory.
348
+ A dictionary where the keys are parameter names and the values are lists
349
+ of the sorted, unique parameter values.
434
350
  """
435
- uri = get_artifact_uri(run)
436
- return uri_to_path(uri)
351
+ params = {}
437
352
 
353
+ for name in get_param_names(runs):
354
+ it = (run.data.params[name] for run in runs if name in run.data.params)
355
+ params[name] = sorted(set(it))
438
356
 
439
- def get_artifact_path(run: Run_ | Series | str, artifact_path: str | None = None) -> Path:
440
- """
441
- Get the artifact path.
357
+ return params
442
358
 
443
- Args:
444
- run: The run object.
445
- artifact_path: The artifact path.
446
359
 
447
- Returns:
448
- Path: The artifact path.
360
+ def load_config(run: Run) -> DictConfig:
449
361
  """
450
- artifact_dir = get_artifact_dir(run)
451
- return artifact_dir / artifact_path if artifact_path else artifact_dir
362
+ Load the configuration for a given run.
452
363
 
453
-
454
- def load_config(run: Run_ | Series | str) -> DictConfig:
455
- """
456
- Load the configuration.
364
+ This function loads the configuration for the provided Run instance
365
+ by downloading the configuration file from the MLflow artifacts and
366
+ loading it using OmegaConf.
457
367
 
458
368
  Args:
459
- run: The run object.
369
+ run: The Run instance to load the configuration for.
460
370
 
461
371
  Returns:
462
- DictConfig: The configuration.
372
+ The loaded configuration.
463
373
  """
464
- run_id = get_run_id(run)
374
+ run_id = run.info.run_id
465
375
  return _load_config(run_id)
466
376
 
467
377
 
@@ -478,35 +388,35 @@ def _load_config(run_id: str) -> DictConfig:
478
388
  return OmegaConf.load(path) # type: ignore
479
389
 
480
390
 
481
- def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
482
- """
483
- Get the Hydra output directory.
391
+ # def get_hydra_output_dir(run: Run_ | Series | str) -> Path:
392
+ # """
393
+ # Get the Hydra output directory.
484
394
 
485
- Args:
486
- run: The run object.
395
+ # Args:
396
+ # run: The run object.
487
397
 
488
- Returns:
489
- Path: The Hydra output directory.
490
- """
491
- path = get_artifact_dir(run) / ".hydra/hydra.yaml"
398
+ # Returns:
399
+ # Path: The Hydra output directory.
400
+ # """
401
+ # path = get_artifact_dir(run) / ".hydra/hydra.yaml"
492
402
 
493
- if path.exists():
494
- hc = OmegaConf.load(path)
495
- return Path(hc.hydra.runtime.output_dir)
403
+ # if path.exists():
404
+ # hc = OmegaConf.load(path)
405
+ # return Path(hc.hydra.runtime.output_dir)
496
406
 
497
- raise FileNotFoundError
407
+ # raise FileNotFoundError
498
408
 
499
409
 
500
- def log_hydra_output_dir(run: Run_ | Series | str) -> None:
501
- """
502
- Log the Hydra output directory.
410
+ # def log_hydra_output_dir(run: Run_ | Series | str) -> None:
411
+ # """
412
+ # Log the Hydra output directory.
503
413
 
504
- Args:
505
- run: The run object.
414
+ # Args:
415
+ # run: The run object.
506
416
 
507
- Returns:
508
- None
509
- """
510
- output_dir = get_hydra_output_dir(run)
511
- run_id = run if isinstance(run, str) else run.info.run_id
512
- mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
417
+ # Returns:
418
+ # None
419
+ # """
420
+ # output_dir = get_hydra_output_dir(run)
421
+ # run_id = run if isinstance(run, str) else run.info.run_id
422
+ # mlflow.log_artifacts(output_dir.as_posix(), run_id=run_id)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hydraflow
3
- Version: 0.1.5
3
+ Version: 0.2.1
4
4
  Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
5
5
  Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
6
  Project-URL: Source, https://github.com/daizutabi/hydraflow
@@ -0,0 +1,9 @@
1
+ hydraflow/__init__.py,sha256=PzziOG9RnGAVbl9Yz4ScvsL8nfkjsuN0alMKRvZT-_Y,442
2
+ hydraflow/config.py,sha256=wI8uNuD2D-hIf4BAhEYJaMC6EyO-erKopy_ia_b1pYA,2048
3
+ hydraflow/context.py,sha256=MqkEhKEZL_N3eb3v5u9D4EqKkiSmiPyXXafhPkALRlg,5129
4
+ hydraflow/mlflow.py,sha256=_Los9E38eG8sTiN8bGwZmvjCrS0S-wSGiA4fyhQM3Zw,2251
5
+ hydraflow/runs.py,sha256=NT7IzE-Pf7T2Ey-eWEPZzQQaX4Gt_RKDKSn2pj2yzGc,14304
6
+ hydraflow-0.2.1.dist-info/METADATA,sha256=4C_hnw1gMb8WUQXyqj4q8eA1IVbp0wZuLGGthIk1G7U,4224
7
+ hydraflow-0.2.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
8
+ hydraflow-0.2.1.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
9
+ hydraflow-0.2.1.dist-info/RECORD,,
hydraflow/util.py DELETED
@@ -1,24 +0,0 @@
1
- import platform
2
- from pathlib import Path
3
- from urllib.parse import urlparse
4
-
5
-
6
- def uri_to_path(uri: str) -> Path:
7
- """
8
- Convert a URI to a path.
9
-
10
- This function parses the given URI and converts it to a local file system
11
- path. On Windows, if the path starts with a forward slash, it is removed
12
- to ensure the path is correctly formatted.
13
-
14
- Args:
15
- uri (str): The URI to convert.
16
-
17
- Returns:
18
- Path: The path corresponding to the URI.
19
- """
20
- path = urlparse(uri).path
21
- if platform.system() == "Windows" and path.startswith("/"):
22
- path = path[1:]
23
-
24
- return Path(path)
@@ -1,10 +0,0 @@
1
- hydraflow/__init__.py,sha256=e1Q0Sskx39jaU2zkGNXjFWNC5xugEz_hDERTN_6Mzy8,666
2
- hydraflow/config.py,sha256=WARa5u1F0n3wCOi65v8v8rUO78ME-mtzMeeeE2Yc1I8,1728
3
- hydraflow/context.py,sha256=NYjIMepLtaKyvw1obpE8gR1qu1OBpSB_uc6-5So2tg8,5139
4
- hydraflow/mlflow.py,sha256=2YWOYpv8eRB_ROD2yFh6ksKDXHvAPDYb86hrUi9zv6E,1558
5
- hydraflow/runs.py,sha256=vH-hrlcoTo8HRmgUWam9gtLXAl_wDzX26HEZGWckdMs,14038
6
- hydraflow/util.py,sha256=qdUGtBgY7qOF4Yr4PibJHImbLPf-6WYFVuIKu6zbNbY,614
7
- hydraflow-0.1.5.dist-info/METADATA,sha256=8mCKAA9KjcJAUiqP-DPdMl4Gcp3MSXxOF34VYKA2P8I,4224
8
- hydraflow-0.1.5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
9
- hydraflow-0.1.5.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
10
- hydraflow-0.1.5.dist-info/RECORD,,