hydraflow 0.2.6__tar.gz → 0.2.8__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. {hydraflow-0.2.6 → hydraflow-0.2.8}/PKG-INFO +1 -1
  2. {hydraflow-0.2.6 → hydraflow-0.2.8}/pyproject.toml +1 -1
  3. {hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/__init__.py +4 -4
  4. {hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/context.py +24 -8
  5. hydraflow-0.2.8/src/hydraflow/info.py +116 -0
  6. hydraflow-0.2.8/src/hydraflow/mlflow.py +175 -0
  7. hydraflow-0.2.8/src/hydraflow/progress.py +131 -0
  8. hydraflow-0.2.6/src/hydraflow/runs.py → hydraflow-0.2.8/src/hydraflow/run_collection.py +117 -154
  9. hydraflow-0.2.6/tests/scripts/log_run.py → hydraflow-0.2.8/tests/scripts/app.py +21 -3
  10. hydraflow-0.2.8/tests/test_app.py +109 -0
  11. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_context.py +2 -2
  12. hydraflow-0.2.8/tests/test_info.py +64 -0
  13. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_log_run.py +6 -4
  14. hydraflow-0.2.8/tests/test_progress.py +12 -0
  15. hydraflow-0.2.6/tests/test_runs.py → hydraflow-0.2.8/tests/test_run_collection.py +45 -41
  16. hydraflow-0.2.6/src/hydraflow/mlflow.py +0 -124
  17. hydraflow-0.2.6/src/hydraflow/progress.py +0 -56
  18. hydraflow-0.2.6/tests/scripts/progress.py +0 -22
  19. hydraflow-0.2.6/tests/test_progress.py +0 -0
  20. {hydraflow-0.2.6 → hydraflow-0.2.8}/.devcontainer/devcontainer.json +0 -0
  21. {hydraflow-0.2.6 → hydraflow-0.2.8}/.devcontainer/postCreate.sh +0 -0
  22. {hydraflow-0.2.6 → hydraflow-0.2.8}/.devcontainer/starship.toml +0 -0
  23. {hydraflow-0.2.6 → hydraflow-0.2.8}/.gitattributes +0 -0
  24. {hydraflow-0.2.6 → hydraflow-0.2.8}/.gitignore +0 -0
  25. {hydraflow-0.2.6 → hydraflow-0.2.8}/LICENSE +0 -0
  26. {hydraflow-0.2.6 → hydraflow-0.2.8}/README.md +0 -0
  27. {hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/asyncio.py +0 -0
  28. {hydraflow-0.2.6 → hydraflow-0.2.8}/src/hydraflow/config.py +0 -0
  29. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/scripts/__init__.py +0 -0
  30. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/scripts/watch.py +0 -0
  31. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_asyncio.py +0 -0
  32. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_config.py +0 -0
  33. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_mlflow.py +0 -0
  34. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_version.py +0 -0
  35. {hydraflow-0.2.6 → hydraflow-0.2.8}/tests/test_watch.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: hydraflow
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments.
5
5
  Project-URL: Documentation, https://github.com/daizutabi/hydraflow
6
6
  Project-URL: Source, https://github.com/daizutabi/hydraflow
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hydraflow"
7
- version = "0.2.6"
7
+ version = "0.2.8"
8
8
  description = "Hydraflow integrates Hydra and MLflow to manage and track machine learning experiments."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1,11 +1,11 @@
1
1
  from .context import chdir_artifact, log_run, start_run, watch
2
- from .mlflow import get_artifact_dir, get_hydra_output_dir, set_experiment
3
- from .runs import (
4
- RunCollection,
2
+ from .info import get_artifact_dir, get_hydra_output_dir, load_config
3
+ from .mlflow import (
5
4
  list_runs,
6
- load_config,
7
5
  search_runs,
6
+ set_experiment,
8
7
  )
8
+ from .run_collection import RunCollection
9
9
 
10
10
  __all__ = [
11
11
  "RunCollection",
@@ -14,10 +14,11 @@ from typing import TYPE_CHECKING
14
14
 
15
15
  import mlflow
16
16
  from hydra.core.hydra_config import HydraConfig
17
- from watchdog.events import FileModifiedEvent, FileSystemEventHandler
17
+ from watchdog.events import FileModifiedEvent, PatternMatchingEventHandler
18
18
  from watchdog.observers import Observer
19
19
 
20
- from hydraflow.mlflow import get_artifact_dir, log_params
20
+ from hydraflow.info import get_artifact_dir
21
+ from hydraflow.mlflow import log_params
21
22
 
22
23
  if TYPE_CHECKING:
23
24
  from collections.abc import Callable, Iterator
@@ -68,7 +69,7 @@ def log_run(
68
69
  mlflow.log_artifact(local_path)
69
70
 
70
71
  try:
71
- with watch(log_artifact, output_dir):
72
+ with watch(log_artifact, output_dir, ignore_log=False):
72
73
  yield
73
74
 
74
75
  except Exception as e:
@@ -140,9 +141,11 @@ def start_run(
140
141
 
141
142
  @contextmanager
142
143
  def watch(
143
- func: Callable[[Path], None],
144
+ callback: Callable[[Path], None],
144
145
  dir: Path | str = "",
145
146
  timeout: int = 60,
147
+ ignore_patterns: list[str] | None = None,
148
+ ignore_log: bool = True,
146
149
  ) -> Iterator[None]:
147
150
  """
148
151
  Watch the given directory for changes and call the provided function
@@ -154,7 +157,7 @@ def watch(
154
157
  period or until the context is exited.
155
158
 
156
159
  Args:
157
- func (Callable[[Path], None]): The function to call when a change is
160
+ callback (Callable[[Path], None]): The function to call when a change is
158
161
  detected. It should accept a single argument of type `Path`,
159
162
  which is the path of the modified file.
160
163
  dir (Path | str): The directory to watch. If not specified,
@@ -174,7 +177,7 @@ def watch(
174
177
  if isinstance(dir, Path):
175
178
  dir = dir.as_posix()
176
179
 
177
- handler = Handler(func)
180
+ handler = Handler(callback, ignore_patterns=ignore_patterns, ignore_log=ignore_log)
178
181
  observer = Observer()
179
182
  observer.schedule(handler, dir, recursive=True)
180
183
  observer.start()
@@ -198,10 +201,23 @@ def watch(
198
201
  observer.join()
199
202
 
200
203
 
201
- class Handler(FileSystemEventHandler):
202
- def __init__(self, func: Callable[[Path], None]) -> None:
204
+ class Handler(PatternMatchingEventHandler):
205
def __init__(
    self,
    func: Callable[[Path], None],
    ignore_patterns: list[str] | None = None,
    ignore_log: bool = True,
) -> None:
    """
    Initialize the handler.

    Args:
        func (Callable[[Path], None]): The callback stored on the handler
            as `self.func`.
        ignore_patterns (list[str] | None): Patterns forwarded to the
            base handler as `ignore_patterns`. The list passed in is
            never modified.
        ignore_log (bool): If True, `"*.log"` is added to the ignore
            patterns. Defaults to True.
    """
    self.func = func

    # Work on a copy: appending to the caller's list would leak "*.log"
    # into it and accumulate duplicates when the list is reused.
    patterns = list(ignore_patterns) if ignore_patterns else []
    if ignore_log and "*.log" not in patterns:
        patterns.append("*.log")

    # An empty list carries no patterns; pass None in that case.
    super().__init__(ignore_patterns=patterns or None)
220
+
205
221
  def on_modified(self, event: FileModifiedEvent) -> None:
206
222
  file = Path(str(event.src_path))
207
223
  if file.is_file():
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING
5
+
6
+ import mlflow
7
+ from hydra.core.hydra_config import HydraConfig
8
+ from mlflow.tracking import artifact_utils
9
+ from omegaconf import DictConfig, OmegaConf
10
+
11
+ if TYPE_CHECKING:
12
+ from mlflow.entities import Run
13
+
14
+ from hydraflow.run_collection import RunCollection
15
+
16
+
17
class RunCollectionInfo:
    """
    Expose per-run attributes of a run collection as parallel lists.

    Each property iterates over the wrapped collection and returns one
    entry per run, in iteration order.
    """

    def __init__(self, runs: RunCollection):
        # The wrapped collection; only iterated, never modified here.
        self._runs = runs

    @property
    def run_id(self) -> list[str]:
        """The `info.run_id` of every run."""
        return [item.info.run_id for item in self._runs]

    @property
    def params(self) -> list[dict[str, str]]:
        """The `data.params` mapping of every run."""
        return [item.data.params for item in self._runs]

    @property
    def metrics(self) -> list[dict[str, float]]:
        """The `data.metrics` mapping of every run."""
        return [item.data.metrics for item in self._runs]

    @property
    def artifact_uri(self) -> list[str | None]:
        """The `info.artifact_uri` of every run."""
        return [item.info.artifact_uri for item in self._runs]

    @property
    def artifact_dir(self) -> list[Path]:
        """The result of `get_artifact_dir` for every run."""
        return list(map(get_artifact_dir, self._runs))

    @property
    def config(self) -> list[DictConfig]:
        """The result of `load_config` for every run."""
        return list(map(load_config, self._runs))
44
+
45
+
46
def get_artifact_dir(run: Run | None = None) -> Path:
    """
    Return the local artifact directory of a run.

    The artifact URI is resolved through MLflow and passed to
    `mlflow.artifacts.download_artifacts`; the path it returns is
    wrapped in a `Path`.

    Args:
        run (Run | None): The run whose artifact URI is looked up by its
            run ID. If None, `mlflow.get_artifact_uri()` is used instead.
            Defaults to None.

    Returns:
        Path: The local path returned by the artifact download.
    """
    uri = (
        mlflow.get_artifact_uri()
        if run is None
        else artifact_utils.get_artifact_uri(run.info.run_id)
    )
    local_path = mlflow.artifacts.download_artifacts(uri)
    return Path(local_path)
64
+
65
+
66
def get_hydra_output_dir(run: Run | None = None) -> Path:
    """
    Retrieve the Hydra output directory for the given run.

    If no run is provided, the output directory is read from the current
    Hydra configuration. If a run is provided, `.hydra/hydra.yaml` is
    loaded from the run's artifact directory and the output directory
    recorded in that file is returned.

    Args:
        run (Run | None): The run object. Defaults to None.

    Returns:
        Path: The path to the Hydra output directory.

    Raises:
        FileNotFoundError: If `.hydra/hydra.yaml` is not found in the
            run's artifact directory. The message names the missing path.
    """
    if run is None:
        return Path(HydraConfig.get().runtime.output_dir)

    path = get_artifact_dir(run) / ".hydra/hydra.yaml"

    if not path.exists():
        # Name the missing file so the failure can be diagnosed.
        msg = f"Hydra configuration file not found: {path}"
        raise FileNotFoundError(msg)

    hc = OmegaConf.load(path)
    return Path(hc.hydra.runtime.output_dir)
97
+
98
+
99
def load_config(run: Run) -> DictConfig:
    """
    Load the configuration for a given run.

    The configuration is read with OmegaConf from `.hydra/config.yaml`
    inside the run's artifact directory (as returned by
    `get_artifact_dir`).

    Args:
        run (Run): The Run instance for which to load the configuration.

    Returns:
        The loaded configuration as a DictConfig object. Returns an empty
        DictConfig if the configuration file is not found.
    """
    path = get_artifact_dir(run) / ".hydra/config.yaml"

    # Honor the documented contract: a run without a stored Hydra config
    # yields an empty config rather than an error from OmegaConf.load.
    if not path.exists():
        return DictConfig({})

    return OmegaConf.load(path)  # type: ignore
@@ -0,0 +1,175 @@
1
+ """
2
+ This module provides functionality to log parameters from Hydra configuration objects
3
+ and set up experiments using MLflow. It includes methods for managing experiments,
4
+ searching for runs, and logging parameters and artifacts.
5
+
6
+ Key Features:
7
+ - **Experiment Management**: Set and manage MLflow experiments with customizable names
8
+ based on Hydra configuration.
9
+ - **Run Logging**: Log parameters and metrics from Hydra configuration objects to
10
+ MLflow, ensuring that all relevant information is captured during experiments.
11
+ - **Run Search**: Search for runs based on various criteria, allowing for flexible
12
+ retrieval of experiment results.
13
+ - **Artifact Management**: Retrieve and log artifacts associated with runs, facilitating
14
+ easy access to outputs generated during experiments.
15
+
16
+ This module is designed to integrate seamlessly with Hydra, providing a robust
17
+ solution for tracking machine learning experiments and their associated metadata.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from pathlib import Path
23
+ from typing import TYPE_CHECKING
24
+
25
+ import mlflow
26
+ from hydra.core.hydra_config import HydraConfig
27
+ from mlflow.entities import ViewType
28
+ from mlflow.tracking.fluent import SEARCH_MAX_RESULTS_PANDAS
29
+
30
+ from hydraflow.config import iter_params
31
+ from hydraflow.run_collection import RunCollection
32
+
33
+ if TYPE_CHECKING:
34
+ from mlflow.entities.experiment import Experiment
35
+
36
+
37
def set_experiment(
    prefix: str = "",
    suffix: str = "",
    uri: str | Path | None = None,
) -> Experiment:
    """
    Activate an MLflow experiment named after the current Hydra job.

    The experiment name is the Hydra job name with `prefix` prepended
    and `suffix` appended. When `uri` is given, it is set as the MLflow
    tracking URI before the experiment is selected.

    Args:
        prefix (str): Text placed before the job name.
        suffix (str): Text placed after the job name.
        uri (str | Path | None): The tracking URI to use. If None, the
            tracking URI is left untouched. Defaults to None.

    Returns:
        Experiment: The value returned by `mlflow.set_experiment`.
    """
    if uri is not None:
        mlflow.set_tracking_uri(uri)

    job_name = HydraConfig.get().job.name
    experiment_name = f"{prefix}{job_name}{suffix}"
    return mlflow.set_experiment(experiment_name)
64
+
65
+
66
def log_params(config: object, *, synchronous: bool | None = None) -> None:
    """
    Log every parameter of a configuration object to MLflow.

    The (key, value) pairs produced by `iter_params(config)` are each
    sent to `mlflow.log_param`.

    Args:
        config (object): The configuration object to take parameters from.
        synchronous (bool | None): Forwarded unchanged to
            `mlflow.log_param`. Defaults to None.
    """
    for name, param in iter_params(config):
        mlflow.log_param(name, param, synchronous=synchronous)
81
+
82
+
83
def search_runs(
    experiment_ids: list[str] | None = None,
    filter_string: str = "",
    run_view_type: int = ViewType.ACTIVE_ONLY,
    max_results: int = SEARCH_MAX_RESULTS_PANDAS,
    order_by: list[str] | None = None,
    search_all_experiments: bool = False,
    experiment_names: list[str] | None = None,
) -> RunCollection:
    """
    Search for Runs that fit the specified criteria.

    This function wraps `mlflow.search_runs`, requesting the results as a
    list, and returns them as a `RunCollection` object.

    Note:
        The returned runs are sorted by their start time in ascending
        order, regardless of `order_by`.

    Args:
        experiment_ids (list[str] | None): List of experiment IDs. Search
            can work with experiment IDs or experiment names, but not both
            in the same call. Values other than ``None`` or ``[]`` will
            result in error if ``experiment_names`` is also not ``None``
            or ``[]``. ``None`` will default to the active experiment if
            ``experiment_names`` is ``None`` or ``[]``.
        filter_string (str): Filter query string, defaults to searching
            all runs.
        run_view_type (int): One of enum values ``ACTIVE_ONLY``,
            ``DELETED_ONLY``, or ``ALL`` runs defined in
            :py:class:`mlflow.entities.ViewType`.
        max_results (int): The maximum number of runs to retrieve.
            Defaults to ``SEARCH_MAX_RESULTS_PANDAS``.
        order_by (list[str] | None): List of columns to order by (e.g.,
            "metrics.rmse"). The ``order_by`` column can contain an
            optional ``DESC`` or ``ASC`` value. The default is ``ASC``.
            The default ordering is to sort by ``start_time DESC``, then
            ``run_id``.
        search_all_experiments (bool): Boolean specifying whether all
            experiments should be searched. Only honored if
            ``experiment_ids`` is ``[]`` or ``None``.
        experiment_names (list[str] | None): List of experiment names.
            Search can work with experiment IDs or experiment names, but
            not both in the same call. Values other than ``None`` or
            ``[]`` will result in error if ``experiment_ids`` is also not
            ``None`` or ``[]``. ``None`` will default to the active
            experiment if ``experiment_ids`` is ``None`` or ``[]``.

    Returns:
        A `RunCollection` object containing the search results.
    """
    found = mlflow.search_runs(
        experiment_ids=experiment_ids,
        filter_string=filter_string,
        run_view_type=run_view_type,
        max_results=max_results,
        order_by=order_by,
        output_format="list",
        search_all_experiments=search_all_experiments,
        experiment_names=experiment_names,
    )

    # Present the runs oldest first.
    ordered = sorted(found, key=lambda item: item.info.start_time)  # type: ignore
    return RunCollection(ordered)  # type: ignore
147
+
148
+
149
def list_runs(experiment_names: list[str] | None = None) -> RunCollection:
    """
    List all runs for the specified experiments.

    The work is delegated to `search_runs`. An empty list is first
    expanded to the names of every experiment returned by
    `mlflow.search_experiments()` except the one named "Default"; any
    other value (including None) is forwarded unchanged.

    Note:
        The returned runs are sorted by their start time in ascending
        order.

    Args:
        experiment_names (list[str] | None): List of experiment names to
            search for runs. If None, the currently active experiment is
            searched. If an empty list, all experiments except "Default"
            are searched.

    Returns:
        A `RunCollection` object containing the runs for the specified
        experiments.
    """
    if experiment_names == []:
        experiment_names = [
            experiment.name
            for experiment in mlflow.search_experiments()
            if experiment.name != "Default"
        ]

    return search_runs(experiment_names=experiment_names)
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import joblib
6
+ from rich.progress import Progress
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Iterable
10
+
11
+ from rich.progress import ProgressColumn
12
+
13
+
14
def multi_task_progress(
    iterables: Iterable[Iterable[int | tuple[int, int]]],
    *columns: ProgressColumn | str,
    n_jobs: int = -1,
    description: str = "#{:0>3}",
    main_description: str = "main",
    transient: bool | None = None,
    **kwargs,
) -> None:
    """
    Render auto-updating progress bars for multiple tasks concurrently.

    Each iterable is consumed to completion; every value it yields
    updates that task's progress bar. With more than one iterable, the
    tasks run in joblib worker threads and an extra "main" bar shows the
    aggregate. With a single iterable, it is consumed in the calling
    thread and no main bar is shown. Nothing is rendered if `iterables`
    is empty.

    Args:
        iterables (Iterable[Iterable[int | tuple[int, int]]]): A collection
            of iterables, each representing a task. Each iterable can yield
            integers (zero-based index of the completed step) or tuples of
            integers (index, total).
        *columns (ProgressColumn | str): Columns to display in the
            progress bars. Defaults to `Progress.get_default_columns()`.
        n_jobs (int, optional): Number of jobs passed to `joblib.Parallel`.
            Defaults to -1.
        description (str, optional): Format string for describing tasks;
            formatted with the task index. Defaults to "#{:0>3}".
        main_description (str, optional): Description for the main task.
            Defaults to "main".
        transient (bool | None, optional): Whether to remove the progress
            bar after completion. None is treated as False. Defaults to
            None.
        **kwargs: Additional keyword arguments passed to the Progress
            instance.

    Returns:
        None
    """
    if not columns:
        columns = Progress.get_default_columns()

    iterables = list(iterables)
    n = len(iterables)

    # Nothing to track; avoids indexing a task that does not exist.
    if n == 0:
        return

    with Progress(*columns, transient=transient or False, **kwargs) as progress:
        task_main = progress.add_task(main_description, total=None) if n > 1 else None
        tasks = [
            progress.add_task(description.format(i), start=False, total=None)
            for i in range(n)
        ]

        # Pre-sized, one slot per task: worker threads only ever assign to
        # their own slot, so the containers never change size while another
        # thread is reading them.
        total: list[int | None] = [None] * n
        completed = [0] * n

        def func(i: int) -> None:
            progress.start_task(tasks[i])

            for index in iterables[i]:
                if isinstance(index, tuple):
                    completed[i], total[i] = index[0] + 1, index[1]
                else:
                    completed[i] = index + 1

                progress.update(tasks[i], total=total[i], completed=completed[i])

                if task_main is not None:
                    # The aggregate total is only meaningful once every
                    # task has reported its own total.
                    known = list(total)
                    t = None if None in known else sum(known)
                    progress.update(task_main, total=t, completed=sum(completed))

            if transient or n > 1:
                progress.remove_task(tasks[i])

        if n > 1:
            it = (joblib.delayed(func)(i) for i in range(n))
            joblib.Parallel(n_jobs, prefer="threads")(it)

        else:
            func(0)
90
+
91
+
92
if __name__ == "__main__":
    # Manual demo: renders several progress-bar configurations in turn.
    import random
    import time

    from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TimeElapsedColumn

    from hydraflow.progress import multi_task_progress

    def task(total):
        """Yield step indices (with `total` when it is known), pausing briefly."""
        steps = total or 90
        for step in range(steps):
            yield step if total is None else (step, total)
            time.sleep(random.random() / 30)

    def multi_task_progress_test(unknown_total: bool):
        """Run four sized tasks, optionally flanked by two tasks of unknown size."""
        tasks = [task(random.randint(80, 100)) for _ in range(4)]
        if unknown_total:
            tasks = [task(None), *tasks, task(None)]

        columns = [
            SpinnerColumn(),
            *Progress.get_default_columns(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
        ]

        kwargs = {"main_description": "unknown"} if unknown_total else {}

        multi_task_progress(tasks, *columns, n_jobs=4, **kwargs)

    multi_task_progress_test(False)
    multi_task_progress_test(True)
    multi_task_progress([task(100)])
    multi_task_progress([task(None)], description="unknown")
    multi_task_progress([task(100), task(None)], main_description="transient", transient=True)
    multi_task_progress([task(100)], description="transient", transient=True)