hydraflow 0.17.0__py3-none-any.whl → 0.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hydraflow/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Integrate Hydra and MLflow to manage and track machine learning experiments."""
2
2
 
3
+ from hydraflow.core.collection import Collection
3
4
  from hydraflow.core.context import chdir_artifact, log_run, start_run
4
5
  from hydraflow.core.io import (
5
6
  get_artifact_dir,
@@ -14,6 +15,7 @@ from hydraflow.core.run import Run
14
15
  from hydraflow.core.run_collection import RunCollection
15
16
 
16
17
  __all__ = [
18
+ "Collection",
17
19
  "Run",
18
20
  "RunCollection",
19
21
  "chdir_artifact",
@@ -4,9 +4,10 @@ from __future__ import annotations
4
4
 
5
5
  from collections.abc import Hashable, Iterable, Sequence
6
6
  from dataclasses import MISSING
7
- from typing import TYPE_CHECKING, overload
7
+ from typing import TYPE_CHECKING, Concatenate, overload
8
8
 
9
9
  import numpy as np
10
+ from joblib.parallel import Parallel, delayed
10
11
  from omegaconf import ListConfig, OmegaConf
11
12
  from polars import DataFrame, Series
12
13
 
@@ -378,6 +379,77 @@ class Collection[I](Sequence[I]):
378
379
 
379
380
  return self[index]
380
381
 
382
+ def map[**P, R](
383
+ self,
384
+ function: Callable[Concatenate[I, P], R],
385
+ *args: P.args,
386
+ **kwargs: P.kwargs,
387
+ ) -> Iterator[R]:
388
+ """Apply a function to each item and return an iterator of results.
389
+
390
+ This is a memory-efficient mapping operation that lazily evaluates results.
391
+ Ideal for large collections where memory usage is a concern.
392
+
393
+ Args:
394
+ function: Function to apply to each item. The item is passed
395
+ as the first argument.
396
+ *args: Additional positional arguments to pass to the function.
397
+ **kwargs: Additional keyword arguments to pass to the function.
398
+
399
+ Returns:
400
+ Iterator[R]: An iterator of the function's results.
401
+
402
+ Examples:
403
+ ```python
404
+ # Process results one at a time
405
+ for result in collection.map(process_item, additional_arg):
406
+ handle_result(result)
407
+
408
+ # Convert to list if needed
409
+ results = list(collection.map(transform_item))
410
+ ```
411
+
412
+ """
413
+ yield from (function(i, *args, **kwargs) for i in self)
414
+
415
+ def pmap[**P, R](
416
+ self,
417
+ function: Callable[Concatenate[I, P], R],
418
+ n_jobs: int = -1,
419
+ backend: str = "multiprocessing",
420
+ *args: P.args,
421
+ **kwargs: P.kwargs,
422
+ ) -> list[R]:
423
+ """Apply a function to each item in parallel and return a list of results.
424
+
425
+ This method processes items concurrently for improved performance on
426
+ CPU-bound or I/O-bound operations, depending on the backend.
427
+
428
+ Args:
429
+ function: Function to apply to each item. The item is passed
430
+ as the first argument.
431
+ n_jobs (int): Number of jobs to run in parallel. -1 means using all
432
+ processors.
433
+ backend (str): Parallelization backend.
434
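+ *args: Additional positional arguments to pass to the function. Because `n_jobs` and `backend` precede `*args` in the signature, both must be supplied positionally before any extra positional arguments.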
435
+ **kwargs: Additional keyword arguments to pass to the function.
436
+
437
+ Returns:
438
+ list[R]: A list containing all results of the function applications.
439
+
440
+ Examples:
441
+ ```python
442
+ # Process all items in parallel using all cores
443
+ results = collection.pmap(heavy_computation)
444
+
445
+ # Specify number of parallel jobs and backend
446
+ results = collection.pmap(process_files, n_jobs=4, backend="threading")
447
+ ```
448
+
449
+ """
450
+ parallel = Parallel(n_jobs=n_jobs, backend=backend, return_as="list")
451
+ return parallel(delayed(function)(i, *args, **kwargs) for i in self) # type: ignore
452
+
381
453
  def to_frame(
382
454
  self,
383
455
  *keys: str,
@@ -409,7 +481,7 @@ class Collection[I](Sequence[I]):
409
481
  if not kwargs:
410
482
  return df
411
483
 
412
- columns = [Series(k, [v(r) for r in self]) for k, v in kwargs.items()]
484
+ columns = [Series(k, self.map(v)) for k, v in kwargs.items()]
413
485
  return df.with_columns(*columns)
414
486
 
415
487
  def group_by(self, *by: str) -> GroupBy[Self, I]:
hydraflow/core/context.py CHANGED
@@ -128,13 +128,12 @@ def chdir_artifact(run: Run) -> Iterator[Path]:
128
128
  run (Run): The run to get the artifact directory from.
129
129
 
130
130
  """
131
- curdir = Path.cwd()
131
+ current_dir = Path.cwd()
132
132
  artifact_dir = get_artifact_dir(run)
133
133
 
134
- os.chdir(artifact_dir)
135
-
136
134
  try:
135
+ os.chdir(artifact_dir)
137
136
  yield artifact_dir
138
137
 
139
138
  finally:
140
- os.chdir(curdir)
139
+ os.chdir(current_dir)
hydraflow/core/run.py CHANGED
@@ -3,7 +3,7 @@
3
3
  This module provides the Run class, which represents an MLflow
4
4
  Run in HydraFlow. A Run contains three main components:
5
5
 
6
- 1. info: Information about the run, such as run directory,
6
+ 1. info: Information about the run, which includes the run directory,
7
7
  run ID, and job name.
8
8
  2. cfg: Configuration loaded from the Hydra configuration file.
9
9
  3. impl: Implementation instance created by the provided
@@ -23,7 +23,9 @@ behavior based on the run's configuration.
23
23
  from __future__ import annotations
24
24
 
25
25
  import inspect
26
+ import os
26
27
  from collections.abc import Callable, Iterable
28
+ from contextlib import contextmanager
27
29
  from dataclasses import MISSING
28
30
  from functools import cached_property
29
31
  from pathlib import Path
@@ -34,6 +36,7 @@ from omegaconf import DictConfig, OmegaConf
34
36
  from .run_info import RunInfo
35
37
 
36
38
  if TYPE_CHECKING:
39
+ from collections.abc import Iterator
37
40
  from typing import Any, Self
38
41
 
39
42
  from .run_collection import RunCollection
@@ -122,7 +125,7 @@ class Run[C, I = None]:
122
125
  def load( # type: ignore
123
126
  cls,
124
127
  run_dir: str | Path,
125
- impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None, # type: ignore
128
+ impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
126
129
  ) -> Self: ...
127
130
 
128
131
  @overload
@@ -130,7 +133,7 @@ class Run[C, I = None]:
130
133
  def load(
131
134
  cls,
132
135
  run_dir: Iterable[str | Path],
133
- impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None, # type: ignore
136
+ impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
134
137
  *,
135
138
  n_jobs: int = 0,
136
139
  ) -> RunCollection[Self, I]: ...
@@ -139,7 +142,7 @@ class Run[C, I = None]:
139
142
  def load(
140
143
  cls,
141
144
  run_dir: str | Path | Iterable[str | Path],
142
- impl_factory: Callable[[Path], I] | Callable[[Path, C], I] = lambda _: None, # type: ignore
145
+ impl_factory: Callable[[Path], I] | Callable[[Path, C], I] | None = None,
143
146
  *,
144
147
  n_jobs: int = 0,
145
148
  ) -> Self | RunCollection[Self, I]:
@@ -149,11 +152,11 @@ class Run[C, I = None]:
149
152
  run_dir (str | Path | Iterable[str | Path]): The directory where the
150
153
  MLflow runs are stored, either as a string, a Path instance,
151
154
  or an iterable of them.
152
- impl_factory (Callable[[Path], I] | Callable[[Path, C], I]): A factory
153
- function that creates the implementation instance. It can accept
154
- either just the artifacts directory path, or both the path and
155
- the configuration instance. Defaults to a function that returns
156
- None.
155
+ impl_factory (Callable[[Path], I] | Callable[[Path, C], I] | None):
156
+ A factory function that creates the implementation instance. It
157
+ can accept either just the artifacts directory path, or both the
158
+ path and the configuration instance. Defaults to None, in which
159
+ case a function that returns None is used.
157
160
  n_jobs (int): The number of parallel jobs. If 0 (default), runs
158
161
  sequentially. If -1, uses all available CPU cores.
159
162
 
@@ -284,10 +287,11 @@ class Run[C, I = None]:
284
287
 
285
288
  Note:
286
289
  The search order for keys is:
287
- 1. Configuration (cfg)
288
- 2. Implementation (impl)
289
- 3. Run information (info)
290
- 4. Run object itself (self)
290
+
291
+ 1. Configuration (`cfg`)
292
+ 2. Implementation (`impl`)
293
+ 3. Run information (`info`)
294
+ 4. Run object itself (`self`)
291
295
 
292
296
  """
293
297
  key = key.replace("__", ".")
@@ -298,7 +302,7 @@ class Run[C, I = None]:
298
302
 
299
303
  for attr in [self.impl, self.info, self]:
300
304
  value = getattr(attr, key, MISSING)
301
- if value is not MISSING:
305
+ if value is not MISSING and not callable(value):
302
306
  return value
303
307
 
304
308
  if default is not MISSING:
@@ -332,6 +336,70 @@ class Run[C, I = None]:
332
336
 
333
337
  return standard_dict
334
338
 
339
+ @contextmanager
340
+ def chdir(self, relative_dir: str = "") -> Iterator[Path]:
341
+ """Change the current working directory to the artifact directory.
342
+
343
+ This context manager changes the current working directory
344
+ to the artifact directory of the run.
345
+ It ensures that the directory is changed back
346
+ to the original directory after the context is exited.
347
+
348
+ Args:
349
+ relative_dir (str): A subdirectory of the artifact directory to
350
+ change into. Defaults to an empty string (the artifact directory itself).
351
+
352
+ Yields:
353
+ Path: The directory that was changed into.
354
+
355
+ """
356
+ artifacts_dir = self.info.run_dir / "artifacts" / relative_dir
357
+ current_dir = Path.cwd()
358
+
359
+ try:
360
+ os.chdir(artifacts_dir)
361
+ yield artifacts_dir
362
+
363
+ finally:
364
+ os.chdir(current_dir)
365
+
366
+ def path(self, relative_path: str = "") -> Path:
367
+ """Return a path inside the artifact directory.
368
+
369
+ Args:
370
+ relative_path (str): The path relative to the artifact directory.
371
+
372
+ Returns:
373
+ Path: The artifact directory joined with `relative_path`.
374
+
375
+ """
376
+ return self.info.run_dir / "artifacts" / relative_path
377
+
378
+ def iterdir(self, relative_dir: str = "") -> Iterator[Path]:
379
+ """Iterate over the entries of the run's artifact directory.
380
+
381
+ Args:
382
+ relative_dir (str): The relative directory to iterate over.
383
+
384
+ Yields:
385
+ Path: Each entry in the specified directory.
386
+
387
+ """
388
+ yield from self.path(relative_dir).iterdir()
389
+
390
+ def glob(self, pattern: str, relative_dir: str = "") -> Iterator[Path]:
391
+ """Glob the artifact directories for the run.
392
+
393
+ Args:
394
+ pattern (str): The pattern to glob.
395
+ relative_dir (str): The relative directory to glob.
396
+
397
+ Yields:
398
+ Path: The existing artifact paths that match the pattern.
399
+
400
+ """
401
+ yield from self.path(relative_dir).glob(pattern)
402
+
335
403
 
336
404
  def _flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
337
405
  items = []
@@ -20,10 +20,13 @@ Example:
20
20
  # Sort runs by specific keys
21
21
  sorted_runs = runs.sort("metrics.accuracy", reverse=True)
22
22
 
23
- # Group runs by model type and compute aggregates
24
- grouped = runs.group_by("model.type",
25
- avg_acc=lambda rc: sum(r.get("metrics.accuracy")
26
- for r in rc) / len(rc))
23
+ # Group runs by model type
24
+ grouped = runs.group_by("model.type")
25
+
26
+ # Compute aggregates on grouped data
27
+ metrics_df = grouped.agg(
28
+ avg_acc=lambda rc: sum(r.get("metrics.accuracy") for r in rc) / len(rc)
29
+ )
27
30
 
28
31
  # Convert runs to a DataFrame for analysis
29
32
  df = runs.to_frame("run_id", "model.type", "metrics.accuracy")
@@ -44,7 +47,8 @@ from .collection import Collection
44
47
  from .run import Run
45
48
 
46
49
  if TYPE_CHECKING:
47
- from collections.abc import Callable, Iterable
50
+ from collections.abc import Callable, Iterable, Iterator
51
+ from pathlib import Path
48
52
  from typing import Any, Self
49
53
 
50
54
 
@@ -166,10 +170,46 @@ class RunCollection[R: Run[Any, Any], I = None](Collection[R]):
166
170
 
167
171
  @cached_property
168
172
  def impls(self) -> Collection[I]:
169
- """Get the implementation object for all runs in the collection.
173
+ """Get the implementation objects for all runs in the collection.
170
174
 
171
175
  Returns:
172
- Collection[Any]: A collection of implementation objects for all runs.
176
+ Collection[I]: A collection of implementation objects for all runs.
173
177
 
174
178
  """
175
179
  return Collection(run.impl for run in self)
180
+
181
+ def iterdir(self, relative_dir: str = "") -> Iterator[Path]:
182
+ """Iterate over the artifact directories for all runs in the collection.
183
+
184
+ This method yields all files and directories in the specified
185
+ relative directory for each run in the collection.
186
+
187
+ Args:
188
+ relative_dir (str): The relative directory within the artifacts
189
+ directory to iterate over.
190
+
191
+ Yields:
192
+ Path: Each path in the specified directory for each run
193
+ in the collection.
194
+
195
+ """
196
+ for run in self:
197
+ yield from run.path(relative_dir).iterdir()
198
+
199
+ def glob(self, pattern: str, relative_dir: str = "") -> Iterator[Path]:
200
+ """Glob the artifact directories for all runs in the collection.
201
+
202
+ This method yields all paths matching the specified pattern
203
+ in the relative directory for each run in the collection.
204
+
205
+ Args:
206
+ pattern (str): The glob pattern to match files or directories.
207
+ relative_dir (str): The relative directory within the artifacts
208
+ directory to search in.
209
+
210
+ Yields:
211
+ Path: Each path matching the pattern for each run in the collection.
212
+
213
+ """
214
+ for run in self:
215
+ yield from run.path(relative_dir).glob(pattern)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hydraflow
3
- Version: 0.17.0
3
+ Version: 0.17.2
4
4
  Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
5
5
  Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
6
6
  Project-URL: Source, https://github.com/daizutabi/hydraflow
@@ -1,14 +1,14 @@
1
- hydraflow/__init__.py,sha256=8UraqH00Qp0In301ZUmQBRTIGbV1L5zSZACOUlIRPn8,727
1
+ hydraflow/__init__.py,sha256=_cLLokEv0pUlwvG8RMnjOwCTtDQBs0-RgGbtDk5m_Xg,794
2
2
  hydraflow/cli.py,sha256=3rGr___wwp8KazjLGQ7JO_IgAMqLyMlcVSs_QJK7g0Y,3135
3
3
  hydraflow/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  hydraflow/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- hydraflow/core/collection.py,sha256=tUdjV_v4vzUHSNET-Z7a_8k5oXoH6nkZ_0OxZ-u8_nI,16791
6
- hydraflow/core/context.py,sha256=igE17oQESGjH-sBnICI8HkZbngY_crkHTgx2E-YkmEo,4155
5
+ hydraflow/core/collection.py,sha256=RSYgS4VsGjSm0Inrz4GAng_jmm-Ct_VSDmZ9rvKFQQw,19472
6
+ hydraflow/core/context.py,sha256=6vpwe0Xfl6mzh2hHLE-4uB9Hjew-CK4pA0KFihQ80U8,4168
7
7
  hydraflow/core/group_by.py,sha256=Pnw-oA5aXHeRG9lMLz-bKc8drqQ8LIRsWzvVn153iyQ,5488
8
8
  hydraflow/core/io.py,sha256=B3-jPuJWttRgpbIpy_XA-Z2qpXzNF1ATwyYEwA7Pv3w,5172
9
9
  hydraflow/core/main.py,sha256=pgr2b9A4VoZuwbApE71NElmV64MFJv8UKda05q4uCqk,6010
10
- hydraflow/core/run.py,sha256=VQfS3DkAR2GBWdltmlD0XMStiOUo1YZiRONm-mPW2x4,11948
11
- hydraflow/core/run_collection.py,sha256=4YjnAmB4lpGxTnlHzZOIwEXNfdI5yU5cj3PRiCW6vuA,5439
10
+ hydraflow/core/run.py,sha256=Kbq4s47f6KDNeyNUwrUpW55FrWlf5CCpmdgVCMakU2g,14046
11
+ hydraflow/core/run_collection.py,sha256=sdbkjs01ougaqXlp88gGC10TmO_7s-UEQozLl0jMI4Y,6771
12
12
  hydraflow/core/run_info.py,sha256=SMOTZXEa7OBV_XjTyctk5gJGrggmYwhePvRF8CLF1kU,1616
13
13
  hydraflow/executor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  hydraflow/executor/aio.py,sha256=xXsmBPIPdBlopv_1h0FdtOvoKUcuW7PQeKCV2d_lN9I,2122
@@ -16,8 +16,8 @@ hydraflow/executor/conf.py,sha256=8Xq4UAenRKJIl1NBgNbSfv6VUTJhdwPLayZIEAsiBR0,41
16
16
  hydraflow/executor/io.py,sha256=18wnHpCMQRGYL-oN2841h9W2aSW_X2SmO68Lx-3FIbU,1043
17
17
  hydraflow/executor/job.py,sha256=6QeJ18OMeocXeM04rCYL46GgArfX1SvZs9_4HTomTgE,5436
18
18
  hydraflow/executor/parser.py,sha256=RxP8qpDaJ8VLqZ51VlPFyVitWctObhkE_3iPIsY66Cs,14610
19
- hydraflow-0.17.0.dist-info/METADATA,sha256=f9LHLgsZMEiTl1CusfZQHUSv6rlz8DfL78EoMfheCBA,7535
20
- hydraflow-0.17.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
21
- hydraflow-0.17.0.dist-info/entry_points.txt,sha256=XI0khPbpCIUo9UPqkNEpgh-kqK3Jy8T7L2VCWOdkbSM,48
22
- hydraflow-0.17.0.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
23
- hydraflow-0.17.0.dist-info/RECORD,,
19
+ hydraflow-0.17.2.dist-info/METADATA,sha256=zEjD1acRRed6Le0G8-KjkWtUYaxXKB6JO9T6StRNkVM,7535
20
+ hydraflow-0.17.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
21
+ hydraflow-0.17.2.dist-info/entry_points.txt,sha256=XI0khPbpCIUo9UPqkNEpgh-kqK3Jy8T7L2VCWOdkbSM,48
22
+ hydraflow-0.17.2.dist-info/licenses/LICENSE,sha256=IGdDrBPqz1O0v_UwCW-NJlbX9Hy9b3uJ11t28y2srmY,1062
23
+ hydraflow-0.17.2.dist-info/RECORD,,