hydraflow 0.16.2__tar.gz → 0.17.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. {hydraflow-0.16.2 → hydraflow-0.17.1}/PKG-INFO +1 -1
  2. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/index.md +20 -5
  3. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-class.md +9 -6
  4. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-collection.md +43 -100
  5. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/advanced.md +1 -1
  6. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/analysis.md +9 -6
  7. {hydraflow-0.16.2 → hydraflow-0.17.1}/pyproject.toml +1 -1
  8. hydraflow-0.17.1/src/hydraflow/core/collection.py +613 -0
  9. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/context.py +3 -4
  10. hydraflow-0.17.1/src/hydraflow/core/group_by.py +205 -0
  11. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/run.py +111 -62
  12. hydraflow-0.17.1/src/hydraflow/core/run_collection.py +215 -0
  13. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/run_info.py +0 -9
  14. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run.py +50 -41
  15. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run_collection.py +24 -68
  16. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run_info.py +0 -8
  17. hydraflow-0.17.1/tests/core/test_collection.py +298 -0
  18. hydraflow-0.17.1/tests/core/test_group_by.py +125 -0
  19. hydraflow-0.16.2/src/hydraflow/core/run_collection.py +0 -632
  20. {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/devcontainer.json +0 -0
  21. {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/postCreate.sh +0 -0
  22. {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/starship.toml +0 -0
  23. {hydraflow-0.16.2 → hydraflow-0.17.1}/.gitattributes +0 -0
  24. {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/ci.yaml +0 -0
  25. {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/docs.yaml +0 -0
  26. {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/publish.yaml +0 -0
  27. {hydraflow-0.16.2 → hydraflow-0.17.1}/.gitignore +0 -0
  28. {hydraflow-0.16.2 → hydraflow-0.17.1}/LICENSE +0 -0
  29. {hydraflow-0.16.2 → hydraflow-0.17.1}/README.md +0 -0
  30. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/concepts.md +0 -0
  31. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/index.md +0 -0
  32. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/installation.md +0 -0
  33. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/index.md +0 -0
  34. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/configuration.md +0 -0
  35. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/execution.md +0 -0
  36. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/index.md +0 -0
  37. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/main-decorator.md +0 -0
  38. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/index.md +0 -0
  39. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/job-configuration.md +0 -0
  40. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/sweep-syntax.md +0 -0
  41. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/updating-runs.md +0 -0
  42. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/applications.md +0 -0
  43. {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/index.md +0 -0
  44. {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/example.py +0 -0
  45. {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/hydraflow.yaml +0 -0
  46. {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/submit.py +0 -0
  47. {hydraflow-0.16.2 → hydraflow-0.17.1}/mkdocs.yaml +0 -0
  48. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/__init__.py +0 -0
  49. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/cli.py +0 -0
  50. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/__init__.py +0 -0
  51. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/io.py +0 -0
  52. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/main.py +0 -0
  53. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/__init__.py +0 -0
  54. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/aio.py +0 -0
  55. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/conf.py +0 -0
  56. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/io.py +0 -0
  57. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/job.py +0 -0
  58. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/parser.py +0 -0
  59. {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/py.typed +0 -0
  60. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/__init__.py +0 -0
  61. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/__init__.py +0 -0
  62. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/app.py +0 -0
  63. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/conftest.py +0 -0
  64. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/hydraflow.yaml +0 -0
  65. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/submit.py +0 -0
  66. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_run.py +0 -0
  67. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_setup.py +0 -0
  68. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_show.py +0 -0
  69. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_version.py +0 -0
  70. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/conftest.py +0 -0
  71. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/__init__.py +0 -0
  72. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/__init__.py +0 -0
  73. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/chdir.py +0 -0
  74. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/log_run.py +0 -0
  75. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/start_run.py +0 -0
  76. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_chdir.py +0 -0
  77. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_log_run.py +0 -0
  78. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_start_run.py +0 -0
  79. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/__init__.py +0 -0
  80. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/default.py +0 -0
  81. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/force_new_run.py +0 -0
  82. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/match_overrides.py +0 -0
  83. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/rerun_finished.py +0 -0
  84. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/skip_finished.py +0 -0
  85. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_default.py +0 -0
  86. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_force_new_run.py +0 -0
  87. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_main.py +0 -0
  88. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_match_overrides.py +0 -0
  89. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_rerun_finished.py +0 -0
  90. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_skip_finished.py +0 -0
  91. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_update.py +0 -0
  92. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/update.py +0 -0
  93. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/__init__.py +0 -0
  94. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/run.py +0 -0
  95. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/test_io.py +0 -0
  96. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/__init__.py +0 -0
  97. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/conftest.py +0 -0
  98. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/echo.py +0 -0
  99. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/read.py +0 -0
  100. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_aio.py +0 -0
  101. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_args.py +0 -0
  102. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_conf.py +0 -0
  103. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_io.py +0 -0
  104. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_job.py +0 -0
  105. {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_parser.py +0 -0
{hydraflow-0.16.2 → hydraflow-0.17.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hydraflow
-Version: 0.16.2
+Version: 0.17.1
 Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
 Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
 Project-URL: Source, https://github.com/daizutabi/hydraflow
{hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/index.md
@@ -20,11 +20,15 @@ The main components of HydraFlow's analysis tools are:
 1. **[`Run`][hydraflow.core.run.Run] Class**: Represents a single experiment
    run, providing access to configuration and artifacts.
 
-2. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
-   A collection of `Run` instances with tools for filtering, grouping, and
-   aggregating results.
+2. **[`Collection`][hydraflow.core.collection.Collection] Class**: A generic base class
+   implementing the `Sequence` protocol with powerful filtering, grouping, and data
+   extraction capabilities.
 
-3. **Data Analysis Integration**: Tools to convert experiment data into
+3. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
+   A collection of `Run` instances with specialized tools for filtering, grouping, and
+   aggregating results, built on top of the `Collection` class.
+
+4. **Data Analysis Integration**: Tools to convert experiment data into
    Polars DataFrames for advanced analysis.
 
 ## Practical Examples
@@ -47,6 +51,12 @@ filtered_runs = runs.filter(learning_rate=0.01, model_type="transformer")
 # Group runs by a parameter
 grouped_runs = runs.group_by("batch_size")
 
+# Aggregate grouped data
+df_aggregated = grouped_runs.agg(
+    count=lambda runs: len(runs),
+    avg_accuracy=lambda runs: sum(run.get("accuracy", 0) for run in runs) / len(runs)
+)
+
 # Convert to DataFrame for analysis
 df = runs.to_frame("learning_rate", "batch_size", accuracy=lambda run: run.get("accuracy"))
 
@@ -141,4 +151,9 @@ In the following pages, we'll explore HydraFlow's analysis tools in detail:
   working with multiple runs.
 
 - [Updating Runs](updating-runs.md): Learn how to update existing runs with
-  new metrics, tags, and artifacts.
+  new metrics, tags, and artifacts.
+
+[hydraflow.core.run.Run]: ../../api/hydraflow/core/run.html#hydraflow.core.run.Run
+[hydraflow.core.run_collection.RunCollection]: ../../api/hydraflow/core/run_collection.html#hydraflow.core.run_collection.RunCollection
+[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
+[hydraflow.core.io.iter_run_dirs]: ../../api/hydraflow/core/io.html#hydraflow.core.io.iter_run_dirs
{hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-class.md
@@ -63,22 +63,25 @@ model_type = run.get("model__type") # Equivalent to "model.type"
 metric_value = run.get("accuracy") # From impl or cfg
 run_id = run.get("run_id") # From RunInfo
 
+# Access special object keys
+cfg = run.get("cfg") # Returns the complete configuration object
+impl = run.get("impl") # Returns the implementation object
+info = run.get("info") # Returns the run information object
+
 # Provide a default value if the key doesn't exist
 batch_size = run.get("batch_size", 32)
 
 # Use a callable as default to dynamically generate values based on the run
 # This is useful for derived parameters or conditional defaults
 lr = run.get("learning_rate", default=lambda r: r.get("base_lr", 0.01) / 10)
-
-# Complex default logic based on other parameters
-steps = run.get("steps", default=lambda r: r.get("epochs", 10) * r.get("steps_per_epoch", 100))
 ```
 
 The `get` method searches for values in the following order:
 
-1. First in the configuration (`cfg`)
-2. Then in the implementation instance (`impl`)
-3. Finally in the run information (`info`)
+1. In the configuration (`cfg`)
+2. In the implementation instance (`impl`)
+3. In the run information (`info`)
+4. In the run object itself (`self`)
 
 This provides a unified access interface regardless of where the data is stored.
 
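The four-tier lookup order documented in the hunk above can be pictured with a minimal sketch. `DummyRun` below is a hypothetical stand-in, not hydraflow's `Run` class; the sentinel-based default handling and the `KeyError` on a missing key are assumptions for illustration only.

```python
# Illustrative sketch of the documented lookup order: cfg -> impl -> info -> self.
# DummyRun is a made-up stand-in; hydraflow's Run.get also supports "__" keys
# and differs internally.
from dataclasses import dataclass, field
from typing import Any

MISSING = object()  # sentinel distinguishing "no default given" from None

@dataclass
class DummyRun:
    cfg: dict[str, Any] = field(default_factory=dict)
    impl: dict[str, Any] = field(default_factory=dict)
    info: dict[str, Any] = field(default_factory=dict)

    def get(self, key: str, default: Any = MISSING) -> Any:
        if key in ("cfg", "impl", "info"):  # special object keys
            return getattr(self, key)
        for tier in (self.cfg, self.impl, self.info):  # tiers 1-3
            if key in tier:
                return tier[key]
        if hasattr(self, key):  # tier 4: the run object itself
            return getattr(self, key)
        if default is MISSING:
            raise KeyError(key)  # assumed behavior when nothing matches
        return default(self) if callable(default) else default

run = DummyRun(cfg={"base_lr": 0.02}, impl={"accuracy": 0.9})
assert run.get("accuracy") == 0.9  # found in impl
assert run.get("batch_size", 32) == 32  # static default
assert run.get("lr", default=lambda r: r.get("base_lr") / 10) == 0.002  # callable default
```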
{hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-collection.md
@@ -5,6 +5,22 @@ powerful tool for working with multiple experiment runs. It provides methods
 for filtering, grouping, and analyzing sets of [`Run`][hydraflow.core.run.Run]
 instances, making it easy to compare and extract insights from your experiments.
 
+## Architecture
+
+`RunCollection` is built on top of the more general [`Collection`][hydraflow.core.collection.Collection]
+class, which provides a flexible foundation for working with sequences of items. This architecture offers several benefits:
+
+1. **Consistent Interface**: All collection-based classes in HydraFlow share a common interface and behavior
+2. **Code Reuse**: Core functionality is implemented once in the base class and inherited by specialized collections
+3. **Extensibility**: New collection types can easily be created for different item types
+4. **Type Safety**: Generic type parameters ensure type checking throughout the collection hierarchy
+
+The `Collection` class implements the Python `Sequence` protocol, allowing it to be used like standard Python
+collections (lists, tuples) while providing specialized methods for filtering, grouping, and data extraction.
+
+`RunCollection` extends this foundation with run-specific functionality, particularly for working with MLflow
+experiment data. This layered design separates generic collection behavior from domain-specific operations.
+
 ## Creating a Run Collection
 
 There are several ways to create a `RunCollection`:
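To see what the `Sequence`-protocol design described in this hunk buys in practice, here is a minimal sketch. `MiniCollection` and its method signatures are illustrative assumptions, not hydraflow's actual `Collection` API.

```python
# A minimal Sequence-protocol collection with filter/group_by, sketched to
# illustrate the layered design; MiniCollection is hypothetical.
from collections.abc import Callable, Hashable, Iterable, Sequence
from typing import TypeVar

T = TypeVar("T")

class MiniCollection(Sequence[T]):
    def __init__(self, items: Iterable[T]) -> None:
        self._items = list(items)

    def __getitem__(self, index):  # required by the Sequence protocol
        return self._items[index]

    def __len__(self) -> int:  # required by the Sequence protocol
        return len(self._items)

    def filter(self, predicate: Callable[[T], bool]) -> "MiniCollection[T]":
        return MiniCollection(x for x in self._items if predicate(x))

    def group_by(self, key: Callable[[T], Hashable]) -> dict[Hashable, "MiniCollection[T]"]:
        groups: dict[Hashable, list[T]] = {}
        for item in self._items:
            groups.setdefault(key(item), []).append(item)
        return {k: MiniCollection(v) for k, v in groups.items()}

nums = MiniCollection([1, 2, 3, 4, 5])
evens = nums.filter(lambda n: n % 2 == 0)
print(len(evens), evens[0], 4 in evens)  # Sequence gives len, indexing, `in`
print({k: list(v) for k, v in nums.group_by(lambda n: n % 2).items()})
```

Implementing only `__getitem__` and `__len__` is enough for the `Sequence` mixin methods (iteration, `in`, `index`, `count`) to come for free, which is the code-reuse benefit the list above points at.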
@@ -101,7 +117,7 @@ multiple_models = runs.filter(model_type=["transformer", "lstm"])
 def is_large_image(run: Run):
     return run.get("width") + run.get("height") > 100
 
-good_runs = runs.filter(predicate=is_large_image)
+good_runs = runs.filter(is_large_image)
 ```
 
 The double underscore notation (`__`) is particularly useful for accessing nested
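The double-underscore notation referenced at the end of this hunk can be read as a path into the nested configuration (the run-class docs above note that `"model__type"` is equivalent to `"model.type"`). The helper below is a hypothetical sketch of that translation, not hydraflow's implementation.

```python
# Hypothetical sketch: resolving a "__"-delimited key against a nested config
# dict, mirroring the documented "model__type" == "model.type" equivalence.
from typing import Any

def get_nested(cfg: dict[str, Any], key: str, default: Any = None) -> Any:
    node: Any = cfg
    for part in key.split("__"):  # "model__type" -> ["model", "type"]
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

cfg = {"model": {"type": "transformer", "hidden_size": 256}}
assert get_nested(cfg, "model__type") == "transformer"
assert get_nested(cfg, "model__depth", default=12) == 12  # missing leaf
```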
@@ -133,7 +149,7 @@ def has_efficient_lr(run: Run) -> bool:
     return lr * batch_size < 0.5
 
 # Apply the complex predicate
-efficient_runs = runs.filter(predicate=has_efficient_lr)
+efficient_runs = runs.filter(has_efficient_lr)
 ```
 
 The combination of predicate functions with callable defaults in `get` enables sophisticated
@@ -250,6 +266,11 @@ df = runs.to_frame()
 # DataFrame with specific configuration parameters
 df = runs.to_frame("model_type", "learning_rate", "batch_size")
 
+# Include Run, configuration, or implementation objects as columns
+df = runs.to_frame("model_type", "learning_rate", "run") # Include Run objects
+df = runs.to_frame("model_type", "cfg") # Include configuration objects
+df = runs.to_frame("run_id", "run", "cfg", "impl") # Include all objects
+
 # Specify default values for missing parameters using the defaults parameter
 df = runs.to_frame(
     "model_type",
@@ -258,17 +279,6 @@ df = runs.to_frame(
     defaults={"learning_rate": 0.01, "batch_size": 32}
 )
 
-# Use callable defaults for dynamic values based on each run
-df = runs.to_frame(
-    "model_type",
-    "learning_rate",
-    "epochs",
-    defaults={
-        "learning_rate": lambda run: run.get("base_lr", 0.01) * run.get("lr_multiplier", 1.0),
-        "epochs": lambda run: int(run.get("max_steps", 1000) / run.get("steps_per_epoch", 100))
-    }
-)
-
 # Missing values without defaults are represented as None (null) in the DataFrame
 # This allows for standard handling of missing data in Polars
 missing_values_df = runs.to_frame("model_type", "parameter_that_might_be_missing")
@@ -281,24 +291,6 @@ valid_rows = missing_values_df.filter(pl.col("parameter_that_might_be_missing").
 filled_df = missing_values_df.with_columns(
     pl.col("parameter_that_might_be_missing").fill_null("default_value")
 )
-
-# Using a custom function that returns multiple columns as keyword arguments
-def get_metrics(run: Run) -> dict[str, float]:
-    return {
-        "accuracy": run.get("accuracy", default=lambda r: r.get("val_accuracy", 0.0) * 0.9),
-        "precision": run.get("precision", default=lambda r: r.get("val_precision", 0.0) * 0.9),
-    }
-
-# Add custom columns using a function
-df = runs.to_frame("model_type", metrics=get_metrics)
-
-# Combine defaults with custom column generator functions
-df = runs.to_frame(
-    "model_type",
-    "learning_rate",
-    defaults={"learning_rate": 0.01},
-    metrics=get_metrics
-)
 ```
 
 The `to_frame` method provides several ways to handle missing data:
@@ -313,12 +305,10 @@ The `to_frame` method provides several ways to handle missing data:
    - Fill nulls: `df.with_columns(pl.col("param").fill_null(value))`
    - Aggregations: Most aggregation functions handle nulls appropriately
 
-3. **Custom column generators**: Use keyword argument functions to compute complex columns
-   - These functions receive each Run instance and can implement custom logic
-   - They can use `run.get()` with defaults to handle missing parameters
-
-These approaches can be combined to create flexible and robust data extraction pipelines
-that handle different experiment configurations and parameter evolution over time.
+3. **Special object keys**: Use the special keys `"run"`, `"cfg"`, and `"impl"` to include the actual
+   Run objects, configuration objects, or implementation objects in the DataFrame
+   - This allows direct access to the original objects for further operations
+   - You can combine regular data columns with object columns as needed
 
 ## Grouping Runs
 
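The null-handling patterns this hunk references can be tried end to end on a hand-built frame. The sketch below substitutes a made-up `pl.DataFrame` for the result of `runs.to_frame()`; the column names are invented for illustration.

```python
# Illustrative sketch of the documented null-handling patterns in Polars,
# using a hand-built frame in place of runs.to_frame().
import polars as pl

df = pl.DataFrame({
    "model_type": ["transformer", "lstm", "cnn"],
    "dropout": [0.1, None, 0.3],  # a parameter missing from one run -> null
})

# Keep only rows where the parameter was actually recorded
valid = df.filter(pl.col("dropout").is_not_null())

# Or fill the gap with a fallback value
filled = df.with_columns(pl.col("dropout").fill_null(0.0))

print(valid)
print(filled)
```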
@@ -343,75 +333,26 @@ param_groups = runs.group_by("model_type", "model__hidden_size", "optimizer__lea
 
 # Access a specific group
 transformer_001_group = param_groups[("transformer", 0.001)]
+
+# Aggregating grouped runs using the agg method
+# This returns a DataFrame with the aggregated results
+model_counts = model_groups.agg(count=lambda runs: len(runs))
+model_avg_loss = model_groups.agg(
+    avg_loss=lambda runs: sum(run.get("loss", 0) for run in runs) / len(runs),
+    min_loss=lambda runs: min(run.get("loss", float("inf")) for run in runs)
+)
 ```
 
-When no aggregation functions are provided, `group_by` returns a dictionary mapping keys to `RunCollection` instances. This intentional design allows you to:
+The `group_by` method returns a `GroupBy` instance that maps keys to `RunCollection` instances. This design allows you to:
 
 - Work with each group as a separate `RunCollection` with all the filtering, sorting, and analysis capabilities
 - Perform custom operations on each group that might not be expressible as simple aggregation functions
 - Chain additional operations on specific groups that interest you
 - Implement multi-stage analysis workflows where you need to maintain the full run information at each step
 
-This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
-
-## Aggregation with Group By
-
-Combine `group_by` with aggregation for powerful analysis:
-
-```python
-# Simple aggregation function using get method with callable defaults
-def mean_accuracy(runs: RunCollection) -> float:
-    return runs.to_numpy(
-        "accuracy",
-        default=lambda run: run.get("val_accuracy", 0.0) * 0.9
-    ).mean()
-
-# Complex aggregation from implementation or configuration with fallbacks
-def combined_metric(runs: RunCollection) -> float:
-    # Use callable defaults to handle missing values consistently
-    accuracies = runs.to_numpy("accuracy", default=lambda r: r.get("val_accuracy", 0.0))
-    precisions = runs.to_numpy("precision", default=lambda r: r.get("val_precision", 0.0))
-    return (accuracies.mean() + precisions.mean()) / 2
-
-
-# Group by model type and calculate average accuracy
-model_accuracies = runs.group_by(
-    "model_type",
-    accuracy=mean_accuracy
-)
-
-# Group by multiple parameters with multiple aggregations
-results = runs.group_by(
-    "model_type",
-    "learning_rate",
-    count=len,
-    accuracy=mean_accuracy,
-    combined=combined_metric
-)
-
-# Group by parameters that might be missing in some runs using callable defaults
-def normalize_architecture(run: Run) -> str:
-    # Get architecture with a fallback to model type if not available
-    arch = run.get("architecture", default=lambda r: r.get("model_type", "unknown"))
-    return arch.lower() # Normalize to lowercase
-
-# Group by the normalized architecture
-arch_results = runs.group_by(normalize_architecture, accuracy=mean_accuracy)
-```
-
-With the enhanced `get` method and callable defaults support throughout the API, writing aggregation
-functions becomes more straightforward and robust. You can handle missing values consistently and
-implement complex transformations that work across heterogeneous runs.
-
-When aggregation functions are provided as keyword arguments, `group_by` returns a Polars DataFrame with the group keys and aggregated values. This design choice offers several advantages:
-
-- Directly produces analysis-ready results with all aggregations computed in a single operation
-- Enables efficient downstream analysis using Polars' powerful DataFrame operations
-- Simplifies visualization and reporting workflows
-- Reduces memory usage by computing only the requested aggregations rather than maintaining full RunCollections
-- Creates a clean interface that separates grouping from additional analysis steps
+To perform aggregations on the grouped data, use the `agg` method on the GroupBy instance. This transforms the grouped data into a DataFrame with aggregated results. You can define multiple aggregation functions to compute different metrics across each group.
 
-The DataFrame output is particularly useful for final analysis steps where you need to summarize results across many runs or prepare data for visualization.
+This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
 
 ## Type-Safe Run Collections
 
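The group-then-aggregate flow this hunk introduces can be mimicked in a few lines. `FakeGroupBy` below is a hypothetical stand-in for the documented behavior (keys map to groups, `agg` produces one DataFrame row per group with one column per aggregation); it is not hydraflow's `GroupBy` class.

```python
# Hypothetical sketch of the group-then-aggregate pattern: an agg step turns
# grouped collections into a single polars DataFrame.
from collections.abc import Callable
from typing import Any

import polars as pl

class FakeGroupBy:
    def __init__(self, key: str, groups: dict[Any, list[dict]]) -> None:
        self.key = key
        self.groups = groups  # group key -> list of run-like dicts

    def agg(self, **aggs: Callable[[list[dict]], Any]) -> pl.DataFrame:
        rows = [
            {self.key: key, **{name: fn(runs) for name, fn in aggs.items()}}
            for key, runs in self.groups.items()
        ]
        return pl.DataFrame(rows)

groups = FakeGroupBy("model_type", {
    "transformer": [{"loss": 0.2}, {"loss": 0.4}],
    "lstm": [{"loss": 0.5}],
})
df = groups.agg(
    count=lambda runs: len(runs),
    avg_loss=lambda runs: sum(r["loss"] for r in runs) / len(runs),
)
print(df)  # one row per group, one column per aggregation
```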
@@ -483,7 +424,7 @@ for run in runs:
    type checking.
 
 3. **Chain Operations**: Combine filtering, grouping,
-   and aggregation for efficient analysis workflows.
+   and object extraction for efficient analysis workflows.
 
 4. **Use DataFrame Integration**: Convert to DataFrames
    for complex analysis and visualization needs.
@@ -492,6 +433,8 @@
 
 The [`RunCollection`][hydraflow.core.run_collection.RunCollection] class is a
 powerful tool for comparative analysis of machine learning experiments. Its
-filtering, grouping, and aggregation capabilities enable efficient extraction
+filtering, grouping, and data extraction capabilities enable efficient extraction
 of insights from large sets of experiments, helping you identify optimal
-configurations and understand performance trends.
+configurations and understand performance trends.
+
+[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
{hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/advanced.md
@@ -61,7 +61,7 @@ Where:
 - `[overrides]` are optional Hydra-style parameter overrides
 
 For more details on the CLI,
-see the [Job Configuration](../part2-advanced/job-configuration.md#command-line-interface)
+see the [Job Configuration](../part2-advanced/job-configuration.md)
 documentation.
 
 ## Previewing Execution with Dry Run
{hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/analysis.md
@@ -220,7 +220,7 @@ You can perform basic operations on a collection:
 
 ### Filtering Runs
 
-The [`filter`][hydraflow.core.run_collection.RunCollection.filter] method lets you select runs based on various criteria:
+The [`filter`][hydraflow.core.collection.Collection.filter] method lets you select runs based on various criteria:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
 >>> print(rc.filter(width=400))
@@ -246,7 +246,7 @@ You can even use custom filter functions:
 
 ### Finding Specific Runs
 
-The [`get`][hydraflow.core.run_collection.RunCollection.get] method returns a single run matching your criteria:
+The [`get`][hydraflow.core.collection.Collection.get] method returns a single run matching your criteria:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
 >>> run = rc.get(width=250, height=(100, 200))
@@ -288,7 +288,7 @@ Or dictionaries for multiple named columns:
 
 ### Grouping Runs
 
-The [`group_by`][hydraflow.core.run_collection.RunCollection.group_by] method organizes runs by common attributes:
+The [`group_by`][hydraflow.core.collection.Collection.group_by] method organizes runs by common attributes:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
 >>> grouped = rc.group_by("width")
@@ -304,10 +304,12 @@ You can group by multiple keys:
 ...     print(key, group)
 ```
 
-Adding aggregation functions transforms the result into a DataFrame:
+Adding aggregation functions using the [`agg`][hydraflow.core.collection.Collection.agg]
+method transforms the result into a DataFrame:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
->>> df = rc.group_by("width", n=lambda runs: len(runs))
+>>> grouped = rc.group_by("width")
+>>> df = grouped.agg(n=lambda runs: len(runs))
 >>> print(df)
 ```
 
@@ -321,7 +323,8 @@ In this tutorial, you've learned how to:
 4. Filter, group, and analyze collections of runs
 5. Convert run data to DataFrames for advanced analysis
 
-These capabilities enable you to efficiently analyze your experiments and extract valuable insights from your machine learning workflows.
+These capabilities enable you to efficiently analyze your experiments and extract
+valuable insights from your machine learning workflows.
 
 ## Next Steps
 
{hydraflow-0.16.2 → hydraflow-0.17.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "hydraflow"
-version = "0.16.2"
+version = "0.17.1"
 description = "HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities."
 readme = "README.md"
 license = { file = "LICENSE" }