hydraflow 0.16.1__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. {hydraflow-0.16.1 → hydraflow-0.17.0}/PKG-INFO +2 -6
  2. {hydraflow-0.16.1 → hydraflow-0.17.0}/README.md +1 -5
  3. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/index.md +20 -5
  4. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-class.md +9 -6
  5. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-collection.md +50 -99
  6. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/advanced.md +1 -1
  7. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/analysis.md +4 -3
  8. {hydraflow-0.16.1 → hydraflow-0.17.0}/pyproject.toml +1 -1
  9. hydraflow-0.17.0/src/hydraflow/core/collection.py +541 -0
  10. hydraflow-0.17.0/src/hydraflow/core/group_by.py +205 -0
  11. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/io.py +33 -15
  12. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/run.py +42 -61
  13. hydraflow-0.17.0/src/hydraflow/core/run_collection.py +175 -0
  14. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/run_info.py +3 -34
  15. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run.py +29 -41
  16. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run_collection.py +11 -74
  17. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run_info.py +0 -8
  18. hydraflow-0.17.0/tests/core/test_collection.py +298 -0
  19. hydraflow-0.17.0/tests/core/test_group_by.py +125 -0
  20. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/test_io.py +2 -2
  21. hydraflow-0.16.1/src/hydraflow/core/run_collection.py +0 -632
  22. {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/devcontainer.json +0 -0
  23. {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/postCreate.sh +0 -0
  24. {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/starship.toml +0 -0
  25. {hydraflow-0.16.1 → hydraflow-0.17.0}/.gitattributes +0 -0
  26. {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/ci.yaml +0 -0
  27. {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/docs.yaml +0 -0
  28. {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/publish.yaml +0 -0
  29. {hydraflow-0.16.1 → hydraflow-0.17.0}/.gitignore +0 -0
  30. {hydraflow-0.16.1 → hydraflow-0.17.0}/LICENSE +0 -0
  31. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/concepts.md +0 -0
  32. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/index.md +0 -0
  33. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/installation.md +0 -0
  34. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/index.md +0 -0
  35. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/configuration.md +0 -0
  36. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/execution.md +0 -0
  37. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/index.md +0 -0
  38. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/main-decorator.md +0 -0
  39. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/index.md +0 -0
  40. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/job-configuration.md +0 -0
  41. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/sweep-syntax.md +0 -0
  42. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/updating-runs.md +0 -0
  43. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/applications.md +0 -0
  44. {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/index.md +0 -0
  45. {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/example.py +0 -0
  46. {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/hydraflow.yaml +0 -0
  47. {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/submit.py +0 -0
  48. {hydraflow-0.16.1 → hydraflow-0.17.0}/mkdocs.yaml +0 -0
  49. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/__init__.py +0 -0
  50. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/cli.py +0 -0
  51. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/__init__.py +0 -0
  52. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/context.py +0 -0
  53. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/main.py +0 -0
  54. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/__init__.py +0 -0
  55. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/aio.py +0 -0
  56. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/conf.py +0 -0
  57. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/io.py +0 -0
  58. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/job.py +0 -0
  59. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/parser.py +0 -0
  60. {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/py.typed +0 -0
  61. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/__init__.py +0 -0
  62. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/__init__.py +0 -0
  63. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/app.py +0 -0
  64. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/conftest.py +0 -0
  65. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/hydraflow.yaml +0 -0
  66. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/submit.py +0 -0
  67. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_run.py +0 -0
  68. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_setup.py +0 -0
  69. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_show.py +0 -0
  70. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_version.py +0 -0
  71. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/conftest.py +0 -0
  72. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/__init__.py +0 -0
  73. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/__init__.py +0 -0
  74. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/chdir.py +0 -0
  75. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/log_run.py +0 -0
  76. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/start_run.py +0 -0
  77. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_chdir.py +0 -0
  78. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_log_run.py +0 -0
  79. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_start_run.py +0 -0
  80. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/__init__.py +0 -0
  81. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/default.py +0 -0
  82. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/force_new_run.py +0 -0
  83. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/match_overrides.py +0 -0
  84. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/rerun_finished.py +0 -0
  85. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/skip_finished.py +0 -0
  86. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_default.py +0 -0
  87. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_force_new_run.py +0 -0
  88. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_main.py +0 -0
  89. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_match_overrides.py +0 -0
  90. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_rerun_finished.py +0 -0
  91. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_skip_finished.py +0 -0
  92. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_update.py +0 -0
  93. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/update.py +0 -0
  94. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/__init__.py +0 -0
  95. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/run.py +0 -0
  96. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/__init__.py +0 -0
  97. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/conftest.py +0 -0
  98. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/echo.py +0 -0
  99. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/read.py +0 -0
  100. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_aio.py +0 -0
  101. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_args.py +0 -0
  102. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_conf.py +0 -0
  103. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_io.py +0 -0
  104. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_job.py +0 -0
  105. {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_parser.py +0 -0
{hydraflow-0.16.1 → hydraflow-0.17.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hydraflow
-Version: 0.16.1
+Version: 0.17.0
 Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
 Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
 Project-URL: Source, https://github.com/daizutabi/hydraflow
@@ -194,10 +194,6 @@ For detailed documentation, visit our [documentation site](https://daizutabi.git
 - [User Guide](https://daizutabi.github.io/hydraflow/part1-applications/) - Detailed documentation of HydraFlow's capabilities
 - [API Reference](https://daizutabi.github.io/hydraflow/api/hydraflow/) - Complete API documentation
 
-## Contributing
-
-We welcome contributions! Please see our [contributing guide](CONTRIBUTING.md) for details.
-
 ## License
 
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+This project is licensed under the MIT License.
{hydraflow-0.16.1 → hydraflow-0.17.0}/README.md
@@ -141,10 +141,6 @@ For detailed documentation, visit our [documentation site](https://daizutabi.git
 - [User Guide](https://daizutabi.github.io/hydraflow/part1-applications/) - Detailed documentation of HydraFlow's capabilities
 - [API Reference](https://daizutabi.github.io/hydraflow/api/hydraflow/) - Complete API documentation
 
-## Contributing
-
-We welcome contributions! Please see our [contributing guide](CONTRIBUTING.md) for details.
-
 ## License
 
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+This project is licensed under the MIT License.
{hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/index.md
@@ -20,11 +20,15 @@ The main components of HydraFlow's analysis tools are:
 1. **[`Run`][hydraflow.core.run.Run] Class**: Represents a single experiment
    run, providing access to configuration and artifacts.
 
-2. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
-   A collection of `Run` instances with tools for filtering, grouping, and
-   aggregating results.
+2. **[`Collection`][hydraflow.core.collection.Collection] Class**: A generic base class
+   implementing the `Sequence` protocol with powerful filtering, grouping, and data
+   extraction capabilities.
 
-3. **Data Analysis Integration**: Tools to convert experiment data into
+3. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
+   A collection of `Run` instances with specialized tools for filtering, grouping, and
+   aggregating results, built on top of the `Collection` class.
+
+4. **Data Analysis Integration**: Tools to convert experiment data into
    Polars DataFrames for advanced analysis.
 
 ## Practical Examples
@@ -47,6 +51,12 @@ filtered_runs = runs.filter(learning_rate=0.01, model_type="transformer")
 # Group runs by a parameter
 grouped_runs = runs.group_by("batch_size")
 
+# Aggregate grouped data
+df_aggregated = grouped_runs.agg(
+    count=lambda runs: len(runs),
+    avg_accuracy=lambda runs: sum(run.get("accuracy", 0) for run in runs) / len(runs)
+)
+
 # Convert to DataFrame for analysis
 df = runs.to_frame("learning_rate", "batch_size", accuracy=lambda run: run.get("accuracy"))
 
@@ -141,4 +151,9 @@ In the following pages, we'll explore HydraFlow's analysis tools in detail:
   working with multiple runs.
 
 - [Updating Runs](updating-runs.md): Learn how to update existing runs with
-  new metrics, tags, and artifacts.
+  new metrics, tags, and artifacts.
+
+[hydraflow.core.run.Run]: ../../api/hydraflow/core/run.html#hydraflow.core.run.Run
+[hydraflow.core.run_collection.RunCollection]: ../../api/hydraflow/core/run_collection.html#hydraflow.core.run_collection.RunCollection
+[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
+[hydraflow.core.io.iter_run_dirs]: ../../api/hydraflow/core/io.html#hydraflow.core.io.iter_run_dirs
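The index.md hunks above document 0.17.0's two-step analysis flow, in which `group_by` yields a grouped view and a separate `agg` call produces the DataFrame. A minimal sketch of how these pieces chain together, assuming `runs` is an existing `RunCollection` and the key names (`learning_rate`, `model_type`, `accuracy`) are illustrative placeholders:

```python
# Sketch of the 0.17.0 analysis flow; `runs` is assumed to be a
# RunCollection loaded elsewhere, and the key names are illustrative.
best_by_model = (
    runs.filter(learning_rate=0.01)  # narrow to comparable runs
    .group_by("model_type")          # grouped view: key -> RunCollection
    .agg(best=lambda g: max(r.get("accuracy", 0.0) for r in g))
)
print(best_by_model)  # a Polars DataFrame with one row per model_type
```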
{hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-class.md
@@ -63,22 +63,25 @@ model_type = run.get("model__type")  # Equivalent to "model.type"
 metric_value = run.get("accuracy")  # From impl or cfg
 run_id = run.get("run_id")  # From RunInfo
 
+# Access special object keys
+cfg = run.get("cfg")  # Returns the complete configuration object
+impl = run.get("impl")  # Returns the implementation object
+info = run.get("info")  # Returns the run information object
+
 # Provide a default value if the key doesn't exist
 batch_size = run.get("batch_size", 32)
 
 # Use a callable as default to dynamically generate values based on the run
 # This is useful for derived parameters or conditional defaults
 lr = run.get("learning_rate", default=lambda r: r.get("base_lr", 0.01) / 10)
-
-# Complex default logic based on other parameters
-steps = run.get("steps", default=lambda r: r.get("epochs", 10) * r.get("steps_per_epoch", 100))
 ```
 
 The `get` method searches for values in the following order:
 
-1. First in the configuration (`cfg`)
-2. Then in the implementation instance (`impl`)
-3. Finally in the run information (`info`)
+1. In the configuration (`cfg`)
+2. In the implementation instance (`impl`)
+3. In the run information (`info`)
+4. In the run object itself (`self`)
 
 This provides a unified access interface regardless of where the data is stored.
 
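The new fourth lookup step and the special object keys make `get` a single entry point for every piece of run data. An illustrative sketch of the documented order, assuming a `run` whose config defines `batch_size` and whose implementation records `accuracy` (both names hypothetical):

```python
# Lookup order in 0.17.0: cfg -> impl -> info -> the Run object itself.
batch_size = run.get("batch_size")  # step 1: found in the configuration
accuracy = run.get("accuracy")      # step 2: not in cfg, found on the impl
run_id = run.get("run_id")          # step 3: resolved from the run info
info = run.get("info")              # special key: the run information object

# A callable default receives the Run, so a fallback can be derived
# from other keys instead of being hard-coded.
lr = run.get("learning_rate", default=lambda r: r.get("base_lr", 0.01) / 10)
```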
{hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-collection.md
@@ -5,6 +5,22 @@ powerful tool for working with multiple experiment runs. It provides methods
 for filtering, grouping, and analyzing sets of [`Run`][hydraflow.core.run.Run]
 instances, making it easy to compare and extract insights from your experiments.
 
+## Architecture
+
+`RunCollection` is built on top of the more general [`Collection`][hydraflow.core.collection.Collection]
+class, which provides a flexible foundation for working with sequences of items. This architecture offers several benefits:
+
+1. **Consistent Interface**: All collection-based classes in HydraFlow share a common interface and behavior
+2. **Code Reuse**: Core functionality is implemented once in the base class and inherited by specialized collections
+3. **Extensibility**: New collection types can easily be created for different item types
+4. **Type Safety**: Generic type parameters ensure type checking throughout the collection hierarchy
+
+The `Collection` class implements the Python `Sequence` protocol, allowing it to be used like standard Python
+collections (lists, tuples) while providing specialized methods for filtering, grouping, and data extraction.
+
+`RunCollection` extends this foundation with run-specific functionality, particularly for working with MLflow
+experiment data. This layered design separates generic collection behavior from domain-specific operations.
+
 ## Creating a Run Collection
 
 There are several ways to create a `RunCollection`:
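The extensibility point advertised in the new architecture section is that `Collection` is generic over its item type. A minimal sketch of what a custom subclass might look like; the `Trial` record and the constructor signature (an iterable of items) are assumptions for illustration, not part of this diff:

```python
from dataclasses import dataclass

from hydraflow.core.collection import Collection


@dataclass
class Trial:
    """Hypothetical plain record used as a collection item."""
    model_type: str
    accuracy: float


class TrialCollection(Collection[Trial]):
    """A collection of plain records reusing Collection's machinery."""


trials = TrialCollection([Trial("transformer", 0.91), Trial("lstm", 0.84)])
print(len(trials))    # Sequence protocol: len()
print(trials[0])      # Sequence protocol: indexing
for trial in trials:  # Sequence protocol: iteration
    print(trial.model_type, trial.accuracy)
```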
@@ -101,7 +117,7 @@ multiple_models = runs.filter(model_type=["transformer", "lstm"])
 def is_large_image(run: Run):
     return run.get("width") + run.get("height") > 100
 
-good_runs = runs.filter(predicate=is_large_image)
+good_runs = runs.filter(is_large_image)
 ```
 
 The double underscore notation (`__`) is particularly useful for accessing nested
@@ -133,7 +149,7 @@ def has_efficient_lr(run: Run) -> bool:
     return lr * batch_size < 0.5
 
 # Apply the complex predicate
-efficient_runs = runs.filter(predicate=has_efficient_lr)
+efficient_runs = runs.filter(has_efficient_lr)
 ```
 
 The combination of predicate functions with callable defaults in `get` enables sophisticated
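Both hunks reflect the same 0.17.0 API change: the predicate is now passed positionally rather than through a `predicate=` keyword, leaving keyword arguments free for attribute matching. A short sketch, assuming `runs` exists and `batch_size`/`model_type` are placeholder keys:

```python
from hydraflow.core.run import Run


def is_small(run: Run) -> bool:
    return run.get("batch_size", 32) <= 16


# 0.16.x style: runs.filter(predicate=is_small)
# 0.17.0 style: the predicate is positional; chain a keyword
# filter when attribute matching is also needed.
small_transformers = runs.filter(is_small).filter(model_type="transformer")
```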
@@ -250,6 +266,11 @@ df = runs.to_frame()
 # DataFrame with specific configuration parameters
 df = runs.to_frame("model_type", "learning_rate", "batch_size")
 
+# Include Run, configuration, or implementation objects as columns
+df = runs.to_frame("model_type", "learning_rate", "run")  # Include Run objects
+df = runs.to_frame("model_type", "cfg")  # Include configuration objects
+df = runs.to_frame("run_id", "run", "cfg", "impl")  # Include all objects
+
 # Specify default values for missing parameters using the defaults parameter
 df = runs.to_frame(
     "model_type",
@@ -258,17 +279,6 @@ df = runs.to_frame(
     defaults={"learning_rate": 0.01, "batch_size": 32}
 )
 
-# Use callable defaults for dynamic values based on each run
-df = runs.to_frame(
-    "model_type",
-    "learning_rate",
-    "epochs",
-    defaults={
-        "learning_rate": lambda run: run.get("base_lr", 0.01) * run.get("lr_multiplier", 1.0),
-        "epochs": lambda run: int(run.get("max_steps", 1000) / run.get("steps_per_epoch", 100))
-    }
-)
-
 # Missing values without defaults are represented as None (null) in the DataFrame
 # This allows for standard handling of missing data in Polars
 missing_values_df = runs.to_frame("model_type", "parameter_that_might_be_missing")
@@ -281,24 +291,6 @@ valid_rows = missing_values_df.filter(pl.col("parameter_that_might_be_missing").
 filled_df = missing_values_df.with_columns(
     pl.col("parameter_that_might_be_missing").fill_null("default_value")
 )
-
-# Using a custom function that returns multiple columns as keyword arguments
-def get_metrics(run: Run) -> dict[str, float]:
-    return {
-        "accuracy": run.get("accuracy", default=lambda r: r.get("val_accuracy", 0.0) * 0.9),
-        "precision": run.get("precision", default=lambda r: r.get("val_precision", 0.0) * 0.9),
-    }
-
-# Add custom columns using a function
-df = runs.to_frame("model_type", metrics=get_metrics)
-
-# Combine defaults with custom column generator functions
-df = runs.to_frame(
-    "model_type",
-    "learning_rate",
-    defaults={"learning_rate": 0.01},
-    metrics=get_metrics
-)
 ```
 
 The `to_frame` method provides several ways to handle missing data:
@@ -313,12 +305,10 @@ The `to_frame` method provides several ways to handle missing data:
    - Fill nulls: `df.with_columns(pl.col("param").fill_null(value))`
    - Aggregations: Most aggregation functions handle nulls appropriately
 
-3. **Custom column generators**: Use keyword argument functions to compute complex columns
-   - These functions receive each Run instance and can implement custom logic
-   - They can use `run.get()` with defaults to handle missing parameters
-
-These approaches can be combined to create flexible and robust data extraction pipelines
-that handle different experiment configurations and parameter evolution over time.
+3. **Special object keys**: Use the special keys `"run"`, `"cfg"`, and `"impl"` to include the actual
+   Run objects, configuration objects, or implementation objects in the DataFrame
+   - This allows direct access to the original objects for further operations
+   - You can combine regular data columns with object columns as needed
 
 ## Grouping Runs
 
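Because object columns ride along with scalar columns, a natural pattern with the new special keys is to do the heavy lifting in Polars and then drop back to the original `Run` objects. A sketch, assuming `runs` is a RunCollection and `accuracy` is a logged value (hypothetical name); how Polars stores the object column internally is not specified by this diff:

```python
# Mix scalar columns with the special "run" object column.
df = runs.to_frame("model_type", "accuracy", "run")

# Rank with ordinary Polars operations on the scalar columns...
top = df.sort("accuracy", descending=True).head(3)

# ...then recover the original Run objects from the "run" column.
for run in top["run"]:
    print(run.get("run_id"), run.get("accuracy"))
```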
@@ -331,6 +321,12 @@ model_groups = runs.group_by("model_type")
 # Group by nested parameter using dot notation
 architecture_groups = runs.group_by("model.architecture")
 
+# Group by and include Run objects in the result DataFrame
+model_groups_df = runs.group_by("model_type", "run")
+
+# Include multiple object types in the result
+grouped_df = runs.group_by("model_type", "batch_size", "run", "cfg")
+
 # Iterate through groups
 for model_type, group in model_groups.items():
     print(f"Model type: {model_type}, Runs: {len(group)}")
@@ -343,75 +339,28 @@ param_groups = runs.group_by("model_type", "model__hidden_size", "optimizer__lea
 
 # Access a specific group
 transformer_001_group = param_groups[("transformer", 0.001)]
+
+# Aggregating grouped runs using the agg method
+# This returns a DataFrame with the aggregated results
+model_counts = model_groups.agg(count=lambda runs: len(runs))
+model_avg_loss = model_groups.agg(
+    avg_loss=lambda runs: sum(run.get("loss", 0) for run in runs) / len(runs),
+    min_loss=lambda runs: min(run.get("loss", float("inf")) for run in runs)
+)
 ```
 
-When no aggregation functions are provided, `group_by` returns a dictionary mapping keys to `RunCollection` instances. This intentional design allows you to:
+The `group_by` method returns a `GroupBy` instance that maps keys to `RunCollection` instances. This design allows you to:
 
 - Work with each group as a separate `RunCollection` with all the filtering, sorting, and analysis capabilities
 - Perform custom operations on each group that might not be expressible as simple aggregation functions
 - Chain additional operations on specific groups that interest you
 - Implement multi-stage analysis workflows where you need to maintain the full run information at each step
 
-This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
-
-## Aggregation with Group By
-
-Combine `group_by` with aggregation for powerful analysis:
-
-```python
-# Simple aggregation function using get method with callable defaults
-def mean_accuracy(runs: RunCollection) -> float:
-    return runs.to_numpy(
-        "accuracy",
-        default=lambda run: run.get("val_accuracy", 0.0) * 0.9
-    ).mean()
-
-# Complex aggregation from implementation or configuration with fallbacks
-def combined_metric(runs: RunCollection) -> float:
-    # Use callable defaults to handle missing values consistently
-    accuracies = runs.to_numpy("accuracy", default=lambda r: r.get("val_accuracy", 0.0))
-    precisions = runs.to_numpy("precision", default=lambda r: r.get("val_precision", 0.0))
-    return (accuracies.mean() + precisions.mean()) / 2
-
-
-# Group by model type and calculate average accuracy
-model_accuracies = runs.group_by(
-    "model_type",
-    accuracy=mean_accuracy
-)
-
-# Group by multiple parameters with multiple aggregations
-results = runs.group_by(
-    "model_type",
-    "learning_rate",
-    count=len,
-    accuracy=mean_accuracy,
-    combined=combined_metric
-)
-
-# Group by parameters that might be missing in some runs using callable defaults
-def normalize_architecture(run: Run) -> str:
-    # Get architecture with a fallback to model type if not available
-    arch = run.get("architecture", default=lambda r: r.get("model_type", "unknown"))
-    return arch.lower()  # Normalize to lowercase
-
-# Group by the normalized architecture
-arch_results = runs.group_by(normalize_architecture, accuracy=mean_accuracy)
-```
-
-With the enhanced `get` method and callable defaults support throughout the API, writing aggregation
-functions becomes more straightforward and robust. You can handle missing values consistently and
-implement complex transformations that work across heterogeneous runs.
-
-When aggregation functions are provided as keyword arguments, `group_by` returns a Polars DataFrame with the group keys and aggregated values. This design choice offers several advantages:
+To perform aggregations on the grouped data, use the `agg` method on the GroupBy instance. This transforms the grouped data into a DataFrame with aggregated results. You can define multiple aggregation functions to compute different metrics across each group.
 
-- Directly produces analysis-ready results with all aggregations computed in a single operation
-- Enables efficient downstream analysis using Polars' powerful DataFrame operations
-- Simplifies visualization and reporting workflows
-- Reduces memory usage by computing only the requested aggregations rather than maintaining full RunCollections
-- Creates a clean interface that separates grouping from additional analysis steps
+When special object keys (`"run"`, `"cfg"`, `"impl"`) are included in the `group_by` call, it returns a DataFrame with those objects included as columns, making it easy to access the original objects for further processing.
 
-The DataFrame output is particularly useful for final analysis steps where you need to summarize results across many runs or prepare data for visualization.
+This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
 
 ## Type-Safe Run Collections
 
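For users upgrading from 0.16.x, the key change in this hunk is that aggregation moved out of `group_by` into a separate `agg` step. A minimal migration sketch, assuming `runs` is an existing RunCollection and `accuracy` is a placeholder metric name:

```python
# 0.16.x (removed): aggregation functions were keyword arguments to
# group_by, which then returned a Polars DataFrame directly:
#     df = runs.group_by("model_type", count=len)

# 0.17.0: group_by returns a GroupBy mapping; agg produces the DataFrame.
grouped = runs.group_by("model_type")
df = grouped.agg(
    count=lambda g: len(g),
    best_accuracy=lambda g: max(r.get("accuracy", 0.0) for r in g),
)
```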
@@ -483,7 +432,7 @@ for run in runs:
    type checking.
 
 3. **Chain Operations**: Combine filtering, grouping,
-   and aggregation for efficient analysis workflows.
+   and object extraction for efficient analysis workflows.
 
 4. **Use DataFrame Integration**: Convert to DataFrames
    for complex analysis and visualization needs.
@@ -492,6 +441,8 @@
 
 The [`RunCollection`][hydraflow.core.run_collection.RunCollection] class is a
 powerful tool for comparative analysis of machine learning experiments. Its
-filtering, grouping, and aggregation capabilities enable efficient extraction
+filtering, grouping, and data extraction capabilities enable efficient extraction
 of insights from large sets of experiments, helping you identify optimal
-configurations and understand performance trends.
+configurations and understand performance trends.
+
+[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
{hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/advanced.md
@@ -61,7 +61,7 @@ Where:
 - `[overrides]` are optional Hydra-style parameter overrides
 
 For more details on the CLI,
-see the [Job Configuration](../part2-advanced/job-configuration.md#command-line-interface)
+see the [Job Configuration](../part2-advanced/job-configuration.md)
 documentation.
 
 ## Previewing Execution with Dry Run
{hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/analysis.md
@@ -288,7 +288,7 @@ Or dictionaries for multiple named columns:
 
 ### Grouping Runs
 
-The [`group_by`][hydraflow.core.run_collection.RunCollection.group_by] method organizes runs by common attributes:
+The [`group_by`][hydraflow.core.collection.Collection.group_by] method organizes runs by common attributes:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
 >>> grouped = rc.group_by("width")
@@ -304,10 +304,11 @@ You can group by multiple keys:
 ...     print(key, group)
 ```
 
-Adding aggregation functions transforms the result into a DataFrame:
+Adding aggregation functions using the `agg` method transforms the result into a DataFrame:
 
 ```pycon exec="1" source="console" session="results" workdir="examples"
->>> df = rc.group_by("width", n=lambda runs: len(runs))
+>>> grouped = rc.group_by("width")
+>>> df = grouped.agg(n=lambda runs: len(runs))
 >>> print(df)
 ```
 
{hydraflow-0.16.1 → hydraflow-0.17.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "hydraflow"
-version = "0.16.1"
+version = "0.17.0"
 description = "HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities."
 readme = "README.md"
 license = { file = "LICENSE" }