hydraflow 0.16.1__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hydraflow-0.16.1 → hydraflow-0.17.0}/PKG-INFO +2 -6
- {hydraflow-0.16.1 → hydraflow-0.17.0}/README.md +1 -5
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/index.md +20 -5
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-class.md +9 -6
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/run-collection.md +50 -99
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/advanced.md +1 -1
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/analysis.md +4 -3
- {hydraflow-0.16.1 → hydraflow-0.17.0}/pyproject.toml +1 -1
- hydraflow-0.17.0/src/hydraflow/core/collection.py +541 -0
- hydraflow-0.17.0/src/hydraflow/core/group_by.py +205 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/io.py +33 -15
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/run.py +42 -61
- hydraflow-0.17.0/src/hydraflow/core/run_collection.py +175 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/run_info.py +3 -34
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run.py +29 -41
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run_collection.py +11 -74
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/test_run_info.py +0 -8
- hydraflow-0.17.0/tests/core/test_collection.py +298 -0
- hydraflow-0.17.0/tests/core/test_group_by.py +125 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/test_io.py +2 -2
- hydraflow-0.16.1/src/hydraflow/core/run_collection.py +0 -632
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/devcontainer.json +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/postCreate.sh +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.devcontainer/starship.toml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.gitattributes +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/ci.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/docs.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.github/workflows/publish.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/.gitignore +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/LICENSE +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/concepts.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/index.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/getting-started/installation.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/index.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/configuration.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/execution.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/index.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part1-applications/main-decorator.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/index.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/job-configuration.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part2-advanced/sweep-syntax.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/part3-analysis/updating-runs.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/applications.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/docs/practical-tutorials/index.md +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/example.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/hydraflow.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/examples/submit.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/mkdocs.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/cli.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/context.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/core/main.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/aio.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/conf.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/io.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/job.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/executor/parser.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/src/hydraflow/py.typed +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/app.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/conftest.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/hydraflow.yaml +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/submit.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_setup.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_show.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/cli/test_version.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/conftest.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/chdir.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/log_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/start_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_chdir.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_log_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/context/test_start_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/default.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/force_new_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/match_overrides.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/rerun_finished.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/skip_finished.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_default.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_force_new_run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_main.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_match_overrides.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_rerun_finished.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_skip_finished.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/test_update.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/main/update.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/core/run/run.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/__init__.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/conftest.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/echo.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/read.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_aio.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_args.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_conf.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_io.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_job.py +0 -0
- {hydraflow-0.16.1 → hydraflow-0.17.0}/tests/executor/test_parser.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: hydraflow
|
3
|
-
Version: 0.16.1
|
3
|
+
Version: 0.17.0
|
4
4
|
Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
|
5
5
|
Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
|
6
6
|
Project-URL: Source, https://github.com/daizutabi/hydraflow
|
@@ -194,10 +194,6 @@ For detailed documentation, visit our [documentation site](https://daizutabi.git
|
|
194
194
|
- [User Guide](https://daizutabi.github.io/hydraflow/part1-applications/) - Detailed documentation of HydraFlow's capabilities
|
195
195
|
- [API Reference](https://daizutabi.github.io/hydraflow/api/hydraflow/) - Complete API documentation
|
196
196
|
|
197
|
-
## Contributing
|
198
|
-
|
199
|
-
We welcome contributions! Please see our [contributing guide](CONTRIBUTING.md) for details.
|
200
|
-
|
201
197
|
## License
|
202
198
|
|
203
|
-
This project is licensed under the MIT License
|
199
|
+
This project is licensed under the MIT License.
|
@@ -141,10 +141,6 @@ For detailed documentation, visit our [documentation site](https://daizutabi.git
|
|
141
141
|
- [User Guide](https://daizutabi.github.io/hydraflow/part1-applications/) - Detailed documentation of HydraFlow's capabilities
|
142
142
|
- [API Reference](https://daizutabi.github.io/hydraflow/api/hydraflow/) - Complete API documentation
|
143
143
|
|
144
|
-
## Contributing
|
145
|
-
|
146
|
-
We welcome contributions! Please see our [contributing guide](CONTRIBUTING.md) for details.
|
147
|
-
|
148
144
|
## License
|
149
145
|
|
150
|
-
This project is licensed under the MIT License
|
146
|
+
This project is licensed under the MIT License.
|
@@ -20,11 +20,15 @@ The main components of HydraFlow's analysis tools are:
|
|
20
20
|
1. **[`Run`][hydraflow.core.run.Run] Class**: Represents a single experiment
|
21
21
|
run, providing access to configuration and artifacts.
|
22
22
|
|
23
|
-
2. **[`
|
24
|
-
|
25
|
-
|
23
|
+
2. **[`Collection`][hydraflow.core.collection.Collection] Class**: A generic base class
|
24
|
+
implementing the `Sequence` protocol with powerful filtering, grouping, and data
|
25
|
+
extraction capabilities.
|
26
26
|
|
27
|
-
3. **
|
27
|
+
3. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
|
28
|
+
A collection of `Run` instances with specialized tools for filtering, grouping, and
|
29
|
+
aggregating results, built on top of the `Collection` class.
|
30
|
+
|
31
|
+
4. **Data Analysis Integration**: Tools to convert experiment data into
|
28
32
|
Polars DataFrames for advanced analysis.
|
29
33
|
|
30
34
|
## Practical Examples
|
@@ -47,6 +51,12 @@ filtered_runs = runs.filter(learning_rate=0.01, model_type="transformer")
|
|
47
51
|
# Group runs by a parameter
|
48
52
|
grouped_runs = runs.group_by("batch_size")
|
49
53
|
|
54
|
+
# Aggregate grouped data
|
55
|
+
df_aggregated = grouped_runs.agg(
|
56
|
+
count=lambda runs: len(runs),
|
57
|
+
avg_accuracy=lambda runs: sum(run.get("accuracy", 0) for run in runs) / len(runs)
|
58
|
+
)
|
59
|
+
|
50
60
|
# Convert to DataFrame for analysis
|
51
61
|
df = runs.to_frame("learning_rate", "batch_size", accuracy=lambda run: run.get("accuracy"))
|
52
62
|
|
@@ -141,4 +151,9 @@ In the following pages, we'll explore HydraFlow's analysis tools in detail:
|
|
141
151
|
working with multiple runs.
|
142
152
|
|
143
153
|
- [Updating Runs](updating-runs.md): Learn how to update existing runs with
|
144
|
-
new metrics, tags, and artifacts.
|
154
|
+
new metrics, tags, and artifacts.
|
155
|
+
|
156
|
+
[hydraflow.core.run.Run]: ../../api/hydraflow/core/run.html#hydraflow.core.run.Run
|
157
|
+
[hydraflow.core.run_collection.RunCollection]: ../../api/hydraflow/core/run_collection.html#hydraflow.core.run_collection.RunCollection
|
158
|
+
[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
|
159
|
+
[hydraflow.core.io.iter_run_dirs]: ../../api/hydraflow/core/io.html#hydraflow.core.io.iter_run_dirs
|
@@ -63,22 +63,25 @@ model_type = run.get("model__type") # Equivalent to "model.type"
|
|
63
63
|
metric_value = run.get("accuracy") # From impl or cfg
|
64
64
|
run_id = run.get("run_id") # From RunInfo
|
65
65
|
|
66
|
+
# Access special object keys
|
67
|
+
cfg = run.get("cfg") # Returns the complete configuration object
|
68
|
+
impl = run.get("impl") # Returns the implementation object
|
69
|
+
info = run.get("info") # Returns the run information object
|
70
|
+
|
66
71
|
# Provide a default value if the key doesn't exist
|
67
72
|
batch_size = run.get("batch_size", 32)
|
68
73
|
|
69
74
|
# Use a callable as default to dynamically generate values based on the run
|
70
75
|
# This is useful for derived parameters or conditional defaults
|
71
76
|
lr = run.get("learning_rate", default=lambda r: r.get("base_lr", 0.01) / 10)
|
72
|
-
|
73
|
-
# Complex default logic based on other parameters
|
74
|
-
steps = run.get("steps", default=lambda r: r.get("epochs", 10) * r.get("steps_per_epoch", 100))
|
75
77
|
```
|
76
78
|
|
77
79
|
The `get` method searches for values in the following order:
|
78
80
|
|
79
|
-
1.
|
80
|
-
2.
|
81
|
-
3.
|
81
|
+
1. In the configuration (`cfg`)
|
82
|
+
2. In the implementation instance (`impl`)
|
83
|
+
3. In the run information (`info`)
|
84
|
+
4. In the run object itself (`self`)
|
82
85
|
|
83
86
|
This provides a unified access interface regardless of where the data is stored.
|
84
87
|
|
@@ -5,6 +5,22 @@ powerful tool for working with multiple experiment runs. It provides methods
|
|
5
5
|
for filtering, grouping, and analyzing sets of [`Run`][hydraflow.core.run.Run]
|
6
6
|
instances, making it easy to compare and extract insights from your experiments.
|
7
7
|
|
8
|
+
## Architecture
|
9
|
+
|
10
|
+
`RunCollection` is built on top of the more general [`Collection`][hydraflow.core.collection.Collection]
|
11
|
+
class, which provides a flexible foundation for working with sequences of items. This architecture offers several benefits:
|
12
|
+
|
13
|
+
1. **Consistent Interface**: All collection-based classes in HydraFlow share a common interface and behavior
|
14
|
+
2. **Code Reuse**: Core functionality is implemented once in the base class and inherited by specialized collections
|
15
|
+
3. **Extensibility**: New collection types can easily be created for different item types
|
16
|
+
4. **Type Safety**: Generic type parameters ensure type checking throughout the collection hierarchy
|
17
|
+
|
18
|
+
The `Collection` class implements the Python `Sequence` protocol, allowing it to be used like standard Python
|
19
|
+
collections (lists, tuples) while providing specialized methods for filtering, grouping, and data extraction.
|
20
|
+
|
21
|
+
`RunCollection` extends this foundation with run-specific functionality, particularly for working with MLflow
|
22
|
+
experiment data. This layered design separates generic collection behavior from domain-specific operations.
|
23
|
+
|
8
24
|
## Creating a Run Collection
|
9
25
|
|
10
26
|
There are several ways to create a `RunCollection`:
|
@@ -101,7 +117,7 @@ multiple_models = runs.filter(model_type=["transformer", "lstm"])
|
|
101
117
|
def is_large_image(run: Run):
|
102
118
|
return run.get("width") + run.get("height") > 100
|
103
119
|
|
104
|
-
good_runs = runs.filter(
|
120
|
+
good_runs = runs.filter(is_large_image)
|
105
121
|
```
|
106
122
|
|
107
123
|
The double underscore notation (`__`) is particularly useful for accessing nested
|
@@ -133,7 +149,7 @@ def has_efficient_lr(run: Run) -> bool:
|
|
133
149
|
return lr * batch_size < 0.5
|
134
150
|
|
135
151
|
# Apply the complex predicate
|
136
|
-
efficient_runs = runs.filter(
|
152
|
+
efficient_runs = runs.filter(has_efficient_lr)
|
137
153
|
```
|
138
154
|
|
139
155
|
The combination of predicate functions with callable defaults in `get` enables sophisticated
|
@@ -250,6 +266,11 @@ df = runs.to_frame()
|
|
250
266
|
# DataFrame with specific configuration parameters
|
251
267
|
df = runs.to_frame("model_type", "learning_rate", "batch_size")
|
252
268
|
|
269
|
+
# Include Run, configuration, or implementation objects as columns
|
270
|
+
df = runs.to_frame("model_type", "learning_rate", "run") # Include Run objects
|
271
|
+
df = runs.to_frame("model_type", "cfg") # Include configuration objects
|
272
|
+
df = runs.to_frame("run_id", "run", "cfg", "impl") # Include all objects
|
273
|
+
|
253
274
|
# Specify default values for missing parameters using the defaults parameter
|
254
275
|
df = runs.to_frame(
|
255
276
|
"model_type",
|
@@ -258,17 +279,6 @@ df = runs.to_frame(
|
|
258
279
|
defaults={"learning_rate": 0.01, "batch_size": 32}
|
259
280
|
)
|
260
281
|
|
261
|
-
# Use callable defaults for dynamic values based on each run
|
262
|
-
df = runs.to_frame(
|
263
|
-
"model_type",
|
264
|
-
"learning_rate",
|
265
|
-
"epochs",
|
266
|
-
defaults={
|
267
|
-
"learning_rate": lambda run: run.get("base_lr", 0.01) * run.get("lr_multiplier", 1.0),
|
268
|
-
"epochs": lambda run: int(run.get("max_steps", 1000) / run.get("steps_per_epoch", 100))
|
269
|
-
}
|
270
|
-
)
|
271
|
-
|
272
282
|
# Missing values without defaults are represented as None (null) in the DataFrame
|
273
283
|
# This allows for standard handling of missing data in Polars
|
274
284
|
missing_values_df = runs.to_frame("model_type", "parameter_that_might_be_missing")
|
@@ -281,24 +291,6 @@ valid_rows = missing_values_df.filter(pl.col("parameter_that_might_be_missing").
|
|
281
291
|
filled_df = missing_values_df.with_columns(
|
282
292
|
pl.col("parameter_that_might_be_missing").fill_null("default_value")
|
283
293
|
)
|
284
|
-
|
285
|
-
# Using a custom function that returns multiple columns as keyword arguments
|
286
|
-
def get_metrics(run: Run) -> dict[str, float]:
|
287
|
-
return {
|
288
|
-
"accuracy": run.get("accuracy", default=lambda r: r.get("val_accuracy", 0.0) * 0.9),
|
289
|
-
"precision": run.get("precision", default=lambda r: r.get("val_precision", 0.0) * 0.9),
|
290
|
-
}
|
291
|
-
|
292
|
-
# Add custom columns using a function
|
293
|
-
df = runs.to_frame("model_type", metrics=get_metrics)
|
294
|
-
|
295
|
-
# Combine defaults with custom column generator functions
|
296
|
-
df = runs.to_frame(
|
297
|
-
"model_type",
|
298
|
-
"learning_rate",
|
299
|
-
defaults={"learning_rate": 0.01},
|
300
|
-
metrics=get_metrics
|
301
|
-
)
|
302
294
|
```
|
303
295
|
|
304
296
|
The `to_frame` method provides several ways to handle missing data:
|
@@ -313,12 +305,10 @@ The `to_frame` method provides several ways to handle missing data:
|
|
313
305
|
- Fill nulls: `df.with_columns(pl.col("param").fill_null(value))`
|
314
306
|
- Aggregations: Most aggregation functions handle nulls appropriately
|
315
307
|
|
316
|
-
3. **
|
317
|
-
|
318
|
-
-
|
319
|
-
|
320
|
-
These approaches can be combined to create flexible and robust data extraction pipelines
|
321
|
-
that handle different experiment configurations and parameter evolution over time.
|
308
|
+
3. **Special object keys**: Use the special keys `"run"`, `"cfg"`, and `"impl"` to include the actual
|
309
|
+
Run objects, configuration objects, or implementation objects in the DataFrame
|
310
|
+
- This allows direct access to the original objects for further operations
|
311
|
+
- You can combine regular data columns with object columns as needed
|
322
312
|
|
323
313
|
## Grouping Runs
|
324
314
|
|
@@ -331,6 +321,12 @@ model_groups = runs.group_by("model_type")
|
|
331
321
|
# Group by nested parameter using dot notation
|
332
322
|
architecture_groups = runs.group_by("model.architecture")
|
333
323
|
|
324
|
+
# Group by and include Run objects in the result DataFrame
|
325
|
+
model_groups_df = runs.group_by("model_type", "run")
|
326
|
+
|
327
|
+
# Include multiple object types in the result
|
328
|
+
grouped_df = runs.group_by("model_type", "batch_size", "run", "cfg")
|
329
|
+
|
334
330
|
# Iterate through groups
|
335
331
|
for model_type, group in model_groups.items():
|
336
332
|
print(f"Model type: {model_type}, Runs: {len(group)}")
|
@@ -343,75 +339,28 @@ param_groups = runs.group_by("model_type", "model__hidden_size", "optimizer__lea
|
|
343
339
|
|
344
340
|
# Access a specific group
|
345
341
|
transformer_001_group = param_groups[("transformer", 0.001)]
|
342
|
+
|
343
|
+
# Aggregating grouped runs using the agg method
|
344
|
+
# This returns a DataFrame with the aggregated results
|
345
|
+
model_counts = model_groups.agg(count=lambda runs: len(runs))
|
346
|
+
model_avg_loss = model_groups.agg(
|
347
|
+
avg_loss=lambda runs: sum(run.get("loss", 0) for run in runs) / len(runs),
|
348
|
+
min_loss=lambda runs: min(run.get("loss", float("inf")) for run in runs)
|
349
|
+
)
|
346
350
|
```
|
347
351
|
|
348
|
-
|
352
|
+
The `group_by` method returns a `GroupBy` instance that maps keys to `RunCollection` instances. This design allows you to:
|
349
353
|
|
350
354
|
- Work with each group as a separate `RunCollection` with all the filtering, sorting, and analysis capabilities
|
351
355
|
- Perform custom operations on each group that might not be expressible as simple aggregation functions
|
352
356
|
- Chain additional operations on specific groups that interest you
|
353
357
|
- Implement multi-stage analysis workflows where you need to maintain the full run information at each step
|
354
358
|
|
355
|
-
This
|
356
|
-
|
357
|
-
## Aggregation with Group By
|
358
|
-
|
359
|
-
Combine `group_by` with aggregation for powerful analysis:
|
360
|
-
|
361
|
-
```python
|
362
|
-
# Simple aggregation function using get method with callable defaults
|
363
|
-
def mean_accuracy(runs: RunCollection) -> float:
|
364
|
-
return runs.to_numpy(
|
365
|
-
"accuracy",
|
366
|
-
default=lambda run: run.get("val_accuracy", 0.0) * 0.9
|
367
|
-
).mean()
|
368
|
-
|
369
|
-
# Complex aggregation from implementation or configuration with fallbacks
|
370
|
-
def combined_metric(runs: RunCollection) -> float:
|
371
|
-
# Use callable defaults to handle missing values consistently
|
372
|
-
accuracies = runs.to_numpy("accuracy", default=lambda r: r.get("val_accuracy", 0.0))
|
373
|
-
precisions = runs.to_numpy("precision", default=lambda r: r.get("val_precision", 0.0))
|
374
|
-
return (accuracies.mean() + precisions.mean()) / 2
|
375
|
-
|
376
|
-
|
377
|
-
# Group by model type and calculate average accuracy
|
378
|
-
model_accuracies = runs.group_by(
|
379
|
-
"model_type",
|
380
|
-
accuracy=mean_accuracy
|
381
|
-
)
|
382
|
-
|
383
|
-
# Group by multiple parameters with multiple aggregations
|
384
|
-
results = runs.group_by(
|
385
|
-
"model_type",
|
386
|
-
"learning_rate",
|
387
|
-
count=len,
|
388
|
-
accuracy=mean_accuracy,
|
389
|
-
combined=combined_metric
|
390
|
-
)
|
391
|
-
|
392
|
-
# Group by parameters that might be missing in some runs using callable defaults
|
393
|
-
def normalize_architecture(run: Run) -> str:
|
394
|
-
# Get architecture with a fallback to model type if not available
|
395
|
-
arch = run.get("architecture", default=lambda r: r.get("model_type", "unknown"))
|
396
|
-
return arch.lower() # Normalize to lowercase
|
397
|
-
|
398
|
-
# Group by the normalized architecture
|
399
|
-
arch_results = runs.group_by(normalize_architecture, accuracy=mean_accuracy)
|
400
|
-
```
|
401
|
-
|
402
|
-
With the enhanced `get` method and callable defaults support throughout the API, writing aggregation
|
403
|
-
functions becomes more straightforward and robust. You can handle missing values consistently and
|
404
|
-
implement complex transformations that work across heterogeneous runs.
|
405
|
-
|
406
|
-
When aggregation functions are provided as keyword arguments, `group_by` returns a Polars DataFrame with the group keys and aggregated values. This design choice offers several advantages:
|
359
|
+
To perform aggregations on the grouped data, use the `agg` method on the GroupBy instance. This transforms the grouped data into a DataFrame with aggregated results. You can define multiple aggregation functions to compute different metrics across each group.
|
407
360
|
|
408
|
-
|
409
|
-
- Enables efficient downstream analysis using Polars' powerful DataFrame operations
|
410
|
-
- Simplifies visualization and reporting workflows
|
411
|
-
- Reduces memory usage by computing only the requested aggregations rather than maintaining full RunCollections
|
412
|
-
- Creates a clean interface that separates grouping from additional analysis steps
|
361
|
+
When special object keys (`"run"`, `"cfg"`, `"impl"`) are included in the `group_by` call, it returns a DataFrame with those objects included as columns, making it easy to access the original objects for further processing.
|
413
362
|
|
414
|
-
|
363
|
+
This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
|
415
364
|
|
416
365
|
## Type-Safe Run Collections
|
417
366
|
|
@@ -483,7 +432,7 @@ for run in runs:
|
|
483
432
|
type checking.
|
484
433
|
|
485
434
|
3. **Chain Operations**: Combine filtering, grouping,
|
486
|
-
and
|
435
|
+
and object extraction for efficient analysis workflows.
|
487
436
|
|
488
437
|
4. **Use DataFrame Integration**: Convert to DataFrames
|
489
438
|
for complex analysis and visualization needs.
|
@@ -492,6 +441,8 @@ for run in runs:
|
|
492
441
|
|
493
442
|
The [`RunCollection`][hydraflow.core.run_collection.RunCollection] class is a
|
494
443
|
powerful tool for comparative analysis of machine learning experiments. Its
|
495
|
-
filtering, grouping, and
|
444
|
+
filtering, grouping, and data extraction capabilities enable efficient extraction
|
496
445
|
of insights from large sets of experiments, helping you identify optimal
|
497
|
-
configurations and understand performance trends.
|
446
|
+
configurations and understand performance trends.
|
447
|
+
|
448
|
+
[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
|
@@ -61,7 +61,7 @@ Where:
|
|
61
61
|
- `[overrides]` are optional Hydra-style parameter overrides
|
62
62
|
|
63
63
|
For more details on the CLI,
|
64
|
-
see the [Job Configuration](../part2-advanced/job-configuration.md
|
64
|
+
see the [Job Configuration](../part2-advanced/job-configuration.md)
|
65
65
|
documentation.
|
66
66
|
|
67
67
|
## Previewing Execution with Dry Run
|
@@ -288,7 +288,7 @@ Or dictionaries for multiple named columns:
|
|
288
288
|
|
289
289
|
### Grouping Runs
|
290
290
|
|
291
|
-
The [`group_by`][hydraflow.core.run_collection.RunCollection.group_by] method organizes runs by common attributes:
|
291
|
+
The [`group_by`][hydraflow.core.collection.Collection.group_by] method organizes runs by common attributes:
|
292
292
|
|
293
293
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
294
294
|
>>> grouped = rc.group_by("width")
|
@@ -304,10 +304,11 @@ You can group by multiple keys:
|
|
304
304
|
... print(key, group)
|
305
305
|
```
|
306
306
|
|
307
|
-
Adding aggregation functions transforms the result into a DataFrame:
|
307
|
+
Adding aggregation functions using the `agg` method transforms the result into a DataFrame:
|
308
308
|
|
309
309
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
310
|
-
>>>
|
310
|
+
>>> grouped = rc.group_by("width")
|
311
|
+
>>> df = grouped.agg(n=lambda runs: len(runs))
|
311
312
|
>>> print(df)
|
312
313
|
```
|
313
314
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "hydraflow"
|
7
|
-
version = "0.16.1"
|
7
|
+
version = "0.17.0"
|
8
8
|
description = "HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities."
|
9
9
|
readme = "README.md"
|
10
10
|
license = { file = "LICENSE" }
|