hydraflow 0.16.2__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hydraflow-0.16.2 → hydraflow-0.17.1}/PKG-INFO +1 -1
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/index.md +20 -5
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-class.md +9 -6
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/run-collection.md +43 -100
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/advanced.md +1 -1
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/analysis.md +9 -6
- {hydraflow-0.16.2 → hydraflow-0.17.1}/pyproject.toml +1 -1
- hydraflow-0.17.1/src/hydraflow/core/collection.py +613 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/context.py +3 -4
- hydraflow-0.17.1/src/hydraflow/core/group_by.py +205 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/run.py +111 -62
- hydraflow-0.17.1/src/hydraflow/core/run_collection.py +215 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/run_info.py +0 -9
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run.py +50 -41
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run_collection.py +24 -68
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/test_run_info.py +0 -8
- hydraflow-0.17.1/tests/core/test_collection.py +298 -0
- hydraflow-0.17.1/tests/core/test_group_by.py +125 -0
- hydraflow-0.16.2/src/hydraflow/core/run_collection.py +0 -632
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/devcontainer.json +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/postCreate.sh +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.devcontainer/starship.toml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.gitattributes +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/ci.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/docs.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.github/workflows/publish.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/.gitignore +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/LICENSE +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/README.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/concepts.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/index.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/getting-started/installation.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/index.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/configuration.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/execution.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/index.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part1-applications/main-decorator.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/index.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/job-configuration.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part2-advanced/sweep-syntax.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/part3-analysis/updating-runs.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/applications.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/docs/practical-tutorials/index.md +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/example.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/hydraflow.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/examples/submit.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/mkdocs.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/cli.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/io.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/core/main.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/aio.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/conf.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/io.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/job.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/executor/parser.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/src/hydraflow/py.typed +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/app.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/conftest.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/hydraflow.yaml +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/submit.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_setup.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_show.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/cli/test_version.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/conftest.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/chdir.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/log_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/start_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_chdir.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_log_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/context/test_start_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/default.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/force_new_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/match_overrides.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/rerun_finished.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/skip_finished.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_default.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_force_new_run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_main.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_match_overrides.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_rerun_finished.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_skip_finished.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/test_update.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/main/update.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/run/run.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/core/test_io.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/__init__.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/conftest.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/echo.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/read.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_aio.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_args.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_conf.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_io.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_job.py +0 -0
- {hydraflow-0.16.2 → hydraflow-0.17.1}/tests/executor/test_parser.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: hydraflow
|
3
|
-
Version: 0.16.2
|
3
|
+
Version: 0.17.1
|
4
4
|
Summary: HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities.
|
5
5
|
Project-URL: Documentation, https://daizutabi.github.io/hydraflow/
|
6
6
|
Project-URL: Source, https://github.com/daizutabi/hydraflow
|
@@ -20,11 +20,15 @@ The main components of HydraFlow's analysis tools are:
|
|
20
20
|
1. **[`Run`][hydraflow.core.run.Run] Class**: Represents a single experiment
|
21
21
|
run, providing access to configuration and artifacts.
|
22
22
|
|
23
|
-
2. **[`
|
24
|
-
|
25
|
-
|
23
|
+
2. **[`Collection`][hydraflow.core.collection.Collection] Class**: A generic base class
|
24
|
+
implementing the `Sequence` protocol with powerful filtering, grouping, and data
|
25
|
+
extraction capabilities.
|
26
26
|
|
27
|
-
3. **Data Analysis Integration**: Tools to convert experiment data into
|
27
|
+
3. **[`RunCollection`][hydraflow.core.run_collection.RunCollection] Class**:
|
28
|
+
A collection of `Run` instances with specialized tools for filtering, grouping, and
|
29
|
+
aggregating results, built on top of the `Collection` class.
|
30
|
+
|
31
|
+
4. **Data Analysis Integration**: Tools to convert experiment data into
|
28
32
|
Polars DataFrames for advanced analysis.
|
29
33
|
|
30
34
|
## Practical Examples
|
@@ -47,6 +51,12 @@ filtered_runs = runs.filter(learning_rate=0.01, model_type="transformer")
|
|
47
51
|
# Group runs by a parameter
|
48
52
|
grouped_runs = runs.group_by("batch_size")
|
49
53
|
|
54
|
+
# Aggregate grouped data
|
55
|
+
df_aggregated = grouped_runs.agg(
|
56
|
+
count=lambda runs: len(runs),
|
57
|
+
avg_accuracy=lambda runs: sum(run.get("accuracy", 0) for run in runs) / len(runs)
|
58
|
+
)
|
59
|
+
|
50
60
|
# Convert to DataFrame for analysis
|
51
61
|
df = runs.to_frame("learning_rate", "batch_size", accuracy=lambda run: run.get("accuracy"))
|
52
62
|
|
@@ -141,4 +151,9 @@ In the following pages, we'll explore HydraFlow's analysis tools in detail:
|
|
141
151
|
working with multiple runs.
|
142
152
|
|
143
153
|
- [Updating Runs](updating-runs.md): Learn how to update existing runs with
|
144
|
-
new metrics, tags, and artifacts.
|
154
|
+
new metrics, tags, and artifacts.
|
155
|
+
|
156
|
+
[hydraflow.core.run.Run]: ../../api/hydraflow/core/run.html#hydraflow.core.run.Run
|
157
|
+
[hydraflow.core.run_collection.RunCollection]: ../../api/hydraflow/core/run_collection.html#hydraflow.core.run_collection.RunCollection
|
158
|
+
[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
|
159
|
+
[hydraflow.core.io.iter_run_dirs]: ../../api/hydraflow/core/io.html#hydraflow.core.io.iter_run_dirs
|
@@ -63,22 +63,25 @@ model_type = run.get("model__type") # Equivalent to "model.type"
|
|
63
63
|
metric_value = run.get("accuracy") # From impl or cfg
|
64
64
|
run_id = run.get("run_id") # From RunInfo
|
65
65
|
|
66
|
+
# Access special object keys
|
67
|
+
cfg = run.get("cfg") # Returns the complete configuration object
|
68
|
+
impl = run.get("impl") # Returns the implementation object
|
69
|
+
info = run.get("info") # Returns the run information object
|
70
|
+
|
66
71
|
# Provide a default value if the key doesn't exist
|
67
72
|
batch_size = run.get("batch_size", 32)
|
68
73
|
|
69
74
|
# Use a callable as default to dynamically generate values based on the run
|
70
75
|
# This is useful for derived parameters or conditional defaults
|
71
76
|
lr = run.get("learning_rate", default=lambda r: r.get("base_lr", 0.01) / 10)
|
72
|
-
|
73
|
-
# Complex default logic based on other parameters
|
74
|
-
steps = run.get("steps", default=lambda r: r.get("epochs", 10) * r.get("steps_per_epoch", 100))
|
75
77
|
```
|
76
78
|
|
77
79
|
The `get` method searches for values in the following order:
|
78
80
|
|
79
|
-
1.
|
80
|
-
2.
|
81
|
-
3.
|
81
|
+
1. In the configuration (`cfg`)
|
82
|
+
2. In the implementation instance (`impl`)
|
83
|
+
3. In the run information (`info`)
|
84
|
+
4. In the run object itself (`self`)
|
82
85
|
|
83
86
|
This provides a unified access interface regardless of where the data is stored.
|
84
87
|
|
@@ -5,6 +5,22 @@ powerful tool for working with multiple experiment runs. It provides methods
|
|
5
5
|
for filtering, grouping, and analyzing sets of [`Run`][hydraflow.core.run.Run]
|
6
6
|
instances, making it easy to compare and extract insights from your experiments.
|
7
7
|
|
8
|
+
## Architecture
|
9
|
+
|
10
|
+
`RunCollection` is built on top of the more general [`Collection`][hydraflow.core.collection.Collection]
|
11
|
+
class, which provides a flexible foundation for working with sequences of items. This architecture offers several benefits:
|
12
|
+
|
13
|
+
1. **Consistent Interface**: All collection-based classes in HydraFlow share a common interface and behavior
|
14
|
+
2. **Code Reuse**: Core functionality is implemented once in the base class and inherited by specialized collections
|
15
|
+
3. **Extensibility**: New collection types can easily be created for different item types
|
16
|
+
4. **Type Safety**: Generic type parameters ensure type checking throughout the collection hierarchy
|
17
|
+
|
18
|
+
The `Collection` class implements the Python `Sequence` protocol, allowing it to be used like standard Python
|
19
|
+
collections (lists, tuples) while providing specialized methods for filtering, grouping, and data extraction.
|
20
|
+
|
21
|
+
`RunCollection` extends this foundation with run-specific functionality, particularly for working with MLflow
|
22
|
+
experiment data. This layered design separates generic collection behavior from domain-specific operations.
|
23
|
+
|
8
24
|
## Creating a Run Collection
|
9
25
|
|
10
26
|
There are several ways to create a `RunCollection`:
|
@@ -101,7 +117,7 @@ multiple_models = runs.filter(model_type=["transformer", "lstm"])
|
|
101
117
|
def is_large_image(run: Run):
|
102
118
|
return run.get("width") + run.get("height") > 100
|
103
119
|
|
104
|
-
good_runs = runs.filter(
|
120
|
+
good_runs = runs.filter(is_large_image)
|
105
121
|
```
|
106
122
|
|
107
123
|
The double underscore notation (`__`) is particularly useful for accessing nested
|
@@ -133,7 +149,7 @@ def has_efficient_lr(run: Run) -> bool:
|
|
133
149
|
return lr * batch_size < 0.5
|
134
150
|
|
135
151
|
# Apply the complex predicate
|
136
|
-
efficient_runs = runs.filter(
|
152
|
+
efficient_runs = runs.filter(has_efficient_lr)
|
137
153
|
```
|
138
154
|
|
139
155
|
The combination of predicate functions with callable defaults in `get` enables sophisticated
|
@@ -250,6 +266,11 @@ df = runs.to_frame()
|
|
250
266
|
# DataFrame with specific configuration parameters
|
251
267
|
df = runs.to_frame("model_type", "learning_rate", "batch_size")
|
252
268
|
|
269
|
+
# Include Run, configuration, or implementation objects as columns
|
270
|
+
df = runs.to_frame("model_type", "learning_rate", "run") # Include Run objects
|
271
|
+
df = runs.to_frame("model_type", "cfg") # Include configuration objects
|
272
|
+
df = runs.to_frame("run_id", "run", "cfg", "impl") # Include all objects
|
273
|
+
|
253
274
|
# Specify default values for missing parameters using the defaults parameter
|
254
275
|
df = runs.to_frame(
|
255
276
|
"model_type",
|
@@ -258,17 +279,6 @@ df = runs.to_frame(
|
|
258
279
|
defaults={"learning_rate": 0.01, "batch_size": 32}
|
259
280
|
)
|
260
281
|
|
261
|
-
# Use callable defaults for dynamic values based on each run
|
262
|
-
df = runs.to_frame(
|
263
|
-
"model_type",
|
264
|
-
"learning_rate",
|
265
|
-
"epochs",
|
266
|
-
defaults={
|
267
|
-
"learning_rate": lambda run: run.get("base_lr", 0.01) * run.get("lr_multiplier", 1.0),
|
268
|
-
"epochs": lambda run: int(run.get("max_steps", 1000) / run.get("steps_per_epoch", 100))
|
269
|
-
}
|
270
|
-
)
|
271
|
-
|
272
282
|
# Missing values without defaults are represented as None (null) in the DataFrame
|
273
283
|
# This allows for standard handling of missing data in Polars
|
274
284
|
missing_values_df = runs.to_frame("model_type", "parameter_that_might_be_missing")
|
@@ -281,24 +291,6 @@ valid_rows = missing_values_df.filter(pl.col("parameter_that_might_be_missing").
|
|
281
291
|
filled_df = missing_values_df.with_columns(
|
282
292
|
pl.col("parameter_that_might_be_missing").fill_null("default_value")
|
283
293
|
)
|
284
|
-
|
285
|
-
# Using a custom function that returns multiple columns as keyword arguments
|
286
|
-
def get_metrics(run: Run) -> dict[str, float]:
|
287
|
-
return {
|
288
|
-
"accuracy": run.get("accuracy", default=lambda r: r.get("val_accuracy", 0.0) * 0.9),
|
289
|
-
"precision": run.get("precision", default=lambda r: r.get("val_precision", 0.0) * 0.9),
|
290
|
-
}
|
291
|
-
|
292
|
-
# Add custom columns using a function
|
293
|
-
df = runs.to_frame("model_type", metrics=get_metrics)
|
294
|
-
|
295
|
-
# Combine defaults with custom column generator functions
|
296
|
-
df = runs.to_frame(
|
297
|
-
"model_type",
|
298
|
-
"learning_rate",
|
299
|
-
defaults={"learning_rate": 0.01},
|
300
|
-
metrics=get_metrics
|
301
|
-
)
|
302
294
|
```
|
303
295
|
|
304
296
|
The `to_frame` method provides several ways to handle missing data:
|
@@ -313,12 +305,10 @@ The `to_frame` method provides several ways to handle missing data:
|
|
313
305
|
- Fill nulls: `df.with_columns(pl.col("param").fill_null(value))`
|
314
306
|
- Aggregations: Most aggregation functions handle nulls appropriately
|
315
307
|
|
316
|
-
3. **
|
317
|
-
|
318
|
-
-
|
319
|
-
|
320
|
-
These approaches can be combined to create flexible and robust data extraction pipelines
|
321
|
-
that handle different experiment configurations and parameter evolution over time.
|
308
|
+
3. **Special object keys**: Use the special keys `"run"`, `"cfg"`, and `"impl"` to include the actual
|
309
|
+
Run objects, configuration objects, or implementation objects in the DataFrame
|
310
|
+
- This allows direct access to the original objects for further operations
|
311
|
+
- You can combine regular data columns with object columns as needed
|
322
312
|
|
323
313
|
## Grouping Runs
|
324
314
|
|
@@ -343,75 +333,26 @@ param_groups = runs.group_by("model_type", "model__hidden_size", "optimizer__lea
|
|
343
333
|
|
344
334
|
# Access a specific group
|
345
335
|
transformer_001_group = param_groups[("transformer", 0.001)]
|
336
|
+
|
337
|
+
# Aggregating grouped runs using the agg method
|
338
|
+
# This returns a DataFrame with the aggregated results
|
339
|
+
model_counts = model_groups.agg(count=lambda runs: len(runs))
|
340
|
+
model_avg_loss = model_groups.agg(
|
341
|
+
avg_loss=lambda runs: sum(run.get("loss", 0) for run in runs) / len(runs),
|
342
|
+
min_loss=lambda runs: min(run.get("loss", float("inf")) for run in runs)
|
343
|
+
)
|
346
344
|
```
|
347
345
|
|
348
|
-
|
346
|
+
The `group_by` method returns a `GroupBy` instance that maps keys to `RunCollection` instances. This design allows you to:
|
349
347
|
|
350
348
|
- Work with each group as a separate `RunCollection` with all the filtering, sorting, and analysis capabilities
|
351
349
|
- Perform custom operations on each group that might not be expressible as simple aggregation functions
|
352
350
|
- Chain additional operations on specific groups that interest you
|
353
351
|
- Implement multi-stage analysis workflows where you need to maintain the full run information at each step
|
354
352
|
|
355
|
-
This
|
356
|
-
|
357
|
-
## Aggregation with Group By
|
358
|
-
|
359
|
-
Combine `group_by` with aggregation for powerful analysis:
|
360
|
-
|
361
|
-
```python
|
362
|
-
# Simple aggregation function using get method with callable defaults
|
363
|
-
def mean_accuracy(runs: RunCollection) -> float:
|
364
|
-
return runs.to_numpy(
|
365
|
-
"accuracy",
|
366
|
-
default=lambda run: run.get("val_accuracy", 0.0) * 0.9
|
367
|
-
).mean()
|
368
|
-
|
369
|
-
# Complex aggregation from implementation or configuration with fallbacks
|
370
|
-
def combined_metric(runs: RunCollection) -> float:
|
371
|
-
# Use callable defaults to handle missing values consistently
|
372
|
-
accuracies = runs.to_numpy("accuracy", default=lambda r: r.get("val_accuracy", 0.0))
|
373
|
-
precisions = runs.to_numpy("precision", default=lambda r: r.get("val_precision", 0.0))
|
374
|
-
return (accuracies.mean() + precisions.mean()) / 2
|
375
|
-
|
376
|
-
|
377
|
-
# Group by model type and calculate average accuracy
|
378
|
-
model_accuracies = runs.group_by(
|
379
|
-
"model_type",
|
380
|
-
accuracy=mean_accuracy
|
381
|
-
)
|
382
|
-
|
383
|
-
# Group by multiple parameters with multiple aggregations
|
384
|
-
results = runs.group_by(
|
385
|
-
"model_type",
|
386
|
-
"learning_rate",
|
387
|
-
count=len,
|
388
|
-
accuracy=mean_accuracy,
|
389
|
-
combined=combined_metric
|
390
|
-
)
|
391
|
-
|
392
|
-
# Group by parameters that might be missing in some runs using callable defaults
|
393
|
-
def normalize_architecture(run: Run) -> str:
|
394
|
-
# Get architecture with a fallback to model type if not available
|
395
|
-
arch = run.get("architecture", default=lambda r: r.get("model_type", "unknown"))
|
396
|
-
return arch.lower() # Normalize to lowercase
|
397
|
-
|
398
|
-
# Group by the normalized architecture
|
399
|
-
arch_results = runs.group_by(normalize_architecture, accuracy=mean_accuracy)
|
400
|
-
```
|
401
|
-
|
402
|
-
With the enhanced `get` method and callable defaults support throughout the API, writing aggregation
|
403
|
-
functions becomes more straightforward and robust. You can handle missing values consistently and
|
404
|
-
implement complex transformations that work across heterogeneous runs.
|
405
|
-
|
406
|
-
When aggregation functions are provided as keyword arguments, `group_by` returns a Polars DataFrame with the group keys and aggregated values. This design choice offers several advantages:
|
407
|
-
|
408
|
-
- Directly produces analysis-ready results with all aggregations computed in a single operation
|
409
|
-
- Enables efficient downstream analysis using Polars' powerful DataFrame operations
|
410
|
-
- Simplifies visualization and reporting workflows
|
411
|
-
- Reduces memory usage by computing only the requested aggregations rather than maintaining full RunCollections
|
412
|
-
- Creates a clean interface that separates grouping from additional analysis steps
|
353
|
+
To perform aggregations on the grouped data, use the `agg` method on the GroupBy instance. This transforms the grouped data into a DataFrame with aggregated results. You can define multiple aggregation functions to compute different metrics across each group.
|
413
354
|
|
414
|
-
|
355
|
+
This approach preserves all information in each group, giving you maximum flexibility for downstream analysis.
|
415
356
|
|
416
357
|
## Type-Safe Run Collections
|
417
358
|
|
@@ -483,7 +424,7 @@ for run in runs:
|
|
483
424
|
type checking.
|
484
425
|
|
485
426
|
3. **Chain Operations**: Combine filtering, grouping,
|
486
|
-
and
|
427
|
+
and object extraction for efficient analysis workflows.
|
487
428
|
|
488
429
|
4. **Use DataFrame Integration**: Convert to DataFrames
|
489
430
|
for complex analysis and visualization needs.
|
@@ -492,6 +433,8 @@ for run in runs:
|
|
492
433
|
|
493
434
|
The [`RunCollection`][hydraflow.core.run_collection.RunCollection] class is a
|
494
435
|
powerful tool for comparative analysis of machine learning experiments. Its
|
495
|
-
filtering, grouping, and
|
436
|
+
filtering, grouping, and data extraction capabilities enable efficient extraction
|
496
437
|
of insights from large sets of experiments, helping you identify optimal
|
497
|
-
configurations and understand performance trends.
|
438
|
+
configurations and understand performance trends.
|
439
|
+
|
440
|
+
[hydraflow.core.collection.Collection]: ../../api/hydraflow/core/collection.html#hydraflow.core.collection.Collection
|
@@ -61,7 +61,7 @@ Where:
|
|
61
61
|
- `[overrides]` are optional Hydra-style parameter overrides
|
62
62
|
|
63
63
|
For more details on the CLI,
|
64
|
-
see the [Job Configuration](../part2-advanced/job-configuration.md
|
64
|
+
see the [Job Configuration](../part2-advanced/job-configuration.md)
|
65
65
|
documentation.
|
66
66
|
|
67
67
|
## Previewing Execution with Dry Run
|
@@ -220,7 +220,7 @@ You can perform basic operations on a collection:
|
|
220
220
|
|
221
221
|
### Filtering Runs
|
222
222
|
|
223
|
-
The [`filter`][hydraflow.core.run_collection.RunCollection.filter] method lets you select runs based on various criteria:
|
223
|
+
The [`filter`][hydraflow.core.collection.Collection.filter] method lets you select runs based on various criteria:
|
224
224
|
|
225
225
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
226
226
|
>>> print(rc.filter(width=400))
|
@@ -246,7 +246,7 @@ You can even use custom filter functions:
|
|
246
246
|
|
247
247
|
### Finding Specific Runs
|
248
248
|
|
249
|
-
The [`get`][hydraflow.core.run_collection.RunCollection.get] method returns a single run matching your criteria:
|
249
|
+
The [`get`][hydraflow.core.collection.Collection.get] method returns a single run matching your criteria:
|
250
250
|
|
251
251
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
252
252
|
>>> run = rc.get(width=250, height=(100, 200))
|
@@ -288,7 +288,7 @@ Or dictionaries for multiple named columns:
|
|
288
288
|
|
289
289
|
### Grouping Runs
|
290
290
|
|
291
|
-
The [`group_by`][hydraflow.core.run_collection.RunCollection.group_by] method organizes runs by common attributes:
|
291
|
+
The [`group_by`][hydraflow.core.collection.Collection.group_by] method organizes runs by common attributes:
|
292
292
|
|
293
293
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
294
294
|
>>> grouped = rc.group_by("width")
|
@@ -304,10 +304,12 @@ You can group by multiple keys:
|
|
304
304
|
... print(key, group)
|
305
305
|
```
|
306
306
|
|
307
|
-
Adding aggregation functions
|
307
|
+
Adding aggregation functions using the [`agg`][hydraflow.core.collection.Collection.agg]
|
308
|
+
method transforms the result into a DataFrame:
|
308
309
|
|
309
310
|
```pycon exec="1" source="console" session="results" workdir="examples"
|
310
|
-
>>>
|
311
|
+
>>> grouped = rc.group_by("width")
|
312
|
+
>>> df = grouped.agg(n=lambda runs: len(runs))
|
311
313
|
>>> print(df)
|
312
314
|
```
|
313
315
|
|
@@ -321,7 +323,8 @@ In this tutorial, you've learned how to:
|
|
321
323
|
4. Filter, group, and analyze collections of runs
|
322
324
|
5. Convert run data to DataFrames for advanced analysis
|
323
325
|
|
324
|
-
These capabilities enable you to efficiently analyze your experiments and extract
|
326
|
+
These capabilities enable you to efficiently analyze your experiments and extract
|
327
|
+
valuable insights from your machine learning workflows.
|
325
328
|
|
326
329
|
## Next Steps
|
327
330
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "hydraflow"
|
7
|
-
version = "0.16.2"
|
7
|
+
version = "0.17.1"
|
8
8
|
description = "HydraFlow seamlessly integrates Hydra and MLflow to streamline ML experiment management, combining Hydra's configuration management with MLflow's tracking capabilities."
|
9
9
|
readme = "README.md"
|
10
10
|
license = { file = "LICENSE" }
|