expops 0.1.16.dev0__tar.gz → 0.1.18.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/PKG-INFO +1 -1
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/data-parallelism.md +9 -10
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/pipelines.md +7 -5
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/reporting.md +9 -32
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/seed-parallelism.md +15 -14
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/project-structure/configuration.md +8 -3
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/project-structure/overview.md +0 -2
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/PKG-INFO +1 -1
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/SOURCES.txt +6 -1
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/_version.py +3 -3
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/custom/custom_adapter.py +27 -12
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/executor_worker.py +127 -15
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/networkx_parser.py +35 -11
- expops-0.1.18.dev0/src/mlops/core/pipeline_tree.py +524 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/pipeline_utils.py +34 -5
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/probe_path_selectors.py +6 -1
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/process_hashing.py +19 -5
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/step_state_manager.py +80 -17
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/step_system.py +3 -5
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/managers/project_manager.py +91 -11
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/platform.py +274 -12
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/reporting/entrypoint.py +61 -2
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/factory.py +26 -12
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/charts/plot_metrics.js +18 -11
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/charts/plot_metrics.py +7 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/configs/project_config.yaml +21 -21
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/configs/project_config.yaml +3 -3
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/server.py +46 -13
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/ui/mlops-charts.js +64 -13
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/ui/script.js +124 -26
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/testing-plan.md +17 -7
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_executor_worker.py +72 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_graph_expansion.py +29 -0
- expops-0.1.18.dev0/tests/unit/test_core/test_networkx_parser_code_field.py +78 -0
- expops-0.1.18.dev0/tests/unit/test_core/test_pipeline_tree_steps.py +101 -0
- expops-0.1.18.dev0/tests/unit/test_core/test_probe_path_selectors_xpath.py +81 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_process_hashing.py +1 -1
- expops-0.1.18.dev0/tests/unit/test_core/test_step_state_manager.py +90 -0
- expops-0.1.18.dev0/tests/unit/test_platform/test_dynamic_js_charts.py +146 -0
- expops-0.1.18.dev0/tests/unit/test_platform/test_project_metadata_resilience.py +94 -0
- expops-0.1.18.dev0/tests/unit/test_reporting/test_entrypoint.py +53 -0
- expops-0.1.16.dev0/docs/project-structure/charts.md +0 -160
- expops-0.1.16.dev0/src/mlops/core/pipeline_tree.py +0 -234
- expops-0.1.16.dev0/tests/unit/test_core/test_step_state_manager.py +0 -49
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/.github/workflows/ci.yml +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/.github/workflows/release.yml +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/.gitignore +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/LICENSE +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/README.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/advanced/backends.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/caching.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/distributed.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/features/environments.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/getting-started/creating-a-project.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/getting-started/quick-start.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/index.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/project-structure/model-code.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/requirements.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/templates/premier-league.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/templates/sklearn-basic.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/docs/web-ui/local-ui.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/pyproject.toml +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/setup.cfg +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/__main__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/core/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/main.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/reporting/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/reporting/context.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/reporting/registry.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/web/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops/web/server.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/dependency_links.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/entry_points.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/requires.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/expops.egg-info/top_level.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/__main__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/base.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/config_schema.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/custom/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/plugin_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/sklearn/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/adapters/sklearn/adapter.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/cluster/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/cluster/controller.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/cluster/process_runner.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/cluster/providers.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/compute_config.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/custom_model_base.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/dask_networkx_executor.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/data_hashing.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/graph_expansion.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/graph_types.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/payload_spill.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/core/workspace.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/base.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/conda_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/factory.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/pyenv_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/setup_env.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/system_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/utils.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/environment/venv_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/main.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/managers/reproducibility_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/reporting/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/reporting/context.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/reporting/kv_utils.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/reporting/registry.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/runtime/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/runtime/context.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/runtime/env_export.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/gcp_kv_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/gcs_object_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/memory_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/redis_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/adapters/sqlite_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/interfaces/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/interfaces/kv_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/storage/path_utils.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/charts/requirements.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/configs/compute_config.yaml +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/data/England CSV.csv +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/models/premier_league_model.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/premier-league/requirements.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/README.md +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/charts/plot_metrics.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/charts/requirements.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/configs/compute_config.yaml +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/data/train.csv +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/models/model.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/templates/sklearn-basic/requirements.txt +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/ui/index.html +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/src/mlops/web/ui/styles.css +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/conftest.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_dask_networkx_executor.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_data_hashing.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_networkx_parser.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_payload_spill.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_prepare_runner_kwargs.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_core/test_step_system.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_managers/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_managers/test_reproducibility_manager.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_storage/__init__.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_storage/test_factory.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_storage/test_gcp_kv_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_storage/test_gcs_object_store.py +0 -0
- {expops-0.1.16.dev0 → expops-0.1.18.dev0}/tests/unit/test_storage/test_sqlite_store.py +0 -0
|
@@ -5,7 +5,7 @@ Use it when a process returns row-splittable data (pandas DataFrame, numpy array
|
|
|
5
5
|
|
|
6
6
|
## Config
|
|
7
7
|
|
|
8
|
-
### Split from upstream (no `
|
|
8
|
+
### Split from upstream (no `code`)
|
|
9
9
|
|
|
10
10
|
```yaml
|
|
11
11
|
processes:
|
|
@@ -17,16 +17,16 @@ processes:
|
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
- `data_name` tells the system which upstream output key to split.
|
|
20
|
-
- The split process has no `
|
|
21
|
-
-
|
|
20
|
+
- The split process has no `code`; it will merge its upstream outputs and split the `data_name` key.
|
|
21
|
+
- The script path for such helper split nodes is optional.
|
|
22
22
|
|
|
23
|
-
### User-defined splitter (`
|
|
23
|
+
### User-defined splitter (`code`)
|
|
24
24
|
|
|
25
25
|
```yaml
|
|
26
26
|
processes:
|
|
27
27
|
- name: "nn_data_parallel"
|
|
28
28
|
description: "Custom splitter"
|
|
29
|
-
|
|
29
|
+
code: "define_nn_data_parallel"
|
|
30
30
|
data_parallelism:
|
|
31
31
|
size: [50, 20, 20]
|
|
32
32
|
```
|
|
@@ -42,7 +42,7 @@ def define_nn_data_parallel():
|
|
|
42
42
|
|
|
43
43
|
- Return a **list of rows**; the framework will split it based on `size`.
|
|
44
44
|
- If `data_name` is omitted and the function returns a list, the data key defaults to `data`.
|
|
45
|
-
- `script_path` is required when using `
|
|
45
|
+
- `script_path` is required when using an explicit `code` function.
|
|
46
46
|
|
|
47
47
|
### Size formats
|
|
48
48
|
|
|
@@ -57,10 +57,10 @@ Add a dedicated aggregation process to collapse the **latest** data-parallel lay
|
|
|
57
57
|
processes:
|
|
58
58
|
- name: "aggregate_results"
|
|
59
59
|
data_aggregation: true
|
|
60
|
-
|
|
60
|
+
code: "define_aggregate_results"
|
|
61
61
|
```
|
|
62
62
|
|
|
63
|
-
Aggregation processes **must** define a `
|
|
63
|
+
Aggregation processes **must** define a `code` function reference.
|
|
64
64
|
|
|
65
65
|
### Input shape at aggregation
|
|
66
66
|
|
|
@@ -76,8 +76,7 @@ def define_aggregate_results(df):
|
|
|
76
76
|
## Notes
|
|
77
77
|
|
|
78
78
|
- Data parallelism duplicates downstream nodes in the graph and is visible in the UI.
|
|
79
|
-
- Chart probe paths can use selector syntax to automatically expand to data-parallel
|
|
80
|
-
partitions; see the chart documentation for details.
|
|
79
|
+
- Chart probe paths can use XPath selector syntax over the pipeline tree (including `@partition='p1'` predicates) to automatically expand to data-parallel partitions; see the [reporting documentation](reporting.md) for details.
|
|
81
80
|
- Multiple data-parallel layers are supported. Each aggregation collapses only the
|
|
82
81
|
most recent data-parallel layer; outer layers are still represented by separate
|
|
83
82
|
process nodes.
|
|
@@ -46,12 +46,12 @@ Each process must be explicitly defined with its configuration:
|
|
|
46
46
|
processes:
|
|
47
47
|
- name: "feature_engineering"
|
|
48
48
|
description: "Load and prepare data"
|
|
49
|
-
|
|
49
|
+
code: "define_feature_engineering_process"
|
|
50
50
|
environment: "my-project-env"
|
|
51
51
|
|
|
52
52
|
- name: "train_model"
|
|
53
53
|
description: "Train the model"
|
|
54
|
-
|
|
54
|
+
code: "define_training_process"
|
|
55
55
|
environment: "my-project-env"
|
|
56
56
|
parameters:
|
|
57
57
|
learning_rate: 0.001
|
|
@@ -67,8 +67,10 @@ processes:
|
|
|
67
67
|
|
|
68
68
|
- `name`: Unique process identifier (must match names in `process_adjlist`)
|
|
69
69
|
- `description`: Human-readable description
|
|
70
|
-
- `
|
|
71
|
-
- `
|
|
70
|
+
- `code` (optional): Unified code reference for the process:
|
|
71
|
+
- `code: "script_key.function_name"` uses the script registered under `script_key` in the top-level `scripts` map and calls `function_name`.
|
|
72
|
+
- `code: "function_name"` uses the first script key in `scripts` and calls `function_name`.
|
|
73
|
+
- If omitted, ordinary processes default to a function with the same name as the process in the default script, while pure data/seed split helper nodes (processes that only define `data_parallelism` or `seed_parallelism`) can omit `code` entirely.
|
|
72
74
|
- `environment`: Environment name to use (defaults to the first environment if omitted)
|
|
73
75
|
- `parameters`: Optional parameters injected by name into the process function
|
|
74
76
|
- `type`: Optional type (e.g., `"chart"` for chart generation processes)
|
|
@@ -77,7 +79,7 @@ processes:
|
|
|
77
79
|
|
|
78
80
|
## Process Functions
|
|
79
81
|
|
|
80
|
-
Processes are implemented in Python using the `@process()` decorator. The function name
|
|
82
|
+
Processes are implemented in Python using the `@process()` decorator. The function name referenced in `code` (or the process name when `code` is omitted) must match the registered process:
|
|
81
83
|
|
|
82
84
|
```python
|
|
83
85
|
from expops.core import process, step,
|
|
@@ -69,8 +69,8 @@ reporting:
|
|
|
69
69
|
charts:
|
|
70
70
|
- name: "my_chart"
|
|
71
71
|
probe_paths:
|
|
72
|
-
train: "//*[@name
|
|
73
|
-
eval: "//*[@name
|
|
72
|
+
train: "//*[@name='train_model']"
|
|
73
|
+
eval: "//*[@name='evaluate_model']"
|
|
74
74
|
```
|
|
75
75
|
|
|
76
76
|
The chart function receives:
|
|
@@ -100,38 +100,15 @@ Common patterns:
|
|
|
100
100
|
|
|
101
101
|
| Goal | XPath pattern |
|
|
102
102
|
|------|----------------|
|
|
103
|
-
| Process by name | `//*[@name=
|
|
104
|
-
| Process + step | `//*[@name=
|
|
105
|
-
| Specific partition/seed | `//*[@partition=
|
|
106
|
-
| Any partition/seed | `//*[@partition]/*[@seed]/*[@name=
|
|
107
|
-
|
|
108
|
-
In `project_config.yaml`, probe paths are double-quoted YAML strings, so escape each `"` as `\"` (see examples below).
|
|
109
|
-
|
|
110
|
-
**Examples from projects:**
|
|
111
|
-
|
|
112
|
-
- **sklearn-basic** (simple pipeline, no steps):
|
|
113
|
-
|
|
114
|
-
```yaml
|
|
115
|
-
probe_paths:
|
|
116
|
-
train: "//*[@name=\"train_model\"]"
|
|
117
|
-
eval: "//*[@name=\"evaluate_model\"]"
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
- **premier-league** (process + step; partition and seed):
|
|
121
|
-
|
|
122
|
-
```yaml
|
|
123
|
-
probe_paths:
|
|
124
|
-
feat: "//*[@name=\"feature_engineering_generic\"]/step[@name=\"feature_analysis\"]"
|
|
125
|
-
nn_a_p1_seed41: "//*[@partition=\"p1\"]/*[@seed=\"41\"]/*[@name=\"nn_training_a\"]/*[@name=\"train_and_evaluate_nn_classifier\"]"
|
|
126
|
-
linear: "//*[@partition]/*[@seed]/*[@name=\"linear_inference\"]/*[@name=\"test_inference_classification\"]"
|
|
127
|
-
nn_best: "//*[@partition]/*[@seed]/*[@name=\"nn_best_inference\"]/step[@name=\"test_inference_classification\"]"
|
|
128
|
-
ensemble: "//*[@partition]/*[@seed]/*[@name=\"ensemble_inference\"]"
|
|
129
|
-
```
|
|
103
|
+
| Process by name | `//*[@name='process_name']` |
|
|
104
|
+
| Process + step | `//*[@name='process_name']/*[@name='step_name']` or `//*[@name='step_name']` if the step name is unique among process names |
|
|
105
|
+
| Specific partition/seed | `//*[@partition='p1']/*[@seed='41']/*[@name='process_name']` |
|
|
106
|
+
| Any partition/seed | `//*[@partition]/*[@seed]/*[@name='process_name']` |
|
|
130
107
|
|
|
131
108
|
#### How keys map to chart metrics
|
|
132
109
|
|
|
133
110
|
- **One XPath match**: The config key is preserved (e.g. `train` → `metrics['train']`).
|
|
134
|
-
- **Multiple XPath matches**: Each resolved probe path becomes a key (e.g. `
|
|
111
|
+
- **Multiple XPath matches**: Each resolved probe path becomes a key. The key is the canonical XPath-style identifier for that process/step (e.g. `"//*[@partition='p1']/*[@seed='41']/*[@name='nn_training_a']/step[@name='train_and_evaluate_nn_classifier']"`). Chart code can iterate over keys or use prefix/grouping logic to aggregate across partitions or seeds.
|
|
135
112
|
- **Literal path**: Single key as in config (e.g. `train: "train_model"` → `metrics['train']`).
|
|
136
113
|
|
|
137
114
|
### Output
|
|
@@ -147,7 +124,7 @@ Dynamic charts provide real-time, interactive visualizations.
|
|
|
147
124
|
|
|
148
125
|
**Configuration**: Dynamic charts are defined as **pipeline processes** in `project_config.yaml` (under `experiment.parameters.pipeline.processes`). Each dynamic chart process must have:
|
|
149
126
|
|
|
150
|
-
- `
|
|
127
|
+
- `code` - a unified code reference that points to your JS chart script and function (e.g. `code: "reporting_js.nn_losses"`), where `reporting_js` is defined under `scripts:` at the top of the config
|
|
151
128
|
- `chart_type: "dynamic"`
|
|
152
129
|
- `probe_paths` - same XPath semantics as static charts (see [Probe paths](#probe-paths) below)
|
|
153
130
|
|
|
@@ -158,7 +135,7 @@ Dynamic charts provide real-time, interactive visualizations.
|
|
|
158
135
|
```yaml
|
|
159
136
|
# Under experiment.parameters.pipeline.processes:
|
|
160
137
|
- name: "nn_losses"
|
|
161
|
-
|
|
138
|
+
code: "reporting_js.nn_losses"
|
|
162
139
|
environment: "premier-league-env-reporting"
|
|
163
140
|
chart_type: "dynamic"
|
|
164
141
|
probe_paths: ...
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
# Seed Parallelism
|
|
2
2
|
|
|
3
3
|
Seed parallelism duplicates the downstream process graph and runs each duplicate under a different random seed. It uses deterministic task-level seeding so that each duplicated branch is reproducible.
|
|
4
4
|
|
|
5
5
|
## Config
|
|
6
6
|
|
|
7
|
-
### Predefined seed split (no `
|
|
7
|
+
### Predefined seed split (no `code`)
|
|
8
8
|
|
|
9
9
|
```yaml
|
|
10
10
|
processes:
|
|
@@ -14,18 +14,17 @@ processes:
|
|
|
14
14
|
seeds: [41, 42, 43]
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
-
- The split process has no `
|
|
17
|
+
- The split process has no `code`; it will merge upstream outputs and pass them through.
|
|
18
18
|
- The engine duplicates all downstream nodes until a `seed_aggregation` node is reached.
|
|
19
|
-
- Duplicated nodes
|
|
20
|
-
- `script_path` is optional for split nodes with no `code_function`.
|
|
19
|
+
- Duplicated nodes are tracked via structured `seed_value` metadata on each process instance.
|
|
21
20
|
|
|
22
|
-
### User-defined seed split (`
|
|
21
|
+
### User-defined seed split (`code`)
|
|
23
22
|
|
|
24
23
|
```yaml
|
|
25
24
|
processes:
|
|
26
25
|
- name: "seed_parallel"
|
|
27
26
|
description: "Custom seed hook"
|
|
28
|
-
|
|
27
|
+
code: "define_seed_parallel_process"
|
|
29
28
|
seed_parallelism:
|
|
30
29
|
seeds: [41, 42, 43]
|
|
31
30
|
```
|
|
@@ -39,7 +38,6 @@ def define_seed_parallel_process(seeds, **inputs):
|
|
|
39
38
|
```
|
|
40
39
|
|
|
41
40
|
- The `seeds` list is passed to the user-defined function as a kwarg.
|
|
42
|
-
- `script_path` is required when using `code_function`.
|
|
43
41
|
|
|
44
42
|
## Aggregation
|
|
45
43
|
|
|
@@ -49,7 +47,7 @@ Add a dedicated aggregation process to collapse the **latest** seed-parallel lay
|
|
|
49
47
|
processes:
|
|
50
48
|
- name: "aggregate_seeds"
|
|
51
49
|
seed_aggregation: true
|
|
52
|
-
|
|
50
|
+
code: "define_aggregate_seeds"
|
|
53
51
|
```
|
|
54
52
|
|
|
55
53
|
### Input shape at aggregation
|
|
@@ -74,8 +72,11 @@ def define_aggregate_data_and_seed(metrics):
|
|
|
74
72
|
|
|
75
73
|
## Notes
|
|
76
74
|
|
|
77
|
-
- Seed
|
|
78
|
-
|
|
79
|
-
seed
|
|
80
|
-
|
|
81
|
-
|
|
75
|
+
- **UI representation**: Seed-parallel duplicates now use canonical XPath-style
|
|
76
|
+
process IDs in the process graph (for example
|
|
77
|
+
`//*[@partition='p1']/*[@seed='41']/process[@name='train']`), and human-readable
|
|
78
|
+
labels such as `train P1 S41` are derived from this metadata.
|
|
79
|
+
- **Task-level RNG seeding**: Uses the most recent (innermost) seed layer when
|
|
80
|
+
multiple seed-parallel layers are nested.
|
|
81
|
+
- **Aggregation behavior**: Each aggregation collapses only the most recent
|
|
82
|
+
seed-parallel layer; outer layers remain as separate process nodes.
|
|
@@ -41,8 +41,13 @@ reporting: # Chart entrypoints, probe paths, reporting environment
|
|
|
41
41
|
|
|
42
42
|
You can omit some process fields and rely on defaults to keep config minimal:
|
|
43
43
|
|
|
44
|
-
- **
|
|
45
|
-
- **
|
|
44
|
+
- **Scripts section**: The top-level `scripts` map still maps script keys to file paths. The **first key** is treated as the default script for processes that do not explicitly specify a script key in `code`.
|
|
45
|
+
- **Code field**: Each process may define a single `code` field instead of separate `script` and `code_function` fields:
|
|
46
|
+
- `code: "script_key.function_name"` → use the script registered under `script_key` and call `function_name` from that module.
|
|
47
|
+
- `code: "function_name"` → use the first script key in `scripts` and call `function_name`.
|
|
48
|
+
- **Omitted code**:
|
|
49
|
+
- For ordinary processes, if `code` is omitted the system assumes a function with the same name as the process, loaded from the default script.
|
|
50
|
+
- For data/seed split helper nodes (processes that only define `data_parallelism` or `seed_parallelism` and no `code`), the system treats them as function-less split nodes.
|
|
46
51
|
|
|
47
52
|
Example minimal process that uses both defaults:
|
|
48
53
|
|
|
@@ -88,7 +93,7 @@ For detailed information on each configuration section:
|
|
|
88
93
|
- **Process & Step Code**: [Model Code](model-code.md)
|
|
89
94
|
- **Caching**: [Caching & Reproducibility](../features/caching.md)
|
|
90
95
|
- **Backends**: [Backends](../advanced/backends.md)
|
|
91
|
-
- **Reporting/Charts**: [Reporting Features](../features/reporting.md)
|
|
96
|
+
- **Reporting/Charts**: [Reporting Features](../features/reporting.md)
|
|
92
97
|
- **Cluster Execution**: [Cluster Configuration](../advanced/cluster-config.md) and [Distributed Computing](../features/distributed.md)
|
|
93
98
|
|
|
94
99
|
## Example Configurations
|
|
@@ -49,8 +49,6 @@ The `charts/` directory contains visualization code:
|
|
|
49
49
|
- **plot_metrics.py**: Static PNG chart generation
|
|
50
50
|
- **plot_metrics.js**: Dynamic interactive charts
|
|
51
51
|
|
|
52
|
-
See [Chart Generation](charts.md) for details.
|
|
53
|
-
|
|
54
52
|
### Dependencies
|
|
55
53
|
|
|
56
54
|
- **requirements.txt**: Main dependencies for training/inference
|
|
@@ -16,7 +16,6 @@ docs/features/reporting.md
|
|
|
16
16
|
docs/features/seed-parallelism.md
|
|
17
17
|
docs/getting-started/creating-a-project.md
|
|
18
18
|
docs/getting-started/quick-start.md
|
|
19
|
-
docs/project-structure/charts.md
|
|
20
19
|
docs/project-structure/configuration.md
|
|
21
20
|
docs/project-structure/model-code.md
|
|
22
21
|
docs/project-structure/overview.md
|
|
@@ -134,13 +133,19 @@ tests/unit/test_core/test_data_hashing.py
|
|
|
134
133
|
tests/unit/test_core/test_executor_worker.py
|
|
135
134
|
tests/unit/test_core/test_graph_expansion.py
|
|
136
135
|
tests/unit/test_core/test_networkx_parser.py
|
|
136
|
+
tests/unit/test_core/test_networkx_parser_code_field.py
|
|
137
137
|
tests/unit/test_core/test_payload_spill.py
|
|
138
|
+
tests/unit/test_core/test_pipeline_tree_steps.py
|
|
138
139
|
tests/unit/test_core/test_prepare_runner_kwargs.py
|
|
140
|
+
tests/unit/test_core/test_probe_path_selectors_xpath.py
|
|
139
141
|
tests/unit/test_core/test_process_hashing.py
|
|
140
142
|
tests/unit/test_core/test_step_state_manager.py
|
|
141
143
|
tests/unit/test_core/test_step_system.py
|
|
142
144
|
tests/unit/test_managers/__init__.py
|
|
143
145
|
tests/unit/test_managers/test_reproducibility_manager.py
|
|
146
|
+
tests/unit/test_platform/test_dynamic_js_charts.py
|
|
147
|
+
tests/unit/test_platform/test_project_metadata_resilience.py
|
|
148
|
+
tests/unit/test_reporting/test_entrypoint.py
|
|
144
149
|
tests/unit/test_storage/__init__.py
|
|
145
150
|
tests/unit/test_storage/test_factory.py
|
|
146
151
|
tests/unit/test_storage/test_gcp_kv_store.py
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1,
|
|
31
|
+
__version__ = version = '0.1.18.dev0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 18, 'dev0')
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g45a2e6dab'
|
|
@@ -181,20 +181,34 @@ class CustomModelAdapter(ModelAdapter):
|
|
|
181
181
|
probe_paths = proc.get("probe_paths")
|
|
182
182
|
chart_type = proc.get("chart_type")
|
|
183
183
|
is_chart = (proc_type == "chart") or (probe_paths is not None) or (chart_type is not None)
|
|
184
|
+
|
|
185
|
+
# Derive script key from unified `code` field where possible.
|
|
186
|
+
raw_code = proc.get("code")
|
|
187
|
+
code_str = str(raw_code).strip() if isinstance(raw_code, str) else ""
|
|
188
|
+
has_explicit_code = bool(code_str)
|
|
189
|
+
has_data_parallelism = proc.get("data_parallelism") is not None
|
|
190
|
+
has_seed_parallelism = proc.get("seed_parallelism") is not None
|
|
191
|
+
is_default_split = (not has_explicit_code) and (has_data_parallelism or has_seed_parallelism)
|
|
192
|
+
|
|
193
|
+
script_key: str | None = None
|
|
194
|
+
if has_explicit_code:
|
|
195
|
+
if "." in code_str:
|
|
196
|
+
script_key, _ = code_str.rsplit(".", 1)
|
|
197
|
+
script_key = script_key or None
|
|
198
|
+
else:
|
|
199
|
+
# No explicit script key: use default script
|
|
200
|
+
script_key = None
|
|
201
|
+
|
|
184
202
|
if is_chart:
|
|
185
|
-
script_key = proc.get("script")
|
|
186
203
|
script_path = resolve_script_path(script_key, scripts_section, default_script) if scripts_section else None
|
|
187
204
|
if script_path and str(script_path).lower().endswith(".py"):
|
|
188
205
|
chart_script_paths.append(str(script_path))
|
|
189
206
|
continue
|
|
190
|
-
|
|
207
|
+
|
|
191
208
|
script_path = resolve_script_path(script_key, scripts_section, default_script) if scripts_section else None
|
|
192
|
-
has_code_function = bool(proc.get("code_function"))
|
|
193
|
-
has_data_parallelism = proc.get("data_parallelism") is not None
|
|
194
|
-
has_seed_parallelism = proc.get("seed_parallelism") is not None
|
|
195
|
-
is_default_split = (not has_code_function) and (has_data_parallelism or has_seed_parallelism)
|
|
196
209
|
if not script_path:
|
|
197
210
|
if is_default_split:
|
|
211
|
+
# Default data/seed split nodes without explicit code may omit scripts.
|
|
198
212
|
continue
|
|
199
213
|
missing_script_paths.append(str(proc.get("name", "unknown")))
|
|
200
214
|
continue
|
|
@@ -204,7 +218,7 @@ class CustomModelAdapter(ModelAdapter):
|
|
|
204
218
|
missing = ", ".join(sorted(set(missing_script_paths)))
|
|
205
219
|
raise ValueError(
|
|
206
220
|
"script must be specified for non-chart processes "
|
|
207
|
-
"(except default data/seed split nodes without
|
|
221
|
+
"(except default data/seed split nodes without explicit code): "
|
|
208
222
|
f"{missing}"
|
|
209
223
|
)
|
|
210
224
|
if not script_paths:
|
|
@@ -563,15 +577,16 @@ class CustomModelAdapter(ModelAdapter):
|
|
|
563
577
|
proc_type = str(proc.get("type", "process"))
|
|
564
578
|
if proc_type == "chart":
|
|
565
579
|
continue
|
|
566
|
-
|
|
567
|
-
|
|
580
|
+
raw_code = proc.get("code")
|
|
581
|
+
code_str = str(raw_code).strip() if isinstance(raw_code, str) else ""
|
|
582
|
+
has_explicit_code = bool(code_str)
|
|
568
583
|
has_data_parallelism = proc.get("data_parallelism") is not None
|
|
569
584
|
has_seed_parallelism = proc.get("seed_parallelism") is not None
|
|
570
|
-
is_default_split = (not
|
|
585
|
+
is_default_split = (not has_explicit_code) and (has_data_parallelism or has_seed_parallelism)
|
|
571
586
|
if is_default_split:
|
|
572
587
|
continue
|
|
573
|
-
# For non-split processes,
|
|
574
|
-
#
|
|
588
|
+
# For non-split processes, we require either explicit code or default
|
|
589
|
+
# behavior (code omitted but no split flags). In both cases validation passes.
|
|
575
590
|
return True
|
|
576
591
|
except Exception:
|
|
577
592
|
return False
|
|
@@ -30,7 +30,12 @@ def _apply_hash_overrides(proc_payload: Dict[str, Any], config_hash: Optional[st
|
|
|
30
30
|
|
|
31
31
|
def _strip_internal_keys(value: Any) -> Any:
|
|
32
32
|
if isinstance(value, dict):
|
|
33
|
-
keep_keys = {
|
|
33
|
+
keep_keys = {
|
|
34
|
+
"__data_partition_hashes__",
|
|
35
|
+
"__data_partition_data_name__",
|
|
36
|
+
"__data_hash__",
|
|
37
|
+
"__parallel_context__",
|
|
38
|
+
}
|
|
34
39
|
return {k: v for k, v in value.items() if (k in keep_keys) or (not str(k).startswith("__"))}
|
|
35
40
|
return value
|
|
36
41
|
|
|
@@ -559,8 +564,23 @@ def _prepare_runner_kwargs(
|
|
|
559
564
|
should_select = True
|
|
560
565
|
if should_select:
|
|
561
566
|
dep_result = _select_partition_value(dep_result, data_name, int(partition_index))
|
|
562
|
-
part_idx =
|
|
563
|
-
seed_val =
|
|
567
|
+
part_idx = None
|
|
568
|
+
seed_val = None
|
|
569
|
+
try:
|
|
570
|
+
if isinstance(dep_result, dict):
|
|
571
|
+
part_idx = dep_result.get("__partition_index__")
|
|
572
|
+
seed_val = dep_result.get("__seed_value__")
|
|
573
|
+
except Exception:
|
|
574
|
+
part_idx = None
|
|
575
|
+
seed_val = None
|
|
576
|
+
# Fallback to parsing partition/seed indices from the dependency name when
|
|
577
|
+
# structured metadata is not present on the result payload. This keeps real
|
|
578
|
+
# metadata authoritative while allowing name-based aggregation for simple
|
|
579
|
+
# dict results (e.g. in tests or lightweight contexts).
|
|
580
|
+
if part_idx is None:
|
|
581
|
+
part_idx = _partition_index_from_name(dep_name)
|
|
582
|
+
if seed_val is None:
|
|
583
|
+
seed_val = _seed_value_from_name(dep_name)
|
|
564
584
|
for key, value in dep_result.items():
|
|
565
585
|
if isinstance(key, str) and key.startswith("__"):
|
|
566
586
|
continue
|
|
@@ -948,8 +968,6 @@ def _execute_process_on_worker(ctx: Any, proc_payload: Dict[str, Any], run_id: O
|
|
|
948
968
|
seed_override = None
|
|
949
969
|
if isinstance(proc_payload, dict):
|
|
950
970
|
seed_override = proc_payload.get("seed_value")
|
|
951
|
-
if seed_override is None:
|
|
952
|
-
seed_override = _seed_value_from_name(process_name or "")
|
|
953
971
|
_seed_rng_for_task(run_id, process_name, None, 0, seed_override=seed_override)
|
|
954
972
|
|
|
955
973
|
dp_cfg = _normalize_data_parallelism_cfg(proc_payload.get("data_parallelism") if isinstance(proc_payload, dict) else None)
|
|
@@ -960,8 +978,7 @@ def _execute_process_on_worker(ctx: Any, proc_payload: Dict[str, Any], run_id: O
|
|
|
960
978
|
or (isinstance(parallel_ctx, dict) and parallel_ctx.get("role") == "partition")
|
|
961
979
|
)
|
|
962
980
|
partition_index = proc_payload.get("partition_index") if isinstance(proc_payload, dict) else None
|
|
963
|
-
if
|
|
964
|
-
partition_index = _partition_index_from_name(process_name or "")
|
|
981
|
+
seed_value = proc_payload.get("seed_value") if isinstance(proc_payload, dict) else None
|
|
965
982
|
|
|
966
983
|
from .step_system import get_process_registry as _get_pr, set_current_context as _set_ctx, set_current_process_context as _set_proc
|
|
967
984
|
import io as _io
|
|
@@ -1302,8 +1319,6 @@ def _execute_process_on_worker(ctx: Any, proc_payload: Dict[str, Any], run_id: O
|
|
|
1302
1319
|
allow_seed_agg = bool(proc_payload.get("seed_aggregation")) if isinstance(proc_payload, dict) else False
|
|
1303
1320
|
extra_inputs = {}
|
|
1304
1321
|
seed_value = proc_payload.get("seed_value") if isinstance(proc_payload, dict) else None
|
|
1305
|
-
if seed_value is None:
|
|
1306
|
-
seed_value = _seed_value_from_name(process_name or "")
|
|
1307
1322
|
if seed_value is not None:
|
|
1308
1323
|
try:
|
|
1309
1324
|
extra_inputs["random_seed"] = int(seed_value)
|
|
@@ -1370,9 +1385,61 @@ def _execute_process_on_worker(ctx: Any, proc_payload: Dict[str, Any], run_id: O
|
|
|
1370
1385
|
_captured = None
|
|
1371
1386
|
return ExecutionResult(name=process_name, result=_captured, execution_time=exec_time, was_cached=False, error=f"Process '{process_name}' must return a dictionary, got {type(ret).__name__}.")
|
|
1372
1387
|
|
|
1373
|
-
if
|
|
1388
|
+
if isinstance(ret, dict):
|
|
1389
|
+
# Attach structured parallelism metadata to the result so downstream
|
|
1390
|
+
# aggregation can rely on it instead of parsing name suffixes.
|
|
1391
|
+
try:
|
|
1392
|
+
if partition_index is not None:
|
|
1393
|
+
ret.setdefault("__partition_index__", partition_index)
|
|
1394
|
+
except Exception:
|
|
1395
|
+
pass
|
|
1374
1396
|
try:
|
|
1375
|
-
|
|
1397
|
+
if seed_value is not None:
|
|
1398
|
+
ret.setdefault("__seed_value__", seed_value)
|
|
1399
|
+
except Exception:
|
|
1400
|
+
pass
|
|
1401
|
+
try:
|
|
1402
|
+
if data_hash:
|
|
1403
|
+
ret.setdefault("__data_hash__", data_hash)
|
|
1404
|
+
except Exception:
|
|
1405
|
+
pass
|
|
1406
|
+
# Expose the full layered parallel context (data/seed) on the result
|
|
1407
|
+
# payload so downstream consumers can reason about multi-layer
|
|
1408
|
+
# partition/seed structure without relying on process names.
|
|
1409
|
+
try:
|
|
1410
|
+
if isinstance(parallel_ctx, dict):
|
|
1411
|
+
data_layers_src = parallel_ctx.get("data_layers")
|
|
1412
|
+
seed_layers_src = parallel_ctx.get("seed_layers")
|
|
1413
|
+
layers: Dict[str, Any] = {}
|
|
1414
|
+
if isinstance(data_layers_src, list) and data_layers_src:
|
|
1415
|
+
dl_clean: list[dict[str, Any]] = []
|
|
1416
|
+
for layer in data_layers_src:
|
|
1417
|
+
if not isinstance(layer, dict):
|
|
1418
|
+
continue
|
|
1419
|
+
dl_clean.append(
|
|
1420
|
+
{
|
|
1421
|
+
"source": layer.get("source"),
|
|
1422
|
+
"partition_index": layer.get("partition_index"),
|
|
1423
|
+
"data_name": layer.get("data_name"),
|
|
1424
|
+
}
|
|
1425
|
+
)
|
|
1426
|
+
if dl_clean:
|
|
1427
|
+
layers["data_layers"] = dl_clean
|
|
1428
|
+
if isinstance(seed_layers_src, list) and seed_layers_src:
|
|
1429
|
+
sl_clean: list[dict[str, Any]] = []
|
|
1430
|
+
for layer in seed_layers_src:
|
|
1431
|
+
if not isinstance(layer, dict):
|
|
1432
|
+
continue
|
|
1433
|
+
sl_clean.append(
|
|
1434
|
+
{
|
|
1435
|
+
"source": layer.get("source"),
|
|
1436
|
+
"seed_value": layer.get("seed_value"),
|
|
1437
|
+
}
|
|
1438
|
+
)
|
|
1439
|
+
if sl_clean:
|
|
1440
|
+
layers["seed_layers"] = sl_clean
|
|
1441
|
+
if layers and "__parallel_context__" not in ret:
|
|
1442
|
+
ret["__parallel_context__"] = layers
|
|
1376
1443
|
except Exception:
|
|
1377
1444
|
pass
|
|
1378
1445
|
|
|
@@ -1487,6 +1554,49 @@ def _run_chart_process_on_worker(ctx: Any, proc_payload: Dict[str, Any], run_id:
|
|
|
1487
1554
|
break
|
|
1488
1555
|
except Exception:
|
|
1489
1556
|
pass
|
|
1557
|
+
def _probe_paths_preview(value: Any, max_items: int = 8) -> str:
|
|
1558
|
+
try:
|
|
1559
|
+
if not isinstance(value, dict):
|
|
1560
|
+
return str(value)
|
|
1561
|
+
parts: list[str] = []
|
|
1562
|
+
for idx, (k, v) in enumerate(value.items()):
|
|
1563
|
+
if idx >= max_items:
|
|
1564
|
+
parts.append("...")
|
|
1565
|
+
break
|
|
1566
|
+
parts.append(f"{k}=>{str(v)[:140]}")
|
|
1567
|
+
return ", ".join(parts)
|
|
1568
|
+
except Exception:
|
|
1569
|
+
return "<unavailable>"
|
|
1570
|
+
|
|
1571
|
+
# Upgrade probe_paths to canonical XPath-based keys so they match the
|
|
1572
|
+
# metrics written by the StepStateManager. This mirrors the behavior used
|
|
1573
|
+
# for dynamic charts and the web UI chart-config endpoint.
|
|
1574
|
+
try:
|
|
1575
|
+
if chart_spec.get('probe_paths'):
|
|
1576
|
+
gcfg = getattr(ctx, 'global_config', {}) if ctx else {}
|
|
1577
|
+
pipeline_cfg = (gcfg.get('pipeline') or {}) if isinstance(gcfg, dict) else {}
|
|
1578
|
+
if isinstance(pipeline_cfg, dict) and pipeline_cfg:
|
|
1579
|
+
from mlops.core.networkx_parser import parse_networkx_pipeline_from_config
|
|
1580
|
+
from mlops.core.graph_expansion import expand_process_graph
|
|
1581
|
+
from mlops.core.probe_path_selectors import expand_probe_paths
|
|
1582
|
+
|
|
1583
|
+
nx_cfg = parse_networkx_pipeline_from_config(pipeline_cfg)
|
|
1584
|
+
expanded_procs = expand_process_graph(nx_cfg.processes or [])
|
|
1585
|
+
if expanded_procs:
|
|
1586
|
+
chart_spec['probe_paths'] = expand_probe_paths(chart_spec.get('probe_paths') or {}, expanded_procs)
|
|
1587
|
+
logger.info(
|
|
1588
|
+
"[Charts] Probe paths resolved for '%s': %s",
|
|
1589
|
+
name,
|
|
1590
|
+
_probe_paths_preview(chart_spec.get('probe_paths')),
|
|
1591
|
+
)
|
|
1592
|
+
except Exception as exc:
|
|
1593
|
+
# Best-effort only; fall back to raw probe_paths if expansion fails.
|
|
1594
|
+
logger.warning(
|
|
1595
|
+
"[Charts] Failed to expand probe_paths for '%s': %s. Using raw probe_paths: %s",
|
|
1596
|
+
name,
|
|
1597
|
+
exc,
|
|
1598
|
+
_probe_paths_preview(chart_spec.get('probe_paths')),
|
|
1599
|
+
)
|
|
1490
1600
|
def _load_env_python_map() -> dict[str, str]:
|
|
1491
1601
|
try:
|
|
1492
1602
|
raw = os.environ.get("MLOPS_ENV_PYTHON_MAP") or ""
|
|
@@ -1722,8 +1832,12 @@ def _worker_execute_step_task(step_name: str, process_name: Optional[str], conte
|
|
|
1722
1832
|
logger.warning(f"[Distributed] Worker state manager init failed for step {step_name}: {e}")
|
|
1723
1833
|
|
|
1724
1834
|
set_current_context(ctx)
|
|
1725
|
-
# Deterministic task-level seeding (step scope)
|
|
1726
|
-
seed_override =
|
|
1835
|
+
# Deterministic task-level seeding (step scope) using structured seed value
|
|
1836
|
+
seed_override = None
|
|
1837
|
+
try:
|
|
1838
|
+
seed_override = getattr(ctx, "shared_state", {}).get("seed_value")
|
|
1839
|
+
except Exception:
|
|
1840
|
+
seed_override = None
|
|
1727
1841
|
_seed_rng_for_task(run_id, process_name, step_name, iteration, seed_override=seed_override)
|
|
1728
1842
|
try:
|
|
1729
1843
|
from .step_system import get_current_state_manager as _get_sm
|
|
@@ -1899,8 +2013,6 @@ def _worker_execute_process_task(proc_payload: Dict[str, Any], context_payload:
|
|
|
1899
2013
|
seed_override = None
|
|
1900
2014
|
if isinstance(proc_payload, dict):
|
|
1901
2015
|
seed_override = proc_payload.get("seed_value")
|
|
1902
|
-
if seed_override is None:
|
|
1903
|
-
seed_override = _seed_value_from_name(process_name or "")
|
|
1904
2016
|
_seed_rng_for_task(run_id, process_name, None, 0, seed_override=seed_override)
|
|
1905
2017
|
|
|
1906
2018
|
# Process start is now recorded inside _execute_process_on_worker at the exact timing start
|