fusion-bench 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +25 -2
- fusion_bench/compat/method/__init__.py +5 -2
- fusion_bench/compat/method/base_algorithm.py +3 -2
- fusion_bench/compat/modelpool/base_pool.py +3 -3
- fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
- fusion_bench/constants/__init__.py +1 -0
- fusion_bench/constants/runtime.py +57 -0
- fusion_bench/dataset/gpt2_glue.py +1 -1
- fusion_bench/method/__init__.py +12 -4
- fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
- fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
- fusion_bench/method/bitdelta/__init__.py +1 -0
- fusion_bench/method/bitdelta/bitdelta.py +7 -23
- fusion_bench/method/classification/clip_finetune.py +1 -1
- fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +0 -4
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +2 -2
- fusion_bench/method/linear/simple_average_for_llama.py +16 -11
- fusion_bench/method/model_stock/__init__.py +1 -0
- fusion_bench/method/model_stock/model_stock.py +309 -0
- fusion_bench/method/regmean/clip_regmean.py +3 -6
- fusion_bench/method/regmean/regmean.py +27 -56
- fusion_bench/method/regmean/utils.py +56 -0
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
- fusion_bench/method/simple_average.py +7 -7
- fusion_bench/method/slerp/__init__.py +1 -1
- fusion_bench/method/slerp/slerp.py +110 -14
- fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
- fusion_bench/method/smile_upscaling/projected_energy.py +1 -2
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +40 -31
- fusion_bench/method/smile_upscaling/smile_upscaling.py +1 -1
- fusion_bench/method/we_moe/__init__.py +1 -0
- fusion_bench/method/we_moe/entropy_loss.py +25 -0
- fusion_bench/method/we_moe/flan_t5_we_moe.py +320 -0
- fusion_bench/method/we_moe/utils.py +15 -0
- fusion_bench/method/weighted_average/llama.py +1 -1
- fusion_bench/mixins/clip_classification.py +37 -48
- fusion_bench/mixins/serialization.py +30 -10
- fusion_bench/modelpool/base_pool.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +293 -75
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
- fusion_bench/models/__init__.py +5 -0
- fusion_bench/models/hf_utils.py +69 -86
- fusion_bench/models/linearized/vision_model.py +6 -6
- fusion_bench/models/model_card_templates/default.md +46 -0
- fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
- fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +1 -8
- fusion_bench/models/modeling_smile_mistral/__init__.py +2 -1
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +1 -5
- fusion_bench/models/we_moe.py +8 -8
- fusion_bench/programs/fabric_fusion_program.py +29 -60
- fusion_bench/scripts/cli.py +34 -1
- fusion_bench/taskpool/base_pool.py +99 -17
- fusion_bench/taskpool/clip_vision/taskpool.py +10 -5
- fusion_bench/taskpool/dummy.py +101 -13
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
- fusion_bench/utils/__init__.py +2 -0
- fusion_bench/utils/cache_utils.py +101 -1
- fusion_bench/utils/data.py +6 -4
- fusion_bench/utils/devices.py +7 -4
- fusion_bench/utils/dtype.py +3 -2
- fusion_bench/utils/fabric.py +2 -2
- fusion_bench/utils/lazy_imports.py +23 -0
- fusion_bench/utils/lazy_state_dict.py +117 -19
- fusion_bench/utils/modelscope.py +3 -3
- fusion_bench/utils/packages.py +3 -3
- fusion_bench/utils/parameters.py +0 -2
- fusion_bench/utils/path.py +56 -0
- fusion_bench/utils/pylogger.py +1 -1
- fusion_bench/utils/timer.py +92 -10
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/METADATA +1 -23
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/RECORD +89 -75
- fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
- fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
- fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
- fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +1 -1
- fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +1 -1
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/top_level.txt +0 -0
fusion_bench/taskpool/base_pool.py
CHANGED

@@ -5,33 +5,115 @@ from fusion_bench.mixins import BaseYAMLSerializable


 class BaseTaskPool(BaseYAMLSerializable):
+    """Abstract base class for task pools in the FusionBench framework.
+
+    A task pool represents a collection of evaluation tasks that can be used to
+    assess model performance across multiple benchmarks or datasets. This base
+    class defines the common interface that all task pool implementations must
+    follow, ensuring consistency across different task types and evaluation
+    scenarios.
+
+    Task pools are designed to be configurable through YAML files and can be
+    used in various model fusion and evaluation workflows. They provide a
+    standardized way to evaluate models on multiple tasks and aggregate results.
+
+    The class inherits from BaseYAMLSerializable to support configuration
+    management and serialization capabilities.
+
+    Attributes:
+        _program: Optional program reference for execution context.
+        _config_key: Configuration key used for YAML configuration ("taskpool").
+
+    Abstract Methods:
+        evaluate: Must be implemented by subclasses to define task-specific
+            evaluation logic.
+
+    Example:
+        Implementing a custom task pool:
+
+        ```python
+        class MyTaskPool(BaseTaskPool):
+
+            def evaluate(self, model, **kwargs):
+                results = {}
+                for task_name in self.tasks:
+                    # Implement task-specific evaluation
+                    results[task_name] = self._evaluate_task(model, task_name)
+                return results
+        ```
+    """
+
     _program = None
     _config_key = "taskpool"

     @abstractmethod
     def evaluate(self, model: Any, *args: Any, **kwargs: Any) -> Dict[str, Any]:
-        """
-        Evaluate the model on all tasks in the task pool, and return a report.
+        """Evaluate a model on all tasks in the task pool and return aggregated results.

-
+        This abstract method defines the core evaluation interface that all task pool
+        implementations must provide. It should evaluate the given model on all tasks
+        managed by the pool and return a structured report of the results.

-
-
-
-
-
-            },
-            <task_name>: {
-                <metric_name>: <metric_value>,
-                ...
-            },
-        }
-        ```
+        The evaluation process typically involves:
+        1. Iterating through all tasks in the pool
+        2. Running model inference on each task's dataset
+        3. Computing task-specific metrics
+        4. Aggregating results into a standardized report format

         Args:
-            model: The model to evaluate.
+            model: The model to evaluate. Can be any model type (PyTorch model,
+                Hugging Face model, etc.) that is compatible with the specific
+                task pool implementation.
+            *args: Additional positional arguments that may be needed for
+                task-specific evaluation procedures.
+            **kwargs: Additional keyword arguments for evaluation configuration,
+                such as batch_size, device, evaluation metrics, etc.

         Returns:
-
+            Dict[str, Any]: A dictionary containing evaluation results for each task.
+                The structure follows the pattern:
+
+                ```python
+                {
+                    "task_name_1": {
+                        "metric_1": value,
+                        "metric_2": value,
+                        ...
+                    },
+                    "task_name_2": {
+                        "metric_1": value,
+                        "metric_2": value,
+                        ...
+                    },
+                    ...
+                }
+                ```
+
+        Example:
+            For an image classification task pool:
+
+            ```python
+            results = task_pool.evaluate(model)
+            # Returns:
+            # {
+            #     "mnist": {
+            #         "accuracy": 0.95,
+            #         "loss": 0.15,
+            #     },
+            #     "cifar10": {
+            #         "accuracy": 0.87,
+            #         "loss": 0.42,
+            #     }
+            # }
+            ```
+
+        Raises:
+            NotImplementedError: This method must be implemented by subclasses.
+
+        Note:
+            Implementations should ensure that the returned dictionary structure
+            is consistent and that metric names are standardized across similar
+            task types to enable meaningful comparison and aggregation.
         """
         pass
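The reworked `evaluate` docstring pins down the nested `{task_name: {metric_name: value}}` report format. As an illustration of consuming that structure (not code from the package; the report values are made up), a per-metric average across tasks can be computed like this:

```python
from typing import Any, Dict


def mean_metric(report: Dict[str, Dict[str, Any]], metric: str) -> float:
    """Average a single metric over every task that reports it."""
    values = [task_result[metric] for task_result in report.values() if metric in task_result]
    return sum(values) / len(values) if values else float("nan")


# Hypothetical report in the format documented above.
report = {
    "mnist": {"accuracy": 0.95, "loss": 0.15},
    "cifar10": {"accuracy": 0.87, "loss": 0.42},
}
print(mean_metric(report, "accuracy"))  # 0.91
```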
fusion_bench/taskpool/clip_vision/taskpool.py
CHANGED

@@ -27,8 +27,9 @@ from tqdm.autonotebook import tqdm
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 from transformers.models.clip.modeling_clip import CLIPVisionTransformer

+from fusion_bench import RuntimeConstants
 from fusion_bench.dataset import CLIPDataset
-from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.mixins import HydraConfigMixin, LightningFabricMixin
 from fusion_bench.models.hf_clip import HFCLIPClassifier
 from fusion_bench.taskpool import BaseTaskPool
 from fusion_bench.tasks.clip_classification import get_classnames_and_templates
@@ -86,8 +87,9 @@ class LayerWiseFeatureSaver:


 class CLIPVisionModelTaskPool(
-
+    HydraConfigMixin,
     LightningFabricMixin,
+    BaseTaskPool,
 ):
     """
     This class is used to define the image classification task for CLIP models.
@@ -131,7 +133,7 @@ class CLIPVisionModelTaskPool(
         layer_wise_feature_save_path: Optional[str] = None,
         layer_wise_feature_first_token_only: bool = True,
         layer_wise_feature_max_num: Optional[int] = None,
-        fast_dev_run: bool =
+        fast_dev_run: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -153,7 +155,10 @@ class CLIPVisionModelTaskPool(
         self.layer_wise_feature_first_token_only = layer_wise_feature_first_token_only
         self.layer_wise_feature_max_num = layer_wise_feature_max_num

-        self.fast_dev_run
+        if self.fast_dev_run is None:
+            self.fast_dev_run = RuntimeConstants().debug
+        else:
+            self.fast_dev_run = fast_dev_run
         super().__init__(**kwargs)

     def setup(self):
@@ -304,7 +309,7 @@ class CLIPVisionModelTaskPool(
         self.setup()

         report = {}
-        # CLIPVisionModel works the same with
+        # CLIPVisionModel works the same with CLIPVisionTransformer, so we can use it directly
         if hasattr(model, "is_surgery_model") and model.is_surgery_model:
             log.info("running evaluation on a surgery model.")
             model: "SurgeryModelWrapper" = model
fusion_bench/taskpool/dummy.py
CHANGED

@@ -1,5 +1,10 @@
 """
-
+Dummy task pool implementation for debugging and testing purposes.
+
+This module provides a minimal task pool implementation that can be used for
+debugging model fusion workflows, testing infrastructure, and validating model
+architectures without running expensive evaluation procedures. It's particularly
+useful during development and prototyping phases.
 """

 from typing import Optional
@@ -14,14 +19,41 @@ from fusion_bench.utils.parameters import count_parameters, print_parameters


 def get_model_summary(model: nn.Module) -> dict:
-    """
-
+    """Generate a comprehensive summary report for a PyTorch model.
+
+    Analyzes the given model to extract key information about its architecture,
+    parameter count, and training characteristics. This function is useful for
+    model introspection and comparative analysis during model fusion workflows.
+
+    The summary includes both trainable and total parameter counts, which helps
+    in understanding model complexity and memory requirements. The trainable
+    percentage is particularly useful for identifying models with frozen layers
+    or parameter-efficient fine-tuning setups.

     Args:
-        model: The model to
+        model: The PyTorch model to analyze. Can be any nn.Module instance
+            including complex models, fusion models, or pre-trained models.

     Returns:
-        dict:
+        dict: A structured report containing model information:
+            - model_info: Dictionary with parameter statistics
+                - trainable_params: Number of trainable parameters
+                - all_params: Total number of parameters (trainable + frozen)
+                - trainable_percentage: Ratio of trainable to total parameters
+
+    Example:
+        ```python
+        >>> model = MyModel()
+        >>> summary = get_model_summary(model)
+        >>> print(summary)
+        {
+            "model_info": {
+                "trainable_params": 1234567,
+                "all_params": 1234567,
+                "trainable_percentage": 1.0
+            }
+        }
+        ```
     """
     report = {}
     training_params, all_params = count_parameters(model)
@@ -34,21 +66,77 @@ def get_model_summary(model: nn.Module) -> dict:


 class DummyTaskPool(BaseTaskPool):
+    """A lightweight task pool implementation for debugging and development workflows.
+
+    This dummy task pool provides a minimal evaluation interface that focuses on
+    model introspection rather than task-specific performance evaluation. It's
+    designed for development scenarios where you need to test model fusion
+    pipelines, validate architectures, or debug workflows without the overhead
+    of running actual evaluation tasks.
+
+    The task pool is particularly useful when:
+    - You want to verify model fusion works correctly
+    - You need to check parameter counts after fusion
+    - You're developing new fusion algorithms
+    - You want to test infrastructure without expensive evaluations
+
+    Example:
+        ```python
+        >>> taskpool = DummyTaskPool(model_save_path="/tmp/fused_model")
+        >>> results = taskpool.evaluate(fused_model)
+        >>> print(f"Model has {results['model_info']['trainable_params']} parameters")
+        ```
     """
-    This is a dummy task pool used for debugging purposes. It inherits from the base TaskPool class.
-    """

-    def __init__(self, model_save_path: Optional[str] = None):
-
+    def __init__(self, model_save_path: Optional[str] = None, **kwargs):
+        """Initialize the dummy task pool with optional model saving capability.
+
+        Args:
+            model_save_path: Optional path where the evaluated model should be saved.
+                If provided, the model will be serialized and saved to this location
+                after evaluation using the separate_save utility. If None, no model
+                saving will be performed.
+
+        Example:
+            ```python
+            >>> # Create taskpool without saving
+            >>> taskpool = DummyTaskPool()
+
+            >>> # Create taskpool with model saving
+            >>> taskpool = DummyTaskPool(model_save_path="/path/to/save/model.pth")
+            ```
+        """
+        super().__init__(**kwargs)
         self.model_save_path = model_save_path

     def evaluate(self, model):
-        """
-
-        This method
+        """Perform lightweight evaluation and analysis of the given model.
+
+        This method provides a minimal evaluation that focuses on model introspection
+        rather than task-specific performance metrics. It performs parameter analysis,
+        optionally saves the model, and returns a summary report.
+
+        The evaluation process includes:
+        1. Printing human-readable parameter information (rank-zero only)
+        2. Optionally saving the model if a save path was configured
+        3. Generating and returning a model summary report

         Args:
-            model: The model to evaluate.
+            model: The model to evaluate. Can be any PyTorch nn.Module including
+                fusion models, pre-trained models, or custom architectures.
+
+        Returns:
+            dict: A model summary report containing parameter statistics and
+                architecture information. See get_model_summary() for detailed
+                format specification.
+
+        Example:
+            ```python
+            >>> taskpool = DummyTaskPool(model_save_path="/tmp/model.pth")
+            >>> model = torch.nn.Linear(10, 5)
+            >>> results = taskpool.evaluate(model)
+            >>> print(f"Trainable params: {results['model_info']['trainable_params']}")
+            ```
         """
         if rank_zero_only.rank == 0:
             print_parameters(model, is_human_readable=True)
fusion_bench/taskpool/lm_eval_harness/taskpool.py
CHANGED

@@ -16,6 +16,47 @@ log = logging.getLogger(__name__)


 class LMEvalHarnessTaskPool(BaseTaskPool, LightningFabricMixin):
+    """A task pool implementation that interfaces with the LM Evaluation Harness framework.
+
+    This class provides a wrapper around the LM Evaluation Harness (lm-eval) library,
+    enabling evaluation of language models on various standardized benchmarks and tasks.
+    It inherits from BaseTaskPool and LightningFabricMixin to provide distributed
+    computing capabilities through PyTorch Lightning Fabric.
+
+    The task pool supports evaluation on multiple tasks simultaneously and provides
+    flexible configuration options for batch processing, output formatting, and
+    logging. It automatically handles model setup and wrapping for distributed
+    evaluation when using Lightning Fabric.
+
+    Args:
+        tasks: A single task name or list of task names to evaluate on.
+            Examples: "hellaswag", ["arc_easy", "arc_challenge", "hellaswag"]
+        apply_chat_template: Whether to apply chat template formatting to inputs.
+            Useful for instruction-tuned or chat models.
+        include_path: Path to additional task definitions or custom tasks.
+        batch_size: Number of samples to process in each batch. Larger values
+            may improve throughput but require more memory.
+        metadata: Additional metadata to include in evaluation results.
+        verbosity: Logging verbosity level for the evaluation process.
+        output_path: Custom path for saving evaluation results. If None,
+            results are saved to the default log directory.
+        log_samples: Whether to log individual sample predictions and targets.
+            Useful for debugging but increases output size significantly.
+        _usage_: Internal usage tracking string.
+        _version_: Internal version tracking string.
+        **kwargs: Additional arguments passed to the LM Evaluation Harness.
+
+    Example:
+        ```python
+        >>> taskpool = LMEvalHarnessTaskPool(
+        ...     tasks=["arc_easy", "hellaswag"],
+        ...     batch_size=8,
+        ...     verbosity="INFO"
+        ... )
+        >>> results = taskpool.evaluate(model)
+        ```
+    """
+
     def __init__(
         self,
         tasks: Union[str, List[str]],
@@ -44,6 +85,45 @@ class LMEvalHarnessTaskPool(BaseTaskPool, LightningFabricMixin):
         self.log_samples = log_samples

     def evaluate(self, model, *command_line_args, **kwargs):
+        """Evaluate a language model on the configured tasks using LM Evaluation Harness.
+
+        This method wraps the model with the LM Evaluation Harness framework and
+        executes evaluation on all configured tasks. It automatically handles
+        command-line argument construction, model wrapping with Lightning Fabric
+        for distributed evaluation, and result logging.
+
+        The evaluation process includes:
+        1. Building command-line arguments from instance configuration
+        2. Setting up the LM Evaluation Harness argument parser
+        3. Wrapping the model with Lightning Fabric if not already wrapped
+        4. Creating an HFLM (Hugging Face Language Model) wrapper
+        5. Executing the evaluation through the LM-Eval CLI interface
+
+        Args:
+            model: The language model to evaluate. Can be a Hugging Face model,
+                PyTorch model, or any model compatible with the LM Evaluation Harness.
+                The model will be automatically wrapped with Lightning Fabric for
+                distributed evaluation if not already wrapped.
+            *command_line_args: Additional positional command-line arguments
+                (currently unused but preserved for interface compatibility).
+            **kwargs: Additional keyword arguments that will be converted to
+                command-line flags and passed to the LM Evaluation Harness.
+                Keys will be prefixed with '--' and values converted to strings.
+
+        Returns:
+            None: Results are written to the configured output path and logged.
+
+        Example:
+            ```python
+            >>> taskpool = LMEvalHarnessTaskPool(tasks=["arc_easy"])
+            >>> taskpool.evaluate(model, limit=100, device="cuda")
+            ```
+
+        Note:
+            The method leverages the LM Evaluation Harness's command-line interface
+            internally, which provides standardized evaluation procedures and
+            ensures compatibility with the broader evaluation ecosystem.
+        """
         command_line_args = []
         if self.include_path is not None:
             command_line_args.extend(["--include_path", self.include_path])
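The `evaluate` docstring above says extra keyword arguments are converted into `--`-prefixed command-line flags with stringified values before being handed to lm-eval. A rough sketch of that kind of conversion (an illustration of the described behavior, not the package's actual implementation) is:

```python
from typing import Any, Dict, List


def kwargs_to_cli_flags(kwargs: Dict[str, Any]) -> List[str]:
    """Turn keyword arguments into '--key value' style CLI arguments."""
    args: List[str] = []
    for key, value in kwargs.items():
        args.extend([f"--{key}", str(value)])
    return args


# e.g. taskpool.evaluate(model, limit=100, device="cuda") would contribute:
print(kwargs_to_cli_flags({"limit": 100, "device": "cuda"}))
# ['--limit', '100', '--device', 'cuda']
```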
fusion_bench/taskpool/nyuv2_taskpool.py
CHANGED

@@ -15,9 +15,37 @@ log = logging.getLogger(__name__)


 class NYUv2TaskPool(TaskPool):
+    """Task pool for multi-task learning evaluation on the NYUv2 dataset.
+
+    This task pool provides evaluation capabilities for multi-task learning models
+    on the NYU Depth V2 (NYUv2) dataset, which is a popular benchmark for indoor
+    scene understanding. The dataset supports multiple computer vision tasks
+    including semantic segmentation, depth estimation, and surface normal prediction.
+
+    The task pool is designed to work with encoder-decoder architectures where
+    a shared encoder processes input images and task-specific decoders generate
+    predictions for different tasks. It integrates with PyTorch Lightning for
+    streamlined training and evaluation workflows.
+
+    Supported Tasks:
+        - Semantic segmentation
+        - Depth estimation
+        - Surface normal prediction
+    """
+
     _trainer: L.Trainer = None

     def __init__(self, taskpool_config: DictConfig):
+        """Initialize the NYUv2 task pool with configuration settings.
+
+        Args:
+            taskpool_config: Configuration object containing all necessary
+                parameters for the task pool, including:
+                - data_dir: Path to the directory containing NYUv2 dataset
+                - tasks: List of tasks to evaluate (e.g., ["semantic", "depth"])
+                - batch_size: Batch size for evaluation data loader
+                - num_workers: Number of worker processes for data loading
+        """
         self.config = taskpool_config

     def load_datasets(self):
fusion_bench/utils/__init__.py
CHANGED

@@ -18,4 +18,6 @@ from .lazy_state_dict import LazyStateDict
 from .misc import *
 from .packages import import_object
 from .parameters import *
+from .pylogger import get_rankzero_logger
 from .timer import timeit_context
+from .type import BoolStateDictType, StateDictType, TorchModelType
fusion_bench/utils/cache_utils.py
CHANGED

@@ -1,15 +1,30 @@
 import logging
 import os
 import pickle
+import warnings
 from functools import wraps
 from pathlib import Path
 from typing import Any, Callable, Union

-
+from joblib import Memory
+
+__all__ = ["cache_to_disk", "cache_with_joblib", "set_default_cache_dir"]


 log = logging.getLogger(__name__)

+DEFAULT_CACHE_DIR = Path.cwd() / "outputs" / "cache"
+
+
+def set_default_cache_dir(path: str | Path):
+    global DEFAULT_CACHE_DIR
+    if path is None:
+        return
+
+    if isinstance(path, str):
+        path = Path(path)
+    DEFAULT_CACHE_DIR = path
+


 def cache_to_disk(file_path: Union[str, Path]) -> Callable:
     """
@@ -17,6 +32,11 @@ def cache_to_disk(file_path: Union[str, Path]) -> Callable:
     the result is loaded from the file. Otherwise, the function is executed and
     the result is saved to the file.

+    !!! warning "deprecated"
+        This function is deprecated. Use `cache_with_joblib` instead for better
+        caching capabilities including automatic cache invalidation, better object
+        handling, and memory efficiency.
+
     ## Example usage

     ```python
@@ -32,6 +52,13 @@ def cache_to_disk(file_path: Union[str, Path]) -> Callable:
     Returns:
         Callable: The decorated function.
     """
+    warnings.warn(
+        "cache_to_disk is deprecated. Use cache_with_joblib instead for better "
+        "caching capabilities including automatic cache invalidation, better object "
+        "handling, and memory efficiency.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if isinstance(file_path, str):
         file_path = Path(file_path)
     assert isinstance(file_path, Path)
@@ -56,3 +83,76 @@ def cache_to_disk(file_path: Union[str, Path]) -> Callable:
         return wrapper

     return decorator
+
+
+def cache_with_joblib(
+    cache_dir: Union[str, Path] = None,
+    verbose: int = 0,
+) -> Callable:
+    """
+    A decorator to cache the result of a function using joblib.Memory. This provides
+    more advanced caching capabilities compared to cache_to_disk, including:
+    - Automatic cache invalidation when function arguments change
+    - Better handling of numpy arrays and other complex objects
+    - Memory-efficient storage
+    - Optional verbose output for cache hits/misses
+
+    ## Example usage
+
+    ```python
+    @cache_with_joblib("./cache", verbose=1)
+    def expensive_computation(x: int, y: str) -> Any:
+        # Function implementation
+        return complex_result
+
+    # Or with default settings:
+    @cache_with_joblib()
+    def another_function(x: int) -> int:
+        return x * 2
+    ```
+
+    Args:
+        cache_dir (Union[str, Path]): The directory where cache files should be stored.
+            If `None`, a default directory `outputs/cache` will be used.
+        verbose (int): Verbosity level for joblib.Memory (0=silent, 1=basic, 2++=verbose).
+
+    Returns:
+        Callable: A decorator function that can be applied to functions.
+    """
+
+    if cache_dir is None:
+        cache_dir = DEFAULT_CACHE_DIR
+
+    if isinstance(cache_dir, str):
+        cache_dir = Path(cache_dir)
+    assert isinstance(cache_dir, Path)
+
+    # Create the cache directory if it doesn't exist
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create a Memory object for this function
+    memory = Memory(location=cache_dir, verbose=verbose)
+
+    def decorator(func: Callable) -> Callable:
+        nonlocal memory
+
+        # Create the cached version of the function
+        cached_func = memory.cache(func)
+
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            return cached_func(*args, **kwargs)
+
+        # Expose useful methods from joblib.Memory
+        if not (
+            hasattr(cached_func, "clear")
+            or hasattr(cached_func, "call")
+            or hasattr(cached_func, "check_call_in_cache")
+        ):
+            wrapper.clear = cached_func.clear
+            wrapper.call = cached_func.call
+            wrapper.check_call_in_cache = cached_func.check_call_in_cache
+
+        return wrapper
+
+    return decorator
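Taken together, the two new helpers suggest a usage pattern along these lines (a hypothetical example; the cache directory is arbitrary): set the default cache location once at startup, then decorate expensive functions.

```python
from fusion_bench.utils.cache_utils import cache_with_joblib, set_default_cache_dir

# Redirect the module-level default (otherwise "<cwd>/outputs/cache").
set_default_cache_dir("/tmp/fusion_bench_cache")


@cache_with_joblib()  # uses the default cache directory set above
def expensive_sum(n: int) -> int:
    return sum(range(n))


print(expensive_sum(10_000_000))  # first call computes; repeated calls hit the joblib cache
```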
fusion_bench/utils/data.py
CHANGED

@@ -1,6 +1,6 @@
 import pickle
 from pathlib import Path
-from typing import Literal, Optional, Union
+from typing import Any, Literal, Optional, Tuple, Union

 import numpy as np
 import torch
@@ -37,7 +37,9 @@ class InfiniteDataLoader:
         return data


-def load_tensor_from_file(
+def load_tensor_from_file(
+    file_path: Union[str, Path], device: Optional[Union[str, torch.device]] = None
+) -> torch.Tensor:
     """
     Loads a tensor from a file, which can be either a .pt, .pth or .np file.
     If the file is not one of these formats, it will try to load it as a pickle file.
@@ -72,7 +74,7 @@ def train_validation_split(
     validation_size: Optional[int] = None,
     random_seed: Optional[int] = None,
     return_split: Literal["all", "train", "val"] = "both",
-):
+) -> Union[Tuple[Dataset, Dataset], Dataset]:
     """
     Split a dataset into a training and validation set.

@@ -134,7 +136,7 @@ def train_validation_test_split(
     test_fraction: float,
     random_seed: Optional[int] = None,
     return_spilt: Literal["all", "train", "val", "test"] = "all",
-):
+) -> Union[Tuple[Dataset, Dataset, Dataset], Dataset]:
     """
     Split a dataset into a training, validation and test set.

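With the signature annotated above, loading a saved tensor straight onto a device might look like this (file name and device are placeholders):

```python
from fusion_bench.utils.data import load_tensor_from_file

# Accepts .pt/.pth/.np files and falls back to pickle for other formats.
features = load_tensor_from_file("task_vector.pt", device="cuda:0")
print(features.shape, features.device)
```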