EvoScientist 0.0.1.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. EvoScientist/EvoScientist.py +157 -0
  2. EvoScientist/__init__.py +24 -0
  3. EvoScientist/__main__.py +4 -0
  4. EvoScientist/backends.py +392 -0
  5. EvoScientist/cli.py +1553 -0
  6. EvoScientist/middleware.py +35 -0
  7. EvoScientist/prompts.py +277 -0
  8. EvoScientist/skills/accelerate/SKILL.md +332 -0
  9. EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
  10. EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
  11. EvoScientist/skills/accelerate/references/performance.md +525 -0
  12. EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
  13. EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
  14. EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
  15. EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
  16. EvoScientist/skills/find-skills/SKILL.md +133 -0
  17. EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
  18. EvoScientist/skills/flash-attention/SKILL.md +367 -0
  19. EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
  20. EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
  21. EvoScientist/skills/llama-cpp/SKILL.md +258 -0
  22. EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
  23. EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
  24. EvoScientist/skills/llama-cpp/references/server.md +125 -0
  25. EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
  26. EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  27. EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  28. EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  29. EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  30. EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
  31. EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
  32. EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
  33. EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
  34. EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
  35. EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
  36. EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
  37. EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
  38. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  39. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  40. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
  41. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
  42. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
  43. EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
  44. EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
  45. EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
  46. EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
  47. EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
  48. EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
  49. EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
  50. EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
  51. EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
  52. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
  53. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
  54. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
  55. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
  56. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
  57. EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
  58. EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
  59. EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
  60. EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
  61. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
  62. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
  63. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
  64. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
  65. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
  66. EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
  67. EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
  68. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
  69. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
  70. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
  71. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
  72. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
  73. EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
  74. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
  75. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
  76. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
  77. EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
  78. EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
  79. EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
  80. EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
  81. EvoScientist/skills/peft/SKILL.md +431 -0
  82. EvoScientist/skills/peft/references/advanced-usage.md +514 -0
  83. EvoScientist/skills/peft/references/troubleshooting.md +480 -0
  84. EvoScientist/skills/ray-data/SKILL.md +326 -0
  85. EvoScientist/skills/ray-data/references/integration.md +82 -0
  86. EvoScientist/skills/ray-data/references/transformations.md +83 -0
  87. EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
  88. EvoScientist/skills/skill-creator/SKILL.md +356 -0
  89. EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
  90. EvoScientist/skills/skill-creator/references/workflows.md +28 -0
  91. EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
  92. EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
  93. EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
  94. EvoScientist/stream/__init__.py +53 -0
  95. EvoScientist/stream/emitter.py +94 -0
  96. EvoScientist/stream/formatter.py +168 -0
  97. EvoScientist/stream/tracker.py +115 -0
  98. EvoScientist/stream/utils.py +255 -0
  99. EvoScientist/subagent.yaml +147 -0
  100. EvoScientist/tools.py +135 -0
  101. EvoScientist/utils.py +207 -0
  102. evoscientist-0.0.1.dev2.dist-info/METADATA +227 -0
  103. evoscientist-0.0.1.dev2.dist-info/RECORD +107 -0
  104. evoscientist-0.0.1.dev2.dist-info/WHEEL +5 -0
  105. evoscientist-0.0.1.dev2.dist-info/entry_points.txt +5 -0
  106. evoscientist-0.0.1.dev2.dist-info/licenses/LICENSE +21 -0
  107. evoscientist-0.0.1.dev2.dist-info/top_level.txt +1 -0
EvoScientist/skills/accelerate/references/custom-plugins.md
@@ -0,0 +1,453 @@
+ # Custom Plugins for Accelerate
+
+ ## Overview
+
+ Accelerate allows you to create **custom plugins** that extend distributed training strategies beyond the built-in options (DDP, FSDP, DeepSpeed).
+
+ ## Plugin Architecture
+
+ ### Base Plugin Structure
+
+ ```python
+ from dataclasses import dataclass
+
+ @dataclass
+ class CustomPlugin:
+     """Custom training plugin."""
+
+     # Plugin configuration
+     param1: int = 1
+     param2: str = "default"
+
+     def __post_init__(self):
+         # Validation logic
+         if self.param1 < 1:
+             raise ValueError("param1 must be >= 1")
+ ```
+
+ ### Using Custom Plugin
+
+ ```python
+ from accelerate import Accelerator
+
+ # Create plugin
+ custom_plugin = CustomPlugin(param1=4, param2="value")
+
+ # Pass to Accelerator
+ accelerator = Accelerator(
+     custom_plugin=custom_plugin  # Not a real parameter, example only
+ )
+ ```
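+
+ Since `custom_plugin` above is illustrative only, the sketch below shows the extension points `Accelerator` actually exposes: built-in kwargs handlers passed via `kwargs_handlers`, plus dedicated plugin arguments such as `deepspeed_plugin` and `fsdp_plugin` covered later in this guide. The specific handler settings here are placeholder assumptions.
+
+ ```python
+ from accelerate import Accelerator
+ from accelerate.utils import DistributedDataParallelKwargs, GradScalerKwargs
+
+ # Built-in kwargs handlers customize the wrappers Accelerate creates
+ # (torch DDP, GradScaler, ...); the values below are placeholders.
+ ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
+ scaler_kwargs = GradScalerKwargs(init_scale=2.**16)
+
+ # Multiple handlers can be combined in one list; distributed strategies
+ # use their own arguments (deepspeed_plugin=..., fsdp_plugin=...).
+ accelerator = Accelerator(
+     mixed_precision="fp16",
+     kwargs_handlers=[ddp_kwargs, scaler_kwargs],
+ )
+ ```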
+
+ ## Built-In Plugin Examples
+
+ ### 1. GradScalerKwargs (FP16 Configuration)
+
+ ```python
+ from accelerate.utils import GradScalerKwargs
+
+ # Configure gradient scaler for FP16
+ scaler_kwargs = GradScalerKwargs(
+     init_scale=2.**16,     # Initial loss scale
+     growth_factor=2.0,     # Scale growth rate
+     backoff_factor=0.5,    # Scale backoff rate
+     growth_interval=2000,  # Steps between scale increases
+     enabled=True           # Enable scaler
+ )
+
+ accelerator = Accelerator(
+     mixed_precision='fp16',
+     kwargs_handlers=[scaler_kwargs]  # Pass as kwargs handler
+ )
+ ```
+
+ **Use case**: Fine-tune FP16 gradient scaling behavior
+
+ ### 2. DistributedDataParallelKwargs
+
+ ```python
+ from accelerate.utils import DistributedDataParallelKwargs
+
+ # Configure DDP behavior
+ ddp_kwargs = DistributedDataParallelKwargs(
+     bucket_cap_mb=25,              # Gradient bucketing size
+     find_unused_parameters=False,  # Find unused params (slower)
+     check_reduction=False,         # Check gradient reduction
+     gradient_as_bucket_view=True,  # Memory optimization
+     static_graph=False             # Static computation graph
+ )
+
+ accelerator = Accelerator(
+     kwargs_handlers=[ddp_kwargs]
+ )
+ ```
+
+ **Use case**: Optimize DDP performance for specific models
+
+ ### 3. FP8RecipeKwargs (H100 FP8)
+
+ ```python
+ from accelerate.utils import FP8RecipeKwargs
+
+ # Configure FP8 training (H100)
+ fp8_recipe = FP8RecipeKwargs(
+     backend="te",            # TransformerEngine backend
+     margin=0,                # Scaling margin
+     interval=1,              # Scaling interval
+     fp8_format="HYBRID",     # E4M3 + E5M2 hybrid
+     amax_history_len=1024,   # AMAX history length
+     amax_compute_algo="max"  # AMAX computation algorithm
+ )
+
+ accelerator = Accelerator(
+     mixed_precision='fp8',
+     kwargs_handlers=[fp8_recipe]
+ )
+ ```
+
+ **Use case**: Ultra-fast training on H100 GPUs
+
+ ## Custom DeepSpeed Configuration
+
+ ### ZeRO-3 with CPU Offload
+
+ ```python
+ from accelerate import Accelerator
+ from accelerate.utils import DeepSpeedPlugin
+
+ # Custom DeepSpeed config
+ ds_plugin = DeepSpeedPlugin(
+     zero_stage=3,                    # ZeRO-3
+     offload_optimizer_device="cpu",  # CPU offload optimizer
+     offload_param_device="cpu",      # CPU offload parameters
+     zero3_init_flag=True,            # ZeRO-3 initialization
+     zero3_save_16bit_model=True,     # Save 16-bit model weights
+ )
+
+ accelerator = Accelerator(
+     deepspeed_plugin=ds_plugin,
+     mixed_precision='bf16'
+ )
+ ```
+
+ ### ZeRO-2 with NVMe Offload
+
+ ```python
+ ds_plugin = DeepSpeedPlugin(
+     zero_stage=2,
+     offload_optimizer_device="nvme",             # NVMe offload (ZeRO-2 offloads optimizer state only)
+     offload_optimizer_nvme_path="/local_nvme",   # NVMe mount path
+ )
+ ```
+
+ ### Custom JSON Config
+
+ ```python
+ import json
+
+ # Load custom DeepSpeed config
+ with open('deepspeed_config.json', 'r') as f:
+     ds_config = json.load(f)
+
+ ds_plugin = DeepSpeedPlugin(hf_ds_config=ds_config)
+
+ accelerator = Accelerator(deepspeed_plugin=ds_plugin)
+ ```
+
+ **Example config** (`deepspeed_config.json`):
+ ```json
+ {
+   "train_batch_size": "auto",
+   "train_micro_batch_size_per_gpu": "auto",
+   "gradient_accumulation_steps": "auto",
+   "gradient_clipping": 1.0,
+   "zero_optimization": {
+     "stage": 3,
+     "offload_optimizer": {
+       "device": "cpu",
+       "pin_memory": true
+     },
+     "offload_param": {
+       "device": "cpu",
+       "pin_memory": true
+     },
+     "overlap_comm": true,
+     "contiguous_gradients": true,
+     "sub_group_size": 1e9,
+     "reduce_bucket_size": 5e8,
+     "stage3_prefetch_bucket_size": 5e8,
+     "stage3_param_persistence_threshold": 1e6,
+     "stage3_max_live_parameters": 1e9,
+     "stage3_max_reuse_distance": 1e9,
+     "stage3_gather_16bit_weights_on_model_save": true
+   },
+   "bf16": {
+     "enabled": true
+   },
+   "steps_per_print": 100,
+   "wall_clock_breakdown": false
+ }
+ ```
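+
+ The `hf_ds_config` argument should also accept the config file path directly (in addition to a dict), which avoids loading the JSON by hand. A minimal sketch, assuming the same `deepspeed_config.json` as above; verify the behavior against the installed Accelerate version:
+
+ ```python
+ from accelerate import Accelerator
+ from accelerate.utils import DeepSpeedPlugin
+
+ # Pass the DeepSpeed JSON config by path instead of a pre-loaded dict
+ ds_plugin = DeepSpeedPlugin(hf_ds_config="deepspeed_config.json")
+
+ accelerator = Accelerator(deepspeed_plugin=ds_plugin, mixed_precision='bf16')
+ ```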
+
+ ## Custom FSDP Configuration
+
+ ### FSDP with Custom Auto-Wrap Policy
+
+ ```python
+ import functools
+
+ from accelerate.utils import FullyShardedDataParallelPlugin
+ from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy
+ from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
+
+ # Custom wrap policy (size-based)
+ wrap_policy = functools.partial(
+     size_based_auto_wrap_policy,
+     min_num_params=1_000_000  # Wrap layers with 1M+ params
+ )
+
+ fsdp_plugin = FullyShardedDataParallelPlugin(
+     sharding_strategy=ShardingStrategy.FULL_SHARD,    # ZeRO-3 equivalent
+     backward_prefetch=BackwardPrefetch.BACKWARD_PRE,  # Prefetch strategy
+     mixed_precision_policy=None,                      # Use Accelerator's mixed precision
+     auto_wrap_policy=wrap_policy,                     # Custom wrapping
+     cpu_offload=False,
+     ignored_modules=None,                             # Modules to not wrap
+     state_dict_type="FULL_STATE_DICT",                # Save format
+     optim_state_dict_config=None,
+     limit_all_gathers=False,
+     use_orig_params=True,                             # Keep original (unflattened) parameters visible
+ )
+
+ accelerator = Accelerator(
+     fsdp_plugin=fsdp_plugin,
+     mixed_precision='bf16'
+ )
+ ```
+
+ ### FSDP with Transformer Auto-Wrap
+
+ ```python
+ import functools
+
+ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+ from transformers.models.gpt2.modeling_gpt2 import GPT2Block
+
+ # Wrap at transformer block level
+ wrap_policy = functools.partial(
+     transformer_auto_wrap_policy,
+     transformer_layer_cls={GPT2Block}  # Wrap GPT2Block layers
+ )
+
+ fsdp_plugin = FullyShardedDataParallelPlugin(
+     auto_wrap_policy=wrap_policy
+ )
+ ```
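+
+ The FSDP plugin only takes effect once the model goes through `accelerator.prepare()`. A minimal usage sketch, preparing the model before creating the optimizer so FSDP wrapping happens first; the GPT-2 model here is an assumption carried over from the wrap policy above:
+
+ ```python
+ import torch
+ from accelerate import Accelerator
+ from transformers import GPT2LMHeadModel
+
+ accelerator = Accelerator(fsdp_plugin=fsdp_plugin, mixed_precision='bf16')
+
+ # Prepare the model first so FSDP wrapping (and parameter flattening)
+ # happens before the optimizer sees the parameters
+ model = GPT2LMHeadModel.from_pretrained("gpt2")  # assumed model
+ model = accelerator.prepare(model)
+
+ optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+ optimizer = accelerator.prepare(optimizer)
+ ```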
+
+ ## Creating Custom Training Strategy
+
+ ### Example: Custom Gradient Accumulation
+
+ ```python
+ from accelerate import Accelerator
+
+ class CustomGradientAccumulation:
+     def __init__(self, steps=4, adaptive=False, threshold=1.0):
+         self.steps = steps
+         self.adaptive = adaptive
+         self.threshold = threshold  # Loss threshold for adaptive sync
+         self.current_step = 0
+
+     def should_sync(self, loss):
+         """Decide whether to sync gradients."""
+         self.current_step += 1
+
+         # Adaptive: sync on high loss
+         if self.adaptive and loss > self.threshold:
+             self.current_step = 0
+             return True
+
+         # Regular: sync every N steps
+         if self.current_step >= self.steps:
+             self.current_step = 0
+             return True
+
+         return False
+
+ # Usage
+ custom_accum = CustomGradientAccumulation(steps=8, adaptive=True)
+ accelerator = Accelerator()
+
+ for batch in dataloader:
+     outputs = model(**batch)
+     loss = outputs.loss
+
+     # Scale loss
+     loss = loss / custom_accum.steps
+     accelerator.backward(loss)
+
+     # Conditional sync
+     if custom_accum.should_sync(loss.item()):
+         optimizer.step()
+         optimizer.zero_grad()
+ ```
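+
+ For the regular every-N-steps case, Accelerate already provides gradient accumulation out of the box. A minimal sketch of the built-in mechanism, assuming `model`, `optimizer`, and `dataloader` are defined as in the example above:
+
+ ```python
+ from accelerate import Accelerator
+
+ accelerator = Accelerator(gradient_accumulation_steps=8)
+ model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+ for batch in dataloader:
+     # accumulate() defers gradient synchronization until 8 micro-batches
+     # have been processed; the wrapped optimizer skips the in-between steps
+     with accelerator.accumulate(model):
+         loss = model(**batch).loss
+         accelerator.backward(loss)
+         optimizer.step()
+         optimizer.zero_grad()
+ ```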
+
+ ### Example: Custom Mixed Precision
+
+ ```python
+ import torch
+
+ class CustomMixedPrecision:
+     """Custom mixed precision with dynamic loss scaling."""
+
+     def __init__(self, init_scale=2**16, scale_window=2000):
+         self.scaler = torch.cuda.amp.GradScaler(
+             init_scale=init_scale,
+             growth_interval=scale_window
+         )
+         self.scale_history = []
+
+     def scale_loss(self, loss):
+         """Scale loss for backward."""
+         return self.scaler.scale(loss)
+
+     def unscale_and_clip(self, optimizer, max_norm=1.0):
+         """Unscale gradients and clip."""
+         self.scaler.unscale_(optimizer)
+         torch.nn.utils.clip_grad_norm_(
+             optimizer.param_groups[0]['params'],
+             max_norm
+         )
+
+     def step(self, optimizer):
+         """Optimizer step with scaler update."""
+         scale_before = self.scaler.get_scale()
+         self.scaler.step(optimizer)
+         self.scaler.update()
+         scale_after = self.scaler.get_scale()
+
+         # Track scale changes
+         if scale_before != scale_after:
+             self.scale_history.append(scale_after)
+
+ # Usage
+ custom_mp = CustomMixedPrecision()
+
+ for batch in dataloader:
+     with torch.cuda.amp.autocast(dtype=torch.float16):
+         loss = model(**batch).loss
+
+     scaled_loss = custom_mp.scale_loss(loss)
+     scaled_loss.backward()
+
+     custom_mp.unscale_and_clip(optimizer, max_norm=1.0)
+     custom_mp.step(optimizer)
+     optimizer.zero_grad()
+ ```
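+
+ For comparison, the built-in route lets Accelerate own the autocast context and the GradScaler. A minimal sketch, assuming `model`, `optimizer`, and `dataloader` are defined elsewhere:
+
+ ```python
+ from accelerate import Accelerator
+
+ accelerator = Accelerator(mixed_precision='fp16')
+ model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+
+ for batch in dataloader:
+     # Autocast and loss scaling are handled by Accelerate
+     loss = model(**batch).loss
+     accelerator.backward(loss)
+
+     # clip_grad_norm_ unscales the gradients before clipping
+     accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
+
+     optimizer.step()
+     optimizer.zero_grad()
+ ```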
+
+ ## Advanced: Custom Distributed Backend
+
+ ### Custom AllReduce Strategy
+
+ ```python
+ import torch
+ import torch.distributed as dist
+
+ class CustomAllReduce:
+     """Custom all-reduce with compression."""
+
+     def __init__(self, compression_ratio=0.1):
+         self.compression_ratio = compression_ratio
+
+     def compress_gradients(self, tensor):
+         """Top-k gradient compression."""
+         k = int(tensor.numel() * self.compression_ratio)
+         # Select indices by magnitude, then keep the signed values
+         _, indices = torch.topk(tensor.abs().view(-1), k)
+         values = tensor.view(-1)[indices]
+         return values, indices
+
+     def all_reduce_compressed(self, tensor):
+         """All-reduce with gradient compression."""
+         # Compress
+         values, indices = self.compress_gradients(tensor)
+
+         # All-reduce compressed gradients
+         dist.all_reduce(values, op=dist.ReduceOp.SUM)
+
+         # Decompress
+         tensor_compressed = torch.zeros_like(tensor).view(-1)
+         tensor_compressed[indices] = values / dist.get_world_size()
+
+         return tensor_compressed.view_as(tensor)
+
+ # Usage in training loop
+ custom_ar = CustomAllReduce(compression_ratio=0.1)
+
+ for batch in dataloader:
+     loss = model(**batch).loss
+     loss.backward()
+
+     # Custom all-reduce
+     for param in model.parameters():
+         if param.grad is not None:
+             param.grad.data = custom_ar.all_reduce_compressed(param.grad.data)
+
+     optimizer.step()
+     optimizer.zero_grad()
+ ```
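+
+ In practice, this kind of customization is usually done through PyTorch's DDP communication hooks rather than a manual loop over `model.parameters()`. A minimal sketch using the built-in FP16 compression hook; the explicit DDP wrapping and already-initialized process group are assumptions (under Accelerate, `accelerator.prepare()` returns the DDP-wrapped module in multi-GPU runs):
+
+ ```python
+ import torch
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
+
+ # Assumes torch.distributed is already initialized (e.g. via accelerate launch / torchrun)
+ ddp_model = DDP(model, device_ids=[torch.cuda.current_device()])
+
+ # Gradients are cast to FP16 for the all-reduce and restored afterwards;
+ # see also powerSGD_hook for low-rank gradient compression
+ ddp_model.register_comm_hook(state=None, hook=default_hooks.fp16_compress_hook)
+ ```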
+
+ ## Plugin Best Practices
+
+ ### 1. Validation in `__post_init__`
+
+ ```python
+ @dataclass
+ class CustomPlugin:
+     learning_rate: float = 1e-3
+     warmup_steps: int = 1000
+
+     def __post_init__(self):
+         # Validate parameters
+         if self.learning_rate <= 0:
+             raise ValueError("learning_rate must be positive")
+         if self.warmup_steps < 0:
+             raise ValueError("warmup_steps must be non-negative")
+
+         # Compute derived values
+         self.min_lr = self.learning_rate * 0.1
+ ```
+
+ ### 2. Compatibility Checks
+
+ ```python
+ @dataclass
+ class CustomPlugin:
+     feature_enabled: bool = True
+
+     def is_compatible(self, accelerator):
+         """Check if plugin is compatible with accelerator config."""
+         if self.feature_enabled and accelerator.mixed_precision == 'fp8':
+             raise ValueError("Custom plugin not compatible with FP8")
+         return True
+ ```
+
+ ### 3. State Management
+
+ ```python
+ @dataclass
+ class CustomPlugin:
+     counter: int = 0
+     history: list = None
+
+     def __post_init__(self):
+         if self.history is None:
+             self.history = []
+
+     def update_state(self, value):
+         """Update plugin state during training."""
+         self.counter += 1
+         self.history.append(value)
+ ```
+
+ ## Resources
+
+ - Accelerate Kwargs Handlers: https://huggingface.co/docs/accelerate/package_reference/kwargs
+ - DeepSpeed Config: https://www.deepspeed.ai/docs/config-json/
+ - FSDP Guide: https://pytorch.org/docs/stable/fsdp.html
+ - TPU Training with Accelerate: https://huggingface.co/docs/accelerate/usage_guides/training_tpu