omgkit 2.20.0 → 2.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +125 -10
  2. package/package.json +1 -1
  3. package/plugin/agents/ai-architect-agent.md +282 -0
  4. package/plugin/agents/data-scientist-agent.md +221 -0
  5. package/plugin/agents/experiment-analyst-agent.md +318 -0
  6. package/plugin/agents/ml-engineer-agent.md +165 -0
  7. package/plugin/agents/mlops-engineer-agent.md +324 -0
  8. package/plugin/agents/model-optimizer-agent.md +287 -0
  9. package/plugin/agents/production-engineer-agent.md +360 -0
  10. package/plugin/agents/research-scientist-agent.md +274 -0
  11. package/plugin/commands/omgdata/augment.md +86 -0
  12. package/plugin/commands/omgdata/collect.md +81 -0
  13. package/plugin/commands/omgdata/label.md +83 -0
  14. package/plugin/commands/omgdata/split.md +83 -0
  15. package/plugin/commands/omgdata/validate.md +76 -0
  16. package/plugin/commands/omgdata/version.md +85 -0
  17. package/plugin/commands/omgdeploy/ab.md +94 -0
  18. package/plugin/commands/omgdeploy/cloud.md +89 -0
  19. package/plugin/commands/omgdeploy/edge.md +93 -0
  20. package/plugin/commands/omgdeploy/package.md +91 -0
  21. package/plugin/commands/omgdeploy/serve.md +92 -0
  22. package/plugin/commands/omgfeature/embed.md +93 -0
  23. package/plugin/commands/omgfeature/extract.md +93 -0
  24. package/plugin/commands/omgfeature/select.md +85 -0
  25. package/plugin/commands/omgfeature/store.md +97 -0
  26. package/plugin/commands/omgml/init.md +60 -0
  27. package/plugin/commands/omgml/status.md +82 -0
  28. package/plugin/commands/omgops/drift.md +87 -0
  29. package/plugin/commands/omgops/monitor.md +99 -0
  30. package/plugin/commands/omgops/pipeline.md +102 -0
  31. package/plugin/commands/omgops/registry.md +109 -0
  32. package/plugin/commands/omgops/retrain.md +91 -0
  33. package/plugin/commands/omgoptim/distill.md +90 -0
  34. package/plugin/commands/omgoptim/profile.md +92 -0
  35. package/plugin/commands/omgoptim/prune.md +81 -0
  36. package/plugin/commands/omgoptim/quantize.md +83 -0
  37. package/plugin/commands/omgtrain/baseline.md +78 -0
  38. package/plugin/commands/omgtrain/compare.md +99 -0
  39. package/plugin/commands/omgtrain/evaluate.md +85 -0
  40. package/plugin/commands/omgtrain/train.md +81 -0
  41. package/plugin/commands/omgtrain/tune.md +89 -0
  42. package/plugin/registry.yaml +252 -2
  43. package/plugin/skills/ml-systems/SKILL.md +65 -0
  44. package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
  45. package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
  46. package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
  47. package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
  48. package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
  49. package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
  50. package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
  51. package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
  52. package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
  53. package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
  54. package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
  55. package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
  56. package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
  57. package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
  58. package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
  59. package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
  60. package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
  61. package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
  62. package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
  63. package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
  64. package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
  65. package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
  66. package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
  67. package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
  68. package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
  69. package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
  70. package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
  71. package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
  72. package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
  73. package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
@@ -1,9 +1,9 @@
1
1
  # OMGKIT Component Registry
2
2
  # Single Source of Truth for Agents, Skills, Commands, Workflows, and MCPs
3
- # Version: 2.20.0
3
+ # Version: 2.21.1
4
4
  # Updated: 2026-01-02
5
5
 
6
- version: "2.20.0"
6
+ version: "2.21.1"
7
7
 
8
8
  # =============================================================================
9
9
  # OPTIMIZED ALIGNMENT PRINCIPLE (OAP)
@@ -493,6 +493,154 @@ agents:
493
493
  - simulation/visualization-scientific
494
494
  commands: []
495
495
 
496
+ # ---------------------------------------------------------------------------
497
+ # ML SYSTEMS AGENTS (Harvard CS249r + Chip Huyen)
498
+ # ---------------------------------------------------------------------------
499
+ ml-engineer-agent:
500
+ file: agents/ml-engineer-agent.md
501
+ description: Full-stack ML engineering for end-to-end ML systems
502
+ skills:
503
+ - ml-systems/ml-systems-fundamentals
504
+ - ml-systems/data-eng
505
+ - ml-systems/feature-engineering
506
+ - ml-systems/ml-workflow
507
+ - ml-systems/model-dev
508
+ - ml-systems/ml-frameworks
509
+ - ml-systems/model-deployment
510
+ - ml-systems/mlops
511
+ commands:
512
+ - /omgml:init
513
+ - /omgml:status
514
+ - /omgdata:collect
515
+ - /omgdata:validate
516
+ - /omgfeature:extract
517
+ - /omgfeature:select
518
+ - /omgtrain:train
519
+ - /omgtrain:evaluate
520
+ - /omgdeploy:package
521
+ - /omgdeploy:serve
522
+ - /omgops:pipeline
523
+
524
+ data-scientist-agent:
525
+ file: agents/data-scientist-agent.md
526
+ description: Expert data science for EDA, statistical modeling, and insights
527
+ skills:
528
+ - ml-systems/ml-systems-fundamentals
529
+ - ml-systems/data-eng
530
+ - ml-systems/training-data
531
+ - ml-systems/feature-engineering
532
+ - ml-systems/ml-workflow
533
+ - ml-systems/model-dev
534
+ commands:
535
+ - /omgdata:collect
536
+ - /omgdata:validate
537
+ - /omgdata:label
538
+ - /omgdata:augment
539
+ - /omgdata:split
540
+ - /omgfeature:extract
541
+ - /omgfeature:select
542
+ - /omgtrain:baseline
543
+ - /omgtrain:train
544
+ - /omgtrain:evaluate
545
+ - /omgtrain:compare
546
+
547
+ mlops-engineer-agent:
548
+ file: agents/mlops-engineer-agent.md
549
+ description: MLOps for production ML infrastructure and automation
550
+ skills:
551
+ - ml-systems/mlops
552
+ - ml-systems/robust-ai
553
+ - ml-systems/model-deployment
554
+ - ml-systems/ml-serving-optimization
555
+ commands:
556
+ - /omgops:pipeline
557
+ - /omgops:monitor
558
+ - /omgops:drift
559
+ - /omgops:retrain
560
+ - /omgops:registry
561
+ - /omgdeploy:package
562
+ - /omgdeploy:serve
563
+ - /omgdeploy:cloud
564
+ - /omgdeploy:ab
565
+
566
+ research-scientist-agent:
567
+ file: agents/research-scientist-agent.md
568
+ description: AI/ML research for novel approaches and paper implementation
569
+ skills:
570
+ - ml-systems/ml-systems-fundamentals
571
+ - ml-systems/deep-learning-primer
572
+ - ml-systems/dnn-architectures
573
+ - ml-systems/ml-workflow
574
+ - ml-systems/model-dev
575
+ - ml-systems/ml-frameworks
576
+ commands:
577
+ - /omgtrain:train
578
+ - /omgtrain:tune
579
+ - /omgtrain:evaluate
580
+ - /omgtrain:compare
581
+ - /omgml:status
582
+
583
+ model-optimizer-agent:
584
+ file: agents/model-optimizer-agent.md
585
+ description: Model optimization through quantization, pruning, and distillation
586
+ skills:
587
+ - ml-systems/efficient-ai
588
+ - ml-systems/model-optimization
589
+ - ml-systems/ai-accelerators
590
+ - ml-systems/ml-serving-optimization
591
+ commands:
592
+ - /omgoptim:quantize
593
+ - /omgoptim:prune
594
+ - /omgoptim:distill
595
+ - /omgoptim:profile
596
+ - /omgtrain:evaluate
597
+
598
+ production-engineer-agent:
599
+ file: agents/production-engineer-agent.md
600
+ description: ML production deployment with reliability and scalability
601
+ skills:
602
+ - ml-systems/model-deployment
603
+ - ml-systems/ml-serving-optimization
604
+ - ml-systems/edge-deployment
605
+ - ml-systems/robust-ai
606
+ commands:
607
+ - /omgdeploy:package
608
+ - /omgdeploy:serve
609
+ - /omgdeploy:edge
610
+ - /omgdeploy:cloud
611
+ - /omgdeploy:ab
612
+ - /omgops:monitor
613
+
614
+ ai-architect-agent:
615
+ file: agents/ai-architect-agent.md
616
+ description: Senior AI/ML architect for end-to-end ML system design
617
+ skills:
618
+ - ml-systems/ml-systems-fundamentals
619
+ - ml-systems/deployment-paradigms
620
+ - ml-systems/data-eng
621
+ - ml-systems/feature-engineering
622
+ - ml-systems/ml-workflow
623
+ - ml-systems/model-deployment
624
+ - ml-systems/mlops
625
+ - ml-systems/robust-ai
626
+ commands:
627
+ - /omgml:init
628
+ - /omgml:status
629
+ - /omgops:pipeline
630
+ - /omgops:registry
631
+
632
+ experiment-analyst-agent:
633
+ file: agents/experiment-analyst-agent.md
634
+ description: ML experiment analysis and model comparison
635
+ skills:
636
+ - ml-systems/ml-workflow
637
+ - ml-systems/model-dev
638
+ - ml-systems/training-data
639
+ commands:
640
+ - /omgtrain:evaluate
641
+ - /omgtrain:compare
642
+ - /omgml:status
643
+
496
644
  # =============================================================================
497
645
  # SKILL CATEGORIES
498
646
  # =============================================================================
@@ -514,6 +662,7 @@ skill_categories:
514
662
  - languages
515
663
  - methodology
516
664
  - microservices
665
+ - ml-systems # ML Systems Design (Harvard CS249r + Chip Huyen)
517
666
  - mobile
518
667
  - mobile-advanced
519
668
  - omega
@@ -540,6 +689,13 @@ command_namespaces:
540
689
  - iot # IoT operations
541
690
  - ml # Machine learning
542
691
  - omega # Omega principles
692
+ - omgdata # ML Data Engineering
693
+ - omgdeploy # ML Model Deployment
694
+ - omgfeature # ML Feature Engineering
695
+ - omgml # ML Project Management
696
+ - omgops # ML Operations
697
+ - omgoptim # ML Model Optimization
698
+ - omgtrain # ML Model Training
543
699
  - perf # Performance
544
700
  - planning # Planning and research
545
701
  - platform # Platform engineering
@@ -772,3 +928,97 @@ workflows:
772
928
  agents: [copywriter, researcher]
773
929
  skills: []
774
930
  commands: [/planning:brainstorm, /planning:research]
931
+
932
+ # ---------------------------------------------------------------------------
933
+ # ML SYSTEMS WORKFLOWS (Harvard CS249r + Chip Huyen)
934
+ # ---------------------------------------------------------------------------
935
+ ml-systems/model-development-workflow:
936
+ agents: [data-scientist-agent, research-scientist-agent, experiment-analyst-agent]
937
+ skills:
938
+ - ml-systems/ml-systems-fundamentals
939
+ - ml-systems/ml-workflow
940
+ - ml-systems/model-dev
941
+ commands: [/omgml:init, /omgtrain:baseline, /omgtrain:train, /omgtrain:evaluate]
942
+
943
+ ml-systems/data-preparation-workflow:
944
+ agents: [data-scientist-agent, ml-engineer-agent]
945
+ skills:
946
+ - ml-systems/data-eng
947
+ - ml-systems/training-data
948
+ - ml-systems/feature-engineering
949
+ commands: [/omgdata:collect, /omgdata:validate, /omgdata:label, /omgdata:augment, /omgdata:split]
950
+
951
+ ml-systems/training-pipeline-workflow:
952
+ agents: [ml-engineer-agent, mlops-engineer-agent]
953
+ skills:
954
+ - ml-systems/ml-workflow
955
+ - ml-systems/mlops
956
+ commands: [/omgops:pipeline, /omgtrain:train, /omgtrain:evaluate, /omgops:registry]
957
+
958
+ ml-systems/hyperparameter-tuning-workflow:
959
+ agents: [research-scientist-agent, experiment-analyst-agent]
960
+ skills:
961
+ - ml-systems/model-dev
962
+ - ml-systems/ml-frameworks
963
+ commands: [/omgtrain:tune, /omgtrain:compare]
964
+
965
+ ml-systems/model-evaluation-workflow:
966
+ agents: [experiment-analyst-agent, data-scientist-agent]
967
+ skills:
968
+ - ml-systems/model-dev
969
+ - ml-systems/training-data
970
+ commands: [/omgtrain:evaluate, /omgtrain:compare]
971
+
972
+ ml-systems/model-optimization-workflow:
973
+ agents: [model-optimizer-agent, production-engineer-agent]
974
+ skills:
975
+ - ml-systems/efficient-ai
976
+ - ml-systems/model-optimization
977
+ - ml-systems/ml-serving-optimization
978
+ commands: [/omgoptim:profile, /omgoptim:quantize, /omgoptim:prune, /omgoptim:distill]
979
+
980
+ ml-systems/model-deployment-workflow:
981
+ agents: [production-engineer-agent, mlops-engineer-agent]
982
+ skills:
983
+ - ml-systems/model-deployment
984
+ - ml-systems/ml-serving-optimization
985
+ - ml-systems/robust-ai
986
+ commands: [/omgdeploy:package, /omgdeploy:serve, /omgdeploy:cloud, /omgdeploy:ab]
987
+
988
+ ml-systems/edge-deployment-workflow:
989
+ agents: [model-optimizer-agent, production-engineer-agent]
990
+ skills:
991
+ - ml-systems/edge-deployment
992
+ - ml-systems/efficient-ai
993
+ commands: [/omgoptim:quantize, /omgdeploy:edge]
994
+
995
+ ml-systems/mlops-pipeline-workflow:
996
+ agents: [mlops-engineer-agent, production-engineer-agent]
997
+ skills:
998
+ - ml-systems/mlops
999
+ - ml-systems/model-deployment
1000
+ commands: [/omgops:pipeline, /omgops:monitor, /omgops:registry]
1001
+
1002
+ ml-systems/monitoring-drift-workflow:
1003
+ agents: [mlops-engineer-agent, experiment-analyst-agent]
1004
+ skills:
1005
+ - ml-systems/robust-ai
1006
+ - ml-systems/mlops
1007
+ commands: [/omgops:monitor, /omgops:drift]
1008
+
1009
+ ml-systems/retraining-workflow:
1010
+ agents: [ml-engineer-agent, mlops-engineer-agent, experiment-analyst-agent]
1011
+ skills:
1012
+ - ml-systems/mlops
1013
+ - ml-systems/ml-workflow
1014
+ commands: [/omgops:retrain, /omgtrain:train, /omgtrain:evaluate]
1015
+
1016
+ ml-systems/full-ml-lifecycle-workflow:
1017
+ agents: [ai-architect-agent, data-scientist-agent, ml-engineer-agent, research-scientist-agent, model-optimizer-agent, production-engineer-agent, mlops-engineer-agent, experiment-analyst-agent]
1018
+ skills:
1019
+ - ml-systems/ml-systems-fundamentals
1020
+ - ml-systems/data-eng
1021
+ - ml-systems/ml-workflow
1022
+ - ml-systems/model-deployment
1023
+ - ml-systems/mlops
1024
+ commands: [/omgml:init, /omgdata:collect, /omgtrain:train, /omgdeploy:serve, /omgops:monitor]
@@ -0,0 +1,65 @@
1
+ ---
2
+ name: ml-systems
3
+ description: Machine Learning Systems - comprehensive knowledge for building production ML systems from data engineering through deployment and operations. Based on Harvard ML Systems course and Designing ML Systems by Chip Huyen.
4
+ ---
5
+
6
+ # ML Systems
7
+
8
+ Building production-ready machine learning systems.
9
+
10
+ ## Overview
11
+
12
+ This skill category covers the complete ML system lifecycle:
13
+
14
+ 1. **Foundations** - Core concepts, architectures, paradigms
15
+ 2. **Data Engineering** - Data collection, quality, feature engineering
16
+ 3. **Model Development** - Training, evaluation, frameworks
17
+ 4. **Performance** - Optimization, acceleration, efficiency
18
+ 5. **Deployment** - Serving, edge deployment, scaling
19
+ 6. **Operations** - MLOps, monitoring, reliability
20
+
21
+ ## Categories
22
+
23
+ ### Foundations
24
+ - `ml-systems-fundamentals` - Core ML systems concepts
25
+ - `deep-learning-primer` - Deep learning foundations
26
+ - `dnn-architectures` - Neural network architectures
27
+ - `deployment-paradigms` - Deployment patterns
28
+
29
+ ### Data Engineering
30
+ - `data-eng` - Data pipelines and quality
31
+ - `training-data` - Training data management
32
+ - `feature-engineering` - Feature creation and stores
33
+
34
+ ### Model Development
35
+ - `ml-workflow` - ML development workflow
36
+ - `model-dev` - Model training and selection
37
+ - `ml-frameworks` - Framework best practices
38
+
39
+ ### Performance
40
+ - `efficient-ai` - Efficiency techniques
41
+ - `model-optimization` - Quantization, pruning, distillation
42
+ - `ai-accelerators` - Hardware acceleration
43
+
44
+ ### Deployment
45
+ - `model-deployment` - Production deployment
46
+ - `ml-serving-optimization` - Serving and inference optimization
47
+ - `edge-deployment` - Edge and mobile deployment
48
+
49
+ ### Operations
50
+ - `mlops` - ML operations and lifecycle
51
+ - `robust-ai` - Reliability and robustness
52
+
53
+ ## Key Principles
54
+
55
+ 1. **Data-Centric AI** - Focus on data quality over model complexity
56
+ 2. **Iterative Development** - Start simple, iterate based on metrics
57
+ 3. **Production-First** - Design for deployment from the start
58
+ 4. **Monitoring** - Continuous monitoring and improvement
59
+ 5. **Reproducibility** - Version everything (data, code, models)
60
+
61
+ ## References
62
+
63
+ - Harvard CS249r: Machine Learning Systems (Vijay Janapa Reddi)
64
+ - Designing Machine Learning Systems by Chip Huyen
65
+ - MLOps: Continuous Delivery and Automation Pipelines in Machine Learning (Google Cloud Architecture Center)
@@ -0,0 +1,342 @@
1
+ ---
2
+ name: ai-accelerators
3
+ description: AI hardware accelerators including GPUs, TPUs, custom silicon, and hardware-aware optimization strategies for ML workloads.
4
+ ---
5
+
6
+ # AI Accelerators
7
+
8
+ Hardware acceleration for ML workloads.
9
+
10
+ ## Hardware Landscape
11
+
12
+ ```
13
+ ┌─────────────────────────────────────────────────────────────┐
14
+ │ AI ACCELERATOR TYPES │
15
+ ├─────────────────────────────────────────────────────────────┤
16
+ │ │
17
+ │ GPU (NVIDIA) TPU (Google) NPU/Custom │
18
+ │ ───────────── ──────────── ───────── │
19
+ │ CUDA cores Systolic array Apple Neural │
20
+ │ Tensor cores BF16 native Qualcomm Hexagon │
21
+ │ General purpose TPU pods Intel Habana │
22
+ │ PyTorch/TF native JAX optimized AWS Inferentia │
23
+ │ │
24
+ │ FPGA ASIC Edge Accelerators │
25
+ │ ───────────── ──────────── ───────── │
26
+ │ Reconfigurable Fixed function Coral Edge TPU │
27
+ │ Low latency Maximum perf Jetson (NVIDIA) │
28
+ │ Power efficient High volume Intel NCS2 │
29
+ │ │
30
+ └─────────────────────────────────────────────────────────────┘
31
+ ```
32
+
33
+ ## GPU Optimization
34
+
35
+ ### CUDA Memory Management
36
+ ```python
37
+ import torch
38
+
39
+ # Memory allocation
40
+ torch.cuda.empty_cache()
41
+ torch.cuda.memory_allocated()
42
+ torch.cuda.max_memory_allocated()
43
+
44
+ # Pin memory for faster transfers
45
+ train_loader = DataLoader(
46
+ dataset,
47
+ batch_size=32,
48
+ pin_memory=True,
49
+ num_workers=4
50
+ )
51
+
52
+ # Async data transfer
53
+ def async_prefetch(loader, device):
54
+ stream = torch.cuda.Stream()
55
+ for batch in loader:
56
+ with torch.cuda.stream(stream):
57
+ batch = batch.to(device, non_blocking=True)
58
+ torch.cuda.current_stream().wait_stream(stream)
59
+ yield batch
60
+ ```
61
+
62
+ ### Tensor Core Utilization
63
+ ```python
64
+ # Ensure tensor core alignment (multiples of 8)
65
+ class TensorCoreOptimized(nn.Module):
66
+ def __init__(self, in_features, out_features):
67
+ super().__init__()
68
+ # Round to multiple of 8 for tensor cores
69
+ self.in_features = ((in_features + 7) // 8) * 8
70
+ self.out_features = ((out_features + 7) // 8) * 8
71
+ self.linear = nn.Linear(self.in_features, self.out_features)
72
+ self.pad_in = self.in_features - in_features
73
+
74
+ def forward(self, x):
75
+ if self.pad_in > 0:
76
+ x = F.pad(x, (0, self.pad_in))
77
+ return self.linear(x)
78
+
79
+ # Enable TF32 on Ampere+ GPUs
80
+ torch.backends.cuda.matmul.allow_tf32 = True
81
+ torch.backends.cudnn.allow_tf32 = True
82
+
83
+ # Force FP16 computation
84
+ with torch.cuda.amp.autocast(dtype=torch.float16):
85
+ output = model(input)
86
+ ```
87
+
88
+ ### Multi-GPU Strategies
89
+ ```python
90
+ # DataParallel (simple, not recommended for training)
91
+ model = nn.DataParallel(model)
92
+
93
+ # DistributedDataParallel (recommended)
94
+ model = DistributedDataParallel(model, device_ids=[local_rank])
95
+
96
+ # Model Parallelism (for large models)
97
+ class ModelParallel(nn.Module):
98
+ def __init__(self):
99
+ super().__init__()
100
+ self.encoder = nn.TransformerEncoder(...).to('cuda:0')
101
+ self.decoder = nn.TransformerDecoder(...).to('cuda:1')
102
+
103
+ def forward(self, x):
104
+ x = self.encoder(x.to('cuda:0'))
105
+ x = self.decoder(x.to('cuda:1'))
106
+ return x
107
+
108
+ # Pipeline Parallelism
109
+ from torch.distributed.pipeline.sync import Pipe
110
+
111
+ model = nn.Sequential(
112
+ nn.Linear(100, 200).to('cuda:0'),
113
+ nn.ReLU().to('cuda:0'),
114
+ nn.Linear(200, 100).to('cuda:1')
115
+ )
116
+ model = Pipe(model, chunks=8)
117
+ ```
118
+
119
+ ## TPU Optimization
120
+
121
+ ```python
122
+ # JAX/TPU optimized training
123
+ import jax
124
+ import jax.numpy as jnp
125
+ from flax import linen as nn
126
+
127
+ class TPUModel(nn.Module):
128
+ features: int
129
+
130
+ @nn.compact
131
+ def __call__(self, x):
132
+ x = nn.Dense(self.features)(x)
133
+ x = nn.relu(x)
134
+ return nn.Dense(10)(x)
135
+
136
+ # pmap for data parallelism across TPU cores
137
+ @jax.pmap
138
+ def train_step(state, batch):
139
+ def loss_fn(params):
140
+ logits = state.apply_fn({'params': params}, batch['image'])
141
+ loss = jnp.mean(optax.softmax_cross_entropy(logits, batch['label']))
142
+ return loss
143
+
144
+ grad_fn = jax.value_and_grad(loss_fn)
145
+ loss, grads = grad_fn(state.params)
146
+ grads = jax.lax.pmean(grads, axis_name='batch')
147
+ state = state.apply_gradients(grads=grads)
148
+ return state, loss
149
+
150
+ # PyTorch/XLA for TPU
151
+ import torch_xla.core.xla_model as xm
152
+ import torch_xla.distributed.parallel_loader as pl
153
+
154
+ device = xm.xla_device()
155
+ model = model.to(device)
156
+
157
+ for batch in pl.ParallelLoader(train_loader, [device]):
158
+ output = model(batch)
159
+ loss.backward()
160
+ xm.optimizer_step(optimizer)
161
+ ```
162
+
163
+ ## Edge Accelerators
164
+
165
+ ### NVIDIA Jetson
166
+ ```python
167
+ # TensorRT optimization for Jetson
168
+ import tensorrt as trt
169
+ import pycuda.driver as cuda
170
+
171
+ def build_engine(onnx_path, precision='fp16'):
172
+ logger = trt.Logger(trt.Logger.WARNING)
173
+ builder = trt.Builder(logger)
174
+ network = builder.create_network(
175
+ 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
176
+ )
177
+ parser = trt.OnnxParser(network, logger)
178
+
179
+ with open(onnx_path, 'rb') as f:
180
+ parser.parse(f.read())
181
+
182
+ config = builder.create_builder_config()
183
+ config.max_workspace_size = 1 << 30 # 1GB
184
+
185
+ if precision == 'fp16':
186
+ config.set_flag(trt.BuilderFlag.FP16)
187
+ elif precision == 'int8':
188
+ config.set_flag(trt.BuilderFlag.INT8)
189
+ config.int8_calibrator = EntropyCalibrator(calibration_data)
190
+
191
+ return builder.build_engine(network, config)
192
+
193
+ # DeepStream for video inference
194
+ # gst-launch-1.0 filesrc location=video.mp4 ! \
195
+ # decodebin ! nvvideoconvert ! \
196
+ # nvinfer config-file-path=config.txt ! \
197
+ # nvdsosd ! nveglglessink
198
+ ```
199
+
200
+ ### Coral Edge TPU
201
+ ```python
202
+ from pycoral.utils import edgetpu
203
+ from pycoral.adapters import common, classify
204
+
205
+ # Load Edge TPU model
206
+ interpreter = edgetpu.make_interpreter('model_edgetpu.tflite')
207
+ interpreter.allocate_tensors()
208
+
209
+ # Inference
210
+ common.set_input(interpreter, image)
211
+ interpreter.invoke()
212
+ classes = classify.get_classes(interpreter, top_k=5)
213
+
214
+ # Compile model for Edge TPU
215
+ # edgetpu_compiler model.tflite
216
+ ```
217
+
218
+ ### TFLite for Mobile
219
+ ```python
220
+ import tensorflow as tf
221
+
222
+ # Convert to TFLite with quantization
223
+ converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')
224
+ converter.optimizations = [tf.lite.Optimize.DEFAULT]
225
+ converter.target_spec.supported_types = [tf.float16]
226
+
227
+ # Full integer quantization
228
+ converter.representative_dataset = representative_dataset
229
+ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
230
+ converter.inference_input_type = tf.uint8
231
+ converter.inference_output_type = tf.uint8
232
+
233
+ tflite_model = converter.convert()
234
+
235
+ # Inference
236
+ interpreter = tf.lite.Interpreter(model_content=tflite_model)
237
+ interpreter.allocate_tensors()
238
+ ```
239
+
240
+ ## Hardware-Aware Optimization
241
+
242
+ ### Auto-Tuning
243
+ ```python
244
+ # TVM auto-tuning for specific hardware
245
+ import tvm
246
+ from tvm import relay, autotvm
247
+
248
+ # Extract tuning tasks
249
+ tasks = autotvm.task.extract_from_program(
250
+ mod["main"], target="cuda", params=params
251
+ )
252
+
253
+ # Tune each task
254
+ for task in tasks:
255
+ tuner = autotvm.tuner.XGBTuner(task)
256
+ tuner.tune(
257
+ n_trial=1000,
258
+ measure_option=autotvm.measure_option(
259
+ builder=autotvm.LocalBuilder(),
260
+ runner=autotvm.LocalRunner(number=10)
261
+ ),
262
+ callbacks=[autotvm.callback.log_to_file('tune.log')]
263
+ )
264
+
265
+ # Compile with best configs
266
+ with autotvm.apply_history_best('tune.log'):
267
+ with tvm.transform.PassContext(opt_level=3):
268
+ lib = relay.build(mod, target="cuda", params=params)
269
+ ```
270
+
271
+ ### Hardware Selection Matrix
272
+ ```python
273
+ def select_hardware(model_size, latency_req, batch_size, budget):
274
+ """Select optimal hardware for ML workload."""
275
+ recommendations = []
276
+
277
+ if model_size > 10e9: # >10B params
278
+ recommendations.append({
279
+ 'hardware': 'Multi-GPU (A100/H100)',
280
+ 'reason': 'Large model requires high memory bandwidth',
281
+ 'cost': 'High'
282
+ })
283
+
284
+ if latency_req < 10: # <10ms
285
+ recommendations.append({
286
+ 'hardware': 'TensorRT + GPU',
287
+ 'reason': 'Low latency requires optimized inference',
288
+ 'cost': 'Medium'
289
+ })
290
+
291
+ if batch_size == 1 and latency_req < 5:
292
+ recommendations.append({
293
+ 'hardware': 'Edge TPU / Jetson',
294
+ 'reason': 'Single-sample low-latency inference',
295
+ 'cost': 'Low'
296
+ })
297
+
298
+ return recommendations
299
+ ```
300
+
301
+ ## Benchmarking
302
+
303
+ ```python
304
+ import torch.utils.benchmark as benchmark
305
+
306
+ def benchmark_model(model, input_shape, device='cuda'):
307
+ x = torch.randn(*input_shape).to(device)
308
+ model = model.to(device)
309
+ model.eval()
310
+
311
+ # Warmup
312
+ for _ in range(10):
313
+ model(x)
314
+
315
+ # Benchmark
316
+ timer = benchmark.Timer(
317
+ stmt='model(x)',
318
+ globals={'model': model, 'x': x}
319
+ )
320
+
321
+ result = timer.blocked_autorange(min_run_time=1)
322
+
323
+ return {
324
+ 'mean_ms': result.mean * 1000,
325
+ 'median_ms': result.median * 1000,
326
+ 'iqr_ms': result.iqr * 1000,
327
+ 'throughput': 1000 / (result.mean * 1000)
328
+ }
329
+ ```
330
+
331
+ ## Commands
332
+ - `/omgoptim:profile` - Profile on hardware
333
+ - `/omgdeploy:edge` - Edge deployment
334
+ - `/omgdeploy:cloud` - Cloud GPU deployment
335
+
336
+ ## Best Practices
337
+
338
+ 1. Profile on target hardware early
339
+ 2. Use hardware-specific optimizations
340
+ 3. Batch for throughput, stream for latency
341
+ 4. Consider power consumption for edge
342
+ 5. Test with production data volumes