omgkit 2.20.0 → 2.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -10
- package/package.json +1 -1
- package/plugin/agents/ai-architect-agent.md +282 -0
- package/plugin/agents/data-scientist-agent.md +221 -0
- package/plugin/agents/experiment-analyst-agent.md +318 -0
- package/plugin/agents/ml-engineer-agent.md +165 -0
- package/plugin/agents/mlops-engineer-agent.md +324 -0
- package/plugin/agents/model-optimizer-agent.md +287 -0
- package/plugin/agents/production-engineer-agent.md +360 -0
- package/plugin/agents/research-scientist-agent.md +274 -0
- package/plugin/commands/omgdata/augment.md +86 -0
- package/plugin/commands/omgdata/collect.md +81 -0
- package/plugin/commands/omgdata/label.md +83 -0
- package/plugin/commands/omgdata/split.md +83 -0
- package/plugin/commands/omgdata/validate.md +76 -0
- package/plugin/commands/omgdata/version.md +85 -0
- package/plugin/commands/omgdeploy/ab.md +94 -0
- package/plugin/commands/omgdeploy/cloud.md +89 -0
- package/plugin/commands/omgdeploy/edge.md +93 -0
- package/plugin/commands/omgdeploy/package.md +91 -0
- package/plugin/commands/omgdeploy/serve.md +92 -0
- package/plugin/commands/omgfeature/embed.md +93 -0
- package/plugin/commands/omgfeature/extract.md +93 -0
- package/plugin/commands/omgfeature/select.md +85 -0
- package/plugin/commands/omgfeature/store.md +97 -0
- package/plugin/commands/omgml/init.md +60 -0
- package/plugin/commands/omgml/status.md +82 -0
- package/plugin/commands/omgops/drift.md +87 -0
- package/plugin/commands/omgops/monitor.md +99 -0
- package/plugin/commands/omgops/pipeline.md +102 -0
- package/plugin/commands/omgops/registry.md +109 -0
- package/plugin/commands/omgops/retrain.md +91 -0
- package/plugin/commands/omgoptim/distill.md +90 -0
- package/plugin/commands/omgoptim/profile.md +92 -0
- package/plugin/commands/omgoptim/prune.md +81 -0
- package/plugin/commands/omgoptim/quantize.md +83 -0
- package/plugin/commands/omgtrain/baseline.md +78 -0
- package/plugin/commands/omgtrain/compare.md +99 -0
- package/plugin/commands/omgtrain/evaluate.md +85 -0
- package/plugin/commands/omgtrain/train.md +81 -0
- package/plugin/commands/omgtrain/tune.md +89 -0
- package/plugin/registry.yaml +252 -2
- package/plugin/skills/ml-systems/SKILL.md +65 -0
- package/plugin/skills/ml-systems/ai-accelerators/SKILL.md +342 -0
- package/plugin/skills/ml-systems/data-eng/SKILL.md +126 -0
- package/plugin/skills/ml-systems/deep-learning-primer/SKILL.md +143 -0
- package/plugin/skills/ml-systems/deployment-paradigms/SKILL.md +148 -0
- package/plugin/skills/ml-systems/dnn-architectures/SKILL.md +128 -0
- package/plugin/skills/ml-systems/edge-deployment/SKILL.md +366 -0
- package/plugin/skills/ml-systems/efficient-ai/SKILL.md +316 -0
- package/plugin/skills/ml-systems/feature-engineering/SKILL.md +151 -0
- package/plugin/skills/ml-systems/ml-frameworks/SKILL.md +187 -0
- package/plugin/skills/ml-systems/ml-serving-optimization/SKILL.md +371 -0
- package/plugin/skills/ml-systems/ml-systems-fundamentals/SKILL.md +103 -0
- package/plugin/skills/ml-systems/ml-workflow/SKILL.md +162 -0
- package/plugin/skills/ml-systems/mlops/SKILL.md +386 -0
- package/plugin/skills/ml-systems/model-deployment/SKILL.md +350 -0
- package/plugin/skills/ml-systems/model-dev/SKILL.md +160 -0
- package/plugin/skills/ml-systems/model-optimization/SKILL.md +339 -0
- package/plugin/skills/ml-systems/robust-ai/SKILL.md +395 -0
- package/plugin/skills/ml-systems/training-data/SKILL.md +152 -0
- package/plugin/workflows/ml-systems/data-preparation-workflow.md +276 -0
- package/plugin/workflows/ml-systems/edge-deployment-workflow.md +413 -0
- package/plugin/workflows/ml-systems/full-ml-lifecycle-workflow.md +405 -0
- package/plugin/workflows/ml-systems/hyperparameter-tuning-workflow.md +352 -0
- package/plugin/workflows/ml-systems/mlops-pipeline-workflow.md +384 -0
- package/plugin/workflows/ml-systems/model-deployment-workflow.md +392 -0
- package/plugin/workflows/ml-systems/model-development-workflow.md +218 -0
- package/plugin/workflows/ml-systems/model-evaluation-workflow.md +416 -0
- package/plugin/workflows/ml-systems/model-optimization-workflow.md +390 -0
- package/plugin/workflows/ml-systems/monitoring-drift-workflow.md +446 -0
- package/plugin/workflows/ml-systems/retraining-workflow.md +401 -0
- package/plugin/workflows/ml-systems/training-pipeline-workflow.md +382 -0
package/plugin/registry.yaml
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# OMGKIT Component Registry
|
|
2
2
|
# Single Source of Truth for Agents, Skills, Commands, Workflows, and MCPs
|
|
3
|
-
# Version: 2.20.0
|
|
3
|
+
# Version: 2.21.1
|
|
4
4
|
# Updated: 2026-01-02
|
|
5
5
|
|
|
6
|
-
version: "2.20.0"
|
|
6
|
+
version: "2.21.1"
|
|
7
7
|
|
|
8
8
|
# =============================================================================
|
|
9
9
|
# OPTIMIZED ALIGNMENT PRINCIPLE (OAP)
|
|
@@ -493,6 +493,154 @@ agents:
|
|
|
493
493
|
- simulation/visualization-scientific
|
|
494
494
|
commands: []
|
|
495
495
|
|
|
496
|
+
# ---------------------------------------------------------------------------
|
|
497
|
+
# ML SYSTEMS AGENTS (Harvard CS 329S + Chip Huyen)
|
|
498
|
+
# ---------------------------------------------------------------------------
|
|
499
|
+
ml-engineer-agent:
|
|
500
|
+
file: agents/ml-engineer-agent.md
|
|
501
|
+
description: Full-stack ML engineering for end-to-end ML systems
|
|
502
|
+
skills:
|
|
503
|
+
- ml-systems/ml-systems-fundamentals
|
|
504
|
+
- ml-systems/data-eng
|
|
505
|
+
- ml-systems/feature-engineering
|
|
506
|
+
- ml-systems/ml-workflow
|
|
507
|
+
- ml-systems/model-dev
|
|
508
|
+
- ml-systems/ml-frameworks
|
|
509
|
+
- ml-systems/model-deployment
|
|
510
|
+
- ml-systems/mlops
|
|
511
|
+
commands:
|
|
512
|
+
- /omgml:init
|
|
513
|
+
- /omgml:status
|
|
514
|
+
- /omgdata:collect
|
|
515
|
+
- /omgdata:validate
|
|
516
|
+
- /omgfeature:extract
|
|
517
|
+
- /omgfeature:select
|
|
518
|
+
- /omgtrain:train
|
|
519
|
+
- /omgtrain:evaluate
|
|
520
|
+
- /omgdeploy:package
|
|
521
|
+
- /omgdeploy:serve
|
|
522
|
+
- /omgops:pipeline
|
|
523
|
+
|
|
524
|
+
data-scientist-agent:
|
|
525
|
+
file: agents/data-scientist-agent.md
|
|
526
|
+
description: Expert data science for EDA, statistical modeling, and insights
|
|
527
|
+
skills:
|
|
528
|
+
- ml-systems/ml-systems-fundamentals
|
|
529
|
+
- ml-systems/data-eng
|
|
530
|
+
- ml-systems/training-data
|
|
531
|
+
- ml-systems/feature-engineering
|
|
532
|
+
- ml-systems/ml-workflow
|
|
533
|
+
- ml-systems/model-dev
|
|
534
|
+
commands:
|
|
535
|
+
- /omgdata:collect
|
|
536
|
+
- /omgdata:validate
|
|
537
|
+
- /omgdata:label
|
|
538
|
+
- /omgdata:augment
|
|
539
|
+
- /omgdata:split
|
|
540
|
+
- /omgfeature:extract
|
|
541
|
+
- /omgfeature:select
|
|
542
|
+
- /omgtrain:baseline
|
|
543
|
+
- /omgtrain:train
|
|
544
|
+
- /omgtrain:evaluate
|
|
545
|
+
- /omgtrain:compare
|
|
546
|
+
|
|
547
|
+
mlops-engineer-agent:
|
|
548
|
+
file: agents/mlops-engineer-agent.md
|
|
549
|
+
description: MLOps for production ML infrastructure and automation
|
|
550
|
+
skills:
|
|
551
|
+
- ml-systems/mlops
|
|
552
|
+
- ml-systems/robust-ai
|
|
553
|
+
- ml-systems/model-deployment
|
|
554
|
+
- ml-systems/ml-serving-optimization
|
|
555
|
+
commands:
|
|
556
|
+
- /omgops:pipeline
|
|
557
|
+
- /omgops:monitor
|
|
558
|
+
- /omgops:drift
|
|
559
|
+
- /omgops:retrain
|
|
560
|
+
- /omgops:registry
|
|
561
|
+
- /omgdeploy:package
|
|
562
|
+
- /omgdeploy:serve
|
|
563
|
+
- /omgdeploy:cloud
|
|
564
|
+
- /omgdeploy:ab
|
|
565
|
+
|
|
566
|
+
research-scientist-agent:
|
|
567
|
+
file: agents/research-scientist-agent.md
|
|
568
|
+
description: AI/ML research for novel approaches and paper implementation
|
|
569
|
+
skills:
|
|
570
|
+
- ml-systems/ml-systems-fundamentals
|
|
571
|
+
- ml-systems/deep-learning-primer
|
|
572
|
+
- ml-systems/dnn-architectures
|
|
573
|
+
- ml-systems/ml-workflow
|
|
574
|
+
- ml-systems/model-dev
|
|
575
|
+
- ml-systems/ml-frameworks
|
|
576
|
+
commands:
|
|
577
|
+
- /omgtrain:train
|
|
578
|
+
- /omgtrain:tune
|
|
579
|
+
- /omgtrain:evaluate
|
|
580
|
+
- /omgtrain:compare
|
|
581
|
+
- /omgml:status
|
|
582
|
+
|
|
583
|
+
model-optimizer-agent:
|
|
584
|
+
file: agents/model-optimizer-agent.md
|
|
585
|
+
description: Model optimization through quantization, pruning, and distillation
|
|
586
|
+
skills:
|
|
587
|
+
- ml-systems/efficient-ai
|
|
588
|
+
- ml-systems/model-optimization
|
|
589
|
+
- ml-systems/ai-accelerators
|
|
590
|
+
- ml-systems/ml-serving-optimization
|
|
591
|
+
commands:
|
|
592
|
+
- /omgoptim:quantize
|
|
593
|
+
- /omgoptim:prune
|
|
594
|
+
- /omgoptim:distill
|
|
595
|
+
- /omgoptim:profile
|
|
596
|
+
- /omgtrain:evaluate
|
|
597
|
+
|
|
598
|
+
production-engineer-agent:
|
|
599
|
+
file: agents/production-engineer-agent.md
|
|
600
|
+
description: ML production deployment with reliability and scalability
|
|
601
|
+
skills:
|
|
602
|
+
- ml-systems/model-deployment
|
|
603
|
+
- ml-systems/ml-serving-optimization
|
|
604
|
+
- ml-systems/edge-deployment
|
|
605
|
+
- ml-systems/robust-ai
|
|
606
|
+
commands:
|
|
607
|
+
- /omgdeploy:package
|
|
608
|
+
- /omgdeploy:serve
|
|
609
|
+
- /omgdeploy:edge
|
|
610
|
+
- /omgdeploy:cloud
|
|
611
|
+
- /omgdeploy:ab
|
|
612
|
+
- /omgops:monitor
|
|
613
|
+
|
|
614
|
+
ai-architect-agent:
|
|
615
|
+
file: agents/ai-architect-agent.md
|
|
616
|
+
description: Senior AI/ML architect for end-to-end ML system design
|
|
617
|
+
skills:
|
|
618
|
+
- ml-systems/ml-systems-fundamentals
|
|
619
|
+
- ml-systems/deployment-paradigms
|
|
620
|
+
- ml-systems/data-eng
|
|
621
|
+
- ml-systems/feature-engineering
|
|
622
|
+
- ml-systems/ml-workflow
|
|
623
|
+
- ml-systems/model-deployment
|
|
624
|
+
- ml-systems/mlops
|
|
625
|
+
- ml-systems/robust-ai
|
|
626
|
+
commands:
|
|
627
|
+
- /omgml:init
|
|
628
|
+
- /omgml:status
|
|
629
|
+
- /omgops:pipeline
|
|
630
|
+
- /omgops:registry
|
|
631
|
+
|
|
632
|
+
experiment-analyst-agent:
|
|
633
|
+
file: agents/experiment-analyst-agent.md
|
|
634
|
+
description: ML experiment analysis and model comparison
|
|
635
|
+
skills:
|
|
636
|
+
- ml-systems/ml-workflow
|
|
637
|
+
- ml-systems/model-dev
|
|
638
|
+
- ml-systems/training-data
|
|
639
|
+
commands:
|
|
640
|
+
- /omgtrain:evaluate
|
|
641
|
+
- /omgtrain:compare
|
|
642
|
+
- /omgml:status
|
|
643
|
+
|
|
496
644
|
# =============================================================================
|
|
497
645
|
# SKILL CATEGORIES
|
|
498
646
|
# =============================================================================
|
|
@@ -514,6 +662,7 @@ skill_categories:
|
|
|
514
662
|
- languages
|
|
515
663
|
- methodology
|
|
516
664
|
- microservices
|
|
665
|
+
- ml-systems # ML Systems Design (Harvard CS 329S + Chip Huyen)
|
|
517
666
|
- mobile
|
|
518
667
|
- mobile-advanced
|
|
519
668
|
- omega
|
|
@@ -540,6 +689,13 @@ command_namespaces:
|
|
|
540
689
|
- iot # IoT operations
|
|
541
690
|
- ml # Machine learning
|
|
542
691
|
- omega # Omega principles
|
|
692
|
+
- omgdata # ML Data Engineering
|
|
693
|
+
- omgdeploy # ML Model Deployment
|
|
694
|
+
- omgfeature # ML Feature Engineering
|
|
695
|
+
- omgml # ML Project Management
|
|
696
|
+
- omgops # ML Operations
|
|
697
|
+
- omgoptim # ML Model Optimization
|
|
698
|
+
- omgtrain # ML Model Training
|
|
543
699
|
- perf # Performance
|
|
544
700
|
- planning # Planning and research
|
|
545
701
|
- platform # Platform engineering
|
|
@@ -772,3 +928,97 @@ workflows:
|
|
|
772
928
|
agents: [copywriter, researcher]
|
|
773
929
|
skills: []
|
|
774
930
|
commands: [/planning:brainstorm, /planning:research]
|
|
931
|
+
|
|
932
|
+
# ---------------------------------------------------------------------------
|
|
933
|
+
# ML SYSTEMS WORKFLOWS (Harvard CS 329S + Chip Huyen)
|
|
934
|
+
# ---------------------------------------------------------------------------
|
|
935
|
+
ml-systems/model-development-workflow:
|
|
936
|
+
agents: [data-scientist-agent, research-scientist-agent, experiment-analyst-agent]
|
|
937
|
+
skills:
|
|
938
|
+
- ml-systems/ml-systems-fundamentals
|
|
939
|
+
- ml-systems/ml-workflow
|
|
940
|
+
- ml-systems/model-dev
|
|
941
|
+
commands: [/omgml:init, /omgtrain:baseline, /omgtrain:train, /omgtrain:evaluate]
|
|
942
|
+
|
|
943
|
+
ml-systems/data-preparation-workflow:
|
|
944
|
+
agents: [data-scientist-agent, ml-engineer-agent]
|
|
945
|
+
skills:
|
|
946
|
+
- ml-systems/data-eng
|
|
947
|
+
- ml-systems/training-data
|
|
948
|
+
- ml-systems/feature-engineering
|
|
949
|
+
commands: [/omgdata:collect, /omgdata:validate, /omgdata:label, /omgdata:augment, /omgdata:split]
|
|
950
|
+
|
|
951
|
+
ml-systems/training-pipeline-workflow:
|
|
952
|
+
agents: [ml-engineer-agent, mlops-engineer-agent]
|
|
953
|
+
skills:
|
|
954
|
+
- ml-systems/ml-workflow
|
|
955
|
+
- ml-systems/mlops
|
|
956
|
+
commands: [/omgops:pipeline, /omgtrain:train, /omgtrain:evaluate, /omgops:registry]
|
|
957
|
+
|
|
958
|
+
ml-systems/hyperparameter-tuning-workflow:
|
|
959
|
+
agents: [research-scientist-agent, experiment-analyst-agent]
|
|
960
|
+
skills:
|
|
961
|
+
- ml-systems/model-dev
|
|
962
|
+
- ml-systems/ml-frameworks
|
|
963
|
+
commands: [/omgtrain:tune, /omgtrain:compare]
|
|
964
|
+
|
|
965
|
+
ml-systems/model-evaluation-workflow:
|
|
966
|
+
agents: [experiment-analyst-agent, data-scientist-agent]
|
|
967
|
+
skills:
|
|
968
|
+
- ml-systems/model-dev
|
|
969
|
+
- ml-systems/training-data
|
|
970
|
+
commands: [/omgtrain:evaluate, /omgtrain:compare]
|
|
971
|
+
|
|
972
|
+
ml-systems/model-optimization-workflow:
|
|
973
|
+
agents: [model-optimizer-agent, production-engineer-agent]
|
|
974
|
+
skills:
|
|
975
|
+
- ml-systems/efficient-ai
|
|
976
|
+
- ml-systems/model-optimization
|
|
977
|
+
- ml-systems/ml-serving-optimization
|
|
978
|
+
commands: [/omgoptim:profile, /omgoptim:quantize, /omgoptim:prune, /omgoptim:distill]
|
|
979
|
+
|
|
980
|
+
ml-systems/model-deployment-workflow:
|
|
981
|
+
agents: [production-engineer-agent, mlops-engineer-agent]
|
|
982
|
+
skills:
|
|
983
|
+
- ml-systems/model-deployment
|
|
984
|
+
- ml-systems/ml-serving-optimization
|
|
985
|
+
- ml-systems/robust-ai
|
|
986
|
+
commands: [/omgdeploy:package, /omgdeploy:serve, /omgdeploy:cloud, /omgdeploy:ab]
|
|
987
|
+
|
|
988
|
+
ml-systems/edge-deployment-workflow:
|
|
989
|
+
agents: [model-optimizer-agent, production-engineer-agent]
|
|
990
|
+
skills:
|
|
991
|
+
- ml-systems/edge-deployment
|
|
992
|
+
- ml-systems/efficient-ai
|
|
993
|
+
commands: [/omgoptim:quantize, /omgdeploy:edge]
|
|
994
|
+
|
|
995
|
+
ml-systems/mlops-pipeline-workflow:
|
|
996
|
+
agents: [mlops-engineer-agent, production-engineer-agent]
|
|
997
|
+
skills:
|
|
998
|
+
- ml-systems/mlops
|
|
999
|
+
- ml-systems/model-deployment
|
|
1000
|
+
commands: [/omgops:pipeline, /omgops:monitor, /omgops:registry]
|
|
1001
|
+
|
|
1002
|
+
ml-systems/monitoring-drift-workflow:
|
|
1003
|
+
agents: [mlops-engineer-agent, experiment-analyst-agent]
|
|
1004
|
+
skills:
|
|
1005
|
+
- ml-systems/robust-ai
|
|
1006
|
+
- ml-systems/mlops
|
|
1007
|
+
commands: [/omgops:monitor, /omgops:drift]
|
|
1008
|
+
|
|
1009
|
+
ml-systems/retraining-workflow:
|
|
1010
|
+
agents: [ml-engineer-agent, mlops-engineer-agent, experiment-analyst-agent]
|
|
1011
|
+
skills:
|
|
1012
|
+
- ml-systems/mlops
|
|
1013
|
+
- ml-systems/ml-workflow
|
|
1014
|
+
commands: [/omgops:retrain, /omgtrain:train, /omgtrain:evaluate]
|
|
1015
|
+
|
|
1016
|
+
ml-systems/full-ml-lifecycle-workflow:
|
|
1017
|
+
agents: [ai-architect-agent, data-scientist-agent, ml-engineer-agent, research-scientist-agent, model-optimizer-agent, production-engineer-agent, mlops-engineer-agent, experiment-analyst-agent]
|
|
1018
|
+
skills:
|
|
1019
|
+
- ml-systems/ml-systems-fundamentals
|
|
1020
|
+
- ml-systems/data-eng
|
|
1021
|
+
- ml-systems/ml-workflow
|
|
1022
|
+
- ml-systems/model-deployment
|
|
1023
|
+
- ml-systems/mlops
|
|
1024
|
+
commands: [/omgml:init, /omgdata:collect, /omgtrain:train, /omgdeploy:serve, /omgops:monitor]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-systems
|
|
3
|
+
description: Machine Learning Systems - comprehensive knowledge for building production ML systems from data engineering through deployment and operations. Based on Harvard ML Systems course and Designing ML Systems by Chip Huyen.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# ML Systems
|
|
7
|
+
|
|
8
|
+
Building production-ready machine learning systems.
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
This skill category covers the complete ML system lifecycle:
|
|
13
|
+
|
|
14
|
+
1. **Foundations** - Core concepts, architectures, paradigms
|
|
15
|
+
2. **Data Engineering** - Data collection, quality, feature engineering
|
|
16
|
+
3. **Model Development** - Training, evaluation, frameworks
|
|
17
|
+
4. **Performance** - Optimization, acceleration, efficiency
|
|
18
|
+
5. **Deployment** - Serving, edge deployment, scaling
|
|
19
|
+
6. **Operations** - MLOps, monitoring, reliability
|
|
20
|
+
|
|
21
|
+
## Categories
|
|
22
|
+
|
|
23
|
+
### Foundations
|
|
24
|
+
- `ml-systems-fundamentals` - Core ML systems concepts
|
|
25
|
+
- `deep-learning-primer` - Deep learning foundations
|
|
26
|
+
- `dnn-architectures` - Neural network architectures
|
|
27
|
+
- `deployment-paradigms` - Deployment patterns
|
|
28
|
+
|
|
29
|
+
### Data Engineering
|
|
30
|
+
- `data-eng` - Data pipelines and quality
|
|
31
|
+
- `training-data` - Training data management
|
|
32
|
+
- `feature-engineering` - Feature creation and stores
|
|
33
|
+
|
|
34
|
+
### Model Development
|
|
35
|
+
- `ml-workflow` - ML development workflow
|
|
36
|
+
- `model-dev` - Model training and selection
|
|
37
|
+
- `ml-frameworks` - Framework best practices
|
|
38
|
+
|
|
39
|
+
### Performance
|
|
40
|
+
- `efficient-ai` - Efficiency techniques
|
|
41
|
+
- `model-optimization` - Quantization, pruning, distillation
|
|
42
|
+
- `ai-accelerators` - Hardware acceleration
|
|
43
|
+
|
|
44
|
+
### Deployment
|
|
45
|
+
- `model-deployment` - Production deployment
|
|
46
|
+
- `ml-serving-optimization` - Inference and serving optimization
|
|
47
|
+
- `edge-deployment` - Edge and mobile deployment
|
|
48
|
+
|
|
49
|
+
### Operations
|
|
50
|
+
- `mlops` - ML operations and lifecycle
|
|
51
|
+
- `robust-ai` - Reliability and robustness
|
|
52
|
+
|
|
53
|
+
## Key Principles
|
|
54
|
+
|
|
55
|
+
1. **Data-Centric AI** - Focus on data quality over model complexity
|
|
56
|
+
2. **Iterative Development** - Start simple, iterate based on metrics
|
|
57
|
+
3. **Production-First** - Design for deployment from the start
|
|
58
|
+
4. **Monitoring** - Continuous monitoring and improvement
|
|
59
|
+
5. **Reproducibility** - Version everything (data, code, models)
|
|
60
|
+
|
|
61
|
+
## References
|
|
62
|
+
|
|
63
|
+
- Harvard CS249r: Machine Learning Systems; Stanford CS 329S: Machine Learning Systems Design
|
|
64
|
+
- Designing Machine Learning Systems by Chip Huyen
|
|
65
|
+
- MLOps: Continuous Delivery and Automation Pipelines
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ai-accelerators
|
|
3
|
+
description: AI hardware accelerators including GPUs, TPUs, custom silicon, and hardware-aware optimization strategies for ML workloads.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AI Accelerators
|
|
7
|
+
|
|
8
|
+
Hardware acceleration for ML workloads.
|
|
9
|
+
|
|
10
|
+
## Hardware Landscape
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
14
|
+
│ AI ACCELERATOR TYPES │
|
|
15
|
+
├─────────────────────────────────────────────────────────────┤
|
|
16
|
+
│ │
|
|
17
|
+
│ GPU (NVIDIA) TPU (Google) NPU/Custom │
|
|
18
|
+
│ ───────────── ──────────── ───────── │
|
|
19
|
+
│ CUDA cores Systolic array Apple Neural │
|
|
20
|
+
│ Tensor cores BF16 native Qualcomm Hexagon │
|
|
21
|
+
│ General purpose TPU pods Intel Habana │
|
|
22
|
+
│ PyTorch/TF native JAX optimized AWS Inferentia │
|
|
23
|
+
│ │
|
|
24
|
+
│ FPGA ASIC Edge Accelerators │
|
|
25
|
+
│ ───────────── ──────────── ───────── │
|
|
26
|
+
│ Reconfigurable Fixed function Coral Edge TPU │
|
|
27
|
+
│ Low latency Maximum perf Jetson (NVIDIA) │
|
|
28
|
+
│ Power efficient High volume Intel NCS2 │
|
|
29
|
+
│ │
|
|
30
|
+
└─────────────────────────────────────────────────────────────┘
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## GPU Optimization
|
|
34
|
+
|
|
35
|
+
### CUDA Memory Management
|
|
36
|
+
```python
|
|
37
|
+
import torch
|
|
38
|
+
|
|
39
|
+
# Memory allocation
|
|
40
|
+
torch.cuda.empty_cache()
|
|
41
|
+
torch.cuda.memory_allocated()
|
|
42
|
+
torch.cuda.max_memory_allocated()
|
|
43
|
+
|
|
44
|
+
# Pin memory for faster transfers
|
|
45
|
+
train_loader = DataLoader(
|
|
46
|
+
dataset,
|
|
47
|
+
batch_size=32,
|
|
48
|
+
pin_memory=True,
|
|
49
|
+
num_workers=4
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Async data transfer
|
|
53
|
+
def async_prefetch(loader, device):
|
|
54
|
+
stream = torch.cuda.Stream()
|
|
55
|
+
for batch in loader:
|
|
56
|
+
with torch.cuda.stream(stream):
|
|
57
|
+
batch = batch.to(device, non_blocking=True)
|
|
58
|
+
torch.cuda.current_stream().wait_stream(stream)
|
|
59
|
+
yield batch
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Tensor Core Utilization
|
|
63
|
+
```python
|
|
64
|
+
# Ensure tensor core alignment (multiples of 8)
|
|
65
|
+
class TensorCoreOptimized(nn.Module):
|
|
66
|
+
def __init__(self, in_features, out_features):
|
|
67
|
+
super().__init__()
|
|
68
|
+
# Round to multiple of 8 for tensor cores
|
|
69
|
+
self.in_features = ((in_features + 7) // 8) * 8
|
|
70
|
+
self.out_features = ((out_features + 7) // 8) * 8
|
|
71
|
+
self.linear = nn.Linear(self.in_features, self.out_features)
|
|
72
|
+
self.pad_in = self.in_features - in_features
|
|
73
|
+
|
|
74
|
+
def forward(self, x):
|
|
75
|
+
if self.pad_in > 0:
|
|
76
|
+
x = F.pad(x, (0, self.pad_in))
|
|
77
|
+
return self.linear(x)
|
|
78
|
+
|
|
79
|
+
# Enable TF32 on Ampere+ GPUs
|
|
80
|
+
torch.backends.cuda.matmul.allow_tf32 = True
|
|
81
|
+
torch.backends.cudnn.allow_tf32 = True
|
|
82
|
+
|
|
83
|
+
# Force FP16 computation
|
|
84
|
+
with torch.cuda.amp.autocast(dtype=torch.float16):
|
|
85
|
+
output = model(input)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Multi-GPU Strategies
|
|
89
|
+
```python
|
|
90
|
+
# DataParallel (simple, not recommended for training)
|
|
91
|
+
model = nn.DataParallel(model)
|
|
92
|
+
|
|
93
|
+
# DistributedDataParallel (recommended)
|
|
94
|
+
model = DistributedDataParallel(model, device_ids=[local_rank])
|
|
95
|
+
|
|
96
|
+
# Model Parallelism (for large models)
|
|
97
|
+
class ModelParallel(nn.Module):
|
|
98
|
+
def __init__(self):
|
|
99
|
+
super().__init__()
|
|
100
|
+
self.encoder = nn.TransformerEncoder(...).to('cuda:0')
|
|
101
|
+
self.decoder = nn.TransformerDecoder(...).to('cuda:1')
|
|
102
|
+
|
|
103
|
+
def forward(self, x):
|
|
104
|
+
x = self.encoder(x.to('cuda:0'))
|
|
105
|
+
x = self.decoder(x.to('cuda:1'))
|
|
106
|
+
return x
|
|
107
|
+
|
|
108
|
+
# Pipeline Parallelism
|
|
109
|
+
from torch.distributed.pipeline.sync import Pipe
|
|
110
|
+
|
|
111
|
+
model = nn.Sequential(
|
|
112
|
+
nn.Linear(100, 200).to('cuda:0'),
|
|
113
|
+
nn.ReLU().to('cuda:0'),
|
|
114
|
+
nn.Linear(200, 100).to('cuda:1')
|
|
115
|
+
)
|
|
116
|
+
model = Pipe(model, chunks=8)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## TPU Optimization
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# JAX/TPU optimized training
|
|
123
|
+
import jax
|
|
124
|
+
import jax.numpy as jnp
|
|
125
|
+
from flax import linen as nn
|
|
126
|
+
|
|
127
|
+
class TPUModel(nn.Module):
|
|
128
|
+
features: int
|
|
129
|
+
|
|
130
|
+
@nn.compact
|
|
131
|
+
def __call__(self, x):
|
|
132
|
+
x = nn.Dense(self.features)(x)
|
|
133
|
+
x = nn.relu(x)
|
|
134
|
+
return nn.Dense(10)(x)
|
|
135
|
+
|
|
136
|
+
# pmap for data parallelism across TPU cores
|
|
137
|
+
@jax.pmap
|
|
138
|
+
def train_step(state, batch):
|
|
139
|
+
def loss_fn(params):
|
|
140
|
+
logits = state.apply_fn({'params': params}, batch['image'])
|
|
141
|
+
loss = jnp.mean(optax.softmax_cross_entropy(logits, batch['label']))
|
|
142
|
+
return loss
|
|
143
|
+
|
|
144
|
+
grad_fn = jax.value_and_grad(loss_fn)
|
|
145
|
+
loss, grads = grad_fn(state.params)
|
|
146
|
+
grads = jax.lax.pmean(grads, axis_name='batch')
|
|
147
|
+
state = state.apply_gradients(grads=grads)
|
|
148
|
+
return state, loss
|
|
149
|
+
|
|
150
|
+
# PyTorch/XLA for TPU
|
|
151
|
+
import torch_xla.core.xla_model as xm
|
|
152
|
+
import torch_xla.distributed.parallel_loader as pl
|
|
153
|
+
|
|
154
|
+
device = xm.xla_device()
|
|
155
|
+
model = model.to(device)
|
|
156
|
+
|
|
157
|
+
for batch in pl.ParallelLoader(train_loader, [device]):
|
|
158
|
+
output = model(batch)
|
|
159
|
+
loss.backward()
|
|
160
|
+
xm.optimizer_step(optimizer)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Edge Accelerators
|
|
164
|
+
|
|
165
|
+
### NVIDIA Jetson
|
|
166
|
+
```python
|
|
167
|
+
# TensorRT optimization for Jetson
|
|
168
|
+
import tensorrt as trt
|
|
169
|
+
import pycuda.driver as cuda
|
|
170
|
+
|
|
171
|
+
def build_engine(onnx_path, precision='fp16'):
|
|
172
|
+
logger = trt.Logger(trt.Logger.WARNING)
|
|
173
|
+
builder = trt.Builder(logger)
|
|
174
|
+
network = builder.create_network(
|
|
175
|
+
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
|
|
176
|
+
)
|
|
177
|
+
parser = trt.OnnxParser(network, logger)
|
|
178
|
+
|
|
179
|
+
with open(onnx_path, 'rb') as f:
|
|
180
|
+
parser.parse(f.read())
|
|
181
|
+
|
|
182
|
+
config = builder.create_builder_config()
|
|
183
|
+
config.max_workspace_size = 1 << 30 # 1GB
|
|
184
|
+
|
|
185
|
+
if precision == 'fp16':
|
|
186
|
+
config.set_flag(trt.BuilderFlag.FP16)
|
|
187
|
+
elif precision == 'int8':
|
|
188
|
+
config.set_flag(trt.BuilderFlag.INT8)
|
|
189
|
+
config.int8_calibrator = EntropyCalibrator(calibration_data)
|
|
190
|
+
|
|
191
|
+
return builder.build_engine(network, config)
|
|
192
|
+
|
|
193
|
+
# DeepStream for video inference
|
|
194
|
+
# gst-launch-1.0 filesrc location=video.mp4 ! \
|
|
195
|
+
# decodebin ! nvvideoconvert ! \
|
|
196
|
+
# nvinfer config-file-path=config.txt ! \
|
|
197
|
+
# nvdsosd ! nveglglessink
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Coral Edge TPU
|
|
201
|
+
```python
|
|
202
|
+
from pycoral.utils import edgetpu
|
|
203
|
+
from pycoral.adapters import common, classify
|
|
204
|
+
|
|
205
|
+
# Load Edge TPU model
|
|
206
|
+
interpreter = edgetpu.make_interpreter('model_edgetpu.tflite')
|
|
207
|
+
interpreter.allocate_tensors()
|
|
208
|
+
|
|
209
|
+
# Inference
|
|
210
|
+
common.set_input(interpreter, image)
|
|
211
|
+
interpreter.invoke()
|
|
212
|
+
classes = classify.get_classes(interpreter, top_k=5)
|
|
213
|
+
|
|
214
|
+
# Compile model for Edge TPU
|
|
215
|
+
# edgetpu_compiler model.tflite
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### TFLite for Mobile
|
|
219
|
+
```python
|
|
220
|
+
import tensorflow as tf
|
|
221
|
+
|
|
222
|
+
# Convert to TFLite with quantization
|
|
223
|
+
converter = tf.lite.TFLiteConverter.from_saved_model('saved_model/')
|
|
224
|
+
converter.optimizations = [tf.lite.Optimize.DEFAULT]
|
|
225
|
+
converter.target_spec.supported_types = [tf.float16]
|
|
226
|
+
|
|
227
|
+
# Full integer quantization
|
|
228
|
+
converter.representative_dataset = representative_dataset
|
|
229
|
+
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
|
|
230
|
+
converter.inference_input_type = tf.uint8
|
|
231
|
+
converter.inference_output_type = tf.uint8
|
|
232
|
+
|
|
233
|
+
tflite_model = converter.convert()
|
|
234
|
+
|
|
235
|
+
# Inference
|
|
236
|
+
interpreter = tf.lite.Interpreter(model_content=tflite_model)
|
|
237
|
+
interpreter.allocate_tensors()
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Hardware-Aware Optimization
|
|
241
|
+
|
|
242
|
+
### Auto-Tuning
|
|
243
|
+
```python
|
|
244
|
+
# TVM auto-tuning for specific hardware
|
|
245
|
+
import tvm
|
|
246
|
+
from tvm import relay, autotvm
|
|
247
|
+
|
|
248
|
+
# Extract tuning tasks
|
|
249
|
+
tasks = autotvm.task.extract_from_program(
|
|
250
|
+
mod["main"], target="cuda", params=params
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# Tune each task
|
|
254
|
+
for task in tasks:
|
|
255
|
+
tuner = autotvm.tuner.XGBTuner(task)
|
|
256
|
+
tuner.tune(
|
|
257
|
+
n_trial=1000,
|
|
258
|
+
measure_option=autotvm.measure_option(
|
|
259
|
+
builder=autotvm.LocalBuilder(),
|
|
260
|
+
runner=autotvm.LocalRunner(number=10)
|
|
261
|
+
),
|
|
262
|
+
callbacks=[autotvm.callback.log_to_file('tune.log')]
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# Compile with best configs
|
|
266
|
+
with autotvm.apply_history_best('tune.log'):
|
|
267
|
+
with tvm.transform.PassContext(opt_level=3):
|
|
268
|
+
lib = relay.build(mod, target="cuda", params=params)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Hardware Selection Matrix
|
|
272
|
+
```python
|
|
273
|
+
def select_hardware(model_size, latency_req, batch_size, budget):
|
|
274
|
+
"""Select optimal hardware for ML workload."""
|
|
275
|
+
recommendations = []
|
|
276
|
+
|
|
277
|
+
if model_size > 10e9: # >10B params
|
|
278
|
+
recommendations.append({
|
|
279
|
+
'hardware': 'Multi-GPU (A100/H100)',
|
|
280
|
+
'reason': 'Large model requires high memory bandwidth',
|
|
281
|
+
'cost': 'High'
|
|
282
|
+
})
|
|
283
|
+
|
|
284
|
+
if latency_req < 10: # <10ms
|
|
285
|
+
recommendations.append({
|
|
286
|
+
'hardware': 'TensorRT + GPU',
|
|
287
|
+
'reason': 'Low latency requires optimized inference',
|
|
288
|
+
'cost': 'Medium'
|
|
289
|
+
})
|
|
290
|
+
|
|
291
|
+
if batch_size == 1 and latency_req < 5:
|
|
292
|
+
recommendations.append({
|
|
293
|
+
'hardware': 'Edge TPU / Jetson',
|
|
294
|
+
'reason': 'Single-sample low-latency inference',
|
|
295
|
+
'cost': 'Low'
|
|
296
|
+
})
|
|
297
|
+
|
|
298
|
+
return recommendations
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## Benchmarking
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
import torch.utils.benchmark as benchmark
|
|
305
|
+
|
|
306
|
+
def benchmark_model(model, input_shape, device='cuda'):
|
|
307
|
+
x = torch.randn(*input_shape).to(device)
|
|
308
|
+
model = model.to(device)
|
|
309
|
+
model.eval()
|
|
310
|
+
|
|
311
|
+
# Warmup
|
|
312
|
+
for _ in range(10):
|
|
313
|
+
model(x)
|
|
314
|
+
|
|
315
|
+
# Benchmark
|
|
316
|
+
timer = benchmark.Timer(
|
|
317
|
+
stmt='model(x)',
|
|
318
|
+
globals={'model': model, 'x': x}
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
result = timer.blocked_autorange(min_run_time=1)
|
|
322
|
+
|
|
323
|
+
return {
|
|
324
|
+
'mean_ms': result.mean * 1000,
|
|
325
|
+
'median_ms': result.median * 1000,
|
|
326
|
+
'iqr_ms': result.iqr * 1000,
|
|
327
|
+
'throughput': 1000 / (result.mean * 1000)
|
|
328
|
+
}
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
## Commands
|
|
332
|
+
- `/omgoptim:profile` - Profile on hardware
|
|
333
|
+
- `/omgdeploy:edge` - Edge deployment
|
|
334
|
+
- `/omgdeploy:cloud` - Cloud GPU deployment
|
|
335
|
+
|
|
336
|
+
## Best Practices
|
|
337
|
+
|
|
338
|
+
1. Profile on target hardware early
|
|
339
|
+
2. Use hardware-specific optimizations
|
|
340
|
+
3. Batch for throughput, stream for latency
|
|
341
|
+
4. Consider power consumption for edge
|
|
342
|
+
5. Test with production data volumes
|