packwise-skills 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursorrules +23 -23
- package/CLAUDE.md +25 -25
- package/LICENSE +21 -0
- package/README.md +404 -295
- package/audit.md +224 -224
- package/bin/packwise.js +322 -155
- package/install.sh +123 -0
- package/package.json +32 -31
- package/skill.md +944 -719
- package/sub-skills/ai/local-llm.md +183 -183
- package/sub-skills/ai/python-ml.md +164 -164
- package/sub-skills/backend/go-server.md +184 -184
- package/sub-skills/backend/java-spring.md +241 -241
- package/sub-skills/backend/node-server.md +164 -164
- package/sub-skills/backend/php-laravel.md +175 -175
- package/sub-skills/backend/python-server.md +164 -164
- package/sub-skills/backend/rust-backend.md +118 -118
- package/sub-skills/cli/python-cli.md +236 -236
- package/sub-skills/cli/sdk-library.md +497 -497
- package/sub-skills/cloud/ci-cd-pipelines.md +350 -350
- package/sub-skills/cloud/docker.md +191 -191
- package/sub-skills/cloud/kubernetes.md +277 -277
- package/sub-skills/cloud/payment-integration.md +307 -307
- package/sub-skills/cross-platform/multiplatform.md +252 -252
- package/sub-skills/desktop/electron.md +783 -783
- package/sub-skills/desktop/game-dev.md +443 -443
- package/sub-skills/desktop/native-app.md +123 -123
- package/sub-skills/desktop/scenarios.md +443 -443
- package/sub-skills/desktop/smart-platforms.md +324 -324
- package/sub-skills/desktop/tauri.md +428 -428
- package/sub-skills/desktop/vr-ar.md +252 -252
- package/sub-skills/desktop/web-to-desktop.md +153 -153
- package/sub-skills/embedded/car-infotainment.md +129 -129
- package/sub-skills/embedded/esp32.md +184 -184
- package/sub-skills/embedded/ros.md +150 -150
- package/sub-skills/embedded/stm32.md +160 -160
- package/sub-skills/mobile/android.md +322 -322
- package/sub-skills/mobile/capacitor.md +232 -232
- package/sub-skills/mobile/flutter-mobile.md +138 -138
- package/sub-skills/mobile/harmonyos.md +150 -150
- package/sub-skills/mobile/ios.md +245 -245
- package/sub-skills/mobile/react-native.md +443 -443
- package/sub-skills/mobile/wearables.md +230 -230
- package/sub-skills/plugins/browser-extension.md +308 -308
- package/sub-skills/plugins/jetbrains-plugin.md +226 -226
- package/sub-skills/plugins/vscode-extension.md +204 -204
- package/sub-skills/security/security-tools.md +174 -174
- package/sub-skills/web/monorepo.md +274 -274
- package/sub-skills/web/pwa.md +220 -220
- package/sub-skills/web/serverless-edge.md +295 -295
- package/sub-skills/web/spa.md +266 -266
- package/sub-skills/web/ssr.md +228 -228
- package/sub-skills/web/wasm.md +243 -243
|
@@ -1,164 +1,164 @@
|
|
|
1
|
-
# Python ML Model Packaging Sub-Skill
|
|
2
|
-
|
|
3
|
-
Package, optimize, and serve machine learning models for production.
|
|
4
|
-
|
|
5
|
-
**Current version**: Python 3.12+ / PyTorch 2.x / TensorFlow 2.17+ / ONNX Runtime 1.19+ (2025-2026)
|
|
6
|
-
|
|
7
|
-
## When to Use
|
|
8
|
-
|
|
9
|
-
- Trained ML model deployed as API service
|
|
10
|
-
- Image recognition / NLP / recommendation system
|
|
11
|
-
- Data analysis pipeline
|
|
12
|
-
- Research model publication and reproducibility
|
|
13
|
-
- Edge ML deployment (mobile/embedded)
|
|
14
|
-
|
|
15
|
-
## Deployment Options
|
|
16
|
-
|
|
17
|
-
### FastAPI + Uvicorn (Recommended for APIs)
|
|
18
|
-
|
|
19
|
-
```python
|
|
20
|
-
from fastapi import FastAPI
|
|
21
|
-
from transformers import pipeline
|
|
22
|
-
import torch
|
|
23
|
-
|
|
24
|
-
app = FastAPI()
|
|
25
|
-
|
|
26
|
-
# Load model once at startup
|
|
27
|
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
28
|
-
classifier = pipeline("text-classification", model="my-model", device=device)
|
|
29
|
-
|
|
30
|
-
@app.post("/predict")
|
|
31
|
-
async def predict(text: str):
|
|
32
|
-
return classifier(text)
|
|
33
|
-
|
|
34
|
-
@app.get("/health")
|
|
35
|
-
async def health():
|
|
36
|
-
return {"status": "ok", "device": device}
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
### Flask + Gunicorn
|
|
40
|
-
|
|
41
|
-
```python
|
|
42
|
-
from flask import Flask, request, jsonify
|
|
43
|
-
import pickle
|
|
44
|
-
|
|
45
|
-
app = Flask(__name__)
|
|
46
|
-
model = pickle.load(open("model.pkl", "rb"))
|
|
47
|
-
|
|
48
|
-
@app.route("/predict", methods=["POST"])
|
|
49
|
-
def predict():
|
|
50
|
-
data = request.json
|
|
51
|
-
result = model.predict([data["features"]])
|
|
52
|
-
return jsonify({"prediction": result.tolist()})
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
### ONNX Runtime (High-Performance Inference)
|
|
56
|
-
|
|
57
|
-
```python
|
|
58
|
-
import onnxruntime as ort
|
|
59
|
-
import numpy as np
|
|
60
|
-
|
|
61
|
-
# CPU inference
|
|
62
|
-
session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
|
|
63
|
-
|
|
64
|
-
# GPU inference (CUDA)
|
|
65
|
-
session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
|
|
66
|
-
|
|
67
|
-
# TensorRT (fastest, NVIDIA GPU only)
|
|
68
|
-
session = ort.InferenceSession("model.onnx", providers=["TensorrtExecutionProvider", "CUDAExecutionProvider"])
|
|
69
|
-
|
|
70
|
-
result = session.run(None, {"input": input_data.astype(np.float32)})
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
### Model Export
|
|
74
|
-
|
|
75
|
-
```python
|
|
76
|
-
# PyTorch → ONNX
|
|
77
|
-
import torch
|
|
78
|
-
model = torch.load("model.pt")
|
|
79
|
-
model.eval()
|
|
80
|
-
dummy_input = torch.randn(1, 3, 224, 224)
|
|
81
|
-
torch.onnx.export(model, dummy_input, "model.onnx", opset_version=17)
|
|
82
|
-
|
|
83
|
-
# TensorFlow → SavedModel
|
|
84
|
-
model.save("saved_model/")
|
|
85
|
-
|
|
86
|
-
# TensorFlow → TFLite (mobile)
|
|
87
|
-
converter = tf.lite.TFLiteConverter.from_saved_model("saved_model/")
|
|
88
|
-
converter.optimizations = [tf.lite.Optimize.DEFAULT]
|
|
89
|
-
tflite_model = converter.convert()
|
|
90
|
-
with open("model.tflite", "wb") as f:
|
|
91
|
-
f.write(tflite_model)
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## Model Formats
|
|
95
|
-
|
|
96
|
-
| Format | Framework | Size | Inference Speed | Best For |
|
|
97
|
-
|--------|----------|------|----------------|----------|
|
|
98
|
-
| PyTorch (.pt) | PyTorch | Large | Standard | Research, training |
|
|
99
|
-
| ONNX (.onnx) | Cross-framework | Medium | Fast | Production APIs |
|
|
100
|
-
| SavedModel | TensorFlow | Large | Standard | TF Serving |
|
|
101
|
-
| TFLite (.tflite) | TensorFlow Lite | Small | Fast (mobile) | Mobile/edge |
|
|
102
|
-
| TensorRT | NVIDIA | Medium | Fastest (GPU) | NVIDIA GPU servers |
|
|
103
|
-
| GGUF | llama.cpp | Small | Fast (CPU) | Local LLM inference |
|
|
104
|
-
| CoreML (.mlmodel) | Apple | Medium | Fast (Apple) | iOS/macOS on-device |
|
|
105
|
-
| OpenVINO | Intel | Medium | Fast (Intel CPU) | Intel hardware |
|
|
106
|
-
|
|
107
|
-
## Docker (GPU)
|
|
108
|
-
|
|
109
|
-
```dockerfile
|
|
110
|
-
FROM nvidia/cuda:12.4-runtime-ubuntu22.04
|
|
111
|
-
RUN apt-get update && apt-get install -y python3 python3-pip && rm -rf /var/lib/apt/lists/*
|
|
112
|
-
WORKDIR /app
|
|
113
|
-
COPY requirements.txt .
|
|
114
|
-
RUN pip3 install --no-cache-dir -r requirements.txt
|
|
115
|
-
COPY . .
|
|
116
|
-
RUN groupadd -r appuser && useradd -r -g appuser appuser
|
|
117
|
-
USER appuser
|
|
118
|
-
EXPOSE 8000
|
|
119
|
-
HEALTHCHECK --interval=30s --timeout=3s CMD curl -f http://localhost:8000/health || exit 1
|
|
120
|
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
121
|
-
```
|
|
122
|
-
|
|
123
|
-
```dockerfile
|
|
124
|
-
# CPU-only (smaller)
|
|
125
|
-
FROM python:3.13-slim
|
|
126
|
-
WORKDIR /app
|
|
127
|
-
COPY requirements.txt .
|
|
128
|
-
RUN pip install --no-cache-dir -r requirements.txt
|
|
129
|
-
COPY . .
|
|
130
|
-
RUN groupadd -r appuser && useradd -r -g appuser appuser
|
|
131
|
-
USER appuser
|
|
132
|
-
EXPOSE 8000
|
|
133
|
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0"]
|
|
134
|
-
```
|
|
135
|
-
|
|
136
|
-
## Model Optimization
|
|
137
|
-
|
|
138
|
-
| Technique | Description | Typical Savings |
|
|
139
|
-
|-----------|-------------|----------------|
|
|
140
|
-
| Quantization (INT8) | Reduce precision from FP32 to INT8 | 4x smaller, 2-3x faster |
|
|
141
|
-
| Quantization (INT4) | Further reduction | 8x smaller (LLMs) |
|
|
142
|
-
| Pruning | Remove redundant weights | 2-10x smaller |
|
|
143
|
-
| Knowledge Distillation | Train smaller model from larger | Custom |
|
|
144
|
-
| ONNX Optimization | Graph optimization | 10-30% faster |
|
|
145
|
-
| TensorRT | NVIDIA GPU optimization | 2-5x faster |
|
|
146
|
-
|
|
147
|
-
```python
|
|
148
|
-
# ONNX quantization
|
|
149
|
-
from onnxruntime.quantization import quantize_dynamic, QuantType
|
|
150
|
-
quantize_dynamic("model.onnx", "model_quant.onnx", weight_type=QuantType.QUInt8)
|
|
151
|
-
```
|
|
152
|
-
|
|
153
|
-
## Common Pitfalls
|
|
154
|
-
|
|
155
|
-
| Issue | Fix |
|
|
156
|
-
|-------|-----|
|
|
157
|
-
| Model too large for deployment | Quantize (INT8/INT4); use ONNX; prune |
|
|
158
|
-
| GPU out of memory | Reduce batch size; use gradient checkpointing; quantize |
|
|
159
|
-
| Dependency conflicts | Use Docker isolation; pin exact versions |
|
|
160
|
-
| Slow inference | Convert to ONNX or TensorRT; use GPU; batch requests |
|
|
161
|
-
| CUDA version mismatch | Match PyTorch CUDA version with system CUDA; use Docker |
|
|
162
|
-
| Model loading time too long | Load once at startup; use model server (TorchServe, TF Serving) |
|
|
163
|
-
| Inconsistent results between environments | Fix random seeds; pin all dependency versions |
|
|
164
|
-
| ONNX export fails | Check opset version; use `torch.onnx.export` with `opset_version=17` |
|
|
1
|
+
# Python ML Model Packaging Sub-Skill
|
|
2
|
+
|
|
3
|
+
Package, optimize, and serve machine learning models for production.
|
|
4
|
+
|
|
5
|
+
**Current version**: Python 3.12+ / PyTorch 2.x / TensorFlow 2.17+ / ONNX Runtime 1.19+ (2025-2026)
|
|
6
|
+
|
|
7
|
+
## When to Use
|
|
8
|
+
|
|
9
|
+
- Trained ML model deployed as API service
|
|
10
|
+
- Image recognition / NLP / recommendation system
|
|
11
|
+
- Data analysis pipeline
|
|
12
|
+
- Research model publication and reproducibility
|
|
13
|
+
- Edge ML deployment (mobile/embedded)
|
|
14
|
+
|
|
15
|
+
## Deployment Options
|
|
16
|
+
|
|
17
|
+
### FastAPI + Uvicorn (Recommended for APIs)
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from fastapi import FastAPI
|
|
21
|
+
from transformers import pipeline
|
|
22
|
+
import torch
|
|
23
|
+
|
|
24
|
+
app = FastAPI()
|
|
25
|
+
|
|
26
|
+
# Load model once at startup
|
|
27
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
28
|
+
classifier = pipeline("text-classification", model="my-model", device=device)
|
|
29
|
+
|
|
30
|
+
@app.post("/predict")
|
|
31
|
+
async def predict(text: str):
|
|
32
|
+
return classifier(text)
|
|
33
|
+
|
|
34
|
+
@app.get("/health")
|
|
35
|
+
async def health():
|
|
36
|
+
return {"status": "ok", "device": device}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Flask + Gunicorn
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from flask import Flask, request, jsonify
|
|
43
|
+
import pickle
|
|
44
|
+
|
|
45
|
+
app = Flask(__name__)
|
|
46
|
+
model = pickle.load(open("model.pkl", "rb"))
|
|
47
|
+
|
|
48
|
+
@app.route("/predict", methods=["POST"])
|
|
49
|
+
def predict():
|
|
50
|
+
data = request.json
|
|
51
|
+
result = model.predict([data["features"]])
|
|
52
|
+
return jsonify({"prediction": result.tolist()})
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### ONNX Runtime (High-Performance Inference)
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import onnxruntime as ort
|
|
59
|
+
import numpy as np
|
|
60
|
+
|
|
61
|
+
# CPU inference
|
|
62
|
+
session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
|
|
63
|
+
|
|
64
|
+
# GPU inference (CUDA)
|
|
65
|
+
session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
|
|
66
|
+
|
|
67
|
+
# TensorRT (fastest, NVIDIA GPU only)
|
|
68
|
+
session = ort.InferenceSession("model.onnx", providers=["TensorrtExecutionProvider", "CUDAExecutionProvider"])
|
|
69
|
+
|
|
70
|
+
result = session.run(None, {"input": input_data.astype(np.float32)})
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Model Export
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
# PyTorch → ONNX
|
|
77
|
+
import torch
|
|
78
|
+
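model = torch.load("model.pt", weights_only=False)  # full pickled model: PyTorch 2.6+ defaults to weights_only=True; only load trusted files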
|
|
79
|
+
model.eval()
|
|
80
|
+
dummy_input = torch.randn(1, 3, 224, 224)
|
|
81
|
+
torch.onnx.export(model, dummy_input, "model.onnx", opset_version=17)
|
|
82
|
+
|
|
83
|
+
# TensorFlow → SavedModel
|
|
84
|
+
model.export("saved_model/")  # Keras 3 (TF 2.16+): model.save() only accepts .keras/.h5; export() writes a SavedModel
|
|
85
|
+
|
|
86
|
+
# TensorFlow → TFLite (mobile)
|
|
87
|
+
converter = tf.lite.TFLiteConverter.from_saved_model("saved_model/")
|
|
88
|
+
converter.optimizations = [tf.lite.Optimize.DEFAULT]
|
|
89
|
+
tflite_model = converter.convert()
|
|
90
|
+
with open("model.tflite", "wb") as f:
|
|
91
|
+
f.write(tflite_model)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Model Formats
|
|
95
|
+
|
|
96
|
+
| Format | Framework | Size | Inference Speed | Best For |
|
|
97
|
+
|--------|----------|------|----------------|----------|
|
|
98
|
+
| PyTorch (.pt) | PyTorch | Large | Standard | Research, training |
|
|
99
|
+
| ONNX (.onnx) | Cross-framework | Medium | Fast | Production APIs |
|
|
100
|
+
| SavedModel | TensorFlow | Large | Standard | TF Serving |
|
|
101
|
+
| TFLite (.tflite) | TensorFlow Lite | Small | Fast (mobile) | Mobile/edge |
|
|
102
|
+
| TensorRT | NVIDIA | Medium | Fastest (GPU) | NVIDIA GPU servers |
|
|
103
|
+
| GGUF | llama.cpp | Small | Fast (CPU) | Local LLM inference |
|
|
104
|
+
| CoreML (.mlmodel) | Apple | Medium | Fast (Apple) | iOS/macOS on-device |
|
|
105
|
+
| OpenVINO | Intel | Medium | Fast (Intel CPU) | Intel hardware |
|
|
106
|
+
|
|
107
|
+
## Docker (GPU)
|
|
108
|
+
|
|
109
|
+
```dockerfile
|
|
110
|
+
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
|
|
111
|
+
RUN apt-get update && apt-get install -y python3 python3-pip curl && rm -rf /var/lib/apt/lists/*
|
|
112
|
+
WORKDIR /app
|
|
113
|
+
COPY requirements.txt .
|
|
114
|
+
RUN pip3 install --no-cache-dir -r requirements.txt
|
|
115
|
+
COPY . .
|
|
116
|
+
RUN groupadd -r appuser && useradd -r -g appuser appuser
|
|
117
|
+
USER appuser
|
|
118
|
+
EXPOSE 8000
|
|
119
|
+
HEALTHCHECK --interval=30s --timeout=3s CMD curl -f http://localhost:8000/health || exit 1
|
|
120
|
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
```dockerfile
|
|
124
|
+
# CPU-only (smaller)
|
|
125
|
+
FROM python:3.13-slim
|
|
126
|
+
WORKDIR /app
|
|
127
|
+
COPY requirements.txt .
|
|
128
|
+
RUN pip install --no-cache-dir -r requirements.txt
|
|
129
|
+
COPY . .
|
|
130
|
+
RUN groupadd -r appuser && useradd -r -g appuser appuser
|
|
131
|
+
USER appuser
|
|
132
|
+
EXPOSE 8000
|
|
133
|
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0"]
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Model Optimization
|
|
137
|
+
|
|
138
|
+
| Technique | Description | Typical Savings |
|
|
139
|
+
|-----------|-------------|----------------|
|
|
140
|
+
| Quantization (INT8) | Reduce precision from FP32 to INT8 | 4x smaller, 2-3x faster |
|
|
141
|
+
| Quantization (INT4) | Further reduction | 8x smaller (LLMs) |
|
|
142
|
+
| Pruning | Remove redundant weights | 2-10x smaller |
|
|
143
|
+
| Knowledge Distillation | Train smaller model from larger | Custom |
|
|
144
|
+
| ONNX Optimization | Graph optimization | 10-30% faster |
|
|
145
|
+
| TensorRT | NVIDIA GPU optimization | 2-5x faster |
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
# ONNX quantization
|
|
149
|
+
from onnxruntime.quantization import quantize_dynamic, QuantType
|
|
150
|
+
quantize_dynamic("model.onnx", "model_quant.onnx", weight_type=QuantType.QUInt8)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Common Pitfalls
|
|
154
|
+
|
|
155
|
+
| Issue | Fix |
|
|
156
|
+
|-------|-----|
|
|
157
|
+
| Model too large for deployment | Quantize (INT8/INT4); use ONNX; prune |
|
|
158
|
+
| GPU out of memory | Reduce batch size; use gradient checkpointing (training only); quantize |
|
|
159
|
+
| Dependency conflicts | Use Docker isolation; pin exact versions |
|
|
160
|
+
| Slow inference | Convert to ONNX or TensorRT; use GPU; batch requests |
|
|
161
|
+
| CUDA version mismatch | Match PyTorch CUDA version with system CUDA; use Docker |
|
|
162
|
+
| Model loading time too long | Load once at startup; use model server (TorchServe, TF Serving) |
|
|
163
|
+
| Inconsistent results between environments | Fix random seeds; pin all dependency versions |
|
|
164
|
+
| ONNX export fails | Check opset version; use `torch.onnx.export` with `opset_version=17` |
|