hdsp-jupyter-extension 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. agent_server/langchain/MULTI_AGENT_ARCHITECTURE.md +1114 -0
  2. agent_server/langchain/__init__.py +2 -2
  3. agent_server/langchain/agent.py +72 -33
  4. agent_server/langchain/agent_factory.py +400 -0
  5. agent_server/langchain/agent_prompts/__init__.py +25 -0
  6. agent_server/langchain/agent_prompts/athena_query_prompt.py +71 -0
  7. agent_server/langchain/agent_prompts/planner_prompt.py +85 -0
  8. agent_server/langchain/agent_prompts/python_developer_prompt.py +123 -0
  9. agent_server/langchain/agent_prompts/researcher_prompt.py +38 -0
  10. agent_server/langchain/custom_middleware.py +652 -195
  11. agent_server/langchain/hitl_config.py +34 -10
  12. agent_server/langchain/middleware/__init__.py +24 -0
  13. agent_server/langchain/middleware/code_history_middleware.py +412 -0
  14. agent_server/langchain/middleware/description_injector.py +150 -0
  15. agent_server/langchain/middleware/skill_middleware.py +298 -0
  16. agent_server/langchain/middleware/subagent_events.py +171 -0
  17. agent_server/langchain/middleware/subagent_middleware.py +329 -0
  18. agent_server/langchain/prompts.py +96 -101
  19. agent_server/langchain/skills/data_analysis.md +236 -0
  20. agent_server/langchain/skills/data_loading.md +158 -0
  21. agent_server/langchain/skills/inference.md +392 -0
  22. agent_server/langchain/skills/model_training.md +318 -0
  23. agent_server/langchain/skills/pyspark.md +352 -0
  24. agent_server/langchain/subagents/__init__.py +20 -0
  25. agent_server/langchain/subagents/base.py +173 -0
  26. agent_server/langchain/tools/__init__.py +3 -0
  27. agent_server/langchain/tools/jupyter_tools.py +58 -20
  28. agent_server/langchain/tools/lsp_tools.py +1 -1
  29. agent_server/langchain/tools/shared/__init__.py +26 -0
  30. agent_server/langchain/tools/shared/qdrant_search.py +175 -0
  31. agent_server/langchain/tools/tool_registry.py +219 -0
  32. agent_server/langchain/tools/workspace_tools.py +197 -0
  33. agent_server/routers/config.py +40 -1
  34. agent_server/routers/langchain_agent.py +818 -337
  35. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
  36. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +7 -2
  37. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js +1108 -179
  38. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
  39. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js +3916 -8128
  40. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
  41. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js +17 -17
  42. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
  43. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/METADATA +1 -1
  44. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/RECORD +75 -51
  45. jupyter_ext/_version.py +1 -1
  46. jupyter_ext/handlers.py +59 -8
  47. jupyter_ext/labextension/build_log.json +1 -1
  48. jupyter_ext/labextension/package.json +7 -2
  49. jupyter_ext/labextension/static/{frontend_styles_index_js.2d9fb488c82498c45c2d.js → frontend_styles_index_js.037b3c8e5d6a92b63b16.js} +1108 -179
  50. jupyter_ext/labextension/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
  51. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js → jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js +3916 -8128
  52. jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
  53. jupyter_ext/labextension/static/{remoteEntry.9da31d1134a53b0c4af5.js → remoteEntry.a8e0b064eb9b1c1ff463.js} +17 -17
  54. jupyter_ext/labextension/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
  55. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
  56. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +0 -1
  57. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +0 -1
  58. jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
  59. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +0 -1
  60. jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +0 -1
  61. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
  62. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
  63. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
  64. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
  65. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
  66. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
  67. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
  68. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
  69. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
  70. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
  71. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
  72. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
  73. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
  74. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
  75. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
  76. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
  77. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
  78. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
  79. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
  80. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/WHEEL +0 -0
  81. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,392 @@
1
+ ---
2
+ name: inference
3
+ description: 모델 추론 최적화. 추론 속도가 느리거나 배치 처리가 필요할 때 사용. 배치 추론, 양자화(INT8/FP16), ONNX 변환, TensorRT 가속 가이드 제공.
4
+ ---
5
+
6
+ # Model Inference Optimization Guide
7
+
8
+ 모델 추론 속도를 높이고 리소스 효율을 개선하는 방법을 안내합니다.
9
+
10
+ ## Resource Tiers
11
+
12
+ ### TIER_SMALL: CPU only 또는 단일 추론
13
+ - 배치 처리 불필요
14
+ - ONNX Runtime CPU 권장
15
+ - 동적 양자화(INT8) 적용
16
+
17
+ ### TIER_MEDIUM: GPU 사용, 중간 규모 추론
18
+ - 배치 크기: 8-64
19
+ - FP16 추론
20
+ - ONNX Runtime GPU 또는 TorchScript
21
+
22
+ ### TIER_LARGE: 대규모 서빙, 고성능 필요
23
+ - 배치 크기: 64-256
24
+ - TensorRT 또는 TensorRT-LLM
25
+ - INT8 양자화 + 배치 최적화
26
+
27
+ ---
28
+
29
+ ## 1. 배치 추론 (Batch Inference)
30
+
31
+ ### PyTorch 배치 처리
32
+ ```python
33
+ import torch
34
+
35
+ model.eval()
36
+
37
+ # 개별 추론 (느림)
38
+ # for item in items:
39
+ # result = model(item)
40
+
41
+ # 배치 추론 (빠름)
42
+ batch = torch.stack(items) # [N, C, H, W]
43
+ with torch.no_grad():
44
+ results = model(batch)
45
+ ```
46
+
47
+ ### 동적 배칭 (Dynamic Batching)
48
+ ```python
49
+ def dynamic_batch_inference(items, model, max_batch=32, max_wait_ms=50):
50
+ """요청을 모아서 배치 처리"""
51
+ import time
52
+
53
+ batch = []
54
+ start_time = time.time()
55
+
56
+ for item in items:
57
+ batch.append(item)
58
+
59
+ # 배치가 꽉 차거나 대기 시간 초과
60
+ if len(batch) >= max_batch or (time.time() - start_time) * 1000 > max_wait_ms:
61
+ batch_tensor = torch.stack(batch)
62
+ with torch.no_grad():
63
+ yield model(batch_tensor)
64
+ batch = []
65
+ start_time = time.time()
66
+
67
+ # 남은 배치 처리
68
+ if batch:
69
+ batch_tensor = torch.stack(batch)
70
+ with torch.no_grad():
71
+ yield model(batch_tensor)
72
+ ```
73
+
74
+ ### 최적 배치 크기
75
+ | GPU Memory | 권장 Batch Size | 참고 |
76
+ |------------|----------------|------|
77
+ | 4GB | 8-16 | 작은 모델 |
78
+ | 8GB | 16-32 | 중간 모델 |
79
+ | 16GB | 32-64 | 큰 모델 |
80
+ | 24GB+ | 64-128 | 대형 모델 |
81
+
82
+ ---
83
+
84
+ ## 2. 양자화 (Quantization)
85
+
86
+ ### PyTorch 동적 양자화 (가장 간단)
87
+ ```python
88
+ import torch
89
+
90
+ # 동적 양자화: 추론 시 가중치를 INT8로 변환
91
+ model_int8 = torch.quantization.quantize_dynamic(
92
+ model,
93
+ {torch.nn.Linear, torch.nn.LSTM}, # 양자화할 레이어 타입
94
+ dtype=torch.qint8
95
+ )
96
+
97
+ # 모델 크기 약 4배 감소
98
+ # 추론 속도 2-4배 향상 (CPU)
99
+ ```
100
+
101
+ ### PyTorch 정적 양자화 (더 빠름)
102
+ ```python
103
+ import torch
104
+ from torch.quantization import get_default_qconfig, prepare, convert
105
+
106
+ # 1. 양자화 설정
107
+ model.qconfig = get_default_qconfig('fbgemm') # CPU용
108
+ # model.qconfig = get_default_qconfig('qnnpack') # 모바일용
109
+
110
+ # 2. 준비 (observer 삽입)
111
+ model_prepared = prepare(model)
112
+
113
+ # 3. 캘리브레이션 (대표 데이터로)
114
+ with torch.no_grad():
115
+ for data in calibration_loader:
116
+ model_prepared(data)
117
+
118
+ # 4. 변환
119
+ model_quantized = convert(model_prepared)
120
+ ```
121
+
122
+ ### 양자화 효과
123
+ | Precision | Model Size | Speed (CPU) | Speed (GPU) | Accuracy |
124
+ |-----------|------------|-------------|-------------|----------|
125
+ | FP32 | 1x | 1x | 1x | Baseline |
126
+ | FP16 | 0.5x | 1x | 1.5-2x | ~동일 |
127
+ | INT8 | 0.25x | 2-4x | 2-3x | 약간 감소 |
128
+
129
+ ---
130
+
131
+ ## 3. ONNX 변환 및 최적화
132
+
133
+ ### PyTorch → ONNX 변환
134
+ ```python
135
+ import torch
136
+
137
+ model.eval()
138
+ dummy_input = torch.randn(1, 3, 224, 224)
139
+
140
+ torch.onnx.export(
141
+ model,
142
+ dummy_input,
143
+ "model.onnx",
144
+ input_names=["input"],
145
+ output_names=["output"],
146
+ dynamic_axes={
147
+ "input": {0: "batch_size"}, # 동적 배치 지원
148
+ "output": {0: "batch_size"}
149
+ },
150
+ opset_version=17 # 최신 opset 권장
151
+ )
152
+ ```
153
+
154
+ ### ONNX Runtime 추론
155
+ ```python
156
+ import onnxruntime as ort
157
+ import numpy as np
158
+
159
+ # CPU 세션
160
+ session = ort.InferenceSession(
161
+ "model.onnx",
162
+ providers=['CPUExecutionProvider']
163
+ )
164
+
165
+ # GPU 세션 (CUDA)
166
+ session = ort.InferenceSession(
167
+ "model.onnx",
168
+ providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
169
+ )
170
+
171
+ # 추론
172
+ input_name = session.get_inputs()[0].name
173
+ output = session.run(None, {input_name: input_data.numpy()})
174
+ ```
175
+
176
+ ### ONNX 모델 최적화
177
+ ```python
178
+ import onnx
179
+ from onnxruntime.transformers import optimizer
180
+
181
+ # 자동 최적화 (Transformer 모델용)
182
+ optimized_model = optimizer.optimize_model(
183
+ "model.onnx",
184
+ model_type='bert', # 'gpt2', 'bert', 'vit' 등
185
+ num_heads=12,
186
+ hidden_size=768
187
+ )
188
+ optimized_model.save_model_to_file("model_optimized.onnx")
189
+ ```
190
+
191
+ ### ONNX 속도 향상
192
+ - PyTorch → ONNX: **CPU 최대 3배**, GPU 1.5-2배 속도 향상
193
+ - 추가 그래프 최적화: 10-30% 추가 향상
194
+
195
+ ---
196
+
197
+ ## 4. TensorRT 가속
198
+
199
+ ### ONNX → TensorRT 변환
200
+ ```python
201
+ import tensorrt as trt
202
+
203
+ logger = trt.Logger(trt.Logger.WARNING)
204
+ builder = trt.Builder(logger)
205
+ network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
206
+ parser = trt.OnnxParser(network, logger)
207
+
208
+ # ONNX 모델 파싱
209
+ with open("model.onnx", "rb") as f:
210
+ parser.parse(f.read())
211
+
212
+ # 빌더 설정
213
+ config = builder.create_builder_config()
214
+ config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB
215
+
216
+ # FP16 활성화 (Volta 이상)
217
+ config.set_flag(trt.BuilderFlag.FP16)
218
+
219
+ # INT8 활성화 (캘리브레이션 필요)
220
+ # config.set_flag(trt.BuilderFlag.INT8)
221
+ # config.int8_calibrator = calibrator
222
+
223
+ # 엔진 빌드
224
+ engine = builder.build_serialized_network(network, config)
225
+
226
+ # 저장
227
+ with open("model.trt", "wb") as f:
228
+ f.write(engine)
229
+ ```
230
+
231
+ ### Torch-TensorRT (더 간단)
232
+ ```python
233
+ import torch
234
+ import torch_tensorrt
235
+
236
+ model = model.eval().cuda()
237
+
238
+ # TensorRT 컴파일
239
+ trt_model = torch_tensorrt.compile(
240
+ model,
241
+ inputs=[
242
+ torch_tensorrt.Input(
243
+ min_shape=[1, 3, 224, 224],
244
+ opt_shape=[8, 3, 224, 224],
245
+ max_shape=[32, 3, 224, 224],
246
+ dtype=torch.float16
247
+ )
248
+ ],
249
+ enabled_precisions={torch.float16},
250
+ workspace_size=1 << 30
251
+ )
252
+
253
+ # 추론
254
+ with torch.no_grad():
255
+ output = trt_model(input_tensor.cuda().half())
256
+ ```
257
+
258
+ ### TensorRT 속도 향상
259
+ - PyTorch → TensorRT: **GPU 최대 5배** 속도 향상
260
+ - FP16 + TensorRT: 일반적으로 3-4배 향상
261
+ - INT8 + TensorRT: 5-8배 가능
262
+
263
+ ---
264
+
265
+ ## 5. Hugging Face Transformers 최적화
266
+
267
+ ### 기본 최적화
268
+ ```python
269
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
270
+ import torch
271
+
272
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
273
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
274
+
275
+ # FP16 추론
276
+ model = model.half().cuda()
277
+
278
+ # BetterTransformer (Flash Attention)
279
+ model = model.to_bettertransformer()
280
+
281
+ # 추론
282
+ with torch.no_grad():
283
+ inputs = tokenizer(texts, return_tensors="pt", padding=True).to("cuda")
284
+ outputs = model(**inputs)
285
+ ```
286
+
287
+ ### Optimum + ONNX Runtime
288
+ ```python
289
+ from optimum.onnxruntime import ORTModelForSequenceClassification
290
+ from transformers import AutoTokenizer
291
+
292
+ # ONNX 변환 및 로드
293
+ model = ORTModelForSequenceClassification.from_pretrained(
294
+ "bert-base-uncased",
295
+ export=True,
296
+ provider="CUDAExecutionProvider"
297
+ )
298
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
299
+
300
+ # 추론
301
+ inputs = tokenizer(texts, return_tensors="pt", padding=True)
302
+ outputs = model(**inputs)
303
+ ```
304
+
305
+ ---
306
+
307
+ ## 6. 추론 서버 최적화
308
+
309
+ ### torch.inference_mode (torch.no_grad 보다 빠름)
310
+ ```python
311
+ # torch.no_grad() 대신 사용
312
+ with torch.inference_mode():
313
+ output = model(input_tensor)
314
+ ```
315
+
316
+ ### CUDA Graphs (반복 추론용)
317
+ ```python
318
+ import torch
319
+
320
+ # 워밍업
321
+ s = torch.cuda.Stream()
322
+ s.wait_stream(torch.cuda.current_stream())
323
+ with torch.cuda.stream(s):
324
+ for _ in range(3):
325
+ output = model(static_input)
326
+ torch.cuda.current_stream().wait_stream(s)
327
+
328
+ # 그래프 캡처
329
+ g = torch.cuda.CUDAGraph()
330
+ with torch.cuda.graph(g):
331
+ static_output = model(static_input)
332
+
333
+ # 추론 (그래프 재생)
334
+ g.replay()
335
+ result = static_output.clone()
336
+ ```
337
+
338
+ ---
339
+
340
+ ## 7. 메모리 최적화
341
+
342
+ ```python
343
+ import torch
344
+
345
+ # 추론 후 캐시 정리
346
+ torch.cuda.empty_cache()
347
+
348
+ # 그래디언트 비활성화 (메모리 절약)
349
+ for param in model.parameters():
350
+ param.requires_grad = False
351
+
352
+ # 스트리밍 추론 (대용량 데이터)
353
+ def stream_inference(dataloader, model):
354
+ model.eval()
355
+ with torch.inference_mode():
356
+ for batch in dataloader:
357
+ yield model(batch.cuda())
358
+ torch.cuda.empty_cache() # 배치마다 정리
359
+ ```
360
+
361
+ ---
362
+
363
+ ## Quick Reference: 최적화 선택 가이드
364
+
365
+ | 상황 | 권장 방법 | 예상 향상 |
366
+ |------|----------|----------|
367
+ | CPU 추론 | ONNX Runtime + INT8 | 3-4x |
368
+ | GPU 추론 (간단) | FP16 + torch.compile | 1.5-2x |
369
+ | GPU 추론 (최대 성능) | TensorRT + FP16/INT8 | 3-5x |
370
+ | Transformer 모델 | BetterTransformer + ONNX | 2-3x |
371
+ | 대량 추론 | 배치 처리 + TensorRT | 5-10x |
372
+
373
+ ### 빠른 적용 체크리스트
374
+ ```python
375
+ # 1. 기본 최적화
376
+ model.eval()
377
+ with torch.inference_mode():
378
+ ...
379
+
380
+ # 2. FP16 (GPU)
381
+ model = model.half().cuda()
382
+
383
+ # 3. torch.compile (PyTorch 2.0+)
384
+ model = torch.compile(model, mode="reduce-overhead")
385
+
386
+ # 4. ONNX 변환 (더 빠른 추론 필요시)
387
+ torch.onnx.export(model, ...)
388
+
389
+ # 5. TensorRT (최대 성능 필요시)
390
+ torch_tensorrt.compile(model, ...)
391
+ ```
392
+
@@ -0,0 +1,318 @@
1
+ ---
2
+ name: model-training
3
+ description: 모델 훈련 최적화. GPU 메모리 부족, 훈련 속도 개선 시 사용. mixed precision(fp16/bf16), gradient checkpointing, batch size 튜닝, optimizer 최적화 제공.
4
+ ---
5
+
6
+ # Model Training Optimization Guide
7
+
8
+ GPU/CPU 환경에서 효율적인 모델 훈련 방법을 안내합니다.
9
+
10
+ ## Resource Tiers
11
+
12
+ ### TIER_SMALL: GPU < 8GB 또는 CPU only
13
+ - batch_size: 4-16
14
+ - gradient_checkpointing: 필수
15
+ - mixed precision: 권장
16
+ - optimizer: 8-bit Adam
17
+
18
+ ### TIER_MEDIUM: GPU 8-24GB (RTX 3090, T4, A10)
19
+ - batch_size: 16-64
20
+ - gradient_checkpointing: 선택
21
+ - mixed precision: fp16/bf16 필수
22
+ - optimizer: AdamW 또는 8-bit Adam
23
+
24
+ ### TIER_LARGE: GPU > 24GB (A100, H100)
25
+ - batch_size: 64-256
26
+ - gradient_checkpointing: 불필요
27
+ - mixed precision: bf16 권장
28
+ - optimizer: AdamW, CUDA Graphs 활용
29
+
30
+ ---
31
+
32
+ ## 1. Mixed Precision Training
33
+
34
+ ### PyTorch Native (권장)
35
+ ```python
36
+ import torch
37
+ from torch.cuda.amp import autocast, GradScaler
38
+
39
+ model = model.cuda()
40
+ optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
41
+ scaler = GradScaler()
42
+
43
+ for batch in dataloader:
44
+ optimizer.zero_grad()
45
+
46
+ # Mixed precision forward pass
47
+ with autocast():
48
+ outputs = model(batch["input_ids"].cuda())
49
+ loss = criterion(outputs, batch["labels"].cuda())
50
+
51
+ # Scaled backward pass
52
+ scaler.scale(loss).backward()
53
+ scaler.step(optimizer)
54
+ scaler.update()
55
+ ```
56
+
57
+ ### Hugging Face Transformers
58
+ ```python
59
+ from transformers import TrainingArguments, Trainer
60
+
61
+ training_args = TrainingArguments(
62
+ output_dir="./results",
63
+ per_device_train_batch_size=16,
64
+
65
+ # Mixed Precision 선택 (GPU 아키텍처에 따라)
66
+ fp16=True, # Volta, Turing, Ampere
67
+ # bf16=True, # Ampere 이상 권장 (더 안정적)
68
+ )
69
+ ```
70
+
71
+ ### Data Type 선택 가이드
72
+
73
+ | GPU Architecture | Recommended | Notes |
74
+ |------------------|-------------|-------|
75
+ | Volta (V100) | fp16 | bf16 미지원 |
76
+ | Turing (RTX 20xx, T4) | fp16 | bf16 미지원 |
77
+ | Ampere (A100, RTX 30xx) | bf16 | fp16도 가능, bf16이 더 안정적 |
78
+ | Hopper (H100) | bf16 | FP8도 지원 |
79
+
80
+ ---
81
+
82
+ ## 2. Gradient Checkpointing
83
+
84
+ 메모리를 50-80% 절약하는 대신 훈련 속도가 20-30% 느려지는 트레이드오프가 있습니다.
85
+
86
+ ### PyTorch
87
+ ```python
88
+ from torch.utils.checkpoint import checkpoint_sequential
89
+
90
+ # 모델 레이어를 청크로 나누어 체크포인팅
91
+ model = nn.Sequential(layer1, layer2, layer3, layer4)
92
+ output = checkpoint_sequential(model, 2, input) # 2개 청크로 분할
93
+ ```
94
+
95
+ ### Hugging Face Transformers
96
+ ```python
97
+ from transformers import TrainingArguments
98
+
99
+ training_args = TrainingArguments(
100
+ output_dir="./results",
101
+ per_device_train_batch_size=4,
102
+ gradient_checkpointing=True, # 활성화
103
+ gradient_accumulation_steps=16, # effective batch = 4 * 16 = 64
104
+ )
105
+ ```
106
+
107
+ ### 언제 사용?
108
+ - GPU 메모리 부족 (OOM)
109
+ - 큰 모델 (7B+ 파라미터)
110
+ - 작은 GPU (< 16GB)
111
+
112
+ ---
113
+
114
+ ## 3. Batch Size & Gradient Accumulation
115
+
116
+ ### 최적 Batch Size 찾기
117
+ ```python
118
+ # 1. 최대 batch size 찾기 (OOM 직전까지)
119
+ # 2. 2의 거듭제곱 사용 (8, 16, 32, 64, 128)
120
+ # 3. fp16: 8의 배수, A100: 64의 배수
121
+
122
+ # Gradient Accumulation으로 effective batch size 확보
123
+ training_args = TrainingArguments(
124
+ per_device_train_batch_size=4, # GPU에 맞는 최대값
125
+ gradient_accumulation_steps=16, # 4 * 16 = 64 effective
126
+ )
127
+ ```
128
+
129
+ ### Batch Size 권장표
130
+
131
+ | GPU Memory | Max Batch (fp32) | Max Batch (fp16) | Recommended |
132
+ |------------|------------------|------------------|-------------|
133
+ | 8GB | 4-8 | 8-16 | 8 (fp16) |
134
+ | 16GB | 8-16 | 16-32 | 16 (fp16) |
135
+ | 24GB | 16-32 | 32-64 | 32 (fp16) |
136
+ | 40GB+ | 32-64 | 64-128 | 64 (bf16) |
137
+
138
+ ---
139
+
140
+ ## 4. Optimizer 최적화
141
+
142
+ ### 8-bit Adam (메모리 절약)
143
+ ```python
144
+ # bitsandbytes 설치 필요: pip install bitsandbytes
145
+ import bitsandbytes as bnb
146
+
147
+ optimizer = bnb.optim.Adam8bit(
148
+ model.parameters(),
149
+ lr=1e-4,
150
+ betas=(0.9, 0.999)
151
+ )
152
+
153
+ # Hugging Face
154
+ training_args = TrainingArguments(
155
+ optim="adamw_bnb_8bit", # 8-bit AdamW
156
+ )
157
+ ```
158
+
159
+ ### Adafactor (대용량 모델용)
160
+ ```python
161
+ from transformers import Adafactor
162
+
163
+ optimizer = Adafactor(
164
+ model.parameters(),
165
+ scale_parameter=True,
166
+ relative_step=True,
167
+ warmup_init=True,
168
+ lr=None # relative_step=True면 자동 조절
169
+ )
170
+ ```
171
+
172
+ ---
173
+
174
+ ## 5. Data Loading 최적화
175
+
176
+ ```python
177
+ from torch.utils.data import DataLoader
178
+
179
+ dataloader = DataLoader(
180
+ dataset,
181
+ batch_size=32,
182
+ shuffle=True,
183
+ num_workers=4, # CPU 코어 수에 맞게
184
+ pin_memory=True, # GPU 전송 가속
185
+ prefetch_factor=2, # 미리 로드할 배치 수
186
+ persistent_workers=True # 워커 재사용
187
+ )
188
+
189
+ # Hugging Face
190
+ training_args = TrainingArguments(
191
+ dataloader_pin_memory=True,
192
+ dataloader_num_workers=4,
193
+ )
194
+ ```
195
+
196
+ ---
197
+
198
+ ## 6. torch.compile (PyTorch 2.0+)
199
+
200
+ ```python
201
+ import torch
202
+
203
+ # 모델 컴파일 (최대 2x 속도 향상)
204
+ model = torch.compile(model, mode="reduce-overhead")
205
+
206
+ # Hugging Face
207
+ training_args = TrainingArguments(
208
+ torch_compile=True,
209
+ torch_compile_backend="inductor", # 기본값, 최적
210
+ )
211
+ ```
212
+
213
+ ### 컴파일 모드
214
+
215
+ | Mode | Speed | Memory | Use Case |
216
+ |------|-------|--------|----------|
217
+ | default | Good | Neutral | 일반적인 경우 |
218
+ | reduce-overhead | Best | Slight increase | 긴 훈련 |
219
+ | max-autotune | Best | Neutral | 시간 여유 있을 때 |
220
+
221
+ ---
222
+
223
+ ## 7. sklearn 모델 최적화
224
+
225
+ ### CPU 병렬화
226
+ ```python
227
+ from sklearn.ensemble import RandomForestClassifier
228
+
229
+ # n_jobs=-1: 모든 CPU 코어 사용
230
+ model = RandomForestClassifier(
231
+ n_estimators=100,
232
+ n_jobs=-1, # 병렬 처리
233
+ random_state=42
234
+ )
235
+
236
+ # GridSearchCV도 병렬화
237
+ from sklearn.model_selection import GridSearchCV
238
+ grid_search = GridSearchCV(
239
+ model, param_grid,
240
+ cv=5,
241
+ n_jobs=-1 # 병렬 교차 검증
242
+ )
243
+ ```
244
+
245
+ ### 점진적 학습 (대용량 데이터)
246
+ ```python
247
+ from sklearn.linear_model import SGDClassifier
248
+
249
+ model = SGDClassifier(warm_start=True)
250
+
251
+ # 청크별 학습
252
+ for chunk in pd.read_csv("large_data.csv", chunksize=10000):
253
+ X, y = chunk.drop("target", axis=1), chunk["target"]
254
+ model.partial_fit(X, y, classes=[0, 1])
255
+ ```
256
+
257
+ ---
258
+
259
+ ## 8. 메모리 모니터링
260
+
261
+ ```python
262
+ import torch
263
+
264
+ # GPU 메모리 확인
265
+ print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
266
+ print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
267
+
268
+ # 메모리 정리
269
+ torch.cuda.empty_cache()
270
+
271
+ # 훈련 중 주기적 정리 (Hugging Face)
272
+ training_args = TrainingArguments(
273
+ torch_empty_cache_steps=4, # 4스텝마다 캐시 정리
274
+ )
275
+ ```
276
+
277
+ ---
278
+
279
+ ## Quick Reference: 최적 설정 조합
280
+
281
+ ### TIER_SMALL (GPU < 8GB)
282
+ ```python
283
+ TrainingArguments(
284
+ per_device_train_batch_size=4,
285
+ gradient_accumulation_steps=16,
286
+ gradient_checkpointing=True,
287
+ fp16=True,
288
+ optim="adamw_bnb_8bit",
289
+ dataloader_num_workers=2,
290
+ )
291
+ ```
292
+
293
+ ### TIER_MEDIUM (GPU 8-24GB)
294
+ ```python
295
+ TrainingArguments(
296
+ per_device_train_batch_size=16,
297
+ gradient_accumulation_steps=4,
298
+ bf16=True, # or fp16
299
+ optim="adamw_torch",
300
+ dataloader_pin_memory=True,
301
+ dataloader_num_workers=4,
302
+ torch_compile=True,
303
+ )
304
+ ```
305
+
306
+ ### TIER_LARGE (GPU > 24GB)
307
+ ```python
308
+ TrainingArguments(
309
+ per_device_train_batch_size=64,
310
+ bf16=True,
311
+ optim="adamw_torch",
312
+ dataloader_pin_memory=True,
313
+ dataloader_num_workers=8,
314
+ torch_compile=True,
315
+ torch_compile_backend="inductor",
316
+ )
317
+ ```
318
+