hdsp-jupyter-extension 2.0.10__py3-none-any.whl → 2.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_server/core/notebook_generator.py +4 -4
- agent_server/langchain/MULTI_AGENT_ARCHITECTURE.md +1114 -0
- agent_server/langchain/__init__.py +2 -2
- agent_server/langchain/agent.py +72 -33
- agent_server/langchain/agent_factory.py +400 -0
- agent_server/langchain/agent_prompts/__init__.py +25 -0
- agent_server/langchain/agent_prompts/athena_query_prompt.py +71 -0
- agent_server/langchain/agent_prompts/planner_prompt.py +85 -0
- agent_server/langchain/agent_prompts/python_developer_prompt.py +123 -0
- agent_server/langchain/agent_prompts/researcher_prompt.py +38 -0
- agent_server/langchain/custom_middleware.py +656 -113
- agent_server/langchain/hitl_config.py +38 -9
- agent_server/langchain/llm_factory.py +1 -85
- agent_server/langchain/middleware/__init__.py +24 -0
- agent_server/langchain/middleware/code_history_middleware.py +412 -0
- agent_server/langchain/middleware/description_injector.py +150 -0
- agent_server/langchain/middleware/skill_middleware.py +298 -0
- agent_server/langchain/middleware/subagent_events.py +171 -0
- agent_server/langchain/middleware/subagent_middleware.py +329 -0
- agent_server/langchain/prompts.py +107 -135
- agent_server/langchain/skills/data_analysis.md +236 -0
- agent_server/langchain/skills/data_loading.md +158 -0
- agent_server/langchain/skills/inference.md +392 -0
- agent_server/langchain/skills/model_training.md +318 -0
- agent_server/langchain/skills/pyspark.md +352 -0
- agent_server/langchain/subagents/__init__.py +20 -0
- agent_server/langchain/subagents/base.py +173 -0
- agent_server/langchain/tools/__init__.py +3 -0
- agent_server/langchain/tools/jupyter_tools.py +58 -20
- agent_server/langchain/tools/lsp_tools.py +1 -1
- agent_server/langchain/tools/shared/__init__.py +26 -0
- agent_server/langchain/tools/shared/qdrant_search.py +175 -0
- agent_server/langchain/tools/tool_registry.py +219 -0
- agent_server/langchain/tools/workspace_tools.py +197 -0
- agent_server/prompts/file_action_prompts.py +8 -8
- agent_server/routers/config.py +40 -1
- agent_server/routers/langchain_agent.py +868 -321
- hdsp_agent_core/__init__.py +46 -47
- hdsp_agent_core/factory.py +6 -10
- hdsp_agent_core/interfaces.py +4 -2
- hdsp_agent_core/knowledge/__init__.py +5 -5
- hdsp_agent_core/knowledge/chunking.py +87 -61
- hdsp_agent_core/knowledge/loader.py +103 -101
- hdsp_agent_core/llm/service.py +192 -107
- hdsp_agent_core/managers/config_manager.py +16 -22
- hdsp_agent_core/managers/session_manager.py +5 -4
- hdsp_agent_core/models/__init__.py +12 -12
- hdsp_agent_core/models/agent.py +15 -8
- hdsp_agent_core/models/common.py +1 -2
- hdsp_agent_core/models/rag.py +48 -111
- hdsp_agent_core/prompts/__init__.py +12 -12
- hdsp_agent_core/prompts/cell_action_prompts.py +9 -7
- hdsp_agent_core/services/agent_service.py +10 -8
- hdsp_agent_core/services/chat_service.py +10 -6
- hdsp_agent_core/services/rag_service.py +3 -6
- hdsp_agent_core/tests/conftest.py +4 -1
- hdsp_agent_core/tests/test_factory.py +2 -2
- hdsp_agent_core/tests/test_services.py +12 -19
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +7 -2
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js +1108 -179
- hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
- jupyter_ext/labextension/static/lib_index_js.dc6434bee96ab03a0539.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js +3936 -8144
- hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.4a252df3ade74efee8d6.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js +17 -17
- hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
- {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/METADATA +1 -1
- {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/RECORD +100 -76
- jupyter_ext/__init__.py +21 -11
- jupyter_ext/_version.py +1 -1
- jupyter_ext/handlers.py +128 -58
- jupyter_ext/labextension/build_log.json +1 -1
- jupyter_ext/labextension/package.json +7 -2
- jupyter_ext/labextension/static/{frontend_styles_index_js.2d9fb488c82498c45c2d.js → frontend_styles_index_js.037b3c8e5d6a92b63b16.js} +1108 -179
- jupyter_ext/labextension/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.dc6434bee96ab03a0539.js → jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js +3936 -8144
- jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
- jupyter_ext/labextension/static/{remoteEntry.4a252df3ade74efee8d6.js → remoteEntry.a8e0b064eb9b1c1ff463.js} +17 -17
- jupyter_ext/labextension/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.dc6434bee96ab03a0539.js.map +0 -1
- hdsp_jupyter_extension-2.0.10.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.4a252df3ade74efee8d6.js.map +0 -1
- jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
- jupyter_ext/labextension/static/lib_index_js.dc6434bee96ab03a0539.js.map +0 -1
- jupyter_ext/labextension/static/remoteEntry.4a252df3ade74efee8d6.js.map +0 -1
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
- {hdsp_jupyter_extension-2.0.10.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
- {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/WHEEL +0 -0
- {hdsp_jupyter_extension-2.0.10.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/licenses/LICENSE +0 -0
agent_server/langchain/skills/data_analysis.md
@@ -0,0 +1,236 @@
---
name: data-analysis
description: DataFrame operation optimization. Use for memory-intensive operations such as groupby, merge, and pivot. Provides vectorized operations, query optimization, and memory-efficient patterns.
---

# Data Analysis Optimization Guide

A guide to running pandas DataFrame analysis operations memory-efficiently.

## Resource Tiers

### TIER_SMALL: DataFrame < 1GB, ample free RAM
Plain pandas operations are fine.

### TIER_MEDIUM: DataFrame 1-5GB
Vectorized operations and query optimization are required.

### TIER_LARGE: DataFrame > 5GB, or memory is tight
Use Dask/Polars or process in chunks; a tier check is sketched below.
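
Which tier applies can be checked directly. A minimal sketch with a hypothetical `pick_tier` helper, using the thresholds above:

```python
import pandas as pd

def pick_tier(df: pd.DataFrame) -> str:
    """Hypothetical helper: map in-memory size to the tiers above."""
    size_gb = df.memory_usage(deep=True).sum() / 1024**3
    if size_gb < 1:
        return "TIER_SMALL"
    if size_gb <= 5:
        return "TIER_MEDIUM"
    return "TIER_LARGE"
```
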
---

## 1. Vectorization

### Bad: row-by-row loop
```python
# Slow - never do this
for i in range(len(df)):
    df.loc[i, "new_col"] = df.loc[i, "col1"] * 2
```

### Good: vectorized
```python
# Fast - always use this
df["new_col"] = df["col1"] * 2
```

### Conditional operations
```python
# Bad: apply
df["category"] = df["value"].apply(lambda x: "high" if x > 100 else "low")

# Good: np.where (10x+ faster)
import numpy as np
df["category"] = np.where(df["value"] > 100, "high", "low")

# Multiple conditions: np.select
conditions = [
    df["value"] > 100,
    df["value"] > 50,
]
choices = ["high", "medium"]
df["category"] = np.select(conditions, choices, default="low")
```

---

## 2. GroupBy Optimization

### Basic optimization
```python
# sort=False removes the sorting cost
result = df.groupby("category", sort=False)["value"].sum()

# Several aggregations in one pass
result = df.groupby("category", sort=False).agg({
    "value": ["sum", "mean", "count"],
    "amount": "sum"
})
```

### Large GroupBy (when memory runs low)
```python
# Option A: numba acceleration (separate install required)
import numba

@numba.jit
def custom_agg(values):
    return values.sum()

result = df.groupby("category")["value"].agg(custom_agg)

# Option B: Dask
import dask.dataframe as dd
ddf = dd.from_pandas(df, npartitions=4)
result = ddf.groupby("category")["value"].sum().compute()
```

---

## 3. Merge/Join Optimization

### Memory-efficient merge
```python
# 1. Select only the needed columns, then merge
df1_subset = df1[["key", "needed_col1", "needed_col2"]]
df2_subset = df2[["key", "needed_col3"]]
result = pd.merge(df1_subset, df2_subset, on="key")

# 2. Put the small table on the left (memory-friendly)
result = pd.merge(small_df, large_df, on="key", how="left")
```

### Large merge (when memory runs low)
```python
# Chunked merge
def chunked_merge(large_df, small_df, on, chunksize=100_000):
    chunks = []
    for start in range(0, len(large_df), chunksize):
        chunk = large_df.iloc[start:start + chunksize]
        merged = pd.merge(chunk, small_df, on=on, how="left")
        chunks.append(merged)
    return pd.concat(chunks, ignore_index=True)
```
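
Usage mirrors `pd.merge`; the frame and key names below are hypothetical:

```python
# Peak memory stays near one chunk plus the small lookup table
enriched = chunked_merge(orders_df, users_df, on="user_id")
```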

---

## 4. Query Optimization

### eval() (fast on large DataFrames)
```python
# Plain approach
df["c"] = df["a"] + df["b"]
df["d"] = df["c"] * 2

# eval() (avoids materializing intermediate results)
df = df.eval("""
c = a + b
d = c * 2
""")
```

### query() (filtering)
```python
# Plain approach
result = df[(df["col1"] > 10) & (df["col2"] == "active")]

# query() (faster and more readable)
result = df.query("col1 > 10 and col2 == 'active'")

# With local variables
threshold = 10
status = "active"
result = df.query("col1 > @threshold and col2 == @status")
```

---

## 5. Pivot/Unpivot Optimization

### Pivot Table
```python
# Basic usage
pivot = df.pivot_table(
    values="amount",
    index="date",
    columns="category",
    aggfunc="sum",
    fill_value=0
)

# When memory runs low: process in chunks
def chunked_pivot(df, chunksize=100_000):
    results = []
    for start in range(0, len(df), chunksize):
        chunk = df.iloc[start:start + chunksize]
        pivot = chunk.pivot_table(...)  # same arguments as above
        results.append(pivot)
    return pd.concat(results).groupby(level=0).sum()
```

---

## 6. Memory Management

### Delete objects you no longer need
```python
import gc

# Drop intermediate results
del intermediate_df
gc.collect()

# Drop columns (inplace)
df.drop(columns=["unneeded_col"], inplace=True)
```

### Save memory while keeping the original
```python
# Prefer a view over a copy (when pandas can provide one)
subset = df[["col1", "col2"]]         # view (shares memory)
subset = df[["col1", "col2"]].copy()  # copy (separate memory)
```

---

## 7. Operation Speed Comparison

| Operation | Slow Method | Fast Method | Speedup |
|-----------|-------------|-------------|---------|
| Conditional assignment | apply(lambda) | np.where | 10-100x |
| String operations | apply(str) | .str accessor | 5-20x |
| Repeated computation | for loop | vectorized | 100-1000x |
| Multiple aggregations | repeated groupby | single .agg() | 2-5x |
| Filtering | boolean indexing | .query() | 1.5-3x |
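
The `.str` accessor row has no example above; a minimal sketch (the column name is hypothetical):

```python
# Slow: one Python-level call per row
df["name_upper"] = df["name"].apply(lambda s: s.upper())

# Fast: pandas' vectorized string methods
df["name_upper"] = df["name"].str.upper()
```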

---

## 8. Performance Measurement

```python
import time

# Measure execution time
start = time.time()
result = df.groupby("category")["value"].sum()
print(f"Elapsed: {time.time() - start:.2f}s")

# Memory profiling (IPython; requires the memory_profiler package)
%load_ext memory_profiler
%memit df.groupby("category")["value"].sum()
```

---

## Quick Reference

```python
# Large-scale analysis checklist
# 1. Optimize dtypes (see the data_loading skill)
# 2. Select only the columns you need
# 3. Use vectorized operations (np.where/np.select instead of apply)
# 4. Use eval()/query()
# 5. Add sort=False to groupby
# 6. Delete intermediate results (del + gc.collect())
# 7. Switch to Dask/Polars when memory runs out
```

agent_server/langchain/skills/data_loading.md
@@ -0,0 +1,158 @@
---
name: data-loading
description: Large-file loading optimization. Use when a CSV/Parquet file exceeds 100MB or memory is tight. Provides guidance on chunking, sampling, dtype optimization, and switching to Dask/Polars.
---

# Data Loading Optimization Guide

A guide to memory-efficient loading of large datasets.

## Resource Tiers

### TIER_SMALL: file < 100MB, ample free RAM
Load directly; no special optimization needed.

```python
import pandas as pd
df = pd.read_csv("data.csv")
# or
df = pd.read_parquet("data.parquet")
```

### TIER_MEDIUM: file 100MB-1GB
Optimize dtypes and load only the columns you need.

```python
import pandas as pd

# 1. Load only the needed columns (up to ~90% memory savings)
df = pd.read_csv("data.csv", usecols=["col1", "col2", "col3"])

# 2. Specify optimized dtypes
dtype_map = {
    "id": "int32",              # int64 → int32 (50% savings)
    "category_col": "category", # string → category (90%+ savings)
    "float_col": "float32",     # float64 → float32 (50% savings)
}
df = pd.read_csv("data.csv", dtype=dtype_map)

# 3. With Parquet (built-in compression, column selection)
df = pd.read_parquet("data.parquet", columns=["col1", "col2"])
```

### TIER_LARGE: file > 1GB, or memory is tight
Use chunking or Dask/Polars.

#### Option A: Chunking (for simple aggregations)
```python
import pandas as pd

# Process chunk by chunk (memory use: one chunk at a time)
chunks = pd.read_csv("large_data.csv", chunksize=100_000)

# Example: aggregate per chunk, then combine
total_count = 0
for chunk in chunks:
    total_count += len(chunk[chunk["status"] == "active"])

print(f"Active records: {total_count}")
```
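
Aggregations beyond counts and sums need care when chunked: a mean, for example, must be accumulated as a running sum and count. A minimal sketch, assuming a numeric `value` column:

```python
import pandas as pd

# Accumulate sum and count across chunks; divide once at the end
total, n = 0.0, 0
for chunk in pd.read_csv("large_data.csv", chunksize=100_000):
    total += chunk["value"].sum()
    n += len(chunk)
print(f"Mean value: {total / n:.4f}")
```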

#### Option B: Dask (complex operations, groupby, etc.)
```python
import dask.dataframe as dd

# Load with Dask (lazy evaluation, memory efficient)
ddf = dd.read_csv("large_data.csv")

# Use it like pandas (chunked internally)
result = ddf.groupby("category")["value"].mean().compute()
```

#### Option C: Polars (high-performance alternative)
```python
import polars as pl

# Polars: Rust-based, often far faster than pandas
df = pl.read_csv("large_data.csv")

# Or lazy mode (memory-optimized)
df = pl.scan_csv("large_data.csv").filter(
    pl.col("date") > "2024-01-01"
).collect()
```
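
To pick a tier before loading, check the on-disk size. Note that CSV typically expands several-fold once parsed, so treat the thresholds as rough; a minimal sketch:

```python
import os

size_mb = os.path.getsize("large_data.csv") / 1024**2
if size_mb < 100:
    tier = "TIER_SMALL"
elif size_mb <= 1024:
    tier = "TIER_MEDIUM"
else:
    tier = "TIER_LARGE"
```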

## dtype Optimization Details

| Original Type | Optimized Type | Memory Savings | When to Use |
|---------------|----------------|----------------|-------------|
| int64 | int32 | 50% | values within ±2.1 billion |
| int64 | int16 | 75% | values within ±32K |
| int64 | int8 | 87.5% | values within ±127 |
| float64 | float32 | 50% | ~7 significant digits of precision suffice |
| object (string) | category | 90%+ | unique values < 50% of rows |

### Automatic dtype optimization function
```python
def optimize_dtypes(df):
    """Automatically downcast a DataFrame's dtypes."""
    for col in df.columns:
        col_type = df[col].dtype

        if col_type == "int64":
            if df[col].min() >= 0:
                if df[col].max() < 255:
                    df[col] = df[col].astype("uint8")
                elif df[col].max() < 65535:
                    df[col] = df[col].astype("uint16")
                else:
                    df[col] = df[col].astype("uint32")
            else:
                if df[col].min() > -128 and df[col].max() < 127:
                    df[col] = df[col].astype("int8")
                elif df[col].min() > -32768 and df[col].max() < 32767:
                    df[col] = df[col].astype("int16")
                else:
                    df[col] = df[col].astype("int32")

        elif col_type == "float64":
            df[col] = df[col].astype("float32")

        elif col_type == "object":
            num_unique = df[col].nunique()
            num_total = len(df[col])
            if num_unique / num_total < 0.5:  # under 50% unique values
                df[col] = df[col].astype("category")

    return df
```
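
A usage sketch, pairing the function with the memory check at the end of this guide:

```python
before_mb = df.memory_usage(deep=True).sum() / 1024**2
df = optimize_dtypes(df)
after_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"{before_mb:.1f} MB -> {after_mb:.1f} MB")
```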

## Recommendations by File Format

| Format | Read Speed | Write Speed | Compression | Best For |
|--------|------------|-------------|-------------|----------|
| CSV | Slow | Slow | None | compatibility, simple data |
| Parquet | Fast | Fast | Excellent | large-scale analysis, column selection |
| Feather | Fastest | Fastest | Good | data exchange between pandas processes |
| HDF5 | Fast | Fast | Good | multidimensional arrays |

### CSV → Parquet conversion (one-time cost; fast loads afterwards)
```python
# Convert once
df = pd.read_csv("data.csv")
df.to_parquet("data.parquet", compression="snappy")

# Fast loads from then on
df = pd.read_parquet("data.parquet")
```
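
Feather, listed as the fastest format above, follows the same pattern (requires pyarrow):

```python
# One-time write; fast reads when passing data between pandas processes
df.to_feather("data.feather")
df = pd.read_feather("data.feather")
```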

## Checking Memory Usage

```python
# Total DataFrame memory usage
print(df.memory_usage(deep=True).sum() / 1024**2, "MB")

# Per-column memory usage
print(df.memory_usage(deep=True) / 1024**2)
```