hdsp-jupyter-extension 2.0.11__py3-none-any.whl → 2.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. agent_server/langchain/MULTI_AGENT_ARCHITECTURE.md +1114 -0
  2. agent_server/langchain/__init__.py +2 -2
  3. agent_server/langchain/agent.py +72 -33
  4. agent_server/langchain/agent_factory.py +400 -0
  5. agent_server/langchain/agent_prompts/__init__.py +25 -0
  6. agent_server/langchain/agent_prompts/athena_query_prompt.py +71 -0
  7. agent_server/langchain/agent_prompts/planner_prompt.py +85 -0
  8. agent_server/langchain/agent_prompts/python_developer_prompt.py +123 -0
  9. agent_server/langchain/agent_prompts/researcher_prompt.py +38 -0
  10. agent_server/langchain/custom_middleware.py +652 -195
  11. agent_server/langchain/hitl_config.py +34 -10
  12. agent_server/langchain/middleware/__init__.py +24 -0
  13. agent_server/langchain/middleware/code_history_middleware.py +412 -0
  14. agent_server/langchain/middleware/description_injector.py +150 -0
  15. agent_server/langchain/middleware/skill_middleware.py +298 -0
  16. agent_server/langchain/middleware/subagent_events.py +171 -0
  17. agent_server/langchain/middleware/subagent_middleware.py +329 -0
  18. agent_server/langchain/prompts.py +96 -101
  19. agent_server/langchain/skills/data_analysis.md +236 -0
  20. agent_server/langchain/skills/data_loading.md +158 -0
  21. agent_server/langchain/skills/inference.md +392 -0
  22. agent_server/langchain/skills/model_training.md +318 -0
  23. agent_server/langchain/skills/pyspark.md +352 -0
  24. agent_server/langchain/subagents/__init__.py +20 -0
  25. agent_server/langchain/subagents/base.py +173 -0
  26. agent_server/langchain/tools/__init__.py +3 -0
  27. agent_server/langchain/tools/jupyter_tools.py +58 -20
  28. agent_server/langchain/tools/lsp_tools.py +1 -1
  29. agent_server/langchain/tools/shared/__init__.py +26 -0
  30. agent_server/langchain/tools/shared/qdrant_search.py +175 -0
  31. agent_server/langchain/tools/tool_registry.py +219 -0
  32. agent_server/langchain/tools/workspace_tools.py +197 -0
  33. agent_server/routers/config.py +40 -1
  34. agent_server/routers/langchain_agent.py +818 -337
  35. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/build_log.json +1 -1
  36. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/package.json +7 -2
  37. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js +1108 -179
  38. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
  39. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js +3916 -8128
  40. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
  41. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js → hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js +17 -17
  42. hdsp_jupyter_extension-2.0.13.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
  43. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/METADATA +1 -1
  44. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/RECORD +75 -51
  45. jupyter_ext/_version.py +1 -1
  46. jupyter_ext/handlers.py +59 -8
  47. jupyter_ext/labextension/build_log.json +1 -1
  48. jupyter_ext/labextension/package.json +7 -2
  49. jupyter_ext/labextension/static/{frontend_styles_index_js.2d9fb488c82498c45c2d.js → frontend_styles_index_js.037b3c8e5d6a92b63b16.js} +1108 -179
  50. jupyter_ext/labextension/static/frontend_styles_index_js.037b3c8e5d6a92b63b16.js.map +1 -0
  51. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js → jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js +3916 -8128
  52. jupyter_ext/labextension/static/lib_index_js.5449ba3c7e25177d2987.js.map +1 -0
  53. jupyter_ext/labextension/static/{remoteEntry.9da31d1134a53b0c4af5.js → remoteEntry.a8e0b064eb9b1c1ff463.js} +17 -17
  54. jupyter_ext/labextension/static/remoteEntry.a8e0b064eb9b1c1ff463.js.map +1 -0
  55. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
  56. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/lib_index_js.58c1e128ba0b76f41f04.js.map +0 -1
  57. hdsp_jupyter_extension-2.0.11.data/data/share/jupyter/labextensions/hdsp-agent/static/remoteEntry.9da31d1134a53b0c4af5.js.map +0 -1
  58. jupyter_ext/labextension/static/frontend_styles_index_js.2d9fb488c82498c45c2d.js.map +0 -1
  59. jupyter_ext/labextension/static/lib_index_js.58c1e128ba0b76f41f04.js.map +0 -1
  60. jupyter_ext/labextension/static/remoteEntry.9da31d1134a53b0c4af5.js.map +0 -1
  61. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/etc/jupyter/jupyter_server_config.d/hdsp_jupyter_extension.json +0 -0
  62. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/install.json +0 -0
  63. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js +0 -0
  64. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b80.c095373419d05e6f141a.js.map +0 -0
  65. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js +0 -0
  66. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/node_modules_emotion_use-insertion-effect-with-fallbacks_dist_emotion-use-insertion-effect-wi-3ba6b81.61e75fb98ecff46cf836.js.map +0 -0
  67. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/style.js +0 -0
  68. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js +0 -0
  69. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_babel_runtime_helpers_esm_extends_js-node_modules_emotion_serialize_dist-051195.e2553aab0c3963b83dd7.js.map +0 -0
  70. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js +0 -0
  71. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_cache_dist_emotion-cache_browser_development_esm_js.24edcc52a1c014a8a5f0.js.map +0 -0
  72. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js +0 -0
  73. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_react_dist_emotion-react_browser_development_esm_js.19ecf6babe00caff6b8a.js.map +0 -0
  74. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js +0 -0
  75. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_emotion_styled_dist_emotion-styled_browser_development_esm_js.661fb5836f4978a7c6e1.js.map +0 -0
  76. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js +0 -0
  77. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_index_js.985697e0162d8d088ca2.js.map +0 -0
  78. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js +0 -0
  79. {hdsp_jupyter_extension-2.0.11.data → hdsp_jupyter_extension-2.0.13.data}/data/share/jupyter/labextensions/hdsp-agent/static/vendors-node_modules_mui_material_utils_createSvgIcon_js.1f5038488cdfd8b3a85d.js.map +0 -0
  80. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/WHEEL +0 -0
  81. {hdsp_jupyter_extension-2.0.11.dist-info → hdsp_jupyter_extension-2.0.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,236 @@
+ ---
+ name: data-analysis
+ description: DataFrame operation optimization. Use for memory-intensive operations such as groupby, merge, and pivot. Covers vectorized operations, query optimization, and memory-efficient patterns.
+ ---
+
+ # Data Analysis Optimization Guide
+
+ A guide to running pandas DataFrame analysis operations in a memory-efficient way.
+
+ ## Resource Tiers
+
+ ### TIER_SMALL: DataFrame < 1GB, plenty of free RAM
+ Plain pandas operations are fine.
+
+ ### TIER_MEDIUM: DataFrame 1-5GB
+ Vectorized operations and query optimization are required.
+
+ ### TIER_LARGE: DataFrame > 5GB, or under memory pressure
+ Use Dask/Polars or chunked processing. A tier-selection sketch follows below.
+
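+ As a minimal sketch of how these tiers can be picked at runtime — the helper name `pick_tier` and the use of in-memory size as the criterion are assumptions for illustration, not part of this skill:
+
+ ```python
+ import pandas as pd
+
+ def pick_tier(df: pd.DataFrame) -> str:
+     """Map a DataFrame's in-memory size to the tiers above (hypothetical helper)."""
+     size_gb = df.memory_usage(deep=True).sum() / 1024**3
+     if size_gb < 1:
+         return "TIER_SMALL"
+     if size_gb <= 5:
+         return "TIER_MEDIUM"
+     return "TIER_LARGE"
+ ```
+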
+ ---
+
+ ## 1. Vectorized Operations
+
+ ### Bad: row-by-row loop
+ ```python
+ # Slow - never do this
+ for i in range(len(df)):
+     df.loc[i, "new_col"] = df.loc[i, "col1"] * 2
+ ```
+
+ ### Good: vectorization
+ ```python
+ # Fast - always prefer this
+ df["new_col"] = df["col1"] * 2
+ ```
+
+ ### Conditional operations
+ ```python
+ # Bad: apply
+ df["category"] = df["value"].apply(lambda x: "high" if x > 100 else "low")
+
+ # Good: np.where (10x+ faster)
+ import numpy as np
+ df["category"] = np.where(df["value"] > 100, "high", "low")
+
+ # Multiple conditions: np.select (checked in order, first match wins)
+ conditions = [
+     df["value"] > 100,
+     df["value"] > 50,
+ ]
+ choices = ["high", "medium"]
+ df["category"] = np.select(conditions, choices, default="low")
+ ```
+
+ ---
+
+ ## 2. GroupBy Optimization
+
+ ### Basics
+ ```python
+ # sort=False skips the cost of sorting group keys
+ result = df.groupby("category", sort=False)["value"].sum()
+
+ # Multiple aggregations in a single pass
+ result = df.groupby("category", sort=False).agg({
+     "value": ["sum", "mean", "count"],
+     "amount": "sum"
+ })
+ ```
+
+ ### Large GroupBy (under memory pressure)
+ ```python
+ # Option A: numba acceleration (requires numba installed)
+ # engine="numba" JIT-compiles the function; it must accept (values, index) arrays
+ def custom_agg(values, index):
+     return values.sum()
+
+ result = df.groupby("category")["value"].agg(custom_agg, engine="numba")
+
+ # Option B: Dask
+ import dask.dataframe as dd
+ ddf = dd.from_pandas(df, npartitions=4)
+ result = ddf.groupby("category")["value"].sum().compute()
+ ```
+
+ ---
+
+ ## 3. Merge/Join Optimization
+
+ ### Memory-efficient merge
+ ```python
+ # 1. Select only the needed columns before merging
+ df1_subset = df1[["key", "needed_col1", "needed_col2"]]
+ df2_subset = df2[["key", "needed_col3"]]
+ result = pd.merge(df1_subset, df2_subset, on="key")
+
+ # 2. Put the small table on the left (memory-friendly)
+ result = pd.merge(small_df, large_df, on="key", how="left")
+ ```
+
+ ### Large merge (under memory pressure)
+ ```python
+ # Chunked merge: join the small table onto the large one, one chunk at a time
+ def chunked_merge(large_df, small_df, on, chunksize=100_000):
+     chunks = []
+     for start in range(0, len(large_df), chunksize):
+         chunk = large_df.iloc[start:start + chunksize]
+         merged = pd.merge(chunk, small_df, on=on, how="left")
+         chunks.append(merged)
+     return pd.concat(chunks, ignore_index=True)
+
+ result = chunked_merge(large_df, small_df, on="key")
+ ```
+
+ ---
+
+ ## 4. Query Optimization
+
+ ### eval() (fast on large DataFrames)
+ ```python
+ # Plain approach
+ df["c"] = df["a"] + df["b"]
+ df["d"] = df["c"] * 2
+
+ # eval(): avoids materializing intermediate results
+ df = df.eval("""
+ c = a + b
+ d = c * 2
+ """)
+ ```
+
+ ### query() (filtering)
+ ```python
+ # Plain approach
+ result = df[(df["col1"] > 10) & (df["col2"] == "active")]
+
+ # query(): faster on large frames and more readable
+ result = df.query("col1 > 10 and col2 == 'active'")
+
+ # Referencing local variables with @
+ threshold = 10
+ status = "active"
+ result = df.query("col1 > @threshold and col2 == @status")
+ ```
+
+ ---
+
+ ## 5. Pivot/Unpivot Optimization
+
+ ### Pivot Table
+ ```python
+ # Basic usage
+ pivot = df.pivot_table(
+     values="amount",
+     index="date",
+     columns="category",
+     aggfunc="sum",
+     fill_value=0
+ )
+
+ # Under memory pressure: pivot chunk by chunk, then re-aggregate
+ def chunked_pivot(df, chunksize=100_000):
+     results = []
+     for start in range(0, len(df), chunksize):
+         chunk = df.iloc[start:start + chunksize]
+         # Same arguments as the basic usage above
+         pivot = chunk.pivot_table(
+             values="amount", index="date", columns="category",
+             aggfunc="sum", fill_value=0
+         )
+         results.append(pivot)
+     # Partial pivots share the same index, so sum them back together
+     return pd.concat(results).groupby(level=0).sum()
+ ```
+
+ ---
+
+ ## 6. Memory Management
+
+ ### Delete objects you no longer need
+ ```python
+ import gc
+
+ # Delete intermediate results
+ del intermediate_df
+ gc.collect()
+
+ # Drop columns in place
+ df.drop(columns=["unneeded_col"], inplace=True)
+ ```
+
+ ### Saving memory while keeping the original
+ ```python
+ # Selecting columns with a list always returns a new DataFrame; with
+ # copy-on-write enabled (pandas >= 2.0) it shares memory until modified
+ subset = df[["col1", "col2"]]
+ subset = df[["col1", "col2"]].copy()  # explicit copy (separate memory)
+ ```
+
+ ---
+
+ ## 7. Operation Speed Comparison
+
+ | Operation | Slow Method | Fast Method | Speedup |
+ |-----------|-------------|-------------|---------|
+ | Conditional assignment | apply(lambda) | np.where | 10-100x |
+ | String operations | apply(str) | .str accessor | 5-20x |
+ | Repeated computation | for loop | vectorized | 100-1000x |
+ | Multiple aggregations | several groupby calls | single .agg() | 2-5x |
+ | Filtering | boolean indexing | .query() | 1.5-3x |
+
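+ To sanity-check the first row of this table, here is a minimal micro-benchmark sketch; the column name, data size, and threshold are illustrative only, not part of this skill:
+
+ ```python
+ import time
+ import numpy as np
+ import pandas as pd
+
+ # 1M random integers in [0, 200) -- synthetic data for the comparison
+ df = pd.DataFrame({"value": np.random.randint(0, 200, size=1_000_000)})
+
+ start = time.time()
+ slow = df["value"].apply(lambda x: "high" if x > 100 else "low")
+ print(f"apply:    {time.time() - start:.3f}s")
+
+ start = time.time()
+ fast = np.where(df["value"] > 100, "high", "low")
+ print(f"np.where: {time.time() - start:.3f}s")
+ ```
+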
+ ---
+
+ ## 8. Measuring Performance
+
+ ```python
+ import time
+
+ # Wall-clock timing
+ start = time.time()
+ result = df.groupby("category")["value"].sum()
+ print(f"Elapsed: {time.time() - start:.2f}s")
+
+ # Memory profiling in Jupyter (pip install memory_profiler)
+ %load_ext memory_profiler
+ %memit df.groupby("category")["value"].sum()
+ ```
+
+ ---
+
+ ## Quick Reference
+
+ ```python
+ # Large-scale analysis checklist
+ # 1. Optimize dtypes (see the data_loading skill)
+ # 2. Select only the columns you need
+ # 3. Vectorize (np.where/np.select instead of apply)
+ # 4. Use eval()/query()
+ # 5. Pass sort=False to groupby
+ # 6. Delete intermediate results (del + gc.collect())
+ # 7. Switch to Dask/Polars when memory runs out
+ ```
+
@@ -0,0 +1,158 @@
+ ---
+ name: data-loading
+ description: Large-file loading optimization. Use when a CSV/Parquet file exceeds 100MB or memory is tight. Covers chunking, sampling, dtype optimization, and switching to Dask/Polars.
+ ---
+
+ # Data Loading Optimization Guide
+
+ A guide to memory-efficient loading of large datasets.
+
+ ## Resource Tiers
+
+ ### TIER_SMALL: file < 100MB, plenty of free RAM
+ Load directly; no special optimization needed.
+
+ ```python
+ import pandas as pd
+ df = pd.read_csv("data.csv")
+ # or
+ df = pd.read_parquet("data.parquet")
+ ```
+
+ ### TIER_MEDIUM: file 100MB - 1GB
+ Optimize dtypes and load only the columns you need.
+
+ ```python
+ import pandas as pd
+
+ # 1. Load only the needed columns (up to ~90% memory savings)
+ df = pd.read_csv("data.csv", usecols=["col1", "col2", "col3"])
+
+ # 2. Specify optimized dtypes
+ dtype_map = {
+     "id": "int32",               # int64 → int32 (50% savings)
+     "category_col": "category",  # string → category (90%+ savings)
+     "float_col": "float32",      # float64 → float32 (50% savings)
+ }
+ df = pd.read_csv("data.csv", dtype=dtype_map)
+
+ # 3. With Parquet (built-in compression, column selection)
+ df = pd.read_parquet("data.parquet", columns=["col1", "col2"])
+ ```
+
+ ### TIER_LARGE: file > 1GB, or under memory pressure
+ Use chunking or Dask/Polars.
+
+ #### Option A: Chunking (simple aggregations)
+ ```python
+ import pandas as pd
+
+ # Process chunk by chunk (memory footprint: one chunk at a time)
+ chunks = pd.read_csv("large_data.csv", chunksize=100_000)
+
+ # Example: aggregate per chunk, then combine
+ total_count = 0
+ for chunk in chunks:
+     total_count += len(chunk[chunk["status"] == "active"])
+
+ print(f"Active records: {total_count}")
+ ```
+
+ #### Option B: Dask (complex operations, groupby, etc.)
+ ```python
+ import dask.dataframe as dd
+
+ # Load with Dask (lazy evaluation, memory-efficient)
+ ddf = dd.read_csv("large_data.csv")
+
+ # Use it like pandas (chunked under the hood)
+ result = ddf.groupby("category")["value"].mean().compute()
+ ```
+
+ #### Option C: Polars (high-performance alternative)
+ ```python
+ import polars as pl
+
+ # Polars: Rust-based, often several times faster than pandas
+ df = pl.read_csv("large_data.csv")
+
+ # Or lazy mode (lets the engine optimize the plan and reduce memory)
+ df = pl.scan_csv("large_data.csv").filter(
+     pl.col("date") > "2024-01-01"
+ ).collect()
+ ```
+
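+ Routing between these tiers only needs the on-disk file size. A minimal sketch — the helper name `loading_tier` is hypothetical, and the thresholds simply mirror the tiers above:
+
+ ```python
+ import os
+
+ def loading_tier(path: str) -> str:
+     """Map a file's on-disk size to the tiers above (hypothetical helper)."""
+     size_mb = os.path.getsize(path) / 1024**2
+     if size_mb < 100:
+         return "TIER_SMALL"
+     if size_mb <= 1024:
+         return "TIER_MEDIUM"
+     return "TIER_LARGE"
+ ```
+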
+ ## dtype Optimization Details
+
+ | Original Type | Optimized Type | Memory Savings | When to Use |
+ |---------------|----------------|----------------|-------------|
+ | int64 | int32 | 50% | values within ±2.1 billion |
+ | int64 | int16 | 75% | values within ±32,767 |
+ | int64 | int8 | 87.5% | values within ±127 |
+ | float64 | float32 | 50% | ~7 significant digits of precision is enough |
+ | object (string) | category | 90%+ | unique values < 50% of rows |
+
+ ### Automatic dtype optimization helper
+ ```python
+ def optimize_dtypes(df):
+     """Downcast each column of a DataFrame to the smallest safe dtype."""
+     for col in df.columns:
+         col_type = df[col].dtype
+
+         if col_type == "int64":
+             if df[col].min() >= 0:
+                 if df[col].max() <= 255:
+                     df[col] = df[col].astype("uint8")
+                 elif df[col].max() <= 65535:
+                     df[col] = df[col].astype("uint16")
+                 else:
+                     df[col] = df[col].astype("uint32")
+             else:
+                 if df[col].min() >= -128 and df[col].max() <= 127:
+                     df[col] = df[col].astype("int8")
+                 elif df[col].min() >= -32768 and df[col].max() <= 32767:
+                     df[col] = df[col].astype("int16")
+                 else:
+                     df[col] = df[col].astype("int32")
+
+         elif col_type == "float64":
+             df[col] = df[col].astype("float32")
+
+         elif col_type == "object":
+             num_unique = df[col].nunique()
+             num_total = len(df[col])
+             if num_unique / num_total < 0.5:  # fewer than 50% unique values
+                 df[col] = df[col].astype("category")
+
+     return df
+ ```
+
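+ Usage is a one-liner; pairing it with the memory check at the end of this guide makes the effect visible (the before/after pattern here is illustrative):
+
+ ```python
+ before = df.memory_usage(deep=True).sum() / 1024**2
+ df = optimize_dtypes(df)
+ after = df.memory_usage(deep=True).sum() / 1024**2
+ print(f"{before:.1f} MB -> {after:.1f} MB")
+ ```
+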
+ ## File Format Recommendations
+
+ | Format | Read Speed | Write Speed | Compression | Best For |
+ |--------|------------|-------------|-------------|----------|
+ | CSV | Slow | Slow | None | compatibility, simple data |
+ | Parquet | Fast | Fast | Excellent | large-scale analysis, column selection |
+ | Feather | Fastest | Fastest | Good | data exchange between pandas processes |
+ | HDF5 | Fast | Fast | Good | multidimensional arrays |
+
+ ### CSV → Parquet conversion (one-time cost, fast loads afterwards)
+ ```python
+ import pandas as pd
+
+ # Convert once
+ df = pd.read_csv("data.csv")
+ df.to_parquet("data.parquet", compression="snappy")
+
+ # Fast loads from then on
+ df = pd.read_parquet("data.parquet")
+ ```
+
+ ## Checking Memory Usage
+
+ ```python
+ # Total DataFrame memory usage
+ print(df.memory_usage(deep=True).sum() / 1024**2, "MB")
+
+ # Per-column memory usage
+ print(df.memory_usage(deep=True) / 1024**2)
+ ```
+