kollabor 0.4.9__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +2 -0
- agents/coder/__init__.py +0 -0
- agents/coder/agent.json +4 -0
- agents/coder/api-integration.md +2150 -0
- agents/coder/cli-pretty.md +765 -0
- agents/coder/code-review.md +1092 -0
- agents/coder/database-design.md +1525 -0
- agents/coder/debugging.md +1102 -0
- agents/coder/dependency-management.md +1397 -0
- agents/coder/git-workflow.md +1099 -0
- agents/coder/refactoring.md +1454 -0
- agents/coder/security-hardening.md +1732 -0
- agents/coder/system_prompt.md +1448 -0
- agents/coder/tdd.md +1367 -0
- agents/creative-writer/__init__.py +0 -0
- agents/creative-writer/agent.json +4 -0
- agents/creative-writer/character-development.md +1852 -0
- agents/creative-writer/dialogue-craft.md +1122 -0
- agents/creative-writer/plot-structure.md +1073 -0
- agents/creative-writer/revision-editing.md +1484 -0
- agents/creative-writer/system_prompt.md +690 -0
- agents/creative-writer/worldbuilding.md +2049 -0
- agents/data-analyst/__init__.py +30 -0
- agents/data-analyst/agent.json +4 -0
- agents/data-analyst/data-visualization.md +992 -0
- agents/data-analyst/exploratory-data-analysis.md +1110 -0
- agents/data-analyst/pandas-data-manipulation.md +1081 -0
- agents/data-analyst/sql-query-optimization.md +881 -0
- agents/data-analyst/statistical-analysis.md +1118 -0
- agents/data-analyst/system_prompt.md +928 -0
- agents/default/__init__.py +0 -0
- agents/default/agent.json +4 -0
- agents/default/dead-code.md +794 -0
- agents/default/explore-agent-system.md +585 -0
- agents/default/system_prompt.md +1448 -0
- agents/kollabor/__init__.py +0 -0
- agents/kollabor/analyze-plugin-lifecycle.md +175 -0
- agents/kollabor/analyze-terminal-rendering.md +388 -0
- agents/kollabor/code-review.md +1092 -0
- agents/kollabor/debug-mcp-integration.md +521 -0
- agents/kollabor/debug-plugin-hooks.md +547 -0
- agents/kollabor/debugging.md +1102 -0
- agents/kollabor/dependency-management.md +1397 -0
- agents/kollabor/git-workflow.md +1099 -0
- agents/kollabor/inspect-llm-conversation.md +148 -0
- agents/kollabor/monitor-event-bus.md +558 -0
- agents/kollabor/profile-performance.md +576 -0
- agents/kollabor/refactoring.md +1454 -0
- agents/kollabor/system_prompt copy.md +1448 -0
- agents/kollabor/system_prompt.md +757 -0
- agents/kollabor/trace-command-execution.md +178 -0
- agents/kollabor/validate-config.md +879 -0
- agents/research/__init__.py +0 -0
- agents/research/agent.json +4 -0
- agents/research/architecture-mapping.md +1099 -0
- agents/research/codebase-analysis.md +1077 -0
- agents/research/dependency-audit.md +1027 -0
- agents/research/performance-profiling.md +1047 -0
- agents/research/security-review.md +1359 -0
- agents/research/system_prompt.md +492 -0
- agents/technical-writer/__init__.py +0 -0
- agents/technical-writer/agent.json +4 -0
- agents/technical-writer/api-documentation.md +2328 -0
- agents/technical-writer/changelog-management.md +1181 -0
- agents/technical-writer/readme-writing.md +1360 -0
- agents/technical-writer/style-guide.md +1410 -0
- agents/technical-writer/system_prompt.md +653 -0
- agents/technical-writer/tutorial-creation.md +1448 -0
- core/__init__.py +0 -2
- core/application.py +343 -88
- core/cli.py +229 -10
- core/commands/menu_renderer.py +463 -59
- core/commands/registry.py +14 -9
- core/commands/system_commands.py +2461 -14
- core/config/loader.py +151 -37
- core/config/service.py +18 -6
- core/events/bus.py +29 -9
- core/events/executor.py +205 -75
- core/events/models.py +27 -8
- core/fullscreen/command_integration.py +20 -24
- core/fullscreen/components/__init__.py +10 -1
- core/fullscreen/components/matrix_components.py +1 -2
- core/fullscreen/components/space_shooter_components.py +654 -0
- core/fullscreen/plugin.py +5 -0
- core/fullscreen/renderer.py +52 -13
- core/fullscreen/session.py +52 -15
- core/io/__init__.py +29 -5
- core/io/buffer_manager.py +6 -1
- core/io/config_status_view.py +7 -29
- core/io/core_status_views.py +267 -347
- core/io/input/__init__.py +25 -0
- core/io/input/command_mode_handler.py +711 -0
- core/io/input/display_controller.py +128 -0
- core/io/input/hook_registrar.py +286 -0
- core/io/input/input_loop_manager.py +421 -0
- core/io/input/key_press_handler.py +502 -0
- core/io/input/modal_controller.py +1011 -0
- core/io/input/paste_processor.py +339 -0
- core/io/input/status_modal_renderer.py +184 -0
- core/io/input_errors.py +5 -1
- core/io/input_handler.py +211 -2452
- core/io/key_parser.py +7 -0
- core/io/layout.py +15 -3
- core/io/message_coordinator.py +111 -2
- core/io/message_renderer.py +129 -4
- core/io/status_renderer.py +147 -607
- core/io/terminal_renderer.py +97 -51
- core/io/terminal_state.py +21 -4
- core/io/visual_effects.py +816 -165
- core/llm/agent_manager.py +1063 -0
- core/llm/api_adapters/__init__.py +44 -0
- core/llm/api_adapters/anthropic_adapter.py +432 -0
- core/llm/api_adapters/base.py +241 -0
- core/llm/api_adapters/openai_adapter.py +326 -0
- core/llm/api_communication_service.py +167 -113
- core/llm/conversation_logger.py +322 -16
- core/llm/conversation_manager.py +556 -30
- core/llm/file_operations_executor.py +84 -32
- core/llm/llm_service.py +934 -103
- core/llm/mcp_integration.py +541 -57
- core/llm/message_display_service.py +135 -18
- core/llm/plugin_sdk.py +1 -2
- core/llm/profile_manager.py +1183 -0
- core/llm/response_parser.py +274 -56
- core/llm/response_processor.py +16 -3
- core/llm/tool_executor.py +6 -1
- core/logging/__init__.py +2 -0
- core/logging/setup.py +34 -6
- core/models/resume.py +54 -0
- core/plugins/__init__.py +4 -2
- core/plugins/base.py +127 -0
- core/plugins/collector.py +23 -161
- core/plugins/discovery.py +37 -3
- core/plugins/factory.py +6 -12
- core/plugins/registry.py +5 -17
- core/ui/config_widgets.py +128 -28
- core/ui/live_modal_renderer.py +2 -1
- core/ui/modal_actions.py +5 -0
- core/ui/modal_overlay_renderer.py +0 -60
- core/ui/modal_renderer.py +268 -7
- core/ui/modal_state_manager.py +29 -4
- core/ui/widgets/base_widget.py +7 -0
- core/updates/__init__.py +10 -0
- core/updates/version_check_service.py +348 -0
- core/updates/version_comparator.py +103 -0
- core/utils/config_utils.py +685 -526
- core/utils/plugin_utils.py +1 -1
- core/utils/session_naming.py +111 -0
- fonts/LICENSE +21 -0
- fonts/README.md +46 -0
- fonts/SymbolsNerdFont-Regular.ttf +0 -0
- fonts/SymbolsNerdFontMono-Regular.ttf +0 -0
- fonts/__init__.py +44 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/METADATA +54 -4
- kollabor-0.4.15.dist-info/RECORD +228 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/top_level.txt +2 -0
- plugins/agent_orchestrator/__init__.py +39 -0
- plugins/agent_orchestrator/activity_monitor.py +181 -0
- plugins/agent_orchestrator/file_attacher.py +77 -0
- plugins/agent_orchestrator/message_injector.py +135 -0
- plugins/agent_orchestrator/models.py +48 -0
- plugins/agent_orchestrator/orchestrator.py +403 -0
- plugins/agent_orchestrator/plugin.py +976 -0
- plugins/agent_orchestrator/xml_parser.py +191 -0
- plugins/agent_orchestrator_plugin.py +9 -0
- plugins/enhanced_input/box_styles.py +1 -0
- plugins/enhanced_input/color_engine.py +19 -4
- plugins/enhanced_input/config.py +2 -2
- plugins/enhanced_input_plugin.py +61 -11
- plugins/fullscreen/__init__.py +6 -2
- plugins/fullscreen/example_plugin.py +1035 -222
- plugins/fullscreen/setup_wizard_plugin.py +592 -0
- plugins/fullscreen/space_shooter_plugin.py +131 -0
- plugins/hook_monitoring_plugin.py +436 -78
- plugins/query_enhancer_plugin.py +66 -30
- plugins/resume_conversation_plugin.py +1494 -0
- plugins/save_conversation_plugin.py +98 -32
- plugins/system_commands_plugin.py +70 -56
- plugins/tmux_plugin.py +154 -78
- plugins/workflow_enforcement_plugin.py +94 -92
- system_prompt/default.md +952 -886
- core/io/input_mode_manager.py +0 -402
- core/io/modal_interaction_handler.py +0 -315
- core/io/raw_input_processor.py +0 -946
- core/storage/__init__.py +0 -5
- core/storage/state_manager.py +0 -84
- core/ui/widget_integration.py +0 -222
- core/utils/key_reader.py +0 -171
- kollabor-0.4.9.dist-info/RECORD +0 -128
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/WHEEL +0 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/entry_points.txt +0 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1081 @@
<!-- Pandas Data Manipulation skill - master DataFrame operations and transformations -->

pandas manipulation mode: DATA TRANSFORMATION MASTERY

when this skill is active, you follow pandas best practices for
efficient, readable, and performant data manipulation.

PHASE 0: PANDAS ENVIRONMENT VERIFICATION

before attempting ANY pandas operations, verify your tools are ready.


check pandas installation and version

<terminal>python -c "import pandas; print(f'pandas version: {pandas.__version__}')"</terminal>

if pandas not available:
<terminal>pip install pandas</terminal>

verify recommended pandas version (>= 2.0.0) by comparing the major version numerically (a plain string comparison misorders versions):
<terminal>python -c "import pandas; print('ok' if int(pandas.__version__.split('.')[0]) >= 2 else 'upgrade needed')"</terminal>


check numpy dependency

<terminal>python -c "import numpy; print(f'numpy version: {numpy.__version__}')"</terminal>

if numpy not available:
<terminal>pip install numpy</terminal>


check memory availability for large datasets

<terminal>python -c "import psutil; mem = psutil.virtual_memory(); print(f'total: {mem.total/1024**3:.1f}GB, available: {mem.available/1024**3:.1f}GB')"</terminal>

if psutil not installed:
<terminal>pip install psutil</terminal>


verify pandas display settings can be configured (options apply per python session):

<terminal>python -c "import pandas as pd; pd.set_option('display.max_rows', 10); pd.set_option('display.max_columns', 10); print('display settings configured')"</terminal>


check for sample data files

<terminal>find . -maxdepth 2 -name "*.csv" -o -name "*.parquet" -o -name "*.xlsx" | head -10</terminal>

<terminal>ls -lh data/ 2>/dev/null || ls -lh *.csv 2>/dev/null || echo "no data files found"</terminal>

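one optional extra check: estimate a csv's in-memory size from a small sample before committing to a full load. a minimal sketch; 'data.csv' is a placeholder:

import pandas as pd

# rough estimate: memory per row from a small sample, times total row count
sample = pd.read_csv('data.csv', nrows=1000)
per_row = sample.memory_usage(deep=True).sum() / len(sample)
with open('data.csv') as f:
    total_rows = sum(1 for _ in f) - 1  # minus header
print(f'estimated load size: {per_row * total_rows / 1024**2:.0f} MB')
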
PHASE 1: DATA LOADING FUNDAMENTALS

reading csv files efficiently

basic csv loading:
import pandas as pd

df = pd.read_csv('data.csv')

specify dtypes for efficiency:
df = pd.read_csv('data.csv',
                 dtype={
                     'id': 'int32',
                     'category': 'category',
                     'value': 'float64'
                 })

handle date columns:
df = pd.read_csv('data.csv',
                 parse_dates=['date_column', 'timestamp'])

specify columns to read:
df = pd.read_csv('data.csv',
                 usecols=['id', 'name', 'value'])

skip rows:
df = pd.read_csv('data.csv', skiprows=5)

limit rows:
df = pd.read_csv('data.csv', nrows=10000)

reading large files in chunks

for files too large for memory:
chunksize = 10000
chunks = []
for chunk in pd.read_csv('large_file.csv', chunksize=chunksize):
    # process each chunk
    processed = process_chunk(chunk)
    chunks.append(processed)

df = pd.concat(chunks, ignore_index=True)

or process without storing all:
for chunk in pd.read_csv('huge_file.csv', chunksize=10000):
    result = chunk.groupby('category').sum()
    result.to_csv('output.csv', mode='a', header=False)

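one caveat on the append-mode pattern above: each chunk writes its own partial group sums, so the output can repeat a category. a safer two-step aggregate, sketched with illustrative column names:

partials = []
for chunk in pd.read_csv('huge_file.csv', chunksize=10000):
    partials.append(chunk.groupby('category')['value'].sum())

# combine the per-chunk partial sums into one final sum per category
final = pd.concat(partials).groupby(level=0).sum()
final.to_csv('output.csv')
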
reading from different sources

read excel:
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

read parquet (recommended for large datasets):
df = pd.read_parquet('data.parquet')

read json:
df = pd.read_json('data.json')

read sql database:
import sqlite3

conn = sqlite3.connect('database.db')
df = pd.read_sql_query("SELECT * FROM table", conn)

read html tables:
tables = pd.read_html('https://example.com/data')
df = tables[0]

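a quick round-trip sketch for the parquet recommendation above (assumes pyarrow or fastparquet is installed; column names illustrative):

df.to_parquet('data.parquet', index=False)
df2 = pd.read_parquet('data.parquet', columns=['id', 'value'])  # reading a column subset is cheap in parquet
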
PHASE 2: DATA INSPECTION AND EXPLORATION

basic dataframe inspection

get dataframe info:
df.info()

summary statistics:
df.describe()

first few rows:
df.head()
df.head(10)

last few rows:
df.tail()

random sample:
df.sample(n=5)
df.sample(frac=0.1)  # 10% of data


checking data types

view dtypes:
df.dtypes

count dtypes:
df.dtypes.value_counts()

convert dtype:
df['column'] = df['column'].astype('float64')

convert to datetime:
df['date'] = pd.to_datetime(df['date'])

convert to category:
df['category_column'] = df['category_column'].astype('category')


checking for missing values

count missing values per column:
df.isnull().sum()

percentage missing:
df.isnull().sum() / len(df) * 100

total missing:
df.isnull().sum().sum()

boolean mask of missing:
missing_mask = df.isnull()

rows with any missing:
rows_with_missing = df[df.isnull().any(axis=1)]


checking for duplicates

duplicate rows:
df.duplicated()

count duplicates:
df.duplicated().sum()

specific column duplicates:
df.duplicated(subset=['id'])

first occurrence is not duplicate:
df.duplicated(keep='first')

last occurrence is not duplicate:
df.duplicated(keep='last')

mark all duplicates:
df.duplicated(keep=False)


unique values and counts

unique values:
df['column'].unique()

count unique:
df['column'].nunique()

value counts:
df['column'].value_counts()

value counts as percentage:
df['column'].value_counts(normalize=True)

value counts with missing:
df['column'].value_counts(dropna=False)

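the missing-value checks above compose nicely into one overview table (a small sketch):

summary = pd.DataFrame({
    'missing': df.isnull().sum(),
    'pct_missing': df.isnull().sum() / len(df) * 100,
    'dtype': df.dtypes,
})
print(summary.sort_values('pct_missing', ascending=False))
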
PHASE 3: DATA SELECTION AND FILTERING

selecting columns

single column:
df['column_name']

multiple columns:
df[['col1', 'col2', 'col3']]

column by position:
df.iloc[:, 0]  # first column

columns by position range:
df.iloc[:, 0:3]

columns by name range:
df.loc[:, 'col1':'col3']

columns by condition:
df.select_dtypes(include=['number'])
df.select_dtypes(include=['object', 'category'])


filtering rows

by value:
df[df['column'] == 'value']

by multiple values:
df[df['column'].isin(['value1', 'value2'])]

by range:
df[(df['column'] >= 10) & (df['column'] <= 20)]

by string contains:
df[df['column'].str.contains('pattern')]

by string startswith:
df[df['column'].str.startswith('prefix')]

by date range:
df[(df['date'] >= '2024-01-01') & (df['date'] <= '2024-12-31')]

complex boolean logic:
df[
    (df['category'] == 'A') |
    ((df['value'] > 100) & (df['status'] == 'active'))
]


query method

string queries:
df.query('column > 100 and category == "A"')

with variables:
threshold = 100
df.query('value > @threshold')

with column names with spaces:
df.query('`column with spaces` > 100')


positional indexing with iloc

single cell:
df.iloc[0, 0]  # row 0, column 0

single row:
df.iloc[0]  # first row

multiple rows:
df.iloc[0:5]  # rows 0-4

rows and columns:
df.iloc[0:5, 2:5]  # rows 0-4, columns 2-4

specific rows and columns:
df.iloc[[0, 2, 4], [1, 3]]


label-based indexing with loc

single cell:
df.loc[0, 'column']

single row:
df.loc[0]

multiple rows:
df.loc[0:5]

rows and columns:
df.loc[0:5, 'col1':'col3']

boolean indexing:
df.loc[df['value'] > 100, 'column']

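one subtlety worth a worked example: loc slicing includes the end label, iloc slicing excludes the end position. a small self-contained sketch:

import pandas as pd

df = pd.DataFrame({'value': range(10)})
len(df.iloc[0:5])  # 5 rows: positions 0-4
len(df.loc[0:5])   # 6 rows: labels 0-5, end label included
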
PHASE 4: DATA CLEANING AND HANDLING MISSING VALUES

handling missing values

drop rows with any missing:
df_clean = df.dropna()

drop rows where specific columns are missing:
df_clean = df.dropna(subset=['column1', 'column2'])

drop columns with missing:
df_clean = df.dropna(axis=1)

drop rows with all missing:
df_clean = df.dropna(how='all')

threshold for dropping:
df_clean = df.dropna(thresh=2)  # require at least 2 non-NA values


filling missing values

fill with constant:
df['column'] = df['column'].fillna(0)

fill with mean:
df['column'] = df['column'].fillna(df['column'].mean())

fill with median:
df['column'] = df['column'].fillna(df['column'].median())

fill with mode:
df['column'] = df['column'].fillna(df['column'].mode()[0])

forward fill:
df['column'] = df['column'].ffill()  # fillna(method='ffill') is deprecated in pandas 2.x

backward fill:
df['column'] = df['column'].bfill()

interpolate:
df['column'] = df['column'].interpolate()

fill with different values per column:
df.fillna({'column1': 0, 'column2': 'unknown'})


removing duplicates

drop all duplicates:
df_clean = df.drop_duplicates()

drop duplicates keeping first:
df_clean = df.drop_duplicates(keep='first')

drop duplicates keeping last:
df_clean = df.drop_duplicates(keep='last')

drop all duplicate rows:
df_clean = df.drop_duplicates(keep=False)

drop duplicates on subset:
df_clean = df.drop_duplicates(subset=['id'])


handling outliers

iqr method:
Q1 = df['column'].quantile(0.25)
Q3 = df['column'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_clean = df[
    (df['column'] >= lower_bound) &
    (df['column'] <= upper_bound)
]

z-score method:
from scipy import stats

z_scores = stats.zscore(df['column'])
df_clean = df[abs(z_scores) < 3]

cap outliers:
df['column'] = df['column'].clip(lower_bound, upper_bound)
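
the iqr recipe above is worth wrapping once and reusing (a sketch; k=1.5 is the conventional fence):

def iqr_bounds(s, k=1.5):
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

low, high = iqr_bounds(df['column'])
df['column'] = df['column'].clip(low, high)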


PHASE 5: DATA TRANSFORMATION

string operations

convert to uppercase:
df['column'] = df['column'].str.upper()

convert to lowercase:
df['column'] = df['column'].str.lower()

strip whitespace:
df['column'] = df['column'].str.strip()

replace substrings:
df['column'] = df['column'].str.replace('old', 'new')

extract with regex:
df['extracted'] = df['column'].str.extract(r'(\d+)')  # extract requires a capture group

split strings:
df[['first', 'last']] = df['name'].str.split(' ', expand=True)

concatenate strings:
df['full_name'] = df['first'] + ' ' + df['last']


numeric operations

arithmetic:
df['total'] = df['price'] * df['quantity']

absolute value:
df['absolute'] = df['column'].abs()

round:
df['rounded'] = df['column'].round(2)

floor (vectorized, via numpy):
import numpy as np

df['floor'] = np.floor(df['column'])

ceiling:
df['ceiling'] = np.ceil(df['column'])

binning:
df['bin'] = pd.cut(df['value'], bins=[0, 10, 20, 30, 40, 50])

percentile rank:
df['percentile'] = df['value'].rank(pct=True)


datetime operations

extract components:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek
df['weekday_name'] = df['date'].dt.strftime('%A')

calculate difference:
df['days_diff'] = (df['end_date'] - df['start_date']).dt.days

add time:
df['future_date'] = df['date'] + pd.Timedelta(days=7)

resample time series:
df.set_index('date').resample('D').mean()


categorical operations

encode categories:
df['category_encoded'] = df['category'].cat.codes

get categories:
df['category'].cat.categories

rename categories:
df['category'] = df['category'].cat.rename_categories({
    'A': 'Alpha',
    'B': 'Beta'
})

reorder categories:
df['category'] = df['category'].cat.reorder_categories(
    ['Low', 'Medium', 'High'],
    ordered=True
)


PHASE 6: DATA AGGREGATION AND GROUPING

groupby basics

single column groupby:
df.groupby('category')['value'].sum()

multiple columns groupby:
df.groupby(['category', 'subcategory'])['value'].mean()

multiple aggregations:
df.groupby('category')['value'].agg(['mean', 'std', 'count'])

different aggregations per column:
df.groupby('category').agg({
    'value': 'mean',
    'count': 'sum',
    'price': 'max'
})

named aggregations:
df.groupby('category').agg(
    mean_value=('value', 'mean'),
    std_value=('value', 'std'),
    total_count=('id', 'count')
)


groupby transformations

transform:
df['value_zscore'] = df.groupby('category')['value'].transform(
    lambda x: (x - x.mean()) / x.std()
)

fill missing with group mean:
df['value'] = df.groupby('category')['value'].transform(
    lambda x: x.fillna(x.mean())
)

group rank:
df['rank_in_group'] = df.groupby('category')['value'].rank()


groupby filtering

filter groups:
df.groupby('category').filter(lambda x: len(x) > 10)

filter by aggregate:
df.groupby('category').filter(
    lambda x: x['value'].mean() > 100
)


pivot tables

basic pivot:
pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='month',
    aggfunc='mean'
)

multiple aggregations:
pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='month',
    aggfunc=['mean', 'sum']
)

fill missing values:
pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='month',
    aggfunc='sum',
    fill_value=0
)

with margins:
pd.pivot_table(
    df,
    values='value',
    index='category',
    columns='month',
    aggfunc='sum',
    margins=True
)


cross tabulation

basic crosstab:
pd.crosstab(df['category'], df['status'])

with row and column totals:
pd.crosstab(df['category'], df['status'], margins=True)

with normalization:
pd.crosstab(
    df['category'],
    df['status'],
    normalize='index'
)


PHASE 7: MERGING AND JOINING DATA

basic merge

inner join:
pd.merge(df1, df2, on='id')

left join:
pd.merge(df1, df2, on='id', how='left')

right join:
pd.merge(df1, df2, on='id', how='right')

outer join:
pd.merge(df1, df2, on='id', how='outer')


merge on different column names

pd.merge(df1, df2, left_on='id1', right_on='id2')


merge on multiple columns

pd.merge(df1, df2, on=['id', 'date'])


merge with suffixes

pd.merge(
    df1,
    df2,
    on='id',
    suffixes=('_left', '_right')
)


merge with indicator

merged = pd.merge(df1, df2, on='id', how='outer', indicator=True)

filter for unmatched:
merged[merged['_merge'] == 'left_only']


concatenating dataframes

vertical concat:
pd.concat([df1, df2, df3], ignore_index=True)

horizontal concat:
pd.concat([df1, df2], axis=1)

concat with keys:
pd.concat([df1, df2], keys=['source1', 'source2'])


join on index

df1.join(df2, on='id')

inner join on index:
df1.join(df2, how='inner')


PHASE 8: RESHAPING DATA

melting data

wide to long:
pd.melt(
    df,
    id_vars=['id', 'name'],
    value_vars=['q1', 'q2', 'q3'],
    var_name='quarter',
    value_name='sales'
)


stacking and unstacking

stack columns to index:
df.stack()

unstack index to columns:
df.unstack()


pivoting

long to wide:
df.pivot(
    index='id',
    columns='date',
    values='value'
)


multiindex operations

create multiindex:
df.set_index(['category', 'subcategory'])

select from multiindex:
df.loc['category_A']

swap levels:
df.swaplevel()

reset index:
df.reset_index()


PHASE 9: APPLYING FUNCTIONS

apply to series

simple function:
df['new_column'] = df['column'].apply(lambda x: x * 2)

with condition:
df['new_column'] = df['column'].apply(
    lambda x: 'high' if x > 100 else 'low'
)


apply to dataframe

row-wise:
df['result'] = df.apply(
    lambda row: row['a'] + row['b'],
    axis=1
)

column-wise:
df.apply(lambda col: col.max() - col.min())


vectorized operations

prefer vectorized over apply:
df['new_column'] = df['column'] * 2

much faster than:
df['new_column'] = df['column'].apply(lambda x: x * 2)


map values

simple mapping:
df['category'] = df['category'].map({
    'A': 'Alpha',
    'B': 'Beta'
})


PHASE 10: PERFORMANCE OPTIMIZATION

use efficient dtypes

convert to category:
df['column'] = df['column'].astype('category')

use int32 instead of int64:
df['column'] = df['column'].astype('int32')

use float32 instead of float64:
df['column'] = df['column'].astype('float32')


use vectorized operations

bad:
for i in range(len(df)):
    df.loc[i, 'new_col'] = df.loc[i, 'col'] * 2

good:
df['new_col'] = df['col'] * 2


use categorical for repeated strings

memory savings:
df['category'] = df['category'].astype('category')


use query for complex filtering

often faster than boolean indexing on large frames (numexpr-backed):
df.query('value > 100 and category == "A"')


use eval for complex expressions

df.eval('new_column = a + b * c')


avoid chained indexing

bad:
df[df['a'] > 10]['b'] = 5

good:
df.loc[df['a'] > 10, 'b'] = 5


PHASE 11: ADVANCED OPERATIONS

rolling windows

simple rolling mean:
df['rolling_mean'] = df['value'].rolling(window=5).mean()

rolling standard deviation:
df['rolling_std'] = df['value'].rolling(window=5).std()

rolling with min periods:
df['rolling'] = df['value'].rolling(window=5, min_periods=1).mean()


expanding windows

cumulative sum:
df['cumsum'] = df['value'].expanding().sum()

cumulative mean:
df['cummean'] = df['value'].expanding().mean()


shift and lag

shift down:
df['lag1'] = df['value'].shift(1)

shift up:
df['lead1'] = df['value'].shift(-1)

percentage change:
df['pct_change'] = df['value'].pct_change()


rank and quantiles

rank:
df['rank'] = df['value'].rank()

quantile bins:
df['quantile'] = pd.qcut(
    df['value'],
    q=4,
    labels=['Q1', 'Q2', 'Q3', 'Q4']
)


duplicates handling

find duplicates:
df.duplicated()

mark duplicates, first occurrence exempt:
df.duplicated(keep='first')

mark all occurrences of duplicates:
df.duplicated(keep=False)


PHASE 12: WORKING WITH TIME SERIES

datetime basics

create datetime:
pd.to_datetime('2024-01-01')

parse datetime column:
df['date'] = pd.to_datetime(df['date'])

set datetime as index:
df.set_index('date', inplace=True)


resampling

daily to monthly:
df.resample('M').mean()

hourly to daily:
df.resample('D').sum()

custom frequency:
df.resample('2W').mean()  # 2 weeks


time-based filtering

df.loc['2024-01':'2024-06']

df[df.index.month == 1]

df[df.index.dayofweek < 5]  # weekdays


rolling with time

df.rolling('7D').mean()


PHASE 13: BEST PRACTICES CHECKLIST


before operations:

[ ] verify data types are correct
[ ] check for missing values
[ ] understand data size and memory requirements
[ ] sample data before full operations
[ ] backup original data if important


during operations:

[ ] use efficient dtypes (category, int32, float32)
[ ] prefer vectorized operations over loops
[ ] use query() for complex filtering
[ ] avoid chained indexing
[ ] use inplace=False by default


after operations:

[ ] verify operation results
[ ] check for unexpected missing values
[ ] validate data integrity
[ ] check memory usage
[ ] document transformations


PHASE 14: COMMON PITFALLS TO AVOID


chained assignment

wrong:
df[df['a'] > 10]['b'] = 5

correct:
df.loc[df['a'] > 10, 'b'] = 5


modifying while iterating

wrong:
for i, row in df.iterrows():
    df.loc[i, 'new'] = row['old'] * 2

correct:
df['new'] = df['old'] * 2


ignoring copy vs view

df_copy = df.copy()  # explicit copy


forgetting to reset_index after filtering

df_filtered = df[condition].reset_index(drop=True)


mixing loc and iloc

be consistent with either label-based or positional indexing


PHASE 15: MANDATORY RULES

while this skill is active, these rules are MANDATORY:

[1] ALWAYS CHECK DATA TYPES before operations
    incorrect types cause unexpected results
    verify with df.dtypes before processing

[2] NEVER USE LOOPS for DataFrame operations
    vectorized operations are 100-1000x faster
    prefer built-in vectorized methods; fall back to df.apply() only when no vectorized form exists

[3] ALWAYS HANDLE MISSING VALUES explicitly
    decide how to handle: drop, fill, or leave
    document your decision

[4] NEVER CHAIN INDEXING without .loc or .iloc
    df[df.a > 10]['b'] = 5 is dangerous
    use df.loc[df.a > 10, 'b'] = 5

[5] ALWAYS SAMPLE before full operations
    test on a sample (df.sample(1000))
    verify correctness before the full dataset

[6] NEVER ASSUME INDEX is sorted
    sort before operations if needed
    df.sort_values('column')

[7] ALWAYS USE EFFICIENT DTYPES for large datasets
    category for low-cardinality strings
    int32/float32 instead of 64-bit

[8] NEVER FORGET inplace=False default
    most operations return a new dataframe
    assign the result: df = df.dropna()

[9] ALWAYS VERIFY OUTPUT SHAPE and content
    check df.shape after operations
    df.head() to spot issues

[10] NEVER MIX loc and iloc in same operation
    pick one and stick with it
    be consistent in your code


FINAL REMINDERS


pandas is powerful

use vectorized operations.
your code will be faster.
and more readable.


think in operations, not rows

not: for each row, do this
but: apply this operation to all rows


the chainable API

df.query('value > 100').groupby('category').agg({'value': 'mean'})

each step returns a dataframe.
chain them together.


when stuck

[ ] read pandas documentation
[ ] check stackoverflow
[ ] use df.info() to understand structure
[ ] print df.head() to see data


the goal

clean data.
transform efficiently.
get insights fast.

now go analyze that data.