kollabor 0.4.9__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. agents/__init__.py +2 -0
  2. agents/coder/__init__.py +0 -0
  3. agents/coder/agent.json +4 -0
  4. agents/coder/api-integration.md +2150 -0
  5. agents/coder/cli-pretty.md +765 -0
  6. agents/coder/code-review.md +1092 -0
  7. agents/coder/database-design.md +1525 -0
  8. agents/coder/debugging.md +1102 -0
  9. agents/coder/dependency-management.md +1397 -0
  10. agents/coder/git-workflow.md +1099 -0
  11. agents/coder/refactoring.md +1454 -0
  12. agents/coder/security-hardening.md +1732 -0
  13. agents/coder/system_prompt.md +1448 -0
  14. agents/coder/tdd.md +1367 -0
  15. agents/creative-writer/__init__.py +0 -0
  16. agents/creative-writer/agent.json +4 -0
  17. agents/creative-writer/character-development.md +1852 -0
  18. agents/creative-writer/dialogue-craft.md +1122 -0
  19. agents/creative-writer/plot-structure.md +1073 -0
  20. agents/creative-writer/revision-editing.md +1484 -0
  21. agents/creative-writer/system_prompt.md +690 -0
  22. agents/creative-writer/worldbuilding.md +2049 -0
  23. agents/data-analyst/__init__.py +30 -0
  24. agents/data-analyst/agent.json +4 -0
  25. agents/data-analyst/data-visualization.md +992 -0
  26. agents/data-analyst/exploratory-data-analysis.md +1110 -0
  27. agents/data-analyst/pandas-data-manipulation.md +1081 -0
  28. agents/data-analyst/sql-query-optimization.md +881 -0
  29. agents/data-analyst/statistical-analysis.md +1118 -0
  30. agents/data-analyst/system_prompt.md +928 -0
  31. agents/default/__init__.py +0 -0
  32. agents/default/agent.json +4 -0
  33. agents/default/dead-code.md +794 -0
  34. agents/default/explore-agent-system.md +585 -0
  35. agents/default/system_prompt.md +1448 -0
  36. agents/kollabor/__init__.py +0 -0
  37. agents/kollabor/analyze-plugin-lifecycle.md +175 -0
  38. agents/kollabor/analyze-terminal-rendering.md +388 -0
  39. agents/kollabor/code-review.md +1092 -0
  40. agents/kollabor/debug-mcp-integration.md +521 -0
  41. agents/kollabor/debug-plugin-hooks.md +547 -0
  42. agents/kollabor/debugging.md +1102 -0
  43. agents/kollabor/dependency-management.md +1397 -0
  44. agents/kollabor/git-workflow.md +1099 -0
  45. agents/kollabor/inspect-llm-conversation.md +148 -0
  46. agents/kollabor/monitor-event-bus.md +558 -0
  47. agents/kollabor/profile-performance.md +576 -0
  48. agents/kollabor/refactoring.md +1454 -0
  49. agents/kollabor/system_prompt copy.md +1448 -0
  50. agents/kollabor/system_prompt.md +757 -0
  51. agents/kollabor/trace-command-execution.md +178 -0
  52. agents/kollabor/validate-config.md +879 -0
  53. agents/research/__init__.py +0 -0
  54. agents/research/agent.json +4 -0
  55. agents/research/architecture-mapping.md +1099 -0
  56. agents/research/codebase-analysis.md +1077 -0
  57. agents/research/dependency-audit.md +1027 -0
  58. agents/research/performance-profiling.md +1047 -0
  59. agents/research/security-review.md +1359 -0
  60. agents/research/system_prompt.md +492 -0
  61. agents/technical-writer/__init__.py +0 -0
  62. agents/technical-writer/agent.json +4 -0
  63. agents/technical-writer/api-documentation.md +2328 -0
  64. agents/technical-writer/changelog-management.md +1181 -0
  65. agents/technical-writer/readme-writing.md +1360 -0
  66. agents/technical-writer/style-guide.md +1410 -0
  67. agents/technical-writer/system_prompt.md +653 -0
  68. agents/technical-writer/tutorial-creation.md +1448 -0
  69. core/__init__.py +0 -2
  70. core/application.py +343 -88
  71. core/cli.py +229 -10
  72. core/commands/menu_renderer.py +463 -59
  73. core/commands/registry.py +14 -9
  74. core/commands/system_commands.py +2461 -14
  75. core/config/loader.py +151 -37
  76. core/config/service.py +18 -6
  77. core/events/bus.py +29 -9
  78. core/events/executor.py +205 -75
  79. core/events/models.py +27 -8
  80. core/fullscreen/command_integration.py +20 -24
  81. core/fullscreen/components/__init__.py +10 -1
  82. core/fullscreen/components/matrix_components.py +1 -2
  83. core/fullscreen/components/space_shooter_components.py +654 -0
  84. core/fullscreen/plugin.py +5 -0
  85. core/fullscreen/renderer.py +52 -13
  86. core/fullscreen/session.py +52 -15
  87. core/io/__init__.py +29 -5
  88. core/io/buffer_manager.py +6 -1
  89. core/io/config_status_view.py +7 -29
  90. core/io/core_status_views.py +267 -347
  91. core/io/input/__init__.py +25 -0
  92. core/io/input/command_mode_handler.py +711 -0
  93. core/io/input/display_controller.py +128 -0
  94. core/io/input/hook_registrar.py +286 -0
  95. core/io/input/input_loop_manager.py +421 -0
  96. core/io/input/key_press_handler.py +502 -0
  97. core/io/input/modal_controller.py +1011 -0
  98. core/io/input/paste_processor.py +339 -0
  99. core/io/input/status_modal_renderer.py +184 -0
  100. core/io/input_errors.py +5 -1
  101. core/io/input_handler.py +211 -2452
  102. core/io/key_parser.py +7 -0
  103. core/io/layout.py +15 -3
  104. core/io/message_coordinator.py +111 -2
  105. core/io/message_renderer.py +129 -4
  106. core/io/status_renderer.py +147 -607
  107. core/io/terminal_renderer.py +97 -51
  108. core/io/terminal_state.py +21 -4
  109. core/io/visual_effects.py +816 -165
  110. core/llm/agent_manager.py +1063 -0
  111. core/llm/api_adapters/__init__.py +44 -0
  112. core/llm/api_adapters/anthropic_adapter.py +432 -0
  113. core/llm/api_adapters/base.py +241 -0
  114. core/llm/api_adapters/openai_adapter.py +326 -0
  115. core/llm/api_communication_service.py +167 -113
  116. core/llm/conversation_logger.py +322 -16
  117. core/llm/conversation_manager.py +556 -30
  118. core/llm/file_operations_executor.py +84 -32
  119. core/llm/llm_service.py +934 -103
  120. core/llm/mcp_integration.py +541 -57
  121. core/llm/message_display_service.py +135 -18
  122. core/llm/plugin_sdk.py +1 -2
  123. core/llm/profile_manager.py +1183 -0
  124. core/llm/response_parser.py +274 -56
  125. core/llm/response_processor.py +16 -3
  126. core/llm/tool_executor.py +6 -1
  127. core/logging/__init__.py +2 -0
  128. core/logging/setup.py +34 -6
  129. core/models/resume.py +54 -0
  130. core/plugins/__init__.py +4 -2
  131. core/plugins/base.py +127 -0
  132. core/plugins/collector.py +23 -161
  133. core/plugins/discovery.py +37 -3
  134. core/plugins/factory.py +6 -12
  135. core/plugins/registry.py +5 -17
  136. core/ui/config_widgets.py +128 -28
  137. core/ui/live_modal_renderer.py +2 -1
  138. core/ui/modal_actions.py +5 -0
  139. core/ui/modal_overlay_renderer.py +0 -60
  140. core/ui/modal_renderer.py +268 -7
  141. core/ui/modal_state_manager.py +29 -4
  142. core/ui/widgets/base_widget.py +7 -0
  143. core/updates/__init__.py +10 -0
  144. core/updates/version_check_service.py +348 -0
  145. core/updates/version_comparator.py +103 -0
  146. core/utils/config_utils.py +685 -526
  147. core/utils/plugin_utils.py +1 -1
  148. core/utils/session_naming.py +111 -0
  149. fonts/LICENSE +21 -0
  150. fonts/README.md +46 -0
  151. fonts/SymbolsNerdFont-Regular.ttf +0 -0
  152. fonts/SymbolsNerdFontMono-Regular.ttf +0 -0
  153. fonts/__init__.py +44 -0
  154. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/METADATA +54 -4
  155. kollabor-0.4.15.dist-info/RECORD +228 -0
  156. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/top_level.txt +2 -0
  157. plugins/agent_orchestrator/__init__.py +39 -0
  158. plugins/agent_orchestrator/activity_monitor.py +181 -0
  159. plugins/agent_orchestrator/file_attacher.py +77 -0
  160. plugins/agent_orchestrator/message_injector.py +135 -0
  161. plugins/agent_orchestrator/models.py +48 -0
  162. plugins/agent_orchestrator/orchestrator.py +403 -0
  163. plugins/agent_orchestrator/plugin.py +976 -0
  164. plugins/agent_orchestrator/xml_parser.py +191 -0
  165. plugins/agent_orchestrator_plugin.py +9 -0
  166. plugins/enhanced_input/box_styles.py +1 -0
  167. plugins/enhanced_input/color_engine.py +19 -4
  168. plugins/enhanced_input/config.py +2 -2
  169. plugins/enhanced_input_plugin.py +61 -11
  170. plugins/fullscreen/__init__.py +6 -2
  171. plugins/fullscreen/example_plugin.py +1035 -222
  172. plugins/fullscreen/setup_wizard_plugin.py +592 -0
  173. plugins/fullscreen/space_shooter_plugin.py +131 -0
  174. plugins/hook_monitoring_plugin.py +436 -78
  175. plugins/query_enhancer_plugin.py +66 -30
  176. plugins/resume_conversation_plugin.py +1494 -0
  177. plugins/save_conversation_plugin.py +98 -32
  178. plugins/system_commands_plugin.py +70 -56
  179. plugins/tmux_plugin.py +154 -78
  180. plugins/workflow_enforcement_plugin.py +94 -92
  181. system_prompt/default.md +952 -886
  182. core/io/input_mode_manager.py +0 -402
  183. core/io/modal_interaction_handler.py +0 -315
  184. core/io/raw_input_processor.py +0 -946
  185. core/storage/__init__.py +0 -5
  186. core/storage/state_manager.py +0 -84
  187. core/ui/widget_integration.py +0 -222
  188. core/utils/key_reader.py +0 -171
  189. kollabor-0.4.9.dist-info/RECORD +0 -128
  190. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/WHEEL +0 -0
  191. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/entry_points.txt +0 -0
  192. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/licenses/LICENSE +0 -0
agents/data-analyst/pandas-data-manipulation.md
@@ -0,0 +1,1081 @@
+ <!-- Pandas Data Manipulation skill - master DataFrame operations and transformations -->
+
+ pandas manipulation mode: DATA TRANSFORMATION MASTERY
+
+ when this skill is active, you follow pandas best practices for
+ efficient, readable, and performant data manipulation.
+
+
+ PHASE 0: PANDAS ENVIRONMENT VERIFICATION
+
+ before attempting ANY pandas operations, verify your tools are ready.
+
+
+ check pandas installation and version
+
+ <terminal>python -c "import pandas; print(f'pandas version: {pandas.__version__}')"</terminal>
+
+ if pandas not available:
+ <terminal>pip install pandas</terminal>
+
+ verify recommended pandas version (>= 2.0.0); compare numerically,
+ since lexicographic string comparison misorders versions like 10.0.0:
+ <terminal>python -c "import pandas; print('ok' if tuple(map(int, pandas.__version__.split('.')[:2])) >= (2, 0) else 'upgrade needed')"</terminal>
+
+
+ check numpy dependency
+
+ <terminal>python -c "import numpy; print(f'numpy version: {numpy.__version__}')"</terminal>
+
+ if numpy not available:
+ <terminal>pip install numpy</terminal>
+
+
+ check memory availability for large datasets
+
+ <terminal>python -c "import psutil; mem = psutil.virtual_memory(); print(f'total: {mem.total/1024**3:.1f}GB, available: {mem.available/1024**3:.1f}GB')"</terminal>
+
+ if psutil not installed:
+ <terminal>pip install psutil</terminal>
+
+
+ configure pandas display settings (options set in a one-off subprocess
+ do not persist; re-run these inside your working session)
+
+ <terminal>python -c "import pandas as pd; pd.set_option('display.max_rows', 10); pd.set_option('display.max_columns', 10); print('display settings configured')"</terminal>
+
+
+ check for sample data files
+
+ <terminal>find . -maxdepth 2 \( -name "*.csv" -o -name "*.parquet" -o -name "*.xlsx" \) | head -10</terminal>
+
+ <terminal>ls -lh data/ 2>/dev/null || ls -lh *.csv 2>/dev/null || echo "no data files found"</terminal>
+
+
+ PHASE 1: DATA LOADING FUNDAMENTALS
+
+
+ reading csv files efficiently
+
+ basic csv loading:
+ import pandas as pd
+
+ df = pd.read_csv('data.csv')
+
+ specify dtypes for efficiency:
+ df = pd.read_csv('data.csv',
+                  dtype={
+                      'id': 'int32',
+                      'category': 'category',
+                      'value': 'float64'
+                  })
+
+ handle date columns:
+ df = pd.read_csv('data.csv',
+                  parse_dates=['date_column', 'timestamp'])
+
+ specify columns to read:
+ df = pd.read_csv('data.csv',
+                  usecols=['id', 'name', 'value'])
+
+ skip rows:
+ df = pd.read_csv('data.csv', skiprows=5)
+
+ limit rows:
+ df = pd.read_csv('data.csv', nrows=10000)
+
+
+ reading large files in chunks
+
+ for files too large for memory:
+ chunksize = 10000
+ chunks = []
+ for chunk in pd.read_csv('large_file.csv', chunksize=chunksize):
+     # process each chunk
+     processed = process_chunk(chunk)
+     chunks.append(processed)
+
+ df = pd.concat(chunks, ignore_index=True)
+
+ or stream results to disk without storing all chunks (write the header
+ only once; the appended rows are per-chunk partial aggregates that still
+ need a final combine, sketched below):
+ first = True
+ for chunk in pd.read_csv('huge_file.csv', chunksize=10000):
+     result = chunk.groupby('category').sum()
+     result.to_csv('output.csv', mode='a', header=first)
+     first = False
+
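+ a minimal sketch of that final combine step ('huge_file.csv' and its
+ columns are illustrative assumptions, not files from this package):
+
+ import pandas as pd
+
+ partials = []
+ for chunk in pd.read_csv('huge_file.csv', chunksize=10_000):
+     # partial sums per chunk; the same category can appear in many chunks
+     partials.append(chunk.groupby('category')['value'].sum())
+
+ # a second groupby over the concatenated partials yields one total per category
+ totals = pd.concat(partials).groupby(level=0).sum()
+ totals.to_csv('totals.csv')
+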
+
+ reading from different sources
+
+ read excel:
+ df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
+
+ read parquet (recommended for large datasets):
+ df = pd.read_parquet('data.parquet')
+
+ read json:
+ df = pd.read_json('data.json')
+
+ read sql database:
+ import sqlite3
+
+ conn = sqlite3.connect('database.db')
+ df = pd.read_sql_query("SELECT * FROM table", conn)
+
+ read html tables:
+ tables = pd.read_html('https://example.com/data')
+ df = tables[0]
+
+
+ PHASE 2: DATA INSPECTION AND EXPLORATION
+
+
+ basic dataframe inspection
+
+ get dataframe info:
+ df.info()
+
+ summary statistics:
+ df.describe()
+
+ first few rows:
+ df.head()
+ df.head(10)
+
+ last few rows:
+ df.tail()
+
+ random sample:
+ df.sample(n=5)
+ df.sample(frac=0.1)  # 10% of data
+
+
+ checking data types
+
+ view dtypes:
+ df.dtypes
+
+ count dtypes:
+ df.dtypes.value_counts()
+
+ convert dtype:
+ df['column'] = df['column'].astype('float64')
+
+ convert to datetime:
+ df['date'] = pd.to_datetime(df['date'])
+
+ convert to category:
+ df['category_column'] = df['category_column'].astype('category')
+
+
+ checking for missing values
+
+ count missing values per column:
+ df.isnull().sum()
+
+ percentage missing:
+ df.isnull().sum() / len(df) * 100
+
+ total missing:
+ df.isnull().sum().sum()
+
+ boolean mask of missing:
+ missing_mask = df.isnull()
+
+ rows with any missing:
+ rows_with_missing = df[df.isnull().any(axis=1)]
+
+
+ checking for duplicates
+
+ duplicate rows:
+ df.duplicated()
+
+ count duplicates:
+ df.duplicated().sum()
+
+ specific column duplicates:
+ df.duplicated(subset=['id'])
+
+ first occurrence is not duplicate:
+ df.duplicated(keep='first')
+
+ last occurrence is not duplicate:
+ df.duplicated(keep='last')
+
+ mark all duplicates:
+ df.duplicated(keep=False)
+
+
+ unique values and counts
+
+ unique values:
+ df['column'].unique()
+
+ count unique:
+ df['column'].nunique()
+
+ value counts:
+ df['column'].value_counts()
+
+ value counts as percentage:
+ df['column'].value_counts(normalize=True)
+
+ value counts with missing:
+ df['column'].value_counts(dropna=False)
+
+
+ PHASE 3: DATA SELECTION AND FILTERING
+
+
+ selecting columns
+
+ single column:
+ df['column_name']
+
+ multiple columns:
+ df[['col1', 'col2', 'col3']]
+
+ column by position:
+ df.iloc[:, 0]  # first column
+
+ columns by position range:
+ df.iloc[:, 0:3]
+
+ columns by name range:
+ df.loc[:, 'col1':'col3']
+
+ columns by condition:
+ df.select_dtypes(include=['number'])
+ df.select_dtypes(include=['object', 'category'])
+
+
+ filtering rows
+
+ by value:
+ df[df['column'] == 'value']
+
+ by multiple values:
+ df[df['column'].isin(['value1', 'value2'])]
+
+ by range:
+ df[(df['column'] >= 10) & (df['column'] <= 20)]
+
+ by string contains (na=False prevents errors on missing values):
+ df[df['column'].str.contains('pattern', na=False)]
+
+ by string startswith:
+ df[df['column'].str.startswith('prefix')]
+
+ by date range:
+ df[(df['date'] >= '2024-01-01') & (df['date'] <= '2024-12-31')]
+
+ complex boolean logic:
+ df[
+     (df['category'] == 'A') |
+     ((df['value'] > 100) & (df['status'] == 'active'))
+ ]
+
+
+ query method
+
+ string queries:
+ df.query('column > 100 and category == "A"')
+
+ with variables:
+ threshold = 100
+ df.query('value > @threshold')
+
+ column names containing spaces need backticks:
+ df.query('`column with spaces` > 100')
+
+
+ positional indexing with iloc
+
+ single cell:
+ df.iloc[0, 0]  # row 0, column 0
+
+ single row:
+ df.iloc[0]  # first row
+
+ multiple rows:
+ df.iloc[0:5]  # rows 0-4 (end-exclusive)
+
+ rows and columns:
+ df.iloc[0:5, 2:5]  # rows 0-4, columns 2-4
+
+ specific rows and columns:
+ df.iloc[[0, 2, 4], [1, 3]]
+
+
+ label-based indexing with loc
+
+ single cell:
+ df.loc[0, 'column']
+
+ single row:
+ df.loc[0]
+
+ multiple rows:
+ df.loc[0:5]  # labels 0 through 5, inclusive (unlike iloc)
+
+ rows and columns:
+ df.loc[0:5, 'col1':'col3']
+
+ boolean indexing:
+ df.loc[df['value'] > 100, 'column']
+
+
+ PHASE 4: DATA CLEANING AND HANDLING MISSING VALUES
+
+
+ handling missing values
+
+ drop rows with any missing:
+ df_clean = df.dropna()
+
+ drop rows where specific columns are missing:
+ df_clean = df.dropna(subset=['column1', 'column2'])
+
+ drop columns with missing:
+ df_clean = df.dropna(axis=1)
+
+ drop rows with all missing:
+ df_clean = df.dropna(how='all')
+
+ threshold for dropping:
+ df_clean = df.dropna(thresh=2)  # require at least 2 non-NA values
+
+
+ filling missing values
+
+ fill with constant:
+ df['column'] = df['column'].fillna(0)
+
+ fill with mean:
+ df['column'] = df['column'].fillna(df['column'].mean())
+
+ fill with median:
+ df['column'] = df['column'].fillna(df['column'].median())
+
+ fill with mode:
+ df['column'] = df['column'].fillna(df['column'].mode()[0])
+
+ forward fill (fillna(method='ffill') is deprecated in pandas 2.x):
+ df['column'] = df['column'].ffill()
+
+ backward fill:
+ df['column'] = df['column'].bfill()
+
+ interpolate:
+ df['column'] = df['column'].interpolate()
+
+ fill with different values per column:
+ df.fillna({'column1': 0, 'column2': 'unknown'})
+
+
+ removing duplicates
+
+ drop all duplicates:
+ df_clean = df.drop_duplicates()
+
+ drop duplicates keeping first:
+ df_clean = df.drop_duplicates(keep='first')
+
+ drop duplicates keeping last:
+ df_clean = df.drop_duplicates(keep='last')
+
+ drop every row that has a duplicate (keeps no occurrence):
+ df_clean = df.drop_duplicates(keep=False)
+
+ drop duplicates on subset:
+ df_clean = df.drop_duplicates(subset=['id'])
+
+
+ handling outliers
+
+ iqr method:
+ Q1 = df['column'].quantile(0.25)
+ Q3 = df['column'].quantile(0.75)
+ IQR = Q3 - Q1
+ lower_bound = Q1 - 1.5 * IQR
+ upper_bound = Q3 + 1.5 * IQR
+
+ df_clean = df[
+     (df['column'] >= lower_bound) &
+     (df['column'] <= upper_bound)
+ ]
+
+ z-score method (nan_policy='omit' keeps one NaN from blanking every score):
+ from scipy import stats
+
+ z_scores = stats.zscore(df['column'], nan_policy='omit')
+ df_clean = df[abs(z_scores) < 3]
+
+ cap outliers:
+ df['column'] = df['column'].clip(lower_bound, upper_bound)
+
+
+ PHASE 5: DATA TRANSFORMATION
+
+
+ string operations
+
+ convert to uppercase:
+ df['column'] = df['column'].str.upper()
+
+ convert to lowercase:
+ df['column'] = df['column'].str.lower()
+
+ strip whitespace:
+ df['column'] = df['column'].str.strip()
+
+ replace substrings:
+ df['column'] = df['column'].str.replace('old', 'new')
+
+ extract with regex (extract requires a capture group):
+ df['extracted'] = df['column'].str.extract(r'(pattern)')
+
+ split strings:
+ df[['first', 'last']] = df['name'].str.split(' ', expand=True)
+
+ concatenate strings:
+ df['full_name'] = df['first'] + ' ' + df['last']
+
+
+ numeric operations
+
+ arithmetic:
+ df['total'] = df['price'] * df['quantity']
+
+ absolute value:
+ df['absolute'] = df['column'].abs()
+
+ round:
+ df['rounded'] = df['column'].round(2)
+
+ floor and ceiling (vectorized via numpy):
+ import numpy as np
+
+ df['floor'] = np.floor(df['column'])
+ df['ceiling'] = np.ceil(df['column'])
+
+ binning:
+ df['bin'] = pd.cut(df['value'], bins=[0, 10, 20, 30, 40, 50])
+
+ percentile rank:
+ df['percentile'] = df['value'].rank(pct=True)
+
+
+ datetime operations
+
+ extract components:
+ df['year'] = df['date'].dt.year
+ df['month'] = df['date'].dt.month
+ df['day'] = df['date'].dt.day
+ df['hour'] = df['date'].dt.hour
+ df['dayofweek'] = df['date'].dt.dayofweek
+ df['weekday_name'] = df['date'].dt.strftime('%A')
+
+ calculate difference:
+ df['days_diff'] = (df['end_date'] - df['start_date']).dt.days
+
+ add time:
+ df['future_date'] = df['date'] + pd.Timedelta(days=7)
+
+ resample time series:
+ df.set_index('date').resample('D').mean()
+
+
+ categorical operations
+
+ encode categories:
+ df['category_encoded'] = df['category'].cat.codes
+
+ get categories:
+ df['category'].cat.categories
+
+ rename categories:
+ df['category'] = df['category'].cat.rename_categories({
+     'A': 'Alpha',
+     'B': 'Beta'
+ })
+
+ reorder categories:
+ df['category'] = df['category'].cat.reorder_categories(
+     ['Low', 'Medium', 'High'],
+     ordered=True
+ )
+
+
+ PHASE 6: DATA AGGREGATION AND GROUPING
+
+
+ groupby basics
+
+ single column groupby:
+ df.groupby('category')['value'].sum()
+
+ multiple columns groupby:
+ df.groupby(['category', 'subcategory'])['value'].mean()
+
+ multiple aggregations:
+ df.groupby('category')['value'].agg(['mean', 'std', 'count'])
+
+ different aggregations per column:
+ df.groupby('category').agg({
+     'value': 'mean',
+     'count': 'sum',
+     'price': 'max'
+ })
+
+ named aggregations:
+ df.groupby('category').agg(
+     mean_value=('value', 'mean'),
+     std_value=('value', 'std'),
+     total_count=('id', 'count')
+ )
+
+
+ groupby transformations
+
+ transform:
+ df['value_zscore'] = df.groupby('category')['value'].transform(
+     lambda x: (x - x.mean()) / x.std()
+ )
+
+ fill missing with group mean:
+ df['value'] = df.groupby('category')['value'].transform(
+     lambda x: x.fillna(x.mean())
+ )
+
+ group rank:
+ df['rank_in_group'] = df.groupby('category')['value'].rank()
+
+
+ groupby filtering
+
+ filter groups:
+ df.groupby('category').filter(lambda x: len(x) > 10)
+
+ filter by aggregate:
+ df.groupby('category').filter(
+     lambda x: x['value'].mean() > 100
+ )
+
+
+ pivot tables
+
+ basic pivot:
+ pd.pivot_table(
+     df,
+     values='value',
+     index='category',
+     columns='month',
+     aggfunc='mean'
+ )
+
+ multiple aggregations:
+ pd.pivot_table(
+     df,
+     values='value',
+     index='category',
+     columns='month',
+     aggfunc=['mean', 'sum']
+ )
+
+ fill missing values:
+ pd.pivot_table(
+     df,
+     values='value',
+     index='category',
+     columns='month',
+     aggfunc='sum',
+     fill_value=0
+ )
+
+ with margins (adds row/column totals):
+ pd.pivot_table(
+     df,
+     values='value',
+     index='category',
+     columns='month',
+     aggfunc='sum',
+     margins=True
+ )
+
+
+ cross tabulation
+
+ basic crosstab:
+ pd.crosstab(df['category'], df['status'])
+
+ with row and column totals:
+ pd.crosstab(df['category'], df['status'], margins=True)
+
+ with normalization:
+ pd.crosstab(
+     df['category'],
+     df['status'],
+     normalize='index'
+ )
+
+
+ PHASE 7: MERGING AND JOINING DATA
+
+
+ basic merge
+
+ inner join:
+ pd.merge(df1, df2, on='id')
+
+ left join:
+ pd.merge(df1, df2, on='id', how='left')
+
+ right join:
+ pd.merge(df1, df2, on='id', how='right')
+
+ outer join:
+ pd.merge(df1, df2, on='id', how='outer')
+
+
+ merge on different column names
+
+ pd.merge(df1, df2, left_on='id1', right_on='id2')
+
+
+ merge on multiple columns
+
+ pd.merge(df1, df2, on=['id', 'date'])
+
+
+ merge with suffixes
+
+ pd.merge(
+     df1,
+     df2,
+     on='id',
+     suffixes=('_left', '_right')
+ )
+
+
+ merge with indicator
+
+ merged = pd.merge(df1, df2, on='id', how='outer', indicator=True)
+
+ filter for unmatched:
+ merged[merged['_merge'] == 'left_only']
+
+
+ concatenating dataframes
+
+ vertical concat:
+ pd.concat([df1, df2, df3], ignore_index=True)
+
+ horizontal concat:
+ pd.concat([df1, df2], axis=1)
+
+ concat with keys:
+ pd.concat([df1, df2], keys=['source1', 'source2'])
+
+
+ join on index
+
+ join df1's 'id' column to df2's index:
+ df1.join(df2, on='id')
+
+ inner join on both indexes:
+ df1.join(df2, how='inner')
+
+
+ PHASE 8: RESHAPING DATA
+
+
+ melting data
+
+ wide to long:
+ pd.melt(
+     df,
+     id_vars=['id', 'name'],
+     value_vars=['q1', 'q2', 'q3'],
+     var_name='quarter',
+     value_name='sales'
+ )
+
+
+ stacking and unstacking
+
+ stack columns to index:
+ df.stack()
+
+ unstack index to columns:
+ df.unstack()
+
+
+ pivoting
+
+ long to wide:
+ df.pivot(
+     index='id',
+     columns='date',
+     values='value'
+ )
+
+
+ multiindex operations
+
+ create multiindex:
+ df.set_index(['category', 'subcategory'])
+
+ select from multiindex:
+ df.loc['category_A']
+
+ swap levels:
+ df.swaplevel()
+
+ reset index:
+ df.reset_index()
+
+
+ PHASE 9: APPLYING FUNCTIONS
+
+
+ apply to series
+
+ simple function:
+ df['new_column'] = df['column'].apply(lambda x: x * 2)
+
+ with condition:
+ df['new_column'] = df['column'].apply(
+     lambda x: 'high' if x > 100 else 'low'
+ )
+
+
+ apply to dataframe
+
+ row-wise:
+ df['result'] = df.apply(
+     lambda row: row['a'] + row['b'],
+     axis=1
+ )
+
+ column-wise:
+ df.apply(lambda col: col.max() - col.min())
+
+
+ vectorized operations
+
+ prefer vectorized over apply:
+ df['new_column'] = df['column'] * 2
+
+ often orders of magnitude faster than (see the timing sketch below):
+ df['new_column'] = df['column'].apply(lambda x: x * 2)
+
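+ a rough timing sketch of that gap (numbers vary by machine; the
+ one-million-row frame is an assumption for illustration):
+
+ import time
+ import numpy as np
+ import pandas as pd
+
+ df = pd.DataFrame({'column': np.arange(1_000_000)})
+
+ t0 = time.perf_counter()
+ df['vec'] = df['column'] * 2                      # one C-level operation
+ t1 = time.perf_counter()
+ df['app'] = df['column'].apply(lambda x: x * 2)   # python-level call per row
+ t2 = time.perf_counter()
+
+ print(f'vectorized: {t1 - t0:.4f}s, apply: {t2 - t1:.4f}s')
+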
+
+ map values
+
+ simple mapping (values missing from the dict become NaN):
+ df['category'] = df['category'].map({
+     'A': 'Alpha',
+     'B': 'Beta'
+ })
+
+
+ PHASE 10: PERFORMANCE OPTIMIZATION
+
+
+ use efficient dtypes
+
+ convert to category:
+ df['column'] = df['column'].astype('category')
+
+ use int32 instead of int64:
+ df['column'] = df['column'].astype('int32')
+
+ use float32 instead of float64:
+ df['column'] = df['column'].astype('float32')
+
+
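+ a quick way to see the category savings (a sketch; the toy column below
+ is an assumption, not data from this package):
+
+ import pandas as pd
+
+ s = pd.Series(['red', 'green', 'blue'] * 100_000)
+ print(s.memory_usage(deep=True))                     # object dtype: large
+ print(s.astype('category').memory_usage(deep=True))  # category: far smaller
+
+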
+ use vectorized operations
+
+ bad:
+ for i in range(len(df)):
+     df.loc[i, 'new_col'] = df.loc[i, 'col'] * 2
+
+ good:
+ df['new_col'] = df['col'] * 2
+
+
+ use categorical for repeated strings
+
+ memory savings:
+ df['category'] = df['category'].astype('category')
+
+
+ use query for complex filtering
+
+ often faster than boolean indexing on large frames (query can evaluate
+ the expression with numexpr):
+ df.query('value > 100 and category == "A"')
+
+
+ use eval for complex expressions
+
+ df.eval('new_column = a + b * c')  # returns a new dataframe
+
+
+ avoid chained indexing when assigning
+
+ bad:
+ df[df['a'] > 10]['b'] = 5
+
+ good:
+ df.loc[df['a'] > 10, 'b'] = 5
+
+
+ PHASE 11: ADVANCED OPERATIONS
+
+
+ rolling windows
+
+ simple rolling mean:
+ df['rolling_mean'] = df['value'].rolling(window=5).mean()
+
+ rolling standard deviation:
+ df['rolling_std'] = df['value'].rolling(window=5).std()
+
+ rolling with min periods:
+ df['rolling'] = df['value'].rolling(window=5, min_periods=1).mean()
+
+
+ expanding windows
+
+ cumulative sum:
+ df['cumsum'] = df['value'].expanding().sum()
+
+ cumulative mean:
+ df['cummean'] = df['value'].expanding().mean()
+
+
+ shift and lag
+
+ shift down:
+ df['lag1'] = df['value'].shift(1)
+
+ shift up:
+ df['lead1'] = df['value'].shift(-1)
+
+ percentage change:
+ df['pct_change'] = df['value'].pct_change()
+
+
+ rank and quantiles
+
+ rank:
+ df['rank'] = df['value'].rank()
+
+ quantile bins:
+ df['quantile'] = pd.qcut(
+     df['value'],
+     q=4,
+     labels=['Q1', 'Q2', 'Q3', 'Q4']
+ )
+
+
+ duplicates handling
+
+ mark duplicates:
+ df.duplicated()
+
+ mark all but the first occurrence:
+ df.duplicated(keep='first')
+
+ mark every occurrence of duplicated rows:
+ df.duplicated(keep=False)
+
+
+ PHASE 12: WORKING WITH TIME SERIES
+
+
+ datetime basics
+
+ create datetime:
+ pd.to_datetime('2024-01-01')
+
+ parse datetime column:
+ df['date'] = pd.to_datetime(df['date'])
+
+ set datetime as index:
+ df.set_index('date', inplace=True)
+
+
+ resampling
+
+ daily to monthly:
+ df.resample('M').mean()  # on pandas >= 2.2, prefer 'ME' (month-end)
+
+ hourly to daily:
+ df.resample('D').sum()
+
+ custom frequency:
+ df.resample('2W').mean()  # 2 weeks
+
+
+ time-based filtering
+
+ df.loc['2024-01':'2024-06']
+
+ df[df.index.month == 1]
+
+ df[df.index.dayofweek < 5]  # weekdays
+
+
+ rolling with time
+
+ df.rolling('7D').mean()  # requires a datetime index
+
+
+ PHASE 13: BEST PRACTICES CHECKLIST
+
+
+ before operations:
+
+ [ ] verify data types are correct
+ [ ] check for missing values
+ [ ] understand data size and memory requirements
+ [ ] sample data before full operations
+ [ ] backup original data if important
+
+
+ during operations:
+
+ [ ] use efficient dtypes (category, int32, float32)
+ [ ] prefer vectorized operations over loops
+ [ ] use query() for complex filtering
+ [ ] avoid chained indexing
+ [ ] use inplace=False by default
+
+
+ after operations (a verification sketch follows this checklist):
+
+ [ ] verify operation results
+ [ ] check for unexpected missing values
+ [ ] validate data integrity
+ [ ] check memory usage
+ [ ] document transformations
+
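+ a minimal post-operation verification sketch (the expected-columns check
+ is illustrative; adapt it to your own frame):
+
+ import pandas as pd
+
+ def verify(df: pd.DataFrame, expected_cols: list) -> None:
+     # shape and columns: catch accidental row or column loss early
+     print(f'shape: {df.shape}')
+     assert set(expected_cols) <= set(df.columns)
+     # missing values a merge or transform may have introduced
+     print(df.isnull().sum())
+     # memory footprint after the transformation
+     print(f'memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB')
+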
+
+ PHASE 14: COMMON PITFALLS TO AVOID
+
+
+ chained assignment
+
+ wrong:
+ df[df['a'] > 10]['b'] = 5
+
+ correct:
+ df.loc[df['a'] > 10, 'b'] = 5
+
+
+ modifying while iterating
+
+ wrong:
+ for i, row in df.iterrows():
+     df.loc[i, 'new'] = row['old'] * 2
+
+ correct:
+ df['new'] = df['old'] * 2
+
+
+ ignoring copy vs view
+
+ a slice of a dataframe may be a view or a copy; take an explicit copy
+ before mutating it (see the sketch below):
+ df_copy = df.copy()  # explicit copy
+
+
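+ a short sketch of the copy-vs-view pitfall (toy data, assumed purely for
+ illustration):
+
+ import pandas as pd
+
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
+ sub = df[df['a'] > 1]    # may behave like a view or a copy
+ sub['b'] = 0             # SettingWithCopyWarning; df may stay unchanged
+
+ safe = df[df['a'] > 1].copy()   # explicit copy: edits stay local
+ safe['b'] = 0
+
+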
+ forgetting to reset_index after filtering
+
+ df_filtered = df[condition].reset_index(drop=True)
+
+
+ mixing loc and iloc
+
+ be consistent with either label-based or positional indexing
+
+
+ PHASE 15: MANDATORY RULES
+
+ while this skill is active, these rules are MANDATORY:
+
+ [1] ALWAYS CHECK DATA TYPES before operations
+     incorrect types cause unexpected results
+     verify with df.dtypes before processing
+
+ [2] NEVER USE LOOPS for DataFrame operations
+     vectorized operations are often orders of magnitude faster
+     prefer built-in vectorized methods; use df.apply() only when
+     no vectorized form exists
+
+ [3] ALWAYS HANDLE MISSING VALUES explicitly
+     decide how to handle: drop, fill, or leave
+     document your decision
+
+ [4] NEVER CHAIN INDEXING without .loc or .iloc
+     df[df.a > 10]['b'] = 5 is dangerous
+     use df.loc[df.a > 10, 'b'] = 5
+
+ [5] ALWAYS SAMPLE before full operations
+     test on a sample (df.sample(1000)); a workflow sketch follows these rules
+     verify correctness before the full dataset
+
+ [6] NEVER ASSUME INDEX is sorted
+     sort before operations if needed
+     df = df.sort_values('column')
+
+ [7] ALWAYS USE EFFICIENT DTYPES for large datasets
+     category for low-cardinality strings
+     int32/float32 instead of 64-bit
+
+ [8] NEVER FORGET inplace=False default
+     most operations return a new dataframe
+     assign the result back: df = df.dropna()
+
+ [9] ALWAYS VERIFY OUTPUT SHAPE and content
+     check df.shape after operations
+     df.head() to spot issues
+
+ [10] NEVER MIX loc and iloc in same operation
+     pick one and stick with it
+     be consistent in your code
+
+
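+ the sample-first workflow from rule [5], as a sketch (the transform here
+ is a stand-in for whatever operation you are about to run):
+
+ import pandas as pd
+
+ def transform(frame: pd.DataFrame) -> pd.DataFrame:
+     return frame.assign(doubled=frame['value'] * 2)
+
+ df = pd.DataFrame({'value': range(100_000)})
+ sample = df.sample(1000, random_state=0)
+ result = transform(sample)
+ assert result.shape[0] == sample.shape[0]   # rule [9]: verify shape
+ assert 'doubled' in result.columns
+
+ # only after the sample passes, run on the full dataset
+ full = transform(df)
+
+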
+ FINAL REMINDERS
+
+
+ pandas is powerful
+
+ use vectorized operations.
+ your code will be faster.
+ and more readable.
+
+
+ think in operations, not rows
+
+ not: for each row, do this
+ but: apply this operation to all rows
+
+
+ the chainable API
+
+ df.query('value > 100').groupby('category').agg({'value': 'mean'})
+
+ each step returns a new object.
+ chain them together.
+
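+ a slightly longer chained pipeline, as a sketch (the toy frame and its
+ columns are assumptions for illustration):
+
+ import pandas as pd
+
+ df = pd.DataFrame({
+     'category': ['A', 'B', 'A', 'B'],
+     'date': pd.to_datetime(['2024-01-05', '2024-01-20',
+                             '2024-02-03', '2024-02-10']),
+     'value': [150, 90, 200, 120],
+ })
+
+ summary = (
+     df.query('value > 100')
+       .assign(month=lambda d: d['date'].dt.month)
+       .groupby(['category', 'month'], as_index=False)
+       .agg(mean_value=('value', 'mean'))
+       .sort_values('mean_value', ascending=False)
+ )
+ print(summary)
+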
+
+ when stuck
+
+ [ ] read pandas documentation
+ [ ] check stackoverflow
+ [ ] use df.info() to understand structure
+ [ ] print df.head() to see data
+
+ the goal
+
+ clean data.
+ transform efficiently.
+ get insights fast.
+
+ now go analyze that data.