kollabor 0.4.9__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. agents/__init__.py +2 -0
  2. agents/coder/__init__.py +0 -0
  3. agents/coder/agent.json +4 -0
  4. agents/coder/api-integration.md +2150 -0
  5. agents/coder/cli-pretty.md +765 -0
  6. agents/coder/code-review.md +1092 -0
  7. agents/coder/database-design.md +1525 -0
  8. agents/coder/debugging.md +1102 -0
  9. agents/coder/dependency-management.md +1397 -0
  10. agents/coder/git-workflow.md +1099 -0
  11. agents/coder/refactoring.md +1454 -0
  12. agents/coder/security-hardening.md +1732 -0
  13. agents/coder/system_prompt.md +1448 -0
  14. agents/coder/tdd.md +1367 -0
  15. agents/creative-writer/__init__.py +0 -0
  16. agents/creative-writer/agent.json +4 -0
  17. agents/creative-writer/character-development.md +1852 -0
  18. agents/creative-writer/dialogue-craft.md +1122 -0
  19. agents/creative-writer/plot-structure.md +1073 -0
  20. agents/creative-writer/revision-editing.md +1484 -0
  21. agents/creative-writer/system_prompt.md +690 -0
  22. agents/creative-writer/worldbuilding.md +2049 -0
  23. agents/data-analyst/__init__.py +30 -0
  24. agents/data-analyst/agent.json +4 -0
  25. agents/data-analyst/data-visualization.md +992 -0
  26. agents/data-analyst/exploratory-data-analysis.md +1110 -0
  27. agents/data-analyst/pandas-data-manipulation.md +1081 -0
  28. agents/data-analyst/sql-query-optimization.md +881 -0
  29. agents/data-analyst/statistical-analysis.md +1118 -0
  30. agents/data-analyst/system_prompt.md +928 -0
  31. agents/default/__init__.py +0 -0
  32. agents/default/agent.json +4 -0
  33. agents/default/dead-code.md +794 -0
  34. agents/default/explore-agent-system.md +585 -0
  35. agents/default/system_prompt.md +1448 -0
  36. agents/kollabor/__init__.py +0 -0
  37. agents/kollabor/analyze-plugin-lifecycle.md +175 -0
  38. agents/kollabor/analyze-terminal-rendering.md +388 -0
  39. agents/kollabor/code-review.md +1092 -0
  40. agents/kollabor/debug-mcp-integration.md +521 -0
  41. agents/kollabor/debug-plugin-hooks.md +547 -0
  42. agents/kollabor/debugging.md +1102 -0
  43. agents/kollabor/dependency-management.md +1397 -0
  44. agents/kollabor/git-workflow.md +1099 -0
  45. agents/kollabor/inspect-llm-conversation.md +148 -0
  46. agents/kollabor/monitor-event-bus.md +558 -0
  47. agents/kollabor/profile-performance.md +576 -0
  48. agents/kollabor/refactoring.md +1454 -0
  49. agents/kollabor/system_prompt copy.md +1448 -0
  50. agents/kollabor/system_prompt.md +757 -0
  51. agents/kollabor/trace-command-execution.md +178 -0
  52. agents/kollabor/validate-config.md +879 -0
  53. agents/research/__init__.py +0 -0
  54. agents/research/agent.json +4 -0
  55. agents/research/architecture-mapping.md +1099 -0
  56. agents/research/codebase-analysis.md +1077 -0
  57. agents/research/dependency-audit.md +1027 -0
  58. agents/research/performance-profiling.md +1047 -0
  59. agents/research/security-review.md +1359 -0
  60. agents/research/system_prompt.md +492 -0
  61. agents/technical-writer/__init__.py +0 -0
  62. agents/technical-writer/agent.json +4 -0
  63. agents/technical-writer/api-documentation.md +2328 -0
  64. agents/technical-writer/changelog-management.md +1181 -0
  65. agents/technical-writer/readme-writing.md +1360 -0
  66. agents/technical-writer/style-guide.md +1410 -0
  67. agents/technical-writer/system_prompt.md +653 -0
  68. agents/technical-writer/tutorial-creation.md +1448 -0
  69. core/__init__.py +0 -2
  70. core/application.py +343 -88
  71. core/cli.py +229 -10
  72. core/commands/menu_renderer.py +463 -59
  73. core/commands/registry.py +14 -9
  74. core/commands/system_commands.py +2461 -14
  75. core/config/loader.py +151 -37
  76. core/config/service.py +18 -6
  77. core/events/bus.py +29 -9
  78. core/events/executor.py +205 -75
  79. core/events/models.py +27 -8
  80. core/fullscreen/command_integration.py +20 -24
  81. core/fullscreen/components/__init__.py +10 -1
  82. core/fullscreen/components/matrix_components.py +1 -2
  83. core/fullscreen/components/space_shooter_components.py +654 -0
  84. core/fullscreen/plugin.py +5 -0
  85. core/fullscreen/renderer.py +52 -13
  86. core/fullscreen/session.py +52 -15
  87. core/io/__init__.py +29 -5
  88. core/io/buffer_manager.py +6 -1
  89. core/io/config_status_view.py +7 -29
  90. core/io/core_status_views.py +267 -347
  91. core/io/input/__init__.py +25 -0
  92. core/io/input/command_mode_handler.py +711 -0
  93. core/io/input/display_controller.py +128 -0
  94. core/io/input/hook_registrar.py +286 -0
  95. core/io/input/input_loop_manager.py +421 -0
  96. core/io/input/key_press_handler.py +502 -0
  97. core/io/input/modal_controller.py +1011 -0
  98. core/io/input/paste_processor.py +339 -0
  99. core/io/input/status_modal_renderer.py +184 -0
  100. core/io/input_errors.py +5 -1
  101. core/io/input_handler.py +211 -2452
  102. core/io/key_parser.py +7 -0
  103. core/io/layout.py +15 -3
  104. core/io/message_coordinator.py +111 -2
  105. core/io/message_renderer.py +129 -4
  106. core/io/status_renderer.py +147 -607
  107. core/io/terminal_renderer.py +97 -51
  108. core/io/terminal_state.py +21 -4
  109. core/io/visual_effects.py +816 -165
  110. core/llm/agent_manager.py +1063 -0
  111. core/llm/api_adapters/__init__.py +44 -0
  112. core/llm/api_adapters/anthropic_adapter.py +432 -0
  113. core/llm/api_adapters/base.py +241 -0
  114. core/llm/api_adapters/openai_adapter.py +326 -0
  115. core/llm/api_communication_service.py +167 -113
  116. core/llm/conversation_logger.py +322 -16
  117. core/llm/conversation_manager.py +556 -30
  118. core/llm/file_operations_executor.py +84 -32
  119. core/llm/llm_service.py +934 -103
  120. core/llm/mcp_integration.py +541 -57
  121. core/llm/message_display_service.py +135 -18
  122. core/llm/plugin_sdk.py +1 -2
  123. core/llm/profile_manager.py +1183 -0
  124. core/llm/response_parser.py +274 -56
  125. core/llm/response_processor.py +16 -3
  126. core/llm/tool_executor.py +6 -1
  127. core/logging/__init__.py +2 -0
  128. core/logging/setup.py +34 -6
  129. core/models/resume.py +54 -0
  130. core/plugins/__init__.py +4 -2
  131. core/plugins/base.py +127 -0
  132. core/plugins/collector.py +23 -161
  133. core/plugins/discovery.py +37 -3
  134. core/plugins/factory.py +6 -12
  135. core/plugins/registry.py +5 -17
  136. core/ui/config_widgets.py +128 -28
  137. core/ui/live_modal_renderer.py +2 -1
  138. core/ui/modal_actions.py +5 -0
  139. core/ui/modal_overlay_renderer.py +0 -60
  140. core/ui/modal_renderer.py +268 -7
  141. core/ui/modal_state_manager.py +29 -4
  142. core/ui/widgets/base_widget.py +7 -0
  143. core/updates/__init__.py +10 -0
  144. core/updates/version_check_service.py +348 -0
  145. core/updates/version_comparator.py +103 -0
  146. core/utils/config_utils.py +685 -526
  147. core/utils/plugin_utils.py +1 -1
  148. core/utils/session_naming.py +111 -0
  149. fonts/LICENSE +21 -0
  150. fonts/README.md +46 -0
  151. fonts/SymbolsNerdFont-Regular.ttf +0 -0
  152. fonts/SymbolsNerdFontMono-Regular.ttf +0 -0
  153. fonts/__init__.py +44 -0
  154. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/METADATA +54 -4
  155. kollabor-0.4.15.dist-info/RECORD +228 -0
  156. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/top_level.txt +2 -0
  157. plugins/agent_orchestrator/__init__.py +39 -0
  158. plugins/agent_orchestrator/activity_monitor.py +181 -0
  159. plugins/agent_orchestrator/file_attacher.py +77 -0
  160. plugins/agent_orchestrator/message_injector.py +135 -0
  161. plugins/agent_orchestrator/models.py +48 -0
  162. plugins/agent_orchestrator/orchestrator.py +403 -0
  163. plugins/agent_orchestrator/plugin.py +976 -0
  164. plugins/agent_orchestrator/xml_parser.py +191 -0
  165. plugins/agent_orchestrator_plugin.py +9 -0
  166. plugins/enhanced_input/box_styles.py +1 -0
  167. plugins/enhanced_input/color_engine.py +19 -4
  168. plugins/enhanced_input/config.py +2 -2
  169. plugins/enhanced_input_plugin.py +61 -11
  170. plugins/fullscreen/__init__.py +6 -2
  171. plugins/fullscreen/example_plugin.py +1035 -222
  172. plugins/fullscreen/setup_wizard_plugin.py +592 -0
  173. plugins/fullscreen/space_shooter_plugin.py +131 -0
  174. plugins/hook_monitoring_plugin.py +436 -78
  175. plugins/query_enhancer_plugin.py +66 -30
  176. plugins/resume_conversation_plugin.py +1494 -0
  177. plugins/save_conversation_plugin.py +98 -32
  178. plugins/system_commands_plugin.py +70 -56
  179. plugins/tmux_plugin.py +154 -78
  180. plugins/workflow_enforcement_plugin.py +94 -92
  181. system_prompt/default.md +952 -886
  182. core/io/input_mode_manager.py +0 -402
  183. core/io/modal_interaction_handler.py +0 -315
  184. core/io/raw_input_processor.py +0 -946
  185. core/storage/__init__.py +0 -5
  186. core/storage/state_manager.py +0 -84
  187. core/ui/widget_integration.py +0 -222
  188. core/utils/key_reader.py +0 -171
  189. kollabor-0.4.9.dist-info/RECORD +0 -128
  190. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/WHEEL +0 -0
  191. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/entry_points.txt +0 -0
  192. {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/licenses/LICENSE +0 -0
agents/data-analyst/exploratory-data-analysis.md
@@ -0,0 +1,1110 @@
1
+ <!-- Exploratory Data Analysis skill - comprehensive data discovery and understanding -->
2
+
3
+ exploratory data analysis: DISCOVER INSIGHTS THROUGH SYSTEMATIC EXPLORATION
4
+
5
+ when this skill is active, you follow rigorous EDA methodology.
6
+ this is a comprehensive guide to understanding your data before modeling.
7
+
8
+
9
+ PHASE 0: EDA ENVIRONMENT VERIFICATION
10
+
11
+ before starting ANY analysis, verify your data science stack is ready.
12
+
13
+
14
+ verify python data packages
15
+
16
+ <terminal>python -c "import pandas as pd; print(f'pandas {pd.__version__} ready')"</terminal>
17
+
18
+ if pandas not available:
19
+ <terminal>pip install pandas numpy scipy</terminal>
20
+
21
+ verify visualization:
22
+ <terminal>python -c "import matplotlib.pyplot as plt; import seaborn as sns; print('viz packages ready')"</terminal>
23
+
24
+ if visualization not available:
25
+ <terminal>pip install matplotlib seaborn plotly</terminal>
26
+
27
+
28
+ verify data file accessibility
29
+
30
+ list data files:
31
+ <terminal>find . -maxdepth 2 -type f \( -name "*.csv" -o -name "*.json" -o -name "*.xlsx" -o -name "*.parquet" \)</terminal>
32
+
33
+ check file sizes:
34
+ <terminal>find . -maxdepth 2 -type f \( -name "*.csv" -o -name "*.json" -o -name "*.xlsx" -o -name "*.parquet" \) -exec ls -lh {} \;</terminal>
35
+
36
+ verify readability:
37
+ <terminal>python -c "import pandas as pd; df = pd.read_csv('data.csv'); print(f'shape: {df.shape}'); print(f'dtypes:\n{df.dtypes}')"</terminal>
38
+
39
+
40
+ check available memory
41
+
42
+ <terminal>python -c "import psutil; mem = psutil.virtual_memory(); print(f'total: {mem.total / 1e9:.2f} GB'); print(f'available: {mem.available / 1e9:.2f} GB')"</terminal>
43
+
44
+ if memory is limited for large datasets:
45
+ <terminal>pip install dask modin</terminal>
46
+
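+ a minimal out-of-core sketch with dask (assumes the install above succeeded and a file named large_data.csv exists; the pandas-only workflow below still applies):
+ <terminal>python -c "
+ import dask.dataframe as dd
+
+ # lazy load: nothing is read into memory until head()/compute()
+ ddf = dd.read_csv('large_data.csv')
+ print(f'partitions: {ddf.npartitions}')
+ print(ddf.head())
+ "</terminal>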
47
+
48
+ verify jupyter/lab notebooks (optional but recommended)
49
+
50
+ <terminal>jupyter --version 2>/dev/null || echo "jupyter not installed"</terminal>
51
+
52
+ if jupyter not installed:
53
+ <terminal>pip install jupyter jupyterlab</terminal>
54
+
55
+
56
+ PHASE 1: INITIAL DATA LOAD AND INSPECTION
57
+
58
+
59
+ load data with appropriate reader
60
+
61
+ csv files:
62
+ <terminal>python -c "
63
+ import pandas as pd
64
+ df = pd.read_csv('data.csv')  # pass parse_dates=[...] to parse known date columns on load
65
+ print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
66
+ "</terminal>
67
+
68
+ json files:
69
+ <terminal>python -c "
70
+ import pandas as pd
71
+ df = pd.read_json('data.json')
72
+ print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
73
+ "</terminal>
74
+
75
+ excel files:
76
+ <terminal>python -c "
77
+ import pandas as pd
78
+ df = pd.read_excel('data.xlsx', engine='openpyxl')
79
+ print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
80
+ "</terminal>
81
+
82
+ parquet files (for large data):
83
+ <terminal>python -c "
84
+ import pandas as pd
85
+ df = pd.read_parquet('data.parquet')
86
+ print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
87
+ "</terminal>
88
+
89
+
90
+ handle large datasets with chunking
91
+
92
+ chunked reading:
93
+ <terminal>python -c "
94
+ import pandas as pd
95
+
96
+ chunk_size = 10000
97
+ chunks = pd.read_csv('large_data.csv', chunksize=chunk_size)
98
+
99
+ total_rows = 0
100
+ for i, chunk in enumerate(chunks):
101
+ total_rows += len(chunk)
102
+ print(f'chunk {i}: {len(chunk)} rows')
103
+
104
+ print(f'total rows: {total_rows}')
105
+ "</terminal>
106
+
107
+ sample large dataset:
108
+ <terminal>python -c "
109
+ import pandas as pd
110
+
111
+ # read first N rows
112
+ df = pd.read_csv('large_data.csv', nrows=100000)
113
+ print(f'sample: {df.shape}')
114
+ "</terminal>
115
+
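+ reading only the first rows can bias the sample toward early records. a hedged alternative (assuming large_data.csv) is a random row sample via read_csv's callable skiprows:
+ <terminal>python -c "
+ import random
+ import pandas as pd
+
+ random.seed(42)
+
+ # keep roughly 1% of data rows at random; row 0 (the header) is always kept
+ df = pd.read_csv('large_data.csv', skiprows=lambda i: i > 0 and random.random() > 0.01)
+ print(f'random sample: {df.shape}')
+ "</terminal>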
116
+
117
+ basic data overview
118
+
119
+ shape and memory:
120
+ <terminal>python -c "
121
+ import pandas as pd
122
+
123
+ df = pd.read_csv('data.csv')
124
+ print(f'shape: {df.shape}')
125
+ print(f'memory usage: {df.memory_usage(deep=True).sum() / 1e6:.2f} MB')
126
+ print(f'\ncolumns: {list(df.columns)}')
127
+ "</terminal>
128
+
129
+ data types:
130
+ <terminal>python -c "
131
+ import pandas as pd
132
+
133
+ df = pd.read_csv('data.csv')
134
+ print('data types:')
135
+ print(df.dtypes)
136
+ print(f'\ntype distribution:')
137
+ print(df.dtypes.value_counts())
138
+ "</terminal>
139
+
140
+ sample data:
141
+ <terminal>python -c "
142
+ import pandas as pd
143
+
144
+ df = pd.read_csv('data.csv')
145
+ print('first 5 rows:')
146
+ print(df.head())
147
+ print('\nlast 5 rows:')
148
+ print(df.tail())
149
+ print('\nrandom sample:')
150
+ print(df.sample(5))
151
+ "</terminal>
152
+
153
+
154
+ PHASE 2: DATA QUALITY ASSESSMENT
155
+
156
+
157
+ missing value analysis
158
+
159
+ overall missing data:
160
+ <terminal>python -c "
161
+ import pandas as pd
162
+
163
+ df = pd.read_csv('data.csv')
164
+ missing = df.isnull().sum()
165
+ missing_pct = (missing / len(df)) * 100
166
+
167
+ print('missing values:')
168
+ print(pd.DataFrame({
169
+ 'count': missing,
170
+ 'percentage': missing_pct
171
+ }))
172
+ "</terminal>
173
+
174
+ missing data visualization:
175
+ <terminal>python -c "
176
+ import pandas as pd
177
+ import matplotlib.pyplot as plt
178
+ import seaborn as sns
179
+
180
+ df = pd.read_csv('data.csv')
181
+ missing = df.isnull()
182
+
183
+ plt.figure(figsize=(12, 8))
184
+ sns.heatmap(missing, cbar=False, cmap='viridis')
185
+ plt.title('missing data heatmap')
186
+ plt.tight_layout()
187
+ plt.savefig('missing_data_heatmap.png')
188
+ print('saved: missing_data_heatmap.png')
189
+ "</terminal>
190
+
191
+ missing data patterns:
192
+ <terminal>python -c "
193
+ import pandas as pd
194
+
195
+ df = pd.read_csv('data.csv')
196
+
197
+ # check if missing values are related
198
+ print('missing correlation:')
199
+ print(df.isnull().corr())
200
+
201
+ # check row-level missing
202
+ df['missing_count'] = df.isnull().sum(axis=1)
203
+ print(f'\nrows with missing data: {(df[\"missing_count\"] > 0).sum()}')
204
+ print(f'missing per row stats:')
205
+ print(df['missing_count'].describe())
206
+ "</terminal>
207
+
208
+
209
+ duplicate detection
210
+
211
+ exact duplicates:
212
+ <terminal>python -c "
213
+ import pandas as pd
214
+
215
+ df = pd.read_csv('data.csv')
216
+ duplicates = df.duplicated()
217
+
218
+ print(f'duplicate rows: {duplicates.sum()} ({duplicates.sum()/len(df)*100:.2f}%)')
219
+ print(f'\nunique rows: {len(df.drop_duplicates())}')
220
+ "</terminal>
221
+
222
+ subset duplicates:
223
+ <terminal>python -c "
224
+ import pandas as pd
225
+
226
+ df = pd.read_csv('data.csv')
227
+
228
+ # check duplicates on specific columns
229
+ subset_cols = ['id', 'name', 'date']
230
+ subset_dups = df.duplicated(subset=subset_cols)
231
+
232
+ print(f'duplicates on {subset_cols}: {subset_dups.sum()}')
233
+ "</terminal>
234
+
235
+ duplicate analysis:
236
+ <terminal>python -c "
237
+ import pandas as pd
238
+
239
+ df = pd.read_csv('data.csv')
240
+
241
+ # show duplicate examples
242
+ duplicates = df[df.duplicated(keep=False)]
243
+ print('duplicate examples:')
244
+ print(duplicates.sort_values(by=list(df.columns)).head(10))
245
+ "</terminal>
246
+
247
+
248
+ data type validation
249
+
250
+ type mismatch detection:
251
+ <terminal>python -c "
252
+ import pandas as pd
253
+
254
+ df = pd.read_csv('data.csv')
255
+
256
+ # check numeric columns with non-numeric values
257
+ for col in df.select_dtypes(include=['object']).columns:
258
+ # try to convert to numeric
259
+ try:
260
+ numeric = pd.to_numeric(df[col], errors='coerce')
261
+ non_numeric = numeric.isnull() & df[col].notnull()
262
+ if non_numeric.any():
263
+ print(f'{col}: {non_numeric.sum()} non-numeric values')
264
+ print(f' examples: {df.loc[non_numeric, col].head().tolist()}')
265
+ except Exception:
266
+ pass
267
+ "</terminal>
268
+
269
+ datetime validation:
270
+ <terminal>python -c "
271
+ import pandas as pd
272
+
273
+ df = pd.read_csv('data.csv')
274
+
275
+ # try to parse object columns as datetime
276
+ for col in df.select_dtypes(include=['object']).columns:
277
+ if 'date' in col.lower() or 'time' in col.lower():
278
+ try:
279
+ parsed = pd.to_datetime(df[col], errors='coerce')
280
+ failed = parsed.isnull() & df[col].notnull()
281
+ if failed.any():
282
+ print(f'{col}: {failed.sum()} invalid datetime values')
283
+ print(f' examples: {df.loc[failed, col].head().tolist()}')
284
+ except Exception:
285
+ pass
286
+ "</terminal>
287
+
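+ string inconsistency check (stray whitespace or mixed case can split one category into several; a minimal sketch assuming data.csv):
+ <terminal>python -c "
+ import pandas as pd
+
+ df = pd.read_csv('data.csv')
+
+ for col in df.select_dtypes(include=['object']).columns:
+     values = df[col].dropna().astype(str)
+     normalized = values.str.strip().str.lower()
+     if values.nunique() != normalized.nunique():
+         print(f'{col}: {values.nunique()} raw vs {normalized.nunique()} normalized values (check whitespace/case)')
+ "</terminal>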
288
+
289
+ outlier detection
290
+
291
+ statistical outliers (z-score):
292
+ <terminal>python -c "
293
+ import pandas as pd
294
+ import numpy as np
295
+
296
+ df = pd.read_csv('data.csv')
297
+
298
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
299
+
300
+ for col in numeric_cols:
301
+ z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
302
+ outliers = z_scores > 3
303
+ if outliers.any():
304
+ print(f'{col}: {outliers.sum()} outliers (z > 3)')
305
+ print(f' outlier values: {df.loc[outliers, col].describe()}')
306
+ "</terminal>
307
+
308
+ iqr outliers:
309
+ <terminal>python -c "
310
+ import pandas as pd
311
+ import numpy as np
312
+
313
+ df = pd.read_csv('data.csv')
314
+
315
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
316
+
317
+ for col in numeric_cols:
318
+ q1 = df[col].quantile(0.25)
319
+ q3 = df[col].quantile(0.75)
320
+ iqr = q3 - q1
321
+ lower_bound = q1 - 1.5 * iqr
322
+ upper_bound = q3 + 1.5 * iqr
323
+
324
+ outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
325
+ if outliers.any():
326
+ print(f'{col}: {outliers.sum()} outliers (iqr method)')
327
+ print(f' bounds: [{lower_bound:.2f}, {upper_bound:.2f}]')
328
+ "</terminal>
329
+
330
+ visual outlier detection:
331
+ <terminal>python -c "
332
+ import pandas as pd
333
+ import matplotlib.pyplot as plt
334
+
335
+ df = pd.read_csv('data.csv')
336
+ numeric_cols = df.select_dtypes(include=['number']).columns
337
+
338
+ fig, axes = plt.subplots(len(numeric_cols), 1, figsize=(10, 5*len(numeric_cols)))
339
+ if len(numeric_cols) == 1:
340
+ axes = [axes]
341
+
342
+ for i, col in enumerate(numeric_cols):
343
+ df[col].plot(kind='box', ax=axes[i])
344
+ axes[i].set_title(f'{col} boxplot')
345
+
346
+ plt.tight_layout()
347
+ plt.savefig('outlier_boxplots.png')
348
+ print('saved: outlier_boxplots.png')
349
+ "</terminal>
350
+
351
+
352
+ PHASE 3: UNIVARIATE ANALYSIS
353
+
354
+
355
+ numeric variable analysis
356
+
357
+ descriptive statistics:
358
+ <terminal>python -c "
359
+ import pandas as pd
360
+
361
+ df = pd.read_csv('data.csv')
362
+ numeric_cols = df.select_dtypes(include=['number']).columns
363
+
364
+ print('descriptive statistics:')
365
+ print(df[numeric_cols].describe().transpose())
366
+
367
+ print('\nskewness:')
368
+ print(df[numeric_cols].skew())
369
+
370
+ print('\nkurtosis:')
371
+ print(df[numeric_cols].kurtosis())
372
+ "</terminal>
373
+
374
+ distribution visualization:
375
+ <terminal>python -c "
376
+ import pandas as pd
377
+ import matplotlib.pyplot as plt
378
+ import seaborn as sns
379
+
380
+ df = pd.read_csv('data.csv')
381
+ numeric_cols = df.select_dtypes(include=['number']).columns
382
+
383
+ fig, axes = plt.subplots(len(numeric_cols), 2, figsize=(15, 5*len(numeric_cols)))
384
+ if len(numeric_cols) == 1:
385
+ axes = axes.reshape(1, -1)
386
+
387
+ for i, col in enumerate(numeric_cols):
388
+ # histogram
389
+ df[col].hist(ax=axes[i, 0], bins=50)
390
+ axes[i, 0].set_title(f'{col} distribution')
391
+ axes[i, 0].set_xlabel(col)
392
+ axes[i, 0].set_ylabel('frequency')
393
+
394
+ # density plot
395
+ df[col].plot(kind='kde', ax=axes[i, 1])
396
+ axes[i, 1].set_title(f'{col} density')
397
+ axes[i, 1].set_xlabel(col)
398
+ axes[i, 1].set_ylabel('density')
399
+
400
+ plt.tight_layout()
401
+ plt.savefig('numeric_distributions.png')
402
+ print('saved: numeric_distributions.png')
403
+ "</terminal>
404
+
405
+ normality tests:
406
+ <terminal>python -c "
407
+ import pandas as pd
408
+ from scipy import stats
409
+
410
+ df = pd.read_csv('data.csv')
411
+ numeric_cols = df.select_dtypes(include=['number']).columns
412
+
413
+ for col in numeric_cols:
414
+ data = df[col].dropna()
415
+
416
+ # shapiro-wilk test (for small samples)
417
+ if len(data) < 5000:
418
+ stat, p = stats.shapiro(data)
419
+ print(f'{col}: shapiro-wilk p={p:.4f} (normal={p > 0.05})')
420
+ else:
421
+ # kolmogorov-smirnov test (for large samples)
422
+ stat, p = stats.kstest((data - data.mean()) / data.std(), 'norm')  # standardize before comparing to the standard normal
423
+ print(f'{col}: ks-test p={p:.4f} (normal={p > 0.05})')
424
+ "</terminal>
425
+
426
+
427
+ categorical variable analysis
428
+
429
+ value counts:
430
+ <terminal>python -c "
431
+ import pandas as pd
432
+
433
+ df = pd.read_csv('data.csv')
434
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
435
+
436
+ for col in categorical_cols:
437
+ print(f'\n{col}:')
438
+ print(f' unique values: {df[col].nunique()}')
439
+ print(f' top 5 values:')
440
+ print(df[col].value_counts().head())
441
+ "</terminal>
442
+
443
+ cardinality analysis:
444
+ <terminal>python -c "
445
+ import pandas as pd
446
+
447
+ df = pd.read_csv('data.csv')
448
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
449
+
450
+ print('cardinality analysis:')
451
+ cardinality = pd.DataFrame({
452
+ 'unique': df[categorical_cols].nunique(),
453
+ 'total': len(df),
454
+ 'ratio': df[categorical_cols].nunique() / len(df)
455
+ })
456
+ print(cardinality)
457
+
458
+ print('\nhigh cardinality columns (> 0.5):')
459
+ high_card = cardinality[cardinality['ratio'] > 0.5]
460
+ if not high_card.empty:
461
+ print(high_card)
462
+ "</terminal>
463
+
464
+ categorical visualization:
465
+ <terminal>python -c "
466
+ import pandas as pd
467
+ import matplotlib.pyplot as plt
468
+ import seaborn as sns
469
+
470
+ df = pd.read_csv('data.csv')
471
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
472
+
473
+ # plot top categories for each column
474
+ for col in categorical_cols:
475
+ if df[col].nunique() <= 20:
476
+ plt.figure(figsize=(10, 6))
477
+ df[col].value_counts().plot(kind='bar')
478
+ plt.title(f'{col} distribution')
479
+ plt.xticks(rotation=45)
480
+ plt.tight_layout()
481
+ plt.savefig(f'{col}_distribution.png')
482
+ plt.close()
483
+ print(f'saved: {col}_distribution.png')
484
+ "</terminal>
485
+
486
+
487
+ datetime variable analysis
488
+
489
+ date range:
490
+ <terminal>python -c "
491
+ import pandas as pd
492
+
493
+ df = pd.read_csv('data.csv')
494
+
495
+ # identify datetime columns
496
+ datetime_cols = []
497
+ for col in df.select_dtypes(include=['object']).columns:
+     parsed = pd.to_datetime(df[col], errors='coerce')
+     # treat a column as datetime only if most values parse, and keep the parsed version
+     if parsed.notnull().mean() > 0.8:
+         df[col] = parsed
+         datetime_cols.append(col)
504
+
505
+ for col in datetime_cols:
506
+ print(f'\n{col}:')
507
+ print(f' range: {df[col].min()} to {df[col].max()}')
508
+ print(f' span: {(df[col].max() - df[col].min()).days} days')
509
+ print(f' missing: {df[col].isnull().sum()}')
510
+ "</terminal>
511
+
512
+ temporal patterns:
513
+ <terminal>python -c "
514
+ import pandas as pd
515
+ import matplotlib.pyplot as plt
516
+
517
+ df = pd.read_csv('data.csv')
518
+
519
+ # parse the first date-like column; read_csv loads dates as object unless parse_dates is set
+ date_col = [c for c in df.columns if 'date' in c.lower() or 'time' in c.lower()][0]
+ df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
521
+
522
+ # extract time components
523
+ df['year'] = df[date_col].dt.year
524
+ df['month'] = df[date_col].dt.month
525
+ df['day'] = df[date_col].dt.day
526
+ df['weekday'] = df[date_col].dt.weekday
527
+ df['hour'] = df[date_col].dt.hour
528
+
529
+ fig, axes = plt.subplots(2, 2, figsize=(15, 10))
530
+
531
+ # yearly pattern
532
+ df['year'].value_counts().sort_index().plot(kind='bar', ax=axes[0,0])
533
+ axes[0,0].set_title('yearly pattern')
534
+
535
+ # monthly pattern
536
+ df['month'].value_counts().sort_index().plot(kind='bar', ax=axes[0,1])
537
+ axes[0,1].set_title('monthly pattern')
538
+
539
+ # weekday pattern
540
+ df['weekday'].value_counts().sort_index().plot(kind='bar', ax=axes[1,0])
541
+ axes[1,0].set_title('weekday pattern')
542
+
543
+ # hourly pattern
544
+ df['hour'].value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
545
+ axes[1,1].set_title('hourly pattern')
546
+
547
+ plt.tight_layout()
548
+ plt.savefig('temporal_patterns.png')
549
+ print('saved: temporal_patterns.png')
550
+ "</terminal>
551
+
552
+
553
+ PHASE 4: BIVARIATE ANALYSIS
554
+
555
+
556
+ numeric-numeric relationships
557
+
558
+ correlation matrix:
559
+ <terminal>python -c "
560
+ import pandas as pd
561
+ import matplotlib.pyplot as plt
562
+ import seaborn as sns
563
+
564
+ df = pd.read_csv('data.csv')
565
+ numeric_cols = df.select_dtypes(include=['number']).columns
566
+
567
+ # compute correlation matrix
568
+ corr = df[numeric_cols].corr()
569
+
570
+ # plot heatmap
571
+ plt.figure(figsize=(12, 10))
572
+ sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
573
+ plt.title('correlation matrix')
574
+ plt.tight_layout()
575
+ plt.savefig('correlation_matrix.png')
576
+ print('saved: correlation_matrix.png')
577
+ "</terminal>
578
+
579
+ scatter plot matrix:
580
+ <terminal>python -c "
581
+ import pandas as pd
582
+ import matplotlib.pyplot as plt
583
+ import seaborn as sns
584
+
585
+ df = pd.read_csv('data.csv')
586
+ numeric_cols = df.select_dtypes(include=['number']).columns[:5] # limit to first 5
587
+
588
+ # pairplot
589
+ sns.pairplot(df[numeric_cols], diag_kind='kde')
590
+ plt.tight_layout()
591
+ plt.savefig('scatter_matrix.png')
592
+ print('saved: scatter_matrix.png')
593
+ "</terminal>
594
+
595
+ strong correlations:
596
+ <terminal>python -c "
597
+ import pandas as pd
598
+
599
+ df = pd.read_csv('data.csv')
600
+ numeric_cols = df.select_dtypes(include=['number']).columns
601
+ corr = df[numeric_cols].corr()
602
+
603
+ # find strong correlations (|r| > 0.7)
604
+ strong_corr = []
605
+ for i in range(len(corr.columns)):
606
+ for j in range(i+1, len(corr.columns)):
607
+ if abs(corr.iloc[i, j]) > 0.7:
608
+ strong_corr.append({
609
+ 'var1': corr.columns[i],
610
+ 'var2': corr.columns[j],
611
+ 'correlation': corr.iloc[i, j]
612
+ })
613
+
614
+ if strong_corr:
615
+ strong_corr_df = pd.DataFrame(strong_corr)
616
+ strong_corr_df = strong_corr_df.sort_values('correlation', key=abs, ascending=False)
617
+ print('strong correlations:')
618
+ print(strong_corr_df)
619
+ else:
620
+ print('no strong correlations found')
621
+ "</terminal>
622
+
623
+
624
+ categorical-categorical relationships
625
+
626
+ crosstab analysis:
627
+ <terminal>python -c "
628
+ import pandas as pd
629
+
630
+ df = pd.read_csv('data.csv')
631
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:4]
632
+
633
+ for i in range(len(categorical_cols)):
634
+ for j in range(i+1, len(categorical_cols)):
635
+ col1, col2 = categorical_cols[i], categorical_cols[j]
636
+ if df[col1].nunique() <= 10 and df[col2].nunique() <= 10:
637
+ print(f'\n{col1} vs {col2}:')
638
+ crosstab = pd.crosstab(df[col1], df[col2])
639
+ print(crosstab)
640
+ "</terminal>
641
+
642
+ chi-square test:
643
+ <terminal>python -c "
644
+ import pandas as pd
645
+ from scipy import stats
646
+
647
+ df = pd.read_csv('data.csv')
648
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:4]
649
+
650
+ for i in range(len(categorical_cols)):
651
+ for j in range(i+1, len(categorical_cols)):
652
+ col1, col2 = categorical_cols[i], categorical_cols[j]
653
+ if df[col1].nunique() <= 10 and df[col2].nunique() <= 10:
654
+ crosstab = pd.crosstab(df[col1], df[col2])
655
+ chi2, p, dof, expected = stats.chi2_contingency(crosstab)
656
+ print(f'{col1} vs {col2}: chi2={chi2:.2f}, p={p:.4f}, significant={p < 0.05}')
657
+ "</terminal>
658
+
659
+
660
+ numeric-categorical relationships
661
+
662
+ group statistics:
663
+ <terminal>python -c "
664
+ import pandas as pd
665
+
666
+ df = pd.read_csv('data.csv')
667
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:3]
668
+ numeric_cols = df.select_dtypes(include=['number']).columns[:3]
669
+
670
+ for cat_col in categorical_cols:
671
+ if df[cat_col].nunique() <= 10:
672
+ print(f'\n{cat_col}:')
673
+ for num_col in numeric_cols:
674
+ group_stats = df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count'])
675
+ print(f' {num_col}:')
676
+ print(group_stats)
677
+ "</terminal>
678
+
679
+ boxplot by category:
680
+ <terminal>python -c "
681
+ import pandas as pd
682
+ import matplotlib.pyplot as plt
683
+ import seaborn as sns
684
+
685
+ df = pd.read_csv('data.csv')
686
+ cat_col = df.select_dtypes(include=['object', 'category']).columns[0]
687
+ num_col = df.select_dtypes(include=['number']).columns[0]
688
+
689
+ if df[cat_col].nunique() <= 10:
690
+ plt.figure(figsize=(12, 6))
691
+ sns.boxplot(x=cat_col, y=num_col, data=df)
692
+ plt.title(f'{num_col} by {cat_col}')
693
+ plt.xticks(rotation=45)
694
+ plt.tight_layout()
695
+ plt.savefig(f'{num_col}_by_{cat_col}.png')
696
+ print(f'saved: {num_col}_by_{cat_col}.png')
697
+ "</terminal>
698
+
699
+ anova test:
700
+ <terminal>python -c "
701
+ import pandas as pd
702
+ from scipy import stats
703
+
704
+ df = pd.read_csv('data.csv')
705
+ cat_col = df.select_dtypes(include=['object', 'category']).columns[0]
706
+ num_col = df.select_dtypes(include=['number']).columns[0]
707
+
708
+ if df[cat_col].nunique() <= 10:
709
+ groups = [group[num_col].dropna() for name, group in df.groupby(cat_col)]
710
+ f_stat, p_value = stats.f_oneway(*groups)
711
+ print(f'anova test for {num_col} by {cat_col}:')
712
+ print(f' f-statistic: {f_stat:.4f}')
713
+ print(f' p-value: {p_value:.4f}')
714
+ print(f' significant: {p_value < 0.05}')
715
+ "</terminal>
716
+
717
+
718
+ PHASE 5: MULTIVARIATE ANALYSIS
719
+
720
+
721
+ dimensionality reduction visualization
722
+
723
+ pca scatter plot:
724
+ <terminal>python -c "
725
+ import pandas as pd
726
+ from sklearn.decomposition import PCA
727
+ from sklearn.preprocessing import StandardScaler
728
+ import matplotlib.pyplot as plt
729
+
730
+ df = pd.read_csv('data.csv')
731
+ numeric_cols = df.select_dtypes(include=['number']).columns
732
+
733
+ # prepare data
734
+ data = df[numeric_cols].dropna()
735
+ scaler = StandardScaler()
736
+ scaled_data = scaler.fit_transform(data)
737
+
738
+ # perform pca
739
+ pca = PCA(n_components=2)
740
+ pca_result = pca.fit_transform(scaled_data)
741
+
742
+ # plot
743
+ plt.figure(figsize=(10, 8))
744
+ plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
745
+ plt.xlabel('pc1 (explained variance: {:.2%})'.format(pca.explained_variance_ratio_[0]))
746
+ plt.ylabel('pc2 (explained variance: {:.2%})'.format(pca.explained_variance_ratio_[1]))
747
+ plt.title('pca scatter plot')
748
+ plt.savefig('pca_scatter.png')
749
+ print('saved: pca_scatter.png')
750
+ print(f'total variance explained: {pca.explained_variance_ratio_.sum():.2%}')
751
+ "</terminal>
752
+
753
+ pca loading analysis:
754
+ <terminal>python -c "
755
+ import pandas as pd
756
+ from sklearn.decomposition import PCA
757
+ from sklearn.preprocessing import StandardScaler
758
+
759
+ df = pd.read_csv('data.csv')
760
+ numeric_cols = df.select_dtypes(include=['number']).columns
761
+
762
+ # prepare data
763
+ data = df[numeric_cols].dropna()
764
+ scaler = StandardScaler()
765
+ scaled_data = scaler.fit_transform(data)
766
+
767
+ # perform pca
768
+ pca = PCA()
769
+ pca_result = pca.fit_transform(scaled_data)
770
+
771
+ # loading matrix
772
+ loadings = pd.DataFrame(
773
+ pca.components_.T,
774
+ columns=[f'pc{i+1}' for i in range(len(numeric_cols))],
775
+ index=numeric_cols
776
+ )
777
+
778
+ print('pca loadings (top 3 components):')
779
+ print(loadings.iloc[:, :3])
780
+
781
+ print('\nexplained variance ratio:')
782
+ print(pca.explained_variance_ratio_[:10])
783
+ "</terminal>
784
+
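+ cluster detection on the pca projection (covers the checklist item on clusters/groups; a minimal k-means sketch assuming scikit-learn is installed, data.csv exists, and k=3 is only a starting guess):
+ <terminal>python -c "
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import KMeans
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+
+ df = pd.read_csv('data.csv')
+ numeric_cols = df.select_dtypes(include=['number']).columns
+
+ # standardize and project to 2 components, as in the pca steps above
+ data = df[numeric_cols].dropna()
+ pca_result = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(data))
+
+ # k-means with a guessed k=3; inspect inertia or silhouette scores to choose k properly
+ labels = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(pca_result)
+
+ plt.figure(figsize=(10, 8))
+ plt.scatter(pca_result[:, 0], pca_result[:, 1], c=labels, alpha=0.5)
+ plt.title('k-means clusters on pca projection (k=3)')
+ plt.savefig('pca_clusters.png')
+ print('saved: pca_clusters.png')
+ "</terminal>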
785
+
786
+ feature importance analysis
787
+
788
+ random forest feature importance:
789
+ <terminal>python -c "
790
+ import pandas as pd
791
+ from sklearn.ensemble import RandomForestRegressor
792
+ from sklearn.preprocessing import LabelEncoder
793
+ import matplotlib.pyplot as plt
794
+
795
+ df = pd.read_csv('data.csv')
796
+
797
+ # prepare features
798
+ numeric_cols = df.select_dtypes(include=['number']).columns
799
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
800
+
801
+ # encode categorical variables
802
+ for col in categorical_cols:
803
+ df[col] = LabelEncoder().fit_transform(df[col].astype(str))
804
+
805
+ # use all columns as features
806
+ features = df.dropna()
807
+ X = features.drop(columns=[numeric_cols[0]])  # drop the target so it does not leak into the features
808
+ y = features[numeric_cols[0]] # use first numeric as target
809
+
810
+ # train random forest
811
+ rf = RandomForestRegressor(n_estimators=100, random_state=42)
812
+ rf.fit(X, y)
813
+
814
+ # feature importance
815
+ importance = pd.DataFrame({
816
+ 'feature': X.columns,
817
+ 'importance': rf.feature_importances_
818
+ }).sort_values('importance', ascending=False)
819
+
820
+ print('feature importance:')
821
+ print(importance.head(10))
822
+
823
+ # plot
824
+ plt.figure(figsize=(10, 6))
825
+ plt.barh(importance['feature'].head(10), importance['importance'].head(10))
826
+ plt.gca().invert_yaxis()
827
+ plt.title('feature importance (random forest)')
828
+ plt.tight_layout()
829
+ plt.savefig('feature_importance.png')
830
+ print('saved: feature_importance.png')
831
+ "</terminal>
832
+
833
+ mutual information:
834
+ <terminal>python -c "
835
+ import pandas as pd
836
+ from sklearn.feature_selection import mutual_info_regression
837
+ from sklearn.preprocessing import LabelEncoder
838
+ import matplotlib.pyplot as plt
839
+
840
+ df = pd.read_csv('data.csv')
841
+
842
+ # prepare data
843
+ numeric_cols = df.select_dtypes(include=['number']).columns
844
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
845
+
846
+ # encode categorical variables
847
+ for col in categorical_cols:
848
+ df[col] = LabelEncoder().fit_transform(df[col].astype(str))
849
+
850
+ # prepare features and target
851
+ features = df.dropna()
852
+ X = features[features.columns.difference([numeric_cols[0]])]
853
+ y = features[numeric_cols[0]]
854
+
855
+ # calculate mutual information
856
+ mi = mutual_info_regression(X, y)
857
+
858
+ # create dataframe
859
+ mi_df = pd.DataFrame({
860
+ 'feature': X.columns,
861
+ 'mutual_info': mi
862
+ }).sort_values('mutual_info', ascending=False)
863
+
864
+ print('mutual information:')
865
+ print(mi_df.head(10))
866
+
867
+ # plot
868
+ plt.figure(figsize=(10, 6))
869
+ plt.barh(mi_df['feature'].head(10), mi_df['mutual_info'].head(10))
870
+ plt.gca().invert_yaxis()
871
+ plt.title('mutual information')
872
+ plt.tight_layout()
873
+ plt.savefig('mutual_information.png')
874
+ print('saved: mutual_information.png')
875
+ "</terminal>
876
+
877
+
878
+ PHASE 6: DATA QUALITY REPORT
879
+
880
+
881
+ generate comprehensive report
882
+
883
+ <terminal>python -c "
884
+ import pandas as pd
885
+ import numpy as np
886
+ from datetime import datetime
887
+
888
+ # load data
889
+ df = pd.read_csv('data.csv')
890
+
891
+ # initialize report
892
+ report = []
893
+ report.append('=' * 80)
894
+ report.append('exploratory data analysis report')
895
+ report.append('=' * 80)
896
+ report.append(f'generated: {datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}')
897
+ report.append('')
898
+
899
+ # dataset overview
900
+ report.append('dataset overview')
901
+ report.append('-' * 40)
902
+ report.append(f'rows: {len(df):,}')
903
+ report.append(f'columns: {len(df.columns)}')
904
+ report.append(f'memory usage: {df.memory_usage(deep=True).sum() / 1e6:.2f} mb')
905
+ report.append('')
906
+
907
+ # data types
908
+ report.append('data types')
909
+ report.append('-' * 40)
910
+ dtypes = df.dtypes.value_counts()
911
+ for dtype, count in dtypes.items():
912
+ report.append(f' {dtype}: {count}')
913
+ report.append('')
914
+
915
+ # missing data
916
+ report.append('missing data')
917
+ report.append('-' * 40)
918
+ missing = df.isnull().sum()
919
+ total_missing = missing.sum()
920
+ report.append(f'total missing: {total_missing:,}')
921
+ report.append(f'percentage: {total_missing / (len(df) * len(df.columns)) * 100:.2f}%')
922
+ report.append('')
923
+
924
+ for col, count in missing[missing > 0].items():
925
+ pct = (count / len(df)) * 100
926
+ report.append(f' {col}: {count:,} ({pct:.2f}%)')
927
+ report.append('')
928
+
929
+ # numeric statistics
930
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
931
+ if not numeric_cols.empty:
932
+ report.append('numeric statistics')
933
+ report.append('-' * 40)
934
+ for col in numeric_cols:
935
+ report.append(f' {col}:')
936
+ report.append(f' mean: {df[col].mean():.2f}')
937
+ report.append(f' std: {df[col].std():.2f}')
938
+ report.append(f' min: {df[col].min():.2f}')
939
+ report.append(f' max: {df[col].max():.2f}')
940
+ report.append(f' missing: {df[col].isnull().sum():,}')
941
+ report.append('')
942
+
943
+ # categorical statistics
944
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
945
+ if not categorical_cols.empty:
946
+ report.append('categorical statistics')
947
+ report.append('-' * 40)
948
+ for col in categorical_cols:
949
+ report.append(f' {col}:')
950
+ report.append(f' unique values: {df[col].nunique():,}')
951
+ report.append(f' most common: {df[col].mode()[0] if not df[col].mode().empty else \"none\"}')
952
+ report.append(f' missing: {df[col].isnull().sum():,}')
953
+ report.append('')
954
+
955
+ # duplicates
956
+ duplicates = df.duplicated().sum()
957
+ report.append('data quality')
958
+ report.append('-' * 40)
959
+ report.append(f'duplicate rows: {duplicates:,}')
960
+ report.append(f'duplicate percentage: {duplicates / len(df) * 100:.2f}%')
961
+ report.append('')
962
+
963
+ # save report
964
+ report_text = '\n'.join(report)
965
+ with open('eda_report.txt', 'w') as f:
966
+ f.write(report_text)
967
+
968
+ print('saved: eda_report.txt')
969
+ print(report_text)
970
+ "</terminal>
971
+
972
+
973
+ PHASE 7: EDA CHECKLIST
974
+
975
+
976
+ initial inspection
977
+
978
+ [ ] loaded data successfully
979
+ [ ] verified data shape and columns
980
+ [ ] checked data types
981
+ [ ] examined sample data
982
+ [ ] checked memory usage
983
+
984
+
985
+ data quality
986
+
987
+ [ ] identified missing values
988
+ [ ] quantified missing data patterns
989
+ [ ] detected duplicate records
990
+ [ ] identified outliers
991
+ [ ] validated data types
992
+ [ ] checked for inconsistencies
993
+
994
+
995
+ univariate analysis
996
+
997
+ [ ] analyzed numeric distributions
998
+ [ ] checked for normality
999
+ [ ] examined categorical frequencies
1000
+ [ ] analyzed temporal patterns
1001
+ [ ] identified high cardinality columns
1002
+
1003
+
1004
+ bivariate analysis
1005
+
1006
+ [ ] computed correlation matrix
1007
+ [ ] identified strong correlations
1008
+ [ ] analyzed categorical relationships
1009
+ [ ] examined group differences
1010
+ [ ] performed significance tests
1011
+
1012
+
1013
+ multivariate analysis
1014
+
1015
+ [ ] performed dimensionality reduction
1016
+ [ ] analyzed feature importance
1017
+ [ ] identified key patterns
1018
+ [ ] detected clusters or groups
1019
+
1020
+
1021
+ documentation
1022
+
1023
+ [ ] saved all visualizations
1024
+ [ ] generated summary report
1025
+ [ ] documented findings
1026
+ [ ] noted data quality issues
1027
+ [ ] suggested next steps
1028
+
1029
+
1030
+ PHASE 8: EDA RULES (MANDATORY)
1031
+
1032
+
1033
+ while this skill is active, these rules are MANDATORY:
1034
+
1035
+ [1] ALWAYS START WITH DATA INSPECTION
1036
+ never jump to modeling without understanding the data
1037
+ examine structure, types, and basic statistics first
1038
+
1039
+ [2] VISUALIZE EVERYTHING
1040
+ use plots to understand distributions and relationships
1041
+ visual patterns reveal insights that statistics miss
1042
+
1043
+ [3] CHECK DATA QUALITY FIRST
1044
+ identify missing values, duplicates, and outliers early
1045
+ poor data quality leads to poor insights
1046
+
1047
+ [4] UNDERSTAND DISTRIBUTIONS
1048
+ know if your data is normal, skewed, or multimodal
1049
+ distribution assumptions impact statistical tests
1050
+
1051
+ [5] EXPLORE RELATIONSHIPS
1052
+ examine correlations and associations between variables
1053
+ relationships drive predictive modeling
1054
+
1055
+ [6] DOCUMENT EVERYTHING
1056
+ save plots, code, and findings
1057
+ others should be able to reproduce your analysis
1058
+
1059
+ [7] BE SKEPTICAL OF OUTLIERS
1060
+ investigate before removing
1061
+ outliers might be errors or important signals (see the flag-and-inspect sketch after these rules)
1062
+
1063
+ [8] CONSIDER SAMPLE BIAS
1064
+ understand who/what your data represents
1065
+ bias limits generalizability
1066
+
1067
+ [9] ITERATE AND REFINE
1068
+ initial findings suggest new questions
1069
+ follow interesting threads
1070
+
1071
+ [10] COMMUNICATE FINDINGS
1072
+ explain what you found and why it matters
1073
+ insights are only valuable if they're understood
1074
+
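+ flag-and-inspect sketch for rule [7] (a minimal example assuming data.csv; it tags iqr outliers in a new column instead of dropping them, so they can be reviewed first):
+ <terminal>python -c "
+ import pandas as pd
+ import numpy as np
+
+ df = pd.read_csv('data.csv')
+ col = df.select_dtypes(include=[np.number]).columns[0]
+
+ q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
+ iqr = q3 - q1
+
+ # flag instead of dropping, then inspect the flagged rows in full
+ df['is_outlier'] = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
+ print(f'flagged: {df[\"is_outlier\"].sum()} rows')
+ print(df[df['is_outlier']].head())
+ "</terminal>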
1075
+
1076
+ FINAL REMINDERS
1077
+
1078
+
1079
+ eda is exploration
1080
+
1081
+ you don't know what you'll find.
1082
+ follow your curiosity.
1083
+ investigate anomalies.
1084
+
1085
+
1086
+ patterns lead to insights
1087
+
1088
+ look for:
1089
+ - unexpected correlations
1090
+ - unusual distributions
1091
+ - hidden groups
1092
+ - temporal trends
1093
+ - outliers that matter
1094
+
1095
+
1096
+ questions drive analysis
1097
+
1098
+ every plot should answer a question.
1099
+ every test should address a hypothesis.
1100
+ every finding should generate new questions.
1101
+
1102
+
1103
+ the goal
1104
+
1105
+ not just to describe the data.
1106
+ to understand what it tells us.
1107
+ to guide decisions.
1108
+ to inspire further investigation.
1109
+
1110
+ now explore your data.