kollabor 0.4.9__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/__init__.py +2 -0
- agents/coder/__init__.py +0 -0
- agents/coder/agent.json +4 -0
- agents/coder/api-integration.md +2150 -0
- agents/coder/cli-pretty.md +765 -0
- agents/coder/code-review.md +1092 -0
- agents/coder/database-design.md +1525 -0
- agents/coder/debugging.md +1102 -0
- agents/coder/dependency-management.md +1397 -0
- agents/coder/git-workflow.md +1099 -0
- agents/coder/refactoring.md +1454 -0
- agents/coder/security-hardening.md +1732 -0
- agents/coder/system_prompt.md +1448 -0
- agents/coder/tdd.md +1367 -0
- agents/creative-writer/__init__.py +0 -0
- agents/creative-writer/agent.json +4 -0
- agents/creative-writer/character-development.md +1852 -0
- agents/creative-writer/dialogue-craft.md +1122 -0
- agents/creative-writer/plot-structure.md +1073 -0
- agents/creative-writer/revision-editing.md +1484 -0
- agents/creative-writer/system_prompt.md +690 -0
- agents/creative-writer/worldbuilding.md +2049 -0
- agents/data-analyst/__init__.py +30 -0
- agents/data-analyst/agent.json +4 -0
- agents/data-analyst/data-visualization.md +992 -0
- agents/data-analyst/exploratory-data-analysis.md +1110 -0
- agents/data-analyst/pandas-data-manipulation.md +1081 -0
- agents/data-analyst/sql-query-optimization.md +881 -0
- agents/data-analyst/statistical-analysis.md +1118 -0
- agents/data-analyst/system_prompt.md +928 -0
- agents/default/__init__.py +0 -0
- agents/default/agent.json +4 -0
- agents/default/dead-code.md +794 -0
- agents/default/explore-agent-system.md +585 -0
- agents/default/system_prompt.md +1448 -0
- agents/kollabor/__init__.py +0 -0
- agents/kollabor/analyze-plugin-lifecycle.md +175 -0
- agents/kollabor/analyze-terminal-rendering.md +388 -0
- agents/kollabor/code-review.md +1092 -0
- agents/kollabor/debug-mcp-integration.md +521 -0
- agents/kollabor/debug-plugin-hooks.md +547 -0
- agents/kollabor/debugging.md +1102 -0
- agents/kollabor/dependency-management.md +1397 -0
- agents/kollabor/git-workflow.md +1099 -0
- agents/kollabor/inspect-llm-conversation.md +148 -0
- agents/kollabor/monitor-event-bus.md +558 -0
- agents/kollabor/profile-performance.md +576 -0
- agents/kollabor/refactoring.md +1454 -0
- agents/kollabor/system_prompt copy.md +1448 -0
- agents/kollabor/system_prompt.md +757 -0
- agents/kollabor/trace-command-execution.md +178 -0
- agents/kollabor/validate-config.md +879 -0
- agents/research/__init__.py +0 -0
- agents/research/agent.json +4 -0
- agents/research/architecture-mapping.md +1099 -0
- agents/research/codebase-analysis.md +1077 -0
- agents/research/dependency-audit.md +1027 -0
- agents/research/performance-profiling.md +1047 -0
- agents/research/security-review.md +1359 -0
- agents/research/system_prompt.md +492 -0
- agents/technical-writer/__init__.py +0 -0
- agents/technical-writer/agent.json +4 -0
- agents/technical-writer/api-documentation.md +2328 -0
- agents/technical-writer/changelog-management.md +1181 -0
- agents/technical-writer/readme-writing.md +1360 -0
- agents/technical-writer/style-guide.md +1410 -0
- agents/technical-writer/system_prompt.md +653 -0
- agents/technical-writer/tutorial-creation.md +1448 -0
- core/__init__.py +0 -2
- core/application.py +343 -88
- core/cli.py +229 -10
- core/commands/menu_renderer.py +463 -59
- core/commands/registry.py +14 -9
- core/commands/system_commands.py +2461 -14
- core/config/loader.py +151 -37
- core/config/service.py +18 -6
- core/events/bus.py +29 -9
- core/events/executor.py +205 -75
- core/events/models.py +27 -8
- core/fullscreen/command_integration.py +20 -24
- core/fullscreen/components/__init__.py +10 -1
- core/fullscreen/components/matrix_components.py +1 -2
- core/fullscreen/components/space_shooter_components.py +654 -0
- core/fullscreen/plugin.py +5 -0
- core/fullscreen/renderer.py +52 -13
- core/fullscreen/session.py +52 -15
- core/io/__init__.py +29 -5
- core/io/buffer_manager.py +6 -1
- core/io/config_status_view.py +7 -29
- core/io/core_status_views.py +267 -347
- core/io/input/__init__.py +25 -0
- core/io/input/command_mode_handler.py +711 -0
- core/io/input/display_controller.py +128 -0
- core/io/input/hook_registrar.py +286 -0
- core/io/input/input_loop_manager.py +421 -0
- core/io/input/key_press_handler.py +502 -0
- core/io/input/modal_controller.py +1011 -0
- core/io/input/paste_processor.py +339 -0
- core/io/input/status_modal_renderer.py +184 -0
- core/io/input_errors.py +5 -1
- core/io/input_handler.py +211 -2452
- core/io/key_parser.py +7 -0
- core/io/layout.py +15 -3
- core/io/message_coordinator.py +111 -2
- core/io/message_renderer.py +129 -4
- core/io/status_renderer.py +147 -607
- core/io/terminal_renderer.py +97 -51
- core/io/terminal_state.py +21 -4
- core/io/visual_effects.py +816 -165
- core/llm/agent_manager.py +1063 -0
- core/llm/api_adapters/__init__.py +44 -0
- core/llm/api_adapters/anthropic_adapter.py +432 -0
- core/llm/api_adapters/base.py +241 -0
- core/llm/api_adapters/openai_adapter.py +326 -0
- core/llm/api_communication_service.py +167 -113
- core/llm/conversation_logger.py +322 -16
- core/llm/conversation_manager.py +556 -30
- core/llm/file_operations_executor.py +84 -32
- core/llm/llm_service.py +934 -103
- core/llm/mcp_integration.py +541 -57
- core/llm/message_display_service.py +135 -18
- core/llm/plugin_sdk.py +1 -2
- core/llm/profile_manager.py +1183 -0
- core/llm/response_parser.py +274 -56
- core/llm/response_processor.py +16 -3
- core/llm/tool_executor.py +6 -1
- core/logging/__init__.py +2 -0
- core/logging/setup.py +34 -6
- core/models/resume.py +54 -0
- core/plugins/__init__.py +4 -2
- core/plugins/base.py +127 -0
- core/plugins/collector.py +23 -161
- core/plugins/discovery.py +37 -3
- core/plugins/factory.py +6 -12
- core/plugins/registry.py +5 -17
- core/ui/config_widgets.py +128 -28
- core/ui/live_modal_renderer.py +2 -1
- core/ui/modal_actions.py +5 -0
- core/ui/modal_overlay_renderer.py +0 -60
- core/ui/modal_renderer.py +268 -7
- core/ui/modal_state_manager.py +29 -4
- core/ui/widgets/base_widget.py +7 -0
- core/updates/__init__.py +10 -0
- core/updates/version_check_service.py +348 -0
- core/updates/version_comparator.py +103 -0
- core/utils/config_utils.py +685 -526
- core/utils/plugin_utils.py +1 -1
- core/utils/session_naming.py +111 -0
- fonts/LICENSE +21 -0
- fonts/README.md +46 -0
- fonts/SymbolsNerdFont-Regular.ttf +0 -0
- fonts/SymbolsNerdFontMono-Regular.ttf +0 -0
- fonts/__init__.py +44 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/METADATA +54 -4
- kollabor-0.4.15.dist-info/RECORD +228 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/top_level.txt +2 -0
- plugins/agent_orchestrator/__init__.py +39 -0
- plugins/agent_orchestrator/activity_monitor.py +181 -0
- plugins/agent_orchestrator/file_attacher.py +77 -0
- plugins/agent_orchestrator/message_injector.py +135 -0
- plugins/agent_orchestrator/models.py +48 -0
- plugins/agent_orchestrator/orchestrator.py +403 -0
- plugins/agent_orchestrator/plugin.py +976 -0
- plugins/agent_orchestrator/xml_parser.py +191 -0
- plugins/agent_orchestrator_plugin.py +9 -0
- plugins/enhanced_input/box_styles.py +1 -0
- plugins/enhanced_input/color_engine.py +19 -4
- plugins/enhanced_input/config.py +2 -2
- plugins/enhanced_input_plugin.py +61 -11
- plugins/fullscreen/__init__.py +6 -2
- plugins/fullscreen/example_plugin.py +1035 -222
- plugins/fullscreen/setup_wizard_plugin.py +592 -0
- plugins/fullscreen/space_shooter_plugin.py +131 -0
- plugins/hook_monitoring_plugin.py +436 -78
- plugins/query_enhancer_plugin.py +66 -30
- plugins/resume_conversation_plugin.py +1494 -0
- plugins/save_conversation_plugin.py +98 -32
- plugins/system_commands_plugin.py +70 -56
- plugins/tmux_plugin.py +154 -78
- plugins/workflow_enforcement_plugin.py +94 -92
- system_prompt/default.md +952 -886
- core/io/input_mode_manager.py +0 -402
- core/io/modal_interaction_handler.py +0 -315
- core/io/raw_input_processor.py +0 -946
- core/storage/__init__.py +0 -5
- core/storage/state_manager.py +0 -84
- core/ui/widget_integration.py +0 -222
- core/utils/key_reader.py +0 -171
- kollabor-0.4.9.dist-info/RECORD +0 -128
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/WHEEL +0 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/entry_points.txt +0 -0
- {kollabor-0.4.9.dist-info → kollabor-0.4.15.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1110 @@
<!-- Exploratory Data Analysis skill - comprehensive data discovery and understanding -->

exploratory data analysis: DISCOVER INSIGHTS THROUGH SYSTEMATIC EXPLORATION

when this skill is active, you follow rigorous EDA methodology.
this is a comprehensive guide to understanding your data before modeling.

PHASE 0: EDA ENVIRONMENT VERIFICATION

before starting ANY analysis, verify your data science stack is ready.

verify python data packages

<terminal>python -c "import pandas as pd; print(f'pandas {pd.__version__} ready')"</terminal>

if pandas not available:
<terminal>pip install pandas numpy scipy</terminal>

verify visualization:
<terminal>python -c "import matplotlib.pyplot as plt; import seaborn as sns; print('viz packages ready')"</terminal>

if visualization not available:
<terminal>pip install matplotlib seaborn plotly</terminal>

verify data file accessibility

list data files:
<terminal>find . -maxdepth 2 -type f \( -name "*.csv" -o -name "*.json" -o -name "*.xlsx" -o -name "*.parquet" \)</terminal>

check file sizes:
<terminal>find . -maxdepth 2 -type f \( -name "*.csv" -o -name "*.json" -o -name "*.xlsx" -o -name "*.parquet" \) -exec ls -lh {} \;</terminal>

verify readability:
<terminal>python -c "import pandas as pd; df = pd.read_csv('data.csv'); print(f'shape: {df.shape}'); print(f'dtypes:\n{df.dtypes}')"</terminal>

check available memory

<terminal>python -c "import psutil; mem = psutil.virtual_memory(); print(f'total: {mem.total / 1e9:.2f} GB'); print(f'available: {mem.available / 1e9:.2f} GB')"</terminal>

if memory is limited for large datasets:
<terminal>pip install dask modin</terminal>

verify jupyter/lab notebooks (optional but recommended)

<terminal>jupyter --version 2>/dev/null || echo "jupyter not installed"</terminal>

if jupyter not installed:
<terminal>pip install jupyter jupyterlab</terminal>

PHASE 1: INITIAL DATA LOAD AND INSPECTION

load data with appropriate reader

csv files:
<terminal>python -c "
import pandas as pd
df = pd.read_csv('data.csv', parse_dates=True)
print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
"</terminal>

json files:
<terminal>python -c "
import pandas as pd
df = pd.read_json('data.json')
print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
"</terminal>

excel files:
<terminal>python -c "
import pandas as pd
df = pd.read_excel('data.xlsx', engine='openpyxl')
print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
"</terminal>

parquet files (for large data):
<terminal>python -c "
import pandas as pd
df = pd.read_parquet('data.parquet')
print(f'loaded: {df.shape[0]} rows, {df.shape[1]} columns')
"</terminal>

handle large datasets with chunking

chunked reading:
<terminal>python -c "
import pandas as pd

chunk_size = 10000
chunks = pd.read_csv('large_data.csv', chunksize=chunk_size)

total_rows = 0
for i, chunk in enumerate(chunks):
    total_rows += len(chunk)
    print(f'chunk {i}: {len(chunk)} rows')

print(f'total rows: {total_rows}')
"</terminal>

sample large dataset:
<terminal>python -c "
import pandas as pd

# read first N rows
df = pd.read_csv('large_data.csv', nrows=100000)
print(f'sample: {df.shape}')
"</terminal>

basic data overview

shape and memory:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
print(f'shape: {df.shape}')
print(f'memory usage: {df.memory_usage(deep=True).sum() / 1e6:.2f} MB')
print(f'\ncolumns: {list(df.columns)}')
"</terminal>

data types:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
print('data types:')
print(df.dtypes)
print(f'\ntype distribution:')
print(df.dtypes.value_counts())
"</terminal>

sample data:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
print('first 5 rows:')
print(df.head())
print('\nlast 5 rows:')
print(df.tail())
print('\nrandom sample:')
print(df.sample(5))
"</terminal>

PHASE 2: DATA QUALITY ASSESSMENT

missing value analysis

overall missing data:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

print('missing values:')
print(pd.DataFrame({
    'count': missing,
    'percentage': missing_pct
}))
"</terminal>

missing data visualization:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data.csv')
missing = df.isnull()

plt.figure(figsize=(12, 8))
sns.heatmap(missing, cbar=False, cmap='viridis')
plt.title('missing data heatmap')
plt.tight_layout()
plt.savefig('missing_data_heatmap.png')
print('saved: missing_data_heatmap.png')
"</terminal>

missing data patterns:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# check if missing values are related
print('missing correlation:')
print(df.isnull().corr())

# check row-level missing
df['missing_count'] = df.isnull().sum(axis=1)
print(f'\nrows with missing data: {(df[\"missing_count\"] > 0).sum()}')
print(f'missing per row stats:')
print(df['missing_count'].describe())
"</terminal>

duplicate detection

exact duplicates:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
duplicates = df.duplicated()

print(f'duplicate rows: {duplicates.sum()} ({duplicates.sum()/len(df)*100:.2f}%)')
print(f'\nunique rows: {len(df.drop_duplicates())}')
"</terminal>

subset duplicates:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# check duplicates on specific columns
subset_cols = ['id', 'name', 'date']
subset_dups = df.duplicated(subset=subset_cols)

print(f'duplicates on {subset_cols}: {subset_dups.sum()}')
"</terminal>

duplicate analysis:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# show duplicate examples
duplicates = df[df.duplicated(keep=False)]
print('duplicate examples:')
print(duplicates.sort_values(by=list(df.columns)).head(10))
"</terminal>

data type validation

type mismatch detection:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# check numeric columns with non-numeric values
for col in df.select_dtypes(include=['object']).columns:
    # try to convert to numeric
    try:
        numeric = pd.to_numeric(df[col], errors='coerce')
        non_numeric = numeric.isnull() & df[col].notnull()
        if non_numeric.any():
            print(f'{col}: {non_numeric.sum()} non-numeric values')
            print(f'  examples: {df.loc[non_numeric, col].head().tolist()}')
    except:
        pass
"</terminal>

datetime validation:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# try to parse object columns as datetime
for col in df.select_dtypes(include=['object']).columns:
    if 'date' in col.lower() or 'time' in col.lower():
        try:
            parsed = pd.to_datetime(df[col], errors='coerce')
            failed = parsed.isnull() & df[col].notnull()
            if failed.any():
                print(f'{col}: {failed.sum()} invalid datetime values')
                print(f'  examples: {df.loc[failed, col].head().tolist()}')
        except:
            pass
"</terminal>

outlier detection

statistical outliers (z-score):
<terminal>python -c "
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')

numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
    outliers = z_scores > 3
    if outliers.any():
        print(f'{col}: {outliers.sum()} outliers (z > 3)')
        print(f'  outlier values: {df.loc[outliers, col].describe()}')
"</terminal>

iqr outliers:
<terminal>python -c "
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')

numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
    if outliers.any():
        print(f'{col}: {outliers.sum()} outliers (iqr method)')
        print(f'  bounds: [{lower_bound:.2f}, {upper_bound:.2f}]')
"</terminal>

visual outlier detection:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

fig, axes = plt.subplots(len(numeric_cols), 1, figsize=(10, 5*len(numeric_cols)))
if len(numeric_cols) == 1:
    axes = [axes]

for i, col in enumerate(numeric_cols):
    df[col].plot(kind='box', ax=axes[i])
    axes[i].set_title(f'{col} boxplot')

plt.tight_layout()
plt.savefig('outlier_boxplots.png')
print('saved: outlier_boxplots.png')
"</terminal>

PHASE 3: UNIVARIATE ANALYSIS

numeric variable analysis

descriptive statistics:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

print('descriptive statistics:')
print(df[numeric_cols].describe().transpose())

print('\nskewness:')
print(df[numeric_cols].skew())

print('\nkurtosis:')
print(df[numeric_cols].kurtosis())
"</terminal>

distribution visualization:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

fig, axes = plt.subplots(len(numeric_cols), 2, figsize=(15, 5*len(numeric_cols)))
if len(numeric_cols) == 1:
    axes = axes.reshape(1, -1)

for i, col in enumerate(numeric_cols):
    # histogram
    df[col].hist(ax=axes[i, 0], bins=50)
    axes[i, 0].set_title(f'{col} distribution')
    axes[i, 0].set_xlabel(col)
    axes[i, 0].set_ylabel('frequency')

    # density plot
    df[col].plot(kind='kde', ax=axes[i, 1])
    axes[i, 1].set_title(f'{col} density')
    axes[i, 1].set_xlabel(col)
    axes[i, 1].set_ylabel('density')

plt.tight_layout()
plt.savefig('numeric_distributions.png')
print('saved: numeric_distributions.png')
"</terminal>

normality tests:
<terminal>python -c "
import pandas as pd
from scipy import stats

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

for col in numeric_cols:
    data = df[col].dropna()

    # shapiro-wilk test (for small samples)
    if len(data) < 5000:
        stat, p = stats.shapiro(data)
        print(f'{col}: shapiro-wilk p={p:.4f} (normal={p > 0.05})')
    else:
        # kolmogorov-smirnov test (for large samples), against a normal fitted to the data
        stat, p = stats.kstest((data - data.mean()) / data.std(), 'norm')
        print(f'{col}: ks-test p={p:.4f} (normal={p > 0.05})')
"</terminal>

categorical variable analysis

value counts:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    print(f'\n{col}:')
    print(f'  unique values: {df[col].nunique()}')
    print(f'  top 5 values:')
    print(df[col].value_counts().head())
"</terminal>

cardinality analysis:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

print('cardinality analysis:')
cardinality = pd.DataFrame({
    'unique': df[categorical_cols].nunique(),
    'total': len(df),
    'ratio': df[categorical_cols].nunique() / len(df)
})
print(cardinality)

print('\nhigh cardinality columns (> 0.5):')
high_card = cardinality[cardinality['ratio'] > 0.5]
if not high_card.empty:
    print(high_card)
"</terminal>

categorical visualization:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# plot top categories for each column
for col in categorical_cols:
    if df[col].nunique() <= 20:
        plt.figure(figsize=(10, 6))
        df[col].value_counts().plot(kind='bar')
        plt.title(f'{col} distribution')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'{col}_distribution.png')
        plt.close()
        print(f'saved: {col}_distribution.png')
"</terminal>

datetime variable analysis

date range:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')

# identify datetime columns: parse object columns non-destructively, keep the ones that mostly parse
datetime_cols = []
for col in df.select_dtypes(include=['object']).columns:
    parsed = pd.to_datetime(df[col], errors='coerce')
    if parsed.notnull().mean() > 0.8:
        df[col] = parsed
        datetime_cols.append(col)

for col in datetime_cols:
    print(f'\n{col}:')
    print(f'  range: {df[col].min()} to {df[col].max()}')
    print(f'  span: {(df[col].max() - df[col].min()).days} days')
    print(f'  missing: {df[col].isnull().sum()}')
"</terminal>

temporal patterns:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

# parse the first date-like column (plain read_csv leaves dates as strings)
date_col = next(c for c in df.columns if 'date' in c.lower() or 'time' in c.lower())
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# extract time components
df['year'] = df[date_col].dt.year
df['month'] = df[date_col].dt.month
df['day'] = df[date_col].dt.day
df['weekday'] = df[date_col].dt.weekday
df['hour'] = df[date_col].dt.hour

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# yearly pattern
df['year'].value_counts().sort_index().plot(kind='bar', ax=axes[0,0])
axes[0,0].set_title('yearly pattern')

# monthly pattern
df['month'].value_counts().sort_index().plot(kind='bar', ax=axes[0,1])
axes[0,1].set_title('monthly pattern')

# weekday pattern
df['weekday'].value_counts().sort_index().plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('weekday pattern')

# hourly pattern
df['hour'].value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('hourly pattern')

plt.tight_layout()
plt.savefig('temporal_patterns.png')
print('saved: temporal_patterns.png')
"</terminal>

PHASE 4: BIVARIATE ANALYSIS

numeric-numeric relationships

correlation matrix:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

# compute correlation matrix
corr = df[numeric_cols].corr()

# plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('correlation matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
print('saved: correlation_matrix.png')
"</terminal>

scatter plot matrix:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns[:5]  # limit to first 5

# pairplot
sns.pairplot(df[numeric_cols], diag_kind='kde')
plt.tight_layout()
plt.savefig('scatter_matrix.png')
print('saved: scatter_matrix.png')
"</terminal>

strong correlations:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns
corr = df[numeric_cols].corr()

# find strong correlations (|r| > 0.7)
strong_corr = []
for i in range(len(corr.columns)):
    for j in range(i+1, len(corr.columns)):
        if abs(corr.iloc[i, j]) > 0.7:
            strong_corr.append({
                'var1': corr.columns[i],
                'var2': corr.columns[j],
                'correlation': corr.iloc[i, j]
            })

if strong_corr:
    strong_corr_df = pd.DataFrame(strong_corr)
    strong_corr_df = strong_corr_df.sort_values('correlation', key=abs, ascending=False)
    print('strong correlations:')
    print(strong_corr_df)
else:
    print('no strong correlations found')
"</terminal>

categorical-categorical relationships

crosstab analysis:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:4]

for i in range(len(categorical_cols)):
    for j in range(i+1, len(categorical_cols)):
        col1, col2 = categorical_cols[i], categorical_cols[j]
        if df[col1].nunique() <= 10 and df[col2].nunique() <= 10:
            print(f'\n{col1} vs {col2}:')
            crosstab = pd.crosstab(df[col1], df[col2])
            print(crosstab)
"</terminal>

chi-square test:
<terminal>python -c "
import pandas as pd
from scipy import stats

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:4]

for i in range(len(categorical_cols)):
    for j in range(i+1, len(categorical_cols)):
        col1, col2 = categorical_cols[i], categorical_cols[j]
        if df[col1].nunique() <= 10 and df[col2].nunique() <= 10:
            crosstab = pd.crosstab(df[col1], df[col2])
            chi2, p, dof, expected = stats.chi2_contingency(crosstab)
            print(f'{col1} vs {col2}: chi2={chi2:.2f}, p={p:.4f}, significant={p < 0.05}')
"</terminal>

numeric-categorical relationships

group statistics:
<terminal>python -c "
import pandas as pd

df = pd.read_csv('data.csv')
categorical_cols = df.select_dtypes(include=['object', 'category']).columns[:3]
numeric_cols = df.select_dtypes(include=['number']).columns[:3]

for cat_col in categorical_cols:
    if df[cat_col].nunique() <= 10:
        print(f'\n{cat_col}:')
        for num_col in numeric_cols:
            group_stats = df.groupby(cat_col)[num_col].agg(['mean', 'std', 'count'])
            print(f'  {num_col}:')
            print(group_stats)
"</terminal>

boxplot by category:
<terminal>python -c "
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data.csv')
cat_col = df.select_dtypes(include=['object', 'category']).columns[0]
num_col = df.select_dtypes(include=['number']).columns[0]

if df[cat_col].nunique() <= 10:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=cat_col, y=num_col, data=df)
    plt.title(f'{num_col} by {cat_col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{num_col}_by_{cat_col}.png')
    print(f'saved: {num_col}_by_{cat_col}.png')
"</terminal>

anova test:
<terminal>python -c "
import pandas as pd
from scipy import stats

df = pd.read_csv('data.csv')
cat_col = df.select_dtypes(include=['object', 'category']).columns[0]
num_col = df.select_dtypes(include=['number']).columns[0]

if df[cat_col].nunique() <= 10:
    groups = [group[num_col].dropna() for name, group in df.groupby(cat_col)]
    f_stat, p_value = stats.f_oneway(*groups)
    print(f'anova test for {num_col} by {cat_col}:')
    print(f'  f-statistic: {f_stat:.4f}')
    print(f'  p-value: {p_value:.4f}')
    print(f'  significant: {p_value < 0.05}')
"</terminal>

PHASE 5: MULTIVARIATE ANALYSIS

dimensionality reduction visualization

pca scatter plot:
<terminal>python -c "
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

# prepare data
data = df[numeric_cols].dropna()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# perform pca
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# plot
plt.figure(figsize=(10, 8))
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
plt.xlabel('pc1 (explained variance: {:.2%})'.format(pca.explained_variance_ratio_[0]))
plt.ylabel('pc2 (explained variance: {:.2%})'.format(pca.explained_variance_ratio_[1]))
plt.title('pca scatter plot')
plt.savefig('pca_scatter.png')
print('saved: pca_scatter.png')
print(f'total variance explained: {pca.explained_variance_ratio_.sum():.2%}')
"</terminal>

pca loading analysis:
<terminal>python -c "
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data.csv')
numeric_cols = df.select_dtypes(include=['number']).columns

# prepare data
data = df[numeric_cols].dropna()
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# perform pca
pca = PCA()
pca_result = pca.fit_transform(scaled_data)

# loading matrix
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'pc{i+1}' for i in range(len(numeric_cols))],
    index=numeric_cols
)

print('pca loadings (top 3 components):')
print(loadings.iloc[:, :3])

print('\nexplained variance ratio:')
print(pca.explained_variance_ratio_[:10])
"</terminal>

feature importance analysis

random forest feature importance:
<terminal>python -c "
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

# prepare features
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# encode categorical variables
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# use the first numeric column as the target, everything else as features
features = df.dropna()
y = features[numeric_cols[0]]
X = features.drop(columns=[numeric_cols[0]])

# train random forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# feature importance
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print('feature importance:')
print(importance.head(10))

# plot
plt.figure(figsize=(10, 6))
plt.barh(importance['feature'].head(10), importance['importance'].head(10))
plt.gca().invert_yaxis()
plt.title('feature importance (random forest)')
plt.tight_layout()
plt.savefig('feature_importance.png')
print('saved: feature_importance.png')
"</terminal>

mutual information:
<terminal>python -c "
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

# prepare data
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# encode categorical variables
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# prepare features and target
features = df.dropna()
X = features[features.columns.difference([numeric_cols[0]])]
y = features[numeric_cols[0]]

# calculate mutual information
mi = mutual_info_regression(X, y)

# create dataframe
mi_df = pd.DataFrame({
    'feature': X.columns,
    'mutual_info': mi
}).sort_values('mutual_info', ascending=False)

print('mutual information:')
print(mi_df.head(10))

# plot
plt.figure(figsize=(10, 6))
plt.barh(mi_df['feature'].head(10), mi_df['mutual_info'].head(10))
plt.gca().invert_yaxis()
plt.title('mutual information')
plt.tight_layout()
plt.savefig('mutual_information.png')
print('saved: mutual_information.png')
"</terminal>

PHASE 6: DATA QUALITY REPORT

generate comprehensive report

<terminal>python -c "
import pandas as pd
import numpy as np
from datetime import datetime

# load data
df = pd.read_csv('data.csv')

# initialize report
report = []
report.append('=' * 80)
report.append('exploratory data analysis report')
report.append('=' * 80)
report.append(f'generated: {datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")}')
report.append('')

# dataset overview
report.append('dataset overview')
report.append('-' * 40)
report.append(f'rows: {len(df):,}')
report.append(f'columns: {len(df.columns)}')
report.append(f'memory usage: {df.memory_usage(deep=True).sum() / 1e6:.2f} mb')
report.append('')

# data types
report.append('data types')
report.append('-' * 40)
dtypes = df.dtypes.value_counts()
for dtype, count in dtypes.items():
    report.append(f'  {dtype}: {count}')
report.append('')

# missing data
report.append('missing data')
report.append('-' * 40)
missing = df.isnull().sum()
total_missing = missing.sum()
report.append(f'total missing: {total_missing:,}')
report.append(f'percentage: {total_missing / (len(df) * len(df.columns)) * 100:.2f}%')
report.append('')

for col, count in missing[missing > 0].items():
    pct = (count / len(df)) * 100
    report.append(f'  {col}: {count:,} ({pct:.2f}%)')
report.append('')

# numeric statistics
numeric_cols = df.select_dtypes(include=[np.number]).columns
if not numeric_cols.empty:
    report.append('numeric statistics')
    report.append('-' * 40)
    for col in numeric_cols:
        report.append(f'  {col}:')
        report.append(f'    mean: {df[col].mean():.2f}')
        report.append(f'    std: {df[col].std():.2f}')
        report.append(f'    min: {df[col].min():.2f}')
        report.append(f'    max: {df[col].max():.2f}')
        report.append(f'    missing: {df[col].isnull().sum():,}')
    report.append('')

# categorical statistics
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
if not categorical_cols.empty:
    report.append('categorical statistics')
    report.append('-' * 40)
    for col in categorical_cols:
        report.append(f'  {col}:')
        report.append(f'    unique values: {df[col].nunique():,}')
        report.append(f'    most common: {df[col].mode()[0] if not df[col].mode().empty else \"none\"}')
        report.append(f'    missing: {df[col].isnull().sum():,}')
    report.append('')

# duplicates
duplicates = df.duplicated().sum()
report.append('data quality')
report.append('-' * 40)
report.append(f'duplicate rows: {duplicates:,}')
report.append(f'duplicate percentage: {duplicates / len(df) * 100:.2f}%')
report.append('')

# save report
report_text = '\n'.join(report)
with open('eda_report.txt', 'w') as f:
    f.write(report_text)

print('saved: eda_report.txt')
print(report_text)
"</terminal>

PHASE 7: EDA CHECKLIST

initial inspection

[ ] loaded data successfully
[ ] verified data shape and columns
[ ] checked data types
[ ] examined sample data
[ ] checked memory usage

data quality

[ ] identified missing values
[ ] quantified missing data patterns
[ ] detected duplicate records
[ ] identified outliers
[ ] validated data types
[ ] checked for inconsistencies

univariate analysis

[ ] analyzed numeric distributions
[ ] checked for normality
[ ] examined categorical frequencies
[ ] analyzed temporal patterns
[ ] identified high cardinality columns

bivariate analysis

[ ] computed correlation matrix
[ ] identified strong correlations
[ ] analyzed categorical relationships
[ ] examined group differences
[ ] performed significance tests

multivariate analysis

[ ] performed dimensionality reduction
[ ] analyzed feature importance
[ ] identified key patterns
[ ] detected clusters or groups

documentation

[ ] saved all visualizations
[ ] generated summary report
[ ] documented findings
[ ] noted data quality issues
[ ] suggested next steps

PHASE 8: EDA RULES (MANDATORY)

while this skill is active, these rules are MANDATORY:

[1] ALWAYS START WITH DATA INSPECTION
    never jump to modeling without understanding the data
    examine structure, types, and basic statistics first

[2] VISUALIZE EVERYTHING
    use plots to understand distributions and relationships
    visual patterns reveal insights that statistics miss

[3] CHECK DATA QUALITY FIRST
    identify missing values, duplicates, and outliers early
    poor data quality leads to poor insights

[4] UNDERSTAND DISTRIBUTIONS
    know if your data is normal, skewed, or multimodal
    distribution assumptions impact statistical tests

[5] EXPLORE RELATIONSHIPS
    examine correlations and associations between variables
    relationships drive predictive modeling

[6] DOCUMENT EVERYTHING
    save plots, code, and findings
    others should be able to reproduce your analysis

[7] BE SKEPTICAL OF OUTLIERS
    investigate before removing
    outliers might be errors or important signals

[8] CONSIDER SAMPLE BIAS
    understand who/what your data represents
    bias limits generalizability

[9] ITERATE AND REFINE
    initial findings suggest new questions
    follow interesting threads

[10] COMMUNICATE FINDINGS
    explain what you found and why it matters
    insights are only valuable if they're understood

FINAL REMINDERS

eda is exploration

you don't know what you'll find.
follow your curiosity.
investigate anomalies.

patterns lead to insights

look for:
- unexpected correlations
- unusual distributions
- hidden groups
- temporal trends
- outliers that matter

questions drive analysis

every plot should answer a question.
every test should address a hypothesis.
every finding should generate new questions.

the goal

not just to describe the data.
to understand what it tells us.
to guide decisions.
to inspire further investigation.

now explore your data.