sqlshell-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_distributions.py

@@ -0,0 +1,616 @@
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib
try:
    matplotlib.use('qtagg')  # Set the backend before importing pyplot
except ImportError:
    matplotlib.use('Agg')  # Fall back to headless backend for CI/testing
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
from PyQt6.QtCore import QObject, pyqtSignal, Qt
from PyQt6.QtWidgets import (
    QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
    QTableView, QHeaderView, QLabel, QFrame, QScrollArea, QTabWidget,
    QComboBox, QPushButton, QSplitter
)
from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush


class DistributionProfiler(QObject):
    """Class to analyze distributions of columns in a dataframe"""
    progress_updated = pyqtSignal(int, str)  # Signal for progress reporting

    def __init__(self):
        super().__init__()

        # Define common distributions to test
        self.distributions = [
            {'name': 'normal', 'distribution': stats.norm, 'color': 'blue'},
            {'name': 'uniform', 'distribution': stats.uniform, 'color': 'green'},
            {'name': 'exponential', 'distribution': stats.expon, 'color': 'red'},
            {'name': 'lognormal', 'distribution': stats.lognorm, 'color': 'purple'},
            {'name': 'gamma', 'distribution': stats.gamma, 'color': 'orange'},
            {'name': 'beta', 'distribution': stats.beta, 'color': 'brown'},
        ]

    def get_best_distribution(self, data):
        """Find the best distribution that fits the data"""
        # Remove NaNs
        data = data.dropna()

        if len(data) == 0:
            return None, None, None

        # For categorical or non-numeric data, return None
        if not pd.api.types.is_numeric_dtype(data):
            return None, None, None

        # For constant data, return a simple result
        if data.nunique() == 1:
            return 'constant', None, 1.0

        # If too few unique values, may not be appropriate for distribution fitting
        if data.nunique() < 5:
            return 'discrete', None, None

        best_distribution = None
        best_dist_obj = None
        best_params = None
        best_sse = np.inf

        # Get histogram data
        hist, bin_edges = np.histogram(data, bins='auto', density=True)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

        # Try each distribution
        for dist_info in self.distributions:
            distribution = dist_info['distribution']

            try:
                # Fit distribution to data
                params = distribution.fit(data)

                # Get PDF values
                arg_params = params[:-2]
                loc = params[-2]
                scale = params[-1]

                pdf = distribution.pdf(bin_centers, *arg_params, loc=loc, scale=scale)

                # Calculate sum of squared errors
                sse = np.sum((pdf - hist) ** 2)

                # Find best fit distribution
                if sse < best_sse:
                    best_distribution = dist_info['name']
                    best_dist_obj = distribution
                    best_params = params
                    best_sse = sse

            except Exception:
                continue

        # Calculate Kolmogorov-Smirnov test for goodness of fit, using the fitted
        # scipy distribution object (the display names such as 'normal' or
        # 'lognormal' are not valid scipy.stats attributes)
        if best_distribution and best_params is not None:
            try:
                # kstest takes the full parameter tuple via args; it has no
                # loc/scale keyword arguments of its own
                ks_stat, p_value = stats.kstest(data, best_dist_obj.cdf, args=best_params)
                return best_distribution, best_params, p_value
            except Exception:
                return best_distribution, best_params, None

        return None, None, None

    def describe_distribution(self, series):
        """Provide distribution statistics for a series"""
        stats_dict = {}

        # Count missing values before dropping them (after dropna the missing
        # count would always be zero)
        total_count = len(series)
        missing_count = int(series.isna().sum())

        # Remove NaNs
        series = series.dropna()

        if len(series) == 0:
            return {
                'count': 0,
                'distribution': 'empty',
                'goodness_of_fit': None
            }

        # Basic statistics
        stats_dict['count'] = len(series)
        stats_dict['unique_count'] = series.nunique()
        stats_dict['missing_count'] = missing_count
        stats_dict['missing_percentage'] = (missing_count / total_count) * 100

        # For categorical data
        if not pd.api.types.is_numeric_dtype(series):
            stats_dict['type'] = 'categorical'
            top_values = series.value_counts().head(5).to_dict()
            stats_dict['top_values'] = {str(k): v for k, v in top_values.items()}
            stats_dict['distribution'] = 'categorical'
            return stats_dict

        # For numerical data
        stats_dict['type'] = 'numerical'
        stats_dict['min'] = float(series.min())
        stats_dict['max'] = float(series.max())
        stats_dict['mean'] = float(series.mean())
        stats_dict['median'] = float(series.median())
        stats_dict['std'] = float(series.std())
        stats_dict['skewness'] = float(stats.skew(series))
        stats_dict['kurtosis'] = float(stats.kurtosis(series))

        # Find best distribution
        best_dist, params, p_value = self.get_best_distribution(series)
        stats_dict['distribution'] = best_dist
        stats_dict['goodness_of_fit'] = p_value

        return stats_dict

    def profile(self, df):
        """
        Profile a dataframe to identify the distribution characteristics of each column.

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with columns and their distribution profiles
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        if df.empty:
            return pd.DataFrame(columns=['column', 'type', 'distribution', 'goodness_of_fit'])

        results = []
        total_columns = len(df.columns)

        # Analyze each column
        for i, column in enumerate(df.columns):
            # Emit progress signal (if connected)
            self.progress_updated.emit(int((i / total_columns) * 100), f"Analyzing column: {column}")

            try:
                stats_dict = self.describe_distribution(df[column])
                stats_dict['column'] = column
                results.append(stats_dict)
            except Exception:
                # Skip columns that can't be analyzed
                continue

        # Create results dataframe
        result_df = pd.DataFrame(results)

        if result_df.empty:
            return pd.DataFrame(columns=['column', 'type', 'distribution', 'goodness_of_fit'])

        # Sort by distribution type and column name
        result_df = result_df.sort_values(by=['type', 'column'])

        self.progress_updated.emit(100, "Analysis complete")
        return result_df


class MatplotlibCanvas(FigureCanvasQTAgg):
    """Matplotlib canvas for embedding plots in PyQt"""
    def __init__(self, width=5, height=4, dpi=100):
        self.fig = Figure(figsize=(width, height), dpi=dpi)
        self.axes = self.fig.add_subplot(111)
        super().__init__(self.fig)


class DistributionVisualization(QMainWindow):
    """Window to visualize distribution results"""

    def __init__(self, df, results_df, parent=None):
        super().__init__(parent)
        self.df = df
        self.results_df = results_df
        self.current_column = None

        self.setWindowTitle("Column Distribution Profiles")
        self.resize(1000, 800)

        # Create central widget and layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)

        # Add a title
        title = QLabel("Statistical Distribution Analysis")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        main_layout.addWidget(title)

        # Add a description
        description = QLabel(
            "Analyzing column distributions helps identify data patterns and select appropriate statistical methods."
        )
        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
        description.setWordWrap(True)
        main_layout.addWidget(description)

        # Create a splitter for table and visualization
        splitter = QSplitter(Qt.Orientation.Vertical)
        main_layout.addWidget(splitter, 1)

        # Create table view
        table_widget = QWidget()
        table_layout = QVBoxLayout(table_widget)
        self.create_table_view(table_layout)
        splitter.addWidget(table_widget)

        # Create visualization section
        vis_widget = QWidget()
        vis_layout = QVBoxLayout(vis_widget)
        self.create_visualization_section(vis_layout)
        splitter.addWidget(vis_widget)

        # Set initial splitter sizes
        splitter.setSizes([300, 500])

    def create_table_view(self, layout):
        """Create a table view showing the distribution results"""
        # Create the model
        model = QStandardItemModel()
        headers = ['Column', 'Type', 'Distribution', 'Count', 'Unique', 'Missing %']
        if 'skewness' in self.results_df.columns:
            headers.extend(['Mean', 'Median', 'Std', 'Skewness', 'Kurtosis'])
        model.setHorizontalHeaderLabels(headers)

        # Set table data
        for _, row in self.results_df.iterrows():
            items = []

            # Basic columns present in all rows
            column_item = QStandardItem(str(row['column']))
            type_item = QStandardItem(str(row['type']))
            dist_item = QStandardItem(str(row['distribution']))
            count_item = QStandardItem(str(row['count']))
            unique_item = QStandardItem(str(row.get('unique_count', 'N/A')))
            missing_item = QStandardItem(f"{row.get('missing_percentage', 0):.1f}%")

            items.extend([column_item, type_item, dist_item, count_item, unique_item, missing_item])

            # Add numerical statistics if available
            if row['type'] == 'numerical':
                mean_item = QStandardItem(f"{row.get('mean', 'N/A'):.2f}")
                median_item = QStandardItem(f"{row.get('median', 'N/A'):.2f}")
                std_item = QStandardItem(f"{row.get('std', 'N/A'):.2f}")
                skew_item = QStandardItem(f"{row.get('skewness', 'N/A'):.2f}")
                kurt_item = QStandardItem(f"{row.get('kurtosis', 'N/A'):.2f}")

                items.extend([mean_item, median_item, std_item, skew_item, kurt_item])
            else:
                # Add empty items for categorical data
                for _ in range(5):
                    items.append(QStandardItem(""))

            model.appendRow(items)

        # Create and configure the table view
        self.table_view = QTableView()
        self.table_view.setModel(model)
        self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.ResizeToContents)
        self.table_view.setAlternatingRowColors(True)
        self.table_view.setSelectionBehavior(QTableView.SelectionBehavior.SelectRows)
        self.table_view.setMinimumHeight(200)

        # Connect selection signal
        self.table_view.selectionModel().selectionChanged.connect(self.on_column_selected)

        layout.addWidget(self.table_view)

        # Add column selector
        selector_layout = QHBoxLayout()
        selector_layout.addWidget(QLabel("Select Column:"))

        self.column_selector = QComboBox()
        self.column_selector.addItems(self.results_df['column'].tolist())
        self.column_selector.currentTextChanged.connect(self.on_combobox_changed)
        selector_layout.addWidget(self.column_selector)

        layout.addLayout(selector_layout)

    def create_visualization_section(self, layout):
        """Create the visualization section with tabs for different plots"""
        self.tab_widget = QTabWidget()

        # Create tabs for different visualizations
        self.histogram_tab = QWidget()
        self.histogram_layout = QVBoxLayout(self.histogram_tab)
        self.histogram_canvas = MatplotlibCanvas(width=8, height=4)
        self.histogram_layout.addWidget(self.histogram_canvas)
        self.tab_widget.addTab(self.histogram_tab, "Histogram & Density")

        self.boxplot_tab = QWidget()
        self.boxplot_layout = QVBoxLayout(self.boxplot_tab)
        self.boxplot_canvas = MatplotlibCanvas(width=8, height=4)
        self.boxplot_layout.addWidget(self.boxplot_canvas)
        self.tab_widget.addTab(self.boxplot_tab, "Box Plot")

        self.qq_tab = QWidget()
        self.qq_layout = QVBoxLayout(self.qq_tab)
        self.qq_canvas = MatplotlibCanvas(width=8, height=4)
        self.qq_layout.addWidget(self.qq_canvas)
        self.tab_widget.addTab(self.qq_tab, "Q-Q Plot")

        self.ecdf_tab = QWidget()
        self.ecdf_layout = QVBoxLayout(self.ecdf_tab)
        self.ecdf_canvas = MatplotlibCanvas(width=8, height=4)
        self.ecdf_layout.addWidget(self.ecdf_canvas)
        self.tab_widget.addTab(self.ecdf_tab, "Empirical CDF")

        # For categorical data
        self.categorical_tab = QWidget()
        self.categorical_layout = QVBoxLayout(self.categorical_tab)
        self.categorical_canvas = MatplotlibCanvas(width=8, height=4)
        self.categorical_layout.addWidget(self.categorical_canvas)
        self.tab_widget.addTab(self.categorical_tab, "Bar Chart")

        layout.addWidget(self.tab_widget)

        # Stats panel
        self.stats_label = QLabel("Select a column to view distribution statistics")
        self.stats_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
        self.stats_label.setStyleSheet("font-family: monospace; background-color: #f0f0f0; padding: 10px; border-radius: 5px;")
        self.stats_label.setWordWrap(True)
        layout.addWidget(self.stats_label)

    def on_combobox_changed(self, column_name):
        """Handle column selection from combobox"""
        self.visualize_column(column_name)

    def on_column_selected(self, selected, deselected):
        """Handle column selection from table"""
        indexes = selected.indexes()
        if indexes:
            # Get the column name from the first column
            row_idx = indexes[0].row()
            column_name = self.table_view.model().item(row_idx, 0).text()

            # Update combobox to match
            index = self.column_selector.findText(column_name)
            if index >= 0:
                self.column_selector.setCurrentIndex(index)

            self.visualize_column(column_name)

    def visualize_column(self, column_name):
        """Visualize the selected column with various plots"""
        if column_name not in self.df.columns:
            return

        self.current_column = column_name

        # Get column data and stats
        series = self.df[column_name]
        column_stats = self.results_df[self.results_df['column'] == column_name].iloc[0].to_dict()

        # Update stats label
        self.update_stats_display(column_stats)

        # Check if categorical or numerical
        if column_stats['type'] == 'categorical':
            self.create_categorical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.categorical_tab)
        else:
            self.create_numerical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.histogram_tab)

    def update_stats_display(self, stats):
        """Update the statistics display panel"""
        if stats['type'] == 'numerical':
            # Format numerical stats
            stats_text = (
                f"<b>Column:</b> {stats['column']} | <b>Type:</b> {stats['type']} | "
                f"<b>Distribution:</b> {stats['distribution']}\n"
                f"<b>Count:</b> {stats['count']} | <b>Unique:</b> {stats['unique_count']} | "
                f"<b>Missing:</b> {stats['missing_count']} ({stats['missing_percentage']:.1f}%)\n"
                f"<b>Min:</b> {stats['min']:.4g} | <b>Max:</b> {stats['max']:.4g} | "
                f"<b>Mean:</b> {stats['mean']:.4g} | <b>Median:</b> {stats['median']:.4g} | <b>Std:</b> {stats['std']:.4g}\n"
                f"<b>Skewness:</b> {stats['skewness']:.4g} | <b>Kurtosis:</b> {stats['kurtosis']:.4g}"
            )

            if stats['goodness_of_fit'] is not None:
                stats_text += f" | <b>Goodness of fit (p-value):</b> {stats['goodness_of_fit']:.4g}"
        else:
            # Format categorical stats
            stats_text = (
                f"<b>Column:</b> {stats['column']} | <b>Type:</b> {stats['type']}\n"
                f"<b>Count:</b> {stats['count']} | <b>Unique:</b> {stats['unique_count']} | "
                f"<b>Missing:</b> {stats['missing_count']} ({stats['missing_percentage']:.1f}%)"
            )

            if 'top_values' in stats:
                top_values = stats['top_values']
                stats_text += "\n<b>Top values:</b> "
                stats_text += ", ".join([f"{k} ({v})" for k, v in top_values.items()])

        self.stats_label.setText(stats_text)

    def create_numerical_plots(self, series, column_stats):
        """Create plots for numerical data"""
        # Clean data
        data = series.dropna()

        # Histogram with fitted distribution
        self.histogram_canvas.axes.clear()
        self.histogram_canvas.axes.hist(data, bins='auto', density=True, alpha=0.6, label="Data")

        # If we have a fitted distribution, plot it
        if column_stats['distribution'] not in [None, 'discrete', 'constant', 'categorical']:
            # Get the distribution and params
            dist_name = column_stats['distribution']

            # Simple estimation for distribution parameters if we don't have them
            # In a real implementation, you would save the parameters from the profiler
            if dist_name == 'normal':
                x = np.linspace(data.min(), data.max(), 1000)
                y = stats.norm.pdf(x, data.mean(), data.std())
                self.histogram_canvas.axes.plot(x, y, 'r-', lw=2, label=f"Fitted {dist_name}")
            elif dist_name == 'uniform':
                x = np.linspace(data.min(), data.max(), 1000)
                y = stats.uniform.pdf(x, data.min(), data.max() - data.min())
                self.histogram_canvas.axes.plot(x, y, 'r-', lw=2, label=f"Fitted {dist_name}")
            elif dist_name == 'exponential':
                x = np.linspace(data.min(), data.max(), 1000)
                # For an exponential distribution the scale parameter equals the mean
                y = stats.expon.pdf(x, scale=data.mean())
                self.histogram_canvas.axes.plot(x, y, 'r-', lw=2, label=f"Fitted {dist_name}")

        self.histogram_canvas.axes.set_title(f"Histogram of {series.name}")
        self.histogram_canvas.axes.set_xlabel("Value")
        self.histogram_canvas.axes.set_ylabel("Density")
        self.histogram_canvas.axes.legend()
        self.histogram_canvas.fig.tight_layout()
        self.histogram_canvas.draw()

        # Box plot
        self.boxplot_canvas.axes.clear()
        self.boxplot_canvas.axes.boxplot(data, vert=False)
        self.boxplot_canvas.axes.set_title(f"Box Plot of {series.name}")
        self.boxplot_canvas.axes.set_xlabel("Value")
        self.boxplot_canvas.axes.set_yticks([])
        self.boxplot_canvas.fig.tight_layout()
        self.boxplot_canvas.draw()

        # Q-Q plot
        self.qq_canvas.axes.clear()
        stats.probplot(data, dist="norm", plot=self.qq_canvas.axes)
        self.qq_canvas.axes.set_title(f"Q-Q Plot of {series.name} (vs Normal)")
        self.qq_canvas.fig.tight_layout()
        self.qq_canvas.draw()

        # Empirical CDF
        self.ecdf_canvas.axes.clear()
        x = np.sort(data)
        y = np.arange(1, len(x) + 1) / len(x)
        self.ecdf_canvas.axes.step(x, y, where='post', label="Empirical CDF")
        self.ecdf_canvas.axes.set_title(f"Empirical CDF of {series.name}")
        self.ecdf_canvas.axes.set_xlabel("Value")
        self.ecdf_canvas.axes.set_ylabel("Cumulative Probability")
        self.ecdf_canvas.fig.tight_layout()
        self.ecdf_canvas.draw()

    def create_categorical_plots(self, series, stats):
        """Create plots for categorical data"""
        # Clean data
        data = series.dropna()

        # Bar chart for categorical data
        self.categorical_canvas.axes.clear()
        value_counts = data.value_counts().sort_values(ascending=False)

        # Limit to top 15 categories if there are too many
        if len(value_counts) > 15:
            value_counts = value_counts.head(15)
            title = f"Top 15 Categories in {series.name}"
        else:
            title = f"Categories in {series.name}"

        value_counts.plot(kind='bar', ax=self.categorical_canvas.axes)
        self.categorical_canvas.axes.set_title(title)
        self.categorical_canvas.axes.set_xlabel("Category")
        self.categorical_canvas.axes.set_ylabel("Count")

        # Rotate x-axis labels if needed
        if len(value_counts) > 5:
            plt.setp(self.categorical_canvas.axes.get_xticklabels(), rotation=45, ha='right')

        self.categorical_canvas.fig.tight_layout()
        self.categorical_canvas.draw()


# Function interface for simpler usage
def profile(df):
    """
    Profile a dataframe to identify the distribution characteristics of each column.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame with columns and their distribution profiles
    """
    profiler = DistributionProfiler()
    return profiler.profile(df)


def visualize_profile(df):
    """
    Create a visual representation of the distribution profiles for a dataframe.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        A PyQt6 window showing the visualization
    """
    profiler = DistributionProfiler()
    results = profiler.profile(df)
    vis = DistributionVisualization(df, results)
    vis.show()
    return vis


def test_profile_distributions():
    """Test the distribution profiler with a sample dataframe"""
    import sys

    # Create a QApplication instance if one doesn't exist
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    # Generate a random dataframe with some columns with different distributions
    np.random.seed(42)  # For reproducibility

    # Create a dataframe with columns of varying distributions
    df = pd.DataFrame({
        'uniform': np.random.uniform(0, 100, size=1000),  # Uniform distribution
        'normal': np.random.normal(50, 10, size=1000),  # Normal distribution
        'exponential': np.random.exponential(5, size=1000),  # Exponential distribution
        'lognormal': np.random.lognormal(0, 1, size=1000),  # Log-normal distribution
        'bimodal': np.concatenate([np.random.normal(20, 5, 500), np.random.normal(60, 5, 500)]),  # Bimodal
        'constant': np.ones(1000),  # Constant value
        'binary': np.random.choice([0, 1], size=1000),  # Binary
        'categorical': np.random.choice(['A', 'B', 'C', 'D'], size=1000),  # Categorical data
        'skewed': 100 - np.random.power(5, size=1000) * 100,  # Right-skewed
        'multimodal': np.concatenate([
            np.random.normal(10, 2, 300),
            np.random.normal(30, 2, 300),
            np.random.normal(50, 2, 400)
        ]),  # Multimodal
        'boolean': np.random.choice([True, False], size=1000),  # Boolean
        'integer': np.random.randint(1, 10, size=1000),  # Small integers
        'text': pd.Series(['short', 'medium length', 'very long text entry', 'another value'] * 250)  # Text data
    })

    # Add datetime and timedelta columns
    df['datetime'] = pd.date_range('2020-01-01', periods=1000, freq='h')
    df['timedelta'] = pd.Series([pd.Timedelta(days=i) for i in range(1000)])

    # Add a column with missing values
    df['with_missing'] = df['normal'].copy()
    df.loc[np.random.choice(df.index, size=200), 'with_missing'] = np.nan

    # Calculate and display profile information
    print("Distribution Profile Results:")
    profiler = DistributionProfiler()
    result = profiler.profile(df)
    print(result[['column', 'type', 'distribution', 'unique_count']])

    # Visualize the results
    vis = visualize_profile(df)

    # Start the application event loop
    app.exec()


if __name__ == "__main__":
    test_profile_distributions()
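For reference, the module shown above ships in the wheel as sqlshell.utils.profile_distributions, and its function interface (profile / visualize_profile) can be called without going through the GUI entry point. The snippet below is a minimal usage sketch, not part of the package; it assumes the wheel and its pandas/NumPy/SciPy/matplotlib/PyQt6 dependencies are installed, and the sample column names are purely illustrative.

# Usage sketch (not part of sqlshell): profile a DataFrame and inspect the report
import numpy as np
import pandas as pd

from sqlshell.utils.profile_distributions import profile

# Illustrative data; any DataFrame works
df = pd.DataFrame({
    "normal": np.random.normal(0, 1, size=500),
    "category": np.random.choice(list("ABC"), size=500),
})

# One row per input column, with 'type', 'distribution',
# 'goodness_of_fit' and summary statistics
report = profile(df)
print(report[["column", "type", "distribution", "goodness_of_fit"]])

Note that visualize_profile(df) additionally constructs and shows the DistributionVisualization window, so it requires a QApplication instance and a usable Qt display (or an offscreen platform plugin).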