sqlshell 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,616 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import scipy.stats as stats
4
+ import matplotlib
5
+ try:
6
+ matplotlib.use('qtagg') # Set the backend before importing pyplot
7
+ except ImportError:
8
+ matplotlib.use('Agg') # Fall back to headless backend for CI/testing
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.figure import Figure
11
+ from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
12
+ from PyQt6.QtCore import QObject, pyqtSignal, Qt
13
+ from PyQt6.QtWidgets import (
14
+ QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
15
+ QTableView, QHeaderView, QLabel, QFrame, QScrollArea, QTabWidget,
16
+ QComboBox, QPushButton, QSplitter
17
+ )
18
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush
19
+
20
+
21
class DistributionProfiler(QObject):
    """Analyze the distribution of every column in a pandas DataFrame.

    Numeric columns get descriptive statistics plus a best-fit search over a
    fixed list of ``scipy.stats`` continuous distributions; every other
    column is summarised as categorical data.

    Signals:
        progress_updated(int, str): percentage complete and a status
            message, emitted by :meth:`profile` as it walks the columns.
    """

    progress_updated = pyqtSignal(int, str)  # (percent complete, status message)

    def __init__(self):
        super().__init__()

        # Candidate distributions tried by get_best_distribution().
        # 'name' is the label reported to callers; 'distribution' is the
        # scipy.stats object that actually does the fitting. The two are NOT
        # interchangeable: e.g. scipy has stats.norm but no stats.normal.
        self.distributions = [
            {'name': 'normal', 'distribution': stats.norm, 'color': 'blue'},
            {'name': 'uniform', 'distribution': stats.uniform, 'color': 'green'},
            {'name': 'exponential', 'distribution': stats.expon, 'color': 'red'},
            {'name': 'lognormal', 'distribution': stats.lognorm, 'color': 'purple'},
            {'name': 'gamma', 'distribution': stats.gamma, 'color': 'orange'},
            {'name': 'beta', 'distribution': stats.beta, 'color': 'brown'},
        ]

    def get_best_distribution(self, data):
        """Find the candidate distribution that best fits ``data``.

        Each candidate is fitted with ``distribution.fit`` and scored by the
        sum of squared errors between its PDF and a density histogram of the
        data. The winner is then checked with a Kolmogorov-Smirnov test.

        Args:
            data: pandas Series; null values are dropped before analysis.

        Returns:
            A ``(name, params, p_value)`` tuple:

            * ``(None, None, None)`` for empty or non-numeric data, or when
              no candidate could be fitted.
            * ``('constant', None, 1.0)`` when there is one distinct value.
            * ``('discrete', None, None)`` when there are fewer than five
              distinct values.
            * otherwise the winning distribution's name, its fitted
              parameter tuple (shape parameters, then loc, then scale) and
              the K-S p-value (``None`` if the test itself failed).
        """
        data = data.dropna()

        if len(data) == 0:
            return None, None, None

        # Categorical or otherwise non-numeric data cannot be fitted.
        if not pd.api.types.is_numeric_dtype(data):
            return None, None, None

        unique_values = data.nunique()
        if unique_values == 1:
            return 'constant', None, 1.0

        # Too few distinct values for continuous fitting to be meaningful.
        if unique_values < 5:
            return 'discrete', None, None

        # Density histogram used as the reference for scoring each fit.
        hist, bin_edges = np.histogram(data, bins='auto', density=True)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

        best_name = None
        best_dist = None
        best_params = None
        best_sse = np.inf

        for dist_info in self.distributions:
            distribution = dist_info['distribution']

            try:
                params = distribution.fit(data)
                shape_params = params[:-2]
                loc = params[-2]
                scale = params[-1]
                pdf = distribution.pdf(bin_centers, *shape_params, loc=loc, scale=scale)
                sse = np.sum((pdf - hist) ** 2)
            except Exception:
                # A candidate that cannot be fitted is simply not considered.
                continue

            # A NaN score compares False here and is therefore skipped.
            if sse < best_sse:
                best_name = dist_info['name']
                best_dist = distribution
                best_params = params
                best_sse = sse

        if best_dist is None:
            return None, None, None

        # Goodness of fit. The scipy object is kept from the loop above
        # rather than looked up by display name, and the fitted parameters
        # are forwarded positionally through ``args`` because kstest has no
        # loc/scale keywords of its own.
        try:
            _, p_value = stats.kstest(data, best_dist.cdf, args=best_params)
            p_value = float(p_value)
        except Exception:
            p_value = None

        return best_name, best_params, p_value

    def describe_distribution(self, series):
        """Build a dictionary of distribution statistics for one column.

        Args:
            series: pandas Series to describe.

        Returns:
            dict with at least ``count`` (non-null values), ``unique_count``,
            ``missing_count``, ``missing_percentage`` (relative to the full
            length of ``series``), ``type`` and ``distribution``.
            Categorical columns add ``top_values``; numerical columns add
            ``min``, ``max``, ``mean``, ``median``, ``std``, ``skewness``,
            ``kurtosis`` and ``goodness_of_fit``. A column with no non-null
            values has ``type`` and ``distribution`` both set to ``'empty'``.
        """
        # Missing-value statistics must be taken BEFORE dropping nulls,
        # otherwise they are always zero.
        total_count = len(series)
        missing_count = int(series.isna().sum())
        missing_percentage = (missing_count / total_count) * 100 if total_count else 0.0

        series = series.dropna()

        if len(series) == 0:
            # 'type' is included so profile() can always sort on it, even
            # when every analysed column turns out to be empty.
            return {
                'count': 0,
                'unique_count': 0,
                'missing_count': missing_count,
                'missing_percentage': missing_percentage,
                'type': 'empty',
                'distribution': 'empty',
                'goodness_of_fit': None,
            }

        stats_dict = {
            'count': len(series),
            'unique_count': series.nunique(),
            'missing_count': missing_count,
            'missing_percentage': missing_percentage,
        }

        # Categorical (non-numeric) data: report the most frequent values.
        if not pd.api.types.is_numeric_dtype(series):
            stats_dict['type'] = 'categorical'
            top_values = series.value_counts().head(5).to_dict()
            stats_dict['top_values'] = {str(k): v for k, v in top_values.items()}
            stats_dict['distribution'] = 'categorical'
            return stats_dict

        # Numerical data: descriptive statistics plus best-fit distribution.
        stats_dict['type'] = 'numerical'
        stats_dict['min'] = float(series.min())
        stats_dict['max'] = float(series.max())
        stats_dict['mean'] = float(series.mean())
        stats_dict['median'] = float(series.median())
        stats_dict['std'] = float(series.std())
        stats_dict['skewness'] = float(stats.skew(series))
        stats_dict['kurtosis'] = float(stats.kurtosis(series))

        best_dist, _params, p_value = self.get_best_distribution(series)
        stats_dict['distribution'] = best_dist
        stats_dict['goodness_of_fit'] = p_value

        return stats_dict

    def profile(self, df):
        """
        Profile a dataframe to identify the distribution characteristics of each column.

        Emits ``progress_updated`` before each column and once more with 100
        when finished. Columns whose analysis raises are left out of the
        result (best effort).

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with one row per analysed column, sorted by ``type``
            then ``column``. An empty frame with the columns ``column``,
            ``type``, ``distribution`` and ``goodness_of_fit`` is returned
            when ``df`` is empty or no column could be analysed.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        empty_columns = ['column', 'type', 'distribution', 'goodness_of_fit']

        if df.empty:
            return pd.DataFrame(columns=empty_columns)

        results = []
        total_columns = len(df.columns)

        for i, column in enumerate(df.columns):
            self.progress_updated.emit(int((i / total_columns) * 100), f"Analyzing column: {column}")

            try:
                stats_dict = self.describe_distribution(df[column])
            except Exception:
                # Deliberate best effort: one unanalysable column must not
                # abort the whole profile.
                continue

            stats_dict['column'] = column
            results.append(stats_dict)

        result_df = pd.DataFrame(results)

        if result_df.empty:
            self.progress_updated.emit(100, "Analysis complete")
            return pd.DataFrame(columns=empty_columns)

        # Sort by distribution type and column name
        result_df = result_df.sort_values(by=['type', 'column'])

        self.progress_updated.emit(100, "Analysis complete")
        return result_df
197
+
198
+
199
class MatplotlibCanvas(FigureCanvasQTAgg):
    """Qt canvas wrapping a matplotlib figure with a single subplot.

    The figure is exposed as ``fig`` and its only subplot as ``axes``.
    """

    def __init__(self, width=5, height=4, dpi=100):
        """Create the figure (size in inches, resolution in dpi) and its axes."""
        figure = Figure(figsize=(width, height), dpi=dpi)
        self.fig = figure
        self.axes = figure.add_subplot(1, 1, 1)
        super().__init__(figure)
205
+
206
+
207
class DistributionVisualization(QMainWindow):
    """Window to visualize distribution results.

    The top half shows one table row per profiled column plus a combobox
    selector; the bottom half shows tabbed plots and a statistics panel for
    the currently selected column.
    """

    def __init__(self, df, results_df, parent=None):
        """Build the window.

        Args:
            df: the DataFrame whose columns are plotted.
            results_df: the per-column profile, as produced by
                ``DistributionProfiler.profile`` (needs at least the
                ``column``, ``type``, ``distribution`` and ``count`` columns).
            parent: optional parent widget.
        """
        super().__init__(parent)
        self.df = df
        self.results_df = results_df
        self.current_column = None

        self.setWindowTitle("Column Distribution Profiles")
        self.resize(1000, 800)

        # Create central widget and layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)

        # Add a title
        title = QLabel("Statistical Distribution Analysis")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        main_layout.addWidget(title)

        # Add a description
        description = QLabel(
            "Analyzing column distributions helps identify data patterns and select appropriate statistical methods."
        )
        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
        description.setWordWrap(True)
        main_layout.addWidget(description)

        # Create a splitter for table and visualization
        splitter = QSplitter(Qt.Orientation.Vertical)
        main_layout.addWidget(splitter, 1)

        # Create table view
        table_widget = QWidget()
        table_layout = QVBoxLayout(table_widget)
        self.create_table_view(table_layout)
        splitter.addWidget(table_widget)

        # Create visualization section
        vis_widget = QWidget()
        vis_layout = QVBoxLayout(vis_widget)
        self.create_visualization_section(vis_layout)
        splitter.addWidget(vis_widget)

        # Set initial splitter sizes
        splitter.setSizes([300, 500])

    @staticmethod
    def _format_stat(value, spec=".2f"):
        """Format a numeric value with ``spec``; return "N/A" if missing or not numeric."""
        try:
            if pd.isna(value):
                return "N/A"
            return format(value, spec)
        except (TypeError, ValueError):
            return "N/A"

    def create_table_view(self, layout):
        """Create a table view showing the distribution results.

        Adds the table and a "Select Column" combobox to ``layout`` and
        stores them as ``self.table_view`` and ``self.column_selector``.
        """
        model = QStandardItemModel()
        headers = ['Column', 'Type', 'Distribution', 'Count', 'Unique', 'Missing %']

        # The numeric-statistics columns exist only when at least one
        # numerical column was profiled; header and cells must agree on that.
        show_numeric = 'skewness' in self.results_df.columns
        if show_numeric:
            headers.extend(['Mean', 'Median', 'Std', 'Skewness', 'Kurtosis'])
        model.setHorizontalHeaderLabels(headers)

        numeric_keys = ('mean', 'median', 'std', 'skewness', 'kurtosis')

        for _, row in self.results_df.iterrows():
            missing_text = self._format_stat(row.get('missing_percentage', 0), ".1f")
            if missing_text != "N/A":
                missing_text += "%"

            # Basic columns present in all rows
            items = [
                QStandardItem(str(row['column'])),
                QStandardItem(str(row['type'])),
                QStandardItem(str(row['distribution'])),
                QStandardItem(str(row['count'])),
                QStandardItem(str(row.get('unique_count', 'N/A'))),
                QStandardItem(missing_text),
            ]

            if show_numeric:
                if row['type'] == 'numerical':
                    items.extend(
                        QStandardItem(self._format_stat(row.get(key))) for key in numeric_keys
                    )
                else:
                    # Blank cells keep non-numerical rows aligned.
                    items.extend(QStandardItem("") for _ in numeric_keys)

            model.appendRow(items)

        # Create and configure the table view
        self.table_view = QTableView()
        self.table_view.setModel(model)
        self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.ResizeToContents)
        self.table_view.setAlternatingRowColors(True)
        self.table_view.setSelectionBehavior(QTableView.SelectionBehavior.SelectRows)
        self.table_view.setMinimumHeight(200)

        # Connect selection signal
        self.table_view.selectionModel().selectionChanged.connect(self.on_column_selected)

        layout.addWidget(self.table_view)

        # Add column selector
        selector_layout = QHBoxLayout()
        selector_layout.addWidget(QLabel("Select Column:"))

        self.column_selector = QComboBox()
        # Column labels are not necessarily strings (e.g. integer labels).
        self.column_selector.addItems([str(name) for name in self.results_df['column'].tolist()])
        self.column_selector.currentTextChanged.connect(self.on_combobox_changed)
        selector_layout.addWidget(self.column_selector)

        layout.addLayout(selector_layout)

    def _add_plot_tab(self, title):
        """Add a tab holding one MatplotlibCanvas; return (tab, layout, canvas)."""
        tab = QWidget()
        tab_layout = QVBoxLayout(tab)
        canvas = MatplotlibCanvas(width=8, height=4)
        tab_layout.addWidget(canvas)
        self.tab_widget.addTab(tab, title)
        return tab, tab_layout, canvas

    def create_visualization_section(self, layout):
        """Create the visualization section with tabs for different plots.

        Adds the tab widget and the statistics label to ``layout``.
        """
        self.tab_widget = QTabWidget()

        # Tabs for numerical data
        self.histogram_tab, self.histogram_layout, self.histogram_canvas = \
            self._add_plot_tab("Histogram & Density")
        self.boxplot_tab, self.boxplot_layout, self.boxplot_canvas = \
            self._add_plot_tab("Box Plot")
        self.qq_tab, self.qq_layout, self.qq_canvas = \
            self._add_plot_tab("Q-Q Plot")
        self.ecdf_tab, self.ecdf_layout, self.ecdf_canvas = \
            self._add_plot_tab("Empirical CDF")

        # Tab for categorical data
        self.categorical_tab, self.categorical_layout, self.categorical_canvas = \
            self._add_plot_tab("Bar Chart")

        layout.addWidget(self.tab_widget)

        # Stats panel
        self.stats_label = QLabel("Select a column to view distribution statistics")
        self.stats_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
        self.stats_label.setStyleSheet("font-family: monospace; background-color: #f0f0f0; padding: 10px; border-radius: 5px;")
        self.stats_label.setWordWrap(True)
        layout.addWidget(self.stats_label)

    def on_combobox_changed(self, column_name):
        """Handle column selection from combobox"""
        self.visualize_column(column_name)

    def on_column_selected(self, selected, deselected):
        """Handle column selection from table.

        Syncs the combobox to the selected row and renders that column once.
        """
        indexes = selected.indexes()
        if not indexes:
            return

        # The column name lives in the first cell of the selected row.
        row_idx = indexes[0].row()
        column_name = self.table_view.model().item(row_idx, 0).text()

        index = self.column_selector.findText(column_name)
        if index >= 0:
            # Block the combobox signal while syncing it; otherwise
            # on_combobox_changed would render the column a second time.
            was_blocked = self.column_selector.blockSignals(True)
            try:
                self.column_selector.setCurrentIndex(index)
            finally:
                self.column_selector.blockSignals(was_blocked)

        self.visualize_column(column_name)

    def _resolve_column(self, column_name):
        """Map a displayed column name back to the DataFrame's column label.

        Returns ``None`` when nothing matches. The string comparison covers
        non-string labels, which the widgets show via ``str()``.
        """
        if column_name in self.df.columns:
            return column_name
        for candidate in self.df.columns:
            if str(candidate) == column_name:
                return candidate
        return None

    def visualize_column(self, column_name):
        """Visualize the selected column with various plots.

        Does nothing when the column is not in ``self.df`` or has no row in
        ``self.results_df``.
        """
        column_key = self._resolve_column(column_name)
        if column_key is None:
            return

        matches = self.results_df[self.results_df['column'] == column_key]
        if matches.empty:
            # The profiler may have skipped this column.
            return

        self.current_column = column_key

        # Get column data and stats
        series = self.df[column_key]
        column_stats = matches.iloc[0].to_dict()

        # Update stats label
        self.update_stats_display(column_stats)

        # Check if categorical or numerical
        if column_stats['type'] == 'categorical':
            self.create_categorical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.categorical_tab)
        else:
            self.create_numerical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.histogram_tab)

    def update_stats_display(self, stats):
        """Update the statistics display panel.

        Args:
            stats: dict of statistics for one column (one row of the results
                frame). Note that this parameter shadows the module-level
                ``scipy.stats`` alias inside this method.
        """
        # The label holds rich text (<b> tags), where "\n" is ordinary
        # whitespace; lines therefore have to be joined with <br>.
        if stats['type'] == 'numerical':
            lines = [
                f"<b>Column:</b> {stats['column']} | <b>Type:</b> {stats['type']} | "
                f"<b>Distribution:</b> {stats['distribution']}",
                f"<b>Count:</b> {stats['count']} | <b>Unique:</b> {stats['unique_count']} | "
                f"<b>Missing:</b> {stats['missing_count']} ({stats['missing_percentage']:.1f}%)",
                f"<b>Min:</b> {stats['min']:.4g} | <b>Max:</b> {stats['max']:.4g} | "
                f"<b>Mean:</b> {stats['mean']:.4g} | <b>Median:</b> {stats['median']:.4g} | <b>Std:</b> {stats['std']:.4g}",
                f"<b>Skewness:</b> {stats['skewness']:.4g} | <b>Kurtosis:</b> {stats['kurtosis']:.4g}",
            ]

            # A missing p-value can arrive as None or as NaN once the value
            # has been through a DataFrame.
            goodness = stats.get('goodness_of_fit')
            if goodness is not None and not pd.isna(goodness):
                lines[-1] += f" | <b>Goodness of fit (p-value):</b> {goodness:.4g}"
        else:
            lines = [
                f"<b>Column:</b> {stats['column']} | <b>Type:</b> {stats['type']}",
                f"<b>Count:</b> {stats['count']} | <b>Unique:</b> {stats['unique_count']} | "
                f"<b>Missing:</b> {stats['missing_count']} ({stats['missing_percentage']:.1f}%)",
            ]

            # The key exists on every row taken from the results frame, but
            # only categorical rows carry an actual dict there.
            top_values = stats.get('top_values')
            if isinstance(top_values, dict):
                lines.append(
                    "<b>Top values:</b> "
                    + ", ".join([f"{k} ({v})" for k, v in top_values.items()])
                )

        self.stats_label.setText("<br>".join(lines))

    @staticmethod
    def _fitted_pdf(dist_name, data, x):
        """Evaluate the named distribution's PDF at ``x`` using parameters estimated from ``data``.

        Returns ``None`` for an unknown name or when fitting fails.
        """
        if dist_name == 'normal':
            return stats.norm.pdf(x, data.mean(), data.std())
        if dist_name == 'uniform':
            low = data.min()
            return stats.uniform.pdf(x, low, data.max() - low)
        if dist_name == 'exponential':
            # scipy parameterises expon by loc and scale, where scale is the
            # mean of (x - loc), i.e. 1/rate, not the rate itself.
            loc = data.min()
            return stats.expon.pdf(x, loc=loc, scale=data.mean() - loc)

        # No closed-form estimate is used for the remaining candidates, so
        # they are re-fitted here; this runs on every selection.
        fitted = {
            'lognormal': stats.lognorm,
            'gamma': stats.gamma,
            'beta': stats.beta,
        }.get(dist_name)
        if fitted is None:
            return None
        try:
            params = fitted.fit(data)
            return fitted.pdf(x, *params)
        except Exception:
            return None

    def create_numerical_plots(self, series, column_stats):
        """Create plots for numerical data.

        Draws the histogram (with a fitted density when one is available),
        box plot, normal Q-Q plot and empirical CDF for ``series``.
        """
        # Clean data
        data = series.dropna()

        if data.empty:
            # Nothing to plot: clear stale content rather than feeding empty
            # input to the plotting calls below.
            for canvas in (self.histogram_canvas, self.boxplot_canvas,
                           self.qq_canvas, self.ecdf_canvas):
                canvas.axes.clear()
                canvas.draw()
            return

        # Histogram with fitted distribution
        hist_axes = self.histogram_canvas.axes
        hist_axes.clear()
        hist_axes.hist(data, bins='auto', density=True, alpha=0.6, label="Data")

        dist_name = column_stats['distribution']
        if dist_name not in [None, 'discrete', 'constant', 'categorical']:
            x = np.linspace(data.min(), data.max(), 1000)
            y = self._fitted_pdf(dist_name, data, x)
            if y is not None:
                # Some densities are infinite at the edge of their support;
                # mask those points so they don't affect the plot.
                y = np.where(np.isfinite(y), y, np.nan)
                hist_axes.plot(x, y, 'r-', lw=2, label=f"Fitted {dist_name}")

        hist_axes.set_title(f"Histogram of {series.name}")
        hist_axes.set_xlabel("Value")
        hist_axes.set_ylabel("Density")
        hist_axes.legend()
        self.histogram_canvas.fig.tight_layout()
        self.histogram_canvas.draw()

        # Box plot
        box_axes = self.boxplot_canvas.axes
        box_axes.clear()
        box_axes.boxplot(data, vert=False)
        box_axes.set_title(f"Box Plot of {series.name}")
        box_axes.set_xlabel("Value")
        box_axes.set_yticks([])
        self.boxplot_canvas.fig.tight_layout()
        self.boxplot_canvas.draw()

        # Q-Q plot
        self.qq_canvas.axes.clear()
        stats.probplot(data, dist="norm", plot=self.qq_canvas.axes)
        self.qq_canvas.axes.set_title(f"Q-Q Plot of {series.name} (vs Normal)")
        self.qq_canvas.fig.tight_layout()
        self.qq_canvas.draw()

        # Empirical CDF
        ecdf_axes = self.ecdf_canvas.axes
        ecdf_axes.clear()
        x = np.sort(data)
        y = np.arange(1, len(x) + 1) / len(x)
        ecdf_axes.step(x, y, where='post', label="Empirical CDF")
        ecdf_axes.set_title(f"Empirical CDF of {series.name}")
        ecdf_axes.set_xlabel("Value")
        ecdf_axes.set_ylabel("Cumulative Probability")
        self.ecdf_canvas.fig.tight_layout()
        self.ecdf_canvas.draw()

    def create_categorical_plots(self, series, stats):
        """Create plots for categorical data.

        Draws a bar chart of value counts, limited to the 15 most frequent
        categories. ``stats`` is accepted for symmetry with
        ``create_numerical_plots`` and is not used here.
        """
        # Clean data
        data = series.dropna()

        # Bar chart for categorical data
        bar_axes = self.categorical_canvas.axes
        bar_axes.clear()
        value_counts = data.value_counts().sort_values(ascending=False)

        # Limit to top 15 categories if there are too many
        if len(value_counts) > 15:
            value_counts = value_counts.head(15)
            title = f"Top 15 Categories in {series.name}"
        else:
            title = f"Categories in {series.name}"

        value_counts.plot(kind='bar', ax=bar_axes)
        bar_axes.set_title(title)
        bar_axes.set_xlabel("Category")
        bar_axes.set_ylabel("Count")

        # Rotate x-axis labels if needed
        if len(value_counts) > 5:
            plt.setp(bar_axes.get_xticklabels(), rotation=45, ha='right')

        self.categorical_canvas.fig.tight_layout()
        self.categorical_canvas.draw()
527
+
528
+
529
+ # Function interface for simpler usage
530
def profile(df):
    """
    Convenience wrapper: profile every column of ``df`` with a fresh
    DistributionProfiler.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame describing each column's distribution profile
    """
    return DistributionProfiler().profile(df)
542
+
543
+
544
def visualize_profile(df):
    """
    Profile ``df`` and open a window showing the resulting distribution profiles.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        The DistributionVisualization window, already shown. Keep a
        reference to it for as long as the window should stay open.
    """
    results = DistributionProfiler().profile(df)
    window = DistributionVisualization(df, results)
    window.show()
    return window
559
+
560
+
561
def test_profile_distributions():
    """Run the distribution profiler on a sample dataframe and show the results.

    This is an interactive demo: it prints a summary, opens the
    visualization window and blocks in the Qt event loop until the window
    is closed.
    """
    import sys

    # Create a QApplication instance if one doesn't exist
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    # Generate a random dataframe with some columns with different distributions
    np.random.seed(42)  # For reproducibility

    # Create a dataframe with columns of varying distributions
    df = pd.DataFrame({
        'uniform': np.random.uniform(0, 100, size=1000),  # Uniform distribution
        'normal': np.random.normal(50, 10, size=1000),  # Normal distribution
        'exponential': np.random.exponential(5, size=1000),  # Exponential distribution
        'lognormal': np.random.lognormal(0, 1, size=1000),  # Log-normal distribution
        'bimodal': np.concatenate([np.random.normal(20, 5, 500), np.random.normal(60, 5, 500)]),  # Bimodal
        'constant': np.ones(1000),  # Constant value
        'binary': np.random.choice([0, 1], size=1000),  # Binary
        'categorical': np.random.choice(['A', 'B', 'C', 'D'], size=1000),  # Categorical data
        'skewed': 100 - np.random.power(5, size=1000) * 100,  # Right-skewed
        'multimodal': np.concatenate([
            np.random.normal(10, 2, 300),
            np.random.normal(30, 2, 300),
            np.random.normal(50, 2, 400)
        ]),  # Multimodal
        'boolean': np.random.choice([True, False], size=1000),  # Boolean
        'integer': np.random.randint(1, 10, size=1000),  # Small integers
        'text': pd.Series(['short', 'medium length', 'very long text entry', 'another value'] * 250)  # Text data
    })

    # Add datetime and timedelta columns
    df['datetime'] = pd.date_range('2020-01-01', periods=1000, freq='h')
    df['timedelta'] = pd.Series([pd.Timedelta(days=i) for i in range(1000)])

    # Add a column with missing values
    df['with_missing'] = df['normal'].copy()
    df.loc[np.random.choice(df.index, size=200), 'with_missing'] = np.nan

    # Calculate and display profile information
    print("Distribution Profile Results:")
    profiler = DistributionProfiler()
    result = profiler.profile(df)
    print(result[['column', 'type', 'distribution', 'unique_count']])

    # Visualize the results. The profile computed above is reused instead of
    # going through visualize_profile(), which would repeat every fit.
    # The reference must be kept so the window is not garbage collected.
    vis = DistributionVisualization(df, result)
    vis.show()

    # Start the application event loop
    app.exec()


# Running this module as a script launches the interactive demo.
if __name__ == "__main__":
    test_profile_distributions()