sqlshell 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sqlshell might be problematic. Click here for more details.

@@ -0,0 +1,613 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import scipy.stats as stats
4
+ import matplotlib
5
+ matplotlib.use('qtagg') # Set the backend before importing pyplot
6
+ import matplotlib.pyplot as plt
7
+ from matplotlib.figure import Figure
8
+ from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg
9
+ from PyQt6.QtCore import QObject, pyqtSignal, Qt
10
+ from PyQt6.QtWidgets import (
11
+ QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
12
+ QTableView, QHeaderView, QLabel, QFrame, QScrollArea, QTabWidget,
13
+ QComboBox, QPushButton, QSplitter
14
+ )
15
+ from PyQt6.QtGui import QStandardItemModel, QStandardItem, QColor, QBrush
16
+
17
+
18
class DistributionProfiler(QObject):
    """Analyze the distribution of each column in a pandas DataFrame.

    Numeric columns get summary statistics plus the candidate distribution
    whose fitted PDF is closest (smallest sum of squared errors) to the
    column's density histogram. Non-numeric columns are summarised as
    categorical with their most frequent values.
    """

    # Emitted by profile() as (percent complete, status message).
    progress_updated = pyqtSignal(int, str)

    # Columns of the frame returned when nothing could be profiled.
    _EMPTY_RESULT_COLUMNS = ('column', 'type', 'distribution', 'goodness_of_fit')

    def __init__(self):
        super().__init__()

        # Candidate distributions tried by get_best_distribution(). 'name' is
        # the label reported to callers; it is NOT always the scipy attribute
        # name (e.g. 'normal' vs stats.norm), so always use 'distribution'
        # when the scipy object itself is needed.
        self.distributions = [
            {'name': 'normal', 'distribution': stats.norm, 'color': 'blue'},
            {'name': 'uniform', 'distribution': stats.uniform, 'color': 'green'},
            {'name': 'exponential', 'distribution': stats.expon, 'color': 'red'},
            {'name': 'lognormal', 'distribution': stats.lognorm, 'color': 'purple'},
            {'name': 'gamma', 'distribution': stats.gamma, 'color': 'orange'},
            {'name': 'beta', 'distribution': stats.beta, 'color': 'brown'},
        ]

    def get_best_distribution(self, data):
        """Find the candidate distribution that best fits ``data``.

        Args:
            data: pandas Series; missing values are ignored.

        Returns:
            A ``(name, params, p_value)`` tuple:

            * ``(None, None, None)`` for empty or non-numeric data, or when
              no candidate could be fitted;
            * ``('constant', None, 1.0)`` when there is a single unique value;
            * ``('discrete', None, None)`` when there are fewer than five
              unique values;
            * otherwise the winning distribution's name, its fitted parameter
              tuple ``(*shape, loc, scale)`` and the Kolmogorov-Smirnov
              p-value (``None`` if the test could not be computed).
        """
        data = data.dropna()

        if len(data) == 0:
            return None, None, None

        # Categorical or otherwise non-numeric data cannot be fitted.
        if not pd.api.types.is_numeric_dtype(data):
            return None, None, None

        unique_values = data.nunique()
        if unique_values == 1:
            return 'constant', None, 1.0

        # Too few distinct values for a continuous fit to be meaningful.
        if unique_values < 5:
            return 'discrete', None, None

        # Density histogram the fitted PDFs are compared against.
        hist, bin_edges = np.histogram(data, bins='auto', density=True)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

        best_name = None
        best_dist = None
        best_params = None
        best_sse = np.inf

        for dist_info in self.distributions:
            distribution = dist_info['distribution']

            try:
                # fit() returns (*shape, loc, scale); pdf() accepts the same
                # values positionally in that order.
                params = distribution.fit(data)
                pdf = distribution.pdf(bin_centers, *params)
                sse = np.sum((pdf - hist) ** 2)
            except Exception:
                # Best effort: a candidate that cannot be fitted to this
                # data is simply not considered.
                continue

            if sse < best_sse:
                best_name = dist_info['name']
                best_dist = distribution
                best_params = params
                best_sse = sse

        if best_dist is None:
            return None, None, None

        # Kolmogorov-Smirnov goodness of fit against the winning candidate.
        # kstest() forwards ``args`` to the CDF, so the complete parameter
        # tuple (including loc and scale) goes there; kstest itself has no
        # loc/scale keywords. NOTE: the parameters were estimated from the
        # same data, so the p-value is optimistic.
        try:
            _, p_value = stats.kstest(data, best_dist.cdf, args=best_params)
        except Exception:
            return best_name, best_params, None

        return best_name, best_params, float(p_value)

    def describe_distribution(self, series):
        """Return a dict of distribution statistics for one column.

        Args:
            series: pandas Series to describe.

        Returns:
            Dict that always contains 'count' (non-missing values),
            'unique_count', 'missing_count', 'missing_percentage' (relative
            to the total number of rows), 'type' and 'distribution'.
            Categorical columns add 'top_values'; numerical columns add
            'min', 'max', 'mean', 'median', 'std', 'skewness', 'kurtosis'
            and 'goodness_of_fit'. Columns without any non-missing value
            are reported with type and distribution 'empty'.
        """
        # Missing values must be counted BEFORE they are dropped.
        total_rows = len(series)
        missing_count = int(series.isna().sum())
        missing_percentage = (missing_count / total_rows) * 100 if total_rows else 0.0

        series = series.dropna()

        if len(series) == 0:
            return {
                'count': 0,
                'unique_count': 0,
                'missing_count': missing_count,
                'missing_percentage': missing_percentage,
                # 'type' is required by profile(), which sorts on it.
                'type': 'empty',
                'distribution': 'empty',
                'goodness_of_fit': None,
            }

        stats_dict = {
            'count': len(series),
            'unique_count': series.nunique(),
            'missing_count': missing_count,
            'missing_percentage': missing_percentage,
        }

        # Categorical data: report the five most frequent values.
        if not pd.api.types.is_numeric_dtype(series):
            stats_dict['type'] = 'categorical'
            top_values = series.value_counts().head(5).to_dict()
            stats_dict['top_values'] = {str(k): v for k, v in top_values.items()}
            stats_dict['distribution'] = 'categorical'
            return stats_dict

        # Numerical data: summary statistics plus best-fit distribution.
        stats_dict['type'] = 'numerical'
        stats_dict['min'] = float(series.min())
        stats_dict['max'] = float(series.max())
        stats_dict['mean'] = float(series.mean())
        stats_dict['median'] = float(series.median())
        stats_dict['std'] = float(series.std())
        stats_dict['skewness'] = float(stats.skew(series))
        stats_dict['kurtosis'] = float(stats.kurtosis(series))

        best_dist, _params, p_value = self.get_best_distribution(series)
        stats_dict['distribution'] = best_dist
        stats_dict['goodness_of_fit'] = p_value

        return stats_dict

    def profile(self, df):
        """
        Profile a dataframe to identify the distribution characteristics of each column.

        Emits ``progress_updated`` before each column is analysed and once
        more with 100 when the analysis has finished.

        Args:
            df: pandas DataFrame to analyze

        Returns:
            DataFrame with one row per analysed column, sorted by type and
            column name. Columns whose analysis raises are skipped.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        if df.empty:
            return pd.DataFrame(columns=list(self._EMPTY_RESULT_COLUMNS))

        results = []
        total_columns = len(df.columns)

        for i, column in enumerate(df.columns):
            self.progress_updated.emit(int((i / total_columns) * 100), f"Analyzing column: {column}")

            try:
                stats_dict = self.describe_distribution(df[column])
            except Exception:
                # Best effort: skip columns that cannot be analysed.
                continue

            stats_dict['column'] = column
            results.append(stats_dict)

        result_df = pd.DataFrame(results)

        if result_df.empty:
            # Still report completion so progress listeners can finish.
            self.progress_updated.emit(100, "Analysis complete")
            return pd.DataFrame(columns=list(self._EMPTY_RESULT_COLUMNS))

        # Sort by distribution type and column name.
        result_df = result_df.sort_values(by=['type', 'column'])

        self.progress_updated.emit(100, "Analysis complete")
        return result_df
194
+
195
+
196
class MatplotlibCanvas(FigureCanvasQTAgg):
    """Qt widget wrapping a Matplotlib figure that holds a single subplot.

    The figure is exposed as ``fig`` and its only axes as ``axes``.
    """

    def __init__(self, width=5, height=4, dpi=100):
        # Build the figure first: the Qt canvas base class needs it at
        # construction time. width/height are passed to ``figsize``.
        figure = Figure(figsize=(width, height), dpi=dpi)
        self.fig = figure
        self.axes = figure.add_subplot(111)
        super().__init__(figure)
202
+
203
+
204
class DistributionVisualization(QMainWindow):
    """Window that shows profiling results and per-column distribution plots.

    The upper part lists one row per profiled column together with a column
    selector; the lower part holds tabbed plots (histogram, box plot, Q-Q
    plot, empirical CDF, bar chart) and a statistics panel for the column
    that is currently selected.
    """

    def __init__(self, df, results_df, parent=None):
        """Build the window.

        Args:
            df: the DataFrame that was profiled (source of the plotted data).
            results_df: the profiling results for ``df``; must contain a
                'column' column.
            parent: optional parent widget.
        """
        super().__init__(parent)
        self.df = df
        self.results_df = results_df
        self.current_column = None

        # The widgets only carry text, so remember which real column label
        # each displayed string stands for (labels need not be strings).
        self._column_labels = {str(name): name for name in results_df['column']}

        self.setWindowTitle("Column Distribution Profiles")
        self.resize(1000, 800)

        # Central widget and layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)

        # Title
        title = QLabel("Statistical Distribution Analysis")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet("font-size: 16pt; font-weight: bold; margin: 10px;")
        main_layout.addWidget(title)

        # Description
        description = QLabel(
            "Analyzing column distributions helps identify data patterns and select appropriate statistical methods."
        )
        description.setAlignment(Qt.AlignmentFlag.AlignCenter)
        description.setWordWrap(True)
        main_layout.addWidget(description)

        # Splitter separating the table from the visualizations
        splitter = QSplitter(Qt.Orientation.Vertical)
        main_layout.addWidget(splitter, 1)

        # Table view
        table_widget = QWidget()
        table_layout = QVBoxLayout(table_widget)
        self.create_table_view(table_layout)
        splitter.addWidget(table_widget)

        # Visualization section
        vis_widget = QWidget()
        vis_layout = QVBoxLayout(vis_widget)
        self.create_visualization_section(vis_layout)
        splitter.addWidget(vis_widget)

        # Initial splitter sizes
        splitter.setSizes([300, 500])

    @staticmethod
    def _format_number(value, spec, suffix=""):
        """Format ``value`` with ``spec``; return "N/A" if it is missing or not a number."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return "N/A"
        if np.isnan(number):
            return "N/A"
        return f"{number:{spec}}{suffix}"

    def create_table_view(self, layout):
        """Create the results table and the column selector and add them to ``layout``."""
        model = QStandardItemModel()
        headers = ['Column', 'Type', 'Distribution', 'Count', 'Unique', 'Missing %']
        numeric_fields = ['mean', 'median', 'std', 'skewness', 'kurtosis']

        # Numeric statistics exist only if at least one numerical column was
        # profiled; headers and row cells must agree on that.
        show_numeric = 'skewness' in self.results_df.columns
        if show_numeric:
            headers.extend(['Mean', 'Median', 'Std', 'Skewness', 'Kurtosis'])
        model.setHorizontalHeaderLabels(headers)

        for _, row in self.results_df.iterrows():
            # Basic cells present in all rows
            items = [
                QStandardItem(str(row['column'])),
                QStandardItem(str(row['type'])),
                QStandardItem(str(row['distribution'])),
                QStandardItem(str(row['count'])),
                QStandardItem(str(row.get('unique_count', 'N/A'))),
                QStandardItem(self._format_number(row.get('missing_percentage', 0), '.1f', '%')),
            ]

            if show_numeric:
                if row['type'] == 'numerical':
                    items.extend(
                        QStandardItem(self._format_number(row.get(field), '.2f'))
                        for field in numeric_fields
                    )
                else:
                    # Blank cells keep non-numerical rows aligned.
                    items.extend(QStandardItem("") for _ in numeric_fields)

            model.appendRow(items)

        # Create and configure the table view
        self.table_view = QTableView()
        self.table_view.setModel(model)
        self.table_view.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.ResizeToContents)
        self.table_view.setAlternatingRowColors(True)
        self.table_view.setSelectionBehavior(QTableView.SelectionBehavior.SelectRows)
        self.table_view.setMinimumHeight(200)

        # Connect selection signal
        self.table_view.selectionModel().selectionChanged.connect(self.on_column_selected)

        layout.addWidget(self.table_view)

        # Column selector
        selector_layout = QHBoxLayout()
        selector_layout.addWidget(QLabel("Select Column:"))

        self.column_selector = QComboBox()
        # addItems() only accepts strings; column labels may be anything.
        self.column_selector.addItems([str(name) for name in self.results_df['column'].tolist()])
        self.column_selector.currentTextChanged.connect(self.on_combobox_changed)
        selector_layout.addWidget(self.column_selector)

        layout.addLayout(selector_layout)

    def _add_plot_tab(self, title):
        """Add a tab titled ``title`` holding one canvas; return (tab, layout, canvas)."""
        tab = QWidget()
        tab_layout = QVBoxLayout(tab)
        canvas = MatplotlibCanvas(width=8, height=4)
        tab_layout.addWidget(canvas)
        self.tab_widget.addTab(tab, title)
        return tab, tab_layout, canvas

    def create_visualization_section(self, layout):
        """Create the plot tabs and the statistics panel and add them to ``layout``."""
        self.tab_widget = QTabWidget()

        # Tabs for numerical data
        self.histogram_tab, self.histogram_layout, self.histogram_canvas = \
            self._add_plot_tab("Histogram & Density")
        self.boxplot_tab, self.boxplot_layout, self.boxplot_canvas = \
            self._add_plot_tab("Box Plot")
        self.qq_tab, self.qq_layout, self.qq_canvas = \
            self._add_plot_tab("Q-Q Plot")
        self.ecdf_tab, self.ecdf_layout, self.ecdf_canvas = \
            self._add_plot_tab("Empirical CDF")

        # Tab for categorical data
        self.categorical_tab, self.categorical_layout, self.categorical_canvas = \
            self._add_plot_tab("Bar Chart")

        layout.addWidget(self.tab_widget)

        # Stats panel
        self.stats_label = QLabel("Select a column to view distribution statistics")
        self.stats_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
        self.stats_label.setStyleSheet("font-family: monospace; background-color: #f0f0f0; padding: 10px; border-radius: 5px;")
        self.stats_label.setWordWrap(True)
        layout.addWidget(self.stats_label)

    def on_combobox_changed(self, column_name):
        """Handle column selection from combobox"""
        self.visualize_column(column_name)

    def on_column_selected(self, selected, deselected):
        """Handle column selection from table"""
        indexes = selected.indexes()
        if not indexes:
            return

        # The column name lives in the first cell of the selected row.
        row_idx = indexes[0].row()
        column_name = self.table_view.model().item(row_idx, 0).text()

        # Keep the combobox in sync without letting it fire
        # currentTextChanged, which would render the same column twice.
        index = self.column_selector.findText(column_name)
        if index >= 0:
            was_blocked = self.column_selector.blockSignals(True)
            try:
                self.column_selector.setCurrentIndex(index)
            finally:
                self.column_selector.blockSignals(was_blocked)

        self.visualize_column(column_name)

    def visualize_column(self, column_name):
        """Show statistics and plots for ``column_name``.

        ``column_name`` may be a real column label or the string shown for
        it in the widgets. Unknown columns, and columns without a row in
        the results, are ignored.
        """
        if column_name in self.df.columns:
            column_key = column_name
        else:
            column_key = self._column_labels.get(column_name)
            if column_key is None or column_key not in self.df.columns:
                return

        matches = self.results_df[self.results_df['column'] == column_key]
        if matches.empty:
            # The profiler skips columns it cannot analyse.
            return

        self.current_column = column_key

        series = self.df[column_key]
        column_stats = matches.iloc[0].to_dict()

        self.update_stats_display(column_stats)

        if column_stats['type'] == 'categorical':
            self.create_categorical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.categorical_tab)
        else:
            self.create_numerical_plots(series, column_stats)
            self.tab_widget.setCurrentWidget(self.histogram_tab)

    def update_stats_display(self, stats):
        """Update the statistics panel from one results row given as a dict.

        The text is HTML: lines are separated with ``<br>`` (a plain newline
        is just whitespace in rich text) and data-derived strings are
        escaped so that names such as ``(a < b)`` display literally.
        """
        from html import escape  # local import: only needed for this panel

        fmt = self._format_number
        column = escape(str(stats['column']))
        column_type = escape(str(stats['type']))
        counts_line = (
            f"<b>Count:</b> {stats.get('count', 'N/A')} | "
            f"<b>Unique:</b> {stats.get('unique_count', 'N/A')} | "
            f"<b>Missing:</b> {stats.get('missing_count', 'N/A')} "
            f"({fmt(stats.get('missing_percentage'), '.1f', '%')})"
        )

        if stats['type'] == 'numerical':
            distribution = escape(str(stats.get('distribution')))
            shape_line = (
                f"<b>Skewness:</b> {fmt(stats.get('skewness'), '.4g')} | "
                f"<b>Kurtosis:</b> {fmt(stats.get('kurtosis'), '.4g')}"
            )
            # A missing p-value may arrive as None or as NaN once the
            # results have been through a DataFrame.
            goodness_of_fit = stats.get('goodness_of_fit')
            if pd.notna(goodness_of_fit):
                shape_line += f" | <b>Goodness of fit (p-value):</b> {fmt(goodness_of_fit, '.4g')}"

            lines = [
                f"<b>Column:</b> {column} | <b>Type:</b> {column_type} | "
                f"<b>Distribution:</b> {distribution}",
                counts_line,
                f"<b>Min:</b> {fmt(stats.get('min'), '.4g')} | "
                f"<b>Max:</b> {fmt(stats.get('max'), '.4g')} | "
                f"<b>Mean:</b> {fmt(stats.get('mean'), '.4g')} | "
                f"<b>Median:</b> {fmt(stats.get('median'), '.4g')} | "
                f"<b>Std:</b> {fmt(stats.get('std'), '.4g')}",
                shape_line,
            ]
        else:
            lines = [
                f"<b>Column:</b> {column} | <b>Type:</b> {column_type}",
                counts_line,
            ]

            # Rows of other types hold NaN here rather than a dict.
            top_values = stats.get('top_values')
            if isinstance(top_values, dict):
                lines.append(
                    "<b>Top values:</b> "
                    + ", ".join(f"{escape(str(k))} ({v})" for k, v in top_values.items())
                )

        self.stats_label.setText("<br>".join(lines))

    def create_numerical_plots(self, series, column_stats):
        """Draw histogram, box plot, Q-Q plot and empirical CDF for a numerical column.

        Args:
            series: the column's data; missing values are ignored.
            column_stats: the column's results row as a dict; only
                'distribution' is read.
        """
        canvases = (self.histogram_canvas, self.boxplot_canvas,
                    self.qq_canvas, self.ecdf_canvas)
        data = series.dropna()

        if data.empty:
            # Nothing to plot: blank the canvases instead of letting the
            # plotting routines fail on empty input.
            for canvas in canvases:
                canvas.axes.clear()
                canvas.draw()
            return

        # pandas counts bool columns as numeric; plotting on floats avoids
        # relying on bool arithmetic inside the statistics routines.
        try:
            data = data.astype(float)
        except (TypeError, ValueError):
            pass

        # Histogram with fitted distribution
        hist_axes = self.histogram_canvas.axes
        hist_axes.clear()
        hist_axes.hist(data, bins='auto', density=True, alpha=0.6, label="Data")

        # The profiler's fitted parameters are not stored in the results, so
        # the curve is re-estimated here for the distributions that have a
        # closed-form (loc, scale) estimate. Other fitted distributions
        # (lognormal, gamma, beta) get no curve.
        dist_name = column_stats.get('distribution')
        low, high = data.min(), data.max()
        closed_form = {
            'normal': (stats.norm, data.mean(), data.std()),
            'uniform': (stats.uniform, low, high - low),
            # scipy's ``scale`` for expon is the mean (1/rate), measured
            # from ``loc``, not the rate itself.
            'exponential': (stats.expon, low, data.mean() - low),
        }
        if isinstance(dist_name, str) and dist_name in closed_form:
            dist, loc, scale = closed_form[dist_name]
            x = np.linspace(low, high, 1000)
            y = dist.pdf(x, loc=loc, scale=scale)
            hist_axes.plot(x, y, 'r-', lw=2, label=f"Fitted {dist_name}")

        hist_axes.set_title(f"Histogram of {series.name}")
        hist_axes.set_xlabel("Value")
        hist_axes.set_ylabel("Density")
        hist_axes.legend()
        self.histogram_canvas.fig.tight_layout()
        self.histogram_canvas.draw()

        # Box plot
        box_axes = self.boxplot_canvas.axes
        box_axes.clear()
        box_axes.boxplot(data, vert=False)
        box_axes.set_title(f"Box Plot of {series.name}")
        box_axes.set_xlabel("Value")
        box_axes.set_yticks([])
        self.boxplot_canvas.fig.tight_layout()
        self.boxplot_canvas.draw()

        # Q-Q plot
        qq_axes = self.qq_canvas.axes
        qq_axes.clear()
        try:
            stats.probplot(data, dist="norm", plot=qq_axes)
        except ValueError:
            # NOTE(review): presumably raised when there are too few points
            # for the regression line; the plot is then left empty.
            qq_axes.clear()
        qq_axes.set_title(f"Q-Q Plot of {series.name} (vs Normal)")
        self.qq_canvas.fig.tight_layout()
        self.qq_canvas.draw()

        # Empirical CDF
        ecdf_axes = self.ecdf_canvas.axes
        ecdf_axes.clear()
        x = np.sort(data)
        y = np.arange(1, len(x) + 1) / len(x)
        ecdf_axes.step(x, y, where='post', label="Empirical CDF")
        ecdf_axes.set_title(f"Empirical CDF of {series.name}")
        ecdf_axes.set_xlabel("Value")
        ecdf_axes.set_ylabel("Cumulative Probability")
        self.ecdf_canvas.fig.tight_layout()
        self.ecdf_canvas.draw()

    def create_categorical_plots(self, series, stats):
        """Draw a bar chart of the most frequent categories (at most 15).

        Args:
            series: the column's data; missing values are ignored.
            stats: the column's results row (currently unused).
        """
        data = series.dropna()
        axes = self.categorical_canvas.axes
        axes.clear()

        if data.empty:
            # An empty Series cannot be plotted as bars.
            self.categorical_canvas.draw()
            return

        value_counts = data.value_counts().sort_values(ascending=False)

        # Limit to the top 15 categories if there are too many
        if len(value_counts) > 15:
            value_counts = value_counts.head(15)
            title = f"Top 15 Categories in {series.name}"
        else:
            title = f"Categories in {series.name}"

        value_counts.plot(kind='bar', ax=axes)
        axes.set_title(title)
        axes.set_xlabel("Category")
        axes.set_ylabel("Count")

        # Rotate x-axis labels when there are many of them
        if len(value_counts) > 5:
            plt.setp(axes.get_xticklabels(), rotation=45, ha='right')

        self.categorical_canvas.fig.tight_layout()
        self.categorical_canvas.draw()
524
+
525
+
526
+ # Function interface for simpler usage
527
def profile(df):
    """
    Compute the distribution profile of every column of a dataframe.

    Convenience wrapper that runs a throwaway DistributionProfiler.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        DataFrame with columns and their distribution profiles
    """
    return DistributionProfiler().profile(df)
539
+
540
+
541
def visualize_profile(df):
    """
    Profile a dataframe and open a window visualizing the result.

    Args:
        df: pandas DataFrame to analyze

    Returns:
        The PyQt6 window showing the visualization; the caller should keep
        a reference to it for as long as it is meant to stay open.
    """
    profile_results = DistributionProfiler().profile(df)
    window = DistributionVisualization(df, profile_results)
    window.show()
    return window
556
+
557
+
558
def test_profile_distributions():
    """Interactive demo: profile a synthetic dataframe and show the result window."""
    import sys

    # Reuse the running QApplication when there is one, otherwise start one.
    app = QApplication.instance()
    if app is None:
        app = QApplication(sys.argv)

    # Fixed seed for reproducibility. The columns below are generated in a
    # fixed order because each call consumes the shared random stream.
    np.random.seed(42)
    n_rows = 1000

    columns = {}
    columns['uniform'] = np.random.uniform(0, 100, size=n_rows)  # Uniform distribution
    columns['normal'] = np.random.normal(50, 10, size=n_rows)  # Normal distribution
    columns['exponential'] = np.random.exponential(5, size=n_rows)  # Exponential distribution
    columns['lognormal'] = np.random.lognormal(0, 1, size=n_rows)  # Log-normal distribution
    columns['bimodal'] = np.concatenate([
        np.random.normal(20, 5, 500),
        np.random.normal(60, 5, 500),
    ])  # Bimodal
    columns['constant'] = np.ones(n_rows)  # Constant value
    columns['binary'] = np.random.choice([0, 1], size=n_rows)  # Binary
    columns['categorical'] = np.random.choice(['A', 'B', 'C', 'D'], size=n_rows)  # Categorical data
    columns['skewed'] = 100 - np.random.power(5, size=n_rows) * 100  # Right-skewed
    columns['multimodal'] = np.concatenate([
        np.random.normal(10, 2, 300),
        np.random.normal(30, 2, 300),
        np.random.normal(50, 2, 400),
    ])  # Multimodal
    columns['boolean'] = np.random.choice([True, False], size=n_rows)  # Boolean
    columns['integer'] = np.random.randint(1, 10, size=n_rows)  # Small integers
    columns['text'] = pd.Series(
        ['short', 'medium length', 'very long text entry', 'another value'] * 250
    )  # Text data
    df = pd.DataFrame(columns)

    # Datetime and timedelta columns
    df['datetime'] = pd.date_range('2020-01-01', periods=n_rows, freq='h')
    df['timedelta'] = pd.Series([pd.Timedelta(days=day) for day in range(n_rows)])

    # A copy of the normal column with randomly chosen rows blanked out
    df['with_missing'] = df['normal'].copy()
    rows_to_blank = np.random.choice(df.index, size=200)
    df.loc[rows_to_blank, 'with_missing'] = np.nan

    # Print a summary of the profile
    print("Distribution Profile Results:")
    summary = DistributionProfiler().profile(df)
    print(summary[['column', 'type', 'distribution', 'unique_count']])

    # Show the results; the reference keeps the window alive while the
    # event loop runs.
    window = visualize_profile(df)

    # Start the application event loop
    app.exec()


if __name__ == "__main__":
    test_profile_distributions()