sqlshell 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
# Set random seed for reproducibility
|
|
7
|
+
# NOTE: this seeds NumPy's global legacy RNG at import time, so every
# np.random.* call made by the generators below draws from this one stream.
np.random.seed(42)
|
|
8
|
+
|
|
9
|
+
def create_california_housing_data(output_file='california_housing_data.parquet'):
    """Download the California housing CSV and keep a Parquet copy of it.

    The CSV is fetched over HTTPS on every call, so network access is
    required.

    Args:
        output_file: Path the Parquet copy is written to.

    Returns:
        The downloaded data as a pandas DataFrame.
    """
    source_url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
    housing = pd.read_csv(source_url)

    # Persist a local copy before handing the frame back to the caller.
    housing.to_parquet(output_file)
    return housing
|
|
17
|
+
|
|
18
|
+
def create_large_customer_data(num_customers=1_000_000, chunk_size=100_000, output_file='large_customer_data.parquet'):
    """Build a synthetic customer table sized for large-dataset testing.

    The whole table is built in memory in a single pass.

    Args:
        num_customers: Number of customer rows to generate.
        chunk_size: Accepted for interface compatibility; not used by the body.
        output_file: Accepted for interface compatibility; not used by the
            body (nothing is written to disk).

    Returns:
        pandas DataFrame with columns CustomerID, FirstName, LastName, Email,
        JoinDate, CustomerType and CreditScore.
    """
    customer_ids = range(1, num_customers + 1)
    first_names = [f'Customer{n}' for n in customer_ids]
    last_names = [f'Lastname{n}' for n in customer_ids]
    emails = [f'customer{n}@example.com' for n in customer_ids]

    # One scalar draw per customer; these draws come before the vectorised
    # draws below so the sequence taken from the global RNG stays the same.
    join_dates = []
    for _ in customer_ids:
        now = datetime.now()
        join_dates.append(now - timedelta(days=np.random.randint(1, 1000)))

    customer_types = np.random.choice(['Regular', 'Premium', 'VIP'], num_customers)
    credit_scores = np.random.randint(300, 851, num_customers)  # 300..850 inclusive

    customers = pd.DataFrame({
        'CustomerID': customer_ids,
        'FirstName': first_names,
        'LastName': last_names,
        'Email': emails,
        'JoinDate': join_dates,
        'CustomerType': customer_types,
        'CreditScore': credit_scores,
    })
    return customers
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def create_sales_data(num_records=1000, num_customers=200):
    """Generate a synthetic sales-order table.

    Args:
        num_records: Number of order rows to generate.
        num_customers: Inclusive upper bound of the CustomerID values; each
            order gets an ID drawn uniformly from 1..num_customers. The
            default of 200 matches the default size of create_customer_data,
            so the two tables join cleanly.

    Returns:
        pandas DataFrame sorted by ``Date`` with columns OrderID, Date,
        ProductID, Quantity, CustomerID, Region, Price and TotalAmount.
        ``Price`` is rounded to 2 decimals and ``TotalAmount`` is exactly
        ``(Price * Quantity).round(2)`` computed from the stored price.
    """
    # Candidate order dates: one per day from 365 days ago through now.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    dates = [start_date + timedelta(days=x) for x in range(366)]
    random_dates = np.random.choice(dates, num_records)

    # Position in this list + 1 is the ProductID used for joining.
    products = ['Laptop', 'Smartphone', 'Tablet', 'Monitor', 'Keyboard', 'Mouse', 'Headphones', 'Printer']
    product_prices = {
        'Laptop': (800, 2000),
        'Smartphone': (400, 1200),
        'Tablet': (200, 800),
        'Monitor': (150, 500),
        'Keyboard': (20, 150),
        'Mouse': (10, 80),
        'Headphones': (30, 300),
        'Printer': (100, 400),
    }

    data = {
        'OrderID': range(1, num_records + 1),
        'Date': random_dates,
        'ProductID': np.random.randint(1, len(products) + 1, num_records),
        'Quantity': np.random.randint(1, 11, num_records),
        'CustomerID': np.random.randint(1, num_customers + 1, num_records),
        'Region': np.random.choice(['North', 'South', 'East', 'West'], num_records),
    }

    # Draw one unit price per order from that product's (min, max) range.
    unit_prices = [np.random.uniform(*product_prices[products[pid - 1]])
                   for pid in data['ProductID']]
    # Explicit float dtype keeps the column numeric even when there are no rows.
    data['Price'] = np.array(unit_prices, dtype=float)

    df = pd.DataFrame(data)

    # Round the price first and derive the total from the rounded value, so
    # the stored Price, Quantity and TotalAmount columns agree with each other.
    df['Price'] = df['Price'].round(2)
    df['TotalAmount'] = (df['Price'] * df['Quantity']).round(2)

    return df.sort_values('Date')
|
|
83
|
+
|
|
84
|
+
def create_customer_data(num_customers=200):
    """Build a synthetic customer table.

    Args:
        num_customers: Number of customer rows to generate; CustomerID runs
            from 1 to num_customers.

    Returns:
        pandas DataFrame with columns CustomerID, FirstName, LastName, Email,
        JoinDate, CustomerType and CreditScore.
    """
    customer_ids = range(1, num_customers + 1)
    first_names = [f'Customer{n}' for n in customer_ids]
    last_names = [f'Lastname{n}' for n in customer_ids]
    emails = [f'customer{n}@example.com' for n in customer_ids]

    # Join dates fall 1..999 days before "now"; these scalar draws come
    # before the vectorised ones so the global RNG sequence is unchanged.
    join_dates = []
    for _ in customer_ids:
        now = datetime.now()
        join_dates.append(now - timedelta(days=np.random.randint(1, 1000)))

    customer_types = np.random.choice(['Regular', 'Premium', 'VIP'], num_customers)
    credit_scores = np.random.randint(300, 851, num_customers)  # 300..850 inclusive

    columns = {
        'CustomerID': customer_ids,
        'FirstName': first_names,
        'LastName': last_names,
        'Email': emails,
        'JoinDate': join_dates,
        'CustomerType': customer_types,
        'CreditScore': credit_scores,
    }
    return pd.DataFrame(columns)
|
|
98
|
+
|
|
99
|
+
def create_product_data():
    """Build the fixed eight-row product catalogue.

    Everything except StockQuantity is hard-coded; StockQuantity is drawn
    from the global NumPy RNG (values 50..499).

    Returns:
        pandas DataFrame with columns ProductID, ProductName, Category,
        Brand, StockQuantity, MinPrice, MaxPrice, Weight_kg, WarrantyMonths.
    """
    names = ['Laptop', 'Smartphone', 'Tablet', 'Monitor', 'Keyboard', 'Mouse', 'Headphones', 'Printer']
    categories = ['Computers', 'Mobile', 'Mobile', 'Accessories', 'Accessories', 'Accessories', 'Audio', 'Peripherals']
    brands = ['TechPro', 'MobileX', 'TabletCo', 'ViewMax', 'TypeMaster', 'ClickPro', 'SoundMax', 'PrintPro']
    # (min, max) price per product, in the same order as `names`.
    price_bounds = [(800, 2000), (400, 1200), (200, 800), (150, 500),
                    (20, 150), (10, 80), (30, 300), (100, 400)]
    weights_kg = [2.5, 0.2, 0.5, 3.0, 0.8, 0.1, 0.3, 5.0]
    warranty_months = [24, 12, 12, 36, 12, 12, 24, 12]

    catalog = pd.DataFrame({
        'ProductID': range(1, len(names) + 1),
        'ProductName': names,
        'Category': categories,
        'Brand': brands,
        'StockQuantity': np.random.randint(50, 500, len(names)),
        'MinPrice': [low for low, _ in price_bounds],
        'MaxPrice': [high for _, high in price_bounds],
        'Weight_kg': weights_kg,
        'WarrantyMonths': warranty_months,
    })
    return catalog
|
|
114
|
+
|
|
115
|
+
def create_large_numbers_data(num_records=100):
    """Build a table whose numeric columns span many orders of magnitude.

    Intended for testing and visualising how very large integers,
    scientific-notation floats and monetary values are displayed.

    Args:
        num_records: Number of rows to generate.

    Returns:
        pandas DataFrame with columns ID, Date, SmallValue, MediumValue,
        LargeValue, VeryLargeValue, MassiveValue, Category, IsActive,
        ExponentialValue, Revenue and Budget. Revenue and Budget are rounded
        to 2 decimals.
    """
    def rand_digits(low, high, width=0):
        # One scalar draw from [low, high), as text left-padded with zeros.
        return str(np.random.randint(low, high)).zfill(width)

    rows = range(num_records)

    # The columns are generated in table order because every draw comes from
    # the shared global RNG; reordering would change the generated values.
    small_values = np.random.randint(1, 1000, num_records)
    medium_values = np.random.randint(10000, 9999999, num_records)

    # Up to 14 digits: 1-3 leading digits + 7 zero-padded + 4 zero-padded.
    large_values = []
    for _ in rows:
        head = rand_digits(1, 999)
        middle = rand_digits(0, 9999999, 7)
        tail = rand_digits(0, 9999, 4)
        large_values.append(int(head + middle + tail))

    # Always 17 digits: 3 + 7 + 7, none of the parts needs padding.
    very_large_values = []
    for _ in rows:
        head = rand_digits(100, 999)
        middle = rand_digits(1000000, 9999999)
        tail = rand_digits(1000000, 9999999)
        very_large_values.append(int(head + middle + tail))

    # A leading '1' followed by 15 random digits (16 digits in total).
    massive_values = []
    for _ in rows:
        suffix = ''.join([rand_digits(0, 10) for _ in range(15)])
        massive_values.append(int('1' + suffix))

    categories = np.random.choice(['A', 'B', 'C', 'D', 'E'], num_records)
    active_flags = np.random.choice([True, False], num_records, p=[0.8, 0.2])

    # Floats parsed from text such as "3.07e15" to exercise scientific notation.
    exponentials = []
    for _ in rows:
        mantissa = np.random.randint(1, 10)
        fraction = np.random.randint(1, 100)
        exponent = np.random.randint(10, 20)
        exponentials.append(float(f"{mantissa}.{fraction:02d}e{exponent}"))

    # Monetary values: integer part plus a random fraction.
    # dtype=np.int64 avoids int32 overflow on Windows.
    revenues = []
    for _ in rows:
        whole = np.random.randint(1000000, 9999999999, dtype=np.int64)
        revenues.append(whole + np.random.random())

    budgets = []
    for _ in rows:
        whole = np.random.randint(10000000, 999999999, dtype=np.int64)
        budgets.append(whole + np.random.random())

    frame = pd.DataFrame({
        'ID': range(1, num_records + 1),
        'Date': pd.date_range(start='2023-01-01', periods=num_records),
        'SmallValue': small_values,
        'MediumValue': medium_values,
        'LargeValue': large_values,
        'VeryLargeValue': very_large_values,
        'MassiveValue': massive_values,
        'Category': categories,
        'IsActive': active_flags,
        'ExponentialValue': exponentials,
        'Revenue': revenues,
        'Budget': budgets,
    })

    # Keep two decimals on the monetary columns.
    for money_column in ('Revenue', 'Budget'):
        frame[money_column] = frame[money_column].round(2)

    return frame
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
# Set random seed for reproducibility
|
|
7
|
+
# NOTE: this seeds NumPy's global legacy RNG at import time, so every
# np.random.* call made by the generators below draws from this one stream.
np.random.seed(42)
|
|
8
|
+
|
|
9
|
+
# Define output directory
|
|
10
|
+
# Relative path, so it resolves against the current working directory.
OUTPUT_DIR = 'test_data'
|
|
11
|
+
# NOTE: executes at import time, so merely importing this module creates the
# directory; exist_ok=True makes repeated imports/runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
12
|
+
|
|
13
|
+
def create_sales_data(num_records=1000, num_customers=200):
    """Generate a synthetic sales-order table.

    Args:
        num_records: Number of order rows to generate.
        num_customers: Inclusive upper bound of the CustomerID values; each
            order gets an ID drawn uniformly from 1..num_customers. The
            default of 200 matches the default size of create_customer_data,
            so the two tables join cleanly.

    Returns:
        pandas DataFrame sorted by ``Date`` with columns OrderID, Date,
        ProductID, Quantity, CustomerID, Region, Price and TotalAmount.
        ``Price`` is rounded to 2 decimals and ``TotalAmount`` is exactly
        ``(Price * Quantity).round(2)`` computed from the stored price.
    """
    # Candidate order dates: one per day from 365 days ago through now.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    dates = [start_date + timedelta(days=x) for x in range(366)]
    random_dates = np.random.choice(dates, num_records)

    # Position in this list + 1 is the ProductID used for joining.
    products = ['Laptop', 'Smartphone', 'Tablet', 'Monitor', 'Keyboard', 'Mouse', 'Headphones', 'Printer']
    product_prices = {
        'Laptop': (800, 2000),
        'Smartphone': (400, 1200),
        'Tablet': (200, 800),
        'Monitor': (150, 500),
        'Keyboard': (20, 150),
        'Mouse': (10, 80),
        'Headphones': (30, 300),
        'Printer': (100, 400),
    }

    data = {
        'OrderID': range(1, num_records + 1),
        'Date': random_dates,
        'ProductID': np.random.randint(1, len(products) + 1, num_records),
        'Quantity': np.random.randint(1, 11, num_records),
        'CustomerID': np.random.randint(1, num_customers + 1, num_records),
        'Region': np.random.choice(['North', 'South', 'East', 'West'], num_records),
    }

    # Draw one unit price per order from that product's (min, max) range.
    unit_prices = [np.random.uniform(*product_prices[products[pid - 1]])
                   for pid in data['ProductID']]
    # Explicit float dtype keeps the column numeric even when there are no rows.
    data['Price'] = np.array(unit_prices, dtype=float)

    df = pd.DataFrame(data)

    # Round the price first and derive the total from the rounded value, so
    # the stored Price, Quantity and TotalAmount columns agree with each other.
    df['Price'] = df['Price'].round(2)
    df['TotalAmount'] = (df['Price'] * df['Quantity']).round(2)

    return df.sort_values('Date')
|
|
58
|
+
|
|
59
|
+
def create_customer_data(num_customers=200):
    """Build a synthetic customer table.

    Args:
        num_customers: Number of customer rows to generate; CustomerID runs
            from 1 to num_customers.

    Returns:
        pandas DataFrame with columns CustomerID, FirstName, LastName, Email,
        JoinDate, CustomerType and CreditScore.
    """
    customer_ids = range(1, num_customers + 1)
    first_names = [f'Customer{n}' for n in customer_ids]
    last_names = [f'Lastname{n}' for n in customer_ids]
    emails = [f'customer{n}@example.com' for n in customer_ids]

    # Join dates fall 1..999 days before "now"; these scalar draws come
    # before the vectorised ones so the global RNG sequence is unchanged.
    join_dates = []
    for _ in customer_ids:
        now = datetime.now()
        join_dates.append(now - timedelta(days=np.random.randint(1, 1000)))

    customer_types = np.random.choice(['Regular', 'Premium', 'VIP'], num_customers)
    credit_scores = np.random.randint(300, 851, num_customers)  # 300..850 inclusive

    columns = {
        'CustomerID': customer_ids,
        'FirstName': first_names,
        'LastName': last_names,
        'Email': emails,
        'JoinDate': join_dates,
        'CustomerType': customer_types,
        'CreditScore': credit_scores,
    }
    return pd.DataFrame(columns)
|
|
73
|
+
|
|
74
|
+
def create_product_data():
    """Build the fixed eight-row product catalogue.

    Everything except StockQuantity is hard-coded; StockQuantity is drawn
    from the global NumPy RNG (values 50..499).

    Returns:
        pandas DataFrame with columns ProductID, ProductName, Category,
        Brand, StockQuantity, MinPrice, MaxPrice, Weight_kg, WarrantyMonths.
    """
    names = ['Laptop', 'Smartphone', 'Tablet', 'Monitor', 'Keyboard', 'Mouse', 'Headphones', 'Printer']
    categories = ['Computers', 'Mobile', 'Mobile', 'Accessories', 'Accessories', 'Accessories', 'Audio', 'Peripherals']
    brands = ['TechPro', 'MobileX', 'TabletCo', 'ViewMax', 'TypeMaster', 'ClickPro', 'SoundMax', 'PrintPro']
    # (min, max) price per product, in the same order as `names`.
    price_bounds = [(800, 2000), (400, 1200), (200, 800), (150, 500),
                    (20, 150), (10, 80), (30, 300), (100, 400)]
    weights_kg = [2.5, 0.2, 0.5, 3.0, 0.8, 0.1, 0.3, 5.0]
    warranty_months = [24, 12, 12, 36, 12, 12, 24, 12]

    catalog = pd.DataFrame({
        'ProductID': range(1, len(names) + 1),
        'ProductName': names,
        'Category': categories,
        'Brand': brands,
        'StockQuantity': np.random.randint(50, 500, len(names)),
        'MinPrice': [low for low, _ in price_bounds],
        'MaxPrice': [high for _, high in price_bounds],
        'Weight_kg': weights_kg,
        'WarrantyMonths': warranty_months,
    })
    return catalog
|
|
89
|
+
|
|
90
|
+
if __name__ == '__main__':
    # Script entry point: generate the three sample tables, write them under
    # OUTPUT_DIR, then print example SQL that joins them.

    # Sales orders -> Excel workbook
    sales_output = os.path.join(OUTPUT_DIR, 'sample_sales_data.xlsx')
    sales_df = create_sales_data()
    sales_df.to_excel(sales_output, index=False)
    print(f"Created sales data in '{sales_output}'")
    print(f"Number of sales records: {len(sales_df)}")

    # Customers -> Parquet file
    customer_output = os.path.join(OUTPUT_DIR, 'customer_data.parquet')
    customer_df = create_customer_data()
    customer_df.to_parquet(customer_output, index=False)
    print(f"\nCreated customer data in '{customer_output}'")
    print(f"Number of customers: {len(customer_df)}")

    # Product catalogue -> Excel workbook
    product_output = os.path.join(OUTPUT_DIR, 'product_catalog.xlsx')
    product_df = create_product_data()
    product_df.to_excel(product_output, index=False)
    print(f"\nCreated product catalog in '{product_output}'")
    print(f"Number of products: {len(product_df)}")

    # Example queries the user can paste to join the generated tables.
    sample_queries = """
-- Join sales with customer data
SELECT s.*, c.FirstName, c.LastName, c.CustomerType
FROM test_data.sample_sales_data s
JOIN test_data.customer_data c ON s.CustomerID = c.CustomerID;

-- Join sales with product data
SELECT s.*, p.ProductName, p.Category, p.Brand
FROM test_data.sample_sales_data s
JOIN test_data.product_catalog p ON s.ProductID = p.ProductID;

-- Three-way join with aggregation
SELECT
p.Category,
c.CustomerType,
COUNT(*) as NumOrders,
SUM(s.TotalAmount) as TotalRevenue,
AVG(s.Quantity) as AvgQuantity
FROM test_data.sample_sales_data s
JOIN test_data.customer_data c ON s.CustomerID = c.CustomerID
JOIN test_data.product_catalog p ON s.ProductID = p.ProductID
GROUP BY p.Category, c.CustomerType
ORDER BY p.Category, c.CustomerType;
"""
    print("\nSample SQL queries for joining the data:")
    print(sample_queries)
|