sqlshell-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_ohe_comparison.py
@@ -0,0 +1,237 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import OHE functions
try:
    from .profile_ohe import get_ohe  # Basic OHE
    from .profile_ohe_advanced import get_advanced_ohe, analyze_concept_correlations  # Advanced OHE
except ImportError:
    # Try without relative imports
    import sys
    import os
    sys.path.insert(0, os.path.dirname(__file__))
    from profile_ohe import get_ohe
    from profile_ohe_advanced import get_advanced_ohe, analyze_concept_correlations

# Optional: Word embeddings support
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import AgglomerativeClustering
    EMBEDDINGS_AVAILABLE = True
except ImportError:
    EMBEDDINGS_AVAILABLE = False
    print("Note: sentence-transformers not available. Install with: pip install sentence-transformers")

class EmbeddingAnalyzer:
    """
    Advanced semantic analysis using transformer-based embeddings.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Initialize embedding analyzer with pre-trained model."""
        if not EMBEDDINGS_AVAILABLE:
            self.model = None
            return

        try:
            self.model = SentenceTransformer(model_name)
        except Exception as e:
            print(f"Failed to load embedding model: {e}")
            self.model = None

    def extract_semantic_clusters_embeddings(self, texts, n_clusters=8):
        """
        Extract semantic clusters using sentence embeddings.
        More sophisticated than TF-IDF clustering.
        """
        if self.model is None or not texts:
            return []

        try:
            # Generate embeddings for all texts
            embeddings = self.model.encode(texts)

            # Perform hierarchical clustering
            clustering = AgglomerativeClustering(
                n_clusters=min(n_clusters, len(texts)),
                linkage='ward'
            )
            cluster_labels = clustering.fit_predict(embeddings)

            # Group texts by cluster
            clusters = {}
            for i, label in enumerate(cluster_labels):
                if label not in clusters:
                    clusters[label] = []
                clusters[label].append(texts[i])

            # Extract representative terms from each cluster
            cluster_concepts = []
            for cluster_id, cluster_texts in clusters.items():
                if len(cluster_texts) >= 2:  # Only clusters with multiple texts
                    # Get most common words across cluster texts
                    all_words = []
                    for text in cluster_texts:
                        words = text.lower().split()
                        all_words.extend([w for w in words if len(w) > 3])

                    from collections import Counter
                    common_words = Counter(all_words).most_common(5)

                    cluster_concepts.append({
                        'name': f"embedding_cluster_{cluster_id}",
                        'words': [word for word, _ in common_words],
                        'texts': cluster_texts,
                        'size': len(cluster_texts)
                    })

            return cluster_concepts

        except Exception as e:
            print(f"Embedding clustering failed: {e}")
            return []

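# Usage sketch for EmbeddingAnalyzer (illustrative only; assumes
# sentence-transformers is installed, and the sample texts below are made up):
#
#     analyzer = EmbeddingAnalyzer()  # loads 'all-MiniLM-L6-v2' by default
#     concepts = analyzer.extract_semantic_clusters_embeddings(
#         ["neural network training", "deep learning models",
#          "invoice reconciliation", "quarterly expense reports"],
#         n_clusters=2,
#     )
#     for concept in concepts:
#         print(concept['name'], concept['words'], concept['size'])
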
def demo_advanced_algorithms():
    """Demonstrate the power of advanced algorithms with AI-related text."""
    print("\n" + "="*80)
    print("DEMONSTRATION: Advanced OHE Algorithms vs Basic Approach")
    print("="*80)

    # Create AI/ML focused dataset (your use case!)
    ai_data = {
        'description': [
            "Machine learning engineer developing neural networks for computer vision applications",
            "AI researcher working on natural language processing and large language models",
            "Data scientist implementing deep learning algorithms for predictive analytics",
            "Software engineer building recommendation systems using collaborative filtering",
            "ML ops engineer deploying artificial intelligence models to cloud infrastructure",
            "Computer vision specialist creating object detection systems for autonomous vehicles",
            "NLP engineer developing chatbots and conversational AI systems",
            "Deep learning researcher working on transformer architectures and attention mechanisms",
            "AI product manager overseeing machine learning product development lifecycle",
            "Data engineer building pipelines for real-time AI model inference and training"
        ]
    }

    df = pd.DataFrame(ai_data)

    print("\nOriginal Data:")
    for i, desc in enumerate(ai_data['description'][:3]):
        print(f"{i+1}. {desc}")
    print("... (and 7 more)")

    # Test Basic OHE
    print("\n" + "-"*50)
    print("BASIC OHE RESULTS:")
    print("-"*50)
    basic_result = get_ohe(df.copy(), 'description', binary_format="numeric")
    basic_features = [col for col in basic_result.columns if col.startswith('has_')]
    print(f"Features created: {len(basic_features)}")
    for feature in basic_features:
        coverage = (basic_result[feature] == 1).sum() / len(basic_result) * 100
        print(f" • {feature}: {coverage:.1f}% coverage")

    # Test Advanced OHE
    print("\n" + "-"*50)
    print("ADVANCED OHE RESULTS:")
    print("-"*50)
    advanced_result = get_advanced_ohe(
        df.copy(),
        'description',
        binary_format="numeric",
        analysis_type="comprehensive",
        max_features=15
    )
    advanced_features = [col for col in advanced_result.columns if col.startswith('has_')]
    print(f"Features created: {len(advanced_features)}")

    # Group features by type
    feature_types = {}
    for feature in advanced_features:
        if 'topic_lda' in feature:
            feature_types.setdefault('LDA Topics', []).append(feature)
        elif 'topic_nmf' in feature:
            feature_types.setdefault('NMF Topics', []).append(feature)
        elif 'semantic_cluster' in feature:
            feature_types.setdefault('Semantic Clusters', []).append(feature)
        elif 'domain_' in feature:
            feature_types.setdefault('Domain Concepts', []).append(feature)
        elif 'ngram_' in feature:
            feature_types.setdefault('Key N-grams', []).append(feature)
        else:
            feature_types.setdefault('Other', []).append(feature)

    for ftype, features in feature_types.items():
        print(f"\n{ftype}:")
        for feature in features[:3]:  # Show first 3 of each type
            coverage = (advanced_result[feature] == 1).sum() / len(advanced_result) * 100
            print(f" • {feature}: {coverage:.1f}% coverage")
        if len(features) > 3:
            print(f" ... and {len(features) - 3} more")

    # Analyze correlations
    print("\n" + "-"*50)
    print("CORRELATION ANALYSIS:")
    print("-"*50)
    correlation_analysis = analyze_concept_correlations(advanced_result, advanced_features)

    if correlation_analysis and correlation_analysis.get('strong_correlations'):
        print("Strong correlations found (shows semantic relationships):")
        for corr in correlation_analysis['strong_correlations'][:5]:
            print(f" • {corr['feature1']} ↔ {corr['feature2']}: {corr['correlation']:.3f}")

        print("\nThis shows the advanced algorithm captured semantic relationships!")
        print("For example, 'AI' and 'machine learning' concepts are properly linked.")
    else:
        print("No strong correlations found - features are orthogonal")

    # Show specific example of AI correlation capture
    print("\n" + "-"*50)
    print("AI CORRELATION ANALYSIS (Your Original Problem):")
    print("-"*50)

    # Check which features capture AI-related concepts
    ai_related_features = []
    for feature in advanced_features:
        feature_name = feature.lower()
        if any(term in feature_name for term in ['ai', 'artificial', 'intelligence', 'machine', 'learning', 'neural', 'deep']):
            ai_related_features.append(feature)

    if ai_related_features:
        print(f"Found {len(ai_related_features)} AI-related features:")
        for feature in ai_related_features:
            coverage = (advanced_result[feature] == 1).sum() / len(advanced_result) * 100
            print(f" • {feature}: {coverage:.1f}% coverage")

        # Show which descriptions match these features
        print("\nDescriptions matching AI-related features:")
        for i, desc in enumerate(ai_data['description']):
            matches = []
            for feature in ai_related_features:
                if advanced_result.iloc[i][feature] == 1:
                    matches.append(feature.replace('has_', ''))
            if matches:
                print(f" {i+1}. '{desc[:50]}...' → {', '.join(matches[:2])}")
    else:
        print("No explicit AI-related features found in feature names")
        print("However, topic modeling may have captured these concepts in broader themes")

    print("\n" + "="*80)
    print("CONCLUSION: Advanced algorithms provide much richer semantic understanding!")
    print("• Basic OHE: Only captures individual word frequency")
    print("• Advanced OHE: Captures topics, themes, semantic clusters, and domain concepts")
    print("• This solves your AI correlation problem by grouping related concepts!")
    print("="*80)

    return basic_result, advanced_result


if __name__ == "__main__":
    # Run the demonstration
    demo_advanced_algorithms()
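
For reference, a minimal sketch of driving this comparison from an installed copy of the wheel; the import path follows the package layout listed above, and the exact output depends on which optional dependencies (sentence-transformers, topic-modeling backends) are present:

    from sqlshell.utils.profile_ohe_comparison import demo_advanced_algorithms

    basic_df, advanced_df = demo_advanced_algorithms()
    # Inspect the one-hot columns each approach generated
    print(basic_df.filter(like='has_').columns.tolist())
    print(advanced_df.filter(like='has_').columns.tolist())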