sqlshell-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlshell/__init__.py +84 -0
- sqlshell/__main__.py +4926 -0
- sqlshell/ai_autocomplete.py +392 -0
- sqlshell/ai_settings_dialog.py +337 -0
- sqlshell/context_suggester.py +768 -0
- sqlshell/create_test_data.py +152 -0
- sqlshell/data/create_test_data.py +137 -0
- sqlshell/db/__init__.py +6 -0
- sqlshell/db/database_manager.py +1318 -0
- sqlshell/db/export_manager.py +188 -0
- sqlshell/editor.py +1166 -0
- sqlshell/editor_integration.py +127 -0
- sqlshell/execution_handler.py +421 -0
- sqlshell/menus.py +262 -0
- sqlshell/notification_manager.py +370 -0
- sqlshell/query_tab.py +904 -0
- sqlshell/resources/__init__.py +1 -0
- sqlshell/resources/icon.png +0 -0
- sqlshell/resources/logo_large.png +0 -0
- sqlshell/resources/logo_medium.png +0 -0
- sqlshell/resources/logo_small.png +0 -0
- sqlshell/resources/splash_screen.gif +0 -0
- sqlshell/space_invaders.py +501 -0
- sqlshell/splash_screen.py +405 -0
- sqlshell/sqlshell/__init__.py +5 -0
- sqlshell/sqlshell/create_test_data.py +118 -0
- sqlshell/sqlshell/create_test_databases.py +96 -0
- sqlshell/sqlshell_demo.png +0 -0
- sqlshell/styles.py +257 -0
- sqlshell/suggester_integration.py +330 -0
- sqlshell/syntax_highlighter.py +124 -0
- sqlshell/table_list.py +996 -0
- sqlshell/ui/__init__.py +6 -0
- sqlshell/ui/bar_chart_delegate.py +49 -0
- sqlshell/ui/filter_header.py +469 -0
- sqlshell/utils/__init__.py +16 -0
- sqlshell/utils/profile_cn2.py +1661 -0
- sqlshell/utils/profile_column.py +2635 -0
- sqlshell/utils/profile_distributions.py +616 -0
- sqlshell/utils/profile_entropy.py +347 -0
- sqlshell/utils/profile_foreign_keys.py +779 -0
- sqlshell/utils/profile_keys.py +2834 -0
- sqlshell/utils/profile_ohe.py +934 -0
- sqlshell/utils/profile_ohe_advanced.py +754 -0
- sqlshell/utils/profile_ohe_comparison.py +237 -0
- sqlshell/utils/profile_prediction.py +926 -0
- sqlshell/utils/profile_similarity.py +876 -0
- sqlshell/utils/search_in_df.py +90 -0
- sqlshell/widgets.py +400 -0
- sqlshell-0.4.4.dist-info/METADATA +441 -0
- sqlshell-0.4.4.dist-info/RECORD +54 -0
- sqlshell-0.4.4.dist-info/WHEEL +5 -0
- sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
- sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_ohe_advanced.py
@@ -0,0 +1,754 @@
import pandas as pd
import numpy as np
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from collections import defaultdict, Counter
import re
import warnings
warnings.filterwarnings('ignore')

# Optional imports with fallbacks
try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False

# Flag to track if NLTK is available
NLTK_AVAILABLE = False

def _setup_nltk_data_path():
    """Configure NLTK to find data in bundled location (for PyInstaller builds)"""
    import nltk

    # Check if running from a PyInstaller bundle
    if getattr(sys, 'frozen', False):
        # Running in a PyInstaller bundle
        bundle_dir = sys._MEIPASS
        nltk_data_path = os.path.join(bundle_dir, 'nltk_data')
        if os.path.exists(nltk_data_path):
            nltk.data.path.insert(0, nltk_data_path)

    # Also check relative to the application
    app_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    possible_paths = [
        os.path.join(app_dir, 'nltk_data'),
        os.path.join(os.path.dirname(app_dir), 'nltk_data'),
    ]
    for path in possible_paths:
        if os.path.exists(path) and path not in nltk.data.path:
            nltk.data.path.insert(0, path)


def _simple_tokenize(text):
    """Simple fallback tokenizer when NLTK is not available"""
    # Simple word tokenization using regex
    return re.findall(r'\b[a-zA-Z]+\b', text.lower())


def _get_simple_stopwords():
    """Return a basic set of English stopwords when NLTK is not available"""
    return {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
        'used', 'it', 'its', 'this', 'that', 'these', 'those', 'i', 'me', 'my',
        'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
        'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
        'hers', 'herself', 'they', 'them', 'their', 'theirs', 'themselves',
        'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how', 'all',
        'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
        'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
        'just', 'also', 'now', 'here', 'there', 'then', 'once', 'if', 'because',
        'while', 'although', 'though', 'after', 'before', 'since', 'until', 'unless'
    }
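
# Illustrative sketch (hypothetical helper, not part of the module's public API):
# shows the fallback preprocessing path used when NLTK cannot be imported.
def _demo_fallback_preprocessing():
    sample = "The model was trained on a large corpus of text."
    tokens = _simple_tokenize(sample)
    stop_words = _get_simple_stopwords()
    # Mirrors the filtering applied later in AdvancedTextAnalyzer._preprocess_text
    content_tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    # tokens         -> ['the', 'model', 'was', 'trained', 'on', 'a', 'large', 'corpus', 'of', 'text']
    # content_tokens -> ['model', 'trained', 'large', 'corpus', 'text']
    return content_tokens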

try:
    import nltk
    _setup_nltk_data_path()
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize

    # Try to find required NLTK data, download if missing
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        try:
            nltk.download('punkt', quiet=True)
        except Exception:
            pass  # Download failed silently - NLTK features will be unavailable
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        try:
            nltk.download('stopwords', quiet=True)
        except Exception:
            pass  # Download failed silently - NLTK features will be unavailable
    try:
        nltk.data.find('tokenizers/punkt_tab/english')
    except LookupError:
        try:
            nltk.download('punkt_tab', quiet=True)
        except Exception:
            pass  # Download failed silently - NLTK features will be unavailable

    # Test if NLTK is actually working
    try:
        _ = stopwords.words('english')
        _ = word_tokenize("test")
        NLTK_AVAILABLE = True
    except Exception:
        NLTK_AVAILABLE = False

except ImportError:
    NLTK_AVAILABLE = False

class AdvancedTextAnalyzer:
    """
    Advanced text analyzer using multiple academic algorithms for sophisticated
    feature extraction and semantic analysis.
    """

    def __init__(self, model_name='en_core_web_sm'):
        """
        Initialize the advanced text analyzer.

        Args:
            model_name (str): Spacy model name for NER and advanced processing
        """
        # Get stopwords (use NLTK if available, otherwise fallback)
        if NLTK_AVAILABLE:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = _get_simple_stopwords()

        self.tfidf_vectorizer = None
        self.lda_model = None
        self.nmf_model = None
        self.word_clusters = None
        self.concept_mapping = {}

        # Try to load spaCy model for NER
        if SPACY_AVAILABLE:
            try:
                self.nlp = spacy.load(model_name)
            except OSError:
                self.nlp = None
        else:
            self.nlp = None

    def extract_semantic_concepts(self, texts, n_topics=8, min_concept_freq=2):
        """
        Extract semantic concepts using multiple algorithms:
        1. Topic Modeling (LDA + NMF)
        2. TF-IDF with clustering
        3. Named Entity Recognition (if available)
        4. N-gram concept extraction

        Args:
            texts (list): List of text documents
            n_topics (int): Number of topics for topic modeling
            min_concept_freq (int): Minimum frequency for concept inclusion

        Returns:
            dict: Dictionary mapping concept types to extracted concepts
        """
        concepts = {
            'topics_lda': [],
            'topics_nmf': [],
            'entities': [],
            'semantic_clusters': [],
            'key_ngrams': [],
            'domain_concepts': []
        }

        if not texts or len(texts) == 0:
            return concepts

        # Clean and preprocess texts
        cleaned_texts = [self._preprocess_text(text) for text in texts if isinstance(text, str)]
        if not cleaned_texts:
            return concepts

        # 1. Topic Modeling with LDA and NMF
        concepts['topics_lda'] = self._extract_topics_lda(cleaned_texts, n_topics)
        concepts['topics_nmf'] = self._extract_topics_nmf(cleaned_texts, n_topics)

        # 2. Named Entity Recognition (if spaCy is available)
        if SPACY_AVAILABLE and self.nlp:
            concepts['entities'] = self._extract_named_entities(texts)

        # 3. Semantic clustering of words
        concepts['semantic_clusters'] = self._extract_semantic_clusters(cleaned_texts)

        # 4. Key N-gram extraction
        concepts['key_ngrams'] = self._extract_key_ngrams(cleaned_texts, min_concept_freq)

        # 5. Domain-specific concept extraction
        concepts['domain_concepts'] = self._extract_domain_concepts(cleaned_texts)

        return concepts
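
    # Illustrative sketch (comment only; the variable names are hypothetical):
    #
    #     analyzer = AdvancedTextAnalyzer()
    #     concepts = analyzer.extract_semantic_concepts(descriptions, n_topics=4)
    #     concepts['topics_lda']    # [{'name': 'topic_lda_0_...', 'keywords': [...], 'weight': ...}, ...]
    #     concepts['entities']      # [] unless a spaCy model was successfully loaded
    #     concepts['key_ngrams']    # [{'name': 'ngram_...', 'ngram': '...', 'score': ...}, ...]
    #
    # where `descriptions` is a plain list of strings; scikit-learn is required,
    # while NLTK and spaCy only enrich the output when present.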

    def _preprocess_text(self, text):
        """Advanced text preprocessing"""
        if not isinstance(text, str):
            return ""

        # Convert to lowercase and remove extra whitespace
        text = re.sub(r'\s+', ' ', text.lower().strip())

        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s\-\.]', ' ', text)

        # Tokenize (use NLTK if available, otherwise fallback)
        if NLTK_AVAILABLE:
            tokens = word_tokenize(text)
        else:
            tokens = _simple_tokenize(text)

        # Remove stopwords and short tokens
        tokens = [token for token in tokens if token not in self.stop_words and len(token) > 2]

        return ' '.join(tokens)
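
    # Worked example (comment only, assuming the fallback stopword list):
    #     self._preprocess_text("The Model was trained, quickly, on NEW data!")
    # lowercases the text, strips most punctuation (keeping '-' and '.'),
    # drops stopwords and tokens shorter than three characters, and returns
    #     "model trained quickly new data"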

    def _extract_topics_lda(self, texts, n_topics):
        """Extract topics using Latent Dirichlet Allocation"""
        try:
            # Create TF-IDF vectorizer
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1000,
                ngram_range=(1, 3),
                min_df=2,
                max_df=0.8
            )

            tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)

            # Apply LDA
            self.lda_model = LatentDirichletAllocation(
                n_components=n_topics,
                random_state=42,
                max_iter=10
            )

            self.lda_model.fit(tfidf_matrix)

            # Extract topic keywords
            feature_names = self.tfidf_vectorizer.get_feature_names_out()
            topics = []

            for topic_idx, topic in enumerate(self.lda_model.components_):
                top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
                topic_name = f"topic_lda_{topic_idx}_{'_'.join(top_words[:2])}"
                topics.append({
                    'name': topic_name,
                    'keywords': top_words,
                    'weight': float(np.sum(topic))
                })

            return topics

        except Exception as e:
            print(f"LDA topic extraction failed: {e}")
            return []

    def _extract_topics_nmf(self, texts, n_topics):
        """Extract topics using Non-negative Matrix Factorization"""
        try:
            if self.tfidf_vectorizer is None:
                self.tfidf_vectorizer = TfidfVectorizer(
                    max_features=1000,
                    ngram_range=(1, 3),
                    min_df=2,
                    max_df=0.8
                )
                tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
            else:
                tfidf_matrix = self.tfidf_vectorizer.transform(texts)

            # Apply NMF
            self.nmf_model = NMF(
                n_components=n_topics,
                random_state=42,
                alpha_W=0.1,
                alpha_H=0.1
            )

            self.nmf_model.fit(tfidf_matrix)

            # Extract topic keywords
            feature_names = self.tfidf_vectorizer.get_feature_names_out()
            topics = []

            for topic_idx, topic in enumerate(self.nmf_model.components_):
                top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
                topic_name = f"topic_nmf_{topic_idx}_{'_'.join(top_words[:2])}"
                topics.append({
                    'name': topic_name,
                    'keywords': top_words,
                    'weight': float(np.sum(topic))
                })

            return topics

        except Exception as e:
            print(f"NMF topic extraction failed: {e}")
            return []

    def _extract_named_entities(self, texts):
        """Extract named entities using spaCy"""
        if self.nlp is None:
            return []

        entities = defaultdict(list)

        try:
            for text in texts:
                if isinstance(text, str):
                    doc = self.nlp(text)
                    for ent in doc.ents:
                        # Focus on relevant entity types
                        if ent.label_ in ['ORG', 'PRODUCT', 'TECHNOLOGY', 'EVENT', 'GPE', 'PERSON']:
                            entities[ent.label_].append(ent.text.lower())

            # Convert to concept format
            entity_concepts = []
            for entity_type, entity_list in entities.items():
                # Get most common entities of each type
                common_entities = Counter(entity_list).most_common(5)
                for entity, count in common_entities:
                    if count >= 2:  # Must appear at least twice
                        entity_concepts.append({
                            'name': f"entity_{entity_type.lower()}_{entity.replace(' ', '_')}",
                            'type': entity_type,
                            'entity': entity,
                            'frequency': count
                        })

            return entity_concepts

        except Exception as e:
            print(f"Named entity extraction failed: {e}")
            return []

    def _extract_semantic_clusters(self, texts):
        """Extract semantic word clusters using TF-IDF and clustering"""
        try:
            if self.tfidf_vectorizer is None:
                self.tfidf_vectorizer = TfidfVectorizer(
                    max_features=500,
                    ngram_range=(1, 2),
                    min_df=2,
                    max_df=0.8
                )
                tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
            else:
                tfidf_matrix = self.tfidf_vectorizer.transform(texts)

            # Get feature names (words/phrases)
            feature_names = self.tfidf_vectorizer.get_feature_names_out()

            if len(feature_names) < 5:
                return []

            # Cluster words based on their TF-IDF vectors
            n_clusters = min(8, len(feature_names) // 3)
            if n_clusters < 2:
                return []

            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

            # Transpose to cluster features (words) instead of documents
            word_clusters = kmeans.fit_predict(tfidf_matrix.T.toarray())

            # Group words by cluster
            clusters = defaultdict(list)
            for word_idx, cluster_id in enumerate(word_clusters):
                clusters[cluster_id].append(feature_names[word_idx])

            # Convert to concept format
            cluster_concepts = []
            for cluster_id, words in clusters.items():
                if len(words) >= 2:  # Only clusters with multiple words
                    # Sort words by their average TF-IDF score
                    cluster_name = f"semantic_cluster_{cluster_id}_{'_'.join(words[:2])}"
                    cluster_concepts.append({
                        'name': cluster_name,
                        'words': words,
                        'cluster_id': cluster_id,
                        'size': len(words)
                    })

            return cluster_concepts

        except Exception as e:
            print(f"Semantic clustering failed: {e}")
            return []
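
    # Note (illustrative): TfidfVectorizer produces a documents-by-terms matrix, so
    # clustering its transpose groups *terms* by the documents they co-occur in.
    # For example (hypothetical sizes), with 10 documents and 120 retained terms:
    #     tfidf_matrix.shape    -> (10, 120)
    #     tfidf_matrix.T.shape  -> (120, 10)   # one row per term, fed to KMeans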

    def _extract_key_ngrams(self, texts, min_freq=2):
        """Extract key n-grams using advanced scoring"""
        try:
            # Extract 2-grams and 3-grams
            ngram_vectorizer = TfidfVectorizer(
                ngram_range=(2, 3),
                min_df=min_freq,
                max_df=0.8,
                stop_words='english'
            )

            ngram_matrix = ngram_vectorizer.fit_transform(texts)
            feature_names = ngram_vectorizer.get_feature_names_out()

            # Calculate importance scores
            tfidf_scores = np.array(ngram_matrix.sum(axis=0)).flatten()

            # Get top n-grams
            top_indices = tfidf_scores.argsort()[-15:][::-1]

            ngram_concepts = []
            for idx in top_indices:
                if tfidf_scores[idx] > 0:
                    ngram = feature_names[idx]
                    ngram_concepts.append({
                        'name': f"ngram_{ngram.replace(' ', '_')}",
                        'ngram': ngram,
                        'score': float(tfidf_scores[idx])
                    })

            return ngram_concepts

        except Exception as e:
            print(f"N-gram extraction failed: {e}")
            return []

    def _extract_domain_concepts(self, texts):
        """Extract domain-specific concepts using keyword patterns"""
        # Define domain-specific patterns
        domain_patterns = {
            'ai_ml': [
                r'\b(artificial intelligence|ai|machine learning|ml|deep learning|neural network|nlp|computer vision|data science)\b',
                r'\b(algorithm|model|training|prediction|classification|regression|clustering)\b',
                r'\b(tensorflow|pytorch|scikit|keras|pandas|numpy)\b'
            ],
            'tech': [
                r'\b(software|hardware|system|platform|framework|database|api|cloud|server)\b',
                r'\b(programming|development|coding|bug|feature|deployment|testing)\b',
                r'\b(python|java|javascript|sql|html|css|react|node)\b'
            ],
            'business': [
                r'\b(revenue|profit|sales|customer|market|strategy|growth|roi|kpi)\b',
                r'\b(management|team|project|budget|timeline|milestone|deliverable)\b',
                r'\b(analytics|metrics|dashboard|report|insight|trend)\b'
            ],
            'academic': [
                r'\b(research|study|analysis|experiment|hypothesis|methodology|results)\b',
                r'\b(publication|paper|journal|conference|peer review|citation)\b',
                r'\b(university|college|professor|student|degree|thesis)\b'
            ]
        }

        domain_concepts = []
        combined_text = ' '.join(texts).lower()

        for domain, patterns in domain_patterns.items():
            domain_matches = set()
            for pattern in patterns:
                matches = re.findall(pattern, combined_text)
                domain_matches.update(matches)

            if domain_matches:
                domain_concepts.append({
                    'name': f"domain_{domain}",
                    'domain': domain,
                    'concepts': list(domain_matches),
                    'count': len(domain_matches)
                })

        return domain_concepts
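

# Illustrative usage sketch (hypothetical helper, not part of the module's public API):
# drives the analyzer above directly. scikit-learn is required; spaCy and NLTK simply
# enrich the results when installed. With a corpus this small most extractors return
# little, because the vectorizers use min_df=2.
def _demo_advanced_text_analyzer():
    sample_texts = [
        "machine learning models for fraud detection in banking",
        "deep learning and neural networks for image classification",
        "business analytics dashboards for revenue and sales reporting",
        "research methodology and peer review for academic publication",
    ]
    analyzer = AdvancedTextAnalyzer()
    concepts = analyzer.extract_semantic_concepts(sample_texts, n_topics=2)
    for concept_type, items in concepts.items():
        print(f"{concept_type}: {len(items)} concept(s)")
    return concepts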


def get_advanced_ohe(dataframe: pd.DataFrame, column: str,
                     binary_format: str = "numeric",
                     analysis_type: str = "comprehensive",
                     n_topics: int = 6,
                     max_features: int = 25) -> pd.DataFrame:
    """
    Create sophisticated one-hot encoded columns using advanced academic algorithms.

    Args:
        dataframe (pd.DataFrame): Input dataframe
        column (str): Name of the column to process
        binary_format (str): Format for encoding - "numeric" for 1/0 or "text" for "Yes"/"No"
        analysis_type (str): Type of analysis - "comprehensive", "topic_focused", "entity_focused", "semantic_focused"
        n_topics (int): Number of topics for topic modeling
        max_features (int): Maximum number of features to create

    Returns:
        pd.DataFrame: Original dataframe with additional sophisticated one-hot encoded columns
    """
    # Check if column exists
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in dataframe")

    # Check binary format is valid
    if binary_format not in ["numeric", "text"]:
        raise ValueError("binary_format must be either 'numeric' or 'text'")

    # Filter out non-string values and get text data
    text_data = dataframe[column].dropna().astype(str).tolist()
    if not text_data:
        return dataframe  # Nothing to process

    # Initialize advanced analyzer
    analyzer = AdvancedTextAnalyzer()

    # Extract sophisticated concepts
    print("Extracting semantic concepts using advanced algorithms...")
    concepts = analyzer.extract_semantic_concepts(text_data, n_topics=n_topics)

    # Create features based on analysis type
    features_to_create = []

    if analysis_type in ["comprehensive", "topic_focused"]:
        # Add topic-based features
        for topic in concepts['topics_lda']:
            features_to_create.append({
                'name': f"has_{topic['name']}",
                'type': 'topic_lda',
                'keywords': topic['keywords']
            })

        for topic in concepts['topics_nmf']:
            features_to_create.append({
                'name': f"has_{topic['name']}",
                'type': 'topic_nmf',
                'keywords': topic['keywords']
            })

    if analysis_type in ["comprehensive", "entity_focused"]:
        # Add entity-based features
        for entity in concepts['entities']:
            features_to_create.append({
                'name': f"has_{entity['name']}",
                'type': 'entity',
                'entity_text': entity['entity']
            })

    if analysis_type in ["comprehensive", "semantic_focused"]:
        # Add semantic cluster features
        for cluster in concepts['semantic_clusters']:
            features_to_create.append({
                'name': f"has_{cluster['name']}",
                'type': 'semantic_cluster',
                'words': cluster['words']
            })

        # Add n-gram features
        for ngram in concepts['key_ngrams'][:10]:  # Top 10 n-grams
            features_to_create.append({
                'name': f"has_{ngram['name']}",
                'type': 'ngram',
                'ngram_text': ngram['ngram']
            })

    if analysis_type == "comprehensive":
        # Add domain concept features
        for domain in concepts['domain_concepts']:
            features_to_create.append({
                'name': f"has_{domain['name']}",
                'type': 'domain',
                'domain_concepts': domain['concepts']
            })

    # Limit features to max_features
    features_to_create = features_to_create[:max_features]

    # Create the actual features
    print(f"Creating {len(features_to_create)} sophisticated features...")

    for feature in features_to_create:
        column_name = feature['name']

        if feature['type'] in ['topic_lda', 'topic_nmf']:
            # Topic-based features: check if any keyword appears in text
            if binary_format == "numeric":
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: 1 if isinstance(x, str) and any(keyword in str(x).lower() for keyword in feature['keywords']) else 0
                )
            else:
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: "Yes" if isinstance(x, str) and any(keyword in str(x).lower() for keyword in feature['keywords']) else "No"
                )

        elif feature['type'] == 'entity':
            # Entity-based features
            if binary_format == "numeric":
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: 1 if isinstance(x, str) and feature['entity_text'] in str(x).lower() else 0
                )
            else:
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: "Yes" if isinstance(x, str) and feature['entity_text'] in str(x).lower() else "No"
                )

        elif feature['type'] == 'semantic_cluster':
            # Semantic cluster features
            if binary_format == "numeric":
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: 1 if isinstance(x, str) and any(word in str(x).lower() for word in feature['words']) else 0
                )
            else:
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: "Yes" if isinstance(x, str) and any(word in str(x).lower() for word in feature['words']) else "No"
                )

        elif feature['type'] == 'ngram':
            # N-gram features
            if binary_format == "numeric":
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: 1 if isinstance(x, str) and feature['ngram_text'] in str(x).lower() else 0
                )
            else:
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: "Yes" if isinstance(x, str) and feature['ngram_text'] in str(x).lower() else "No"
                )

        elif feature['type'] == 'domain':
            # Domain concept features
            if binary_format == "numeric":
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: 1 if isinstance(x, str) and any(concept in str(x).lower() for concept in feature['domain_concepts']) else 0
                )
            else:
                dataframe[column_name] = dataframe[column].apply(
                    lambda x: "Yes" if isinstance(x, str) and any(concept in str(x).lower() for concept in feature['domain_concepts']) else "No"
                )

    return dataframe
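

# Illustrative sketch (hypothetical data and helper name): a minimal call to
# get_advanced_ohe using the "text" output format instead of 1/0 flags.
def _demo_get_advanced_ohe_text_format():
    demo_df = pd.DataFrame({
        "ticket_text": [
            "database migration failed during cloud deployment",
            "customer reported a billing and revenue dashboard bug",
            "training a neural network model for churn prediction",
            "api server testing and deployment on the cloud platform",
        ]
    })
    encoded = get_advanced_ohe(
        demo_df,
        "ticket_text",
        binary_format="text",          # encode as "Yes"/"No"
        analysis_type="comprehensive",
        n_topics=2,
        max_features=10,
    )
    # Columns added by the call all share the "has_" prefix
    return [col for col in encoded.columns if col.startswith("has_")]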


def analyze_concept_correlations(dataframe: pd.DataFrame, encoded_columns: list) -> dict:
    """
    Analyze correlations between extracted concepts to identify hidden patterns.

    Args:
        dataframe (pd.DataFrame): DataFrame with encoded columns
        encoded_columns (list): List of encoded column names

    Returns:
        dict: Analysis results including correlation matrix and insights
    """
    if not encoded_columns:
        return {}

    # Calculate correlation matrix
    correlation_matrix = dataframe[encoded_columns].corr()

    # Find strong correlations (> 0.5)
    strong_correlations = []
    for i, col1 in enumerate(encoded_columns):
        for j, col2 in enumerate(encoded_columns[i+1:], i+1):
            corr_value = correlation_matrix.loc[col1, col2]
            if abs(corr_value) > 0.5:
                strong_correlations.append({
                    'feature1': col1,
                    'feature2': col2,
                    'correlation': corr_value,
                    'strength': 'strong' if abs(corr_value) > 0.7 else 'moderate'
                })

    # Analyze concept co-occurrence patterns
    co_occurrence_patterns = []
    for correlation in strong_correlations:
        if correlation['correlation'] > 0.5:  # Positive correlation
            pattern = {
                'pattern_type': 'co_occurrence',
                'features': [correlation['feature1'], correlation['feature2']],
                'strength': correlation['correlation'],
                'interpretation': f"When {correlation['feature1']} is present, {correlation['feature2']} is also likely to be present"
            }
            co_occurrence_patterns.append(pattern)

    return {
        'correlation_matrix': correlation_matrix,
        'strong_correlations': strong_correlations,
        'co_occurrence_patterns': co_occurrence_patterns,
        'summary': {
            'total_features': len(encoded_columns),
            'strong_correlations_count': len([c for c in strong_correlations if c['strength'] == 'strong']),
            'moderate_correlations_count': len([c for c in strong_correlations if c['strength'] == 'moderate'])
        }
    }
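

# Illustrative sketch (hypothetical data): correlation analysis is only meaningful
# for the numeric (1/0) encoding, as shown on a tiny hand-built frame of flags.
def _demo_concept_correlations():
    flags = pd.DataFrame({
        "has_domain_ai_ml":    [1, 1, 0, 1, 0, 1],
        "has_domain_tech":     [1, 1, 0, 1, 0, 1],
        "has_domain_business": [0, 0, 1, 0, 1, 0],
    })
    analysis = analyze_concept_correlations(flags, list(flags.columns))
    # The first two columns are identical (correlation 1.0, 'strong'); both are
    # perfectly anti-correlated with has_domain_business (-1.0, also 'strong').
    return analysis['strong_correlations']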


def test_advanced_ohe():
    """Test the advanced OHE function with AI/ML related text"""
    print("\n===== Testing Advanced OHE with AI/ML Text =====")

    # Create sample data with AI/ML related text
    ai_texts = [
        "Developing machine learning models using TensorFlow and neural networks for computer vision tasks",
        "Implementing deep learning algorithms for natural language processing and text classification",
        "Using artificial intelligence to automate data science workflows and predictive analytics",
        "Building recommendation systems with collaborative filtering and matrix factorization techniques",
        "Applying reinforcement learning agents to optimize decision making in complex environments",
        "Creating chatbots using large language models and transformer architectures like BERT",
        "Deploying ML models to production using Docker containers and Kubernetes orchestration",
        "Analyzing big data with Apache Spark and implementing real-time streaming analytics",
        "Using computer vision for object detection and image segmentation in autonomous vehicles",
        "Implementing explainable AI techniques to understand model predictions and bias detection"
    ]

    # Create dataframe
    df = pd.DataFrame({'ai_description': ai_texts})

    print("Original DataFrame:")
    print(df)

    # Test comprehensive analysis
    print("\n----- Testing Comprehensive Analysis -----")
    result_comprehensive = get_advanced_ohe(
        df.copy(),
        'ai_description',
        binary_format="numeric",
        analysis_type="comprehensive",
        n_topics=4,
        max_features=20
    )

    # Show new columns
    new_columns = [col for col in result_comprehensive.columns if col.startswith('has_')]
    print(f"\nCreated {len(new_columns)} sophisticated features:")
    for col in new_columns:
        print(f"  - {col}")

    print("\nSample of results (first 3 rows, new columns only):")
    print(result_comprehensive[new_columns].head(3))

    # Analyze correlations
    print("\n----- Analyzing Concept Correlations -----")
    correlation_analysis = analyze_concept_correlations(result_comprehensive, new_columns)

    print(f"Summary: {correlation_analysis['summary']}")

    if correlation_analysis['strong_correlations']:
        print("\nStrong correlations found:")
        for corr in correlation_analysis['strong_correlations'][:5]:  # Show top 5
            print(f"  {corr['feature1']} <-> {corr['feature2']}: {corr['correlation']:.3f} ({corr['strength']})")

    if correlation_analysis['co_occurrence_patterns']:
        print("\nCo-occurrence patterns:")
        for pattern in correlation_analysis['co_occurrence_patterns'][:3]:  # Show top 3
            print(f"  {pattern['interpretation']}")

    print("\nAdvanced OHE test completed successfully!")


if __name__ == "__main__":
    test_advanced_ohe()