sqlshell-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sqlshell/__init__.py +84 -0
  2. sqlshell/__main__.py +4926 -0
  3. sqlshell/ai_autocomplete.py +392 -0
  4. sqlshell/ai_settings_dialog.py +337 -0
  5. sqlshell/context_suggester.py +768 -0
  6. sqlshell/create_test_data.py +152 -0
  7. sqlshell/data/create_test_data.py +137 -0
  8. sqlshell/db/__init__.py +6 -0
  9. sqlshell/db/database_manager.py +1318 -0
  10. sqlshell/db/export_manager.py +188 -0
  11. sqlshell/editor.py +1166 -0
  12. sqlshell/editor_integration.py +127 -0
  13. sqlshell/execution_handler.py +421 -0
  14. sqlshell/menus.py +262 -0
  15. sqlshell/notification_manager.py +370 -0
  16. sqlshell/query_tab.py +904 -0
  17. sqlshell/resources/__init__.py +1 -0
  18. sqlshell/resources/icon.png +0 -0
  19. sqlshell/resources/logo_large.png +0 -0
  20. sqlshell/resources/logo_medium.png +0 -0
  21. sqlshell/resources/logo_small.png +0 -0
  22. sqlshell/resources/splash_screen.gif +0 -0
  23. sqlshell/space_invaders.py +501 -0
  24. sqlshell/splash_screen.py +405 -0
  25. sqlshell/sqlshell/__init__.py +5 -0
  26. sqlshell/sqlshell/create_test_data.py +118 -0
  27. sqlshell/sqlshell/create_test_databases.py +96 -0
  28. sqlshell/sqlshell_demo.png +0 -0
  29. sqlshell/styles.py +257 -0
  30. sqlshell/suggester_integration.py +330 -0
  31. sqlshell/syntax_highlighter.py +124 -0
  32. sqlshell/table_list.py +996 -0
  33. sqlshell/ui/__init__.py +6 -0
  34. sqlshell/ui/bar_chart_delegate.py +49 -0
  35. sqlshell/ui/filter_header.py +469 -0
  36. sqlshell/utils/__init__.py +16 -0
  37. sqlshell/utils/profile_cn2.py +1661 -0
  38. sqlshell/utils/profile_column.py +2635 -0
  39. sqlshell/utils/profile_distributions.py +616 -0
  40. sqlshell/utils/profile_entropy.py +347 -0
  41. sqlshell/utils/profile_foreign_keys.py +779 -0
  42. sqlshell/utils/profile_keys.py +2834 -0
  43. sqlshell/utils/profile_ohe.py +934 -0
  44. sqlshell/utils/profile_ohe_advanced.py +754 -0
  45. sqlshell/utils/profile_ohe_comparison.py +237 -0
  46. sqlshell/utils/profile_prediction.py +926 -0
  47. sqlshell/utils/profile_similarity.py +876 -0
  48. sqlshell/utils/search_in_df.py +90 -0
  49. sqlshell/widgets.py +400 -0
  50. sqlshell-0.4.4.dist-info/METADATA +441 -0
  51. sqlshell-0.4.4.dist-info/RECORD +54 -0
  52. sqlshell-0.4.4.dist-info/WHEEL +5 -0
  53. sqlshell-0.4.4.dist-info/entry_points.txt +2 -0
  54. sqlshell-0.4.4.dist-info/top_level.txt +1 -0
sqlshell/utils/profile_ohe_advanced.py
@@ -0,0 +1,754 @@
+ import pandas as pd
+ import numpy as np
+ import os
+ import sys
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation, NMF
+ from sklearn.cluster import KMeans, DBSCAN
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.manifold import TSNE
+ from collections import defaultdict, Counter
+ import re
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ # Optional imports with fallbacks
+ try:
+     import spacy
+     SPACY_AVAILABLE = True
+ except ImportError:
+     SPACY_AVAILABLE = False
+
+ # Flag to track if NLTK is available
+ NLTK_AVAILABLE = False
+
+ def _setup_nltk_data_path():
+     """Configure NLTK to find data in bundled location (for PyInstaller builds)"""
+     import nltk
+
+     # Check if running from a PyInstaller bundle
+     if getattr(sys, 'frozen', False):
+         # Running in a PyInstaller bundle
+         bundle_dir = sys._MEIPASS
+         nltk_data_path = os.path.join(bundle_dir, 'nltk_data')
+         if os.path.exists(nltk_data_path):
+             nltk.data.path.insert(0, nltk_data_path)
+
+     # Also check relative to the application
+     app_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+     possible_paths = [
+         os.path.join(app_dir, 'nltk_data'),
+         os.path.join(os.path.dirname(app_dir), 'nltk_data'),
+     ]
+     for path in possible_paths:
+         if os.path.exists(path) and path not in nltk.data.path:
+             nltk.data.path.insert(0, path)
+
+
+ def _simple_tokenize(text):
+     """Simple fallback tokenizer when NLTK is not available"""
+     # Simple word tokenization using regex
+     return re.findall(r'\b[a-zA-Z]+\b', text.lower())
+
+
+ def _get_simple_stopwords():
+     """Return a basic set of English stopwords when NLTK is not available"""
+     return {
+         'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+         'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
+         'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
+         'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought',
+         'used', 'it', 'its', 'this', 'that', 'these', 'those', 'i', 'me', 'my',
+         'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
+         'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
+         'hers', 'herself', 'they', 'them', 'their', 'theirs', 'themselves',
+         'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how', 'all',
+         'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
+         'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
+         'just', 'also', 'now', 'here', 'there', 'then', 'once', 'if', 'because',
+         'while', 'although', 'though', 'after', 'before', 'since', 'until', 'unless'
+     }
+
+ try:
+     import nltk
+     _setup_nltk_data_path()
+     from nltk.corpus import stopwords
+     from nltk.tokenize import word_tokenize, sent_tokenize
+
+     # Try to find required NLTK data, download if missing
+     try:
+         nltk.data.find('tokenizers/punkt')
+     except LookupError:
+         try:
+             nltk.download('punkt', quiet=True)
+         except Exception:
+             pass  # Download failed silently - NLTK features will be unavailable
+     try:
+         nltk.data.find('corpora/stopwords')
+     except LookupError:
+         try:
+             nltk.download('stopwords', quiet=True)
+         except Exception:
+             pass  # Download failed silently - NLTK features will be unavailable
+     try:
+         nltk.data.find('tokenizers/punkt_tab/english')
+     except LookupError:
+         try:
+             nltk.download('punkt_tab', quiet=True)
+         except Exception:
+             pass  # Download failed silently - NLTK features will be unavailable
+
+     # Test if NLTK is actually working
+     try:
+         _ = stopwords.words('english')
+         _ = word_tokenize("test")
+         NLTK_AVAILABLE = True
+     except Exception:
+         NLTK_AVAILABLE = False
+
+ except ImportError:
+     NLTK_AVAILABLE = False
+
+ class AdvancedTextAnalyzer:
+     """
+     Advanced text analyzer using multiple academic algorithms for sophisticated
+     feature extraction and semantic analysis.
+     """
+
+     def __init__(self, model_name='en_core_web_sm'):
+         """
+         Initialize the advanced text analyzer.
+
+         Args:
+             model_name (str): spaCy model name for NER and advanced processing
+         """
+         # Get stopwords (use NLTK if available, otherwise fallback)
+         if NLTK_AVAILABLE:
+             self.stop_words = set(stopwords.words('english'))
+         else:
+             self.stop_words = _get_simple_stopwords()
+
+         self.tfidf_vectorizer = None
+         self.lda_model = None
+         self.nmf_model = None
+         self.word_clusters = None
+         self.concept_mapping = {}
+
+         # Try to load spaCy model for NER
+         if SPACY_AVAILABLE:
+             try:
+                 self.nlp = spacy.load(model_name)
+             except OSError:
+                 self.nlp = None
+         else:
+             self.nlp = None
+
+     def extract_semantic_concepts(self, texts, n_topics=8, min_concept_freq=2):
+         """
+         Extract semantic concepts using multiple algorithms:
+         1. Topic Modeling (LDA + NMF)
+         2. TF-IDF with clustering
+         3. Named Entity Recognition (if available)
+         4. N-gram concept extraction
+
+         Args:
+             texts (list): List of text documents
+             n_topics (int): Number of topics for topic modeling
+             min_concept_freq (int): Minimum frequency for concept inclusion
+
+         Returns:
+             dict: Dictionary mapping concept types to extracted concepts
+         """
+         concepts = {
+             'topics_lda': [],
+             'topics_nmf': [],
+             'entities': [],
+             'semantic_clusters': [],
+             'key_ngrams': [],
+             'domain_concepts': []
+         }
+
+         if not texts or len(texts) == 0:
+             return concepts
+
+         # Clean and preprocess texts
+         cleaned_texts = [self._preprocess_text(text) for text in texts if isinstance(text, str)]
+         if not cleaned_texts:
+             return concepts
+
+         # 1. Topic Modeling with LDA and NMF
+         concepts['topics_lda'] = self._extract_topics_lda(cleaned_texts, n_topics)
+         concepts['topics_nmf'] = self._extract_topics_nmf(cleaned_texts, n_topics)
+
+         # 2. Named Entity Recognition (if spaCy is available)
+         if SPACY_AVAILABLE and self.nlp:
+             concepts['entities'] = self._extract_named_entities(texts)
+
+         # 3. Semantic clustering of words
+         concepts['semantic_clusters'] = self._extract_semantic_clusters(cleaned_texts)
+
+         # 4. Key N-gram extraction
+         concepts['key_ngrams'] = self._extract_key_ngrams(cleaned_texts, min_concept_freq)
+
+         # 5. Domain-specific concept extraction
+         concepts['domain_concepts'] = self._extract_domain_concepts(cleaned_texts)
+
+         return concepts
+
+     def _preprocess_text(self, text):
+         """Advanced text preprocessing"""
+         if not isinstance(text, str):
+             return ""
+
+         # Convert to lowercase and remove extra whitespace
+         text = re.sub(r'\s+', ' ', text.lower().strip())
+
+         # Remove special characters but keep important punctuation
+         text = re.sub(r'[^\w\s\-\.]', ' ', text)
+
+         # Tokenize (use NLTK if available, otherwise fallback)
+         if NLTK_AVAILABLE:
+             tokens = word_tokenize(text)
+         else:
+             tokens = _simple_tokenize(text)
+
+         # Remove stopwords and short tokens
+         tokens = [token for token in tokens if token not in self.stop_words and len(token) > 2]
+
+         return ' '.join(tokens)
+
+     def _extract_topics_lda(self, texts, n_topics):
+         """Extract topics using Latent Dirichlet Allocation"""
+         try:
+             # Create TF-IDF vectorizer
+             self.tfidf_vectorizer = TfidfVectorizer(
+                 max_features=1000,
+                 ngram_range=(1, 3),
+                 min_df=2,
+                 max_df=0.8
+             )
+
+             tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
+
+             # Apply LDA
+             self.lda_model = LatentDirichletAllocation(
+                 n_components=n_topics,
+                 random_state=42,
+                 max_iter=10
+             )
+
+             self.lda_model.fit(tfidf_matrix)
+
+             # Extract topic keywords
+             feature_names = self.tfidf_vectorizer.get_feature_names_out()
+             topics = []
+
+             for topic_idx, topic in enumerate(self.lda_model.components_):
+                 top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
+                 topic_name = f"topic_lda_{topic_idx}_{'_'.join(top_words[:2])}"
+                 topics.append({
+                     'name': topic_name,
+                     'keywords': top_words,
+                     'weight': float(np.sum(topic))
+                 })
+
+             return topics
+
+         except Exception as e:
+             print(f"LDA topic extraction failed: {e}")
+             return []
+
+     def _extract_topics_nmf(self, texts, n_topics):
+         """Extract topics using Non-negative Matrix Factorization"""
+         try:
+             if self.tfidf_vectorizer is None:
+                 self.tfidf_vectorizer = TfidfVectorizer(
+                     max_features=1000,
+                     ngram_range=(1, 3),
+                     min_df=2,
+                     max_df=0.8
+                 )
+                 tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
+             else:
+                 tfidf_matrix = self.tfidf_vectorizer.transform(texts)
+
+             # Apply NMF
+             self.nmf_model = NMF(
+                 n_components=n_topics,
+                 random_state=42,
+                 alpha_W=0.1,
+                 alpha_H=0.1
+             )
+
+             self.nmf_model.fit(tfidf_matrix)
+
+             # Extract topic keywords
+             feature_names = self.tfidf_vectorizer.get_feature_names_out()
+             topics = []
+
+             for topic_idx, topic in enumerate(self.nmf_model.components_):
+                 top_words = [feature_names[i] for i in topic.argsort()[-5:][::-1]]
+                 topic_name = f"topic_nmf_{topic_idx}_{'_'.join(top_words[:2])}"
+                 topics.append({
+                     'name': topic_name,
+                     'keywords': top_words,
+                     'weight': float(np.sum(topic))
+                 })
+
+             return topics
+
+         except Exception as e:
+             print(f"NMF topic extraction failed: {e}")
+             return []
+
+     def _extract_named_entities(self, texts):
+         """Extract named entities using spaCy"""
+         if self.nlp is None:
+             return []
+
+         entities = defaultdict(list)
+
+         try:
+             for text in texts:
+                 if isinstance(text, str):
+                     doc = self.nlp(text)
+                     for ent in doc.ents:
+                         # Focus on relevant entity types
+                         if ent.label_ in ['ORG', 'PRODUCT', 'TECHNOLOGY', 'EVENT', 'GPE', 'PERSON']:
+                             entities[ent.label_].append(ent.text.lower())
+
+             # Convert to concept format
+             entity_concepts = []
+             for entity_type, entity_list in entities.items():
+                 # Get most common entities of each type
+                 common_entities = Counter(entity_list).most_common(5)
+                 for entity, count in common_entities:
+                     if count >= 2:  # Must appear at least twice
+                         entity_concepts.append({
+                             'name': f"entity_{entity_type.lower()}_{entity.replace(' ', '_')}",
+                             'type': entity_type,
+                             'entity': entity,
+                             'frequency': count
+                         })
+
+             return entity_concepts
+
+         except Exception as e:
+             print(f"Named entity extraction failed: {e}")
+             return []
+
+     def _extract_semantic_clusters(self, texts):
+         """Extract semantic word clusters using TF-IDF and clustering"""
+         try:
+             if self.tfidf_vectorizer is None:
+                 self.tfidf_vectorizer = TfidfVectorizer(
+                     max_features=500,
+                     ngram_range=(1, 2),
+                     min_df=2,
+                     max_df=0.8
+                 )
+                 tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
+             else:
+                 tfidf_matrix = self.tfidf_vectorizer.transform(texts)
+
+             # Get feature names (words/phrases)
+             feature_names = self.tfidf_vectorizer.get_feature_names_out()
+
+             if len(feature_names) < 5:
+                 return []
+
+             # Cluster words based on their TF-IDF vectors
+             n_clusters = min(8, len(feature_names) // 3)
+             if n_clusters < 2:
+                 return []
+
+             kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+
+             # Transpose to cluster features (words) instead of documents
+             word_clusters = kmeans.fit_predict(tfidf_matrix.T.toarray())
+
+             # Group words by cluster
+             clusters = defaultdict(list)
+             for word_idx, cluster_id in enumerate(word_clusters):
+                 clusters[cluster_id].append(feature_names[word_idx])
+
+             # Convert to concept format
+             cluster_concepts = []
+             for cluster_id, words in clusters.items():
+                 if len(words) >= 2:  # Only clusters with multiple words
+                     # Sort words by their average TF-IDF score
+                     cluster_name = f"semantic_cluster_{cluster_id}_{'_'.join(words[:2])}"
+                     cluster_concepts.append({
+                         'name': cluster_name,
+                         'words': words,
+                         'cluster_id': cluster_id,
+                         'size': len(words)
+                     })
+
+             return cluster_concepts
+
+         except Exception as e:
+             print(f"Semantic clustering failed: {e}")
+             return []
+
+     def _extract_key_ngrams(self, texts, min_freq=2):
+         """Extract key n-grams using advanced scoring"""
+         try:
+             # Extract 2-grams and 3-grams
+             ngram_vectorizer = TfidfVectorizer(
+                 ngram_range=(2, 3),
+                 min_df=min_freq,
+                 max_df=0.8,
+                 stop_words='english'
+             )
+
+             ngram_matrix = ngram_vectorizer.fit_transform(texts)
+             feature_names = ngram_vectorizer.get_feature_names_out()
+
+             # Calculate importance scores
+             tfidf_scores = np.array(ngram_matrix.sum(axis=0)).flatten()
+
+             # Get top n-grams
+             top_indices = tfidf_scores.argsort()[-15:][::-1]
+
+             ngram_concepts = []
+             for idx in top_indices:
+                 if tfidf_scores[idx] > 0:
+                     ngram = feature_names[idx]
+                     ngram_concepts.append({
+                         'name': f"ngram_{ngram.replace(' ', '_')}",
+                         'ngram': ngram,
+                         'score': float(tfidf_scores[idx])
+                     })
+
+             return ngram_concepts
+
+         except Exception as e:
+             print(f"N-gram extraction failed: {e}")
+             return []
+
+     def _extract_domain_concepts(self, texts):
+         """Extract domain-specific concepts using keyword patterns"""
+         # Define domain-specific patterns
+         domain_patterns = {
+             'ai_ml': [
+                 r'\b(artificial intelligence|ai|machine learning|ml|deep learning|neural network|nlp|computer vision|data science)\b',
+                 r'\b(algorithm|model|training|prediction|classification|regression|clustering)\b',
+                 r'\b(tensorflow|pytorch|scikit|keras|pandas|numpy)\b'
+             ],
+             'tech': [
+                 r'\b(software|hardware|system|platform|framework|database|api|cloud|server)\b',
+                 r'\b(programming|development|coding|bug|feature|deployment|testing)\b',
+                 r'\b(python|java|javascript|sql|html|css|react|node)\b'
+             ],
+             'business': [
+                 r'\b(revenue|profit|sales|customer|market|strategy|growth|roi|kpi)\b',
+                 r'\b(management|team|project|budget|timeline|milestone|deliverable)\b',
+                 r'\b(analytics|metrics|dashboard|report|insight|trend)\b'
+             ],
+             'academic': [
+                 r'\b(research|study|analysis|experiment|hypothesis|methodology|results)\b',
+                 r'\b(publication|paper|journal|conference|peer review|citation)\b',
+                 r'\b(university|college|professor|student|degree|thesis)\b'
+             ]
+         }
+
+         domain_concepts = []
+         combined_text = ' '.join(texts).lower()
+
+         for domain, patterns in domain_patterns.items():
+             domain_matches = set()
+             for pattern in patterns:
+                 matches = re.findall(pattern, combined_text)
+                 domain_matches.update(matches)
+
+             if domain_matches:
+                 domain_concepts.append({
+                     'name': f"domain_{domain}",
+                     'domain': domain,
+                     'concepts': list(domain_matches),
+                     'count': len(domain_matches)
+                 })
+
+         return domain_concepts
+
+
+ def get_advanced_ohe(dataframe: pd.DataFrame, column: str,
+                      binary_format: str = "numeric",
+                      analysis_type: str = "comprehensive",
+                      n_topics: int = 6,
+                      max_features: int = 25) -> pd.DataFrame:
+     """
+     Create sophisticated one-hot encoded columns using advanced academic algorithms.
+
+     Args:
+         dataframe (pd.DataFrame): Input dataframe
+         column (str): Name of the column to process
+         binary_format (str): Format for encoding - "numeric" for 1/0 or "text" for "Yes"/"No"
+         analysis_type (str): Type of analysis - "comprehensive", "topic_focused", "entity_focused", "semantic_focused"
+         n_topics (int): Number of topics for topic modeling
+         max_features (int): Maximum number of features to create
+
+     Returns:
+         pd.DataFrame: Original dataframe with additional sophisticated one-hot encoded columns
+     """
+     # Check if column exists
+     if column not in dataframe.columns:
+         raise ValueError(f"Column '{column}' not found in dataframe")
+
+     # Check binary format is valid
+     if binary_format not in ["numeric", "text"]:
+         raise ValueError("binary_format must be either 'numeric' or 'text'")
+
+     # Filter out non-string values and get text data
+     text_data = dataframe[column].dropna().astype(str).tolist()
+     if not text_data:
+         return dataframe  # Nothing to process
+
+     # Initialize advanced analyzer
+     analyzer = AdvancedTextAnalyzer()
+
+     # Extract sophisticated concepts
+     print("Extracting semantic concepts using advanced algorithms...")
+     concepts = analyzer.extract_semantic_concepts(text_data, n_topics=n_topics)
+
+     # Create features based on analysis type
+     features_to_create = []
+
+     if analysis_type in ["comprehensive", "topic_focused"]:
+         # Add topic-based features
+         for topic in concepts['topics_lda']:
+             features_to_create.append({
+                 'name': f"has_{topic['name']}",
+                 'type': 'topic_lda',
+                 'keywords': topic['keywords']
+             })
+
+         for topic in concepts['topics_nmf']:
+             features_to_create.append({
+                 'name': f"has_{topic['name']}",
+                 'type': 'topic_nmf',
+                 'keywords': topic['keywords']
+             })
+
+     if analysis_type in ["comprehensive", "entity_focused"]:
+         # Add entity-based features
+         for entity in concepts['entities']:
+             features_to_create.append({
+                 'name': f"has_{entity['name']}",
+                 'type': 'entity',
+                 'entity_text': entity['entity']
+             })
+
+     if analysis_type in ["comprehensive", "semantic_focused"]:
+         # Add semantic cluster features
+         for cluster in concepts['semantic_clusters']:
+             features_to_create.append({
+                 'name': f"has_{cluster['name']}",
+                 'type': 'semantic_cluster',
+                 'words': cluster['words']
+             })
+
+         # Add n-gram features
+         for ngram in concepts['key_ngrams'][:10]:  # Top 10 n-grams
+             features_to_create.append({
+                 'name': f"has_{ngram['name']}",
+                 'type': 'ngram',
+                 'ngram_text': ngram['ngram']
+             })
+
+     if analysis_type == "comprehensive":
+         # Add domain concept features
+         for domain in concepts['domain_concepts']:
+             features_to_create.append({
+                 'name': f"has_{domain['name']}",
+                 'type': 'domain',
+                 'domain_concepts': domain['concepts']
+             })
+
+     # Limit features to max_features
+     features_to_create = features_to_create[:max_features]
+
+     # Create the actual features
+     print(f"Creating {len(features_to_create)} sophisticated features...")
+
+     for feature in features_to_create:
+         column_name = feature['name']
+
+         if feature['type'] in ['topic_lda', 'topic_nmf']:
+             # Topic-based features: check if any keyword appears in text
+             if binary_format == "numeric":
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: 1 if isinstance(x, str) and any(keyword in str(x).lower() for keyword in feature['keywords']) else 0
+                 )
+             else:
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: "Yes" if isinstance(x, str) and any(keyword in str(x).lower() for keyword in feature['keywords']) else "No"
+                 )
+
+         elif feature['type'] == 'entity':
+             # Entity-based features
+             if binary_format == "numeric":
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: 1 if isinstance(x, str) and feature['entity_text'] in str(x).lower() else 0
+                 )
+             else:
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: "Yes" if isinstance(x, str) and feature['entity_text'] in str(x).lower() else "No"
+                 )
+
+         elif feature['type'] == 'semantic_cluster':
+             # Semantic cluster features
+             if binary_format == "numeric":
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: 1 if isinstance(x, str) and any(word in str(x).lower() for word in feature['words']) else 0
+                 )
+             else:
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: "Yes" if isinstance(x, str) and any(word in str(x).lower() for word in feature['words']) else "No"
+                 )
+
+         elif feature['type'] == 'ngram':
+             # N-gram features
+             if binary_format == "numeric":
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: 1 if isinstance(x, str) and feature['ngram_text'] in str(x).lower() else 0
+                 )
+             else:
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: "Yes" if isinstance(x, str) and feature['ngram_text'] in str(x).lower() else "No"
+                 )
+
+         elif feature['type'] == 'domain':
+             # Domain concept features
+             if binary_format == "numeric":
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: 1 if isinstance(x, str) and any(concept in str(x).lower() for concept in feature['domain_concepts']) else 0
+                 )
+             else:
+                 dataframe[column_name] = dataframe[column].apply(
+                     lambda x: "Yes" if isinstance(x, str) and any(concept in str(x).lower() for concept in feature['domain_concepts']) else "No"
+                 )
+
+     return dataframe
+
+
+ def analyze_concept_correlations(dataframe: pd.DataFrame, encoded_columns: list) -> dict:
+     """
+     Analyze correlations between extracted concepts to identify hidden patterns.
+
+     Args:
+         dataframe (pd.DataFrame): DataFrame with encoded columns
+         encoded_columns (list): List of encoded column names
+
+     Returns:
+         dict: Analysis results including correlation matrix and insights
+     """
+     if not encoded_columns:
+         return {}
+
+     # Calculate correlation matrix
+     correlation_matrix = dataframe[encoded_columns].corr()
+
+     # Find strong correlations (> 0.5)
+     strong_correlations = []
+     for i, col1 in enumerate(encoded_columns):
+         for j, col2 in enumerate(encoded_columns[i+1:], i+1):
+             corr_value = correlation_matrix.loc[col1, col2]
+             if abs(corr_value) > 0.5:
+                 strong_correlations.append({
+                     'feature1': col1,
+                     'feature2': col2,
+                     'correlation': corr_value,
+                     'strength': 'strong' if abs(corr_value) > 0.7 else 'moderate'
+                 })
+
+     # Analyze concept co-occurrence patterns
+     co_occurrence_patterns = []
+     for correlation in strong_correlations:
+         if correlation['correlation'] > 0.5:  # Positive correlation
+             pattern = {
+                 'pattern_type': 'co_occurrence',
+                 'features': [correlation['feature1'], correlation['feature2']],
+                 'strength': correlation['correlation'],
+                 'interpretation': f"When {correlation['feature1']} is present, {correlation['feature2']} is also likely to be present"
+             }
+             co_occurrence_patterns.append(pattern)
+
+     return {
+         'correlation_matrix': correlation_matrix,
+         'strong_correlations': strong_correlations,
+         'co_occurrence_patterns': co_occurrence_patterns,
+         'summary': {
+             'total_features': len(encoded_columns),
+             'strong_correlations_count': len([c for c in strong_correlations if c['strength'] == 'strong']),
+             'moderate_correlations_count': len([c for c in strong_correlations if c['strength'] == 'moderate'])
+         }
+     }
+
+
+ def test_advanced_ohe():
+     """Test the advanced OHE function with AI/ML related text"""
+     print("\n===== Testing Advanced OHE with AI/ML Text =====")
+
+     # Create sample data with AI/ML related text
+     ai_texts = [
+         "Developing machine learning models using TensorFlow and neural networks for computer vision tasks",
+         "Implementing deep learning algorithms for natural language processing and text classification",
+         "Using artificial intelligence to automate data science workflows and predictive analytics",
+         "Building recommendation systems with collaborative filtering and matrix factorization techniques",
+         "Applying reinforcement learning agents to optimize decision making in complex environments",
+         "Creating chatbots using large language models and transformer architectures like BERT",
+         "Deploying ML models to production using Docker containers and Kubernetes orchestration",
+         "Analyzing big data with Apache Spark and implementing real-time streaming analytics",
+         "Using computer vision for object detection and image segmentation in autonomous vehicles",
+         "Implementing explainable AI techniques to understand model predictions and bias detection"
+     ]
+
+     # Create dataframe
+     df = pd.DataFrame({'ai_description': ai_texts})
+
+     print("Original DataFrame:")
+     print(df)
+
+     # Test comprehensive analysis
+     print("\n----- Testing Comprehensive Analysis -----")
+     result_comprehensive = get_advanced_ohe(
+         df.copy(),
+         'ai_description',
+         binary_format="numeric",
+         analysis_type="comprehensive",
+         n_topics=4,
+         max_features=20
+     )
+
+     # Show new columns
+     new_columns = [col for col in result_comprehensive.columns if col.startswith('has_')]
+     print(f"\nCreated {len(new_columns)} sophisticated features:")
+     for col in new_columns:
+         print(f"  - {col}")
+
+     print("\nSample of results (first 3 rows, new columns only):")
+     print(result_comprehensive[new_columns].head(3))
+
+     # Analyze correlations
+     print("\n----- Analyzing Concept Correlations -----")
+     correlation_analysis = analyze_concept_correlations(result_comprehensive, new_columns)
+
+     print(f"Summary: {correlation_analysis['summary']}")
+
+     if correlation_analysis['strong_correlations']:
+         print("\nStrong correlations found:")
+         for corr in correlation_analysis['strong_correlations'][:5]:  # Show top 5
+             print(f"  {corr['feature1']} <-> {corr['feature2']}: {corr['correlation']:.3f} ({corr['strength']})")
+
+     if correlation_analysis['co_occurrence_patterns']:
+         print("\nCo-occurrence patterns:")
+         for pattern in correlation_analysis['co_occurrence_patterns'][:3]:  # Show top 3
+             print(f"  {pattern['interpretation']}")
+
+     print("\nAdvanced OHE test completed successfully!")
+
+
+ if __name__ == "__main__":
+     test_advanced_ohe()
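
A minimal usage sketch of the two public helpers added by this file, assuming sqlshell 0.4.4 is installed alongside pandas and scikit-learn (NLTK and spaCy are optional fallbacks handled inside the module). The DataFrame, its 'notes' column, and the sample strings are illustrative only, not taken from the package:

import pandas as pd
from sqlshell.utils.profile_ohe_advanced import (
    get_advanced_ohe,
    analyze_concept_correlations,
)

# Illustrative free-text column; any string column can be profiled this way.
df = pd.DataFrame({'notes': [
    "machine learning model training with tensorflow and scikit",
    "sql database deployment on a cloud platform",
    "research paper on clustering methodology and analysis",
    "customer analytics dashboard with revenue metrics",
]})

# Append binary has_* columns derived from topics, n-grams and domain keywords.
encoded = get_advanced_ohe(
    df,
    'notes',
    binary_format="numeric",
    analysis_type="comprehensive",
    n_topics=2,
    max_features=10,
)

new_cols = [c for c in encoded.columns if c.startswith('has_')]
print(new_cols)

# Optionally inspect how the generated concept flags co-occur.
if new_cols:
    print(analyze_concept_correlations(encoded, new_cols)['summary'])

On a corpus this small the LDA/NMF steps may find no usable vocabulary (min_df=2) and fall back to empty topic lists; the domain-keyword features still populate, which is enough to see the output shape.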