ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,761 @@
1
+ """
2
+ NLP & Text Analytics Tools
3
+
4
+ Advanced natural language processing tools for text analysis, topic modeling,
5
+ named entity recognition, sentiment analysis, and text similarity.
6
+ """
7
+
8
+ import polars as pl
9
+ import numpy as np
10
+ from typing import Dict, Any, List, Optional, Tuple
11
+ from pathlib import Path
12
+ import json
13
+
14
+ # Core NLP
15
+ try:
16
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
17
+ from sklearn.decomposition import LatentDirichletAllocation, NMF
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+ except ImportError:
20
+ pass
21
+
22
+ # Advanced NLP (optional)
23
+ try:
24
+ import spacy
25
+ SPACY_AVAILABLE = True
26
+ except ImportError:
27
+ SPACY_AVAILABLE = False
28
+
29
+ try:
30
+ from transformers import pipeline, AutoTokenizer, AutoModel
31
+ import torch
32
+ TRANSFORMERS_AVAILABLE = True
33
+ except ImportError:
34
+ TRANSFORMERS_AVAILABLE = False
35
+
36
+ try:
37
+ from bertopic import BERTopic
38
+ BERTOPIC_AVAILABLE = True
39
+ except ImportError:
40
+ BERTOPIC_AVAILABLE = False
41
+
42
+ # Basic NLP
43
+ try:
44
+ from textblob import TextBlob
45
+ except ImportError:
46
+ pass
47
+
48
+ import re
49
+ from collections import Counter
50
+
51
+
52
def perform_topic_modeling(
    data: pl.DataFrame,
    text_column: str,
    n_topics: int = 5,
    method: str = "lda",
    n_top_words: int = 10,
    min_df: int = 2,
    max_df: float = 0.95,
    ngram_range: Tuple[int, int] = (1, 2),
    random_state: int = 42,
    **kwargs
) -> Dict[str, Any]:
    """
    Perform topic modeling on text data using LDA, NMF, or BERTopic.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        n_topics: Number of topics to extract
        method: Topic modeling method ('lda', 'nmf', 'bertopic')
        n_top_words: Number of top words per topic
        min_df: Minimum document frequency for terms
        max_df: Maximum document frequency for terms
        ngram_range: Range of n-grams to extract
        random_state: Random state for reproducibility
        **kwargs: Additional parameters for the chosen method

    Returns:
        Dictionary containing topics, document-topic distributions, and
        metrics. ``valid_document_indices`` maps each modeled document back
        to its original row in ``data`` (empty/null rows are dropped before
        modeling, so per-document outputs are positional over this list).

    Raises:
        ValueError: If the text column is missing, there are fewer non-empty
            documents than topics, or an unknown method is requested.
        ImportError: If method='bertopic' is requested but BERTopic is not
            installed.
    """
    print(f"🔍 Performing topic modeling using {method.upper()}...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Fail fast with a clear message. Previously an unavailable BERTopic fell
    # through to the generic "Unknown method 'bertopic'" error, which
    # contradicted the docstring listing 'bertopic' as valid.
    if method == "bertopic" and not BERTOPIC_AVAILABLE:
        raise ImportError(
            "method='bertopic' requested but BERTopic is not installed. "
            "Install it with: pip install bertopic"
        )

    # Extract text and normalize nulls to empty strings
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]

    # Drop empty documents, keeping their original row indices so callers can
    # map document-level outputs back onto the DataFrame.
    valid_indices = [i for i, t in enumerate(texts) if len(t.strip()) > 0]
    texts_clean = [texts[i] for i in valid_indices]

    if len(texts_clean) < n_topics:
        raise ValueError(f"Not enough documents ({len(texts_clean)}) for {n_topics} topics")

    result = {
        "method": method,
        "n_topics": n_topics,
        "n_documents": len(texts_clean),
        "valid_document_indices": valid_indices,
        "topics": [],
        "document_topics": None,
        "topic_coherence": None
    }

    try:
        if method == "bertopic":
            # BERTopic - transformer-based topic modeling
            print("   Using BERTopic (transformer-based)...")

            model = BERTopic(
                nr_topics=n_topics,
                language="english",
                calculate_probabilities=True,
                verbose=False,
                **kwargs
            )

            topics_assigned, probabilities = model.fit_transform(texts_clean)

            # Extract topic information (sizes come from the model's summary)
            topic_info = model.get_topic_info()

            for topic_id in range(n_topics):
                if topic_id in model.get_topics():
                    topic_words = model.get_topic(topic_id)[:n_top_words]
                    result["topics"].append({
                        "topic_id": topic_id,
                        "words": [word for word, score in topic_words],
                        "scores": [float(score) for word, score in topic_words],
                        "size": int(topic_info[topic_info['Topic'] == topic_id]['Count'].iloc[0])
                    })

            # Document-topic distributions
            result["document_topics"] = probabilities.tolist() if probabilities is not None else None
            result["topic_assignments"] = topics_assigned.tolist()

        elif method in ["lda", "nmf"]:
            # Traditional topic modeling with sklearn
            print(f"   Using {method.upper()} with TF-IDF/Count vectorization...")

            # Vectorization: LDA expects raw counts; NMF works best on TF-IDF.
            if method == "lda":
                vectorizer = CountVectorizer(
                    min_df=min_df,
                    max_df=max_df,
                    ngram_range=ngram_range,
                    stop_words='english',
                    max_features=kwargs.get('max_features', 1000)
                )
            else:  # nmf
                vectorizer = TfidfVectorizer(
                    min_df=min_df,
                    max_df=max_df,
                    ngram_range=ngram_range,
                    stop_words='english',
                    max_features=kwargs.get('max_features', 1000)
                )

            doc_term_matrix = vectorizer.fit_transform(texts_clean)
            feature_names = vectorizer.get_feature_names_out()

            # Topic modeling
            if method == "lda":
                model = LatentDirichletAllocation(
                    n_components=n_topics,
                    random_state=random_state,
                    max_iter=kwargs.get('max_iter', 20),
                    learning_method='online',
                    n_jobs=-1
                )
            else:  # nmf
                model = NMF(
                    n_components=n_topics,
                    random_state=random_state,
                    max_iter=kwargs.get('max_iter', 200),
                    init='nndsvda'
                )

            doc_topic_dist = model.fit_transform(doc_term_matrix)

            # Extract topics: top words per component, highest weight first
            for topic_idx, topic in enumerate(model.components_):
                top_indices = topic.argsort()[-n_top_words:][::-1]
                top_words = [feature_names[i] for i in top_indices]
                top_scores = [float(topic[i]) for i in top_indices]

                result["topics"].append({
                    "topic_id": topic_idx,
                    "words": top_words,
                    "scores": top_scores,
                    "size": int((doc_topic_dist.argmax(axis=1) == topic_idx).sum())
                })

            # Document-topic distributions
            result["document_topics"] = doc_topic_dist.tolist()

            # Topic assignments (most probable topic per document)
            result["topic_assignments"] = doc_topic_dist.argmax(axis=1).tolist()

            # Calculate perplexity for LDA (lower is better)
            if method == "lda":
                result["perplexity"] = float(model.perplexity(doc_term_matrix))
                result["log_likelihood"] = float(model.score(doc_term_matrix))

            # Vocabulary size
            result["vocabulary_size"] = len(feature_names)

        else:
            raise ValueError(f"Unknown method '{method}'. Use 'lda', 'nmf', or 'bertopic'")

        # Topic diversity: fraction of unique words across all topics
        all_topic_words = set()
        total_topic_words = 0
        for topic in result["topics"]:
            all_topic_words.update(topic["words"])
            total_topic_words += len(topic["words"])

        result["topic_diversity"] = len(all_topic_words) / total_topic_words if total_topic_words > 0 else 0

        # Summary statistics
        result["summary"] = {
            "total_topics": len(result["topics"]),
            "avg_topic_size": np.mean([t["size"] for t in result["topics"]]),
            "topic_diversity": result["topic_diversity"]
        }

        print(f"✅ Topic modeling complete! Found {len(result['topics'])} topics")
        print(f"   Topic diversity: {result['topic_diversity']:.3f}")

        return result

    except Exception as e:
        print(f"❌ Error during topic modeling: {str(e)}")
        raise
238
+
239
+
240
def perform_named_entity_recognition(
    data: pl.DataFrame,
    text_column: str,
    model: str = "en_core_web_sm",
    entity_types: Optional[List[str]] = None,
    min_confidence: float = 0.0
) -> Dict[str, Any]:
    """
    Perform named entity recognition to extract people, organizations, locations, etc.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        model: spaCy model to use ('en_core_web_sm', 'en_core_web_md', 'en_core_web_lg')
        entity_types: List of entity types to extract (e.g., ['PERSON', 'ORG', 'GPE'])
                     If None, extracts all types
        min_confidence: Minimum confidence score for entity extraction (0.0-1.0).
                     NOTE(review): currently unused — spaCy's standard NER
                     pipeline exposes no per-entity confidence; kept for
                     interface compatibility.

    Returns:
        Dictionary containing extracted entities, counts, and statistics

    Raises:
        ValueError: If the text column is missing from the DataFrame.
    """
    print(f"🔍 Performing named entity recognition with spaCy...")

    # Validate input first so a missing column raises a clear error even on
    # the pattern-matching fallback path.
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    if not SPACY_AVAILABLE:
        # Fallback to basic pattern matching
        print("⚠️ spaCy not available. Using basic pattern matching...")
        return _perform_ner_basic(data, text_column)

    try:
        # Load spaCy model, downloading it on first use if necessary
        try:
            nlp = spacy.load(model)
        except OSError:
            print(f"⚠️ Model '{model}' not found. Attempting to download...")
            import subprocess
            import sys
            # Use the running interpreter, not whatever "python" happens to
            # resolve to on PATH (may be a different env, or absent on Windows).
            subprocess.run([sys.executable, "-m", "spacy", "download", model], check=True)
            nlp = spacy.load(model)

        # Extract text, normalizing nulls to empty strings
        texts = data[text_column].to_list()
        texts = [str(t) if t is not None else "" for t in texts]

        # Process documents
        all_entities = []
        entity_counts = Counter()
        entity_by_type = {}

        print(f"   Processing {len(texts)} documents...")

        for doc_idx, text in enumerate(texts):
            if len(text.strip()) == 0:
                continue

            doc = nlp(text)

            for ent in doc.ents:
                # Filter by entity type if specified
                if entity_types and ent.label_ not in entity_types:
                    continue

                entity_info = {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "document_id": doc_idx
                }

                all_entities.append(entity_info)
                entity_counts[(ent.text, ent.label_)] += 1

                if ent.label_ not in entity_by_type:
                    entity_by_type[ent.label_] = []
                entity_by_type[ent.label_].append(ent.text)

        # Aggregate results; entity_counts is capped at the top 100 to keep
        # the payload bounded on large corpora.
        result = {
            "total_entities": len(all_entities),
            "unique_entities": len(entity_counts),
            "entities": all_entities,
            "entity_counts": [
                {"text": text, "label": label, "count": count}
                for (text, label), count in entity_counts.most_common(100)
            ],
            "by_type": {}
        }

        # Statistics by entity type
        for entity_type, entities in entity_by_type.items():
            type_counter = Counter(entities)
            result["by_type"][entity_type] = {
                "total": len(entities),
                "unique": len(type_counter),
                "top_entities": [
                    {"text": text, "count": count}
                    for text, count in type_counter.most_common(10)
                ]
            }

        print(f"✅ NER complete! Found {result['total_entities']} entities")
        print(f"   Unique entities: {result['unique_entities']}")
        print(f"   Entity types: {', '.join(result['by_type'].keys())}")

        return result

    except Exception as e:
        print(f"❌ Error during NER: {str(e)}")
        raise
352
+
353
+
354
def _perform_ner_basic(data: pl.DataFrame, text_column: str) -> Dict[str, Any]:
    """Fallback NER using basic pattern matching when spaCy is not available.

    Only detects EMAIL, URL, and PHONE entities via regular expressions;
    real named-entity types (PERSON, ORG, ...) require spaCy.
    """

    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]

    # Basic patterns.
    # Fixed: TLD class was [A-Z|a-z]{2,}, which also matched a literal '|'.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # Fixed: the original class contained doubled backslashes (r'[!*\\(\\),]'),
    # so it matched literal backslashes. Match any run of non-whitespace,
    # non-quote characters after the scheme instead.
    url_pattern = r'https?://[^\s<>"\']+'
    phone_pattern = r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'

    emails = []
    urls = []
    phones = []

    for text in texts:
        emails.extend(re.findall(email_pattern, text))
        urls.extend(re.findall(url_pattern, text))
        phones.extend(re.findall(phone_pattern, text))

    return {
        "method": "basic_pattern_matching",
        "total_entities": len(emails) + len(urls) + len(phones),
        "by_type": {
            "EMAIL": {"total": len(emails), "unique": len(set(emails)), "examples": list(set(emails))[:10]},
            "URL": {"total": len(urls), "unique": len(set(urls)), "examples": list(set(urls))[:10]},
            "PHONE": {"total": len(phones), "unique": len(set(phones)), "examples": list(set(phones))[:10]}
        },
        "note": "Install spaCy for advanced NER: pip install spacy && python -m spacy download en_core_web_sm"
    }
384
+
385
+
386
def analyze_sentiment_advanced(
    data: pl.DataFrame,
    text_column: str,
    method: str = "transformer",
    model_name: str = "distilbert-base-uncased-finetuned-sst-2-english",
    aspects: Optional[List[str]] = None,
    detect_emotions: bool = True
) -> Dict[str, Any]:
    """
    Perform advanced sentiment analysis with aspect-based sentiment and emotion detection.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        method: Analysis method ('transformer', 'textblob', 'vader').
                'transformer' falls back to TextBlob when the transformers
                package is unavailable; 'vader' falls back to TextBlob when
                vaderSentiment is not installed.
        model_name: Transformer model for sentiment analysis
        aspects: List of aspects for aspect-based sentiment (e.g., ['price', 'quality'])
        detect_emotions: Whether to detect emotions (joy, anger, sadness, etc.)

    Returns:
        Dictionary containing sentiment scores, emotions, and statistics.
        The record shape in 'sentiments' depends on the engine actually used:
        transformer -> {label, score}, VADER -> {compound, positive, negative,
        neutral, label}, TextBlob -> {polarity, subjectivity, label}.

    Raises:
        ValueError: If the text column is missing from the DataFrame.
    """
    print(f"🔍 Performing advanced sentiment analysis...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Extract text, normalize nulls, and drop empty documents
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]
    texts_clean = [t for t in texts if len(t.strip()) > 0]

    result = {
        "method": method,
        "n_documents": len(texts_clean),
        "sentiments": [],
        "statistics": {}
    }

    try:
        if method == "transformer" and TRANSFORMERS_AVAILABLE:
            print(f"   Using transformer model: {model_name}")

            # Sentiment analysis pipeline; truncate long inputs to the
            # model's 512-token limit.
            sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model=model_name,
                truncation=True,
                max_length=512
            )

            # Process in batches to bound memory usage
            batch_size = 32
            all_sentiments = []

            for i in range(0, len(texts_clean), batch_size):
                batch = texts_clean[i:i+batch_size]
                batch_results = sentiment_pipeline(batch)
                all_sentiments.extend(batch_results)

            result["sentiments"] = [
                {
                    "label": s["label"],
                    "score": float(s["score"]),
                    "text": texts_clean[i][:100]  # First 100 chars
                }
                for i, s in enumerate(all_sentiments)
            ]

            # Emotion detection (best-effort; failures don't abort the run)
            if detect_emotions:
                try:
                    emotion_pipeline = pipeline(
                        "text-classification",
                        model="j-hartmann/emotion-english-distilroberta-base",
                        truncation=True,
                        max_length=512
                    )

                    emotions = []
                    for i in range(0, len(texts_clean), batch_size):
                        batch = texts_clean[i:i+batch_size]
                        batch_emotions = emotion_pipeline(batch)
                        emotions.extend(batch_emotions)

                    result["emotions"] = [
                        {"emotion": e["label"], "score": float(e["score"])}
                        for e in emotions
                    ]

                    # Emotion distribution
                    emotion_counts = Counter([e["label"] for e in emotions])
                    result["emotion_distribution"] = dict(emotion_counts)

                except Exception as e:
                    print(f"⚠️ Emotion detection failed: {str(e)}")
                    result["emotions"] = None

        else:
            # Check if method is 'vader' - use vaderSentiment
            if method == "vader":
                try:
                    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
                    print("   Using VADER for sentiment analysis...")

                    analyzer = SentimentIntensityAnalyzer()
                    sentiments = []
                    for text in texts_clean:
                        scores = analyzer.polarity_scores(text)
                        label = "POSITIVE" if scores['compound'] > 0.05 else "NEGATIVE" if scores['compound'] < -0.05 else "NEUTRAL"
                        sentiments.append({
                            "compound": scores['compound'],
                            "positive": scores['pos'],
                            "negative": scores['neg'],
                            "neutral": scores['neu'],
                            "label": label,
                            "text": text[:100]
                        })

                    result["sentiments"] = sentiments

                except ImportError:
                    print("⚠️ vaderSentiment not installed. Falling back to TextBlob.")
                    print("   Install with: pip install vaderSentiment>=3.3")
                    method = "textblob"

            if method in ["textblob", "transformer"]:
                # TextBlob path: either requested directly, or the fallback
                # when transformers / vaderSentiment are unavailable.
                print("   Using TextBlob for sentiment analysis...")

                sentiments = []
                for text in texts_clean:
                    blob = TextBlob(text)
                    sentiments.append({
                        "polarity": blob.sentiment.polarity,
                        "subjectivity": blob.sentiment.subjectivity,
                        "label": "POSITIVE" if blob.sentiment.polarity > 0 else "NEGATIVE" if blob.sentiment.polarity < 0 else "NEUTRAL",
                        "text": text[:100]
                    })

                result["sentiments"] = sentiments

        # Aspect-based sentiment
        if aspects:
            print(f"   Analyzing aspect-based sentiment for: {', '.join(aspects)}")
            result["aspect_sentiments"] = _extract_aspect_sentiments(texts_clean, aspects)

        # Statistics — keyed off the fields actually present in the records,
        # NOT the requested method. The original keyed on `method`, which
        # raised KeyError when 'transformer' was requested but the fallback
        # produced TextBlob records (no 'score'), and for VADER records
        # (which carry 'compound', not 'polarity').
        sentiments = result["sentiments"]
        n_docs = max(len(texts_clean), 1)  # guard division on empty input
        if sentiments:
            first = sentiments[0]
            if "score" in first:  # transformer pipeline records
                label_counts = Counter(s["label"] for s in sentiments)
                result["statistics"] = {
                    "sentiment_distribution": dict(label_counts),
                    "positive_ratio": label_counts.get("POSITIVE", 0) / n_docs,
                    "negative_ratio": label_counts.get("NEGATIVE", 0) / n_docs,
                    "avg_confidence": float(np.mean([s["score"] for s in sentiments]))
                }
            elif "compound" in first:  # VADER records
                compounds = [s["compound"] for s in sentiments]
                label_counts = Counter(s["label"] for s in sentiments)
                result["statistics"] = {
                    "sentiment_distribution": dict(label_counts),
                    "avg_compound": float(np.mean(compounds)),
                    "std_compound": float(np.std(compounds)),
                    "positive_ratio": label_counts.get("POSITIVE", 0) / n_docs,
                    "negative_ratio": label_counts.get("NEGATIVE", 0) / n_docs
                }
            else:  # TextBlob records
                polarities = [s["polarity"] for s in sentiments]
                result["statistics"] = {
                    "avg_polarity": float(np.mean(polarities)),
                    "std_polarity": float(np.std(polarities)),
                    "positive_ratio": sum(1 for p in polarities if p > 0) / len(polarities),
                    "negative_ratio": sum(1 for p in polarities if p < 0) / len(polarities),
                    "neutral_ratio": sum(1 for p in polarities if p == 0) / len(polarities)
                }

        print(f"✅ Sentiment analysis complete!")
        print(f"   Distribution: {result['statistics'].get('sentiment_distribution', 'N/A')}")

        return result

    except Exception as e:
        print(f"❌ Error during sentiment analysis: {str(e)}")
        raise
561
+
562
+
563
def _extract_aspect_sentiments(texts: List[str], aspects: List[str]) -> Dict[str, Any]:
    """Extract sentiment for specific aspects in text.

    For each aspect, collects the sentences that mention it (case-insensitive
    substring match) and scores each with TextBlob polarity/subjectivity,
    then aggregates per-aspect counts and averages.
    """

    aspect_sentiments = {aspect: [] for aspect in aspects}
    # Lowercase each aspect once instead of once per sentence.
    # (Also removes the original's unused `text_lower` variable.)
    lowered = {aspect: aspect.lower() for aspect in aspects}

    for text in texts:
        # Naive sentence segmentation on '.'; misses '!'/'?' boundaries —
        # adequate for short reviews, TODO consider a real tokenizer.
        sentences = text.split('.')

        for aspect in aspects:
            needle = lowered[aspect]
            for sentence in sentences:
                if needle in sentence.lower():
                    blob = TextBlob(sentence)
                    aspect_sentiments[aspect].append({
                        "text": sentence.strip(),
                        "polarity": blob.sentiment.polarity,
                        "subjectivity": blob.sentiment.subjectivity
                    })

    # Aggregate aspect sentiments
    result = {}
    for aspect, sentiments in aspect_sentiments.items():
        if sentiments:
            polarities = [s["polarity"] for s in sentiments]
            result[aspect] = {
                "count": len(sentiments),
                "avg_polarity": np.mean(polarities),
                "positive_mentions": sum(1 for p in polarities if p > 0),
                "negative_mentions": sum(1 for p in polarities if p < 0),
                "examples": sentiments[:5]
            }
        else:
            result[aspect] = {"count": 0, "avg_polarity": 0.0}

    return result
602
+
603
+
604
def perform_text_similarity(
    data: pl.DataFrame,
    text_column: str,
    query_text: Optional[str] = None,
    method: str = "cosine",
    top_k: int = 10,
    use_embeddings: bool = False,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Dict[str, Any]:
    """
    Calculate text similarity using cosine, Jaccard, or semantic embeddings.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        query_text: Query text to find similar documents (if None, computes pairwise)
        method: Similarity method ('cosine', 'jaccard', 'semantic').
                'semantic' requires use_embeddings=True and the transformers
                package; otherwise it falls back to TF-IDF cosine with a
                warning (recorded under 'note' in the result).
        top_k: Number of top similar documents to return
        use_embeddings: Whether to use transformer embeddings (for semantic similarity)
        model_name: Model for semantic embeddings

    Returns:
        Dictionary containing similarity scores and top matches

    Raises:
        ValueError: If the text column is missing or the method is unknown.
    """
    print(f"🔍 Calculating text similarity using {method} method...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Extract text, normalizing nulls to empty strings
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]

    result = {
        "method": method,
        "n_documents": len(texts),
        "query_text": query_text,
        "similarities": []
    }

    # 'semantic' without embedding support previously fell through to the
    # misleading "Unknown method 'semantic'" error; fall back to TF-IDF
    # cosine instead and record the downgrade.
    if method == "semantic" and not (use_embeddings and TRANSFORMERS_AVAILABLE):
        print("⚠️ Semantic embeddings unavailable (need use_embeddings=True and transformers installed). Falling back to TF-IDF cosine.")
        method = "cosine"
        result["method"] = "cosine"
        result["note"] = "semantic requested but embeddings unavailable; used TF-IDF cosine"

    def _top_matches(scores) -> List[Dict[str, Any]]:
        # Format the top_k highest-scoring documents, best first.
        order = np.argsort(scores)[-top_k:][::-1]
        return [
            {
                "document_id": int(idx),
                "text": texts[idx][:200],
                "score": float(scores[idx])
            }
            for idx in order
        ]

    def _pairwise_stats(matrix) -> None:
        # Store the full matrix plus the mean over the strict upper triangle
        # (self-similarities and duplicates excluded).
        result["similarity_matrix"] = matrix.tolist()
        result["avg_similarity"] = float(np.mean(matrix[np.triu_indices_from(matrix, k=1)]))

    try:
        if method == "semantic":
            print(f"   Using semantic embeddings: {model_name}")

            # Load model and tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name)

            def get_embedding(text: str) -> np.ndarray:
                inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                # Mean pooling over the token dimension
                return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

            if query_text:
                query_embedding = get_embedding(query_text)
                text_embeddings = np.array([get_embedding(t) for t in texts])

                # Cosine similarity of the query against every document
                similarities = cosine_similarity([query_embedding], text_embeddings)[0]
                result["similarities"] = _top_matches(similarities)
            else:
                # Pairwise similarity
                text_embeddings = np.array([get_embedding(t) for t in texts])
                _pairwise_stats(cosine_similarity(text_embeddings))

        elif method == "cosine":
            print("   Using TF-IDF with cosine similarity...")

            vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

            if query_text:
                # Fit on query + corpus so both share one vocabulary
                all_texts = [query_text] + texts
                tfidf_matrix = vectorizer.fit_transform(all_texts)

                # Similarity between query (row 0) and all documents
                similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
                result["similarities"] = _top_matches(similarities)
            else:
                # Pairwise similarity
                tfidf_matrix = vectorizer.fit_transform(texts)
                _pairwise_stats(cosine_similarity(tfidf_matrix))

        elif method == "jaccard":
            print("   Using Jaccard similarity...")

            def jaccard_similarity(text1: str, text2: str) -> float:
                # Token-set overlap; whitespace tokenization, case-folded.
                set1 = set(text1.lower().split())
                set2 = set(text2.lower().split())
                intersection = len(set1.intersection(set2))
                union = len(set1.union(set2))
                return intersection / union if union > 0 else 0.0

            if query_text:
                similarities = [jaccard_similarity(query_text, text) for text in texts]
                result["similarities"] = _top_matches(similarities)
            else:
                # Pairwise similarity (symmetric, so compute each pair once)
                n = len(texts)
                similarity_matrix = np.zeros((n, n))
                for i in range(n):
                    for j in range(i+1, n):
                        sim = jaccard_similarity(texts[i], texts[j])
                        similarity_matrix[i, j] = sim
                        similarity_matrix[j, i] = sim

                _pairwise_stats(similarity_matrix)

        else:
            raise ValueError(f"Unknown method '{method}'. Use 'cosine', 'jaccard', or 'semantic'")

        print(f"✅ Similarity calculation complete!")
        if result.get("similarities"):
            print(f"   Top similarity score: {result['similarities'][0]['score']:.3f}")

        return result

    except Exception as e:
        print(f"❌ Error during similarity calculation: {str(e)}")
        raise