ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,761 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NLP & Text Analytics Tools
|
|
3
|
+
|
|
4
|
+
Advanced natural language processing tools for text analysis, topic modeling,
|
|
5
|
+
named entity recognition, sentiment analysis, and text similarity.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
import numpy as np
|
|
10
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
# Core NLP
|
|
15
|
+
try:
|
|
16
|
+
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
|
17
|
+
from sklearn.decomposition import LatentDirichletAllocation, NMF
|
|
18
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
19
|
+
except ImportError:
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
# Advanced NLP (optional)
|
|
23
|
+
try:
|
|
24
|
+
import spacy
|
|
25
|
+
SPACY_AVAILABLE = True
|
|
26
|
+
except ImportError:
|
|
27
|
+
SPACY_AVAILABLE = False
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from transformers import pipeline, AutoTokenizer, AutoModel
|
|
31
|
+
import torch
|
|
32
|
+
TRANSFORMERS_AVAILABLE = True
|
|
33
|
+
except ImportError:
|
|
34
|
+
TRANSFORMERS_AVAILABLE = False
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
from bertopic import BERTopic
|
|
38
|
+
BERTOPIC_AVAILABLE = True
|
|
39
|
+
except ImportError:
|
|
40
|
+
BERTOPIC_AVAILABLE = False
|
|
41
|
+
|
|
42
|
+
# Basic NLP
|
|
43
|
+
try:
|
|
44
|
+
from textblob import TextBlob
|
|
45
|
+
except ImportError:
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
import re
|
|
49
|
+
from collections import Counter
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def perform_topic_modeling(
    data: pl.DataFrame,
    text_column: str,
    n_topics: int = 5,
    method: str = "lda",
    n_top_words: int = 10,
    min_df: int = 2,
    max_df: float = 0.95,
    ngram_range: Tuple[int, int] = (1, 2),
    random_state: int = 42,
    **kwargs
) -> Dict[str, Any]:
    """
    Perform topic modeling on text data using LDA, NMF, or BERTopic.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        n_topics: Number of topics to extract
        method: Topic modeling method ('lda', 'nmf', 'bertopic')
        n_top_words: Number of top words per topic
        min_df: Minimum document frequency for terms (lda/nmf only)
        max_df: Maximum document frequency for terms (lda/nmf only)
        ngram_range: Range of n-grams to extract (lda/nmf only)
        random_state: Random state for reproducibility
        **kwargs: Additional parameters for the chosen method
            ('max_features'/'max_iter' for lda/nmf, BERTopic kwargs otherwise)

    Returns:
        Dictionary containing topics, document-topic distributions, and metrics.
        The 'topic_coherence' key is reserved and currently always None.

    Raises:
        ValueError: if the text column is missing, there are fewer non-empty
            documents than requested topics, the method is unknown, or
            method='bertopic' while BERTopic is not installed.
    """
    print(f"🔍 Performing topic modeling using {method.upper()}...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Coerce everything to str; None becomes "" so it is filtered out below.
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]

    # Keep only non-empty documents, remembering original row positions.
    valid_indices = [i for i, t in enumerate(texts) if len(t.strip()) > 0]
    texts_clean = [texts[i] for i in valid_indices]

    if len(texts_clean) < n_topics:
        raise ValueError(f"Not enough documents ({len(texts_clean)}) for {n_topics} topics")

    # BUG FIX: previously a missing BERTopic installation fell through to the
    # generic "Unknown method" error, which misleadingly listed 'bertopic'
    # as a valid choice. Fail fast with an actionable message instead.
    if method == "bertopic" and not BERTOPIC_AVAILABLE:
        raise ValueError(
            "method='bertopic' requested but BERTopic is not installed. "
            "Install with: pip install bertopic"
        )

    result = {
        "method": method,
        "n_topics": n_topics,
        "n_documents": len(texts_clean),
        "topics": [],
        "document_topics": None,
        "topic_coherence": None
    }

    try:
        if method == "bertopic":
            # BERTopic - transformer-based topic modeling
            print("   Using BERTopic (transformer-based)...")

            model = BERTopic(
                nr_topics=n_topics,
                language="english",
                calculate_probabilities=True,
                verbose=False,
                **kwargs
            )

            topics_assigned, probabilities = model.fit_transform(texts_clean)

            # Extract topic information
            topic_info = model.get_topic_info()

            for topic_id in range(n_topics):
                if topic_id in model.get_topics():
                    topic_words = model.get_topic(topic_id)[:n_top_words]
                    result["topics"].append({
                        "topic_id": topic_id,
                        "words": [word for word, score in topic_words],
                        "scores": [float(score) for word, score in topic_words],
                        "size": int(topic_info[topic_info['Topic'] == topic_id]['Count'].iloc[0])
                    })

            # Document-topic distributions
            result["document_topics"] = probabilities.tolist() if probabilities is not None else None
            # BUG FIX: BERTopic.fit_transform returns a plain Python list of
            # topic ids; calling .tolist() on it raised AttributeError.
            result["topic_assignments"] = np.asarray(topics_assigned).tolist()

        elif method in ["lda", "nmf"]:
            # Traditional topic modeling with sklearn
            print(f"   Using {method.upper()} with TF-IDF/Count vectorization...")

            # LDA operates on raw counts; NMF works best on TF-IDF weights.
            if method == "lda":
                vectorizer = CountVectorizer(
                    min_df=min_df,
                    max_df=max_df,
                    ngram_range=ngram_range,
                    stop_words='english',
                    max_features=kwargs.get('max_features', 1000)
                )
            else:  # nmf
                vectorizer = TfidfVectorizer(
                    min_df=min_df,
                    max_df=max_df,
                    ngram_range=ngram_range,
                    stop_words='english',
                    max_features=kwargs.get('max_features', 1000)
                )

            doc_term_matrix = vectorizer.fit_transform(texts_clean)
            feature_names = vectorizer.get_feature_names_out()

            # Topic modeling
            if method == "lda":
                model = LatentDirichletAllocation(
                    n_components=n_topics,
                    random_state=random_state,
                    max_iter=kwargs.get('max_iter', 20),
                    learning_method='online',
                    n_jobs=-1
                )
            else:  # nmf
                model = NMF(
                    n_components=n_topics,
                    random_state=random_state,
                    max_iter=kwargs.get('max_iter', 200),
                    init='nndsvda'
                )

            doc_topic_dist = model.fit_transform(doc_term_matrix)

            # Extract top words per topic from the components matrix.
            for topic_idx, topic in enumerate(model.components_):
                top_indices = topic.argsort()[-n_top_words:][::-1]
                top_words = [feature_names[i] for i in top_indices]
                top_scores = [float(topic[i]) for i in top_indices]

                result["topics"].append({
                    "topic_id": topic_idx,
                    "words": top_words,
                    "scores": top_scores,
                    # Size = number of documents whose dominant topic is this one.
                    "size": int((doc_topic_dist.argmax(axis=1) == topic_idx).sum())
                })

            # Document-topic distributions
            result["document_topics"] = doc_topic_dist.tolist()

            # Topic assignments (most probable topic per document)
            result["topic_assignments"] = doc_topic_dist.argmax(axis=1).tolist()

            # Perplexity / log-likelihood are only defined for LDA.
            if method == "lda":
                result["perplexity"] = float(model.perplexity(doc_term_matrix))
                result["log_likelihood"] = float(model.score(doc_term_matrix))

            result["vocabulary_size"] = len(feature_names)

        else:
            raise ValueError(f"Unknown method '{method}'. Use 'lda', 'nmf', or 'bertopic'")

        # Topic diversity: fraction of unique words across all topic word lists
        # (1.0 means no word is shared between topics).
        all_topic_words = set()
        total_topic_words = 0
        for topic in result["topics"]:
            all_topic_words.update(topic["words"])
            total_topic_words += len(topic["words"])

        result["topic_diversity"] = len(all_topic_words) / total_topic_words if total_topic_words > 0 else 0

        # Summary statistics
        result["summary"] = {
            "total_topics": len(result["topics"]),
            "avg_topic_size": float(np.mean([t["size"] for t in result["topics"]])),
            "topic_diversity": result["topic_diversity"]
        }

        print(f"✅ Topic modeling complete! Found {len(result['topics'])} topics")
        print(f"   Topic diversity: {result['topic_diversity']:.3f}")

        return result

    except Exception as e:
        print(f"❌ Error during topic modeling: {str(e)}")
        raise
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def perform_named_entity_recognition(
    data: pl.DataFrame,
    text_column: str,
    model: str = "en_core_web_sm",
    entity_types: Optional[List[str]] = None,
    min_confidence: float = 0.0
) -> Dict[str, Any]:
    """
    Perform named entity recognition to extract people, organizations, locations, etc.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        model: spaCy model to use ('en_core_web_sm', 'en_core_web_md', 'en_core_web_lg')
        entity_types: List of entity types to keep (e.g., ['PERSON', 'ORG', 'GPE']);
            if None, all types are kept
        min_confidence: Accepted for API compatibility but currently unused —
            spaCy's standard pipeline does not expose per-entity confidences.

    Returns:
        Dictionary containing extracted entities, counts, and statistics.
        When spaCy is not installed, falls back to basic regex matching
        (emails/URLs/phones) via ``_perform_ner_basic``.

    Raises:
        ValueError: if the text column is missing.
    """
    print(f"🔍 Performing named entity recognition with spaCy...")

    if not SPACY_AVAILABLE:
        # Fallback to basic pattern matching
        print("⚠️ spaCy not available. Using basic pattern matching...")
        return _perform_ner_basic(data, text_column)

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    try:
        # Load spaCy model; model weights are a separate download from the
        # spacy package itself, so a missing model raises OSError.
        try:
            nlp = spacy.load(model)
        except OSError:
            print(f"⚠️ Model '{model}' not found. Attempting to download...")
            import subprocess
            import sys
            # BUG FIX: use the running interpreter (sys.executable) instead of
            # whatever "python" resolves to on PATH, which may be a different
            # environment that spacy is not installed into.
            subprocess.run([sys.executable, "-m", "spacy", "download", model], check=True)
            nlp = spacy.load(model)

        # Extract text; None becomes "" and is skipped below.
        texts = data[text_column].to_list()
        texts = [str(t) if t is not None else "" for t in texts]

        # Process documents
        all_entities = []
        entity_counts = Counter()
        entity_by_type = {}

        print(f"   Processing {len(texts)} documents...")

        for doc_idx, text in enumerate(texts):
            if len(text.strip()) == 0:
                continue

            doc = nlp(text)

            for ent in doc.ents:
                # Filter by entity type if specified
                if entity_types and ent.label_ not in entity_types:
                    continue

                entity_info = {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "document_id": doc_idx
                }

                all_entities.append(entity_info)
                entity_counts[(ent.text, ent.label_)] += 1
                entity_by_type.setdefault(ent.label_, []).append(ent.text)

        # Aggregate results; entity_counts is capped at the 100 most frequent
        # (text, label) pairs to keep the payload bounded.
        result = {
            "total_entities": len(all_entities),
            "unique_entities": len(entity_counts),
            "entities": all_entities,
            "entity_counts": [
                {"text": text, "label": label, "count": count}
                for (text, label), count in entity_counts.most_common(100)
            ],
            "by_type": {}
        }

        # Statistics by entity type
        for entity_type, entities in entity_by_type.items():
            type_counter = Counter(entities)
            result["by_type"][entity_type] = {
                "total": len(entities),
                "unique": len(type_counter),
                "top_entities": [
                    {"text": text, "count": count}
                    for text, count in type_counter.most_common(10)
                ]
            }

        print(f"✅ NER complete! Found {result['total_entities']} entities")
        print(f"   Unique entities: {result['unique_entities']}")
        print(f"   Entity types: {', '.join(result['by_type'].keys())}")

        return result

    except Exception as e:
        print(f"❌ Error during NER: {str(e)}")
        raise
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _perform_ner_basic(data: pl.DataFrame, text_column: str) -> Dict[str, Any]:
|
|
355
|
+
"""Fallback NER using basic pattern matching when spaCy is not available."""
|
|
356
|
+
|
|
357
|
+
texts = data[text_column].to_list()
|
|
358
|
+
texts = [str(t) if t is not None else "" for t in texts]
|
|
359
|
+
|
|
360
|
+
# Basic patterns
|
|
361
|
+
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
|
|
362
|
+
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
|
|
363
|
+
phone_pattern = r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
|
|
364
|
+
|
|
365
|
+
emails = []
|
|
366
|
+
urls = []
|
|
367
|
+
phones = []
|
|
368
|
+
|
|
369
|
+
for text in texts:
|
|
370
|
+
emails.extend(re.findall(email_pattern, text))
|
|
371
|
+
urls.extend(re.findall(url_pattern, text))
|
|
372
|
+
phones.extend(re.findall(phone_pattern, text))
|
|
373
|
+
|
|
374
|
+
return {
|
|
375
|
+
"method": "basic_pattern_matching",
|
|
376
|
+
"total_entities": len(emails) + len(urls) + len(phones),
|
|
377
|
+
"by_type": {
|
|
378
|
+
"EMAIL": {"total": len(emails), "unique": len(set(emails)), "examples": list(set(emails))[:10]},
|
|
379
|
+
"URL": {"total": len(urls), "unique": len(set(urls)), "examples": list(set(urls))[:10]},
|
|
380
|
+
"PHONE": {"total": len(phones), "unique": len(set(phones)), "examples": list(set(phones))[:10]}
|
|
381
|
+
},
|
|
382
|
+
"note": "Install spaCy for advanced NER: pip install spacy && python -m spacy download en_core_web_sm"
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def analyze_sentiment_advanced(
    data: pl.DataFrame,
    text_column: str,
    method: str = "transformer",
    model_name: str = "distilbert-base-uncased-finetuned-sst-2-english",
    aspects: Optional[List[str]] = None,
    detect_emotions: bool = True
) -> Dict[str, Any]:
    """
    Perform advanced sentiment analysis with aspect-based sentiment and emotion detection.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        method: Analysis method ('transformer', 'textblob', 'vader'); falls
            back to TextBlob when the requested backend is unavailable
        model_name: Transformer model for sentiment analysis
        aspects: List of aspects for aspect-based sentiment (e.g., ['price', 'quality'])
        detect_emotions: Whether to detect emotions (transformer method only)

    Returns:
        Dictionary containing per-document sentiment records and aggregate
        statistics. The 'method' key in the result reflects the backend that
        actually ran, which may differ from the requested one after fallback.

    Raises:
        ValueError: if the text column is missing or contains no non-empty text.
    """
    print(f"🔍 Performing advanced sentiment analysis...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Extract text; drop empty documents.
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]
    texts_clean = [t for t in texts if len(t.strip()) > 0]

    # Guard early: every statistics branch below divides by len(texts_clean).
    if not texts_clean:
        raise ValueError(f"Column '{text_column}' contains no non-empty text")

    result = {
        "method": method,
        "n_documents": len(texts_clean),
        "sentiments": [],
        "statistics": {}
    }

    try:
        if method == "transformer" and TRANSFORMERS_AVAILABLE:
            print(f"   Using transformer model: {model_name}")

            # Sentiment analysis pipeline
            sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model=model_name,
                truncation=True,
                max_length=512
            )

            # Process in batches to bound memory use.
            batch_size = 32
            all_sentiments = []
            for start in range(0, len(texts_clean), batch_size):
                all_sentiments.extend(sentiment_pipeline(texts_clean[start:start + batch_size]))

            result["sentiments"] = [
                {
                    "label": s["label"],
                    "score": float(s["score"]),
                    "text": texts_clean[i][:100]  # First 100 chars
                }
                for i, s in enumerate(all_sentiments)
            ]

            # Emotion detection (best-effort; sentiment results stand on failure)
            if detect_emotions:
                try:
                    emotion_pipeline = pipeline(
                        "text-classification",
                        model="j-hartmann/emotion-english-distilroberta-base",
                        truncation=True,
                        max_length=512
                    )

                    emotions = []
                    for start in range(0, len(texts_clean), batch_size):
                        emotions.extend(emotion_pipeline(texts_clean[start:start + batch_size]))

                    result["emotions"] = [
                        {"emotion": e["label"], "score": float(e["score"])}
                        for e in emotions
                    ]
                    result["emotion_distribution"] = dict(Counter(e["label"] for e in emotions))

                except Exception as e:
                    print(f"⚠️ Emotion detection failed: {str(e)}")
                    result["emotions"] = None

        else:
            if method == "transformer":
                # BUG FIX: previously the fallback kept method == "transformer",
                # so the statistics step below looked for a "score" key that
                # TextBlob records do not have and raised KeyError.
                print("⚠️ transformers not available. Falling back to TextBlob.")
                method = "textblob"

            if method == "vader":
                try:
                    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
                    print("   Using VADER for sentiment analysis...")

                    analyzer = SentimentIntensityAnalyzer()
                    sentiments = []
                    for text in texts_clean:
                        scores = analyzer.polarity_scores(text)
                        # Standard VADER convention: |compound| <= 0.05 is neutral.
                        label = "POSITIVE" if scores['compound'] > 0.05 else "NEGATIVE" if scores['compound'] < -0.05 else "NEUTRAL"
                        sentiments.append({
                            "compound": scores['compound'],
                            "positive": scores['pos'],
                            "negative": scores['neg'],
                            "neutral": scores['neu'],
                            "label": label,
                            "text": text[:100]
                        })

                    result["sentiments"] = sentiments

                except ImportError:
                    print("⚠️ vaderSentiment not installed. Falling back to TextBlob.")
                    print("   Install with: pip install vaderSentiment>=3.3")
                    method = "textblob"

            if method == "textblob":
                print("   Using TextBlob for sentiment analysis...")

                sentiments = []
                for text in texts_clean:
                    blob = TextBlob(text)
                    sentiments.append({
                        "polarity": blob.sentiment.polarity,
                        "subjectivity": blob.sentiment.subjectivity,
                        "label": "POSITIVE" if blob.sentiment.polarity > 0 else "NEGATIVE" if blob.sentiment.polarity < 0 else "NEUTRAL",
                        "text": text[:100]
                    })

                result["sentiments"] = sentiments

        # Record which backend actually produced the sentiment records.
        result["method"] = method

        # Aspect-based sentiment
        if aspects:
            print(f"   Analyzing aspect-based sentiment for: {', '.join(aspects)}")
            result["aspect_sentiments"] = _extract_aspect_sentiments(texts_clean, aspects)

        # Statistics keyed on the record shape of the backend that ran.
        if method == "transformer":
            sentiment_counts = Counter(s["label"] for s in result["sentiments"])
            result["statistics"] = {
                "sentiment_distribution": dict(sentiment_counts),
                "positive_ratio": sentiment_counts.get("POSITIVE", 0) / len(texts_clean),
                "negative_ratio": sentiment_counts.get("NEGATIVE", 0) / len(texts_clean),
                "avg_confidence": np.mean([s["score"] for s in result["sentiments"]])
            }
        elif method == "vader":
            # BUG FIX: the original else-branch assumed a "polarity" key and
            # crashed with KeyError on VADER records; use the compound score.
            compounds = [s["compound"] for s in result["sentiments"]]
            label_counts = Counter(s["label"] for s in result["sentiments"])
            result["statistics"] = {
                "sentiment_distribution": dict(label_counts),
                "avg_compound": float(np.mean(compounds)),
                "std_compound": float(np.std(compounds)),
                "positive_ratio": label_counts.get("POSITIVE", 0) / len(compounds),
                "negative_ratio": label_counts.get("NEGATIVE", 0) / len(compounds),
                "neutral_ratio": label_counts.get("NEUTRAL", 0) / len(compounds)
            }
        else:  # textblob
            polarities = [s["polarity"] for s in result["sentiments"]]
            result["statistics"] = {
                "avg_polarity": np.mean(polarities),
                "std_polarity": np.std(polarities),
                "positive_ratio": sum(1 for p in polarities if p > 0) / len(polarities),
                "negative_ratio": sum(1 for p in polarities if p < 0) / len(polarities),
                "neutral_ratio": sum(1 for p in polarities if p == 0) / len(polarities)
            }

        print(f"✅ Sentiment analysis complete!")
        print(f"   Distribution: {result['statistics'].get('sentiment_distribution', 'N/A')}")

        return result

    except Exception as e:
        print(f"❌ Error during sentiment analysis: {str(e)}")
        raise
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _extract_aspect_sentiments(texts: List[str], aspects: List[str]) -> Dict[str, Any]:
    """Extract sentiment for specific aspects in text.

    For every aspect, collects the (dot-split) sentences that mention it,
    scores each with TextBlob, and returns per-aspect aggregates.
    """

    # Collect scored sentences per aspect across all documents.
    mentions: Dict[str, List[Dict[str, Any]]] = {aspect: [] for aspect in aspects}

    for text in texts:
        for aspect in aspects:
            needle = aspect.lower()
            # Naive sentence segmentation on '.' — matches the rest of the module.
            for sentence in text.split('.'):
                if needle not in sentence.lower():
                    continue
                scored = TextBlob(sentence)
                mentions[aspect].append({
                    "text": sentence.strip(),
                    "polarity": scored.sentiment.polarity,
                    "subjectivity": scored.sentiment.subjectivity
                })

    # Aggregate per aspect; aspects with no mentions get a minimal record.
    summary: Dict[str, Any] = {}
    for aspect, hits in mentions.items():
        if not hits:
            summary[aspect] = {"count": 0, "avg_polarity": 0.0}
            continue
        polarity_values = [h["polarity"] for h in hits]
        summary[aspect] = {
            "count": len(hits),
            "avg_polarity": np.mean(polarity_values),
            "positive_mentions": sum(1 for p in polarity_values if p > 0),
            "negative_mentions": sum(1 for p in polarity_values if p < 0),
            "examples": hits[:5]
        }

    return summary
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def perform_text_similarity(
    data: pl.DataFrame,
    text_column: str,
    query_text: Optional[str] = None,
    method: str = "cosine",
    top_k: int = 10,
    use_embeddings: bool = False,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> Dict[str, Any]:
    """
    Calculate text similarity using cosine, Jaccard, or semantic embeddings.

    Args:
        data: Input DataFrame
        text_column: Column containing text data
        query_text: Query text to find similar documents (if None, computes
            the full pairwise similarity matrix instead)
        method: Similarity method ('cosine', 'jaccard', 'semantic')
        top_k: Number of top similar documents to return (query mode only)
        use_embeddings: Whether to use transformer embeddings; required for
            'semantic' — without it (or without transformers installed) the
            method falls back to TF-IDF cosine similarity
        model_name: Model for semantic embeddings

    Returns:
        Dictionary containing 'similarities' (query mode) or
        'similarity_matrix' and 'avg_similarity' (pairwise mode).

    Raises:
        ValueError: if the text column is missing or the method is unknown.
    """
    print(f"🔍 Calculating text similarity using {method} method...")

    # Validate input
    if text_column not in data.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame")

    # Extract text; None becomes "".
    texts = data[text_column].to_list()
    texts = [str(t) if t is not None else "" for t in texts]

    result = {
        "method": method,
        "n_documents": len(texts),
        "query_text": query_text,
        "similarities": []
    }

    def _top_matches(scores) -> List[Dict[str, Any]]:
        # Shared top-k formatting for every query-mode branch.
        scores = np.asarray(scores, dtype=float)
        order = scores.argsort()[-top_k:][::-1]
        return [
            {
                "document_id": int(idx),
                "text": texts[idx][:200],
                "score": float(scores[idx])
            }
            for idx in order
        ]

    def _store_matrix(matrix) -> None:
        # Shared pairwise-mode summary: full matrix plus the mean over the
        # strict upper triangle (each unordered pair counted once).
        result["similarity_matrix"] = matrix.tolist()
        result["avg_similarity"] = float(np.mean(matrix[np.triu_indices_from(matrix, k=1)]))

    try:
        # BUG FIX: 'semantic' without embeddings previously fell through to the
        # final else and raised "Unknown method 'semantic'" even though that
        # error message itself listed 'semantic' as a valid choice. Fall back
        # to TF-IDF cosine instead, and record the method actually used.
        if method == "semantic" and not (use_embeddings and TRANSFORMERS_AVAILABLE):
            print("⚠️ Semantic embeddings unavailable (need use_embeddings=True and transformers installed). Falling back to TF-IDF cosine.")
            method = "cosine"
            result["method"] = method

        if method == "semantic":
            print(f"   Using semantic embeddings: {model_name}")

            # Load model and tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name)

            def get_embedding(text: str) -> np.ndarray:
                inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                # Mean pooling over the token dimension.
                return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

            text_embeddings = np.array([get_embedding(t) for t in texts])

            if query_text:
                query_embedding = get_embedding(query_text)
                similarities = cosine_similarity([query_embedding], text_embeddings)[0]
                result["similarities"] = _top_matches(similarities)
            else:
                _store_matrix(cosine_similarity(text_embeddings))

        elif method == "cosine":
            print("   Using TF-IDF with cosine similarity...")

            vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

            if query_text:
                # Fit on query + corpus so both share one vocabulary.
                tfidf_matrix = vectorizer.fit_transform([query_text] + texts)
                similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
                result["similarities"] = _top_matches(similarities)
            else:
                tfidf_matrix = vectorizer.fit_transform(texts)
                _store_matrix(cosine_similarity(tfidf_matrix))

        elif method == "jaccard":
            print("   Using Jaccard similarity...")

            def jaccard_similarity(text1: str, text2: str) -> float:
                # Token-set overlap; defined as 0.0 when both texts are empty.
                set1 = set(text1.lower().split())
                set2 = set(text2.lower().split())
                union = len(set1 | set2)
                return len(set1 & set2) / union if union > 0 else 0.0

            if query_text:
                similarities = [jaccard_similarity(query_text, text) for text in texts]
                result["similarities"] = _top_matches(similarities)
            else:
                n = len(texts)
                similarity_matrix = np.zeros((n, n))
                for i in range(n):
                    for j in range(i + 1, n):
                        sim = jaccard_similarity(texts[i], texts[j])
                        similarity_matrix[i, j] = sim
                        similarity_matrix[j, i] = sim
                _store_matrix(similarity_matrix)

        else:
            raise ValueError(f"Unknown method '{method}'. Use 'cosine', 'jaccard', or 'semantic'")

        print(f"✅ Similarity calculation complete!")
        if result.get("similarities"):
            print(f"   Top similarity score: {result['similarities'][0]['score']:.3f}")

        return result

    except Exception as e:
        print(f"❌ Error during similarity calculation: {str(e)}")
        raise
|