py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- py2ls/.git/index +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +105 -55
- py2ls/ml2ls.py +244 -110
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +9 -8
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/nl2ls.py
ADDED
@@ -0,0 +1,283 @@
|
|
1
|
+
from . import translator,ips,plot
|
2
|
+
import numpy as np
|
3
|
+
import pandas as pd
|
4
|
+
import matplotlib.pyplot as plt
|
5
|
+
|
6
|
+
|
7
|
+
def detect(text, method: str = "vader", nb_model=None, device=-1, overall_method="major", overall_threhold=0.8, overall_weight=None, plot_=True, verbose=True, **kwargs) -> dict:
    """
    Analyze the sentiment of a text or a list of texts using different methods.

    Parameters:
    - text (str or list of str): The text(s) to analyze. A ``list`` is analyzed item by
      item and then aggregated; anything else is analyzed as one single text.
    - method (str): The per-text method to use ('vader', 'textblob', 'naive_bayes').
    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (used if method='naive_bayes').
    - device (int): Device to run the model on (-1 for CPU, 0 for GPU).
    - overall_method (str): Aggregation method for a list input; forwarded to
      ``get_overall_results`` as ``method`` ('majority', 'average', 'mean',
      'threshold', 'weighted', 'detailed').
    - overall_threhold (float): Forwarded to ``get_overall_results`` as ``threshold``.
      (The parameter name is misspelled; it is kept for backward compatibility.)
    - overall_weight (Optional[dict]): Forwarded to ``get_overall_results`` as ``weight``.
    - plot_ (bool): For a list input, draw a pie chart of the per-text label counts.
    - verbose (bool): Allow printing of the available method names.
    - **kwargs: Forwarded unchanged to ``detect_single_text``
      (e.g. ``vectorizer`` for method='naive_bayes').

    Returns:
    - For a single text: the dict produced by ``detect_single_text``.
    - For a list: whatever ``get_overall_results`` returns for ``overall_method``
      (a dict, or a pandas DataFrame for 'detailed').
    """
    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
    # The helper is called even when verbose is False, exactly as before, because it
    # may keep internal state. NOTE(review): confirm the semantics of
    # ips.run_once_within(..., reverse=True) — it is invoked twice back to back.
    if ips.run_once_within(10, reverse=True) and verbose:
        print(f"methods: {methods}")

    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
    if ips.run_once_within(10, reverse=True) and verbose:
        print(f"overall_methods: {overall_methods}")

    # Anything that is not a list is treated as one single text.
    if not isinstance(text, list):
        return detect_single_text(text=text, method=method, nb_model=nb_model, device=device, **kwargs)

    # Analyze every text on its own, then aggregate.
    results = [
        detect_single_text(text_, method=method, nb_model=nb_model, device=device, **kwargs)
        for text_ in text
    ]
    res_overall = get_overall_results(
        results, method=overall_method, threshold=overall_threhold, weight=overall_weight
    )
    if plot_:
        # Use the exact aggregation name instead of relying on fuzzy matching of 'detail'.
        res_detail = get_overall_results(
            results, method="detailed", threshold=overall_threhold, weight=overall_weight
        )
        plot.pie(res_detail["label"].value_counts(), explode=None, verbose=False)
    return res_overall
|
48
|
+
|
49
|
+
|
50
|
+
def detect_single_text(text: str, method: str = "vader", nb_model=None, device=-1, **kwargs) -> dict:
    """
    Analyze the sentiment of a single text using the selected method.

    Parameters:
    - text (str): The text to analyze.
    - method (str): The method to use. It is resolved against the known method
      names with ``ips.strcmp`` before dispatching.
    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (used if method='naive_bayes').
    - device (int): Device passed to the ``transformers`` pipeline.
    - **kwargs: ``vectorizer`` (Optional[TfidfVectorizer]) — the vectorizer that was
      fitted together with ``nb_model``. If either ``nb_model`` or ``vectorizer`` is
      missing, a tiny built-in sample model is trained and used instead.

    Returns:
    - dict: Keys 'text', 'method' (as passed in), 'score', 'label' and 'language'.
      'score' and 'label' stay ``None`` when the chosen analysis raised; the error
      is printed rather than propagated.

    Raises:
    - ValueError: If the resolved method has no implementation branch.
    """
    result = {
        "text": text,
        "method": method,
        "score": None,
        "label": None,
        "language": None,
    }

    # The vectorizer arrives through **kwargs. Reading it up front avoids referencing
    # a local name before assignment when a pre-trained nb_model is supplied.
    vectorizer = kwargs.get("vectorizer")

    # Detect language for additional insights
    language = translator.detect_lang(text)
    result["language"] = language
    if language != "English" and method in ["vader", "textblob", "naive_bayes"]:
        print("Detected non-English language, results may be inaccurate.")

    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
    # NOTE(review): presumably ips.strcmp returns the closest entry of `methods` first.
    # If so, the "transformer" and "senta" branches below cannot match while their
    # entries carry the "(not ready)" suffix — confirm this is the intended way to
    # keep them disabled.
    method = ips.strcmp(method, methods)[0]

    if method == "vader":
        import nltk, os
        from nltk.sentiment import SentimentIntensityAnalyzer

        # Download the lexicon only if it is not in the first NLTK data directory.
        is_local = os.path.isfile(
            os.path.join(nltk.data.path[0], "sentiment", "vader_lexicon.zip")
        )
        if not is_local:
            nltk.download("vader_lexicon")
        try:
            sia = SentimentIntensityAnalyzer()
            scores = sia.polarity_scores(text)
            compound = scores["compound"]
            result["score"] = compound
            if compound >= 0.05:
                result["label"] = "Positive"
            elif compound <= -0.05:
                result["label"] = "Negative"
            else:
                result["label"] = "Neutral"
        except Exception as e:
            print(f"Error in VADER analysis: {e}")

    elif method == "textblob":
        from textblob import TextBlob

        try:
            polarity = TextBlob(text).sentiment.polarity
            result["score"] = polarity
            if polarity > 0:
                result["label"] = "Positive"
            elif polarity < 0:
                result["label"] = "Negative"
            else:
                result["label"] = "Neutral"
        except Exception as e:
            print(f"Error in TextBlob analysis: {e}")

    elif method == "naive_bayes":
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import TfidfVectorizer

        try:
            if nb_model is None or vectorizer is None:
                # Sample data for Naive Bayes training if model/vectorizer not provided
                sample_texts = [
                    "I love this product",
                    "I hate this product",
                    "It's okay, not great",
                    "Absolutely fantastic!",
                    "Not satisfied",
                ]
                sample_labels = [1, 0, 0, 1, 0]  # 1 = Positive, 0 = Negative

                # Train Naive Bayes model
                vectorizer = TfidfVectorizer()
                X_train_tfidf = vectorizer.fit_transform(sample_texts)
                nb_model = MultinomialNB()
                nb_model.fit(X_train_tfidf, sample_labels)

            transformed_text = vectorizer.transform([text])
            prediction = nb_model.predict(transformed_text)[0]
            # The score is the probability of the winning class, not a signed polarity.
            result["score"] = max(nb_model.predict_proba(transformed_text)[0])
            result["label"] = "Positive" if prediction == 1 else "Negative"

        except Exception as e:
            print(f"Error in Naive Bayes analysis: {e}")

    elif method == "transformer":
        try:
            from transformers import pipeline
            # Load pre-trained sentiment analysis pipeline with a Chinese model
            classifier = pipeline('sentiment-analysis', model='bert-base-chinese', device=device)
            analysis_result = classifier(text)
            result["score"] = analysis_result[0]['score']
            result["label"] = analysis_result[0]['label']
        except Exception as e:
            print(f"Error in Transformer analysis: {e}")

    elif method == "senta":
        from transformers import pipeline

        try:
            # Load the Senta model for sentiment analysis
            classifier = pipeline('sentiment-analysis', model='junnyu/senta', device=device)
            analysis_result = classifier(text)

            # The pipeline output is indexed at [0]: a single text was passed in.
            result["score"] = analysis_result[0]["score"]
            result["label"] = analysis_result[0]["label"]

        except Exception as e:
            print(f"Error in Senta analysis: {e}")

    else:
        print(
            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformers'"
        )
        raise ValueError(
            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformers'"
        )

    return result
|
181
|
+
|
182
|
+
def get_overall_results(results, method="majority", threshold=0.8, weight=None, verbose=False):
    """
    Aggregates sentiment analysis results based on the selected method.

    Parameters:
    - results (list): A list of sentiment analysis results, each being a dictionary
      with at least 'label' and 'score' (and 'method' for weighted aggregation).
    - method (str): The aggregation method to use ('majority', 'average', 'mean',
      'threshold', 'weighted', 'detailed'). It is resolved with ``ips.strcmp``.
    - threshold (float): Minimum share of results a label needs for the 'threshold' method.
    - weight (dict): Optional dictionary for weighted aggregation (method name as key
      and weight as value). Defaults to ``{"vader": 2}``; unlisted methods weigh 1.
    - verbose (bool): For 'detailed', print one "Label | Score" line per result.

    Returns:
    - dict: {'label': ...} for 'majority' and 'threshold';
      {'score': ..., 'label': ...} for 'average'/'mean' and 'weighted'.
      Values are ``None`` when there is nothing to aggregate.
    - pandas.DataFrame: one row per result for 'detailed'.

    Raises:
    - ValueError: If the resolved method is not one of the supported names.

    NOTE: the score-based methods map the aggregated score to a label using the
    cut-offs +0.05 / -0.05, i.e. they treat scores as signed polarity values.
    """
    from collections import Counter

    def label_from_score(score):
        """Map an aggregated score to a label using the +/-0.05 cut-offs."""
        if score > 0.05:
            return 'Positive'
        if score < -0.05:
            return 'Negative'
        return 'Neutral'

    def majority_voting(results):
        """Aggregates sentiment using majority voting."""
        labels = [result['label'] for result in results]
        if not labels:
            # Nothing to vote on; avoid IndexError on most_common(1)[0].
            return {"label": None}
        return {"label": Counter(labels).most_common(1)[0][0]}

    def average_score(results):
        """Aggregates sentiment by calculating the average score."""
        # Failed analyses leave 'score' as None; skip them instead of breaking sum().
        scores = [result['score'] for result in results if result['score'] is not None]
        if not scores:
            return {"score": None, "label": None}
        avg_score = sum(scores) / len(scores)
        return {"score": avg_score, "label": label_from_score(avg_score)}

    def confidence_threshold(results, threshold=0.8):
        """Aggregates sentiment based on a confidence threshold."""
        label_counts = Counter(result['label'] for result in results)
        total_results = len(results)

        for label, count in label_counts.items():
            if count / total_results >= threshold:
                return {"label": label}

        return {"label": 'Neutral'}  # If no label reaches the threshold, return neutral

    def weighted_average(results, weight=None):
        """Aggregates sentiment based on a weighted average."""
        if weight is None:
            weight = {"vader": 2}

        weighted_scores = 0
        total_weight = 0

        for result in results:
            if result['score'] is None:
                # Failed analysis: contributes neither score nor weight.
                continue
            model = result.get('method', 'default')
            model_weight = weight.get(model, 1)  # Default weight is 1 if model not in weight dict
            weighted_scores += result['score'] * model_weight
            total_weight += model_weight

        if total_weight == 0:
            # No usable scores, or all weights are zero: nothing to divide by.
            return {"score": None, "label": None}

        avg_weighted_score = weighted_scores / total_weight
        return {"score": avg_weighted_score, "label": label_from_score(avg_weighted_score)}

    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
    method = ips.strcmp(method, overall_methods)[0]

    if method == "majority":
        return majority_voting(results)

    if method in ["mean", "average"]:
        return average_score(results)

    if method == "threshold":
        return confidence_threshold(results, threshold)

    if method == "weighted":
        return weighted_average(results, weight)

    if method == "detailed":
        if verbose:
            for result in results:
                print(f"Label: {result.get('label')} | Score: {result.get('score')}")
        return pd.DataFrame(results)

    raise ValueError(f"Unknown method '{method}'. Available methods: 'majority', 'average', 'threshold', 'weighted', 'detailed'")
|
282
|
+
|
283
|
+
|