py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/usages_sns.json +6 -1
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +496 -138
- py2ls/ml2ls.py +994 -288
- py2ls/netfinder.py +16 -20
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +1244 -158
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/METADATA +5 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/RECORD +17 -14
- py2ls/data/usages_pd copy.json +0 -1105
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.26.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)


 def find_domain(links):
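`filter_links` now passes its result through `ips.unique` before returning, so a URL collected several times from the same page is only reported once. The internals of `ips.unique` are not shown in this diff; below is a minimal sketch of an order-preserving de-duplication, assuming that is roughly what it does:

    def unique_keep_order(items):
        # Drop duplicates while preserving first-seen order,
        # unlike set(), which would scramble the link order.
        seen = set()
        return [x for x in items if not (x in seen or seen.add(x))]

    links = ["https://a.io/x.pdf", "https://a.io/y.pdf", "https://a.io/x.pdf"]
    print(unique_keep_order(links))  # ['https://a.io/x.pdf', 'https://a.io/y.pdf']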
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,  # use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(

    from requests.exceptions import ChunkedEncodingError, ConnectionError

-    if verbose:
+    if verbose and ips.run_once_within():
        print(
            "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
        )
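The usage banner in `downloader` is now gated by `ips.run_once_within()`, which, judging from its uses here and in nl2ls.py below, returns True at most once per time window so that repeated calls do not spam the console. A hypothetical stand-in with that semantics (the real helper lives in py2ls.ips and its signature may differ):

    import time

    _last_called = {}

    def run_once_within(seconds=60, key="default"):
        # Return True at most once per `seconds` for a given key.
        now = time.time()
        if now - _last_called.get(key, 0.0) >= seconds:
            _last_called[key] = now
            return True
        return False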
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith(ext):
+        if not fname.endswith(ext):  # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname

     def check_and_modify_filename(directory, filename):
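`fname_corrector` gains a fallback: when stripping the extension leaves an empty stem (for example a bare ".pdf"), the file is named after the current time instead. Note that a name that does not already end with the extension is still truncated by len(ext) characters before the extension is re-appended, so short stems can be lost. A standalone copy of the new logic for experimenting:

    from datetime import datetime

    def fname_corrector(fname, ext):
        if not ext.startswith("."):
            ext = "." + ext
        if not fname.endswith(ext):
            fname = fname[: -len(ext)] + ext
        if not any(fname[: -len(ext)]):  # empty stem, e.g. ".pdf"
            fname = datetime.now().strftime("%H%M%S") + ext
        return fname

    print(fname_corrector("report.PDF", "pdf"))  # report.pdf
    print(fname_corrector(".pdf", "pdf"))        # e.g. 142530.pdf (timestamp)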
@@ -784,8 +787,8 @@ def downloader(
            kind[i] = "." + kind[i]
    file_links_all = []
    for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
        if isinstance(url, str):
            if any(ext in url for ext in kind):
                file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
            if contains is not None:
                file_links = filter_links(links_all, contains=contains + kind_)
            else:
-                file_links = links_all
+                file_links = filter_links(links_all, contains=kind_)  # links_all
        elif isinstance(url, list):
            links_all = url
            if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                file_links = filter_links(links_all, contains=contains + kind_)
            else:
                file_links = filter_links(links_all, contains=kind_)
+        file_links = ips.unique(file_links)
        if verbose:
            if file_links:
                from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
            file_links_all = [file_links]
        elif isinstance(file_links, list):
            file_links_all.extend(file_links)
+    file_links_all = ips.unique(file_links_all)
    if dir_save:
        if rm_folder:
            ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                    )
                if ext is None:
                    ext = kind_
-
+
                if ext:
                    corrected_fname = fname_corrector(fnames[idx], ext)
                    corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                        datetime.now().strftime("%y%m%d_%H%M%S_")
                        + corrected_fname
                    )
-                    fpath_tmp = os.path.join(dir_save, corrected_fname)
+                fpath_tmp = os.path.join(dir_save, corrected_fname)
                with open(fpath_tmp, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # Filter out keep-alive chunks
                            file.write(chunk)
                if verbose:
-                    print(f"Done
+                    print(f"Done⤵{fnames[idx]}")
                else:
                    if verbose:
                        print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(

        if itry == n_try:
            print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+


def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
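Taken together, the netfinder changes make `downloader` de-duplicate candidate links at three points (inside `filter_links`, per extension kind, and over the accumulated list) and switch the `booster` default to True. Note also that the commented-out str-to-list conversion means `contains` is now expected to be a plain string, since it is concatenated directly with the extension (`contains + kind_`). A usage sketch against the new defaults (URL and save path are illustrative):

    from py2ls import netfinder

    # booster=True is now the default, so linked pages are also searched;
    # kind/contains filter candidate URLs before downloading.
    netfinder.downloader(
        url="https://example.com/reports",  # illustrative
        dir_save="downloads",               # illustrative
        kind=[".pdf"],
        contains="2024",
        verbose=True,
    )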
py2ls/nl2ls.py
ADDED
@@ -0,0 +1,283 @@
+from . import translator, ips, plot
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def detect(text, method: str = "vader", nb_model=None, vectorizer=None, device=-1, overall_method="majority", overall_threshold=0.8, overall_weight=None, plot_=True, verbose=True, **kwargs) -> dict:
+    """
+    Analyze the sentiment of a text or a list of texts using different methods.
+
+    Parameters:
+    - text (str or list of str): The text(s) to analyze. Can be a single text or a list of texts.
+    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformer', 'senta').
+    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (required if method='naive_bayes').
+    - vectorizer (Optional[TfidfVectorizer]): Vectorizer trained with the Naive Bayes model (required if method='naive_bayes').
+    - device (int): Device to run the model on (-1 for CPU, 0 for GPU).
+    - transformer_model_name (str): Transformer model name for the 'transformer' method.
+
+    Returns:
+    - dict: A dictionary with sentiment score, sentiment label, analysis method, and language.
+    """
+    result = {
+        "method": method,
+        "score": None,
+        "label": None,
+        "language": None,
+    }
+
+    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
+    if ips.run_once_within(10, reverse=True) and verbose:
+        print(f"methods: {methods}")
+
+    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
+    if ips.run_once_within(10, reverse=True) and verbose:
+        print(f"overall_methods: {overall_methods}")
+    # If the input is a list of texts, loop through each one
+    if isinstance(text, list):
+        results = []
+        for text_ in text:
+            results.append(detect_single_text(text_, method=method, nb_model=nb_model, vectorizer=vectorizer, device=device, **kwargs))
+        res_overall = get_overall_results(results, method=overall_method, threshold=overall_threshold, weight=overall_weight)
+        if plot_:
+            res_detail = get_overall_results(results, method='detailed', threshold=overall_threshold, weight=overall_weight)
+            plot.pie(res_detail["label"].value_counts(), explode=None, verbose=False)
+        return res_overall
+    else:
+        return detect_single_text(text=text, method=method, nb_model=nb_model, vectorizer=vectorizer, device=device, **kwargs)
+
+
+def detect_single_text(text: str, method: str = "vader", nb_model=None, vectorizer=None, device=-1, **kwargs) -> dict:
+    """
+    Analyze the sentiment of a text using different methods.
+
+    Parameters:
+    - text (str): The text to analyze.
+    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformer').
+    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (required if method='naive_bayes').
+    - vectorizer (Optional[TfidfVectorizer]): Vectorizer trained with the Naive Bayes model (required if method='naive_bayes').
+    - transformer_model_name (str): Transformer model name for the 'transformer' method.
+
+    Returns:
+    - dict: A dictionary with sentiment score, sentiment label, analysis method, and language.
+    """
+    result = {
+        "text": text,
+        "method": method,
+        "score": None,
+        "label": None,
+        "language": None,
+    }
+
+    # Detect language for additional insights
+    language = translator.detect_lang(text)
+    result["language"] = language
+    if language != "English" and method in ["vader", "textblob", "naive_bayes"]:
+        print("Detected non-English language, results may be inaccurate.")
+    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
+    method = ips.strcmp(method, methods)[0]
+    if method == "vader":
+        import nltk, os
+        from nltk.sentiment import SentimentIntensityAnalyzer
+
+        # check whether the lexicon is already downloaded
+        is_local = os.path.isfile(
+            os.path.join(nltk.data.path[0], "sentiment", "vader_lexicon.zip")
+        )
+        if not is_local:
+            nltk.download("vader_lexicon")
+        try:
+            sia = SentimentIntensityAnalyzer()
+            scores = sia.polarity_scores(text)
+            result["score"] = scores["compound"]
+            result["label"] = (
+                "Positive"
+                if scores["compound"] >= 0.05
+                else "Negative" if scores["compound"] <= -0.05 else "Neutral"
+            )
+        except Exception as e:
+            print(f"Error in VADER analysis: {e}")
+
+    elif method == "textblob":
+        from textblob import TextBlob
+
+        try:
+            blob = TextBlob(text)
+            polarity = blob.sentiment.polarity
+            result["score"] = polarity
+            result["label"] = (
+                "Positive"
+                if polarity > 0
+                else "Negative" if polarity < 0 else "Neutral"
+            )
+        except Exception as e:
+            print(f"Error in TextBlob analysis: {e}")
+
+    elif method == "naive_bayes":
+        from sklearn.naive_bayes import MultinomialNB
+        from sklearn.feature_extraction.text import TfidfVectorizer
+
+        try:
+            if nb_model is None or vectorizer is None:
+                from sklearn.model_selection import train_test_split
+
+                # Sample data for Naive Bayes training if model not provided
+                sample_texts = [
+                    "I love this product",
+                    "I hate this product",
+                    "It's okay, not great",
+                    "Absolutely fantastic!",
+                    "Not satisfied",
+                ]
+                sample_labels = [1, 0, 0, 1, 0]  # 1 = Positive, 0 = Negative
+
+                # Train Naive Bayes model
+                vectorizer = TfidfVectorizer()
+                X_train_tfidf = vectorizer.fit_transform(sample_texts)
+                nb_model = MultinomialNB()
+                nb_model.fit(X_train_tfidf, sample_labels)
+
+            transformed_text = vectorizer.transform([text])
+            prediction = nb_model.predict(transformed_text)[0]
+            result["score"] = max(nb_model.predict_proba(transformed_text)[0])
+            result["label"] = "Positive" if prediction == 1 else "Negative"
+
+        except Exception as e:
+            print(f"Error in Naive Bayes analysis: {e}")
+    elif method == "transformer":
+        try:
+            from transformers import pipeline
+            # Load a pre-trained sentiment analysis pipeline with a Chinese model
+            classifier = pipeline('sentiment-analysis', model='bert-base-chinese', device=device)
+            analysis_result = classifier(text)
+            result["score"] = analysis_result[0]['score']
+            result["label"] = analysis_result[0]['label']
+        except Exception as e:
+            print(f"Error in Transformer analysis: {e}")
+    elif method == "senta":
+        from transformers import pipeline
+
+        try:
+            # Load the Senta model for sentiment analysis
+            classifier = pipeline('sentiment-analysis', model='junnyu/senta', device=device)
+            analysis_result = classifier(text)
+
+            # Senta output is a list with one result (single text input)
+            result["score"] = analysis_result[0]["score"]
+            result["label"] = analysis_result[0]["label"]
+
+        except Exception as e:
+            print(f"Error in Senta analysis: {e}")
+
+    else:
+        print(
+            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformer'"
+        )
+        raise ValueError(
+            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformer'"
+        )
+
+    return result
+
+def get_overall_results(results, method="majority", threshold=0.8, weight=None, verbose=False):
+    """
+    Aggregates sentiment analysis results based on the selected method.
+
+    Parameters:
+    - results (list): A list of sentiment analysis results, each being a dictionary.
+    - method (str): The aggregation method to use ('majority', 'average', 'threshold', 'weighted', 'detailed').
+    - threshold (float): Confidence threshold for the 'threshold' method.
+    - weight (dict): Optional dictionary for weighted aggregation (e.g., method name as key and weight as value).
+
+    Returns:
+    - dict: Aggregated sentiment result with final label and score.
+    """
+    from collections import Counter
+    def majority_voting(results):
+        """Aggregates sentiment using majority voting."""
+        labels = [result['label'] for result in results]
+        label_counts = Counter(labels)
+        final_label = label_counts.most_common(1)[0][0]  # Get the most common label
+        return {"label": final_label}
+
+
+    def average_score(results):
+        """Aggregates sentiment by calculating the average score."""
+        scores = [result['score'] for result in results]
+        avg_score = sum(scores) / len(scores)
+
+        if avg_score > 0.05:
+            label = 'Positive'
+        elif avg_score < -0.05:
+            label = 'Negative'
+        else:
+            label = 'Neutral'
+
+        return {"score": avg_score, "label": label}
+
+
+    def confidence_threshold(results, threshold=0.8):
+        """Aggregates sentiment based on a confidence threshold."""
+        labels = [result['label'] for result in results]
+        label_counts = Counter(labels)
+        total_results = len(results)
+
+        for label, count in label_counts.items():
+            if count / total_results >= threshold:
+                return {"label": label}
+
+        return {"label": 'Neutral'}  # If no label exceeds the threshold, return neutral
+
+
+    def weighted_average(results, weight=None):
+        """Aggregates sentiment based on a weighted average."""
+        if weight is None:
+            weight = {"vader": 2}
+
+        weighted_scores = 0
+        total_weight = 0
+
+        for result in results:
+            model = result.get('method', 'default')
+            model_weight = weight.get(model, 1)  # Default weight is 1 if method not in weight dict
+            weighted_scores += result['score'] * model_weight
+            total_weight += model_weight
+
+        avg_weighted_score = weighted_scores / total_weight
+
+        # Assign label based on weighted average score
+        if avg_weighted_score > 0.05:
+            label = 'Positive'
+        elif avg_weighted_score < -0.05:
+            label = 'Negative'
+        else:
+            label = 'Neutral'
+
+        return {"score": avg_weighted_score, "label": label}
+
+    def detailed_output(results, verbose=False):
+        """Prints the detailed sentiment results."""
+        for result in results:
+            if verbose:
+                print(f"Label: {result['label']} | Score: {result['score']}")
+        return {"detailed_results": results}
+    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
+    method = ips.strcmp(method, overall_methods)[0]
+    if method == "majority":
+        return majority_voting(results)
+
+    elif method in ["mean", "average"]:
+        return average_score(results)
+
+    elif method == "threshold":
+        return confidence_threshold(results, threshold)
+
+    elif method == "weighted":
+        return weighted_average(results, weight)
+
+    elif method == "detailed":
+        return pd.DataFrame(results)
+    else:
+        raise ValueError(f"Unknown method '{method}'. Available methods: 'majority', 'average', 'threshold', 'weighted', 'detailed'")
+
+
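nl2ls.detect accepts either a single string or a list of strings; a list is scored item by item with detect_single_text and then folded into one verdict by get_overall_results ('majority' takes the most common label, 'average'/'mean' averages compound scores, 'weighted' computes sum(score_i * w_i) / sum(w_i) with per-method weights, and 'threshold' requires one label to reach the given share). A usage sketch (the first VADER run downloads the NLTK lexicon; output values are illustrative):

    from py2ls import nl2ls

    # Single text: returns {"text", "method", "score", "label", "language"}.
    print(nl2ls.detect("I love this product", method="vader"))

    # A list of texts: per-item results are aggregated; plot_=False skips
    # the pie chart of label counts that detect() would otherwise draw.
    reviews = ["Absolutely fantastic!", "Not satisfied", "It's okay, not great"]
    print(nl2ls.detect(reviews, method="vader", overall_method="majority", plot_=False))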