py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

py2ls/netfinder.py CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
         )
         if condition:
             filtered_links.append(link)
-    return filtered_links
+    return ips.unique(filtered_links)
 
 
 def find_domain(links):
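
`filter_links` now deduplicates its result list via `ips.unique` before returning. The helper lives in `py2ls.ips` and its implementation is not part of this diff; a minimal sketch of the behavior the call sites appear to rely on (deduplication that, unlike `set()`, preserves first-seen order) might look like:

# Hypothetical stand-in for ips.unique (the real implementation is not shown here):
# drop duplicate links while keeping each first occurrence in its original position.
def unique(items):
    seen = set()
    out = []
    for item in items:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out

assert unique(["a.pdf", "b.pdf", "a.pdf"]) == ["a.pdf", "b.pdf"]
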
@@ -717,7 +717,7 @@ def downloader(
     kind=[".pdf"],
     contains=None,
     rm_folder=False,
-    booster=False,
+    booster=True,# use find_links
     verbose=True,
     timeout=30,
     n_try=3,
@@ -726,7 +726,7 @@ def downloader(
 
     from requests.exceptions import ChunkedEncodingError, ConnectionError
 
-    if verbose:
+    if verbose and ips.run_once_within():
         print(
             "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
         )
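
The usage banner is now gated on `ips.run_once_within()` as well as `verbose`, so repeated `downloader` calls no longer reprint the hint every time. The helper's exact semantics are defined in `py2ls.ips`, not in this diff; a rough sketch of the kind of time-window guard assumed here:

import time

_last = {}

def run_once_within(seconds=60, key="default"):
    # Assumed semantics: return True at most once per `seconds` for a given key.
    now = time.time()
    if now - _last.get(key, 0.0) >= seconds:
        _last[key] = now
        return True
    return False

if run_once_within():
    print("usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)")
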
@@ -734,8 +734,11 @@ def downloader(
     def fname_corrector(fname, ext):
         if not ext.startswith("."):
             ext = "." + ext
-        if not fname.endswith("ext"): # if not ext in fname:
+        if not fname.endswith(ext): # if not ext in fname:
             fname = fname[: -len(ext)] + ext
+        if not any(fname[: -len(ext)]):
+            from datetime import datetime
+            fname = datetime.now().strftime("%H%M%S") + ext
         return fname
 
     def check_and_modify_filename(directory, filename):
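
`fname_corrector` gets two fixes: the quoted literal `"ext"` becomes the variable `ext`, so the suffix check now tests the actual extension instead of the three characters "ext", and a name whose stem is empty falls back to a time-of-day stamp. The corrected helper, lifted out of `downloader` so it runs standalone:

from datetime import datetime

def fname_corrector(fname, ext):
    if not ext.startswith("."):
        ext = "." + ext
    if not fname.endswith(ext):  # was endswith("ext"), which only matched a literal "ext"
        fname = fname[: -len(ext)] + ext
    if not any(fname[: -len(ext)]):  # empty stem: name the file by current time
        fname = datetime.now().strftime("%H%M%S") + ext
    return fname

print(fname_corrector("paper.pdf", "pdf"))  # -> paper.pdf (unchanged)
print(fname_corrector(".pdf", ".pdf"))      # -> e.g. 142305.pdf (timestamped stem)
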
@@ -784,8 +787,8 @@ def downloader(
             kind[i] = "." + kind[i]
     file_links_all = []
     for kind_ in kind:
-        if isinstance(contains, str):
-            contains = [contains]
+        # if isinstance(contains, str):
+        #     contains = [contains]
         if isinstance(url, str):
             if any(ext in url for ext in kind):
                 file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
             if contains is not None:
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
-                file_links = links_all # filter_links(links_all, contains=kind_)
+                file_links = filter_links(links_all, contains=kind_)#links_all #
         elif isinstance(url, list):
             links_all = url
             if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
                 file_links = filter_links(links_all, contains=contains + kind_)
             else:
                 file_links = filter_links(links_all, contains=kind_)
+        file_links=ips.unique(file_links)
         if verbose:
             if file_links:
                 from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
             file_links_all = [file_links]
         elif isinstance(file_links, list):
             file_links_all.extend(file_links)
+        file_links_all=ips.unique(file_links_all)
     if dir_save:
         if rm_folder:
             ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
                     )
                     if ext is None:
                         ext = kind_
-                    print("ehereerere", ext)
+
                     if ext:
                         corrected_fname = fname_corrector(fnames[idx], ext)
                         corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
                                 datetime.now().strftime("%y%m%d_%H%M%S_")
                                 + corrected_fname
                             )
-                        fpath_tmp = os.path.join(dir_save, corrected_fname)
+                        fpath_tmp = os.path.join(dir_save, corrected_fname)
                         with open(fpath_tmp, "wb") as file:
                             for chunk in response.iter_content(chunk_size=8192):
                                 if chunk: # Filter out keep-alive chunks
                                     file.write(chunk)
                         if verbose:
-                            print(f"Done! {fnames[idx]}")
+                            print(f"Done{fnames[idx]}")
                     else:
                         if verbose:
                             print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
 
         if itry == n_try:
             print(f"Failed to download {file_link} after {n_try} attempts.")
-
-    # print(f"\n{len(fnames)} files were downloaded:")
-    if verbose:
-        from pprint import pp
-
-        if corrected_fname:
-            pp(corrected_fname)
-            print(f"\n\nsaved @:\n{dir_save}")
-        else:
-            pp(fnames)
+
 
 
 def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
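
With the verbose per-file summary removed, failure reporting now rests entirely on the retry loop, which gives each link `n_try` attempts and prints a message once they are exhausted. A self-contained sketch of that pattern as the imports and the `itry == n_try` check suggest it (the helper name here is hypothetical):

import requests
from requests.exceptions import ChunkedEncodingError, ConnectionError

def fetch_with_retry(file_link, n_try=3, timeout=30):  # hypothetical helper
    for itry in range(1, n_try + 1):
        try:
            response = requests.get(file_link, stream=True, timeout=timeout)
            response.raise_for_status()
            return response
        except (ChunkedEncodingError, ConnectionError):
            if itry == n_try:
                print(f"Failed to download {file_link} after {n_try} attempts.")
    return None
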
py2ls/nl2ls.py ADDED
@@ -0,0 +1,283 @@
+from . import translator,ips,plot
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def detect(text, method: str = "vader", nb_model=None, device=-1,overall_method="major",overall_threhold=0.8,overall_weight=None,plot_=True,verbose=True, **kwargs) -> dict:
+    """
+    Analyze the sentiment of a text or a list of texts using different methods.
+
+    Parameters:
+    - text (str or list of str): The text(s) to analyze. Can be a single text or a list of texts.
+    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformers', 'senta').
+    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (required if method='naive_bayes').
+    - vectorizer (Optional[TfidfVectorizer]): Vectorizer trained with Naive Bayes model (required if method='naive_bayes').
+    - device (int): Device to run the model on (-1 for CPU, 0 for GPU).
+    - transformer_model_name (str): Transformer model name for 'transformers' method.
+
+    Returns:
+    - dict: A dictionary with sentiment score, sentiment label, analysis method, and language.
+    """
+    result = {
+        "method": method,
+        "score": None,
+        "label": None,
+        "language": None,
+    }
+
+    methods=['vader','textblob','naive_bayes','transformer(not ready)','senta(not ready)']
+    if ips.run_once_within(10, reverse=True) and verbose:
+        print(f"methods: {methods}")
+
+    overall_methods=["majority","average","mean","threshold","weighted","detailed"]
+    if ips.run_once_within(10, reverse=True) and verbose:
+        print(f"overall_methods: {overall_methods}")
+    # If the input is a list of texts, loop through each one
+    if isinstance(text, list):
+        results = []
+        for text_ in text:
+            results.append(detect_single_text(text_, method=method, nb_model=nb_model, device=device, **kwargs))
+        res_overall=get_overall_results(results, method=overall_method, threshold=overall_threhold, weight=overall_weight)
+        if plot_:
+            res_detail=get_overall_results(results, method='detail', threshold=overall_threhold, weight=overall_weight)
+            plot.pie(res_detail["label"].value_counts(),explode=None,verbose=False)
+        return res_overall
+    else:
+        return detect_single_text(text=text, method=method, nb_model=nb_model, device=device, **kwargs)
+
+
+def detect_single_text(text: str, method: str = "vader", nb_model=None, device=-1, **kwargs) -> dict:
+    """
+    Analyze the sentiment of a text using different methods.
+
+    Parameters:
+    - text (str): The text to analyze.
+    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformers').
+    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (required if method='naive_bayes').
+    - vectorizer (Optional[TfidfVectorizer]): Vectorizer trained with Naive Bayes model (required if method='naive_bayes').
+    - transformer_model_name (str): Transformer model name for 'transformers' method.
+
+    Returns:
+    - dict: A dictionary with sentiment score, sentiment label, analysis method, and language.
+    """
+    result = {
+        "text":text,
+        "method": method,
+        "score": None,
+        "label": None,
+        "language": None,
+    }
+
+    # Detect language for additional insights
+    language = translator.detect_lang(text)
+    result["language"] = language
+    if language != "English" and method in ["vader", "textblob", "naive_bayes"]:
+        print("Detected non-English language, results may be inaccurate.")
+    methods=['vader','textblob','naive_bayes','transformer(not ready)','senta(not ready)']
+    method=ips.strcmp(method,methods)[0]
+    if method == "vader":
+        import nltk, os
+        from nltk.sentiment import SentimentIntensityAnalyzer
+
+        # check if it is downloaded
+        is_local = os.path.isfile(
+            os.path.join(nltk.data.path[0], "sentiment", "vader_lexicon.zip")
+        )
+        if not is_local:
+            nltk.download("vader_lexicon")
+        try:
+            sia = SentimentIntensityAnalyzer()
+            scores = sia.polarity_scores(text)
+            result["score"] = scores["compound"]
+            result["label"] = (
+                "Positive"
+                if scores["compound"] >= 0.05
+                else "Negative" if scores["compound"] <= -0.05 else "Neutral"
+            )
+        except Exception as e:
+            print(f"Error in VADER analysis: {e}")
+
+    elif method == "textblob":
+        from textblob import TextBlob
+
+        try:
+            blob = TextBlob(text)
+            polarity = blob.sentiment.polarity
+            result["score"] = polarity
+            result["label"] = (
+                "Positive"
+                if polarity > 0
+                else "Negative" if polarity < 0 else "Neutral"
+            )
+        except Exception as e:
+            print(f"Error in TextBlob analysis: {e}")
+
+    elif method == "naive_bayes":
+        from sklearn.naive_bayes import MultinomialNB
+        from sklearn.feature_extraction.text import TfidfVectorizer
+
+        try:
+            if nb_model is None or vectorizer is None:
+                from sklearn.model_selection import train_test_split
+
+                # Sample data for Naive Bayes training if model not provided
+                sample_texts = [
+                    "I love this product",
+                    "I hate this product",
+                    "It's okay, not great",
+                    "Absolutely fantastic!",
+                    "Not satisfied",
+                ]
+                sample_labels = [1, 0, 0, 1, 0] # 1 = Positive, 0 = Negative
+
+                # Train Naive Bayes model
+                vectorizer = TfidfVectorizer()
+                X_train_tfidf = vectorizer.fit_transform(sample_texts)
+                nb_model = MultinomialNB()
+                nb_model.fit(X_train_tfidf, sample_labels)
+
+            transformed_text = vectorizer.transform([text])
+            prediction = nb_model.predict(transformed_text)[0]
+            result["score"] = max(nb_model.predict_proba(transformed_text)[0])
+            result["label"] = "Positive" if prediction == 1 else "Negative"
+
+        except Exception as e:
+            print(f"Error in Naive Bayes analysis: {e}")
+    elif method=="transformer":
+        try:
+            from transformers import pipeline
+            # Load pre-trained sentiment analysis pipeline with a Chinese model
+            classifier = pipeline('sentiment-analysis', model='bert-base-chinese', device=device)
+            analysis_result = classifier(text)
+            result["score"] = analysis_result[0]['score']
+            result["label"] = analysis_result[0]['label']
+        except Exception as e:
+            print(f"Error in Transformer analysis: {e}")
+    elif method == "senta":
+        from transformers import pipeline
+
+        try:
+            # Load the Senta model for sentiment analysis
+            classifier = pipeline('sentiment-analysis', model='junnyu/senta', device=device)
+            analysis_result = classifier(text)
+
+            # Senta model output will be a list with one result (since it's single text input)
+            result["score"] = analysis_result[0]["score"]
+            result["label"] = analysis_result[0]["label"]
+
+        except Exception as e:
+            print(f"Error in Senta analysis: {e}")
+
+    else:
+        print(
+            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformers'"
+        )
+        raise ValueError(
+            f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformers'"
+        )
+
+    return result
+
+def get_overall_results(results, method="majority", threshold=0.8, weight=None,verbose=False):
+    from collections import Counter
+    """
+    Aggregates sentiment analysis results based on the selected method.
+
+    Parameters:
+    - results (list): A list of sentiment analysis results, each being a dictionary.
+    - method (str): The aggregation method to use ('majority', 'average', 'threshold', 'weighted', 'detailed').
+    - threshold (float): Confidence threshold for 'threshold' method.
+    - weight (dict): Optional dictionary for weighted aggregation (e.g., model name as key and weight as value).
+
+    Returns:
+    - dict: Aggregated sentiment result with final label and score.
+    """
+    def majority_voting(results):
+        """Aggregates sentiment using majority voting."""
+        labels = [result['label'] for result in results]
+        label_counts = Counter(labels)
+        final_label = label_counts.most_common(1)[0][0] # Get the most common label
+        return {"label": final_label}
+
+
+    def average_score(results):
+        """Aggregates sentiment by calculating the average score."""
+        scores = [result['score'] for result in results]
+        avg_score = sum(scores) / len(scores)
+
+        if avg_score > 0.05:
+            label = 'Positive'
+        elif avg_score < -0.05:
+            label = 'Negative'
+        else:
+            label = 'Neutral'
+
+        return {"score": avg_score, "label": label}
+
+
+    def confidence_threshold(results, threshold=0.8):
+        """Aggregates sentiment based on a confidence threshold."""
+        labels = [result['label'] for result in results]
+        label_counts = Counter(labels)
+        total_results = len(results)
+
+        for label, count in label_counts.items():
+            if count / total_results >= threshold:
+                return {"label": label}
+
+        return {"label": 'Neutral'} # If no label exceeds the threshold, return neutral
+
+
+    def weighted_average(results, weight=None):
+        """Aggregates sentiment based on a weighted average."""
+        if weight is None:
+            weight = {"vader": 2}
+
+        weighted_scores = 0
+        total_weight = 0
+
+        for result in results:
+            model = result.get('method', 'default')
+            model_weight = weight.get(model, 1) # Default weight is 1 if model not in weight dict
+            weighted_scores += result['score'] * model_weight
+            total_weight += model_weight
+
+        avg_weighted_score = weighted_scores / total_weight
+
+        # Assign label based on weighted average score
+        if avg_weighted_score > 0.05:
+            label = 'Positive'
+        elif avg_weighted_score < -0.05:
+            label = 'Negative'
+        else:
+            label = 'Neutral'
+
+        return {"score": avg_weighted_score, "label": label}
+
+    def detailed_output(results,verbose=False):
+        """Prints the detailed sentiment results."""
+        for result in results:
+            if verbose:
+                print(f"Label: {result['label']} | Score: {result['score']}")
+        return {"detailed_results": results}
+    overall_methods=["majority","average","mean","threshold","weighted","detailed"]
+    method=ips.strcmp(method, overall_methods)[0]
+    if method == "majority":
+        return majority_voting(results)
+
+    elif method in ["mean","average"]:
+        return average_score(results)
+
+    elif method == "threshold":
+        return confidence_threshold(results, threshold)
+
+    elif method == "weighted":
+        return weighted_average(results, weight)
+
+    elif method == "detailed":
+        return pd.DataFrame(results)
+    else:
+        raise ValueError(f"Unknown method '{method}'. Available methods: 'majority', 'average', 'threshold', 'weighted', 'detailed'")
+
+
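
Taken together, `detect` accepts a single string or a list: a single string is routed to `detect_single_text`, while a list is scored text by text and then folded into one verdict by `get_overall_results`. A usage sketch based on the signatures above (the import path is assumed from the relative `from . import ...` at the top of the file):

from py2ls import nl2ls

# Single text: a dict with text, method, score, label, and language.
res = nl2ls.detect("I love this product", method="vader", plot_=False)
print(res["label"], res["score"])

# List of texts: per-text scores aggregated, here by majority vote.
overall = nl2ls.detect(
    ["I love this product", "Not satisfied", "It's okay, not great"],
    method="vader",
    overall_method="major",  # fuzzy-matched to "majority" by ips.strcmp
    plot_=False,
)
print(overall)  # e.g. {'label': 'Negative'}
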