py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -626,7 +626,7 @@ def filter_links(links, contains="html", driver="requ", booster=False):
626
626
  )
627
627
  if condition:
628
628
  filtered_links.append(link)
629
- return filtered_links
629
+ return ips.unique(filtered_links)
630
630
 
631
631
 
632
632
  def find_domain(links):
@@ -717,7 +717,7 @@ def downloader(
717
717
  kind=[".pdf"],
718
718
  contains=None,
719
719
  rm_folder=False,
720
- booster=False,
720
+ booster=True,# use find_links
721
721
  verbose=True,
722
722
  timeout=30,
723
723
  n_try=3,
@@ -726,7 +726,7 @@ def downloader(
726
726
 
727
727
  from requests.exceptions import ChunkedEncodingError, ConnectionError
728
728
 
729
- if verbose:
729
+ if verbose and ips.run_once_within():
730
730
  print(
731
731
  "usage: downloader(url, dir_save=None, kind=['.pdf','xls'], contains=None, booster=False)"
732
732
  )
@@ -734,8 +734,11 @@ def downloader(
734
734
  def fname_corrector(fname, ext):
735
735
  if not ext.startswith("."):
736
736
  ext = "." + ext
737
- if not fname.endswith("ext"): # if not ext in fname:
737
+ if not fname.endswith(ext): # if not ext in fname:
738
738
  fname = fname[: -len(ext)] + ext
739
+ if not any(fname[: -len(ext)]):
740
+ from datetime import datetime
741
+ fname = datetime.now().strftime("%H%M%S") + ext
739
742
  return fname
740
743
 
741
744
  def check_and_modify_filename(directory, filename):
@@ -784,8 +787,8 @@ def downloader(
784
787
  kind[i] = "." + kind[i]
785
788
  file_links_all = []
786
789
  for kind_ in kind:
787
- if isinstance(contains, str):
788
- contains = [contains]
790
+ # if isinstance(contains, str):
791
+ # contains = [contains]
789
792
  if isinstance(url, str):
790
793
  if any(ext in url for ext in kind):
791
794
  file_links = [url]
@@ -799,7 +802,7 @@ def downloader(
799
802
  if contains is not None:
800
803
  file_links = filter_links(links_all, contains=contains + kind_)
801
804
  else:
802
- file_links = links_all # filter_links(links_all, contains=kind_)
805
+ file_links = filter_links(links_all, contains=kind_)#links_all #
803
806
  elif isinstance(url, list):
804
807
  links_all = url
805
808
  if contains is not None:
@@ -812,6 +815,7 @@ def downloader(
812
815
  file_links = filter_links(links_all, contains=contains + kind_)
813
816
  else:
814
817
  file_links = filter_links(links_all, contains=kind_)
818
+ file_links=ips.unique(file_links)
815
819
  if verbose:
816
820
  if file_links:
817
821
  from pprint import pp
@@ -825,6 +829,7 @@ def downloader(
825
829
  file_links_all = [file_links]
826
830
  elif isinstance(file_links, list):
827
831
  file_links_all.extend(file_links)
832
+ file_links_all=ips.unique(file_links_all)
828
833
  if dir_save:
829
834
  if rm_folder:
830
835
  ips.rm_folder(dir_save)
@@ -847,7 +852,7 @@ def downloader(
847
852
  )
848
853
  if ext is None:
849
854
  ext = kind_
850
- print("ehereerere", ext)
855
+
851
856
  if ext:
852
857
  corrected_fname = fname_corrector(fnames[idx], ext)
853
858
  corrected_fname = check_and_modify_filename(
@@ -860,13 +865,13 @@ def downloader(
860
865
  datetime.now().strftime("%y%m%d_%H%M%S_")
861
866
  + corrected_fname
862
867
  )
863
- fpath_tmp = os.path.join(dir_save, corrected_fname)
868
+ fpath_tmp = os.path.join(dir_save, corrected_fname)
864
869
  with open(fpath_tmp, "wb") as file:
865
870
  for chunk in response.iter_content(chunk_size=8192):
866
871
  if chunk: # Filter out keep-alive chunks
867
872
  file.write(chunk)
868
873
  if verbose:
869
- print(f"Done! {fnames[idx]}")
874
+ print(f"Done{fnames[idx]}")
870
875
  else:
871
876
  if verbose:
872
877
  print(f"Unknown file type for {file_link}")
@@ -886,16 +891,7 @@ def downloader(
886
891
 
887
892
  if itry == n_try:
888
893
  print(f"Failed to download {file_link} after {n_try} attempts.")
889
-
890
- # print(f"\n{len(fnames)} files were downloaded:")
891
- if verbose:
892
- from pprint import pp
893
-
894
- if corrected_fname:
895
- pp(corrected_fname)
896
- print(f"\n\nsaved @:\n{dir_save}")
897
- else:
898
- pp(fnames)
894
+
899
895
 
900
896
 
901
897
  def find_img(url, driver="request", dir_save="images", rm_folder=False, verbose=True):
py2ls/nl2ls.py ADDED
@@ -0,0 +1,283 @@
1
+ from . import translator,ips,plot
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+
6
+
7
def detect(
    text,
    method: str = "vader",
    nb_model=None,
    device=-1,
    overall_method="major",
    overall_threhold=0.8,
    overall_weight=None,
    plot_=True,
    verbose=True,
    **kwargs,
) -> dict:
    """
    Analyze the sentiment of a text or a list of texts using different methods.

    Parameters:
    - text (str or list of str): The text(s) to analyze. Can be a single text or a list of texts.
    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformers', 'senta').
    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (used if method='naive_bayes').
    - device (int): Device to run the model on (-1 for CPU, 0 for GPU).
    - overall_method (str): Aggregation method forwarded to get_overall_results
      when `text` is a list.
    - overall_threhold (float): Threshold forwarded to get_overall_results.
    - overall_weight (Optional[dict]): Weights forwarded to get_overall_results.
    - plot_ (bool): For list input, draw a pie chart of the per-text labels.
    - verbose (bool): Print the lists of available methods.
    - **kwargs: Forwarded unchanged to detect_single_text
      (e.g. vectorizer, transformer_model_name).

    Returns:
    - For a single text: the dictionary produced by detect_single_text.
    - For a list of texts: whatever get_overall_results returns for
      `overall_method` (a dict, or a DataFrame for the detailed method).
    """
    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
    # NOTE(review): run_once_within is called before `verbose` is checked, so
    # its internal state advances even when nothing is printed; order kept.
    if ips.run_once_within(10, reverse=True) and verbose:
        print(f"methods: {methods}")

    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
    if ips.run_once_within(10, reverse=True) and verbose:
        print(f"overall_methods: {overall_methods}")

    # Single text: nothing to aggregate.
    if not isinstance(text, list):
        return detect_single_text(
            text=text, method=method, nb_model=nb_model, device=device, **kwargs
        )

    # List of texts: analyze each one, then aggregate.
    results = [
        detect_single_text(text_, method=method, nb_model=nb_model, device=device, **kwargs)
        for text_ in text
    ]
    res_overall = get_overall_results(
        results, method=overall_method, threshold=overall_threhold, weight=overall_weight
    )
    # An empty result list yields a DataFrame without a "label" column, so
    # plotting is skipped in that case.
    if plot_ and results:
        res_detail = get_overall_results(
            results, method='detail', threshold=overall_threhold, weight=overall_weight
        )
        plot.pie(res_detail["label"].value_counts(), explode=None, verbose=False)
    return res_overall
48
+
49
+
50
def detect_single_text(text: str, method: str = "vader", nb_model=None, device=-1, **kwargs) -> dict:
    """
    Analyze the sentiment of a text using different methods.

    Parameters:
    - text (str): The text to analyze.
    - method (str): The method to use ('vader', 'textblob', 'naive_bayes', 'transformers').
    - nb_model (Optional[MultinomialNB]): Pre-trained Naive Bayes model (used if method='naive_bayes').
    - device (int): Device passed to the transformers pipeline (-1 for CPU, 0 for GPU).
    - vectorizer (Optional[TfidfVectorizer], via kwargs): Vectorizer fitted together
      with `nb_model`. If either one is missing, both are trained on a tiny
      built-in sample set.
    - transformer_model_name (str, via kwargs): Model name for the 'transformer'
      method; defaults to 'bert-base-chinese'.

    Returns:
    - dict: A dictionary with the input text, sentiment score, sentiment label,
      analysis method (as passed in), and detected language. Score and label
      stay None if the selected analysis raised an error.

    Raises:
    - ValueError: If the resolved method matches none of the handled branches.
    """
    result = {
        "text": text,
        "method": method,
        "score": None,
        "label": None,
        "language": None,
    }

    # Detect language for additional insights
    language = translator.detect_lang(text)
    result["language"] = language
    if language != "English" and method in ["vader", "textblob", "naive_bayes"]:
        print("Detected non-English language, results may be inaccurate.")

    methods = ['vader', 'textblob', 'naive_bayes', 'transformer(not ready)', 'senta(not ready)']
    # NOTE(review): presumably ips.strcmp returns the best-matching entry of
    # `methods`; if so, 'transformer'/'senta' resolve to the "(not ready)"
    # names and fall through to the ValueError below -- confirm intent.
    method = ips.strcmp(method, methods)[0]

    if method == "vader":
        import nltk, os
        from nltk.sentiment import SentimentIntensityAnalyzer

        # check if it is downloaded
        is_local = os.path.isfile(
            os.path.join(nltk.data.path[0], "sentiment", "vader_lexicon.zip")
        )
        if not is_local:
            nltk.download("vader_lexicon")
        try:
            sia = SentimentIntensityAnalyzer()
            scores = sia.polarity_scores(text)
            compound = scores["compound"]
            result["score"] = compound
            if compound >= 0.05:
                result["label"] = "Positive"
            elif compound <= -0.05:
                result["label"] = "Negative"
            else:
                result["label"] = "Neutral"
        except Exception as e:
            print(f"Error in VADER analysis: {e}")

    elif method == "textblob":
        from textblob import TextBlob

        try:
            polarity = TextBlob(text).sentiment.polarity
            result["score"] = polarity
            if polarity > 0:
                result["label"] = "Positive"
            elif polarity < 0:
                result["label"] = "Negative"
            else:
                result["label"] = "Neutral"
        except Exception as e:
            print(f"Error in TextBlob analysis: {e}")

    elif method == "naive_bayes":
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import TfidfVectorizer

        try:
            # The vectorizer arrives through **kwargs. Reading it here also
            # avoids referencing the local name before it is assigned.
            vectorizer = kwargs.get("vectorizer")
            if nb_model is None or vectorizer is None:
                # Sample data for Naive Bayes training if model not provided
                sample_texts = [
                    "I love this product",
                    "I hate this product",
                    "It's okay, not great",
                    "Absolutely fantastic!",
                    "Not satisfied",
                ]
                sample_labels = [1, 0, 0, 1, 0]  # 1 = Positive, 0 = Negative

                # Train Naive Bayes model
                vectorizer = TfidfVectorizer()
                X_train_tfidf = vectorizer.fit_transform(sample_texts)
                nb_model = MultinomialNB()
                nb_model.fit(X_train_tfidf, sample_labels)

            transformed_text = vectorizer.transform([text])
            prediction = nb_model.predict(transformed_text)[0]
            # Score is the probability of the most likely class.
            result["score"] = max(nb_model.predict_proba(transformed_text)[0])
            result["label"] = "Positive" if prediction == 1 else "Negative"

        except Exception as e:
            print(f"Error in Naive Bayes analysis: {e}")

    elif method == "transformer":
        try:
            from transformers import pipeline

            # Load pre-trained sentiment analysis pipeline (a Chinese model by default)
            model_name = kwargs.get("transformer_model_name", "bert-base-chinese")
            classifier = pipeline('sentiment-analysis', model=model_name, device=device)
            analysis_result = classifier(text)
            result["score"] = analysis_result[0]['score']
            result["label"] = analysis_result[0]['label']
        except Exception as e:
            print(f"Error in Transformer analysis: {e}")

    elif method == "senta":
        from transformers import pipeline

        try:
            # Load the Senta model for sentiment analysis
            classifier = pipeline('sentiment-analysis', model='junnyu/senta', device=device)
            analysis_result = classifier(text)

            # The pipeline returns a list; a single input text gives one entry.
            result["score"] = analysis_result[0]["score"]
            result["label"] = analysis_result[0]["label"]

        except Exception as e:
            print(f"Error in Senta analysis: {e}")

    else:
        message = f"Unknown method '{method}'. Available methods: 'vader', 'textblob', 'naive_bayes', 'transformers'"
        print(message)
        raise ValueError(message)

    return result
181
+
182
def get_overall_results(results, method="majority", threshold=0.8, weight=None, verbose=False):
    """
    Aggregates sentiment analysis results based on the selected method.

    Parameters:
    - results (list): A list of sentiment analysis results, each being a dictionary
      with at least 'label' and 'score' keys ('method' is read by 'weighted').
    - method (str): The aggregation method to use ('majority', 'average'/'mean',
      'threshold', 'weighted', 'detailed'). The name is resolved through ips.strcmp.
    - threshold (float): Minimum share of results a label needs for the 'threshold' method.
    - weight (dict): Optional dictionary for weighted aggregation (method name as key
      and weight as value). Defaults to {"vader": 2}; unlisted methods weigh 1.
    - verbose (bool): Currently unused; kept for interface compatibility.

    Returns:
    - dict: Aggregated result with a final label (and score for the score-based
      methods). Label/score are None when there is nothing to aggregate.
    - pandas.DataFrame: One row per result when method is 'detailed'.

    Raises:
    - ValueError: If the resolved method is not one of the supported names.
    """
    from collections import Counter

    def label_from_score(score):
        """Map a numeric score to a label using the +/-0.05 neutral band."""
        if score > 0.05:
            return 'Positive'
        if score < -0.05:
            return 'Negative'
        return 'Neutral'

    def majority_voting(results):
        """Aggregates sentiment using majority voting."""
        if not results:
            # Nothing to vote on.
            return {"label": None}
        label_counts = Counter(result['label'] for result in results)
        return {"label": label_counts.most_common(1)[0][0]}

    def average_score(results):
        """Aggregates sentiment by calculating the average score."""
        # Failed analyses leave score=None; they cannot be averaged.
        scores = [result['score'] for result in results if result['score'] is not None]
        if not scores:
            return {"score": None, "label": None}
        avg_score = sum(scores) / len(scores)
        return {"score": avg_score, "label": label_from_score(avg_score)}

    def confidence_threshold(results, threshold=0.8):
        """Aggregates sentiment based on a confidence threshold."""
        label_counts = Counter(result['label'] for result in results)
        total_results = len(results)

        for label, count in label_counts.items():
            if count / total_results >= threshold:
                return {"label": label}

        return {"label": 'Neutral'}  # If no label reaches the threshold, return neutral

    def weighted_average(results, weight=None):
        """Aggregates sentiment based on a weighted average."""
        if weight is None:
            weight = {"vader": 2}

        weighted_scores = 0
        total_weight = 0
        for result in results:
            if result['score'] is None:
                # Skip failed analyses instead of raising on None * weight.
                continue
            model = result.get('method', 'default')
            model_weight = weight.get(model, 1)  # Default weight is 1 if model not in weight dict
            weighted_scores += result['score'] * model_weight
            total_weight += model_weight

        if total_weight == 0:
            # No usable scores (or all weights are zero): nothing to average.
            return {"score": None, "label": None}

        avg_weighted_score = weighted_scores / total_weight
        return {"score": avg_weighted_score, "label": label_from_score(avg_weighted_score)}

    overall_methods = ["majority", "average", "mean", "threshold", "weighted", "detailed"]
    method = ips.strcmp(method, overall_methods)[0]

    if method == "majority":
        return majority_voting(results)
    if method in ["mean", "average"]:
        return average_score(results)
    if method == "threshold":
        return confidence_threshold(results, threshold)
    if method == "weighted":
        return weighted_average(results, weight)
    if method == "detailed":
        return pd.DataFrame(results)
    raise ValueError(f"Unknown method '{method}'. Available methods: 'majority', 'average', 'threshold', 'weighted', 'detailed'")
282
+
283
+