opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
@@ -11,6 +11,81 @@ import concurrent.futures
 import pandas as pd
 from tqdm import tqdm
 
+def get_tweet_html(username: str, tweet_id: str, **kwargs) -> str:
+    """
+    Retrieves the HTML code of a tweet given the username and tweet ID.
+
+    Args:
+        username (str): The username of the Twitter account.
+        tweet_id (str): The ID of the tweet.
+        kwargs : additional parameters to pass to the Twitter API.
+
+    Returns:
+        str: The HTML code of the tweet.
+
+
+    """
+    params = {'lang':"en",            # language of the features around the tweet
+              "maxwidth" : 550,       # size of the tweet
+              "hide_media":False,     # to hide photo / video
+              "hide_thread":False,    # to hide original message on replies
+              "omit_script": True,    # to include or not the JS script : <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+              "align": None,          # to align the tweet {left,right,center,none}
+              "theme": "light",       # theme of the tweet {light,dark}
+              "dnt": True             # When set to true, the Tweet and its embedded page on your site are not used for purposes that include personalized suggestions and personalized ads.
+              }
+
+    params.update(kwargs)
+
+    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        html = data.get('html')
+        return html, username, tweet_id
+    else:
+        print(response.url, "Failed to fetch data from Twitter.")
+        return None, username, tweet_id
+
+
+def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
+    """
+    Scrapes Twitter oEmbed data for multiple tweets in parallel.
+
+    Args:
+        usernames (list): A list of Twitter usernames.
+        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
+        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing the scraped tweet HTML, username, and message ID.
+
+    Raises:
+        Exception: If there is an error while downloading the tweet HTML.
+
+    """
+    all_data = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit scraping tasks for each URL and add tqdm progress bar
+        futures = [
+            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
+            for username, tweet_id in zip(usernames, tweet_ids)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(usernames),
+            desc="Scraping Progress",
+        ):
+            try:
+                data, username, tweet_id = future.result()
+                all_data.append((data, username, tweet_id))
+            except Exception as e:
+                print(f"Error downloading : {e}")
+
+    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
+    return df
+
 
 def url_get_domain(url: str) -> str:
     """
@@ -114,7 +114,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
-    df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
+    df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
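
The one-line change above makes the URL-stripping pass read from the already-cleaned column instead of re-reading the raw column, so the preceding remove_rt step is no longer overwritten. A self-contained sketch of the difference, using stand-in implementations of the toolbox's remove_rt and urls helpers (hypothetical, for illustration only):

import re
import pandas as pd

# Stand-ins for the toolbox's remove_rt and urls helpers (hypothetical implementations,
# used only to illustrate the chaining fix).
def remove_rt(text: str) -> str:
    return re.sub(r"^RT\s+", "", text)

def replace_urls(text: str, repl: str = "") -> str:
    return re.sub(r"https?://\S+", repl, text)

df = pd.DataFrame({"text": ["RT check this out https://example.com"]})

# Old behaviour: the second step re-read the raw column, discarding the RT removal.
df["clean_old"] = df["text"].apply(remove_rt)
df["clean_old"] = df["text"].apply(lambda x: replace_urls(x, repl=""))       # "RT " reappears

# New behaviour: each step is chained on the cleaned column.
df["clean_new"] = df["text"].apply(remove_rt)
df["clean_new"] = df["clean_new"].apply(lambda x: replace_urls(x, repl=""))  # RT and URL both gone

print(df[["clean_old", "clean_new"]])
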
@@ -1042,6 +1042,73 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42) # We use the seed 42
 
+def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "lemmatized_text", pos_to_keep: list = ["VERB","NOUN","ADJ", "ADV", "PROPN"], entities_to_keep: list = ['PERSON','ORG', 'LOC'], stopwords: list = [], batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
+    """
+    Perform natural language processing tasks using spaCy for PR Army project.
+    Its main tasks are lemmatization and named entity recognition (NER).
+
+    Args:
+        nlp : spacy.Language
+            The spaCy language model.
+        df : pandas.DataFrame
+            The DataFrame containing the text data.
+        col_text : str
+            The name of the column containing the text data.
+        col_lemma : str
+            The name of the column to store the lemmatized text data.
+        pos_to_keep : list
+            A list of part-of-speech tags to keep during lemmatization.
+        entities_to_keep : list
+            A list of NER tags to keep.
+        stopwords : list
+            A list of stopwords to remove during processing.
+        batch_size : int, optional
+            The batch size for spaCy processing. Default is 100.
+        n_process : int, optional
+            The number of processes for parallel processing. Default is 1.
+    Returns:
+        pandas.DataFrame
+            The DataFrame with processed text data.
+
+    """
+    all_records = []
+    text=list(df[col_text].astype('unicode').values)
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total= len(text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+
+        ### LEMMATIZATION
+
+        if len(pos_to_keep)>0 and len(stopwords)>0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep)>0 and len(stopwords) < 1:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep) < 1 and len(stopwords) > 0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords]
+        else :
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space)]
+
+        ### NER
+        if len(entities_to_keep)>0:
+            for ent in doc.ents:
+                if ent.label_ in entities_to_keep:
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+
+        else:
+            for ent in doc.ents:
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+
+        record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
+        all_records.append(record)
+
+
+    df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
+
+    return df
+
 def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Perform natural language processing tasks using spaCy for topic modeling.
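
A minimal usage sketch for the new PRarmy_nlp_process helper, assuming it is exported from opsci_toolbox.helpers.nlp (the module whose hash changes in the RECORD below) and that a spaCy pipeline with tagger and NER components, such as en_core_web_sm, is installed; the sample text and stopword list are placeholders:

# Minimal usage sketch (assumptions: function exported from opsci_toolbox.helpers.nlp,
# en_core_web_sm installed; sample text and stopwords are placeholders).
import pandas as pd
import spacy
from opsci_toolbox.helpers.nlp import PRarmy_nlp_process

nlp = spacy.load("en_core_web_sm")  # any pipeline with tagger and NER components

df = pd.DataFrame({"text": ["The organisation met European officials in Brussels last week."]})

# Adds the NER_type, NER_text and lemmatized_text columns to the DataFrame.
df = PRarmy_nlp_process(
    nlp,
    df,
    col_text="text",
    pos_to_keep=["VERB", "NOUN", "ADJ", "PROPN"],
    entities_to_keep=["PERSON", "ORG", "LOC"],
    stopwords=["week"],  # illustrative stopword list
    batch_size=50,
    n_process=1,
)
print(df[["NER_type", "NER_text", "lemmatized_text"]])
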
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.11
+Version: 0.0.12
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -1,14 +1,14 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
-opsci_toolbox/apis/webscraping.py,sha256=Gz3hOfhOHUpwHU1Pzj3mB2WdBAcKa2WisYBHMi3lcVE,18343
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
 opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=baq4BsSgeLBgToPOU5RTmDA80dFJwH9xf0jppuAVseU,88947
+opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.11.dist-info/METADATA,sha256=5h-cfwhi31VKlzrOfdAeZuoKTLB1iyDIA4qqsz-bZGQ,1633
-opsci_toolbox-0.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-opsci_toolbox-0.0.11.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.11.dist-info/RECORD,,
+opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
+opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.12.dist-info/RECORD,,