opsci-toolbox 0.0.11__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as published in their respective public registries.
- opsci_toolbox/apis/webscraping.py +75 -0
- opsci_toolbox/helpers/nlp.py +68 -1
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/METADATA +1 -1
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD +6 -6
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/webscraping.py
CHANGED
@@ -11,6 +11,81 @@ import concurrent.futures
 import pandas as pd
 from tqdm import tqdm
 
+def get_tweet_html(username: str, tweet_id: str, **kwargs) -> str:
+    """
+    Retrieves the HTML code of a tweet given the username and tweet ID.
+
+    Args:
+        username (str): The username of the Twitter account.
+        tweet_id (str): The ID of the tweet.
+        kwargs : additional parameters to pass to the Twitter API.
+
+    Returns:
+        str: The HTML code of the tweet.
+
+    """
+    params = {'lang':"en", # language of the features around the tweet
+              "maxwidth" : 550, # size of the tweet
+              "hide_media":False, # to hide photo / video
+              "hide_thread":False, # to hide original message on replies
+              "omit_script": True, # to include or not the JS script : <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
+              "align": None, # to align the tweet {left,right,center,none}
+              "theme": "light", # theme of the tweet {light,dark}
+              "dnt": True # When set to true, the Tweet and its embedded page on your site are not used for purposes that include personalized suggestions and personalized ads.
+              }
+
+    params.update(kwargs)
+
+    url = f'https://publish.twitter.com/oembed?url=https://twitter.com/{username}/status/{tweet_id}'
+    response = requests.get(url, params=params)
+
+    if response.status_code == 200:
+        data = response.json()
+        html = data.get('html')
+        return html, username, tweet_id
+    else:
+        print(response.url, "Failed to fetch data from Twitter.")
+        return None, username, tweet_id
+
+
+def parallel_twitter_oembed(usernames, tweet_ids, **kwargs):
+    """
+    Scrapes Twitter oEmbed data for multiple tweets in parallel.
+
+    Args:
+        usernames (list): A list of Twitter usernames.
+        tweet_ids (list): A list of tweet IDs corresponding to the tweets of the given usernames.
+        **kwargs: Additional keyword arguments to be passed to the `get_tweet_html` function.
+
+    Returns:
+        pandas.DataFrame: A DataFrame containing the scraped tweet HTML, username, and message ID.
+
+    Raises:
+        Exception: If there is an error while downloading the tweet HTML.
+
+    """
+    all_data = []
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit scraping tasks for each URL and add tqdm progress bar
+        futures = [
+            executor.submit(get_tweet_html, username, tweet_id, **kwargs)
+            for username, tweet_id in zip(usernames, tweet_ids)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(usernames),
+            desc="Scraping Progress",
+        ):
+            try:
+                data, username, tweet_id = future.result()
+                all_data.append((data, username, tweet_id))
+            except Exception as e:
+                print(f"Error downloading : {e}")
+
+    df = pd.DataFrame(all_data, columns=["tweet_html", "user_name", "message_id"])
+    return df
+
 
 def url_get_domain(url: str) -> str:
     """
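The two added functions can be combined to fetch embeds in batch. A minimal usage sketch, assuming the module is importable as `opsci_toolbox.apis.webscraping`; the usernames and tweet IDs below are placeholders:

    from opsci_toolbox.apis.webscraping import parallel_twitter_oembed

    # Placeholder handles and tweet IDs, for illustration only
    usernames = ["jack", "TwitterDev"]
    tweet_ids = ["20", "1293593516040269825"]

    # Each embed is fetched in its own thread; any oEmbed keyword argument
    # (theme, maxwidth, hide_media, ...) is forwarded to get_tweet_html.
    df_embeds = parallel_twitter_oembed(usernames, tweet_ids, theme="dark")
    print(df_embeds[["user_name", "message_id"]].head())

Requests that return a non-200 status are reported on stdout and keep a None `tweet_html`, while tasks that raise an exception are skipped.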
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -114,7 +114,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
         The DataFrame with cleaned text data.
     """
     df[col_clean] = df[col].apply(remove_rt)
-    df[col_clean] = df[
+    df[col_clean] = df[col_clean].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
     # df = df.loc[(df[col_clean] != ""), :]
@@ -1042,6 +1042,73 @@ def sample_most_engaging_posts(df: pd.DataFrame, col_topic: str, col_engagement:
 def get_lang_detector(nlp, name):
     return LanguageDetector(seed=42)  # We use the seed 42
 
+def PRarmy_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str = "lemmatized_text", pos_to_keep: list = ["VERB","NOUN","ADJ", "ADV", "PROPN"], entities_to_keep: list = ['PERSON','ORG', 'LOC'], stopwords: list = [], batch_size: int = 100, n_process: int = 1) -> pd.DataFrame:
+    """
+    Perform natural language processing tasks using spaCy for PR Army project.
+    Its main tasks are lemmatization and named entity recognition (NER).
+
+    Args:
+        nlp : spacy.Language
+            The spaCy language model.
+        df : pandas.DataFrame
+            The DataFrame containing the text data.
+        col_text : str
+            The name of the column containing the text data.
+        col_lemma : str
+            The name of the column to store the lemmatized text data.
+        pos_to_keep : list
+            A list of part-of-speech tags to keep during lemmatization.
+        entities_to_keep : list
+            A list of NER tags to keep.
+        stopwords : list
+            A list of stopwords to remove during processing.
+        batch_size : int, optional
+            The batch size for spaCy processing. Default is 100.
+        n_process : int, optional
+            The number of processes for parallel processing. Default is 1.
+    Returns:
+        pandas.DataFrame
+            The DataFrame with processed text data.
+
+    """
+    all_records = []
+    text = list(df[col_text].astype('unicode').values)
+
+    for doc in tqdm(nlp.pipe(text, batch_size=batch_size, n_process=n_process), total=len(text), desc="NLP Process"):
+        NER_type = []
+        NER_text = []
+
+        ### LEMMATIZATION
+
+        if len(pos_to_keep)>0 and len(stopwords)>0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep)>0 and len(stopwords) < 1:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.pos_ in pos_to_keep]
+        elif len(pos_to_keep) < 1 and len(stopwords) > 0:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space) and tok.text.lower() not in stopwords]
+        else:
+            lemmas_list = [str(tok.lemma_).lower() for tok in doc if not (tok.is_punct or tok.is_space)]
+
+        ### NER
+        if len(entities_to_keep)>0:
+            for ent in doc.ents:
+                if ent.label_ in entities_to_keep:
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+        else:
+            for ent in doc.ents:
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+
+        record = (NER_type, NER_text, ' '.join(map(str, lemmas_list)))
+        all_records.append(record)
+
+    df[['NER_type', 'NER_text', col_lemma]] = pd.DataFrame(all_records, index=df.index)
+
+    return df
+
 def TM_nlp_process(nlp, df: pd.DataFrame, col_text: str, col_lemma: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True, join_list: bool = False) -> pd.DataFrame:
     """
     Perform natural language processing tasks using spaCy for topic modeling.
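A minimal sketch of calling the new `PRarmy_nlp_process` helper, assuming it is importable from `opsci_toolbox.helpers.nlp` and that a spaCy pipeline with tagger and NER components is installed (`en_core_web_sm` is just an example model; the sample sentence is made up):

    import pandas as pd
    import spacy
    from opsci_toolbox.helpers.nlp import PRarmy_nlp_process

    # Any model exposing .pipe, POS tags and doc.ents should work here
    nlp = spacy.load("en_core_web_sm")

    df = pd.DataFrame({"text": ["The Red Cross opened a new office in Berlin."]})
    # Adds the NER_type, NER_text and lemmatized-text columns to the DataFrame
    df = PRarmy_nlp_process(nlp, df, col_text="text", col_lemma="lemmas",
                            entities_to_keep=["ORG", "GPE", "LOC"])
    print(df[["NER_type", "NER_text", "lemmas"]])

The results are assigned back with `index=df.index`, so rows keep their original alignment with the input DataFrame.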
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
-opsci_toolbox/apis/webscraping.py,sha256=
+opsci_toolbox/apis/webscraping.py,sha256=1DAIYbywZoPwTSyoqFGxyF0-q_nUsGg_VK51zLL_bB0,21465
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
 opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=
+opsci_toolbox/helpers/nlp.py,sha256=n7nNEU0cuu7bqXYRRBH4D-xIzpdNwKm0nj-eRYh3aPY,91956
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
-opsci_toolbox-0.0.
+opsci_toolbox-0.0.12.dist-info/METADATA,sha256=LosT5jzu7Z0TXIslwVUSvPG6AKMrblGp8A6odUN_N9U,1633
+opsci_toolbox-0.0.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.12.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.12.dist-info/RECORD,,
{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/WHEEL
File without changes

{opsci_toolbox-0.0.11.dist-info → opsci_toolbox-0.0.12.dist-info}/top_level.txt
File without changes
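For reference, the RECORD lines above follow the wheel convention `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing padding stripped. A short sketch (not part of the package) for recomputing an entry from an unpacked wheel; the file path is an example:

    import base64
    import hashlib

    def record_entry(path: str) -> str:
        """Recompute a wheel RECORD-style line for a local file."""
        with open(path, "rb") as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
        return f"{path},sha256={digest},{len(data)}"

    # Example: verify the value recorded for webscraping.py
    print(record_entry("opsci_toolbox/apis/webscraping.py"))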