opsci-toolbox 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,6 +48,28 @@ def remove_html_tags(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text()
 
+def remove_rt(text: str) -> str:
+    """
+    Remove the retweet tag from a given text.
+
+    Args:
+    - text (str): The input text possibly containing a retweet tag in the format "RT @username: ".
+
+    Returns:
+    - str: The cleaned text with the retweet tag removed.
+
+    Example:
+    >>> remove_rt("RT @user123: Check out this tweet!")
+    'Check out this tweet!'
+    """
+    # Regular expression pattern to match "RT @username: "
+    pattern = r'RT @\w+: '
+
+    # Substitute the pattern with an empty string
+    cleaned_text = re.sub(pattern, '', text)
+
+    return cleaned_text
+
 def filter_by_query(df: pd.DataFrame, col_text: str, query: str, ignore_case: bool = True, ignore_accent: bool = True, match_word: bool = False) -> pd.DataFrame:
     """
     Filter DataFrame rows by a query on a specific text column.
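The new `remove_rt` helper is small enough to check in isolation. A minimal sketch, re-declared standalone here (the packaged version relies on the surrounding module's `re` import):

```python
import re

def remove_rt(text: str) -> str:
    # Same pattern as the 0.0.11 helper: "RT @username: " anywhere in the string
    return re.sub(r'RT @\w+: ', '', text)

print(remove_rt("RT @user123: Check out this tweet!"))    # 'Check out this tweet!'
print(remove_rt("no tag here"))                           # unchanged
print(remove_rt("comment RT @user123: quoted original"))  # tag removed mid-string too
```

Note the pattern is case-sensitive and expects exactly one space after the colon, so variants such as "rt @user:" pass through untouched.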
@@ -91,6 +113,7 @@ def TM_clean_text(df: pd.DataFrame, col: str, col_clean: str) -> pd.DataFrame:
     df : pandas DataFrame
         The DataFrame with cleaned text data.
     """
+    df[col_clean] = df[col].apply(remove_rt)
     df[col_clean] = df[col].apply(lambda x : urls(x, repl= ''))
     df[col_clean] = df.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row[col_clean].split())), 1)
     df[col_clean] = df[col_clean].apply(remove_extra_spaces)
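With this change `TM_clean_text` applies `remove_rt` as its first pass. One caveat worth flagging: the very next line still reads from `df[col]` rather than `df[col_clean]`, so as written the URL step appears to overwrite the retweet-stripped text. A hedged sketch of the presumably intended sequence, with stand-in regexes for the package's `urls` and `remove_extra_spaces` helpers:

```python
import re
import pandas as pd

df = pd.DataFrame({"text": ["RT @alice: see https://example.com @bob   thanks"]})

clean = df["text"].apply(lambda x: re.sub(r'RT @\w+: ', '', x))   # remove_rt
clean = clean.apply(lambda x: re.sub(r'https?://\S+', '', x))     # stand-in for urls(x, repl='')
clean = clean.apply(lambda x: " ".join(w for w in x.split() if not w.startswith("@")))  # drop @mentions
df["text_clean"] = clean.apply(lambda x: " ".join(x.split()))     # stand-in for remove_extra_spaces
print(df["text_clean"].iloc[0])  # 'see thanks'
```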
@@ -1358,14 +1381,14 @@ def split_n_sentences(nlp, df: pd.DataFrame, col_text: str, n_sentences: int = 1
     return df
 
 
-def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True) -> pd.DataFrame:
+def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['PERSON','ORG'], explode: bool = True, batch_size : int = 100, n_process: int =1) -> pd.DataFrame:
     """
     Spacy implementation of NER.
     To define entities type to keep, call get_labels(nlp, pipe_step="ner", explanations=False)
     explode = False means it returns 1 list of entities per document
     explode = True means it returns 1 entity per row
 
-    Parameters:
+    Args:
     nlp : spacy.language.Language
         The spaCy language processing pipeline.
     df : pd.DataFrame
@@ -1376,6 +1399,10 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
         List of entity types to keep. Default is ['PERSON','ORG'].
     explode : bool, optional
         Flag indicating whether to explode the DataFrame to have one entity per row. Default is True.
+    batch_size : int, optional
+        Batch size
+    n_process : int, optional
+        Number of processes
 
     Returns:
     pd.DataFrame
@@ -1385,43 +1412,40 @@ def spacy_NER(nlp, df: pd.DataFrame, col_text: str, entities_to_keep: list = ['P
     This function performs Named Entity Recognition (NER) using spaCy on a DataFrame with text data. It extracts entities of the specified types
     and stores the NER information in separate columns. If 'explode' is set to True, it returns one entity per row in the DataFrame.
     """
-    # Create columns to store the NER information
-    df['NER_type'] = None
-    df['NER_text'] = None
-    df['NER_start_char'] = None
-    df['NER_end_char'] = None
-
-    # Function to process each row in the DataFrame
-    def process_row(row):
-        doc = nlp(row[col_text])
-        entities_data = []
+    l_text = df[col_text].tolist()
+    all_records = []
+    for doc in tqdm(nlp.pipe(l_text, batch_size=batch_size, n_process=n_process), total= len(l_text), desc = "NLP Process"):
+        NER_type = []
+        NER_text = []
+        NER_start_char = []
+        NER_end_char=[]
+        # entities_data = []
 
         if len(entities_to_keep)>0:
             for ent in doc.ents:
                 if ent.label_ in entities_to_keep:
-                    entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+                    NER_type.append(ent.label_)
+                    NER_text.append(ent.text)
+                    NER_start_char.append(ent.start_char)
+                    NER_end_char.append(ent.end_char)
+                    # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
         else:
             for ent in doc.ents:
-                entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+                NER_type.append(ent.label_)
+                NER_text.append(ent.text)
+                NER_start_char.append(ent.start_char)
+                NER_end_char.append(ent.end_char)
+                # entities_data.append([ent.label_, ent.text, ent.start_char, ent.end_char])
+        record = (NER_type, NER_text, NER_start_char, NER_end_char)
+        all_records.append(record)
 
-        if entities_data:
-            entity_label, entity_text, start_char, end_char = zip(*entities_data)
-            row['NER_type'] = entity_label
-            row['NER_text'] = entity_text
-            row['NER_start_char'] = start_char
-            row['NER_end_char'] = end_char
-
-        return row
-
-    # Apply the processing function to each row
-    df = df.apply(process_row, axis=1)
+    df[['NER_type', 'NER_text','NER_start_char','NER_end_char']] = pd.DataFrame(all_records, index=df.index)
 
     if explode:
         df= df.explode(['NER_type', 'NER_text','NER_start_char','NER_end_char'])
 
     return df
 
-
 def tokenize(nlp, df: pd.DataFrame, col_text: str, col_tokens: str, pos_to_keep: list, stopwords: list, batch_size: int = 100, n_process: int = 1, stats: bool = True) -> pd.DataFrame:
     """
     Spacy implementation to tokenize text
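The rewrite replaces a per-row `df.apply` (one `nlp()` call per document) with a single pass over `nlp.pipe`, which is where the new `batch_size` and `n_process` arguments take effect; it also leans on a module-level `tqdm` import. A hedged usage sketch (`en_core_web_sm` and the import path are assumptions; any pipeline with an `ner` component should do):

```python
import pandas as pd
import spacy
from opsci_toolbox.helpers.nlp import spacy_NER  # assumed location per the RECORD below

nlp = spacy.load("en_core_web_sm")
df = pd.DataFrame({"text": ["Tim Cook leads Apple.", "Paris was lovely in May."]})

# With the 0.0.11 signature, batching and multiprocessing are tunable:
out = spacy_NER(nlp, df, col_text="text",
                entities_to_keep=["PERSON", "ORG"],
                explode=True, batch_size=50, n_process=1)
print(out[["NER_type", "NER_text"]])
```

Because `explode=True` expands the per-document lists, documents with no matching entities surface as rows of NaN rather than being dropped.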
@@ -1901,15 +1925,13 @@ def agglomerative_clustering(embeddings, n_clusters=15, metric="euclidean", link
 
 
 
-def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True,
-                       gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None,
-                       p=None, cluster_selection_method='eom', prediction_data = True):
-
+def hdbscan_clustering(embeddings, algorithm='best', alpha=1.0, cluster_selection_epsilon=0.0, approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40, metric='euclidean', min_cluster_size=5, min_samples=None, p=None, cluster_selection_method='eom', prediction_data = True):
     """
     This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm. It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the probability of each sample being an outlier.
+
     Args
     embeddings : array-like or sparse matrix, shape (n_samples, n_features). The input data to be clustered.
-    algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is 'best'.
+    algorithm : {'best', 'generic', 'prims_kdtree', 'boruvka_kdtree', 'boruvka_balltree', 'prims_balltree'}, optional. The algorithm to use for computation. Default is best.
     alpha : float, optional. Scaling factor determining the individual weight of the (unnormalized) density estimate. Default is 1.0.
     cluster_selection_epsilon : float, optional. The epsilon value to specify a minimum cluster size. Default is 0.0.
     approx_min_span_tree : bool, optional. Whether to compute an approximation of the minimum spanning tree. Default is True.
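The signature collapse to one line is cosmetic; the parameters mirror `hdbscan.HDBSCAN`. A minimal sketch of the underlying call, assuming the helper forwards these kwargs and returns the clusterer, labels, and probabilities:

```python
import numpy as np
import hdbscan  # pinned as hdbscan==0.8.33 in the metadata below

embeddings = np.random.RandomState(42).rand(300, 8)
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean',
                            cluster_selection_method='eom', prediction_data=True)
clusterer.fit(embeddings)
labels = clusterer.labels_                # -1 flags noise points
probabilities = clusterer.probabilities_  # cluster-membership strength per sample
print(sum(1 for c in set(labels) if c != -1), "clusters found")
```

`prediction_data=True` keeps the extra structures that `hdbscan.approximate_predict` needs later for unseen points.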
@@ -2054,7 +2076,6 @@ def HF_sentiment_classifier(tokenizer, model, text, col_text, filename, dir_json
         proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()[0]
         label = model.config.id2label[proba.argmax()]
         results = {"label":label, "score" : float(proba.max()), col_text : text}
-        print(results)
         write_json(results, dir_json , str(filename))
 
     return results
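Only the debug `print(results)` is dropped; the scoring logic is unchanged. A standalone sketch of that logic, with illustrative stand-ins for the model's logits and label map (not the packaged model):

```python
import torch

logits = torch.tensor([[-1.2, 2.3]])       # stand-in for model(**inputs).logits
proba = torch.sigmoid(logits).cpu().numpy()[0]
id2label = {0: "NEGATIVE", 1: "POSITIVE"}  # stand-in for model.config.id2label
results = {"label": id2label[proba.argmax()], "score": float(proba.max())}
print(results)  # {'label': 'POSITIVE', 'score': 0.9089...}
```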
@@ -0,0 +1,53 @@
+Metadata-Version: 2.1
+Name: opsci-toolbox
+Version: 0.0.11
+Summary: a complete toolbox
+Home-page: UNKNOWN
+Author: Erwan Le Nagard
+Author-email: erwan@opsci.ai
+License: MIT
+Platform: UNKNOWN
+Requires-Dist: requests <3,>=2.31.0
+Requires-Dist: beautifulsoup4 ==4.9.3
+Requires-Dist: chardet >=4.0.0
+Requires-Dist: chart-studio ==1.1.0
+Requires-Dist: eldar ==0.0.8
+Requires-Dist: emoji ==2.10.1
+Requires-Dist: fa2-modified ==0.3.10
+Requires-Dist: google-api-python-client ==2.122.0
+Requires-Dist: gspread ==6.1.2
+Requires-Dist: hdbscan ==0.8.33
+Requires-Dist: jusText ==3.0.0
+Requires-Dist: langchain ==0.1.20
+Requires-Dist: matplotlib >=3.9.0
+Requires-Dist: mysql-connector-python >=9.0.0
+Requires-Dist: networkx ==3.2.1
+Requires-Dist: nltk ==3.8.1
+Requires-Dist: numpy <1.25.0,>=1.21.5
+Requires-Dist: opencv-python-headless ==4.9.0.80
+Requires-Dist: openpyxl ==3.1.3
+Requires-Dist: pandas >=1.5.3
+Requires-Dist: Pillow >=9.0.1
+Requires-Dist: plotly ==5.19.0
+Requires-Dist: protobuf ==4.23.4
+Requires-Dist: pyarrow >=14.0.2
+Requires-Dist: python-louvain ==0.16
+Requires-Dist: scikit-learn ==1.4.1.post1
+Requires-Dist: scipy <2.0.0,>=1.8.0
+Requires-Dist: sentence-transformers ==2.5.1
+Requires-Dist: setuptools ==59.6.0
+Requires-Dist: spacy ==3.7.4
+Requires-Dist: spacy-language-detection ==0.2.1
+Requires-Dist: spacymoji ==3.1.0
+Requires-Dist: supervision ==0.21.0
+Requires-Dist: textacy ==0.13.0
+Requires-Dist: torch ==2.0.1
+Requires-Dist: tqdm >=4.66.2
+Requires-Dist: trafilatura ==1.7.0
+Requires-Dist: transformers ==4.38.2
+Requires-Dist: umap-learn ==0.5.5
+Requires-Dist: urlextract ==1.9.0
+Requires-Dist: wordcloud ==1.9.3
+
+UNKNOWN
+
@@ -4,11 +4,11 @@ opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSl
 opsci_toolbox/apis/webscraping.py,sha256=Gz3hOfhOHUpwHU1Pzj3mB2WdBAcKa2WisYBHMi3lcVE,18343
 opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
 opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=lemGhNwWIxaMwo-X7UsksUMGLV-IOuX_XwC82a50GD4,44672
+opsci_toolbox/helpers/common.py,sha256=nqg9wzgU5DxVTCxEb5LSw2lUnp0f_hKF_Q-DhpRtu6g,45158
 opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
-opsci_toolbox/helpers/dataviz.py,sha256=IfHByNWAU2rErZMfs3LuwZwJApLN5w320JEbBPuVp6U,115856
+opsci_toolbox/helpers/dataviz.py,sha256=1cIGb-u81cD5iSIkkkrzyrBnfim7fbhm0x_CguHUbf0,202128
 opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
-opsci_toolbox/helpers/nlp.py,sha256=r4o7V9tJrj3xt34O_4hN0szbSB4RmveP8qmwCqHOxEY,87988
+opsci_toolbox/helpers/nlp.py,sha256=baq4BsSgeLBgToPOU5RTmDA80dFJwH9xf0jppuAVseU,88947
 opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
 opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
 opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
@@ -16,7 +16,7 @@ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUK
 opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
 opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.10.dist-info/METADATA,sha256=DAYpwkedg6Tf4p_JS0ntxq9qUBx9hxWagStKN972RoU,1717
-opsci_toolbox-0.0.10.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-opsci_toolbox-0.0.10.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.10.dist-info/RECORD,,
+opsci_toolbox-0.0.11.dist-info/METADATA,sha256=5h-cfwhi31VKlzrOfdAeZuoKTLB1iyDIA4qqsz-bZGQ,1633
+opsci_toolbox-0.0.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+opsci_toolbox-0.0.11.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.11.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.37.1)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,53 +0,0 @@
-Metadata-Version: 2.1
-Name: opsci-toolbox
-Version: 0.0.10
-Summary: a complete toolbox
-Home-page: UNKNOWN
-Author: Erwan Le Nagard
-Author-email: erwan@opsci.ai
-License: MIT
-Platform: UNKNOWN
-Requires-Dist: Pillow (>=9.0.1)
-Requires-Dist: Requests (==2.32.3)
-Requires-Dist: beautifulsoup4 (==4.10.0)
-Requires-Dist: chardet (>=4.0.0)
-Requires-Dist: chart-studio (==1.1.0)
-Requires-Dist: eldar (==0.0.8)
-Requires-Dist: emoji (==2.10.1)
-Requires-Dist: fa2-modified (==0.3.10)
-Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.2)
-Requires-Dist: hdbscan (==0.8.33)
-Requires-Dist: jusText (==3.0.0)
-Requires-Dist: langchain (==0.1.20)
-Requires-Dist: matplotlib (>=3.9.0)
-Requires-Dist: mysql-connector-repackaged (==0.3.1)
-Requires-Dist: networkx (==3.2.1)
-Requires-Dist: nltk (==3.8.1)
-Requires-Dist: numpy (<1.25.0,>=1.21.5)
-Requires-Dist: opencv-python-headless (==4.9.0.80)
-Requires-Dist: openpyxl (==3.1.3)
-Requires-Dist: pandas (>=1.5.3)
-Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf (==4.23.4)
-Requires-Dist: pyarrow (>=14.0.2)
-Requires-Dist: python-louvain (==0.16)
-Requires-Dist: scikit-learn (==1.4.1.post1)
-Requires-Dist: scipy (<2.0.0,>=1.8.0)
-Requires-Dist: sentence-transformers (==2.5.1)
-Requires-Dist: setuptools (==59.6.0)
-Requires-Dist: spacy (==3.7.4)
-Requires-Dist: spacy-language-detection (==0.2.1)
-Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.21.0)
-Requires-Dist: textacy (==0.13.0)
-Requires-Dist: torch (==2.0.1)
-Requires-Dist: tqdm (==4.66.2)
-Requires-Dist: trafilatura (==1.7.0)
-Requires-Dist: transformers (==4.38.2)
-Requires-Dist: umap-learn (==0.5.5)
-Requires-Dist: urlextract (==1.9.0)
-Requires-Dist: wordcloud (==1.9.3)
-
-UNKNOWN
-