risk-network 0.0.9b42__py3-none-any.whl → 0.0.9b44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
7
7
 
8
8
  from risk.risk import RISK
9
9
 
10
- __version__ = "0.0.9-beta.42"
10
+ __version__ = "0.0.9-beta.44"
@@ -3,7 +3,9 @@ risk/annotations/annotations
3
3
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
4
  """
5
5
 
6
+ import os
6
7
  import re
8
+ import zipfile
7
9
  from collections import Counter
8
10
  from itertools import compress
9
11
  from typing import Any, Dict, List, Set
@@ -20,29 +22,51 @@ from risk.log import logger
20
22
  from scipy.sparse import coo_matrix
21
23
 
22
24
 
23
- def _setup_nltk():
24
- """Ensure necessary NLTK data is downloaded."""
25
+ def ensure_nltk_resource(resource: str) -> None:
26
+ """Ensure the specified NLTK resource is available."""
27
+ # Define the path to the resource within the NLTK data directory
28
+ resource_path = f"corpora/{resource}"
29
+ # Check if the resource is already available.
25
30
  try:
26
- nltk.data.find("tokenizers/punkt")
31
+ nltk.data.find(resource_path)
32
+ return
27
33
  except LookupError:
28
- # Force download if not found
29
- nltk.download("punkt", force=True, quiet=True)
34
+ print(f"Resource '{resource}' not found. Attempting to download...")
30
35
 
36
+ # Download the resource.
37
+ nltk.download(resource)
38
+ # Check again after downloading.
31
39
  try:
32
- nltk.data.find("corpora/stopwords")
40
+ nltk.data.find(resource_path)
41
+ return
33
42
  except LookupError:
34
- nltk.download("stopwords", force=True, quiet=True)
35
-
43
+ print(f"Resource '{resource}' still not found after download. Checking for a ZIP file...")
44
+
45
+ # Look for a ZIP file in all known NLTK data directories.
46
+ for data_path in nltk.data.path:
47
+ zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
48
+ if os.path.isfile(zip_path):
49
+ print(f"Found ZIP file for '{resource}' at: {zip_path}")
50
+ target_dir = os.path.join(data_path, "corpora")
51
+ with zipfile.ZipFile(zip_path, "r") as z:
52
+ z.extractall(path=target_dir)
53
+ print(f"Unzipped '{resource}' successfully.")
54
+ break # Stop after unzipping the first found ZIP.
55
+
56
+ # Final check: Try to load the resource one last time.
36
57
  try:
37
- nltk.data.find("corpora/wordnet")
58
+ nltk.data.find(resource_path)
59
+ print(f"Resource '{resource}' is now available.")
38
60
  except LookupError:
39
- nltk.download("wordnet", force=True, quiet=True)
61
+ raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
40
62
 
41
63
 
42
- # Ensure you have the necessary NLTK data
43
- _setup_nltk()
44
- # Use NLTK's stopwords
45
- STOP_WORDS = set(stopwords.words("english"))
64
+ # Ensure the NLTK stopwords and WordNet resources are available
65
+ ensure_nltk_resource("stopwords")
66
+ ensure_nltk_resource("wordnet")
67
+ # Use NLTK's stopwords - load all languages
68
+ STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
69
+ # Initialize the WordNet lemmatizer, which is used for normalizing words
46
70
  LEMMATIZER = WordNetLemmatizer()
47
71
 
48
72
 
@@ -242,7 +266,7 @@ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series)
242
266
  weight = max(1, int((0 if pd.isna(score) else score) * 10))
243
267
  for token in tokens:
244
268
  # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
245
- token_clean = re.sub(r"[^\w\-]", "", token.lower()).strip()
269
+ token_clean = re.sub(r"[^\w\-]", "", token).strip()
246
270
  if not token_clean:
247
271
  continue
248
272
  # Skip tokens that are pure numbers
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: risk-network
3
- Version: 0.0.9b42
3
+ Version: 0.0.9b44
4
4
  Summary: A Python package for biological network analysis
5
5
  Author: Ira Horecka
6
6
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -1,7 +1,7 @@
1
- risk/__init__.py,sha256=0G_X2wjPwCz7UG5bgL3bYfsnVpLvoRhcMyS2bV45ZKI,127
1
+ risk/__init__.py,sha256=RVOwiHzzwMXL1qujltMK4sdkHgP3Pv85KrFz7QfhPTk,127
2
2
  risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
3
3
  risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
4
- risk/annotations/annotations.py,sha256=_2crX1SKphCY1gJmDpRuvYouf8DowScSetbxCG3vLHk,15022
4
+ risk/annotations/annotations.py,sha256=5X2R8RFxgK6kgSoj05UdCPcpkRRPOaHjGwnIrjeD5Ww,16299
5
5
  risk/annotations/io.py,sha256=z1AJySsU-KL_IYuHa7j3nvuczmOHgK3WfaQ4TRunvrA,10499
6
6
  risk/log/__init__.py,sha256=7LxDysQu7doi0LAvlY2YbjN6iJH0fNknqy8lSLgeljo,217
7
7
  risk/log/console.py,sha256=PgjyEvyhYLUSHXPUKEqOmxsDsfrjPICIgqo_cAHq0N8,4575
@@ -33,8 +33,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
33
33
  risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
34
34
  risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
35
35
  risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
36
- risk_network-0.0.9b42.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
37
- risk_network-0.0.9b42.dist-info/METADATA,sha256=vShfLdet9LjI_5lQOuQdcvaG1rIbwbBRO34gF3BFIcw,47627
38
- risk_network-0.0.9b42.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
39
- risk_network-0.0.9b42.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
40
- risk_network-0.0.9b42.dist-info/RECORD,,
36
+ risk_network-0.0.9b44.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
37
+ risk_network-0.0.9b44.dist-info/METADATA,sha256=mLBWb_wyKny6tgHt3xmdlaZgwSic3pVhSg47e-b6O5A,47627
38
+ risk_network-0.0.9b44.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
39
+ risk_network-0.0.9b44.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
40
+ risk_network-0.0.9b44.dist-info/RECORD,,