risk-network 0.0.9b42__py3-none-any.whl → 0.0.9b44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +39 -15
- {risk_network-0.0.9b42.dist-info → risk_network-0.0.9b44.dist-info}/METADATA +1 -1
- {risk_network-0.0.9b42.dist-info → risk_network-0.0.9b44.dist-info}/RECORD +7 -7
- {risk_network-0.0.9b42.dist-info → risk_network-0.0.9b44.dist-info}/LICENSE +0 -0
- {risk_network-0.0.9b42.dist-info → risk_network-0.0.9b44.dist-info}/WHEEL +0 -0
- {risk_network-0.0.9b42.dist-info → risk_network-0.0.9b44.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/annotations/annotations.py
CHANGED
@@ -3,7 +3,9 @@ risk/annotations/annotations
|
|
3
3
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
4
4
|
"""
|
5
5
|
|
6
|
+
import os
|
6
7
|
import re
|
8
|
+
import zipfile
|
7
9
|
from collections import Counter
|
8
10
|
from itertools import compress
|
9
11
|
from typing import Any, Dict, List, Set
|
@@ -20,29 +22,51 @@ from risk.log import logger
|
|
20
22
|
from scipy.sparse import coo_matrix
|
21
23
|
|
22
24
|
|
23
|
-
def
|
24
|
-
"""Ensure
|
25
|
+
def ensure_nltk_resource(resource: str) -> None:
|
26
|
+
"""Ensure the specified NLTK resource is available."""
|
27
|
+
# Define the path to the resource within the NLTK data directory
|
28
|
+
resource_path = f"corpora/{resource}"
|
29
|
+
# Check if the resource is already available.
|
25
30
|
try:
|
26
|
-
nltk.data.find(
|
31
|
+
nltk.data.find(resource_path)
|
32
|
+
return
|
27
33
|
except LookupError:
|
28
|
-
|
29
|
-
nltk.download("punkt", force=True, quiet=True)
|
34
|
+
print(f"Resource '{resource}' not found. Attempting to download...")
|
30
35
|
|
36
|
+
# Download the resource.
|
37
|
+
nltk.download(resource)
|
38
|
+
# Check again after downloading.
|
31
39
|
try:
|
32
|
-
nltk.data.find(
|
40
|
+
nltk.data.find(resource_path)
|
41
|
+
return
|
33
42
|
except LookupError:
|
34
|
-
|
35
|
-
|
43
|
+
print(f"Resource '{resource}' still not found after download. Checking for a ZIP file...")
|
44
|
+
|
45
|
+
# Look for a ZIP file in all known NLTK data directories.
|
46
|
+
for data_path in nltk.data.path:
|
47
|
+
zip_path = os.path.join(data_path, "corpora", f"{resource}.zip")
|
48
|
+
if os.path.isfile(zip_path):
|
49
|
+
print(f"Found ZIP file for '{resource}' at: {zip_path}")
|
50
|
+
target_dir = os.path.join(data_path, "corpora")
|
51
|
+
with zipfile.ZipFile(zip_path, "r") as z:
|
52
|
+
z.extractall(path=target_dir)
|
53
|
+
print(f"Unzipped '{resource}' successfully.")
|
54
|
+
break # Stop after unzipping the first found ZIP.
|
55
|
+
|
56
|
+
# Final check: Try to load the resource one last time.
|
36
57
|
try:
|
37
|
-
nltk.data.find(
|
58
|
+
nltk.data.find(resource_path)
|
59
|
+
print(f"Resource '{resource}' is now available.")
|
38
60
|
except LookupError:
|
39
|
-
|
61
|
+
raise LookupError(f"Resource '{resource}' could not be found, downloaded, or unzipped.")
|
40
62
|
|
41
63
|
|
42
|
-
# Ensure
|
43
|
-
|
44
|
-
|
45
|
-
|
64
|
+
# Ensure the NLTK stopwords and WordNet resources are available
|
65
|
+
ensure_nltk_resource("stopwords")
|
66
|
+
ensure_nltk_resource("wordnet")
|
67
|
+
# Use NLTK's stopwords - load all languages
|
68
|
+
STOP_WORDS = set(word for lang in stopwords.fileids() for word in stopwords.words(lang))
|
69
|
+
# Initialize the WordNet lemmatizer, which is used for normalizing words
|
46
70
|
LEMMATIZER = WordNetLemmatizer()
|
47
71
|
|
48
72
|
|
@@ -242,7 +266,7 @@ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series)
|
|
242
266
|
weight = max(1, int((0 if pd.isna(score) else score) * 10))
|
243
267
|
for token in tokens:
|
244
268
|
# Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
|
245
|
-
token_clean = re.sub(r"[^\w\-]", "", token
|
269
|
+
token_clean = re.sub(r"[^\w\-]", "", token).strip()
|
246
270
|
if not token_clean:
|
247
271
|
continue
|
248
272
|
# Skip tokens that are pure numbers
|
@@ -1,7 +1,7 @@
|
|
1
|
-
risk/__init__.py,sha256=
|
1
|
+
risk/__init__.py,sha256=RVOwiHzzwMXL1qujltMK4sdkHgP3Pv85KrFz7QfhPTk,127
|
2
2
|
risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
|
3
3
|
risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
|
4
|
-
risk/annotations/annotations.py,sha256=
|
4
|
+
risk/annotations/annotations.py,sha256=5X2R8RFxgK6kgSoj05UdCPcpkRRPOaHjGwnIrjeD5Ww,16299
|
5
5
|
risk/annotations/io.py,sha256=z1AJySsU-KL_IYuHa7j3nvuczmOHgK3WfaQ4TRunvrA,10499
|
6
6
|
risk/log/__init__.py,sha256=7LxDysQu7doi0LAvlY2YbjN6iJH0fNknqy8lSLgeljo,217
|
7
7
|
risk/log/console.py,sha256=PgjyEvyhYLUSHXPUKEqOmxsDsfrjPICIgqo_cAHq0N8,4575
|
@@ -33,8 +33,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
|
|
33
33
|
risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
|
34
34
|
risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
|
35
35
|
risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
|
36
|
-
risk_network-0.0.
|
37
|
-
risk_network-0.0.
|
38
|
-
risk_network-0.0.
|
39
|
-
risk_network-0.0.
|
40
|
-
risk_network-0.0.
|
36
|
+
risk_network-0.0.9b44.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
|
37
|
+
risk_network-0.0.9b44.dist-info/METADATA,sha256=mLBWb_wyKny6tgHt3xmdlaZgwSic3pVhSg47e-b6O5A,47627
|
38
|
+
risk_network-0.0.9b44.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
39
|
+
risk_network-0.0.9b44.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
|
40
|
+
risk_network-0.0.9b44.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|