aiagents4pharma 1.4.3__tar.gz → 1.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/PKG-INFO +6 -2
- aiagents4pharma-1.5.4/aiagents4pharma/__init__.py +7 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/__init__.py +4 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +7 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +562 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +23 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +201 -0
- aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +201 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/PKG-INFO +6 -2
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/SOURCES.txt +7 -1
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/requires.txt +5 -1
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/pyproject.toml +8 -2
- aiagents4pharma-1.5.4/release_version.txt +1 -0
- aiagents4pharma-1.4.3/aiagents4pharma/__init__.py +0 -5
- aiagents4pharma-1.4.3/release_version.txt +0 -1
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/LICENSE +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/README.md +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/__init__.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/__init__.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/basico_model.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/sys_bio_model.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/__init__.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/ask_question.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/custom_plotter.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/fetch_parameters.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/model_description.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/search_models.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/simulate_model.py +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/dependency_links.txt +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/top_level.txt +0 -0
- {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/setup.cfg +0 -0
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: aiagents4pharma
-Version: 1.4.3
+Version: 1.5.4
 Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
@@ -10,13 +10,16 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: copasi_basico==0.78
 Requires-Dist: coverage==7.6.4
+Requires-Dist: gdown==5.2.0
+Requires-Dist: huggingface_hub==0.26.5
+Requires-Dist: joblib==1.4.2
 Requires-Dist: langchain==0.3.7
 Requires-Dist: langchain-community==0.3.5
 Requires-Dist: langchain-core==0.3.15
 Requires-Dist: langchain-experimental==0.3.3
 Requires-Dist: langchain-openai==0.2.5
 Requires-Dist: matplotlib==3.9.2
-Requires-Dist: openai==1.
+Requires-Dist: openai==1.59.4
 Requires-Dist: pandas==2.2.3
 Requires-Dist: plotly==5.24.1
 Requires-Dist: pydantic==2.9.2
@@ -24,6 +27,7 @@ Requires-Dist: pylint==3.3.1
 Requires-Dist: pytest==8.3.3
 Requires-Dist: streamlit==1.39.0
 Requires-Dist: tabulate==0.9.0
+Requires-Dist: torch==2.5.1
 Requires-Dist: tqdm==4.66.6
 Requires-Dist: mkdocs==1.6.1
 Requires-Dist: mkdocs-jupyter==0.25.1
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py

@@ -0,0 +1,562 @@
+"""
+Class for loading BioBridgePrimeKG dataset.
+"""
+
+import os
+import pickle
+import json
+import requests
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from .dataset import Dataset
+from .primekg import PrimeKG
+
+class BioBridgePrimeKG(Dataset):
+    """
+    Class for loading BioBridgePrimeKG dataset.
+    It downloads the data from the BioBridge repo and stores it in the local directory.
+    The data is then loaded into pandas DataFrame of nodes and edges.
+    This class was adapted from the BioBridge repo:
+    https://github.com/RyanWangZf/BioBridge
+    """
+
+    def __init__(self,
+                 primekg_dir: str = "../../../data/primekg/",
+                 local_dir: str = "../../../data/biobridge_primekg/",
+                 random_seed: int=0,
+                 n_neg_samples: int=5):
+        """
+        Constructor for BioBridgePrimeKG class.
+
+        Args:
+            primekg_dir (str): The directory of PrimeKG dataset.
+            local_dir (str): The directory to store the downloaded data.
+            random_seed (int): The random seed value.
+        """
+        self.name: str = "biobridge_primekg"
+        self.primekg_dir: str = primekg_dir
+        self.local_dir: str = local_dir
+        self.random_seed = random_seed
+        self.n_neg_samples = n_neg_samples
+        # Preselected node types:
+        # protein, molecular function, cellular component, biological process, drug, disease
+        self.preselected_node_types = ["protein", "mf", "cc", "bp", "drug", "disease"]
+        self.node_type_map = {
+            "protein": "gene/protein",
+            "mf": "molecular_function",
+            "cc": "cellular_component",
+            "bp": "biological_process",
+            "drug": "drug",
+            "disease": "disease",
+        }
+
+        # Attributes to store the data
+        self.primekg = None
+        self.primekg_triplets = None
+        self.primekg_triplets_negative = None
+        self.data_config = None
+        self.emb_dict = None
+        self.df_train = None
+        self.df_node_train = None
+        self.df_test = None
+        self.df_node_test = None
+        self.node_info_dict = None
+
+        # Set up the dataset
+        self.setup()
+
+    def setup(self):
+        """
+        A method to set up the dataset.
+        """
+        # Make the directories if it doesn't exist
+        os.makedirs(os.path.dirname(self.primekg_dir), exist_ok=True)
+        os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+        # Set the random seed
+        self.set_random_seed(self.random_seed)
+
+        # Set SettingWithCopyWarning warnings to none
+        pd.options.mode.chained_assignment = None
+
+    def _load_primekg(self) -> PrimeKG:
+        """
+        Private method to load related files of PrimeKG dataset.
+
+        Returns:
+            The PrimeKG dataset.
+        """
+        primekg_data = PrimeKG(local_dir=self.primekg_dir)
+        primekg_data.load_data()
+
+        return primekg_data
+
+    def _download_file(self,
+                       remote_url:str,
+                       local_dir: str,
+                       local_filename: str):
+        """
+        A helper function to download a file from remote URL to the local directory.
+
+        Args:
+            remote_url (str): The remote URL of the file to be downloaded.
+            local_dir (str): The local directory to store the downloaded file.
+            local_filename (str): The local filename to store the downloaded file.
+        """
+        # Make the local directory if it does not exist
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+        # Download the file from remote URL to local directory
+        local_path = os.path.join(local_dir, local_filename)
+        if os.path.exists(local_path):
+            print(f"File {local_filename} already exists in {local_dir}.")
+        else:
+            print(f"Downloading {local_filename} from {remote_url} to {local_dir}...")
+            response = requests.get(remote_url, stream=True, timeout=300)
+            response.raise_for_status()
+            progress_bar = tqdm(
+                total=int(response.headers.get("content-length", 0)),
+                unit="iB",
+                unit_scale=True,
+            )
+            with open(os.path.join(local_dir, local_filename), "wb") as file:
+                for data in response.iter_content(1024):
+                    progress_bar.update(len(data))
+                    file.write(data)
+            progress_bar.close()
+
+    def _load_data_config(self) -> dict:
+        """
+        Load the data config file of BioBridgePrimeKG dataset.
+
+        Returns:
+            The data config file of BioBridgePrimeKG dataset.
+        """
+        # Download the data config file of BioBridgePrimeKG
+        self._download_file(
+            remote_url=('https://raw.githubusercontent.com/RyanWangZf/BioBridge/'
+                        'refs/heads/main/data/BindData/data_config.json'),
+            local_dir=self.local_dir,
+            local_filename='data_config.json')
+
+        # Load the downloaded data config file
+        with open(os.path.join(self.local_dir, 'data_config.json'), 'r', encoding='utf-8') as f:
+            data_config = json.load(f)
+
+        return data_config
+
+    def _build_node_embeddings(self) -> dict:
+        """
+        Build the node embeddings for BioBridgePrimeKG dataset.
+
+        Returns:
+            The dictionary of node embeddings.
+        """
+        processed_file_path = os.path.join(self.local_dir, "embeddings", "embedding_dict.pkl")
+        if os.path.exists(processed_file_path):
+            # Load the embeddings from the local directory
+            with open(processed_file_path, "rb") as f:
+                emb_dict_all = pickle.load(f)
+        else:
+            # Download the embeddings from the BioBridge repo and further process them
+            # List of embedding source files
+            url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
+                   'refs/heads/main/data/embeddings/esm2b_unimo_pubmedbert/')
+            file_list = [f"{n}.pkl" for n in self.preselected_node_types]
+
+            # Download the embeddings
+            for file in file_list:
+                self._download_file(remote_url=os.path.join(url, file),
+                                    local_dir=os.path.join(self.local_dir, "embeddings"),
+                                    local_filename=file)
+
+            # Unified embeddings
+            emb_dict_all = {}
+            for file in file_list:
+                with open(os.path.join(self.local_dir, "embeddings", file), "rb") as f:
+                    emb = pickle.load(f)
+                emb_ar = emb["embedding"]
+                if not isinstance(emb_ar, list):
+                    emb_ar = emb_ar.tolist()
+                emb_dict_all.update(dict(zip(emb["node_index"], emb_ar)))
+
+            # Store embeddings
+            with open(processed_file_path, "wb") as f:
+                pickle.dump(emb_dict_all, f)
+
+        return emb_dict_all
+
+    def _build_full_triplets(self) -> tuple[pd.DataFrame, dict]:
+        """
+        Build the full triplets for BioBridgePrimeKG dataset.
+
+        Returns:
+            The full triplets for BioBridgePrimeKG dataset.
+            The dictionary of node information.
+        """
+        processed_file_path = os.path.join(self.local_dir, "processed", "triplet_full.tsv.gz")
+        if os.path.exists(processed_file_path):
+            # Load the file from the local directory
+            with open(processed_file_path, "rb") as f:
+                primekg_triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+            # Load each dataframe in the local directory
+            node_info_dict = {}
+            for i, node_type in enumerate(self.preselected_node_types):
+                with open(os.path.join(self.local_dir, "processed",
+                                       f"{node_type}.csv"), "rb") as f:
+                    df_node = pd.read_csv(f)
+                node_info_dict[self.node_type_map[node_type]] = df_node
+        else:
+            # Download the related files from the BioBridge repo and further process them
+            # List of processed files
+            url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
+                   'refs/heads/main/data/Processed/')
+            file_list = ["protein", "molecular", "cellular", "biological", "drug", "disease"]
+
+            # Download the processed files
+            for i, file in enumerate(file_list):
+                self._download_file(remote_url=os.path.join(url, f"{file}.csv"),
+                                    local_dir=os.path.join(self.local_dir, "processed"),
+                                    local_filename=f"{self.preselected_node_types[i]}.csv")
+
+            # Build the node index list
+            node_info_dict = {}
+            node_index_list = []
+            for i, file in enumerate(file_list):
+                df_node = pd.read_csv(os.path.join(self.local_dir, "processed",
+                                                   f"{self.preselected_node_types[i]}.csv"))
+                node_info_dict[self.node_type_map[self.preselected_node_types[i]]] = df_node
+                node_index_list.extend(df_node["node_index"].tolist())
+
+            # Filter the PrimeKG dataset to take into account only the selected node types
+            primekg_triplets = self.primekg.get_edges().copy()
+            primekg_triplets = primekg_triplets[
+                primekg_triplets["head_index"].isin(node_index_list) &\
+                primekg_triplets["tail_index"].isin(node_index_list)
+            ]
+            primekg_triplets = primekg_triplets.reset_index(drop=True)
+
+            # Perform mapping of node types
+            primekg_triplets["head_type"] = primekg_triplets["head_type"].apply(
+                lambda x: self.data_config["node_type"][x]
+            )
+            primekg_triplets["tail_type"] = primekg_triplets["tail_type"].apply(
+                lambda x: self.data_config["node_type"][x]
+            )
+
+            # Perform mapping of relation types
+            primekg_triplets["display_relation"] = primekg_triplets["display_relation"].apply(
+                lambda x: self.data_config["relation_type"][x]
+            )
+
+            # Store the processed triplets
+            primekg_triplets.to_csv(processed_file_path, sep="\t", compression="gzip", index=False)
+
+        return primekg_triplets, node_info_dict
+
+    def _build_train_test_split(self) -> tuple[pd.DataFrame, pd.DataFrame,
+                                               pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Build the train-test split for BioBridgePrimeKG dataset.
+
+        Returns:
+            The train triplets for BioBridgePrimeKG dataset.
+            The train nodes for BioBridgePrimeKG dataset.
+            The test triplets for BioBridgePrimeKG dataset.
+            The test nodes for BioBridgePrimeKG dataset.
+            The full triplets for BioBridgePrimeKG dataset.
+        """
+        if os.path.exists(os.path.join(self.local_dir, "processed",
+                                       "triplet_full_altered.tsv.gz")):
+            # Load each dataframe in the local directory
+            with open(os.path.join(self.local_dir, "processed",
+                                   "triplet_train.tsv.gz"), "rb") as f:
+                df_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+            with open(os.path.join(self.local_dir, "processed",
+                                   "node_train.tsv.gz"), "rb") as f:
+                df_node_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+            with open(os.path.join(self.local_dir, "processed",
+                                   "triplet_test.tsv.gz"), "rb") as f:
+                df_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+            with open(os.path.join(self.local_dir, "processed",
+                                   "node_test.tsv.gz"), "rb") as f:
+                df_node_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+            with open(os.path.join(self.local_dir, "processed",
+                                   "triplet_full_altered.tsv.gz"), "rb") as f:
+                triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+        else:
+            # Filtering out some nodes in the embedding dictionary
+            triplets = self.primekg_triplets.copy()
+            triplets = triplets[
+                triplets["head_index"].isin(list(self.emb_dict.keys())) &\
+                triplets["tail_index"].isin(list(self.emb_dict.keys()))
+            ].reset_index(drop=True)
+
+            # Perform splitting of the triplets
+            list_split = {
+                "train": [],
+                "test": [],
+            }
+            node_split = {
+                "train": {
+                    "node_index": [],
+                    "node_type": [],
+                },
+                "test": {
+                    "node_index": [],
+                    "node_type": [],
+                }
+            }
+            # Loop over the node types
+            for node_type in triplets["head_type"].unique():
+                df_sub = triplets[triplets["head_type"] == node_type]
+                all_x_indexes = df_sub["head_index"].unique()
+                # By default, we use 90% of the nodes for training and 10% for testing
+                te_x_indexes = np.random.choice(
+                    all_x_indexes, size=int(0.1*len(all_x_indexes)), replace=False
+                )
+                df_subs = {}
+                df_subs["test"] = df_sub[df_sub["head_index"].isin(te_x_indexes)]
+                df_subs["train"] = df_sub[~df_sub["head_index"].isin(te_x_indexes)]
+                list_split["train"].append(df_subs["train"])
+                list_split["test"].append(df_subs["test"])
+
+                # record the split
+                node_index = {}
+                node_index["train"] = df_subs["train"]["head_index"].unique()
+                node_split["train"]["node_index"].extend(node_index["train"].tolist())
+                node_split["train"]["node_type"].extend([node_type]*len(node_index["train"]))
+                node_index["test"] = df_subs["test"]["head_index"].unique()
+                node_split["test"]["node_index"].extend(node_index["test"].tolist())
+                node_split["test"]["node_type"].extend([node_type]*len(node_index["test"]))
+
+                print(f"Number of {node_type} nodes in train: {len(node_index['train'])}")
+                print(f"Number of {node_type} nodes in test: {len(node_index['test'])}")
+
+            # Prepare train and test DataFrames
+            df_train = pd.concat(list_split["train"])
+            df_node_train = pd.DataFrame(node_split["train"])
+            df_test = pd.concat(list_split["test"])
+            df_node_test = pd.DataFrame(node_split["test"])
+
+            # Store each dataframe in the local directory
+            df_train.to_csv(os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"),
+                            sep="\t", compression="gzip", index=False)
+            df_node_train.to_csv(os.path.join(self.local_dir, "processed", "node_train.tsv.gz"),
+                                 sep="\t", compression="gzip", index=False)
+            df_test.to_csv(os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"),
+                           sep="\t", compression="gzip", index=False)
+            df_node_test.to_csv(os.path.join(self.local_dir, "processed", "node_test.tsv.gz"),
+                                sep="\t", compression="gzip", index=False)
+            # Store altered full triplets as well
+            triplets.to_csv(os.path.join(self.local_dir, "processed",
+                                         "triplet_full_altered.tsv.gz"),
+                            sep="\t", compression="gzip", index=False)
+
+        return df_train, df_node_train, df_test, df_node_test, triplets
+
+    # def _negative_sampling(self,
+    #                        batch_df: pd.DataFrame,
+    #                        process_index: int,
+    #                        index_map: dict,
+    #                        node_train_dict: dict) -> pd.DataFrame:
+    #     """
+    #     A helper function to perform negative sampling for a batch of triplets.
+    #     """
+    #     negative_y_index_list = []
+    #     for _, row in tqdm(batch_df.iterrows(),
+    #                        total=batch_df.shape[0],
+    #                        desc=f"Process {process_index}"):
+    #         x_index = row['head_index']
+    #         # y_index = row['y_index']
+    #         y_index_type = row['tail_type']
+    #         paired_y_index_list = index_map[x_index]
+
+    #         # sample a list of negative y_index
+    #         node_train_sub = node_train_dict[y_index_type]
+    #         negative_y_index = node_train_sub[
+    #             ~node_train_sub['node_index'].isin(paired_y_index_list)
+    #         ]['node_index'].sample(self.n_neg_samples).tolist()
+    #         negative_y_index_list.append(negative_y_index)
+
+    #     batch_df.loc[:, 'negative_tail_index'] = negative_y_index_list
+    #     return batch_df
+
+    # def _build_negative_triplets(self,
+    #                              chunk_size: int=100000,
+    #                              n_neg_samples: int=10):
+    #     """
+    #     Build the negative triplets for BioBridgePrimeKG dataset.
+    #     """
+    #     processed_file_path = os.path.join(self.local_dir,
+    #                                        "processed",
+    #                                        "triplet_train_negative.tsv.gz")
+    #     if os.path.exists(processed_file_path):
+    #         # Load the negative triplets from the local directory
+    #         with open(processed_file_path, "rb") as f:
+    #             triplets_negative = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+    #     else:
+    #         # Set the number samples for negative sampling
+    #         self.n_neg_samples = n_neg_samples
+
+    #         # Split node list by type
+    #         node_train_dict = {}
+    #         type_list = self.df_node_train['node_type'].unique()
+    #         for node_type in type_list:
+    #             node_train_dict[node_type] = self.df_node_train[
+    #                 self.df_node_train['node_type'] == node_type
+    #             ].reset_index(drop=True)
+
+    #         # create an index mapping from x_index to y_index
+    #         index_map = self.df_train[
+    #             ['head_index', 'tail_index']
+    #         ].drop_duplicates().groupby('head_index').agg(list).to_dict()['tail_index']
+
+    #         # Negative sampling
+    #         batch_df_list = []
+    #         for i in tqdm(range(0, self.df_train.shape[0], chunk_size)):
+    #             batch_df_list.append(self.df_train.iloc[i:i+chunk_size])
+    #         # Process negative sampling
+    #         results = [
+    #             self._negative_sampling(batch_df,
+    #                                     num_piece,
+    #                                     index_map,
+    #                                     node_train_dict)
+    #             for num_piece, batch_df in enumerate(batch_df_list)
+    #         ]
+
+    #         # Store the negative triplets
+    #         triplets_negative = pd.concat(results, axis=0)
+    #         triplets_negative.to_csv(processed_file_path,
+    #                                  sep="\t", compression="gzip", index=False)
+
+    #     # Set attribute
+    #     self.primekg_triplets_negative = triplets_negative
+
+    #     return triplets_negative
+
+    # def load_data(self,
+    #               build_neg_triplest: bool= False,
+    #               chunk_size: int=100000,
+    #               n_neg_samples: int=10):
+
+    def load_data(self):
+        """
+        Load the BioBridgePrimeKG dataset into pandas DataFrame of nodes and edges.
+
+        Args:
+            build_neg_triplest (bool): Whether to build negative triplets.
+            chunk_size (int): The chunk size for negative sampling.
+            n_neg_samples (int): The number of negative samples for negative sampling.
+        """
+        # Load PrimeKG dataset
+        print("Loading PrimeKG dataset...")
+        self.primekg = self._load_primekg()
+
+        # Load data config file of BioBridgePrimeKG
+        print("Loading data config file of BioBridgePrimeKG...")
+        self.data_config = self._load_data_config()
+
+        # Build node embeddings
+        print("Building node embeddings...")
+        self.emb_dict = self._build_node_embeddings()
+
+        # Build full triplets
+        print("Building full triplets...")
+        self.primekg_triplets, self.node_info_dict = self._build_full_triplets()
+
+        # Build train-test split
+        print("Building train-test split...")
+        self.df_train, self.df_node_train, self.df_test, self.df_node_test, self.primekg_triplets =\
+            self._build_train_test_split()
+
+        # if build_neg_triplest:
+        #     # Build negative triplets
+        #     print("Building negative triplets...")
+        #     self.primekg_triplets_negative = self._build_negative_triplets(
+        #         chunk_size=chunk_size,
+        #         n_neg_samples=n_neg_samples
+        #     )
+
+    def set_random_seed(self, seed: int):
+        """
+        Set the random seed for reproducibility.
+
+        Args:
+            seed (int): The random seed value.
+        """
+        np.random.seed(seed)
+
+    def get_primekg(self) -> PrimeKG:
+        """
+        Get the PrimeKG dataset.
+
+        Returns:
+            The PrimeKG dataset.
+        """
+        return self.primekg
+
+    def get_data_config(self) -> dict:
+        """
+        Get the data config file of BioBridgePrimeKG dataset.
+
+        Returns:
+            The data config file of BioBridgePrimeKG dataset.
+        """
+        return self.data_config
+
+    def get_node_embeddings(self) -> dict:
+        """
+        Get the node embeddings for BioBridgePrimeKG dataset.
+
+        Returns:
+            The dictionary of node embeddings.
+        """
+        return self.emb_dict
+
+    def get_primekg_triplets(self) -> pd.DataFrame:
+        """
+        Get the full triplets for BioBridgePrimeKG dataset.
+
+        Returns:
+            The full triplets for BioBridgePrimeKG dataset.
+        """
+        return self.primekg_triplets
+
+    # def get_primekg_triplets_negative(self) -> pd.DataFrame:
+    #     """
+    #     Get the negative triplets for BioBridgePrimeKG dataset.
+
+    #     Returns:
+    #         The negative triplets for BioBridgePrimeKG dataset.
+    #     """
+    #     return self.primekg_triplets_negative
+
+    def get_train_test_split(self) -> dict:
+        """
+        Get the train-test split for BioBridgePrimeKG dataset.
+
+        Returns:
+            The train-test split for BioBridgePrimeKG dataset.
+        """
+        return {
+            "train": self.df_train,
+            "node_train": self.df_node_train,
+            "test": self.df_test,
+            "node_test": self.df_node_test
+        }
+
+    def get_node_info_dict(self) -> dict:
+        """
+        Get the node information dictionary for BioBridgePrimeKG dataset.
+
+        Returns:
+            The node information dictionary for BioBridgePrimeKG dataset.
+        """
+        return self.node_info_dict
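For orientation, a minimal usage sketch of the new class (not part of the release itself). The directory paths are placeholder assumptions; each build step above checks for cached files, so the BioBridge downloads and processing only run on the first call:

    from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG

    # Hypothetical local paths. The constructor runs setup(), which creates the
    # directories and seeds NumPy; load_data() then fetches and processes PrimeKG,
    # the BioBridge data config, the node embeddings, and the train/test split.
    data = BioBridgePrimeKG(primekg_dir="./data/primekg/",
                            local_dir="./data/biobridge_primekg/")
    data.load_data()

    split = data.get_train_test_split()      # {"train", "node_train", "test", "node_test"}
    triplets = data.get_primekg_triplets()   # filtered, type-mapped triplet DataFrame
    emb_dict = data.get_node_embeddings()    # node_index -> embedding vector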
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py

@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+
+"""
+Abstract class for dataset.
+"""
+
+from abc import ABC, abstractmethod
+
+class Dataset(ABC):
+    """
+    Abstract class for dataset.
+    """
+    @abstractmethod
+    def setup(self):
+        """
+        A method to set up the dataset.
+        """
+
+    @abstractmethod
+    def load_data(self):
+        """
+        A method to load the dataset and potentially preprocess it.
+        """
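This base class only fixes the two-method contract that the concrete loaders in this release follow. A minimal, purely hypothetical subclass would look like this:

    from aiagents4pharma.talk2knowledgegraphs.datasets.dataset import Dataset

    class ToyDataset(Dataset):
        """Hypothetical loader illustrating the Dataset contract."""
        def setup(self):
            # One-time preparation: directories, seeds, empty attributes.
            self.records = None

        def load_data(self):
            # Download and/or preprocess the data here.
            self.records = [("aspirin", "indication", "pain")]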
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py

@@ -0,0 +1,201 @@
+"""
+Class for loading PrimeKG dataset.
+"""
+
+import os
+import requests
+from tqdm import tqdm
+import pandas as pd
+from .dataset import Dataset
+
+class PrimeKG(Dataset):
+    """
+    Class for loading PrimeKG dataset.
+    It downloads the data from the Harvard Dataverse and stores it in the local directory.
+    The data is then loaded into pandas DataFrame of nodes and edges.
+    """
+
+    def __init__(self, local_dir: str = "../../../data/primekg/"):
+        """
+        Constructor for PrimeKG class.
+
+        Args:
+            local_dir (str): The local directory where the data will be stored.
+        """
+        self.name: str = "primekg"
+        self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
+        self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
+        self.local_dir: str = local_dir
+
+        # Attributes to store the data
+        self.nodes: pd.DataFrame = None
+        self.edges: pd.DataFrame = None
+
+        # Set up the dataset
+        self.setup()
+
+    def setup(self):
+        """
+        A method to set up the dataset.
+        """
+        # Make the directory if it doesn't exist
+        os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+
+    def _download_file(self, remote_url:str, local_path: str):
+        """
+        A helper function to download a file from remote URL to the local directory.
+
+        Args:
+            remote_url (str): The remote URL of the file to be downloaded.
+            local_path (str): The local path where the file will be saved.
+        """
+        response = requests.get(remote_url, stream=True, timeout=300)
+        response.raise_for_status()
+        progress_bar = tqdm(
+            total=int(response.headers.get("content-length", 0)),
+            unit="iB",
+            unit_scale=True,
+        )
+        with open(local_path, "wb") as file:
+            for data in response.iter_content(1024):
+                progress_bar.update(len(data))
+                file.write(data)
+        progress_bar.close()
+
+    def _load_nodes(self) -> pd.DataFrame:
+        """
+        Private method to load the nodes dataframe of PrimeKG dataset.
+        This method downloads the nodes file from the Harvard Dataverse if it does not exist
+        in the local directory. Otherwise, it loads the data from the local directory.
+        It further processes the dataframe of nodes and returns it.
+
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+
+            # Load the dataframe from the local directory and assign it to the nodes attribute
+            nodes = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        else:
+            print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
+
+            # Download the file from the Harvard Dataverse with designated file_id for node
+            self._download_file(f"{self.server_path}{self.file_ids['nodes']}",
+                                os.path.join(self.local_dir, "nodes.tab"))
+
+            # Load the downloaded file into a pandas DataFrame
+            nodes = pd.read_csv(os.path.join(self.local_dir, "nodes.tab"),
+                                sep="\t", low_memory=False)
+
+            # Further processing of the dataframe
+            nodes = nodes[
+                ["node_index", "node_name", "node_source", "node_id", "node_type"]
+            ]
+
+            # Store compressed dataframe in the local directory
+            nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+        return nodes
+
+    def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
+        """
+        Private method to load the edges dataframe of PrimeKG dataset.
+        This method downloads the edges file from the Harvard Dataverse if it does not exist
+        in the local directory. Otherwise, it loads the data from the local directory.
+        It further processes the dataframe of edges and returns it.
+
+        Args:
+            nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.
+
+        Returns:
+            The edges dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+
+            # Load the dataframe from the local directory and assign it to the edges attribute
+            edges = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        else:
+            print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
+
+            # Download the file from the Harvard Dataverse with designated file_id for edge
+            self._download_file(f"{self.server_path}{self.file_ids['edges']}",
+                                os.path.join(self.local_dir, "edges.csv"))
+
+            # Load the downloaded file into a pandas DataFrame
+            edges = pd.read_csv(os.path.join(self.local_dir, "edges.csv"),
+                                sep=",", low_memory=False)
+
+            # Further processing of the dataframe
+            edges = edges.merge(
+                nodes, left_on="x_index", right_on="node_index"
+            )
+            edges.drop(["x_index"], axis=1, inplace=True)
+            edges.rename(
+                columns={
+                    "node_index": "head_index",
+                    "node_name": "head_name",
+                    "node_source": "head_source",
+                    "node_id": "head_id",
+                    "node_type": "head_type",
+                },
+                inplace=True,
+            )
+            edges = edges.merge(
+                nodes, left_on="y_index", right_on="node_index"
+            )
+            edges.drop(["y_index"], axis=1, inplace=True)
+            edges.rename(
+                columns={
+                    "node_index": "tail_index",
+                    "node_name": "tail_name",
+                    "node_source": "tail_source",
+                    "node_id": "tail_id",
+                    "node_type": "tail_type"
+                },
+                inplace=True,
+            )
+            edges = edges[
+                [
+                    "head_index", "head_name", "head_source", "head_id", "head_type",
+                    "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
+                    "display_relation", "relation",
+                ]
+            ]
+
+            # Store compressed dataframe in the local directory
+            edges.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+        return edges
+
+    def load_data(self):
+        """
+        Load the PrimeKG dataset into pandas DataFrame of nodes and edges.
+        """
+        print("Loading nodes of PrimeKG dataset ...")
+        self.nodes = self._load_nodes()
+
+        print("Loading edges of PrimeKG dataset ...")
+        self.edges = self._load_edges(self.nodes)
+
+    def get_nodes(self) -> pd.DataFrame:
+        """
+        Get the nodes dataframe of PrimeKG dataset.
+
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        return self.nodes
+
+    def get_edges(self) -> pd.DataFrame:
+        """
+        Get the edges dataframe of PrimeKG dataset.
+
+        Returns:
+            The edges dataframe of PrimeKG dataset.
+        """
+        return self.edges
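A brief usage sketch for this loader (illustrative only; the local path is an assumption). The nodes and edges files are fetched from the Harvard Dataverse once and reloaded from the cached *.tsv.gz files on subsequent runs:

    from aiagents4pharma.talk2knowledgegraphs.datasets.primekg import PrimeKG

    primekg = PrimeKG(local_dir="./data/primekg/")  # hypothetical path
    primekg.load_data()

    nodes = primekg.get_nodes()  # node_index, node_name, node_source, node_id, node_type
    edges = primekg.get_edges()  # head_*/tail_* columns plus display_relation, relation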
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py

@@ -0,0 +1,201 @@
+"""
+Class for loading StarkQAPrimeKG dataset.
+"""
+
+import os
+import shutil
+import pickle
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import torch
+from huggingface_hub import hf_hub_download, list_repo_files
+import gdown
+from .dataset import Dataset
+
+class StarkQAPrimeKG(Dataset):
+    """
+    Class for loading StarkQAPrimeKG dataset.
+    It downloads the data from the HuggingFace repo and stores it in the local directory.
+    The data is then loaded into pandas DataFrame of QA pairs, dictionary of split indices,
+    and node information.
+    """
+
+    def __init__(self, local_dir: str = "../../../data/starkqa_primekg/"):
+        """
+        Constructor for StarkQAPrimeKG class.
+
+        Args:
+            local_dir (str): The local directory to store the dataset files.
+        """
+        self.name: str = "starkqa_primekg"
+        self.hf_repo_id: str = "snap-stanford/stark"
+        self.local_dir: str = local_dir
+        # Attributes to store the data
+        self.starkqa: pd.DataFrame = None
+        self.starkqa_split_idx: dict = None
+        self.starkqa_node_info: dict = None
+        self.query_emb_dict: dict = None
+        self.node_emb_dict: dict = None
+
+        # Set up the dataset
+        self.setup()
+
+    def setup(self):
+        """
+        A method to set up the dataset.
+        """
+        # Make the directory if it doesn't exist
+        os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+    def _load_stark_repo(self) -> tuple[pd.DataFrame, dict, dict]:
+        """
+        Private method to load related files of StarkQAPrimeKG dataset.
+
+        Returns:
+            The nodes dataframe of StarkQAPrimeKG dataset.
+            The split indices of StarkQAPrimeKG dataset.
+            The node information of StarkQAPrimeKG dataset.
+        """
+        # Download the file if it does not exist in the local directory
+        # Otherwise, load the data from the local directory
+        local_file = os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+        else:
+            print(f"Downloading files from {self.hf_repo_id}")
+
+            # List all related files in the HuggingFace Hub repository
+            files = list_repo_files(self.hf_repo_id, repo_type="dataset")
+            files = [f for f in files if ((f.startswith("qa/prime/") or
+                                           f.startswith("skb/prime/")) and f.find("raw") == -1)]
+
+            # Download and save each file in the specified folder
+            for file in tqdm(files):
+                _ = hf_hub_download(self.hf_repo_id,
+                                    file,
+                                    repo_type="dataset",
+                                    local_dir=self.local_dir)
+
+            # Unzip the processed files
+            shutil.unpack_archive(
+                os.path.join(self.local_dir, "skb/prime/processed.zip"),
+                os.path.join(self.local_dir, "skb/prime/")
+            )
+
+        # Load StarkQA dataframe
+        starkqa = pd.read_csv(
+            os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv"),
+            low_memory=False)
+
+        # Read split indices
+        qa_indices = sorted(starkqa['id'].tolist())
+        starkqa_split_idx = {}
+        for split in ['train', 'val', 'test', 'test-0.1']:
+            indices_file = os.path.join(self.local_dir, "qa/prime/split", f'{split}.index')
+            with open(indices_file, 'r', encoding='utf-8') as f:
+                indices = f.read().strip().split('\n')
+            query_ids = [int(idx) for idx in indices]
+            starkqa_split_idx[split] = np.array(
+                [qa_indices.index(query_id) for query_id in query_ids]
+            )
+
+        # Load the node info of PrimeKG preprocessed for StarkQA
+        with open(os.path.join(self.local_dir, 'skb/prime/processed/node_info.pkl'), 'rb') as f:
+            starkqa_node_info = pickle.load(f)
+
+        return starkqa, starkqa_split_idx, starkqa_node_info
+
+    def _load_stark_embeddings(self) -> tuple[dict, dict]:
+        """
+        Private method to load the embeddings of StarkQAPrimeKG dataset.
+
+        Returns:
+            The query embeddings of StarkQAPrimeKG dataset.
+            The node embeddings of StarkQAPrimeKG dataset.
+        """
+        # Load the provided embeddings of query and nodes
+        # Note that they utilized 'text-embedding-ada-002' for embeddings
+        emb_model = 'text-embedding-ada-002'
+        query_emb_url = 'https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU'
+        node_emb_url = 'https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy'
+
+        # Prepare respective directories to store the embeddings
+        emb_dir = os.path.join(self.local_dir, emb_model)
+        query_emb_dir = os.path.join(emb_dir, "query")
+        node_emb_dir = os.path.join(emb_dir, "doc")
+        os.makedirs(query_emb_dir, exist_ok=True)
+        os.makedirs(node_emb_dir, exist_ok=True)
+        query_emb_path = os.path.join(query_emb_dir, "query_emb_dict.pt")
+        node_emb_path = os.path.join(node_emb_dir, "candidate_emb_dict.pt")
+
+        # Download the embeddings if they do not exist in the local directory
+        if not os.path.exists(query_emb_path) or not os.path.exists(node_emb_path):
+            # Download the query embeddings
+            gdown.download(query_emb_url, query_emb_path, quiet=False)
+
+            # Download the node embeddings
+            gdown.download(node_emb_url, node_emb_path, quiet=False)
+
+        # Load the embeddings
+        query_emb_dict = torch.load(query_emb_path)
+        node_emb_dict = torch.load(node_emb_path)
+
+        return query_emb_dict, node_emb_dict
+
+    def load_data(self):
+        """
+        Load the StarkQAPrimeKG dataset into pandas DataFrame of QA pairs,
+        dictionary of split indices, and node information.
+        """
+        print("Loading StarkQAPrimeKG dataset...")
+        self.starkqa, self.starkqa_split_idx, self.starkqa_node_info = self._load_stark_repo()
+
+        print("Loading StarkQAPrimeKG embeddings...")
+        self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()
+
+
+    def get_starkqa(self) -> pd.DataFrame:
+        """
+        Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.
+
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        return self.starkqa
+
+    def get_starkqa_split_indicies(self) -> dict:
+        """
+        Get the split indices of StarkQAPrimeKG dataset.
+
+        Returns:
+            The split indices of StarkQAPrimeKG dataset.
+        """
+        return self.starkqa_split_idx
+
+    def get_starkqa_node_info(self) -> dict:
+        """
+        Get the node information of StarkQAPrimeKG dataset.
+
+        Returns:
+            The node information of StarkQAPrimeKG dataset.
+        """
+        return self.starkqa_node_info
+
+    def get_query_embeddings(self) -> dict:
+        """
+        Get the query embeddings of StarkQAPrimeKG dataset.
+
+        Returns:
+            The query embeddings of StarkQAPrimeKG dataset.
+        """
+        return self.query_emb_dict
+
+    def get_node_embeddings(self) -> dict:
+        """
+        Get the node embeddings of StarkQAPrimeKG dataset.
+
+        Returns:
+            The node embeddings of StarkQAPrimeKG dataset.
+        """
+        return self.node_emb_dict
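A corresponding sketch for StarkQAPrimeKG (the path again being a placeholder); the HuggingFace files and Google Drive embeddings are only downloaded when missing from local_dir. Note the split getter is spelled get_starkqa_split_indicies in this release:

    from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG

    starkqa = StarkQAPrimeKG(local_dir="./data/starkqa_primekg/")  # hypothetical path
    starkqa.load_data()

    qa_pairs = starkqa.get_starkqa()                  # DataFrame of QA pairs
    split_idx = starkqa.get_starkqa_split_indicies()  # train/val/test/test-0.1 index arrays
    query_emb = starkqa.get_query_embeddings()        # 'text-embedding-ada-002' vectors
    node_emb = starkqa.get_node_embeddings()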
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: aiagents4pharma
-Version: 1.4.3
+Version: 1.5.4
 Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
@@ -10,13 +10,16 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: copasi_basico==0.78
 Requires-Dist: coverage==7.6.4
+Requires-Dist: gdown==5.2.0
+Requires-Dist: huggingface_hub==0.26.5
+Requires-Dist: joblib==1.4.2
 Requires-Dist: langchain==0.3.7
 Requires-Dist: langchain-community==0.3.5
 Requires-Dist: langchain-core==0.3.15
 Requires-Dist: langchain-experimental==0.3.3
 Requires-Dist: langchain-openai==0.2.5
 Requires-Dist: matplotlib==3.9.2
-Requires-Dist: openai==1.
+Requires-Dist: openai==1.59.4
 Requires-Dist: pandas==2.2.3
 Requires-Dist: plotly==5.24.1
 Requires-Dist: pydantic==2.9.2
@@ -24,6 +27,7 @@ Requires-Dist: pylint==3.3.1
 Requires-Dist: pytest==8.3.3
 Requires-Dist: streamlit==1.39.0
 Requires-Dist: tabulate==0.9.0
+Requires-Dist: torch==2.5.1
 Requires-Dist: tqdm==4.66.6
 Requires-Dist: mkdocs==1.6.1
 Requires-Dist: mkdocs-jupyter==0.25.1
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/SOURCES.txt

@@ -18,4 +18,10 @@ aiagents4pharma/talk2biomodels/tools/custom_plotter.py
 aiagents4pharma/talk2biomodels/tools/fetch_parameters.py
 aiagents4pharma/talk2biomodels/tools/model_description.py
 aiagents4pharma/talk2biomodels/tools/search_models.py
-aiagents4pharma/talk2biomodels/tools/simulate_model.py
+aiagents4pharma/talk2biomodels/tools/simulate_model.py
+aiagents4pharma/talk2knowledgegraphs/__init__.py
+aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py
+aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py
+aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py
+aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
+aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/requires.txt

@@ -1,12 +1,15 @@
 copasi_basico==0.78
 coverage==7.6.4
+gdown==5.2.0
+huggingface_hub==0.26.5
+joblib==1.4.2
 langchain==0.3.7
 langchain-community==0.3.5
 langchain-core==0.3.15
 langchain-experimental==0.3.3
 langchain-openai==0.2.5
 matplotlib==3.9.2
-openai==1.
+openai==1.59.4
 pandas==2.2.3
 plotly==5.24.1
 pydantic==2.9.2
@@ -14,6 +17,7 @@ pylint==3.3.1
 pytest==8.3.3
 streamlit==1.39.0
 tabulate==0.9.0
+torch==2.5.1
 tqdm==4.66.6
 mkdocs==1.6.1
 mkdocs-jupyter==0.25.1
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/pyproject.toml

@@ -15,13 +15,16 @@ classifiers = [
 dependencies = [
     "copasi_basico==0.78",
     "coverage==7.6.4",
+    "gdown==5.2.0",
+    "huggingface_hub==0.26.5",
+    "joblib==1.4.2",
     "langchain==0.3.7",
     "langchain-community==0.3.5",
     "langchain-core==0.3.15",
     "langchain-experimental==0.3.3",
     "langchain-openai==0.2.5",
     "matplotlib==3.9.2",
-    "openai==1.
+    "openai==1.59.4",
     "pandas==2.2.3",
     "plotly==5.24.1",
     "pydantic==2.9.2",
@@ -29,6 +32,7 @@ dependencies = [
     "pytest==8.3.3",
     "streamlit==1.39.0",
     "tabulate==0.9.0",
+    "torch==2.5.1",
     "tqdm==4.66.6",
     "mkdocs==1.6.1",
     "mkdocs-jupyter==0.25.1",
@@ -48,7 +52,9 @@ version = {file = "release_version.txt"}
 packages = ["aiagents4pharma",
             "aiagents4pharma.talk2biomodels",
             "aiagents4pharma.talk2biomodels.models",
-            "aiagents4pharma.talk2biomodels.tools"]
+            "aiagents4pharma.talk2biomodels.tools",
+            "aiagents4pharma.talk2knowledgegraphs",
+            "aiagents4pharma.talk2knowledgegraphs.datasets"]

 # [tool.setuptools.packages.find]
 # where = ["aiagents4pharma", "aiagents4pharma.talk2biomodels"]
aiagents4pharma-1.5.4/release_version.txt

@@ -0,0 +1 @@
+v1.5.4
aiagents4pharma-1.4.3/release_version.txt

@@ -1 +0,0 @@
-v1.4.3
The remaining files were renamed from aiagents4pharma-1.4.3/ to aiagents4pharma-1.5.4/ without content changes, as listed (+0 -0) at the top of this diff.