aiagents4pharma 1.4.3__tar.gz → 1.5.4__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (31)
  1. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/PKG-INFO +6 -2
  2. aiagents4pharma-1.5.4/aiagents4pharma/__init__.py +7 -0
  3. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/__init__.py +4 -0
  4. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +7 -0
  5. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +562 -0
  6. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +23 -0
  7. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +201 -0
  8. aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +201 -0
  9. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/PKG-INFO +6 -2
  10. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/SOURCES.txt +7 -1
  11. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/requires.txt +5 -1
  12. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/pyproject.toml +8 -2
  13. aiagents4pharma-1.5.4/release_version.txt +1 -0
  14. aiagents4pharma-1.4.3/aiagents4pharma/__init__.py +0 -5
  15. aiagents4pharma-1.4.3/release_version.txt +0 -1
  16. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/LICENSE +0 -0
  17. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/README.md +0 -0
  18. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/__init__.py +0 -0
  19. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/__init__.py +0 -0
  20. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/basico_model.py +0 -0
  21. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/models/sys_bio_model.py +0 -0
  22. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/__init__.py +0 -0
  23. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/ask_question.py +0 -0
  24. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/custom_plotter.py +0 -0
  25. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/fetch_parameters.py +0 -0
  26. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/model_description.py +0 -0
  27. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/search_models.py +0 -0
  28. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma/talk2biomodels/tools/simulate_model.py +0 -0
  29. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/dependency_links.txt +0 -0
  30. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/top_level.txt +0 -0
  31. {aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/setup.cfg +0 -0
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: aiagents4pharma
- Version: 1.4.3
+ Version: 1.5.4
  Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
@@ -10,13 +10,16 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: copasi_basico==0.78
  Requires-Dist: coverage==7.6.4
+ Requires-Dist: gdown==5.2.0
+ Requires-Dist: huggingface_hub==0.26.5
+ Requires-Dist: joblib==1.4.2
  Requires-Dist: langchain==0.3.7
  Requires-Dist: langchain-community==0.3.5
  Requires-Dist: langchain-core==0.3.15
  Requires-Dist: langchain-experimental==0.3.3
  Requires-Dist: langchain-openai==0.2.5
  Requires-Dist: matplotlib==3.9.2
- Requires-Dist: openai==1.55.3
+ Requires-Dist: openai==1.59.4
  Requires-Dist: pandas==2.2.3
  Requires-Dist: plotly==5.24.1
  Requires-Dist: pydantic==2.9.2
@@ -24,6 +27,7 @@ Requires-Dist: pylint==3.3.1
  Requires-Dist: pytest==8.3.3
  Requires-Dist: streamlit==1.39.0
  Requires-Dist: tabulate==0.9.0
+ Requires-Dist: torch==2.5.1
  Requires-Dist: tqdm==4.66.6
  Requires-Dist: mkdocs==1.6.1
  Requires-Dist: mkdocs-jupyter==0.25.1
aiagents4pharma-1.5.4/aiagents4pharma/__init__.py
@@ -0,0 +1,7 @@
+ '''
+ This file is used to import aiagents4pharma modules.
+ '''
+
+ from . import talk2biomodels
+ from . import talk2cells
+ from . import talk2knowledgegraphs
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/__init__.py
@@ -0,0 +1,4 @@
+ '''
+ This file is used to import the datasets, utils, and tools.
+ '''
+ from . import datasets
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py
@@ -0,0 +1,7 @@
+ '''
+ This file is used to import all the models in the package.
+ '''
+ from . import dataset
+ from . import primekg
+ from . import starkqa_primekg
+ from . import biobridge_primekg
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py
@@ -0,0 +1,562 @@
+ """
+ Class for loading BioBridgePrimeKG dataset.
+ """
+
+ import os
+ import pickle
+ import json
+ import requests
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ from .dataset import Dataset
+ from .primekg import PrimeKG
+
+ class BioBridgePrimeKG(Dataset):
+     """
+     Class for loading BioBridgePrimeKG dataset.
+     It downloads the data from the BioBridge repo and stores it in the local directory.
+     The data is then loaded into pandas DataFrame of nodes and edges.
+     This class was adapted from the BioBridge repo:
+     https://github.com/RyanWangZf/BioBridge
+     """
+
+     def __init__(self,
+                  primekg_dir: str = "../../../data/primekg/",
+                  local_dir: str = "../../../data/biobridge_primekg/",
+                  random_seed: int=0,
+                  n_neg_samples: int=5):
+         """
+         Constructor for BioBridgePrimeKG class.
+
+         Args:
+             primekg_dir (str): The directory of PrimeKG dataset.
+             local_dir (str): The directory to store the downloaded data.
+             random_seed (int): The random seed value.
+         """
+         self.name: str = "biobridge_primekg"
+         self.primekg_dir: str = primekg_dir
+         self.local_dir: str = local_dir
+         self.random_seed = random_seed
+         self.n_neg_samples = n_neg_samples
+         # Preselected node types:
+         # protein, molecular function, cellular component, biological process, drug, disease
+         self.preselected_node_types = ["protein", "mf", "cc", "bp", "drug", "disease"]
+         self.node_type_map = {
+             "protein": "gene/protein",
+             "mf": "molecular_function",
+             "cc": "cellular_component",
+             "bp": "biological_process",
+             "drug": "drug",
+             "disease": "disease",
+         }
+
+         # Attributes to store the data
+         self.primekg = None
+         self.primekg_triplets = None
+         self.primekg_triplets_negative = None
+         self.data_config = None
+         self.emb_dict = None
+         self.df_train = None
+         self.df_node_train = None
+         self.df_test = None
+         self.df_node_test = None
+         self.node_info_dict = None
+
+         # Set up the dataset
+         self.setup()
+
+     def setup(self):
+         """
+         A method to set up the dataset.
+         """
+         # Make the directories if it doesn't exist
+         os.makedirs(os.path.dirname(self.primekg_dir), exist_ok=True)
+         os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+         # Set the random seed
+         self.set_random_seed(self.random_seed)
+
+         # Set SettingWithCopyWarning warnings to none
+         pd.options.mode.chained_assignment = None
+
+     def _load_primekg(self) -> PrimeKG:
+         """
+         Private method to load related files of PrimeKG dataset.
+
+         Returns:
+             The PrimeKG dataset.
+         """
+         primekg_data = PrimeKG(local_dir=self.primekg_dir)
+         primekg_data.load_data()
+
+         return primekg_data
+
+     def _download_file(self,
+                        remote_url:str,
+                        local_dir: str,
+                        local_filename: str):
+         """
+         A helper function to download a file from remote URL to the local directory.
+
+         Args:
+             remote_url (str): The remote URL of the file to be downloaded.
+             local_dir (str): The local directory to store the downloaded file.
+             local_filename (str): The local filename to store the downloaded file.
+         """
+         # Make the local directory if it does not exist
+         if not os.path.exists(local_dir):
+             os.makedirs(local_dir)
+         # Download the file from remote URL to local directory
+         local_path = os.path.join(local_dir, local_filename)
+         if os.path.exists(local_path):
+             print(f"File {local_filename} already exists in {local_dir}.")
+         else:
+             print(f"Downloading {local_filename} from {remote_url} to {local_dir}...")
+             response = requests.get(remote_url, stream=True, timeout=300)
+             response.raise_for_status()
+             progress_bar = tqdm(
+                 total=int(response.headers.get("content-length", 0)),
+                 unit="iB",
+                 unit_scale=True,
+             )
+             with open(os.path.join(local_dir, local_filename), "wb") as file:
+                 for data in response.iter_content(1024):
+                     progress_bar.update(len(data))
+                     file.write(data)
+             progress_bar.close()
+
+     def _load_data_config(self) -> dict:
+         """
+         Load the data config file of BioBridgePrimeKG dataset.
+
+         Returns:
+             The data config file of BioBridgePrimeKG dataset.
+         """
+         # Download the data config file of BioBridgePrimeKG
+         self._download_file(
+             remote_url= ('https://raw.githubusercontent.com/RyanWangZf/BioBridge/'
+                          'refs/heads/main/data/BindData/data_config.json'),
+             local_dir=self.local_dir,
+             local_filename='data_config.json')
+
+         # Load the downloaded data config file
+         with open(os.path.join(self.local_dir, 'data_config.json'), 'r', encoding='utf-8') as f:
+             data_config = json.load(f)
+
+         return data_config
+
+     def _build_node_embeddings(self) -> dict:
+         """
+         Build the node embeddings for BioBridgePrimeKG dataset.
+
+         Returns:
+             The dictionary of node embeddings.
+         """
+         processed_file_path = os.path.join(self.local_dir, "embeddings", "embedding_dict.pkl")
+         if os.path.exists(processed_file_path):
+             # Load the embeddings from the local directory
+             with open(processed_file_path, "rb") as f:
+                 emb_dict_all = pickle.load(f)
+         else:
+             # Download the embeddings from the BioBridge repo and further process them
+             # List of embedding source files
+             url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
+                    'refs/heads/main/data/embeddings/esm2b_unimo_pubmedbert/')
+             file_list = [f"{n}.pkl" for n in self.preselected_node_types]
+
+             # Download the embeddings
+             for file in file_list:
+                 self._download_file(remote_url=os.path.join(url, file),
+                                     local_dir=os.path.join(self.local_dir, "embeddings"),
+                                     local_filename=file)
+
+             # Unified embeddings
+             emb_dict_all = {}
+             for file in file_list:
+                 with open(os.path.join(self.local_dir, "embeddings", file), "rb") as f:
+                     emb = pickle.load(f)
+                 emb_ar = emb["embedding"]
+                 if not isinstance(emb_ar, list):
+                     emb_ar = emb_ar.tolist()
+                 emb_dict_all.update(dict(zip(emb["node_index"], emb_ar)))
+
+             # Store embeddings
+             with open(processed_file_path, "wb") as f:
+                 pickle.dump(emb_dict_all, f)
+
+         return emb_dict_all
+
+     def _build_full_triplets(self) -> tuple[pd.DataFrame, dict]:
+         """
+         Build the full triplets for BioBridgePrimeKG dataset.
+
+         Returns:
+             The full triplets for BioBridgePrimeKG dataset.
+             The dictionary of node information.
+         """
+         processed_file_path = os.path.join(self.local_dir, "processed", "triplet_full.tsv.gz")
+         if os.path.exists(processed_file_path):
+             # Load the file from the local directory
+             with open(processed_file_path, "rb") as f:
+                 primekg_triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+             # Load each dataframe in the local directory
+             node_info_dict = {}
+             for i, node_type in enumerate(self.preselected_node_types):
+                 with open(os.path.join(self.local_dir, "processed",
+                                        f"{node_type}.csv"), "rb") as f:
+                     df_node = pd.read_csv(f)
+                 node_info_dict[self.node_type_map[node_type]] = df_node
+         else:
+             # Download the related files from the BioBridge repo and further process them
+             # List of processed files
+             url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
+                    'refs/heads/main/data/Processed/')
+             file_list = ["protein", "molecular", "cellular", "biological", "drug", "disease"]
+
+             # Download the processed files
+             for i, file in enumerate(file_list):
+                 self._download_file(remote_url=os.path.join(url, f"{file}.csv"),
+                                     local_dir=os.path.join(self.local_dir, "processed"),
+                                     local_filename=f"{self.preselected_node_types[i]}.csv")
+
+             # Build the node index list
+             node_info_dict = {}
+             node_index_list = []
+             for i, file in enumerate(file_list):
+                 df_node = pd.read_csv(os.path.join(self.local_dir, "processed",
+                                                    f"{self.preselected_node_types[i]}.csv"))
+                 node_info_dict[self.node_type_map[self.preselected_node_types[i]]] = df_node
+                 node_index_list.extend(df_node["node_index"].tolist())
+
+             # Filter the PrimeKG dataset to take into account only the selected node types
+             primekg_triplets = self.primekg.get_edges().copy()
+             primekg_triplets = primekg_triplets[
+                 primekg_triplets["head_index"].isin(node_index_list) &\
+                 primekg_triplets["tail_index"].isin(node_index_list)
+             ]
+             primekg_triplets = primekg_triplets.reset_index(drop=True)
+
+             # Perform mapping of node types
+             primekg_triplets["head_type"] = primekg_triplets["head_type"].apply(
+                 lambda x: self.data_config["node_type"][x]
+             )
+             primekg_triplets["tail_type"] = primekg_triplets["tail_type"].apply(
+                 lambda x: self.data_config["node_type"][x]
+             )
+
+             # Perform mapping of relation types
+             primekg_triplets["display_relation"] = primekg_triplets["display_relation"].apply(
+                 lambda x: self.data_config["relation_type"][x]
+             )
+
+             # Store the processed triplets
+             primekg_triplets.to_csv(processed_file_path, sep="\t", compression="gzip", index=False)
+
+         return primekg_triplets, node_info_dict
+
+     def _build_train_test_split(self) -> tuple[pd.DataFrame, pd.DataFrame,
+                                                pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+         """
+         Build the train-test split for BioBridgePrimeKG dataset.
+
+         Returns:
+             The train triplets for BioBridgePrimeKG dataset.
+             The train nodes for BioBridgePrimeKG dataset.
+             The test triplets for BioBridgePrimeKG dataset.
+             The test nodes for BioBridgePrimeKG dataset.
+             The full triplets for BioBridgePrimeKG dataset.
+         """
+         if os.path.exists(os.path.join(self.local_dir, "processed",
+                                        "triplet_full_altered.tsv.gz")):
+             # Load each dataframe in the local directory
+             with open(os.path.join(self.local_dir, "processed",
+                                    "triplet_train.tsv.gz"), "rb") as f:
+                 df_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+             with open(os.path.join(self.local_dir, "processed",
+                                    "node_train.tsv.gz"), "rb") as f:
+                 df_node_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+             with open(os.path.join(self.local_dir, "processed",
+                                    "triplet_test.tsv.gz"), "rb") as f:
+                 df_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+             with open(os.path.join(self.local_dir, "processed",
+                                    "node_test.tsv.gz"), "rb") as f:
+                 df_node_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+
+             with open(os.path.join(self.local_dir, "processed",
+                                    "triplet_full_altered.tsv.gz"), "rb") as f:
+                 triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+         else:
+             # Filtering out some nodes in the embedding dictionary
+             triplets = self.primekg_triplets.copy()
+             triplets = triplets[
+                 triplets["head_index"].isin(list(self.emb_dict.keys())) &\
+                 triplets["tail_index"].isin(list(self.emb_dict.keys()))
+             ].reset_index(drop=True)
+
+             # Perform splitting of the triplets
+             list_split = {
+                 "train": [],
+                 "test": [],
+             }
+             node_split = {
+                 "train": {
+                     "node_index": [],
+                     "node_type": [],
+                 },
+                 "test": {
+                     "node_index": [],
+                     "node_type": [],
+                 }
+             }
+             # Loop over the node types
+             for node_type in triplets["head_type"].unique():
+                 df_sub = triplets[triplets["head_type"] == node_type]
+                 all_x_indexes = df_sub["head_index"].unique()
+                 # By default, we use 90% of the nodes for training and 10% for testing
+                 te_x_indexes = np.random.choice(
+                     all_x_indexes, size=int(0.1*len(all_x_indexes)), replace=False
+                 )
+                 df_subs = {}
+                 df_subs["test"] = df_sub[df_sub["head_index"].isin(te_x_indexes)]
+                 df_subs["train"] = df_sub[~df_sub["head_index"].isin(te_x_indexes)]
+                 list_split["train"].append(df_subs["train"])
+                 list_split["test"].append(df_subs["test"])
+
+                 # record the split
+                 node_index = {}
+                 node_index["train"] = df_subs["train"]["head_index"].unique()
+                 node_split["train"]["node_index"].extend(node_index["train"].tolist())
+                 node_split["train"]["node_type"].extend([node_type]*len(node_index["train"]))
+                 node_index["test"] = df_subs["test"]["head_index"].unique()
+                 node_split["test"]["node_index"].extend(node_index["test"].tolist())
+                 node_split["test"]["node_type"].extend([node_type]*len(node_index["test"]))
+
+                 print(f"Number of {node_type} nodes in train: {len(node_index['train'])}")
+                 print(f"Number of {node_type} nodes in test: {len(node_index['test'])}")
+
+             # Prepare train and test DataFrames
+             df_train = pd.concat(list_split["train"])
+             df_node_train = pd.DataFrame(node_split["train"])
+             df_test = pd.concat(list_split["test"])
+             df_node_test = pd.DataFrame(node_split["test"])
+
+             # Store each dataframe in the local directory
+             df_train.to_csv(os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"),
+                             sep="\t", compression="gzip", index=False)
+             df_node_train.to_csv(os.path.join(self.local_dir, "processed", "node_train.tsv.gz"),
+                                  sep="\t", compression="gzip", index=False)
+             df_test.to_csv(os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"),
+                            sep="\t", compression="gzip", index=False)
+             df_node_test.to_csv(os.path.join(self.local_dir, "processed", "node_test.tsv.gz"),
+                                 sep="\t", compression="gzip", index=False)
+             # Store altered full triplets as well
+             triplets.to_csv(os.path.join(self.local_dir, "processed",
+                                          "triplet_full_altered.tsv.gz"),
+                             sep="\t", compression="gzip", index=False)
+
+         return df_train, df_node_train, df_test, df_node_test, triplets
+
+     # def _negative_sampling(self,
+     #                        batch_df: pd.DataFrame,
+     #                        process_index: int,
+     #                        index_map: dict,
+     #                        node_train_dict: dict) -> pd.DataFrame:
+     #     """
+     #     A helper function to perform negative sampling for a batch of triplets.
+     #     """
+     #     negative_y_index_list = []
+     #     for _, row in tqdm(batch_df.iterrows(),
+     #                        total=batch_df.shape[0],
+     #                        desc=f"Process {process_index}"):
+     #         x_index = row['head_index']
+     #         # y_index = row['y_index']
+     #         y_index_type = row['tail_type']
+     #         paired_y_index_list = index_map[x_index]
+
+     #         # sample a list of negative y_index
+     #         node_train_sub = node_train_dict[y_index_type]
+     #         negative_y_index = node_train_sub[
+     #             ~node_train_sub['node_index'].isin(paired_y_index_list)
+     #         ]['node_index'].sample(self.n_neg_samples).tolist()
+     #         negative_y_index_list.append(negative_y_index)
+
+     #     batch_df.loc[:, 'negative_tail_index'] = negative_y_index_list
+     #     return batch_df
+
+     # def _build_negative_triplets(self,
+     #                              chunk_size: int=100000,
+     #                              n_neg_samples: int=10):
+     #     """
+     #     Build the negative triplets for BioBridgePrimeKG dataset.
+     #     """
+     #     processed_file_path = os.path.join(self.local_dir,
+     #                                        "processed",
+     #                                        "triplet_train_negative.tsv.gz")
+     #     if os.path.exists(processed_file_path):
+     #         # Load the negative triplets from the local directory
+     #         with open(processed_file_path, "rb") as f:
+     #             triplets_negative = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
+     #     else:
+     #         # Set the number samples for negative sampling
+     #         self.n_neg_samples = n_neg_samples
+
+     #         # Split node list by type
+     #         node_train_dict = {}
+     #         type_list = self.df_node_train['node_type'].unique()
+     #         for node_type in type_list:
+     #             node_train_dict[node_type] = self.df_node_train[
+     #                 self.df_node_train['node_type'] == node_type
+     #             ].reset_index(drop=True)
+
+     #         # create an index mapping from x_index to y_index
+     #         index_map = self.df_train[
+     #             ['head_index', 'tail_index']
+     #         ].drop_duplicates().groupby('head_index').agg(list).to_dict()['tail_index']
+
+     #         # Negative sampling
+     #         batch_df_list = []
+     #         for i in tqdm(range(0, self.df_train.shape[0], chunk_size)):
+     #             batch_df_list.append(self.df_train.iloc[i:i+chunk_size])
+     #         # Process negative sampling
+     #         results = [
+     #             self._negative_sampling(batch_df,
+     #                                     num_piece,
+     #                                     index_map,
+     #                                     node_train_dict)
+     #             for num_piece, batch_df in enumerate(batch_df_list)
+     #         ]
+
+     #         # Store the negative triplets
+     #         triplets_negative = pd.concat(results, axis=0)
+     #         triplets_negative.to_csv(processed_file_path,
+     #                                  sep="\t", compression="gzip", index=False)
+
+     #     # Set attribute
+     #     self.primekg_triplets_negative = triplets_negative
+
+     #     return triplets_negative
+
+     # def load_data(self,
+     #               build_neg_triplest: bool= False,
+     #               chunk_size: int=100000,
+     #               n_neg_samples: int=10):
+
+     def load_data(self):
+         """
+         Load the BioBridgePrimeKG dataset into pandas DataFrame of nodes and edges.
+
+         Args:
+             build_neg_triplest (bool): Whether to build negative triplets.
+             chunk_size (int): The chunk size for negative sampling.
+             n_neg_samples (int): The number of negative samples for negative sampling.
+         """
+         # Load PrimeKG dataset
+         print("Loading PrimeKG dataset...")
+         self.primekg = self._load_primekg()
+
+         # Load data config file of BioBridgePrimeKG
+         print("Loading data config file of BioBridgePrimeKG...")
+         self.data_config = self._load_data_config()
+
+         # Build node embeddings
+         print("Building node embeddings...")
+         self.emb_dict = self._build_node_embeddings()
+
+         # Build full triplets
+         print("Building full triplets...")
+         self.primekg_triplets, self.node_info_dict = self._build_full_triplets()
+
+         # Build train-test split
+         print("Building train-test split...")
+         self.df_train, self.df_node_train, self.df_test, self.df_node_test, self.primekg_triplets =\
+             self._build_train_test_split()
+
+         # if build_neg_triplest:
+         #     # Build negative triplets
+         #     print("Building negative triplets...")
+         #     self.primekg_triplets_negative = self._build_negative_triplets(
+         #         chunk_size=chunk_size,
+         #         n_neg_samples=n_neg_samples
+         #     )
+
+     def set_random_seed(self, seed: int):
+         """
+         Set the random seed for reproducibility.
+
+         Args:
+             seed (int): The random seed value.
+         """
+         np.random.seed(seed)
+
+     def get_primekg(self) -> PrimeKG:
+         """
+         Get the PrimeKG dataset.
+
+         Returns:
+             The PrimeKG dataset.
+         """
+         return self.primekg
+
+     def get_data_config(self) -> dict:
+         """
+         Get the data config file of BioBridgePrimeKG dataset.
+
+         Returns:
+             The data config file of BioBridgePrimeKG dataset.
+         """
+         return self.data_config
+
+     def get_node_embeddings(self) -> dict:
+         """
+         Get the node embeddings for BioBridgePrimeKG dataset.
+
+         Returns:
+             The dictionary of node embeddings.
+         """
+         return self.emb_dict
+
+     def get_primekg_triplets(self) -> pd.DataFrame:
+         """
+         Get the full triplets for BioBridgePrimeKG dataset.
+
+         Returns:
+             The full triplets for BioBridgePrimeKG dataset.
+         """
+         return self.primekg_triplets
+
+     # def get_primekg_triplets_negative(self) -> pd.DataFrame:
+     #     """
+     #     Get the negative triplets for BioBridgePrimeKG dataset.
+
+     #     Returns:
+     #         The negative triplets for BioBridgePrimeKG dataset.
+     #     """
+     #     return self.primekg_triplets_negative
+
+     def get_train_test_split(self) -> dict:
+         """
+         Get the train-test split for BioBridgePrimeKG dataset.
+
+         Returns:
+             The train-test split for BioBridgePrimeKG dataset.
+         """
+         return {
+             "train": self.df_train,
+             "node_train": self.df_node_train,
+             "test": self.df_test,
+             "node_test": self.df_node_test
+         }
+
+     def get_node_info_dict(self) -> dict:
+         """
+         Get the node information dictionary for BioBridgePrimeKG dataset.
+
+         Returns:
+             The node information dictionary for BioBridgePrimeKG dataset.
+         """
+         return self.node_info_dict
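The loader above composes the other pieces in this diff: it reuses a PrimeKG copy from primekg_dir, downloads the BioBridge config and embeddings, filters the triplets to the six preselected node types, and writes a 90/10 train-test split under local_dir. A minimal usage sketch (the directory values below are illustrative, not defaults taken from the release):

from aiagents4pharma.talk2knowledgegraphs.datasets.biobridge_primekg import BioBridgePrimeKG

# Paths are hypothetical; the constructor creates them if they do not exist.
biobridge = BioBridgePrimeKG(primekg_dir="data/primekg/",
                             local_dir="data/biobridge_primekg/")
biobridge.load_data()
triplets = biobridge.get_primekg_triplets()     # filtered head/tail/relation triplets
split = biobridge.get_train_test_split()        # keys: train, node_train, test, node_test
embeddings = biobridge.get_node_embeddings()    # node_index -> embedding vector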
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py
@@ -0,0 +1,23 @@
+ #!/usr/bin/env python3
+
+ """
+ Abstract class for dataset.
+ """
+
+ from abc import ABC, abstractmethod
+
+ class Dataset(ABC):
+     """
+     Abstract class for dataset.
+     """
+     @abstractmethod
+     def setup(self):
+         """
+         A method to set up the dataset.
+         """
+
+     @abstractmethod
+     def load_data(self):
+         """
+         A method to load the dataset and potentially preprocess it.
+         """
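The abstract base class above fixes the contract the concrete loaders in this release follow: a constructor-time setup() plus an explicit load_data(). A minimal, hypothetical subclass sketch (the class name and method bodies are illustrative only):

from aiagents4pharma.talk2knowledgegraphs.datasets.dataset import Dataset

class ToyDataset(Dataset):
    """Hypothetical loader illustrating the Dataset contract."""

    def setup(self):
        # create local directories, seed RNGs, etc.
        pass

    def load_data(self):
        # download and/or preprocess the data here
        pass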
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
@@ -0,0 +1,201 @@
+ """
+ Class for loading PrimeKG dataset.
+ """
+
+ import os
+ import requests
+ from tqdm import tqdm
+ import pandas as pd
+ from .dataset import Dataset
+
+ class PrimeKG(Dataset):
+     """
+     Class for loading PrimeKG dataset.
+     It downloads the data from the Harvard Dataverse and stores it in the local directory.
+     The data is then loaded into pandas DataFrame of nodes and edges.
+     """
+
+     def __init__(self, local_dir: str = "../../../data/primekg/"):
+         """
+         Constructor for PrimeKG class.
+
+         Args:
+             local_dir (str): The local directory where the data will be stored.
+         """
+         self.name: str = "primekg"
+         self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
+         self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
+         self.local_dir: str = local_dir
+
+         # Attributes to store the data
+         self.nodes: pd.DataFrame = None
+         self.edges: pd.DataFrame = None
+
+         # Set up the dataset
+         self.setup()
+
+     def setup(self):
+         """
+         A method to set up the dataset.
+         """
+         # Make the directory if it doesn't exist
+         os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+
+     def _download_file(self, remote_url:str, local_path: str):
+         """
+         A helper function to download a file from remote URL to the local directory.
+
+         Args:
+             remote_url (str): The remote URL of the file to be downloaded.
+             local_path (str): The local path where the file will be saved.
+         """
+         response = requests.get(remote_url, stream=True, timeout=300)
+         response.raise_for_status()
+         progress_bar = tqdm(
+             total=int(response.headers.get("content-length", 0)),
+             unit="iB",
+             unit_scale=True,
+         )
+         with open(local_path, "wb") as file:
+             for data in response.iter_content(1024):
+                 progress_bar.update(len(data))
+                 file.write(data)
+         progress_bar.close()
+
+     def _load_nodes(self) -> pd.DataFrame:
+         """
+         Private method to load the nodes dataframe of PrimeKG dataset.
+         This method downloads the nodes file from the Harvard Dataverse if it does not exist
+         in the local directory. Otherwise, it loads the data from the local directory.
+         It further processes the dataframe of nodes and returns it.
+
+         Returns:
+             The nodes dataframe of PrimeKG dataset.
+         """
+         local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
+         if os.path.exists(local_file):
+             print(f"{local_file} already exists. Loading the data from the local directory.")
+
+             # Load the dataframe from the local directory and assign it to the nodes attribute
+             nodes = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+         else:
+             print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
+
+             # Download the file from the Harvard Dataverse with designated file_id for node
+             self._download_file(f"{self.server_path}{self.file_ids['nodes']}",
+                                 os.path.join(self.local_dir, "nodes.tab"))
+
+             # Load the downloaded file into a pandas DataFrame
+             nodes = pd.read_csv(os.path.join(self.local_dir, "nodes.tab"),
+                                 sep="\t", low_memory=False)
+
+             # Further processing of the dataframe
+             nodes = nodes[
+                 ["node_index", "node_name", "node_source", "node_id", "node_type"]
+             ]
+
+             # Store compressed dataframe in the local directory
+             nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+         return nodes
+
+     def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
+         """
+         Private method to load the edges dataframe of PrimeKG dataset.
+         This method downloads the edges file from the Harvard Dataverse if it does not exist
+         in the local directory. Otherwise, it loads the data from the local directory.
+         It further processes the dataframe of edges and returns it.
+
+         Args:
+             nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.
+
+         Returns:
+             The edges dataframe of PrimeKG dataset.
+         """
+         local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
+         if os.path.exists(local_file):
+             print(f"{local_file} already exists. Loading the data from the local directory.")
+
+             # Load the dataframe from the local directory and assign it to the edges attribute
+             edges = pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+         else:
+             print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
+
+             # Download the file from the Harvard Dataverse with designated file_id for edge
+             self._download_file(f"{self.server_path}{self.file_ids['edges']}",
+                                 os.path.join(self.local_dir, "edges.csv"))
+
+             # Load the downloaded file into a pandas DataFrame
+             edges = pd.read_csv(os.path.join(self.local_dir, "edges.csv"),
+                                 sep=",", low_memory=False)
+
+             # Further processing of the dataframe
+             edges = edges.merge(
+                 nodes, left_on="x_index", right_on="node_index"
+             )
+             edges.drop(["x_index"], axis=1, inplace=True)
+             edges.rename(
+                 columns={
+                     "node_index": "head_index",
+                     "node_name": "head_name",
+                     "node_source": "head_source",
+                     "node_id": "head_id",
+                     "node_type": "head_type",
+                 },
+                 inplace=True,
+             )
+             edges = edges.merge(
+                 nodes, left_on="y_index", right_on="node_index"
+             )
+             edges.drop(["y_index"], axis=1, inplace=True)
+             edges.rename(
+                 columns={
+                     "node_index": "tail_index",
+                     "node_name": "tail_name",
+                     "node_source": "tail_source",
+                     "node_id": "tail_id",
+                     "node_type": "tail_type"
+                 },
+                 inplace=True,
+             )
+             edges = edges[
+                 [
+                     "head_index", "head_name", "head_source", "head_id", "head_type",
+                     "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
+                     "display_relation", "relation",
+                 ]
+             ]
+
+             # Store compressed dataframe in the local directory
+             edges.to_csv(local_file, index=False, sep="\t", compression="gzip")
+
+         return edges
+
+     def load_data(self):
+         """
+         Load the PrimeKG dataset into pandas DataFrame of nodes and edges.
+         """
+         print("Loading nodes of PrimeKG dataset ...")
+         self.nodes = self._load_nodes()
+
+         print("Loading edges of PrimeKG dataset ...")
+         self.edges = self._load_edges(self.nodes)
+
+     def get_nodes(self) -> pd.DataFrame:
+         """
+         Get the nodes dataframe of PrimeKG dataset.
+
+         Returns:
+             The nodes dataframe of PrimeKG dataset.
+         """
+         return self.nodes
+
+     def get_edges(self) -> pd.DataFrame:
+         """
+         Get the edges dataframe of PrimeKG dataset.
+
+         Returns:
+             The edges dataframe of PrimeKG dataset.
+         """
+         return self.edges
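Taken together, the class above is a cached loader: on first use it pulls the PrimeKG node and edge files from Harvard Dataverse, then reuses the compressed TSVs it writes to local_dir. A minimal usage sketch (the local_dir value is illustrative, not the package default):

from aiagents4pharma.talk2knowledgegraphs.datasets.primekg import PrimeKG

primekg = PrimeKG(local_dir="data/primekg/")   # hypothetical path
primekg.load_data()                # downloads on the first call, reuses the cache afterwards
nodes = primekg.get_nodes()        # node_index, node_name, node_source, node_id, node_type
edges = primekg.get_edges()        # head_*/tail_* columns plus relation and display_relation
print(nodes.shape, edges.shape)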
aiagents4pharma-1.5.4/aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py
@@ -0,0 +1,201 @@
+ """
+ Class for loading StarkQAPrimeKG dataset.
+ """
+
+ import os
+ import shutil
+ import pickle
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ import torch
+ from huggingface_hub import hf_hub_download, list_repo_files
+ import gdown
+ from .dataset import Dataset
+
+ class StarkQAPrimeKG(Dataset):
+     """
+     Class for loading StarkQAPrimeKG dataset.
+     It downloads the data from the HuggingFace repo and stores it in the local directory.
+     The data is then loaded into pandas DataFrame of QA pairs, dictionary of split indices,
+     and node information.
+     """
+
+     def __init__(self, local_dir: str = "../../../data/starkqa_primekg/"):
+         """
+         Constructor for StarkQAPrimeKG class.
+
+         Args:
+             local_dir (str): The local directory to store the dataset files.
+         """
+         self.name: str = "starkqa_primekg"
+         self.hf_repo_id: str = "snap-stanford/stark"
+         self.local_dir: str = local_dir
+         # Attributes to store the data
+         self.starkqa: pd.DataFrame = None
+         self.starkqa_split_idx: dict = None
+         self.starkqa_node_info: dict = None
+         self.query_emb_dict: dict = None
+         self.node_emb_dict: dict = None
+
+         # Set up the dataset
+         self.setup()
+
+     def setup(self):
+         """
+         A method to set up the dataset.
+         """
+         # Make the directory if it doesn't exist
+         os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
+
+     def _load_stark_repo(self) -> tuple[pd.DataFrame, dict, dict]:
+         """
+         Private method to load related files of StarkQAPrimeKG dataset.
+
+         Returns:
+             The nodes dataframe of StarkQAPrimeKG dataset.
+             The split indices of StarkQAPrimeKG dataset.
+             The node information of StarkQAPrimeKG dataset.
+         """
+         # Download the file if it does not exist in the local directory
+         # Otherwise, load the data from the local directory
+         local_file = os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv")
+         if os.path.exists(local_file):
+             print(f"{local_file} already exists. Loading the data from the local directory.")
+         else:
+             print(f"Downloading files from {self.hf_repo_id}")
+
+             # List all related files in the HuggingFace Hub repository
+             files = list_repo_files(self.hf_repo_id, repo_type="dataset")
+             files = [f for f in files if ((f.startswith("qa/prime/") or
+                                            f.startswith("skb/prime/")) and f.find("raw") == -1)]
+
+             # Download and save each file in the specified folder
+             for file in tqdm(files):
+                 _ = hf_hub_download(self.hf_repo_id,
+                                     file,
+                                     repo_type="dataset",
+                                     local_dir=self.local_dir)
+
+             # Unzip the processed files
+             shutil.unpack_archive(
+                 os.path.join(self.local_dir, "skb/prime/processed.zip"),
+                 os.path.join(self.local_dir, "skb/prime/")
+             )
+
+         # Load StarkQA dataframe
+         starkqa = pd.read_csv(
+             os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv"),
+             low_memory=False)
+
+         # Read split indices
+         qa_indices = sorted(starkqa['id'].tolist())
+         starkqa_split_idx = {}
+         for split in ['train', 'val', 'test', 'test-0.1']:
+             indices_file = os.path.join(self.local_dir, "qa/prime/split", f'{split}.index')
+             with open(indices_file, 'r', encoding='utf-8') as f:
+                 indices = f.read().strip().split('\n')
+             query_ids = [int(idx) for idx in indices]
+             starkqa_split_idx[split] = np.array(
+                 [qa_indices.index(query_id) for query_id in query_ids]
+             )
+
+         # Load the node info of PrimeKG preprocessed for StarkQA
+         with open(os.path.join(self.local_dir, 'skb/prime/processed/node_info.pkl'), 'rb') as f:
+             starkqa_node_info = pickle.load(f)
+
+         return starkqa, starkqa_split_idx, starkqa_node_info
+
+     def _load_stark_embeddings(self) -> tuple[dict, dict]:
+         """
+         Private method to load the embeddings of StarkQAPrimeKG dataset.
+
+         Returns:
+             The query embeddings of StarkQAPrimeKG dataset.
+             The node embeddings of StarkQAPrimeKG dataset.
+         """
+         # Load the provided embeddings of query and nodes
+         # Note that they utilized 'text-embedding-ada-002' for embeddings
+         emb_model = 'text-embedding-ada-002'
+         query_emb_url = 'https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU'
+         node_emb_url = 'https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy'
+
+         # Prepare respective directories to store the embeddings
+         emb_dir = os.path.join(self.local_dir, emb_model)
+         query_emb_dir = os.path.join(emb_dir, "query")
+         node_emb_dir = os.path.join(emb_dir, "doc")
+         os.makedirs(query_emb_dir, exist_ok=True)
+         os.makedirs(node_emb_dir, exist_ok=True)
+         query_emb_path = os.path.join(query_emb_dir, "query_emb_dict.pt")
+         node_emb_path = os.path.join(node_emb_dir, "candidate_emb_dict.pt")
+
+         # Download the embeddings if they do not exist in the local directory
+         if not os.path.exists(query_emb_path) or not os.path.exists(node_emb_path):
+             # Download the query embeddings
+             gdown.download(query_emb_url, query_emb_path, quiet=False)
+
+             # Download the node embeddings
+             gdown.download(node_emb_url, node_emb_path, quiet=False)
+
+         # Load the embeddings
+         query_emb_dict = torch.load(query_emb_path)
+         node_emb_dict = torch.load(node_emb_path)
+
+         return query_emb_dict, node_emb_dict
+
+     def load_data(self):
+         """
+         Load the StarkQAPrimeKG dataset into pandas DataFrame of QA pairs,
+         dictionary of split indices, and node information.
+         """
+         print("Loading StarkQAPrimeKG dataset...")
+         self.starkqa, self.starkqa_split_idx, self.starkqa_node_info = self._load_stark_repo()
+
+         print("Loading StarkQAPrimeKG embeddings...")
+         self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()
+
+
+     def get_starkqa(self) -> pd.DataFrame:
+         """
+         Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.
+
+         Returns:
+             The nodes dataframe of PrimeKG dataset.
+         """
+         return self.starkqa
+
+     def get_starkqa_split_indicies(self) -> dict:
+         """
+         Get the split indices of StarkQAPrimeKG dataset.
+
+         Returns:
+             The split indices of StarkQAPrimeKG dataset.
+         """
+         return self.starkqa_split_idx
+
+     def get_starkqa_node_info(self) -> dict:
+         """
+         Get the node information of StarkQAPrimeKG dataset.
+
+         Returns:
+             The node information of StarkQAPrimeKG dataset.
+         """
+         return self.starkqa_node_info
+
+     def get_query_embeddings(self) -> dict:
+         """
+         Get the query embeddings of StarkQAPrimeKG dataset.
+
+         Returns:
+             The query embeddings of StarkQAPrimeKG dataset.
+         """
+         return self.query_emb_dict
+
+     def get_node_embeddings(self) -> dict:
+         """
+         Get the node embeddings of StarkQAPrimeKG dataset.
+
+         Returns:
+             The node embeddings of StarkQAPrimeKG dataset.
+         """
+         return self.node_emb_dict
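As the class above shows, the loader fetches the StarkQA split from the snap-stanford/stark HuggingFace repo and the precomputed 'text-embedding-ada-002' vectors from Google Drive on first use, caching everything under local_dir. A minimal usage sketch (the local_dir value is illustrative, not the package default):

from aiagents4pharma.talk2knowledgegraphs.datasets.starkqa_primekg import StarkQAPrimeKG

starkqa = StarkQAPrimeKG(local_dir="data/starkqa_primekg/")   # hypothetical path
starkqa.load_data()
qa_pairs = starkqa.get_starkqa()                  # DataFrame of QA pairs
splits = starkqa.get_starkqa_split_indicies()     # train/val/test/test-0.1 index arrays
query_emb = starkqa.get_query_embeddings()        # dict of query embeddings
node_emb = starkqa.get_node_embeddings()          # dict of node embeddings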
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: aiagents4pharma
- Version: 1.4.3
+ Version: 1.5.4
  Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D
  Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
@@ -10,13 +10,16 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: copasi_basico==0.78
  Requires-Dist: coverage==7.6.4
+ Requires-Dist: gdown==5.2.0
+ Requires-Dist: huggingface_hub==0.26.5
+ Requires-Dist: joblib==1.4.2
  Requires-Dist: langchain==0.3.7
  Requires-Dist: langchain-community==0.3.5
  Requires-Dist: langchain-core==0.3.15
  Requires-Dist: langchain-experimental==0.3.3
  Requires-Dist: langchain-openai==0.2.5
  Requires-Dist: matplotlib==3.9.2
- Requires-Dist: openai==1.55.3
+ Requires-Dist: openai==1.59.4
  Requires-Dist: pandas==2.2.3
  Requires-Dist: plotly==5.24.1
  Requires-Dist: pydantic==2.9.2
@@ -24,6 +27,7 @@ Requires-Dist: pylint==3.3.1
  Requires-Dist: pytest==8.3.3
  Requires-Dist: streamlit==1.39.0
  Requires-Dist: tabulate==0.9.0
+ Requires-Dist: torch==2.5.1
  Requires-Dist: tqdm==4.66.6
  Requires-Dist: mkdocs==1.6.1
  Requires-Dist: mkdocs-jupyter==0.25.1
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/SOURCES.txt
@@ -18,4 +18,10 @@ aiagents4pharma/talk2biomodels/tools/custom_plotter.py
  aiagents4pharma/talk2biomodels/tools/fetch_parameters.py
  aiagents4pharma/talk2biomodels/tools/model_description.py
  aiagents4pharma/talk2biomodels/tools/search_models.py
- aiagents4pharma/talk2biomodels/tools/simulate_model.py
+ aiagents4pharma/talk2biomodels/tools/simulate_model.py
+ aiagents4pharma/talk2knowledgegraphs/__init__.py
+ aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py
+ aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py
+ aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py
+ aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py
+ aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/aiagents4pharma.egg-info/requires.txt
@@ -1,12 +1,15 @@
  copasi_basico==0.78
  coverage==7.6.4
+ gdown==5.2.0
+ huggingface_hub==0.26.5
+ joblib==1.4.2
  langchain==0.3.7
  langchain-community==0.3.5
  langchain-core==0.3.15
  langchain-experimental==0.3.3
  langchain-openai==0.2.5
  matplotlib==3.9.2
- openai==1.55.3
+ openai==1.59.4
  pandas==2.2.3
  plotly==5.24.1
  pydantic==2.9.2
@@ -14,6 +17,7 @@ pylint==3.3.1
  pytest==8.3.3
  streamlit==1.39.0
  tabulate==0.9.0
+ torch==2.5.1
  tqdm==4.66.6
  mkdocs==1.6.1
  mkdocs-jupyter==0.25.1
{aiagents4pharma-1.4.3 → aiagents4pharma-1.5.4}/pyproject.toml
@@ -15,13 +15,16 @@ classifiers = [
  dependencies = [
  "copasi_basico==0.78",
  "coverage==7.6.4",
+ "gdown==5.2.0",
+ "huggingface_hub==0.26.5",
+ "joblib==1.4.2",
  "langchain==0.3.7",
  "langchain-community==0.3.5",
  "langchain-core==0.3.15",
  "langchain-experimental==0.3.3",
  "langchain-openai==0.2.5",
  "matplotlib==3.9.2",
- "openai==1.55.3",
+ "openai==1.59.4",
  "pandas==2.2.3",
  "plotly==5.24.1",
  "pydantic==2.9.2",
@@ -29,6 +32,7 @@ dependencies = [
  "pytest==8.3.3",
  "streamlit==1.39.0",
  "tabulate==0.9.0",
+ "torch==2.5.1",
  "tqdm==4.66.6",
  "mkdocs==1.6.1",
  "mkdocs-jupyter==0.25.1",
@@ -48,7 +52,9 @@ version = {file = "release_version.txt"}
  packages = ["aiagents4pharma",
  "aiagents4pharma.talk2biomodels",
  "aiagents4pharma.talk2biomodels.models",
- "aiagents4pharma.talk2biomodels.tools"]
+ "aiagents4pharma.talk2biomodels.tools",
+ "aiagents4pharma.talk2knowledgegraphs",
+ "aiagents4pharma.talk2knowledgegraphs.datasets"]

  # [tool.setuptools.packages.find]
  # where = ["aiagents4pharma", "aiagents4pharma.talk2biomodels"]
aiagents4pharma-1.5.4/release_version.txt
@@ -0,0 +1 @@
+ v1.5.4
aiagents4pharma-1.4.3/aiagents4pharma/__init__.py
@@ -1,5 +0,0 @@
- '''
- This file is used to import the talk2biomodels module.
- '''
-
- from . import talk2biomodels
aiagents4pharma-1.4.3/release_version.txt
@@ -1 +0,0 @@
- v1.4.3
The remaining files (16–31 in the list above) are unchanged between the two versions.