aiagents4pharma 1.43.0__py3-none-any.whl → 1.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. aiagents4pharma/__init__.py +2 -2
  2. aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
  3. aiagents4pharma/talk2aiagents4pharma/Dockerfile +105 -0
  4. aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
  5. aiagents4pharma/talk2aiagents4pharma/__init__.py +4 -5
  6. aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +3 -2
  7. aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +24 -23
  8. aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +2 -2
  9. aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +2 -2
  10. aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +2 -2
  11. aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +1 -1
  12. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
  13. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
  14. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
  15. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
  16. aiagents4pharma/talk2aiagents4pharma/install.md +127 -0
  17. aiagents4pharma/talk2aiagents4pharma/states/__init__.py +3 -2
  18. aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +5 -3
  19. aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +2 -2
  20. aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +72 -50
  21. aiagents4pharma/talk2biomodels/.dockerignore +13 -0
  22. aiagents4pharma/talk2biomodels/Dockerfile +104 -0
  23. aiagents4pharma/talk2biomodels/README.md +1 -0
  24. aiagents4pharma/talk2biomodels/__init__.py +4 -8
  25. aiagents4pharma/talk2biomodels/agents/__init__.py +3 -2
  26. aiagents4pharma/talk2biomodels/agents/t2b_agent.py +47 -42
  27. aiagents4pharma/talk2biomodels/api/__init__.py +4 -5
  28. aiagents4pharma/talk2biomodels/api/kegg.py +14 -10
  29. aiagents4pharma/talk2biomodels/api/ols.py +13 -10
  30. aiagents4pharma/talk2biomodels/api/uniprot.py +7 -6
  31. aiagents4pharma/talk2biomodels/configs/__init__.py +3 -4
  32. aiagents4pharma/talk2biomodels/configs/agents/__init__.py +2 -2
  33. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +2 -2
  34. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +1 -1
  35. aiagents4pharma/talk2biomodels/configs/config.yaml +1 -1
  36. aiagents4pharma/talk2biomodels/configs/tools/__init__.py +4 -5
  37. aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +2 -2
  38. aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +1 -2
  39. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +2 -2
  40. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +1 -1
  41. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +2 -2
  42. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +1 -1
  43. aiagents4pharma/talk2biomodels/install.md +63 -0
  44. aiagents4pharma/talk2biomodels/models/__init__.py +4 -4
  45. aiagents4pharma/talk2biomodels/models/basico_model.py +36 -28
  46. aiagents4pharma/talk2biomodels/models/sys_bio_model.py +13 -10
  47. aiagents4pharma/talk2biomodels/states/__init__.py +3 -2
  48. aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +12 -8
  49. aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
  50. aiagents4pharma/talk2biomodels/tests/__init__.py +2 -2
  51. aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
  52. aiagents4pharma/talk2biomodels/tests/test_api.py +18 -14
  53. aiagents4pharma/talk2biomodels/tests/test_ask_question.py +8 -9
  54. aiagents4pharma/talk2biomodels/tests/test_basico_model.py +15 -9
  55. aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +54 -55
  56. aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +28 -27
  57. aiagents4pharma/talk2biomodels/tests/test_integration.py +21 -33
  58. aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +14 -11
  59. aiagents4pharma/talk2biomodels/tests/test_param_scan.py +21 -20
  60. aiagents4pharma/talk2biomodels/tests/test_query_article.py +129 -29
  61. aiagents4pharma/talk2biomodels/tests/test_search_models.py +9 -13
  62. aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +16 -15
  63. aiagents4pharma/talk2biomodels/tests/test_steady_state.py +12 -22
  64. aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +33 -29
  65. aiagents4pharma/talk2biomodels/tools/__init__.py +15 -12
  66. aiagents4pharma/talk2biomodels/tools/ask_question.py +42 -32
  67. aiagents4pharma/talk2biomodels/tools/custom_plotter.py +51 -43
  68. aiagents4pharma/talk2biomodels/tools/get_annotation.py +99 -75
  69. aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +57 -51
  70. aiagents4pharma/talk2biomodels/tools/load_arguments.py +52 -32
  71. aiagents4pharma/talk2biomodels/tools/load_biomodel.py +8 -2
  72. aiagents4pharma/talk2biomodels/tools/parameter_scan.py +107 -90
  73. aiagents4pharma/talk2biomodels/tools/query_article.py +14 -13
  74. aiagents4pharma/talk2biomodels/tools/search_models.py +37 -26
  75. aiagents4pharma/talk2biomodels/tools/simulate_model.py +47 -37
  76. aiagents4pharma/talk2biomodels/tools/steady_state.py +76 -58
  77. aiagents4pharma/talk2biomodels/tools/utils.py +4 -3
  78. aiagents4pharma/talk2cells/README.md +1 -0
  79. aiagents4pharma/talk2cells/__init__.py +4 -5
  80. aiagents4pharma/talk2cells/agents/__init__.py +3 -2
  81. aiagents4pharma/talk2cells/agents/scp_agent.py +21 -19
  82. aiagents4pharma/talk2cells/states/__init__.py +3 -2
  83. aiagents4pharma/talk2cells/states/state_talk2cells.py +4 -2
  84. aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +8 -9
  85. aiagents4pharma/talk2cells/tools/__init__.py +3 -2
  86. aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +4 -4
  87. aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +5 -3
  88. aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +21 -22
  89. aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
  90. aiagents4pharma/talk2knowledgegraphs/Dockerfile +103 -0
  91. aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
  92. aiagents4pharma/talk2knowledgegraphs/__init__.py +4 -7
  93. aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +3 -2
  94. aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +40 -30
  95. aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +3 -6
  96. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +2 -2
  97. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +8 -8
  98. aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +3 -2
  99. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +2 -2
  100. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  101. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -1
  102. aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +4 -5
  103. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +2 -2
  104. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +1 -1
  105. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +17 -2
  106. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +2 -2
  107. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +1 -1
  108. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +2 -2
  109. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +1 -1
  110. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +1 -1
  111. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +1 -1
  112. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +1 -1
  113. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +1 -1
  114. aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +4 -6
  115. aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +115 -67
  116. aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +2 -0
  117. aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +35 -24
  118. aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +29 -21
  119. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
  120. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
  121. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
  122. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
  123. aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +190 -0
  124. aiagents4pharma/talk2knowledgegraphs/install.md +140 -0
  125. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +31 -65
  126. aiagents4pharma/talk2knowledgegraphs/states/__init__.py +3 -2
  127. aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +1 -0
  128. aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +65 -40
  129. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +54 -48
  130. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +4 -0
  131. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +17 -4
  132. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +33 -24
  133. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +116 -69
  134. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +736 -413
  135. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +22 -15
  136. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +19 -12
  137. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +95 -48
  138. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +4 -0
  139. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +5 -0
  140. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +13 -18
  141. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +10 -3
  142. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +4 -3
  143. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +3 -2
  144. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +1 -0
  145. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +9 -4
  146. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +6 -6
  147. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +4 -0
  148. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +442 -42
  149. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +3 -4
  150. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +10 -6
  151. aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +10 -7
  152. aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +15 -20
  153. aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +245 -205
  154. aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +92 -90
  155. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +25 -37
  156. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +10 -13
  157. aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +4 -7
  158. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +4 -7
  159. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +4 -0
  160. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +11 -14
  161. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +7 -7
  162. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +12 -6
  163. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +8 -6
  164. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +9 -6
  165. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +1 -0
  166. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +15 -9
  167. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +23 -20
  168. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +12 -10
  169. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +16 -10
  170. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +26 -18
  171. aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +4 -5
  172. aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +218 -81
  173. aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +53 -47
  174. aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +18 -14
  175. aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +22 -23
  176. aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +11 -10
  177. aiagents4pharma/talk2scholars/.dockerignore +13 -0
  178. aiagents4pharma/talk2scholars/Dockerfile +104 -0
  179. aiagents4pharma/talk2scholars/README.md +1 -0
  180. aiagents4pharma/talk2scholars/agents/__init__.py +1 -5
  181. aiagents4pharma/talk2scholars/agents/main_agent.py +6 -4
  182. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +5 -4
  183. aiagents4pharma/talk2scholars/agents/pdf_agent.py +4 -2
  184. aiagents4pharma/talk2scholars/agents/s2_agent.py +2 -2
  185. aiagents4pharma/talk2scholars/agents/zotero_agent.py +10 -11
  186. aiagents4pharma/talk2scholars/configs/__init__.py +1 -3
  187. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +1 -4
  188. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +1 -1
  189. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +1 -1
  190. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +8 -8
  191. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +7 -7
  192. aiagents4pharma/talk2scholars/configs/tools/__init__.py +8 -6
  193. aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
  194. aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
  195. aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
  196. aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
  197. aiagents4pharma/talk2scholars/install.md +122 -0
  198. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +8 -8
  199. aiagents4pharma/talk2scholars/tests/{test_main_agent.py → test_agents_main_agent.py} +41 -23
  200. aiagents4pharma/talk2scholars/tests/{test_paper_download_agent.py → test_agents_paper_agents_download_agent.py} +10 -16
  201. aiagents4pharma/talk2scholars/tests/{test_pdf_agent.py → test_agents_pdf_agent.py} +6 -10
  202. aiagents4pharma/talk2scholars/tests/{test_s2_agent.py → test_agents_s2_agent.py} +8 -16
  203. aiagents4pharma/talk2scholars/tests/{test_zotero_agent.py → test_agents_zotero_agent.py} +5 -7
  204. aiagents4pharma/talk2scholars/tests/{test_s2_display_dataframe.py → test_s2_tools_display_dataframe.py} +6 -7
  205. aiagents4pharma/talk2scholars/tests/{test_s2_query_dataframe.py → test_s2_tools_query_dataframe.py} +5 -15
  206. aiagents4pharma/talk2scholars/tests/{test_paper_downloader.py → test_tools_paper_downloader.py} +25 -63
  207. aiagents4pharma/talk2scholars/tests/{test_question_and_answer_tool.py → test_tools_question_and_answer_tool.py} +2 -6
  208. aiagents4pharma/talk2scholars/tests/{test_s2_multi.py → test_tools_s2_multi.py} +5 -5
  209. aiagents4pharma/talk2scholars/tests/{test_s2_retrieve.py → test_tools_s2_retrieve.py} +2 -1
  210. aiagents4pharma/talk2scholars/tests/{test_s2_search.py → test_tools_s2_search.py} +5 -5
  211. aiagents4pharma/talk2scholars/tests/{test_s2_single.py → test_tools_s2_single.py} +5 -5
  212. aiagents4pharma/talk2scholars/tests/{test_arxiv_downloader.py → test_utils_arxiv_downloader.py} +16 -25
  213. aiagents4pharma/talk2scholars/tests/{test_base_paper_downloader.py → test_utils_base_paper_downloader.py} +25 -47
  214. aiagents4pharma/talk2scholars/tests/{test_biorxiv_downloader.py → test_utils_biorxiv_downloader.py} +14 -42
  215. aiagents4pharma/talk2scholars/tests/{test_medrxiv_downloader.py → test_utils_medrxiv_downloader.py} +15 -49
  216. aiagents4pharma/talk2scholars/tests/{test_nvidia_nim_reranker.py → test_utils_nvidia_nim_reranker.py} +6 -16
  217. aiagents4pharma/talk2scholars/tests/{test_pdf_answer_formatter.py → test_utils_pdf_answer_formatter.py} +1 -0
  218. aiagents4pharma/talk2scholars/tests/{test_pdf_batch_processor.py → test_utils_pdf_batch_processor.py} +6 -15
  219. aiagents4pharma/talk2scholars/tests/{test_pdf_collection_manager.py → test_utils_pdf_collection_manager.py} +34 -11
  220. aiagents4pharma/talk2scholars/tests/{test_pdf_document_processor.py → test_utils_pdf_document_processor.py} +2 -3
  221. aiagents4pharma/talk2scholars/tests/{test_pdf_generate_answer.py → test_utils_pdf_generate_answer.py} +3 -6
  222. aiagents4pharma/talk2scholars/tests/{test_pdf_gpu_detection.py → test_utils_pdf_gpu_detection.py} +5 -16
  223. aiagents4pharma/talk2scholars/tests/{test_pdf_rag_pipeline.py → test_utils_pdf_rag_pipeline.py} +7 -17
  224. aiagents4pharma/talk2scholars/tests/{test_pdf_retrieve_chunks.py → test_utils_pdf_retrieve_chunks.py} +4 -11
  225. aiagents4pharma/talk2scholars/tests/{test_pdf_singleton_manager.py → test_utils_pdf_singleton_manager.py} +26 -23
  226. aiagents4pharma/talk2scholars/tests/{test_pdf_vector_normalization.py → test_utils_pdf_vector_normalization.py} +1 -1
  227. aiagents4pharma/talk2scholars/tests/{test_pdf_vector_store.py → test_utils_pdf_vector_store.py} +27 -55
  228. aiagents4pharma/talk2scholars/tests/{test_pubmed_downloader.py → test_utils_pubmed_downloader.py} +31 -91
  229. aiagents4pharma/talk2scholars/tests/{test_read_helper_utils.py → test_utils_read_helper_utils.py} +2 -6
  230. aiagents4pharma/talk2scholars/tests/{test_s2_utils_ext_ids.py → test_utils_s2_utils_ext_ids.py} +5 -15
  231. aiagents4pharma/talk2scholars/tests/{test_zotero_human_in_the_loop.py → test_utils_zotero_human_in_the_loop.py} +6 -13
  232. aiagents4pharma/talk2scholars/tests/{test_zotero_path.py → test_utils_zotero_path.py} +53 -45
  233. aiagents4pharma/talk2scholars/tests/{test_zotero_read.py → test_utils_zotero_read.py} +30 -91
  234. aiagents4pharma/talk2scholars/tests/{test_zotero_write.py → test_utils_zotero_write.py} +6 -16
  235. aiagents4pharma/talk2scholars/tools/__init__.py +1 -4
  236. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +20 -35
  237. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +7 -5
  238. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +9 -11
  239. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +14 -21
  240. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +14 -22
  241. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +11 -13
  242. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +14 -28
  243. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +4 -8
  244. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +16 -14
  245. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +4 -4
  246. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +15 -17
  247. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +2 -2
  248. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +5 -5
  249. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +4 -4
  250. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +2 -6
  251. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +5 -9
  252. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +4 -4
  253. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +2 -2
  254. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +6 -15
  255. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +7 -15
  256. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +2 -2
  257. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +3 -4
  258. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +8 -17
  259. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +17 -33
  260. aiagents4pharma/talk2scholars/tools/s2/__init__.py +8 -6
  261. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +3 -7
  262. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +7 -6
  263. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +5 -12
  264. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +2 -4
  265. aiagents4pharma/talk2scholars/tools/s2/search.py +6 -6
  266. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -3
  267. aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +1 -3
  268. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +12 -18
  269. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +11 -18
  270. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +11 -16
  271. aiagents4pharma/talk2scholars/tools/zotero/__init__.py +1 -4
  272. aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +1 -4
  273. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +21 -39
  274. aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +2 -6
  275. aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +8 -11
  276. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +4 -12
  277. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +13 -27
  278. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +4 -7
  279. aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +8 -10
  280. aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +3 -2
  281. {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/METADATA +115 -50
  282. aiagents4pharma-1.45.0.dist-info/RECORD +324 -0
  283. {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/WHEEL +1 -2
  284. aiagents4pharma-1.43.0.dist-info/RECORD +0 -293
  285. aiagents4pharma-1.43.0.dist-info/top_level.txt +0 -1
  286. /aiagents4pharma/talk2scholars/tests/{test_state.py → test_states_state.py} +0 -0
  287. /aiagents4pharma/talk2scholars/tests/{test_pdf_paper_loader.py → test_utils_pdf_paper_loader.py} +0 -0
  288. /aiagents4pharma/talk2scholars/tests/{test_tool_helper_utils.py → test_utils_tool_helper_utils.py} +0 -0
  289. /aiagents4pharma/talk2scholars/tests/{test_zotero_pdf_downloader_utils.py → test_utils_zotero_pdf_downloader_utils.py} +0 -0
  290. {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,16 +2,19 @@
2
2
  Class for loading BioBridgePrimeKG dataset.
3
3
  """
4
4
 
5
+ import json
5
6
  import os
6
7
  import pickle
7
- import json
8
- import requests
8
+
9
9
  import numpy as np
10
10
  import pandas as pd
11
+ import requests
11
12
  from tqdm import tqdm
13
+
12
14
  from .dataset import Dataset
13
15
  from .primekg import PrimeKG
14
16
 
17
+
15
18
  class BioBridgePrimeKG(Dataset):
16
19
  """
17
20
  Class for loading BioBridgePrimeKG dataset.
@@ -21,11 +24,13 @@ class BioBridgePrimeKG(Dataset):
21
24
  https://github.com/RyanWangZf/BioBridge
22
25
  """
23
26
 
24
- def __init__(self,
25
- primekg_dir: str = "../../../data/primekg/",
26
- local_dir: str = "../../../data/biobridge_primekg/",
27
- random_seed: int=0,
28
- n_neg_samples: int=5):
27
+ def __init__(
28
+ self,
29
+ primekg_dir: str = "../../../data/primekg/",
30
+ local_dir: str = "../../../data/biobridge_primekg/",
31
+ random_seed: int = 0,
32
+ n_neg_samples: int = 5,
33
+ ):
29
34
  """
30
35
  Constructor for BioBridgePrimeKG class.
31
36
 
@@ -92,10 +97,7 @@ class BioBridgePrimeKG(Dataset):
92
97
 
93
98
  return primekg_data
94
99
 
95
- def _download_file(self,
96
- remote_url:str,
97
- local_dir: str,
98
- local_filename: str):
100
+ def _download_file(self, remote_url: str, local_dir: str, local_filename: str):
99
101
  """
100
102
  A helper function to download a file from remote URL to the local directory.
101
103
 
@@ -135,13 +137,16 @@ class BioBridgePrimeKG(Dataset):
135
137
  """
136
138
  # Download the data config file of BioBridgePrimeKG
137
139
  self._download_file(
138
- remote_url= ('https://raw.githubusercontent.com/RyanWangZf/BioBridge/'
139
- 'refs/heads/main/data/BindData/data_config.json'),
140
+ remote_url=(
141
+ "https://raw.githubusercontent.com/RyanWangZf/BioBridge/"
142
+ "refs/heads/main/data/BindData/data_config.json"
143
+ ),
140
144
  local_dir=self.local_dir,
141
- local_filename='data_config.json')
145
+ local_filename="data_config.json",
146
+ )
142
147
 
143
148
  # Load the downloaded data config file
144
- with open(os.path.join(self.local_dir, 'data_config.json'), 'r', encoding='utf-8') as f:
149
+ with open(os.path.join(self.local_dir, "data_config.json"), encoding="utf-8") as f:
145
150
  data_config = json.load(f)
146
151
 
147
152
  return data_config
@@ -161,15 +166,19 @@ class BioBridgePrimeKG(Dataset):
161
166
  else:
162
167
  # Download the embeddings from the BioBridge repo and further process them
163
168
  # List of embedding source files
164
- url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
165
- 'refs/heads/main/data/embeddings/esm2b_unimo_pubmedbert/')
169
+ url = (
170
+ "https://media.githubusercontent.com/media/RyanWangZf/BioBridge/"
171
+ "refs/heads/main/data/embeddings/esm2b_unimo_pubmedbert/"
172
+ )
166
173
  file_list = [f"{n}.pkl" for n in self.preselected_node_types]
167
174
 
168
175
  # Download the embeddings
169
176
  for file in file_list:
170
- self._download_file(remote_url=os.path.join(url, file),
171
- local_dir=os.path.join(self.local_dir, "embeddings"),
172
- local_filename=file)
177
+ self._download_file(
178
+ remote_url=os.path.join(url, file),
179
+ local_dir=os.path.join(self.local_dir, "embeddings"),
180
+ local_filename=file,
181
+ )
173
182
 
174
183
  # Unified embeddings
175
184
  emb_dict_all = {}
@@ -179,7 +188,7 @@ class BioBridgePrimeKG(Dataset):
179
188
  emb_ar = emb["embedding"]
180
189
  if not isinstance(emb_ar, list):
181
190
  emb_ar = emb_ar.tolist()
182
- emb_dict_all.update(dict(zip(emb["node_index"], emb_ar)))
191
+ emb_dict_all.update(dict(zip(emb["node_index"], emb_ar, strict=False)))
183
192
 
184
193
  # Store embeddings
185
194
  with open(processed_file_path, "wb") as f:
@@ -204,37 +213,54 @@ class BioBridgePrimeKG(Dataset):
204
213
  # Load each dataframe in the local directory
205
214
  node_info_dict = {}
206
215
  for i, node_type in enumerate(self.preselected_node_types):
207
- with open(os.path.join(self.local_dir, "processed",
208
- f"{node_type}.csv"), "rb") as f:
216
+ with open(os.path.join(self.local_dir, "processed", f"{node_type}.csv"), "rb") as f:
209
217
  df_node = pd.read_csv(f)
210
218
  node_info_dict[self.node_type_map[node_type]] = df_node
219
+ print(i)
211
220
  else:
212
221
  # Download the related files from the BioBridge repo and further process them
213
222
  # List of processed files
214
- url = ('https://media.githubusercontent.com/media/RyanWangZf/BioBridge/'
215
- 'refs/heads/main/data/Processed/')
216
- file_list = ["protein", "molecular", "cellular", "biological", "drug", "disease"]
223
+ url = (
224
+ "https://media.githubusercontent.com/media/RyanWangZf/BioBridge/"
225
+ "refs/heads/main/data/Processed/"
226
+ )
227
+ file_list = [
228
+ "protein",
229
+ "molecular",
230
+ "cellular",
231
+ "biological",
232
+ "drug",
233
+ "disease",
234
+ ]
217
235
 
218
236
  # Download the processed files
219
237
  for i, file in enumerate(file_list):
220
- self._download_file(remote_url=os.path.join(url, f"{file}.csv"),
221
- local_dir=os.path.join(self.local_dir, "processed"),
222
- local_filename=f"{self.preselected_node_types[i]}.csv")
238
+ self._download_file(
239
+ remote_url=os.path.join(url, f"{file}.csv"),
240
+ local_dir=os.path.join(self.local_dir, "processed"),
241
+ local_filename=f"{self.preselected_node_types[i]}.csv",
242
+ )
223
243
 
224
244
  # Build the node index list
225
245
  node_info_dict = {}
226
246
  node_index_list = []
227
247
  for i, file in enumerate(file_list):
228
- df_node = pd.read_csv(os.path.join(self.local_dir, "processed",
229
- f"{self.preselected_node_types[i]}.csv"))
248
+ df_node = pd.read_csv(
249
+ os.path.join(
250
+ self.local_dir,
251
+ "processed",
252
+ f"{self.preselected_node_types[i]}.csv",
253
+ )
254
+ )
230
255
  node_info_dict[self.node_type_map[self.preselected_node_types[i]]] = df_node
231
256
  node_index_list.extend(df_node["node_index"].tolist())
257
+ print(i, file)
232
258
 
233
259
  # Filter the PrimeKG dataset to take into account only the selected node types
234
260
  primekg_triplets = self.primekg.get_edges().copy()
235
261
  primekg_triplets = primekg_triplets[
236
- primekg_triplets["head_index"].isin(node_index_list) &\
237
- primekg_triplets["tail_index"].isin(node_index_list)
262
+ primekg_triplets["head_index"].isin(node_index_list)
263
+ & primekg_triplets["tail_index"].isin(node_index_list)
238
264
  ]
239
265
  primekg_triplets = primekg_triplets.reset_index(drop=True)
240
266
 
@@ -256,8 +282,9 @@ class BioBridgePrimeKG(Dataset):
256
282
 
257
283
  return primekg_triplets, node_info_dict
258
284
 
259
- def _build_train_test_split(self) -> tuple[pd.DataFrame, pd.DataFrame,
260
- pd.DataFrame, pd.DataFrame, pd.DataFrame]:
285
+ def _build_train_test_split(
286
+ self,
287
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
261
288
  """
262
289
  Build the train-test split for BioBridgePrimeKG dataset.
263
290
 
@@ -268,34 +295,31 @@ class BioBridgePrimeKG(Dataset):
268
295
  The test nodes for BioBridgePrimeKG dataset.
269
296
  The full triplets for BioBridgePrimeKG dataset.
270
297
  """
271
- if os.path.exists(os.path.join(self.local_dir, "processed",
272
- "triplet_full_altered.tsv.gz")):
298
+ if os.path.exists(os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz")):
273
299
  # Load each dataframe in the local directory
274
- with open(os.path.join(self.local_dir, "processed",
275
- "triplet_train.tsv.gz"), "rb") as f:
300
+ with open(os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"), "rb") as f:
276
301
  df_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
277
302
 
278
- with open(os.path.join(self.local_dir, "processed",
279
- "node_train.tsv.gz"), "rb") as f:
303
+ with open(os.path.join(self.local_dir, "processed", "node_train.tsv.gz"), "rb") as f:
280
304
  df_node_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
281
305
 
282
- with open(os.path.join(self.local_dir, "processed",
283
- "triplet_test.tsv.gz"), "rb") as f:
306
+ with open(os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"), "rb") as f:
284
307
  df_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
285
308
 
286
- with open(os.path.join(self.local_dir, "processed",
287
- "node_test.tsv.gz"), "rb") as f:
309
+ with open(os.path.join(self.local_dir, "processed", "node_test.tsv.gz"), "rb") as f:
288
310
  df_node_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
289
311
 
290
- with open(os.path.join(self.local_dir, "processed",
291
- "triplet_full_altered.tsv.gz"), "rb") as f:
312
+ with open(
313
+ os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz"),
314
+ "rb",
315
+ ) as f:
292
316
  triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
293
317
  else:
294
318
  # Filtering out some nodes in the embedding dictionary
295
319
  triplets = self.primekg_triplets.copy()
296
320
  triplets = triplets[
297
- triplets["head_index"].isin(list(self.emb_dict.keys())) &\
298
- triplets["tail_index"].isin(list(self.emb_dict.keys()))
321
+ triplets["head_index"].isin(list(self.emb_dict.keys()))
322
+ & triplets["tail_index"].isin(list(self.emb_dict.keys()))
299
323
  ].reset_index(drop=True)
300
324
 
301
325
  # Perform splitting of the triplets
@@ -311,7 +335,7 @@ class BioBridgePrimeKG(Dataset):
311
335
  "test": {
312
336
  "node_index": [],
313
337
  "node_type": [],
314
- }
338
+ },
315
339
  }
316
340
  # Loop over the node types
317
341
  for node_type in triplets["head_type"].unique():
@@ -319,7 +343,7 @@ class BioBridgePrimeKG(Dataset):
319
343
  all_x_indexes = df_sub["head_index"].unique()
320
344
  # By default, we use 90% of the nodes for training and 10% for testing
321
345
  te_x_indexes = np.random.choice(
322
- all_x_indexes, size=int(0.1*len(all_x_indexes)), replace=False
346
+ all_x_indexes, size=int(0.1 * len(all_x_indexes)), replace=False
323
347
  )
324
348
  df_subs = {}
325
349
  df_subs["test"] = df_sub[df_sub["head_index"].isin(te_x_indexes)]
@@ -331,10 +355,10 @@ class BioBridgePrimeKG(Dataset):
331
355
  node_index = {}
332
356
  node_index["train"] = df_subs["train"]["head_index"].unique()
333
357
  node_split["train"]["node_index"].extend(node_index["train"].tolist())
334
- node_split["train"]["node_type"].extend([node_type]*len(node_index["train"]))
358
+ node_split["train"]["node_type"].extend([node_type] * len(node_index["train"]))
335
359
  node_index["test"] = df_subs["test"]["head_index"].unique()
336
360
  node_split["test"]["node_index"].extend(node_index["test"].tolist())
337
- node_split["test"]["node_type"].extend([node_type]*len(node_index["test"]))
361
+ node_split["test"]["node_type"].extend([node_type] * len(node_index["test"]))
338
362
 
339
363
  print(f"Number of {node_type} nodes in train: {len(node_index['train'])}")
340
364
  print(f"Number of {node_type} nodes in test: {len(node_index['test'])}")
@@ -346,18 +370,37 @@ class BioBridgePrimeKG(Dataset):
346
370
  df_node_test = pd.DataFrame(node_split["test"])
347
371
 
348
372
  # Store each dataframe in the local directory
349
- df_train.to_csv(os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"),
350
- sep="\t", compression="gzip", index=False)
351
- df_node_train.to_csv(os.path.join(self.local_dir, "processed", "node_train.tsv.gz"),
352
- sep="\t", compression="gzip", index=False)
353
- df_test.to_csv(os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"),
354
- sep="\t", compression="gzip", index=False)
355
- df_node_test.to_csv(os.path.join(self.local_dir, "processed", "node_test.tsv.gz"),
356
- sep="\t", compression="gzip", index=False)
373
+ df_train.to_csv(
374
+ os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"),
375
+ sep="\t",
376
+ compression="gzip",
377
+ index=False,
378
+ )
379
+ df_node_train.to_csv(
380
+ os.path.join(self.local_dir, "processed", "node_train.tsv.gz"),
381
+ sep="\t",
382
+ compression="gzip",
383
+ index=False,
384
+ )
385
+ df_test.to_csv(
386
+ os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"),
387
+ sep="\t",
388
+ compression="gzip",
389
+ index=False,
390
+ )
391
+ df_node_test.to_csv(
392
+ os.path.join(self.local_dir, "processed", "node_test.tsv.gz"),
393
+ sep="\t",
394
+ compression="gzip",
395
+ index=False,
396
+ )
357
397
  # Store altered full triplets as well
358
- triplets.to_csv(os.path.join(self.local_dir, "processed",
359
- "triplet_full_altered.tsv.gz"),
360
- sep="\t", compression="gzip", index=False)
398
+ triplets.to_csv(
399
+ os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz"),
400
+ sep="\t",
401
+ compression="gzip",
402
+ index=False,
403
+ )
361
404
 
362
405
  return df_train, df_node_train, df_test, df_node_test, triplets
363
406
 
@@ -473,8 +516,13 @@ class BioBridgePrimeKG(Dataset):
473
516
 
474
517
  # Build train-test split
475
518
  print("Building train-test split...")
476
- self.df_train, self.df_node_train, self.df_test, self.df_node_test, self.primekg_triplets =\
477
- self._build_train_test_split()
519
+ (
520
+ self.df_train,
521
+ self.df_node_train,
522
+ self.df_test,
523
+ self.df_node_test,
524
+ self.primekg_triplets,
525
+ ) = self._build_train_test_split()
478
526
 
479
527
  # if build_neg_triplest:
480
528
  # # Build negative triplets
@@ -549,7 +597,7 @@ class BioBridgePrimeKG(Dataset):
549
597
  "train": self.df_train,
550
598
  "node_train": self.df_node_train,
551
599
  "test": self.df_test,
552
- "node_test": self.df_node_test
600
+ "node_test": self.df_node_test,
553
601
  }
554
602
 
555
603
  def get_node_info_dict(self) -> dict:
@@ -6,10 +6,12 @@ Abstract class for dataset.
6
6
 
7
7
  from abc import ABC, abstractmethod
8
8
 
9
+
9
10
  class Dataset(ABC):
10
11
  """
11
12
  Abstract class for dataset.
12
13
  """
14
+
13
15
  @abstractmethod
14
16
  def setup(self):
15
17
  """
@@ -3,11 +3,14 @@ Class for loading PrimeKG dataset.
3
3
  """
4
4
 
5
5
  import os
6
+
7
+ import pandas as pd
6
8
  import requests
7
9
  from tqdm import tqdm
8
- import pandas as pd
10
+
9
11
  from .dataset import Dataset
10
12
 
13
+
11
14
  class PrimeKG(Dataset):
12
15
  """
13
16
  Class for loading PrimeKG dataset.
@@ -41,8 +44,7 @@ class PrimeKG(Dataset):
41
44
  # Make the directory if it doesn't exist
42
45
  os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
43
46
 
44
-
45
- def _download_file(self, remote_url:str, local_path: str):
47
+ def _download_file(self, remote_url: str, local_path: str):
46
48
  """
47
49
  A helper function to download a file from remote URL to the local directory.
48
50
 
@@ -83,17 +85,18 @@ class PrimeKG(Dataset):
83
85
  print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
84
86
 
85
87
  # Download the file from the Harvard Dataverse with designated file_id for node
86
- self._download_file(f"{self.server_path}{self.file_ids['nodes']}",
87
- os.path.join(self.local_dir, "nodes.tab"))
88
+ self._download_file(
89
+ f"{self.server_path}{self.file_ids['nodes']}",
90
+ os.path.join(self.local_dir, "nodes.tab"),
91
+ )
88
92
 
89
93
  # Load the downloaded file into a pandas DataFrame
90
- nodes = pd.read_csv(os.path.join(self.local_dir, "nodes.tab"),
91
- sep="\t", low_memory=False)
94
+ nodes = pd.read_csv(
95
+ os.path.join(self.local_dir, "nodes.tab"), sep="\t", low_memory=False
96
+ )
92
97
 
93
98
  # Further processing of the dataframe
94
- nodes = nodes[
95
- ["node_index", "node_name", "node_source", "node_id", "node_type"]
96
- ]
99
+ nodes = nodes[["node_index", "node_name", "node_source", "node_id", "node_type"]]
97
100
 
98
101
  # Store compressed dataframe in the local directory
99
102
  nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")
@@ -123,17 +126,18 @@ class PrimeKG(Dataset):
123
126
  print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
124
127
 
125
128
  # Download the file from the Harvard Dataverse with designated file_id for edge
126
- self._download_file(f"{self.server_path}{self.file_ids['edges']}",
127
- os.path.join(self.local_dir, "edges.csv"))
129
+ self._download_file(
130
+ f"{self.server_path}{self.file_ids['edges']}",
131
+ os.path.join(self.local_dir, "edges.csv"),
132
+ )
128
133
 
129
134
  # Load the downloaded file into a pandas DataFrame
130
- edges = pd.read_csv(os.path.join(self.local_dir, "edges.csv"),
131
- sep=",", low_memory=False)
135
+ edges = pd.read_csv(
136
+ os.path.join(self.local_dir, "edges.csv"), sep=",", low_memory=False
137
+ )
132
138
 
133
139
  # Further processing of the dataframe
134
- edges = edges.merge(
135
- nodes, left_on="x_index", right_on="node_index"
136
- )
140
+ edges = edges.merge(nodes, left_on="x_index", right_on="node_index")
137
141
  edges.drop(["x_index"], axis=1, inplace=True)
138
142
  edges.rename(
139
143
  columns={
@@ -145,9 +149,7 @@ class PrimeKG(Dataset):
145
149
  },
146
150
  inplace=True,
147
151
  )
148
- edges = edges.merge(
149
- nodes, left_on="y_index", right_on="node_index"
150
- )
152
+ edges = edges.merge(nodes, left_on="y_index", right_on="node_index")
151
153
  edges.drop(["y_index"], axis=1, inplace=True)
152
154
  edges.rename(
153
155
  columns={
@@ -155,15 +157,24 @@ class PrimeKG(Dataset):
155
157
  "node_name": "tail_name",
156
158
  "node_source": "tail_source",
157
159
  "node_id": "tail_id",
158
- "node_type": "tail_type"
160
+ "node_type": "tail_type",
159
161
  },
160
162
  inplace=True,
161
163
  )
162
164
  edges = edges[
163
165
  [
164
- "head_index", "head_name", "head_source", "head_id", "head_type",
165
- "tail_index", "tail_name", "tail_source", "tail_id", "tail_type",
166
- "display_relation", "relation",
166
+ "head_index",
167
+ "head_name",
168
+ "head_source",
169
+ "head_id",
170
+ "head_type",
171
+ "tail_index",
172
+ "tail_name",
173
+ "tail_source",
174
+ "tail_id",
175
+ "tail_type",
176
+ "display_relation",
177
+ "relation",
167
178
  ]
168
179
  ]
169
180
 
@@ -3,16 +3,19 @@ Class for loading StarkQAPrimeKG dataset.
3
3
  """
4
4
 
5
5
  import os
6
- import shutil
7
6
  import pickle
7
+ import shutil
8
+
9
+ import gdown
8
10
  import numpy as np
9
11
  import pandas as pd
10
- from tqdm import tqdm
11
12
  import torch
12
13
  from huggingface_hub import hf_hub_download, list_repo_files
13
- import gdown
14
+ from tqdm import tqdm
15
+
14
16
  from .dataset import Dataset
15
17
 
18
+
16
19
  class StarkQAPrimeKG(Dataset):
17
20
  """
18
21
  Class for loading StarkQAPrimeKG dataset.
@@ -67,41 +70,47 @@ class StarkQAPrimeKG(Dataset):
67
70
 
68
71
  # List all related files in the HuggingFace Hub repository
69
72
  files = list_repo_files(self.hf_repo_id, repo_type="dataset")
70
- files = [f for f in files if ((f.startswith("qa/prime/") or
71
- f.startswith("skb/prime/")) and f.find("raw") == -1)]
73
+ files = [
74
+ f
75
+ for f in files
76
+ if (
77
+ (f.startswith("qa/prime/") or f.startswith("skb/prime/"))
78
+ and f.find("raw") == -1
79
+ )
80
+ ]
72
81
 
73
82
  # Download and save each file in the specified folder
74
83
  for file in tqdm(files):
75
- _ = hf_hub_download(self.hf_repo_id,
76
- file,
77
- repo_type="dataset",
78
- local_dir=self.local_dir)
84
+ _ = hf_hub_download(
85
+ self.hf_repo_id, file, repo_type="dataset", local_dir=self.local_dir
86
+ )
79
87
 
80
88
  # Unzip the processed files
81
89
  shutil.unpack_archive(
82
90
  os.path.join(self.local_dir, "skb/prime/processed.zip"),
83
- os.path.join(self.local_dir, "skb/prime/")
91
+ os.path.join(self.local_dir, "skb/prime/"),
84
92
  )
85
93
 
86
94
  # Load StarkQA dataframe
87
95
  starkqa = pd.read_csv(
88
96
  os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv"),
89
- low_memory=False)
97
+ low_memory=False,
98
+ )
90
99
 
91
100
  # Read split indices
92
- qa_indices = sorted(starkqa['id'].tolist())
101
+ qa_indices = sorted(starkqa["id"].tolist())
93
102
  starkqa_split_idx = {}
94
- for split in ['train', 'val', 'test', 'test-0.1']:
95
- indices_file = os.path.join(self.local_dir, "qa/prime/split", f'{split}.index')
96
- with open(indices_file, 'r', encoding='utf-8') as f:
97
- indices = f.read().strip().split('\n')
103
+ for split in ["train", "val", "test", "test-0.1"]:
104
+ indices_file = os.path.join(self.local_dir, "qa/prime/split", f"{split}.index")
105
+ with open(indices_file, encoding="utf-8") as f:
106
+ indices = f.read().strip().split("\n")
98
107
  query_ids = [int(idx) for idx in indices]
99
108
  starkqa_split_idx[split] = np.array(
100
109
  [qa_indices.index(query_id) for query_id in query_ids]
101
110
  )
102
111
 
103
112
  # Load the node info of PrimeKG preprocessed for StarkQA
104
- with open(os.path.join(self.local_dir, 'skb/prime/processed/node_info.pkl'), 'rb') as f:
113
+ with open(os.path.join(self.local_dir, "skb/prime/processed/node_info.pkl"), "rb") as f:
105
114
  starkqa_node_info = pickle.load(f)
106
115
 
107
116
  return starkqa, starkqa_split_idx, starkqa_node_info
@@ -116,9 +125,9 @@ class StarkQAPrimeKG(Dataset):
116
125
  """
117
126
  # Load the provided embeddings of query and nodes
118
127
  # Note that they utilized 'text-embedding-ada-002' for embeddings
119
- emb_model = 'text-embedding-ada-002'
120
- query_emb_url = 'https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU'
121
- node_emb_url = 'https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy'
128
+ emb_model = "text-embedding-ada-002"
129
+ query_emb_url = "https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU"
130
+ node_emb_url = "https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy"
122
131
 
123
132
  # Prepare respective directories to store the embeddings
124
133
  emb_dir = os.path.join(self.local_dir, emb_model)
@@ -154,7 +163,6 @@ class StarkQAPrimeKG(Dataset):
154
163
  print("Loading StarkQAPrimeKG embeddings...")
155
164
  self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()
156
165
 
157
-
158
166
  def get_starkqa(self) -> pd.DataFrame:
159
167
  """
160
168
  Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.
@@ -0,0 +1,23 @@
1
+ # .env.example (DO NOT put actual API keys here, read the README.md)
2
+
3
+ # OPENAI API KEY
4
+ OPENAI_API_KEY=your_openai_api_key_here
5
+
6
+ # LangSmith API KEY
7
+ LANGCHAIN_TRACING_V2=true
8
+ LANGCHAIN_API_KEY=your_langchain_api_key_here
9
+
10
+ # NVIDIA API KEY
11
+ NVIDIA_API_KEY=your_nvidia_api_key_here
12
+
13
+ # Environment variables for the data loader (Milvus connection settings)
14
+ MILVUS_HOST=localhost
15
+ MILVUS_PORT=19530
16
+ MILVUS_USER=root
17
+ MILVUS_PASSWORD=Milvus
18
+ MILVUS_DATABASE=your_database_name_here
19
+
20
+ # Set DATA_DIR to the absolute path of your own multimodal data directory
21
+ # DATA_DIR=/your_absolute_path_to_your_data_dir/
22
+
23
+ BATCH_SIZE=500