aiagents4pharma 1.43.0__py3-none-any.whl → 1.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
- aiagents4pharma/talk2aiagents4pharma/Dockerfile +105 -0
- aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
- aiagents4pharma/talk2aiagents4pharma/__init__.py +4 -5
- aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +3 -2
- aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +24 -23
- aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +1 -1
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2aiagents4pharma/install.md +127 -0
- aiagents4pharma/talk2aiagents4pharma/states/__init__.py +3 -2
- aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +5 -3
- aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +72 -50
- aiagents4pharma/talk2biomodels/.dockerignore +13 -0
- aiagents4pharma/talk2biomodels/Dockerfile +104 -0
- aiagents4pharma/talk2biomodels/README.md +1 -0
- aiagents4pharma/talk2biomodels/__init__.py +4 -8
- aiagents4pharma/talk2biomodels/agents/__init__.py +3 -2
- aiagents4pharma/talk2biomodels/agents/t2b_agent.py +47 -42
- aiagents4pharma/talk2biomodels/api/__init__.py +4 -5
- aiagents4pharma/talk2biomodels/api/kegg.py +14 -10
- aiagents4pharma/talk2biomodels/api/ols.py +13 -10
- aiagents4pharma/talk2biomodels/api/uniprot.py +7 -6
- aiagents4pharma/talk2biomodels/configs/__init__.py +3 -4
- aiagents4pharma/talk2biomodels/configs/agents/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/config.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/tools/__init__.py +4 -5
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +1 -2
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/install.md +63 -0
- aiagents4pharma/talk2biomodels/models/__init__.py +4 -4
- aiagents4pharma/talk2biomodels/models/basico_model.py +36 -28
- aiagents4pharma/talk2biomodels/models/sys_bio_model.py +13 -10
- aiagents4pharma/talk2biomodels/states/__init__.py +3 -2
- aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +12 -8
- aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
- aiagents4pharma/talk2biomodels/tests/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
- aiagents4pharma/talk2biomodels/tests/test_api.py +18 -14
- aiagents4pharma/talk2biomodels/tests/test_ask_question.py +8 -9
- aiagents4pharma/talk2biomodels/tests/test_basico_model.py +15 -9
- aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +54 -55
- aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +28 -27
- aiagents4pharma/talk2biomodels/tests/test_integration.py +21 -33
- aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +14 -11
- aiagents4pharma/talk2biomodels/tests/test_param_scan.py +21 -20
- aiagents4pharma/talk2biomodels/tests/test_query_article.py +129 -29
- aiagents4pharma/talk2biomodels/tests/test_search_models.py +9 -13
- aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +16 -15
- aiagents4pharma/talk2biomodels/tests/test_steady_state.py +12 -22
- aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +33 -29
- aiagents4pharma/talk2biomodels/tools/__init__.py +15 -12
- aiagents4pharma/talk2biomodels/tools/ask_question.py +42 -32
- aiagents4pharma/talk2biomodels/tools/custom_plotter.py +51 -43
- aiagents4pharma/talk2biomodels/tools/get_annotation.py +99 -75
- aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +57 -51
- aiagents4pharma/talk2biomodels/tools/load_arguments.py +52 -32
- aiagents4pharma/talk2biomodels/tools/load_biomodel.py +8 -2
- aiagents4pharma/talk2biomodels/tools/parameter_scan.py +107 -90
- aiagents4pharma/talk2biomodels/tools/query_article.py +14 -13
- aiagents4pharma/talk2biomodels/tools/search_models.py +37 -26
- aiagents4pharma/talk2biomodels/tools/simulate_model.py +47 -37
- aiagents4pharma/talk2biomodels/tools/steady_state.py +76 -58
- aiagents4pharma/talk2biomodels/tools/utils.py +4 -3
- aiagents4pharma/talk2cells/README.md +1 -0
- aiagents4pharma/talk2cells/__init__.py +4 -5
- aiagents4pharma/talk2cells/agents/__init__.py +3 -2
- aiagents4pharma/talk2cells/agents/scp_agent.py +21 -19
- aiagents4pharma/talk2cells/states/__init__.py +3 -2
- aiagents4pharma/talk2cells/states/state_talk2cells.py +4 -2
- aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +8 -9
- aiagents4pharma/talk2cells/tools/__init__.py +3 -2
- aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +4 -4
- aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +5 -3
- aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +21 -22
- aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
- aiagents4pharma/talk2knowledgegraphs/Dockerfile +103 -0
- aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
- aiagents4pharma/talk2knowledgegraphs/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +40 -30
- aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +3 -6
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +8 -8
- aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +4 -5
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +17 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +4 -6
- aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +115 -67
- aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +2 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +35 -24
- aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +29 -21
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +190 -0
- aiagents4pharma/talk2knowledgegraphs/install.md +140 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +31 -65
- aiagents4pharma/talk2knowledgegraphs/states/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +65 -40
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +54 -48
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +17 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +33 -24
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +116 -69
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +736 -413
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +22 -15
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +19 -12
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +95 -48
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +13 -18
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +10 -3
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +4 -3
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +9 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +6 -6
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +442 -42
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +3 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +10 -6
- aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +10 -7
- aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +15 -20
- aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +245 -205
- aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +92 -90
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +25 -37
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +10 -13
- aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +11 -14
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +7 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +12 -6
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +8 -6
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +9 -6
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +15 -9
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +23 -20
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +12 -10
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +16 -10
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +26 -18
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +4 -5
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +218 -81
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +53 -47
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +18 -14
- aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +22 -23
- aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +11 -10
- aiagents4pharma/talk2scholars/.dockerignore +13 -0
- aiagents4pharma/talk2scholars/Dockerfile +104 -0
- aiagents4pharma/talk2scholars/README.md +1 -0
- aiagents4pharma/talk2scholars/agents/__init__.py +1 -5
- aiagents4pharma/talk2scholars/agents/main_agent.py +6 -4
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +5 -4
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +4 -2
- aiagents4pharma/talk2scholars/agents/s2_agent.py +2 -2
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +10 -11
- aiagents4pharma/talk2scholars/configs/__init__.py +1 -3
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +1 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +1 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +1 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +8 -8
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +7 -7
- aiagents4pharma/talk2scholars/configs/tools/__init__.py +8 -6
- aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
- aiagents4pharma/talk2scholars/install.md +122 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +8 -8
- aiagents4pharma/talk2scholars/tests/{test_main_agent.py → test_agents_main_agent.py} +41 -23
- aiagents4pharma/talk2scholars/tests/{test_paper_download_agent.py → test_agents_paper_agents_download_agent.py} +10 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_agent.py → test_agents_pdf_agent.py} +6 -10
- aiagents4pharma/talk2scholars/tests/{test_s2_agent.py → test_agents_s2_agent.py} +8 -16
- aiagents4pharma/talk2scholars/tests/{test_zotero_agent.py → test_agents_zotero_agent.py} +5 -7
- aiagents4pharma/talk2scholars/tests/{test_s2_display_dataframe.py → test_s2_tools_display_dataframe.py} +6 -7
- aiagents4pharma/talk2scholars/tests/{test_s2_query_dataframe.py → test_s2_tools_query_dataframe.py} +5 -15
- aiagents4pharma/talk2scholars/tests/{test_paper_downloader.py → test_tools_paper_downloader.py} +25 -63
- aiagents4pharma/talk2scholars/tests/{test_question_and_answer_tool.py → test_tools_question_and_answer_tool.py} +2 -6
- aiagents4pharma/talk2scholars/tests/{test_s2_multi.py → test_tools_s2_multi.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_s2_retrieve.py → test_tools_s2_retrieve.py} +2 -1
- aiagents4pharma/talk2scholars/tests/{test_s2_search.py → test_tools_s2_search.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_s2_single.py → test_tools_s2_single.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_arxiv_downloader.py → test_utils_arxiv_downloader.py} +16 -25
- aiagents4pharma/talk2scholars/tests/{test_base_paper_downloader.py → test_utils_base_paper_downloader.py} +25 -47
- aiagents4pharma/talk2scholars/tests/{test_biorxiv_downloader.py → test_utils_biorxiv_downloader.py} +14 -42
- aiagents4pharma/talk2scholars/tests/{test_medrxiv_downloader.py → test_utils_medrxiv_downloader.py} +15 -49
- aiagents4pharma/talk2scholars/tests/{test_nvidia_nim_reranker.py → test_utils_nvidia_nim_reranker.py} +6 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_answer_formatter.py → test_utils_pdf_answer_formatter.py} +1 -0
- aiagents4pharma/talk2scholars/tests/{test_pdf_batch_processor.py → test_utils_pdf_batch_processor.py} +6 -15
- aiagents4pharma/talk2scholars/tests/{test_pdf_collection_manager.py → test_utils_pdf_collection_manager.py} +34 -11
- aiagents4pharma/talk2scholars/tests/{test_pdf_document_processor.py → test_utils_pdf_document_processor.py} +2 -3
- aiagents4pharma/talk2scholars/tests/{test_pdf_generate_answer.py → test_utils_pdf_generate_answer.py} +3 -6
- aiagents4pharma/talk2scholars/tests/{test_pdf_gpu_detection.py → test_utils_pdf_gpu_detection.py} +5 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_rag_pipeline.py → test_utils_pdf_rag_pipeline.py} +7 -17
- aiagents4pharma/talk2scholars/tests/{test_pdf_retrieve_chunks.py → test_utils_pdf_retrieve_chunks.py} +4 -11
- aiagents4pharma/talk2scholars/tests/{test_pdf_singleton_manager.py → test_utils_pdf_singleton_manager.py} +26 -23
- aiagents4pharma/talk2scholars/tests/{test_pdf_vector_normalization.py → test_utils_pdf_vector_normalization.py} +1 -1
- aiagents4pharma/talk2scholars/tests/{test_pdf_vector_store.py → test_utils_pdf_vector_store.py} +27 -55
- aiagents4pharma/talk2scholars/tests/{test_pubmed_downloader.py → test_utils_pubmed_downloader.py} +31 -91
- aiagents4pharma/talk2scholars/tests/{test_read_helper_utils.py → test_utils_read_helper_utils.py} +2 -6
- aiagents4pharma/talk2scholars/tests/{test_s2_utils_ext_ids.py → test_utils_s2_utils_ext_ids.py} +5 -15
- aiagents4pharma/talk2scholars/tests/{test_zotero_human_in_the_loop.py → test_utils_zotero_human_in_the_loop.py} +6 -13
- aiagents4pharma/talk2scholars/tests/{test_zotero_path.py → test_utils_zotero_path.py} +53 -45
- aiagents4pharma/talk2scholars/tests/{test_zotero_read.py → test_utils_zotero_read.py} +30 -91
- aiagents4pharma/talk2scholars/tests/{test_zotero_write.py → test_utils_zotero_write.py} +6 -16
- aiagents4pharma/talk2scholars/tools/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +20 -35
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +7 -5
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +9 -11
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +14 -21
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +14 -22
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +11 -13
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +14 -28
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +4 -8
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +16 -14
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +15 -17
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +5 -5
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +2 -6
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +5 -9
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +6 -15
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +7 -15
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +3 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +8 -17
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +17 -33
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +8 -6
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +3 -7
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +7 -6
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +5 -12
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +2 -4
- aiagents4pharma/talk2scholars/tools/s2/search.py +6 -6
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -3
- aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +1 -3
- aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +12 -18
- aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +11 -18
- aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +11 -16
- aiagents4pharma/talk2scholars/tools/zotero/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +21 -39
- aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +2 -6
- aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +8 -11
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +4 -12
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +13 -27
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +4 -7
- aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +8 -10
- aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +3 -2
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/METADATA +115 -50
- aiagents4pharma-1.45.0.dist-info/RECORD +324 -0
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/WHEEL +1 -2
- aiagents4pharma-1.43.0.dist-info/RECORD +0 -293
- aiagents4pharma-1.43.0.dist-info/top_level.txt +0 -1
- /aiagents4pharma/talk2scholars/tests/{test_state.py → test_states_state.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_pdf_paper_loader.py → test_utils_pdf_paper_loader.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_tool_helper_utils.py → test_utils_tool_helper_utils.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_zotero_pdf_downloader_utils.py → test_utils_zotero_pdf_downloader_utils.py} +0 -0
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,16 +2,19 @@
|
|
2
2
|
Class for loading BioBridgePrimeKG dataset.
|
3
3
|
"""
|
4
4
|
|
5
|
+
import json
|
5
6
|
import os
|
6
7
|
import pickle
|
7
|
-
|
8
|
-
import requests
|
8
|
+
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
11
|
+
import requests
|
11
12
|
from tqdm import tqdm
|
13
|
+
|
12
14
|
from .dataset import Dataset
|
13
15
|
from .primekg import PrimeKG
|
14
16
|
|
17
|
+
|
15
18
|
class BioBridgePrimeKG(Dataset):
|
16
19
|
"""
|
17
20
|
Class for loading BioBridgePrimeKG dataset.
|
@@ -21,11 +24,13 @@ class BioBridgePrimeKG(Dataset):
|
|
21
24
|
https://github.com/RyanWangZf/BioBridge
|
22
25
|
"""
|
23
26
|
|
24
|
-
def __init__(
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
27
|
+
def __init__(
|
28
|
+
self,
|
29
|
+
primekg_dir: str = "../../../data/primekg/",
|
30
|
+
local_dir: str = "../../../data/biobridge_primekg/",
|
31
|
+
random_seed: int = 0,
|
32
|
+
n_neg_samples: int = 5,
|
33
|
+
):
|
29
34
|
"""
|
30
35
|
Constructor for BioBridgePrimeKG class.
|
31
36
|
|
@@ -92,10 +97,7 @@ class BioBridgePrimeKG(Dataset):
|
|
92
97
|
|
93
98
|
return primekg_data
|
94
99
|
|
95
|
-
def _download_file(self,
|
96
|
-
remote_url:str,
|
97
|
-
local_dir: str,
|
98
|
-
local_filename: str):
|
100
|
+
def _download_file(self, remote_url: str, local_dir: str, local_filename: str):
|
99
101
|
"""
|
100
102
|
A helper function to download a file from remote URL to the local directory.
|
101
103
|
|
@@ -135,13 +137,16 @@ class BioBridgePrimeKG(Dataset):
|
|
135
137
|
"""
|
136
138
|
# Download the data config file of BioBridgePrimeKG
|
137
139
|
self._download_file(
|
138
|
-
remote_url=
|
139
|
-
|
140
|
+
remote_url=(
|
141
|
+
"https://raw.githubusercontent.com/RyanWangZf/BioBridge/"
|
142
|
+
"refs/heads/main/data/BindData/data_config.json"
|
143
|
+
),
|
140
144
|
local_dir=self.local_dir,
|
141
|
-
local_filename=
|
145
|
+
local_filename="data_config.json",
|
146
|
+
)
|
142
147
|
|
143
148
|
# Load the downloaded data config file
|
144
|
-
with open(os.path.join(self.local_dir,
|
149
|
+
with open(os.path.join(self.local_dir, "data_config.json"), encoding="utf-8") as f:
|
145
150
|
data_config = json.load(f)
|
146
151
|
|
147
152
|
return data_config
|
@@ -161,15 +166,19 @@ class BioBridgePrimeKG(Dataset):
|
|
161
166
|
else:
|
162
167
|
# Download the embeddings from the BioBridge repo and further process them
|
163
168
|
# List of embedding source files
|
164
|
-
url = (
|
165
|
-
|
169
|
+
url = (
|
170
|
+
"https://media.githubusercontent.com/media/RyanWangZf/BioBridge/"
|
171
|
+
"refs/heads/main/data/embeddings/esm2b_unimo_pubmedbert/"
|
172
|
+
)
|
166
173
|
file_list = [f"{n}.pkl" for n in self.preselected_node_types]
|
167
174
|
|
168
175
|
# Download the embeddings
|
169
176
|
for file in file_list:
|
170
|
-
self._download_file(
|
171
|
-
|
172
|
-
|
177
|
+
self._download_file(
|
178
|
+
remote_url=os.path.join(url, file),
|
179
|
+
local_dir=os.path.join(self.local_dir, "embeddings"),
|
180
|
+
local_filename=file,
|
181
|
+
)
|
173
182
|
|
174
183
|
# Unified embeddings
|
175
184
|
emb_dict_all = {}
|
@@ -179,7 +188,7 @@ class BioBridgePrimeKG(Dataset):
|
|
179
188
|
emb_ar = emb["embedding"]
|
180
189
|
if not isinstance(emb_ar, list):
|
181
190
|
emb_ar = emb_ar.tolist()
|
182
|
-
emb_dict_all.update(dict(zip(emb["node_index"], emb_ar)))
|
191
|
+
emb_dict_all.update(dict(zip(emb["node_index"], emb_ar, strict=False)))
|
183
192
|
|
184
193
|
# Store embeddings
|
185
194
|
with open(processed_file_path, "wb") as f:
|
@@ -204,37 +213,54 @@ class BioBridgePrimeKG(Dataset):
|
|
204
213
|
# Load each dataframe in the local directory
|
205
214
|
node_info_dict = {}
|
206
215
|
for i, node_type in enumerate(self.preselected_node_types):
|
207
|
-
with open(os.path.join(self.local_dir, "processed",
|
208
|
-
f"{node_type}.csv"), "rb") as f:
|
216
|
+
with open(os.path.join(self.local_dir, "processed", f"{node_type}.csv"), "rb") as f:
|
209
217
|
df_node = pd.read_csv(f)
|
210
218
|
node_info_dict[self.node_type_map[node_type]] = df_node
|
219
|
+
print(i)
|
211
220
|
else:
|
212
221
|
# Download the related files from the BioBridge repo and further process them
|
213
222
|
# List of processed files
|
214
|
-
url = (
|
215
|
-
|
216
|
-
|
223
|
+
url = (
|
224
|
+
"https://media.githubusercontent.com/media/RyanWangZf/BioBridge/"
|
225
|
+
"refs/heads/main/data/Processed/"
|
226
|
+
)
|
227
|
+
file_list = [
|
228
|
+
"protein",
|
229
|
+
"molecular",
|
230
|
+
"cellular",
|
231
|
+
"biological",
|
232
|
+
"drug",
|
233
|
+
"disease",
|
234
|
+
]
|
217
235
|
|
218
236
|
# Download the processed files
|
219
237
|
for i, file in enumerate(file_list):
|
220
|
-
self._download_file(
|
221
|
-
|
222
|
-
|
238
|
+
self._download_file(
|
239
|
+
remote_url=os.path.join(url, f"{file}.csv"),
|
240
|
+
local_dir=os.path.join(self.local_dir, "processed"),
|
241
|
+
local_filename=f"{self.preselected_node_types[i]}.csv",
|
242
|
+
)
|
223
243
|
|
224
244
|
# Build the node index list
|
225
245
|
node_info_dict = {}
|
226
246
|
node_index_list = []
|
227
247
|
for i, file in enumerate(file_list):
|
228
|
-
df_node = pd.read_csv(
|
229
|
-
|
248
|
+
df_node = pd.read_csv(
|
249
|
+
os.path.join(
|
250
|
+
self.local_dir,
|
251
|
+
"processed",
|
252
|
+
f"{self.preselected_node_types[i]}.csv",
|
253
|
+
)
|
254
|
+
)
|
230
255
|
node_info_dict[self.node_type_map[self.preselected_node_types[i]]] = df_node
|
231
256
|
node_index_list.extend(df_node["node_index"].tolist())
|
257
|
+
print(i, file)
|
232
258
|
|
233
259
|
# Filter the PrimeKG dataset to take into account only the selected node types
|
234
260
|
primekg_triplets = self.primekg.get_edges().copy()
|
235
261
|
primekg_triplets = primekg_triplets[
|
236
|
-
primekg_triplets["head_index"].isin(node_index_list)
|
237
|
-
primekg_triplets["tail_index"].isin(node_index_list)
|
262
|
+
primekg_triplets["head_index"].isin(node_index_list)
|
263
|
+
& primekg_triplets["tail_index"].isin(node_index_list)
|
238
264
|
]
|
239
265
|
primekg_triplets = primekg_triplets.reset_index(drop=True)
|
240
266
|
|
@@ -256,8 +282,9 @@ class BioBridgePrimeKG(Dataset):
|
|
256
282
|
|
257
283
|
return primekg_triplets, node_info_dict
|
258
284
|
|
259
|
-
def _build_train_test_split(
|
260
|
-
|
285
|
+
def _build_train_test_split(
|
286
|
+
self,
|
287
|
+
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
261
288
|
"""
|
262
289
|
Build the train-test split for BioBridgePrimeKG dataset.
|
263
290
|
|
@@ -268,34 +295,31 @@ class BioBridgePrimeKG(Dataset):
|
|
268
295
|
The test nodes for BioBridgePrimeKG dataset.
|
269
296
|
The full triplets for BioBridgePrimeKG dataset.
|
270
297
|
"""
|
271
|
-
if os.path.exists(os.path.join(self.local_dir, "processed",
|
272
|
-
"triplet_full_altered.tsv.gz")):
|
298
|
+
if os.path.exists(os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz")):
|
273
299
|
# Load each dataframe in the local directory
|
274
|
-
with open(os.path.join(self.local_dir, "processed",
|
275
|
-
"triplet_train.tsv.gz"), "rb") as f:
|
300
|
+
with open(os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"), "rb") as f:
|
276
301
|
df_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
|
277
302
|
|
278
|
-
with open(os.path.join(self.local_dir, "processed",
|
279
|
-
"node_train.tsv.gz"), "rb") as f:
|
303
|
+
with open(os.path.join(self.local_dir, "processed", "node_train.tsv.gz"), "rb") as f:
|
280
304
|
df_node_train = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
|
281
305
|
|
282
|
-
with open(os.path.join(self.local_dir, "processed",
|
283
|
-
"triplet_test.tsv.gz"), "rb") as f:
|
306
|
+
with open(os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"), "rb") as f:
|
284
307
|
df_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
|
285
308
|
|
286
|
-
with open(os.path.join(self.local_dir, "processed",
|
287
|
-
"node_test.tsv.gz"), "rb") as f:
|
309
|
+
with open(os.path.join(self.local_dir, "processed", "node_test.tsv.gz"), "rb") as f:
|
288
310
|
df_node_test = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
|
289
311
|
|
290
|
-
with open(
|
291
|
-
|
312
|
+
with open(
|
313
|
+
os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz"),
|
314
|
+
"rb",
|
315
|
+
) as f:
|
292
316
|
triplets = pd.read_csv(f, sep="\t", compression="gzip", low_memory=False)
|
293
317
|
else:
|
294
318
|
# Filtering out some nodes in the embedding dictionary
|
295
319
|
triplets = self.primekg_triplets.copy()
|
296
320
|
triplets = triplets[
|
297
|
-
triplets["head_index"].isin(list(self.emb_dict.keys()))
|
298
|
-
triplets["tail_index"].isin(list(self.emb_dict.keys()))
|
321
|
+
triplets["head_index"].isin(list(self.emb_dict.keys()))
|
322
|
+
& triplets["tail_index"].isin(list(self.emb_dict.keys()))
|
299
323
|
].reset_index(drop=True)
|
300
324
|
|
301
325
|
# Perform splitting of the triplets
|
@@ -311,7 +335,7 @@ class BioBridgePrimeKG(Dataset):
|
|
311
335
|
"test": {
|
312
336
|
"node_index": [],
|
313
337
|
"node_type": [],
|
314
|
-
}
|
338
|
+
},
|
315
339
|
}
|
316
340
|
# Loop over the node types
|
317
341
|
for node_type in triplets["head_type"].unique():
|
@@ -319,7 +343,7 @@ class BioBridgePrimeKG(Dataset):
|
|
319
343
|
all_x_indexes = df_sub["head_index"].unique()
|
320
344
|
# By default, we use 90% of the nodes for training and 10% for testing
|
321
345
|
te_x_indexes = np.random.choice(
|
322
|
-
all_x_indexes, size=int(0.1*len(all_x_indexes)), replace=False
|
346
|
+
all_x_indexes, size=int(0.1 * len(all_x_indexes)), replace=False
|
323
347
|
)
|
324
348
|
df_subs = {}
|
325
349
|
df_subs["test"] = df_sub[df_sub["head_index"].isin(te_x_indexes)]
|
@@ -331,10 +355,10 @@ class BioBridgePrimeKG(Dataset):
|
|
331
355
|
node_index = {}
|
332
356
|
node_index["train"] = df_subs["train"]["head_index"].unique()
|
333
357
|
node_split["train"]["node_index"].extend(node_index["train"].tolist())
|
334
|
-
node_split["train"]["node_type"].extend([node_type]*len(node_index["train"]))
|
358
|
+
node_split["train"]["node_type"].extend([node_type] * len(node_index["train"]))
|
335
359
|
node_index["test"] = df_subs["test"]["head_index"].unique()
|
336
360
|
node_split["test"]["node_index"].extend(node_index["test"].tolist())
|
337
|
-
node_split["test"]["node_type"].extend([node_type]*len(node_index["test"]))
|
361
|
+
node_split["test"]["node_type"].extend([node_type] * len(node_index["test"]))
|
338
362
|
|
339
363
|
print(f"Number of {node_type} nodes in train: {len(node_index['train'])}")
|
340
364
|
print(f"Number of {node_type} nodes in test: {len(node_index['test'])}")
|
@@ -346,18 +370,37 @@ class BioBridgePrimeKG(Dataset):
|
|
346
370
|
df_node_test = pd.DataFrame(node_split["test"])
|
347
371
|
|
348
372
|
# Store each dataframe in the local directory
|
349
|
-
df_train.to_csv(
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
373
|
+
df_train.to_csv(
|
374
|
+
os.path.join(self.local_dir, "processed", "triplet_train.tsv.gz"),
|
375
|
+
sep="\t",
|
376
|
+
compression="gzip",
|
377
|
+
index=False,
|
378
|
+
)
|
379
|
+
df_node_train.to_csv(
|
380
|
+
os.path.join(self.local_dir, "processed", "node_train.tsv.gz"),
|
381
|
+
sep="\t",
|
382
|
+
compression="gzip",
|
383
|
+
index=False,
|
384
|
+
)
|
385
|
+
df_test.to_csv(
|
386
|
+
os.path.join(self.local_dir, "processed", "triplet_test.tsv.gz"),
|
387
|
+
sep="\t",
|
388
|
+
compression="gzip",
|
389
|
+
index=False,
|
390
|
+
)
|
391
|
+
df_node_test.to_csv(
|
392
|
+
os.path.join(self.local_dir, "processed", "node_test.tsv.gz"),
|
393
|
+
sep="\t",
|
394
|
+
compression="gzip",
|
395
|
+
index=False,
|
396
|
+
)
|
357
397
|
# Store altered full triplets as well
|
358
|
-
triplets.to_csv(
|
359
|
-
|
360
|
-
|
398
|
+
triplets.to_csv(
|
399
|
+
os.path.join(self.local_dir, "processed", "triplet_full_altered.tsv.gz"),
|
400
|
+
sep="\t",
|
401
|
+
compression="gzip",
|
402
|
+
index=False,
|
403
|
+
)
|
361
404
|
|
362
405
|
return df_train, df_node_train, df_test, df_node_test, triplets
|
363
406
|
|
@@ -473,8 +516,13 @@ class BioBridgePrimeKG(Dataset):
|
|
473
516
|
|
474
517
|
# Build train-test split
|
475
518
|
print("Building train-test split...")
|
476
|
-
|
477
|
-
|
519
|
+
(
|
520
|
+
self.df_train,
|
521
|
+
self.df_node_train,
|
522
|
+
self.df_test,
|
523
|
+
self.df_node_test,
|
524
|
+
self.primekg_triplets,
|
525
|
+
) = self._build_train_test_split()
|
478
526
|
|
479
527
|
# if build_neg_triplest:
|
480
528
|
# # Build negative triplets
|
@@ -549,7 +597,7 @@ class BioBridgePrimeKG(Dataset):
|
|
549
597
|
"train": self.df_train,
|
550
598
|
"node_train": self.df_node_train,
|
551
599
|
"test": self.df_test,
|
552
|
-
"node_test": self.df_node_test
|
600
|
+
"node_test": self.df_node_test,
|
553
601
|
}
|
554
602
|
|
555
603
|
def get_node_info_dict(self) -> dict:
|
@@ -3,11 +3,14 @@ Class for loading PrimeKG dataset.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import os
|
6
|
+
|
7
|
+
import pandas as pd
|
6
8
|
import requests
|
7
9
|
from tqdm import tqdm
|
8
|
-
|
10
|
+
|
9
11
|
from .dataset import Dataset
|
10
12
|
|
13
|
+
|
11
14
|
class PrimeKG(Dataset):
|
12
15
|
"""
|
13
16
|
Class for loading PrimeKG dataset.
|
@@ -41,8 +44,7 @@ class PrimeKG(Dataset):
|
|
41
44
|
# Make the directory if it doesn't exist
|
42
45
|
os.makedirs(os.path.dirname(self.local_dir), exist_ok=True)
|
43
46
|
|
44
|
-
|
45
|
-
def _download_file(self, remote_url:str, local_path: str):
|
47
|
+
def _download_file(self, remote_url: str, local_path: str):
|
46
48
|
"""
|
47
49
|
A helper function to download a file from remote URL to the local directory.
|
48
50
|
|
@@ -83,17 +85,18 @@ class PrimeKG(Dataset):
|
|
83
85
|
print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
|
84
86
|
|
85
87
|
# Download the file from the Harvard Dataverse with designated file_id for node
|
86
|
-
self._download_file(
|
87
|
-
|
88
|
+
self._download_file(
|
89
|
+
f"{self.server_path}{self.file_ids['nodes']}",
|
90
|
+
os.path.join(self.local_dir, "nodes.tab"),
|
91
|
+
)
|
88
92
|
|
89
93
|
# Load the downloaded file into a pandas DataFrame
|
90
|
-
nodes = pd.read_csv(
|
91
|
-
|
94
|
+
nodes = pd.read_csv(
|
95
|
+
os.path.join(self.local_dir, "nodes.tab"), sep="\t", low_memory=False
|
96
|
+
)
|
92
97
|
|
93
98
|
# Further processing of the dataframe
|
94
|
-
nodes = nodes[
|
95
|
-
["node_index", "node_name", "node_source", "node_id", "node_type"]
|
96
|
-
]
|
99
|
+
nodes = nodes[["node_index", "node_name", "node_source", "node_id", "node_type"]]
|
97
100
|
|
98
101
|
# Store compressed dataframe in the local directory
|
99
102
|
nodes.to_csv(local_file, index=False, sep="\t", compression="gzip")
|
@@ -123,17 +126,18 @@ class PrimeKG(Dataset):
|
|
123
126
|
print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
|
124
127
|
|
125
128
|
# Download the file from the Harvard Dataverse with designated file_id for edge
|
126
|
-
self._download_file(
|
127
|
-
|
129
|
+
self._download_file(
|
130
|
+
f"{self.server_path}{self.file_ids['edges']}",
|
131
|
+
os.path.join(self.local_dir, "edges.csv"),
|
132
|
+
)
|
128
133
|
|
129
134
|
# Load the downloaded file into a pandas DataFrame
|
130
|
-
edges = pd.read_csv(
|
131
|
-
|
135
|
+
edges = pd.read_csv(
|
136
|
+
os.path.join(self.local_dir, "edges.csv"), sep=",", low_memory=False
|
137
|
+
)
|
132
138
|
|
133
139
|
# Further processing of the dataframe
|
134
|
-
edges = edges.merge(
|
135
|
-
nodes, left_on="x_index", right_on="node_index"
|
136
|
-
)
|
140
|
+
edges = edges.merge(nodes, left_on="x_index", right_on="node_index")
|
137
141
|
edges.drop(["x_index"], axis=1, inplace=True)
|
138
142
|
edges.rename(
|
139
143
|
columns={
|
@@ -145,9 +149,7 @@ class PrimeKG(Dataset):
|
|
145
149
|
},
|
146
150
|
inplace=True,
|
147
151
|
)
|
148
|
-
edges = edges.merge(
|
149
|
-
nodes, left_on="y_index", right_on="node_index"
|
150
|
-
)
|
152
|
+
edges = edges.merge(nodes, left_on="y_index", right_on="node_index")
|
151
153
|
edges.drop(["y_index"], axis=1, inplace=True)
|
152
154
|
edges.rename(
|
153
155
|
columns={
|
@@ -155,15 +157,24 @@ class PrimeKG(Dataset):
|
|
155
157
|
"node_name": "tail_name",
|
156
158
|
"node_source": "tail_source",
|
157
159
|
"node_id": "tail_id",
|
158
|
-
"node_type": "tail_type"
|
160
|
+
"node_type": "tail_type",
|
159
161
|
},
|
160
162
|
inplace=True,
|
161
163
|
)
|
162
164
|
edges = edges[
|
163
165
|
[
|
164
|
-
"head_index",
|
165
|
-
"
|
166
|
-
"
|
166
|
+
"head_index",
|
167
|
+
"head_name",
|
168
|
+
"head_source",
|
169
|
+
"head_id",
|
170
|
+
"head_type",
|
171
|
+
"tail_index",
|
172
|
+
"tail_name",
|
173
|
+
"tail_source",
|
174
|
+
"tail_id",
|
175
|
+
"tail_type",
|
176
|
+
"display_relation",
|
177
|
+
"relation",
|
167
178
|
]
|
168
179
|
]
|
169
180
|
|
@@ -3,16 +3,19 @@ Class for loading StarkQAPrimeKG dataset.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import os
|
6
|
-
import shutil
|
7
6
|
import pickle
|
7
|
+
import shutil
|
8
|
+
|
9
|
+
import gdown
|
8
10
|
import numpy as np
|
9
11
|
import pandas as pd
|
10
|
-
from tqdm import tqdm
|
11
12
|
import torch
|
12
13
|
from huggingface_hub import hf_hub_download, list_repo_files
|
13
|
-
import
|
14
|
+
from tqdm import tqdm
|
15
|
+
|
14
16
|
from .dataset import Dataset
|
15
17
|
|
18
|
+
|
16
19
|
class StarkQAPrimeKG(Dataset):
|
17
20
|
"""
|
18
21
|
Class for loading StarkQAPrimeKG dataset.
|
@@ -67,41 +70,47 @@ class StarkQAPrimeKG(Dataset):
|
|
67
70
|
|
68
71
|
# List all related files in the HuggingFace Hub repository
|
69
72
|
files = list_repo_files(self.hf_repo_id, repo_type="dataset")
|
70
|
-
files = [
|
71
|
-
|
73
|
+
files = [
|
74
|
+
f
|
75
|
+
for f in files
|
76
|
+
if (
|
77
|
+
(f.startswith("qa/prime/") or f.startswith("skb/prime/"))
|
78
|
+
and f.find("raw") == -1
|
79
|
+
)
|
80
|
+
]
|
72
81
|
|
73
82
|
# Download and save each file in the specified folder
|
74
83
|
for file in tqdm(files):
|
75
|
-
_ = hf_hub_download(
|
76
|
-
|
77
|
-
|
78
|
-
local_dir=self.local_dir)
|
84
|
+
_ = hf_hub_download(
|
85
|
+
self.hf_repo_id, file, repo_type="dataset", local_dir=self.local_dir
|
86
|
+
)
|
79
87
|
|
80
88
|
# Unzip the processed files
|
81
89
|
shutil.unpack_archive(
|
82
90
|
os.path.join(self.local_dir, "skb/prime/processed.zip"),
|
83
|
-
os.path.join(self.local_dir, "skb/prime/")
|
91
|
+
os.path.join(self.local_dir, "skb/prime/"),
|
84
92
|
)
|
85
93
|
|
86
94
|
# Load StarkQA dataframe
|
87
95
|
starkqa = pd.read_csv(
|
88
96
|
os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv"),
|
89
|
-
low_memory=False
|
97
|
+
low_memory=False,
|
98
|
+
)
|
90
99
|
|
91
100
|
# Read split indices
|
92
|
-
qa_indices = sorted(starkqa[
|
101
|
+
qa_indices = sorted(starkqa["id"].tolist())
|
93
102
|
starkqa_split_idx = {}
|
94
|
-
for split in [
|
95
|
-
indices_file = os.path.join(self.local_dir, "qa/prime/split", f
|
96
|
-
with open(indices_file,
|
97
|
-
indices = f.read().strip().split(
|
103
|
+
for split in ["train", "val", "test", "test-0.1"]:
|
104
|
+
indices_file = os.path.join(self.local_dir, "qa/prime/split", f"{split}.index")
|
105
|
+
with open(indices_file, encoding="utf-8") as f:
|
106
|
+
indices = f.read().strip().split("\n")
|
98
107
|
query_ids = [int(idx) for idx in indices]
|
99
108
|
starkqa_split_idx[split] = np.array(
|
100
109
|
[qa_indices.index(query_id) for query_id in query_ids]
|
101
110
|
)
|
102
111
|
|
103
112
|
# Load the node info of PrimeKG preprocessed for StarkQA
|
104
|
-
with open(os.path.join(self.local_dir,
|
113
|
+
with open(os.path.join(self.local_dir, "skb/prime/processed/node_info.pkl"), "rb") as f:
|
105
114
|
starkqa_node_info = pickle.load(f)
|
106
115
|
|
107
116
|
return starkqa, starkqa_split_idx, starkqa_node_info
|
@@ -116,9 +125,9 @@ class StarkQAPrimeKG(Dataset):
|
|
116
125
|
"""
|
117
126
|
# Load the provided embeddings of query and nodes
|
118
127
|
# Note that they utilized 'text-embedding-ada-002' for embeddings
|
119
|
-
emb_model =
|
120
|
-
query_emb_url =
|
121
|
-
node_emb_url =
|
128
|
+
emb_model = "text-embedding-ada-002"
|
129
|
+
query_emb_url = "https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU"
|
130
|
+
node_emb_url = "https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy"
|
122
131
|
|
123
132
|
# Prepare respective directories to store the embeddings
|
124
133
|
emb_dir = os.path.join(self.local_dir, emb_model)
|
@@ -154,7 +163,6 @@ class StarkQAPrimeKG(Dataset):
|
|
154
163
|
print("Loading StarkQAPrimeKG embeddings...")
|
155
164
|
self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()
|
156
165
|
|
157
|
-
|
158
166
|
def get_starkqa(self) -> pd.DataFrame:
|
159
167
|
"""
|
160
168
|
Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# .env.example (DO NOT put actual API keys here, read the README.md)
|
2
|
+
|
3
|
+
# OPENAI API KEY
|
4
|
+
OPENAI_API_KEY=your_openai_api_key_here
|
5
|
+
|
6
|
+
# LangSmith API KEY
|
7
|
+
LANGCHAIN_TRACING_V2=true
|
8
|
+
LANGCHAIN_API_KEY=your_langchain_api_key_here
|
9
|
+
|
10
|
+
# NVIDIA API KEY
|
11
|
+
NVIDIA_API_KEY=your_nvidia_api_key_here
|
12
|
+
|
13
|
+
# Set environment variables for data loader
|
14
|
+
MILVUS_HOST=localhost
|
15
|
+
MILVUS_PORT=19530
|
16
|
+
MILVUS_USER=root
|
17
|
+
MILVUS_PASSWORD=Milvus
|
18
|
+
MILVUS_DATABASE=your_database_name_here
|
19
|
+
|
20
|
+
# Specify the data directory for multimodal data to your own data directory
|
21
|
+
# DATA_DIR=/your_absolute_path_to_your_data_dir/
|
22
|
+
|
23
|
+
BATCH_SIZE=500
|