aiagents4pharma 1.43.0__py3-none-any.whl → 1.45.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
- aiagents4pharma/talk2aiagents4pharma/Dockerfile +105 -0
- aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
- aiagents4pharma/talk2aiagents4pharma/__init__.py +4 -5
- aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +3 -2
- aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +24 -23
- aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +2 -2
- aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +1 -1
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2aiagents4pharma/install.md +127 -0
- aiagents4pharma/talk2aiagents4pharma/states/__init__.py +3 -2
- aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +5 -3
- aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +2 -2
- aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +72 -50
- aiagents4pharma/talk2biomodels/.dockerignore +13 -0
- aiagents4pharma/talk2biomodels/Dockerfile +104 -0
- aiagents4pharma/talk2biomodels/README.md +1 -0
- aiagents4pharma/talk2biomodels/__init__.py +4 -8
- aiagents4pharma/talk2biomodels/agents/__init__.py +3 -2
- aiagents4pharma/talk2biomodels/agents/t2b_agent.py +47 -42
- aiagents4pharma/talk2biomodels/api/__init__.py +4 -5
- aiagents4pharma/talk2biomodels/api/kegg.py +14 -10
- aiagents4pharma/talk2biomodels/api/ols.py +13 -10
- aiagents4pharma/talk2biomodels/api/uniprot.py +7 -6
- aiagents4pharma/talk2biomodels/configs/__init__.py +3 -4
- aiagents4pharma/talk2biomodels/configs/agents/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/config.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/tools/__init__.py +4 -5
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +1 -2
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +1 -1
- aiagents4pharma/talk2biomodels/install.md +63 -0
- aiagents4pharma/talk2biomodels/models/__init__.py +4 -4
- aiagents4pharma/talk2biomodels/models/basico_model.py +36 -28
- aiagents4pharma/talk2biomodels/models/sys_bio_model.py +13 -10
- aiagents4pharma/talk2biomodels/states/__init__.py +3 -2
- aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +12 -8
- aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
- aiagents4pharma/talk2biomodels/tests/__init__.py +2 -2
- aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
- aiagents4pharma/talk2biomodels/tests/test_api.py +18 -14
- aiagents4pharma/talk2biomodels/tests/test_ask_question.py +8 -9
- aiagents4pharma/talk2biomodels/tests/test_basico_model.py +15 -9
- aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +54 -55
- aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +28 -27
- aiagents4pharma/talk2biomodels/tests/test_integration.py +21 -33
- aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +14 -11
- aiagents4pharma/talk2biomodels/tests/test_param_scan.py +21 -20
- aiagents4pharma/talk2biomodels/tests/test_query_article.py +129 -29
- aiagents4pharma/talk2biomodels/tests/test_search_models.py +9 -13
- aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +16 -15
- aiagents4pharma/talk2biomodels/tests/test_steady_state.py +12 -22
- aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +33 -29
- aiagents4pharma/talk2biomodels/tools/__init__.py +15 -12
- aiagents4pharma/talk2biomodels/tools/ask_question.py +42 -32
- aiagents4pharma/talk2biomodels/tools/custom_plotter.py +51 -43
- aiagents4pharma/talk2biomodels/tools/get_annotation.py +99 -75
- aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +57 -51
- aiagents4pharma/talk2biomodels/tools/load_arguments.py +52 -32
- aiagents4pharma/talk2biomodels/tools/load_biomodel.py +8 -2
- aiagents4pharma/talk2biomodels/tools/parameter_scan.py +107 -90
- aiagents4pharma/talk2biomodels/tools/query_article.py +14 -13
- aiagents4pharma/talk2biomodels/tools/search_models.py +37 -26
- aiagents4pharma/talk2biomodels/tools/simulate_model.py +47 -37
- aiagents4pharma/talk2biomodels/tools/steady_state.py +76 -58
- aiagents4pharma/talk2biomodels/tools/utils.py +4 -3
- aiagents4pharma/talk2cells/README.md +1 -0
- aiagents4pharma/talk2cells/__init__.py +4 -5
- aiagents4pharma/talk2cells/agents/__init__.py +3 -2
- aiagents4pharma/talk2cells/agents/scp_agent.py +21 -19
- aiagents4pharma/talk2cells/states/__init__.py +3 -2
- aiagents4pharma/talk2cells/states/state_talk2cells.py +4 -2
- aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +8 -9
- aiagents4pharma/talk2cells/tools/__init__.py +3 -2
- aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +4 -4
- aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +5 -3
- aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +21 -22
- aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
- aiagents4pharma/talk2knowledgegraphs/Dockerfile +103 -0
- aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
- aiagents4pharma/talk2knowledgegraphs/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +40 -30
- aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +3 -6
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +8 -8
- aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +4 -5
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +17 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +4 -6
- aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +115 -67
- aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +2 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +35 -24
- aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +29 -21
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +190 -0
- aiagents4pharma/talk2knowledgegraphs/install.md +140 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +31 -65
- aiagents4pharma/talk2knowledgegraphs/states/__init__.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +65 -40
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +54 -48
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +17 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +33 -24
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +116 -69
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +736 -413
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +22 -15
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +19 -12
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +95 -48
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +13 -18
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +10 -3
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +4 -3
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +3 -2
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +9 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +6 -6
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +442 -42
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +3 -4
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +10 -6
- aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +10 -7
- aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +15 -20
- aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +245 -205
- aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +92 -90
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +25 -37
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +10 -13
- aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +4 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +4 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +11 -14
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +7 -7
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +12 -6
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +8 -6
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +9 -6
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +15 -9
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +23 -20
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +12 -10
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +16 -10
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +26 -18
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +4 -5
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +218 -81
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +53 -47
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +18 -14
- aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +22 -23
- aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +11 -10
- aiagents4pharma/talk2scholars/.dockerignore +13 -0
- aiagents4pharma/talk2scholars/Dockerfile +104 -0
- aiagents4pharma/talk2scholars/README.md +1 -0
- aiagents4pharma/talk2scholars/agents/__init__.py +1 -5
- aiagents4pharma/talk2scholars/agents/main_agent.py +6 -4
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +5 -4
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +4 -2
- aiagents4pharma/talk2scholars/agents/s2_agent.py +2 -2
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +10 -11
- aiagents4pharma/talk2scholars/configs/__init__.py +1 -3
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +1 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +1 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +1 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +8 -8
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +7 -7
- aiagents4pharma/talk2scholars/configs/tools/__init__.py +8 -6
- aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
- aiagents4pharma/talk2scholars/install.md +122 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +8 -8
- aiagents4pharma/talk2scholars/tests/{test_main_agent.py → test_agents_main_agent.py} +41 -23
- aiagents4pharma/talk2scholars/tests/{test_paper_download_agent.py → test_agents_paper_agents_download_agent.py} +10 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_agent.py → test_agents_pdf_agent.py} +6 -10
- aiagents4pharma/talk2scholars/tests/{test_s2_agent.py → test_agents_s2_agent.py} +8 -16
- aiagents4pharma/talk2scholars/tests/{test_zotero_agent.py → test_agents_zotero_agent.py} +5 -7
- aiagents4pharma/talk2scholars/tests/{test_s2_display_dataframe.py → test_s2_tools_display_dataframe.py} +6 -7
- aiagents4pharma/talk2scholars/tests/{test_s2_query_dataframe.py → test_s2_tools_query_dataframe.py} +5 -15
- aiagents4pharma/talk2scholars/tests/{test_paper_downloader.py → test_tools_paper_downloader.py} +25 -63
- aiagents4pharma/talk2scholars/tests/{test_question_and_answer_tool.py → test_tools_question_and_answer_tool.py} +2 -6
- aiagents4pharma/talk2scholars/tests/{test_s2_multi.py → test_tools_s2_multi.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_s2_retrieve.py → test_tools_s2_retrieve.py} +2 -1
- aiagents4pharma/talk2scholars/tests/{test_s2_search.py → test_tools_s2_search.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_s2_single.py → test_tools_s2_single.py} +5 -5
- aiagents4pharma/talk2scholars/tests/{test_arxiv_downloader.py → test_utils_arxiv_downloader.py} +16 -25
- aiagents4pharma/talk2scholars/tests/{test_base_paper_downloader.py → test_utils_base_paper_downloader.py} +25 -47
- aiagents4pharma/talk2scholars/tests/{test_biorxiv_downloader.py → test_utils_biorxiv_downloader.py} +14 -42
- aiagents4pharma/talk2scholars/tests/{test_medrxiv_downloader.py → test_utils_medrxiv_downloader.py} +15 -49
- aiagents4pharma/talk2scholars/tests/{test_nvidia_nim_reranker.py → test_utils_nvidia_nim_reranker.py} +6 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_answer_formatter.py → test_utils_pdf_answer_formatter.py} +1 -0
- aiagents4pharma/talk2scholars/tests/{test_pdf_batch_processor.py → test_utils_pdf_batch_processor.py} +6 -15
- aiagents4pharma/talk2scholars/tests/{test_pdf_collection_manager.py → test_utils_pdf_collection_manager.py} +34 -11
- aiagents4pharma/talk2scholars/tests/{test_pdf_document_processor.py → test_utils_pdf_document_processor.py} +2 -3
- aiagents4pharma/talk2scholars/tests/{test_pdf_generate_answer.py → test_utils_pdf_generate_answer.py} +3 -6
- aiagents4pharma/talk2scholars/tests/{test_pdf_gpu_detection.py → test_utils_pdf_gpu_detection.py} +5 -16
- aiagents4pharma/talk2scholars/tests/{test_pdf_rag_pipeline.py → test_utils_pdf_rag_pipeline.py} +7 -17
- aiagents4pharma/talk2scholars/tests/{test_pdf_retrieve_chunks.py → test_utils_pdf_retrieve_chunks.py} +4 -11
- aiagents4pharma/talk2scholars/tests/{test_pdf_singleton_manager.py → test_utils_pdf_singleton_manager.py} +26 -23
- aiagents4pharma/talk2scholars/tests/{test_pdf_vector_normalization.py → test_utils_pdf_vector_normalization.py} +1 -1
- aiagents4pharma/talk2scholars/tests/{test_pdf_vector_store.py → test_utils_pdf_vector_store.py} +27 -55
- aiagents4pharma/talk2scholars/tests/{test_pubmed_downloader.py → test_utils_pubmed_downloader.py} +31 -91
- aiagents4pharma/talk2scholars/tests/{test_read_helper_utils.py → test_utils_read_helper_utils.py} +2 -6
- aiagents4pharma/talk2scholars/tests/{test_s2_utils_ext_ids.py → test_utils_s2_utils_ext_ids.py} +5 -15
- aiagents4pharma/talk2scholars/tests/{test_zotero_human_in_the_loop.py → test_utils_zotero_human_in_the_loop.py} +6 -13
- aiagents4pharma/talk2scholars/tests/{test_zotero_path.py → test_utils_zotero_path.py} +53 -45
- aiagents4pharma/talk2scholars/tests/{test_zotero_read.py → test_utils_zotero_read.py} +30 -91
- aiagents4pharma/talk2scholars/tests/{test_zotero_write.py → test_utils_zotero_write.py} +6 -16
- aiagents4pharma/talk2scholars/tools/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +20 -35
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +7 -5
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +9 -11
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +14 -21
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +14 -22
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +11 -13
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +14 -28
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +4 -8
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +16 -14
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +15 -17
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +5 -5
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +2 -6
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +5 -9
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +4 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +6 -15
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +7 -15
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +2 -2
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +3 -4
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +8 -17
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +17 -33
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +8 -6
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +3 -7
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +7 -6
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +5 -12
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +2 -4
- aiagents4pharma/talk2scholars/tools/s2/search.py +6 -6
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -3
- aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +1 -3
- aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +12 -18
- aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +11 -18
- aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +11 -16
- aiagents4pharma/talk2scholars/tools/zotero/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +1 -4
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +21 -39
- aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +2 -6
- aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +8 -11
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +4 -12
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +13 -27
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +4 -7
- aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +8 -10
- aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +3 -2
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/METADATA +115 -50
- aiagents4pharma-1.45.0.dist-info/RECORD +324 -0
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/WHEEL +1 -2
- aiagents4pharma-1.43.0.dist-info/RECORD +0 -293
- aiagents4pharma-1.43.0.dist-info/top_level.txt +0 -1
- /aiagents4pharma/talk2scholars/tests/{test_state.py → test_states_state.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_pdf_paper_loader.py → test_utils_pdf_paper_loader.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_tool_helper_utils.py → test_utils_tool_helper_utils.py} +0 -0
- /aiagents4pharma/talk2scholars/tests/{test_zotero_pdf_downloader_utils.py → test_utils_zotero_pdf_downloader_utils.py} +0 -0
- {aiagents4pharma-1.43.0.dist-info → aiagents4pharma-1.45.0.dist-info}/licenses/LICENSE +0 -0
The hunks below reproduce the registry's rendered diff for a subset of the changed files. On some removed lines the diff viewer elides the intraline text that was replaced (for example, the right-hand side of an old type annotation), so those lines appear truncated here.

aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py (+14 -21)

@@ -8,7 +8,7 @@ import logging
 import re
 import tempfile
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any
 
 import requests
 
@@ -58,8 +58,8 @@ class BasePaperDownloader(ABC):
 
     @abstractmethod
     def extract_paper_metadata(
-        self, metadata: Any, identifier: str, pdf_result:
-    ) ->
+        self, metadata: Any, identifier: str, pdf_result: tuple[str, str] | None
+    ) -> dict[str, Any]:
         """
         Extract and structure metadata into standardized format.
 
@@ -89,9 +89,7 @@ class BasePaperDownloader(ABC):
         raise NotImplementedError
 
     # Common methods shared by all services
-    def download_pdf_to_temp(
-        self, pdf_url: str, identifier: str
-    ) -> Optional[Tuple[str, str]]:
+    def download_pdf_to_temp(self, pdf_url: str, identifier: str) -> tuple[str, str] | None:
         """
         Download PDF from URL to a temporary file.
 
@@ -103,9 +101,7 @@ class BasePaperDownloader(ABC):
             Tuple of (temp_file_path, filename) or None if failed
         """
         if not pdf_url:
-            logger.info(
-                "No PDF URL available for %s %s", self.get_identifier_name(), identifier
-            )
+            logger.info("No PDF URL available for %s %s", self.get_identifier_name(), identifier)
             return None
 
         try:
@@ -141,14 +137,11 @@ class BasePaperDownloader(ABC):
 
         if "filename=" in content_disposition:
             try:
-
                 filename_match = re.search(
                     r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
                 )
                 if filename_match:
-                    extracted_filename = filename_match.group(
-                        1
-                    ) or filename_match.group(2)
+                    extracted_filename = filename_match.group(1) or filename_match.group(2)
                     extracted_filename = extracted_filename.strip().strip('"')
                     if extracted_filename and extracted_filename.endswith(".pdf"):
                         filename = extracted_filename
@@ -189,7 +182,7 @@ class BasePaperDownloader(ABC):
 
         return snippet
 
-    def create_error_entry(self, identifier: str, error_msg: str) ->
+    def create_error_entry(self, identifier: str, error_msg: str) -> dict[str, Any]:
         """
         Create standardized error entry for failed paper processing.
 
@@ -215,7 +208,7 @@ class BasePaperDownloader(ABC):
             # Service-specific identifier field will be added by subclasses
         }
 
-    def build_summary(self, article_data:
+    def build_summary(self, article_data: dict[str, Any]) -> str:
         """
         Build a summary string for up to three papers with snippets.
 
@@ -226,7 +219,7 @@ class BasePaperDownloader(ABC):
             Formatted summary string
         """
         top = list(article_data.values())[:3]
-        lines:
+        lines: list[str] = []
         downloaded_count = sum(
             1
             for paper in article_data.values()
@@ -240,7 +233,7 @@ class BasePaperDownloader(ABC):
             snippet = self.get_snippet(paper.get("Abstract", ""))
 
             # Build paper line with service-specific identifier info
-            line = f"{idx+1}. {title}"
+            line = f"{idx + 1}. {title}"
             line += self._get_paper_identifier_info(paper)
             line += f"\n Access: {access_type}"
 
@@ -264,7 +257,7 @@ class BasePaperDownloader(ABC):
         )
 
     @abstractmethod
-    def _get_paper_identifier_info(self, paper:
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
         """
         Get service-specific identifier info for paper summary.
 
@@ -276,7 +269,7 @@ class BasePaperDownloader(ABC):
         """
         raise NotImplementedError
 
-    def process_identifiers(self, identifiers:
+    def process_identifiers(self, identifiers: list[str]) -> dict[str, Any]:
         """
         Main processing loop for downloading papers.
 
@@ -293,7 +286,7 @@ class BasePaperDownloader(ABC):
             identifiers,
         )
 
-        article_data:
+        article_data: dict[str, Any] = {}
 
         for identifier in identifiers:
             logger.info("Processing %s: %s", self.get_identifier_name(), identifier)
@@ -332,7 +325,7 @@ class BasePaperDownloader(ABC):
         return article_data
 
     @abstractmethod
-    def _add_service_identifier(self, entry:
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
         """
         Add service-specific identifier field to entry.
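Nearly every hunk in this file is the same mechanical change: `typing.Optional`/`Tuple`/`Dict`/`List` annotations become builtin generics (PEP 585) and `X | None` unions (PEP 604), and over-wrapped calls are collapsed onto single lines. A minimal sketch of the annotation style being migrated to; the `Downloader` class here is an illustrative stand-in, not the package's API, and evaluated `X | None` annotations require Python 3.10+:

```python
from abc import ABC, abstractmethod
from typing import Any


class Downloader(ABC):
    """Illustrative stand-in showing the new annotation style."""

    # Old style (1.43.0): Optional[Tuple[str, str]], Dict[str, Any], List[str]
    # New style (1.45.0): tuple[str, str] | None, dict[str, Any], list[str]
    @abstractmethod
    def download_pdf_to_temp(self, pdf_url: str, identifier: str) -> tuple[str, str] | None:
        """Return (temp_file_path, filename), or None if the download failed."""

    def build_summary(self, article_data: dict[str, Any]) -> str:
        """Builtin generics also replace typing.List in local annotations."""
        lines: list[str] = [f"{i + 1}. {key}" for i, key in enumerate(article_data)]
        return "\n".join(lines)
```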
aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py (+14 -22)

@@ -6,7 +6,7 @@ BioRxiv paper downloader implementation.
 import logging
 import re
 import tempfile
-from typing import Any
+from typing import Any
 
 import cloudscraper
 import requests
@@ -43,9 +43,7 @@ class BiorxivDownloader(BasePaperDownloader):
         # CloudScraper specific settings
         self.cf_clearance_timeout = getattr(config, "cf_clearance_timeout", 30)
         self.session_reuse = getattr(config, "session_reuse", True)
-        self.browser_config_type = getattr(config, "browser_config", {}).get(
-            "type", "custom"
-        )
+        self.browser_config_type = getattr(config, "browser_config", {}).get("type", "custom")
 
         # Initialize shared CloudScraper session if enabled
         self._scraper = None
@@ -55,7 +53,7 @@ class BiorxivDownloader(BasePaperDownloader):
                 delay=self.cf_clearance_timeout,
             )
 
-    def fetch_metadata(self, identifier: str) ->
+    def fetch_metadata(self, identifier: str) -> dict[str, Any]:
         """
         Fetch paper metadata from bioRxiv API.
 
@@ -88,7 +86,7 @@ class BiorxivDownloader(BasePaperDownloader):
 
         return paper_data
 
-    def construct_pdf_url(self, metadata:
+    def construct_pdf_url(self, metadata: dict[str, Any], identifier: str) -> str:
         """
         Construct PDF URL from bioRxiv metadata and DOI.
 
@@ -111,9 +109,7 @@ class BiorxivDownloader(BasePaperDownloader):
 
         return pdf_url
 
-    def download_pdf_to_temp(
-        self, pdf_url: str, identifier: str
-    ) -> Optional[Tuple[str, str]]:
+    def download_pdf_to_temp(self, pdf_url: str, identifier: str) -> tuple[str, str] | None:
         """
         Override base method to use CloudScraper for bioRxiv PDF downloads.
         Includes landing page visit to handle CloudFlare protection.
@@ -188,9 +184,7 @@ class BiorxivDownloader(BasePaperDownloader):
                     r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition
                 )
                 if filename_match:
-                    extracted_filename = filename_match.group(
-                        1
-                    ) or filename_match.group(2)
+                    extracted_filename = filename_match.group(1) or filename_match.group(2)
                     extracted_filename = extracted_filename.strip().strip('"')
                     if extracted_filename and extracted_filename.endswith(".pdf"):
                         filename = extracted_filename
@@ -202,10 +196,10 @@ class BiorxivDownloader(BasePaperDownloader):
 
     def extract_paper_metadata(
         self,
-        metadata:
+        metadata: dict[str, Any],
         identifier: str,
-        pdf_result:
-    ) ->
+        pdf_result: tuple[str, str] | None,
+    ) -> dict[str, Any]:
         """
         Extract structured metadata from bioRxiv API response.
 
@@ -234,9 +228,7 @@ class BiorxivDownloader(BasePaperDownloader):
             **pdf_metadata,
         }
 
-    def _extract_basic_metadata(
-        self, paper: Dict[str, Any], identifier: str
-    ) -> Dict[str, Any]:
+    def _extract_basic_metadata(self, paper: dict[str, Any], identifier: str) -> dict[str, Any]:
         """Extract basic metadata from paper data."""
         # Extract basic fields
         title = paper.get("title", "N/A").strip()
@@ -267,8 +259,8 @@ class BiorxivDownloader(BasePaperDownloader):
         return [author.strip() for author in authors_str.split(";") if author.strip()]
 
     def _extract_pdf_metadata(
-        self, pdf_result:
-    ) ->
+        self, pdf_result: tuple[str, str] | None, identifier: str
+    ) -> dict[str, Any]:
         """Extract PDF-related metadata."""
         if pdf_result:
             temp_file_path, filename = pdf_result
@@ -301,7 +293,7 @@ class BiorxivDownloader(BasePaperDownloader):
         # Sanitize DOI for filename use
         return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"
 
-    def _get_paper_identifier_info(self, paper:
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
         """Get bioRxiv-specific identifier info for paper summary."""
         doi = paper.get("DOI", "N/A")
         pub_date = paper.get("Publication Date", "N/A")
@@ -313,7 +305,7 @@ class BiorxivDownloader(BasePaperDownloader):
 
         return info
 
-    def _add_service_identifier(self, entry:
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
         """Add DOI and bioRxiv-specific fields to entry."""
         entry["DOI"] = identifier
         entry["Category"] = "N/A"
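Both this file and the base class collapse the same Content-Disposition filename parsing onto fewer lines. The regex is taken verbatim from the hunks above; the header value in this standalone sketch is invented for the demonstration:

```python
import re

# Sample header value (invented); real values come from the HTTP response.
content_disposition = 'attachment; filename="2024.01.15.575123v1.full.pdf"'

filename = None
if "filename=" in content_disposition:
    # Same pattern as the hunks: a quoted or unquoted filename token.
    match = re.search(r'filename[*]?=(?:"([^"]+)"|([^;]+))', content_disposition)
    if match:
        extracted = (match.group(1) or match.group(2)).strip().strip('"')
        if extracted.endswith(".pdf"):
            filename = extracted

print(filename)  # -> 2024.01.15.575123v1.full.pdf
```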
aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py (+11 -13)

@@ -4,7 +4,7 @@ MedRxiv paper downloader implementation.
 """
 
 import logging
-from typing import Any
+from typing import Any
 
 import requests
 
@@ -27,7 +27,7 @@ class MedrxivDownloader(BasePaperDownloader):
         )
         self.default_version = getattr(config, "default_version", "1")
 
-    def fetch_metadata(self, identifier: str) ->
+    def fetch_metadata(self, identifier: str) -> dict[str, Any]:
         """
         Fetch paper metadata from medRxiv API.
 
@@ -54,7 +54,7 @@ class MedrxivDownloader(BasePaperDownloader):
 
         return paper_data
 
-    def construct_pdf_url(self, metadata:
+    def construct_pdf_url(self, metadata: dict[str, Any], identifier: str) -> str:
         """
         Construct PDF URL from medRxiv metadata and DOI.
 
@@ -79,10 +79,10 @@ class MedrxivDownloader(BasePaperDownloader):
 
     def extract_paper_metadata(
         self,
-        metadata:
+        metadata: dict[str, Any],
         identifier: str,
-        pdf_result:
-    ) ->
+        pdf_result: tuple[str, str] | None,
+    ) -> dict[str, Any]:
         """
         Extract structured metadata from medRxiv API response.
 
@@ -111,9 +111,7 @@ class MedrxivDownloader(BasePaperDownloader):
             **pdf_metadata,
         }
 
-    def _extract_basic_metadata(
-        self, paper: Dict[str, Any], identifier: str
-    ) -> Dict[str, Any]:
+    def _extract_basic_metadata(self, paper: dict[str, Any], identifier: str) -> dict[str, Any]:
         """Extract basic metadata from paper data."""
         # Extract basic fields
         title = paper.get("title", "N/A").strip()
@@ -144,8 +142,8 @@ class MedrxivDownloader(BasePaperDownloader):
         return [author.strip() for author in authors_str.split(";") if author.strip()]
 
     def _extract_pdf_metadata(
-        self, pdf_result:
-    ) ->
+        self, pdf_result: tuple[str, str] | None, identifier: str
+    ) -> dict[str, Any]:
         """Extract PDF-related metadata."""
         if pdf_result:
             temp_file_path, filename = pdf_result
@@ -178,7 +176,7 @@ class MedrxivDownloader(BasePaperDownloader):
         # Sanitize DOI for filename use
         return f"{identifier.replace('/', '_').replace('.', '_')}.pdf"
 
-    def _get_paper_identifier_info(self, paper:
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
         """Get medRxiv-specific identifier info for paper summary."""
         doi = paper.get("DOI", "N/A")
         pub_date = paper.get("Publication Date", "N/A")
@@ -190,7 +188,7 @@ class MedrxivDownloader(BasePaperDownloader):
 
         return info
 
-    def _add_service_identifier(self, entry:
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
         """Add DOI and medRxiv-specific fields to entry."""
         entry["DOI"] = identifier
         entry["Category"] = "N/A"
aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py (+14 -28)

@@ -5,7 +5,7 @@ PubMed paper downloader implementation.
 
 import logging
 import xml.etree.ElementTree as ET
-from typing import Any,
+from typing import Any, cast
 
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -37,7 +37,7 @@ class PubmedDownloader(BasePaperDownloader):
         self.pdf_meta_name = getattr(config, "pdf_meta_name", "citation_pdf_url")
         self.default_error_code = getattr(config, "default_error_code", "unknown")
 
-    def fetch_metadata(self, identifier: str) ->
+    def fetch_metadata(self, identifier: str) -> dict[str, Any]:
         """
         Fetch paper metadata from PubMed ID Converter API.
 
@@ -52,9 +52,7 @@ class PubmedDownloader(BasePaperDownloader):
             RuntimeError: If no records found in response
         """
         query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
-        logger.info(
-            "Fetching metadata from ID converter for PMID %s: %s", identifier, query_url
-        )
+        logger.info("Fetching metadata from ID converter for PMID %s: %s", identifier, query_url)
 
         response = requests.get(query_url, timeout=self.request_timeout)
         response.raise_for_status()
@@ -67,7 +65,7 @@ class PubmedDownloader(BasePaperDownloader):
 
         return result
 
-    def construct_pdf_url(self, metadata:
+    def construct_pdf_url(self, metadata: dict[str, Any], identifier: str) -> str:
         """
         Construct PDF URL using multiple fallback strategies.
 
@@ -145,18 +143,14 @@ class PubmedDownloader(BasePaperDownloader):
         if error_elem is not None:
             error_code = error_elem.get("code", self.default_error_code)
             error_text = error_elem.text or "unknown error"
-            logger.info(
-                "OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text
-            )
+            logger.info("OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text)
             return ""
 
         # Look for PDF link
         pdf_link = root.find(".//link[@format='pdf']")
         if pdf_link is not None:
             pdf_url = pdf_link.get("href", "")
-            logger.info(
-                "Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url
-            )
+            logger.info("Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url)
 
             # Convert FTP links to HTTPS for download compatibility
             if pdf_url.startswith(self.ftp_base_url):
@@ -188,15 +182,11 @@ class PubmedDownloader(BasePaperDownloader):
     def _try_pmc_page_scraping(self, pmcid: str) -> str:
         """Try scraping PMC page for PDF meta tag."""
         pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
-        logger.info(
-            "Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url
-        )
+        logger.info("Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url)
 
         try:
             headers = {"User-Agent": self.user_agent}
-            response = requests.get(
-                pmc_page_url, headers=headers, timeout=self.request_timeout
-            )
+            response = requests.get(pmc_page_url, headers=headers, timeout=self.request_timeout)
             response.raise_for_status()
 
             soup = BeautifulSoup(response.content, "html.parser")
@@ -238,10 +228,10 @@ class PubmedDownloader(BasePaperDownloader):
 
     def extract_paper_metadata(
         self,
-        metadata:
+        metadata: dict[str, Any],
         identifier: str,
-        pdf_result:
-    ) ->
+        pdf_result: tuple[str, str] | None,
+    ) -> dict[str, Any]:
         """
         Extract structured metadata from PubMed ID converter response.
 
@@ -310,15 +300,11 @@ class PubmedDownloader(BasePaperDownloader):
 
     def get_snippet(self, abstract: str) -> str:
         """Override to handle PubMed-specific abstract placeholder."""
-        if (
-            not abstract
-            or abstract == "N/A"
-            or abstract == "Abstract available in PubMed"
-        ):
+        if not abstract or abstract == "N/A" or abstract == "Abstract available in PubMed":
            return ""
        return super().get_snippet(abstract)
 
-    def _get_paper_identifier_info(self, paper:
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
        """Get PubMed-specific identifier info for paper summary."""
        pmid = paper.get("PMID", "N/A")
        pmcid = paper.get("PMCID", "N/A")
@@ -329,7 +315,7 @@ class PubmedDownloader(BasePaperDownloader):
 
         return info
 
-    def _add_service_identifier(self, entry:
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
         """Add PMID and PubMed-specific fields to entry."""
         entry["PMID"] = identifier
         entry["PMCID"] = "N/A"
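The PMCID open-access lookup in these hunks follows the usual `xml.etree.ElementTree` flow: check for an `<error>` element, then resolve `link[@format='pdf']`. A self-contained sketch of that flow; the sample XML is hand-written to mimic the response shape, not captured from the API:

```python
import xml.etree.ElementTree as ET

# Hand-written sample mimicking an OA service response (illustrative only).
oa_xml = """\
<OA>
  <records>
    <record id="PMC123456">
      <link format="pdf" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/sample.pdf"/>
    </record>
  </records>
</OA>"""

root = ET.fromstring(oa_xml)

error_elem = root.find(".//error")
if error_elem is not None:
    # Mirrors the hunk: an <error> element short-circuits the lookup.
    print("OA API error:", error_elem.get("code", "unknown"), error_elem.text)
else:
    pdf_link = root.find(".//link[@format='pdf']")
    if pdf_link is not None:
        pdf_url = pdf_link.get("href", "")
        # The real code then rewrites ftp:// links for download compatibility.
        print("PDF URL:", pdf_url)
```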
aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py (+4 -8)

@@ -25,11 +25,11 @@ from langgraph.prebuilt import InjectedState
 from langgraph.types import Command
 from pydantic import BaseModel, Field
 
+from .utils.answer_formatter import format_answer
 from .utils.generate_answer import load_hydra_config
-from .utils.tool_helper import QAToolHelper
 from .utils.paper_loader import load_all_papers
 from .utils.rag_pipeline import retrieve_and_rerank_chunks
-from .utils.
+from .utils.tool_helper import QAToolHelper
 
 # Helper for managing state, vectorstore, reranking, and formatting
 helper = QAToolHelper()
@@ -56,9 +56,7 @@ class QuestionAndAnswerInput(BaseModel):
     - llm_model: chat/LLM instance for answer generation.
     """
 
-    question: str = Field(
-        description="User question for generating a PDF-based answer."
-    )
+    question: str = Field(description="User question for generating a PDF-based answer.")
     tool_call_id: Annotated[str, InjectedToolCallId]
     state: Annotated[dict, InjectedState]
 
@@ -133,9 +131,7 @@ def question_and_answer(
     )
 
     # Retrieve and rerank chunks in one step
-    reranked_chunks = retrieve_and_rerank_chunks(
-        vs, question, config, call_id, helper.has_gpu
-    )
+    reranked_chunks = retrieve_and_rerank_chunks(vs, question, config, call_id, helper.has_gpu)
 
     if not reranked_chunks:
         msg = f"No relevant chunks found for question: '{question}'"
aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py (+16 -14)

@@ -2,20 +2,22 @@
 Utility modules for the PDF question_and_answer tool.
 """
 
-from . import
-
-
-
-
-
-
-
-
-
-
-
-
-
+from . import (
+    answer_formatter,
+    batch_processor,
+    collection_manager,
+    generate_answer,
+    get_vectorstore,
+    gpu_detection,
+    nvidia_nim_reranker,
+    paper_loader,
+    rag_pipeline,
+    retrieve_chunks,
+    singleton_manager,
+    tool_helper,
+    vector_normalization,
+    vector_store,
+)
 
 __all__ = [
     "answer_formatter",
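The rewritten `__init__.py` enumerates every helper module in one parenthesized `from . import (...)` block that matches `__all__`, so the modules resolve as attributes of the package. A hypothetical usage sketch (assumes the wheel is installed):

```python
# Hypothetical usage: after the explicit re-exports, each helper module is
# reachable as an attribute of the utils package.
from aiagents4pharma.talk2scholars.tools.pdf import utils

print(utils.__all__[:3])       # e.g. ['answer_formatter', 'batch_processor', ...]
print(utils.answer_formatter)  # module object re-exported by __init__.py
```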
aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py (+4 -4)

@@ -3,7 +3,7 @@ Format the final answer text with source attributions and hardware info.
 """
 
 import logging
-from typing import Any
+from typing import Any
 
 from .generate_answer import generate_answer
 
@@ -12,9 +12,9 @@ logger = logging.getLogger(__name__)
 
 def format_answer(
     question: str,
-    chunks:
+    chunks: list[Any],
     llm: Any,
-    articles:
+    articles: dict[str, Any],
     config: Any,
     **kwargs: Any,
 ) -> str:
@@ -27,7 +27,7 @@ def format_answer(
     answer = result.get("output_text", "No answer generated.")
 
     # Get unique paper titles for source attribution
-    titles:
+    titles: dict[str, str] = {}
     for pid in result.get("papers_used", []):
         if pid in articles:
             titles[pid] = articles[pid].get("Title", "Unknown paper")
aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py (+15 -17)

@@ -5,7 +5,7 @@ Batch processing utilities for adding multiple papers to vector store.
 import concurrent.futures
 import logging
 import time
-from typing import Any
+from typing import Any
 
 from langchain_core.documents import Document
 
@@ -15,11 +15,11 @@ logger = logging.getLogger(__name__)
 
 
 def add_papers_batch(
-    papers_to_add:
+    papers_to_add: list[tuple[str, str, dict[str, Any]]],
     vector_store: Any,
-    loaded_papers:
-    paper_metadata:
-    documents:
+    loaded_papers: set[str],
+    paper_metadata: dict[str, dict[str, Any]],
+    documents: dict[str, Document],
     **kwargs: Any,
 ) -> None:
     """
@@ -43,9 +43,7 @@ def add_papers_batch(
         logger.info("No papers to add")
         return
 
-    to_process = [
-        (pid, url, md) for pid, url, md in papers_to_add if pid not in loaded_papers
-    ]
+    to_process = [(pid, url, md) for pid, url, md in papers_to_add if pid not in loaded_papers]
     if not to_process:
         logger.info("Skipping %d already-loaded papers", len(papers_to_add))
         logger.info("All %d papers are already loaded", len(papers_to_add))
@@ -91,16 +89,16 @@ def add_papers_batch(
 
 
 def _parallel_load_and_split(
-    papers:
+    papers: list[tuple[str, str, dict[str, Any]]],
     config: Any,
-    metadata_fields:
-    documents:
+    metadata_fields: list[str],
+    documents: dict[str, Document],
    max_workers: int,
-) ->
+) -> tuple[list[Document], list[str], list[str]]:
    """Load & split PDFs in parallel, preserving original logic."""
-    all_chunks:
-    all_ids:
-    success:
+    all_chunks: list[Document] = []
+    all_ids: list[str] = []
+    success: list[str] = []
 
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
@@ -138,8 +136,8 @@ def _parallel_load_and_split(
 
 
 def _batch_embed(
-    chunks:
-    ids:
+    chunks: list[Document],
+    ids: list[str],
     store: Any,
     batch_size: int,
     has_gpu: bool,
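`_parallel_load_and_split` keeps its `ThreadPoolExecutor` fan-out; only the annotations changed. A runnable sketch of that pattern, with a stub worker standing in for the real PDF loader:

```python
import concurrent.futures


def load_and_split(paper_id: str) -> list[str]:
    """Stub worker; the real code loads a PDF and returns Document chunks."""
    return [f"{paper_id}-chunk-{i}" for i in range(3)]


papers = ["p1", "p2", "p3"]
all_chunks: list[str] = []
success: list[str] = []

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # One future per paper, mapped back to its id, collected as they finish.
    futures = {executor.submit(load_and_split, pid): pid for pid in papers}
    for future in concurrent.futures.as_completed(futures):
        pid = futures[future]
        try:
            all_chunks.extend(future.result())
            success.append(pid)
        except Exception as exc:  # one bad paper should not sink the batch
            print(f"{pid} failed: {exc}")

print(len(all_chunks), sorted(success))  # 9 ['p1', 'p2', 'p3']
```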
aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py (+2 -2)

@@ -5,7 +5,7 @@ Collection Manager for Milvus
 import logging
 import os
 import threading
-from typing import Any
+from typing import Any
 
 from pymilvus import (
     Collection,
@@ -28,7 +28,7 @@ _cache_lock = threading.Lock()
 
 
 def ensure_collection_exists(
-    collection_name: str, config: Any, index_params:
+    collection_name: str, config: Any, index_params: dict[str, Any], has_gpu: bool
 ) -> Collection:
     """Ensure the Milvus collection exists before trying to sync or add documents."""
 
aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py (+5 -5)

@@ -3,7 +3,7 @@ Document processing utilities for loading and splitting PDFs.
 """
 
 import logging
-from typing import Any
+from typing import Any
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
@@ -15,10 +15,10 @@ logger = logging.getLogger(__name__)
 def load_and_split_pdf(
     paper_id: str,
     pdf_url: str,
-    paper_metadata:
+    paper_metadata: dict[str, Any],
     config: Any,
     **kwargs: Any,
-) ->
+) -> list[Document]:
     """
     Load a PDF and split it into chunks.
 
@@ -35,8 +35,8 @@ def load_and_split_pdf(
     Returns:
         A list of Document chunks, each with updated metadata.
     """
-    metadata_fields:
-    documents_dict:
+    metadata_fields: list[str] = kwargs["metadata_fields"]
+    documents_dict: dict[str, Document] = kwargs["documents_dict"]
 
     logger.info("Loading PDF for paper %s from %s", paper_id, pdf_url)
 
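`load_and_split_pdf` now returns `list[Document]` and pulls `metadata_fields`/`documents_dict` out of `**kwargs` with explicit annotations. A simplified sketch of the load-then-split flow it wraps, using the loader and splitter imported in the hunks above; the chunk sizes and metadata handling here are placeholders, not the package's Hydra config:

```python
import logging
from typing import Any

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document

logger = logging.getLogger(__name__)


def load_and_split_pdf_sketch(
    paper_id: str,
    pdf_url: str,
    paper_metadata: dict[str, Any],
    chunk_size: int = 1000,    # placeholder; the real tool reads its config
    chunk_overlap: int = 150,  # placeholder; the real tool reads its config
) -> list[Document]:
    """Simplified stand-in for load_and_split_pdf: load, split, tag chunks."""
    logger.info("Loading PDF for paper %s from %s", paper_id, pdf_url)
    pages = PyPDFLoader(pdf_url).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(pages)
    for chunk in chunks:  # copy paper-level metadata onto every chunk
        chunk.metadata.update({"paper_id": paper_id, **paper_metadata})
    return chunks
```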