aiagents4pharma-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (336)
  1. aiagents4pharma/__init__.py +11 -0
  2. aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
  3. aiagents4pharma/talk2aiagents4pharma/Dockerfile +133 -0
  4. aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
  5. aiagents4pharma/talk2aiagents4pharma/__init__.py +5 -0
  6. aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +6 -0
  7. aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +70 -0
  8. aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +5 -0
  9. aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +5 -0
  10. aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +29 -0
  11. aiagents4pharma/talk2aiagents4pharma/configs/app/__init__.py +0 -0
  12. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/__init__.py +0 -0
  13. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/default.yaml +102 -0
  14. aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +4 -0
  15. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
  16. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
  17. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
  18. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
  19. aiagents4pharma/talk2aiagents4pharma/install.md +154 -0
  20. aiagents4pharma/talk2aiagents4pharma/states/__init__.py +5 -0
  21. aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +18 -0
  22. aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +3 -0
  23. aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +312 -0
  24. aiagents4pharma/talk2biomodels/.dockerignore +13 -0
  25. aiagents4pharma/talk2biomodels/Dockerfile +104 -0
  26. aiagents4pharma/talk2biomodels/README.md +1 -0
  27. aiagents4pharma/talk2biomodels/__init__.py +5 -0
  28. aiagents4pharma/talk2biomodels/agents/__init__.py +6 -0
  29. aiagents4pharma/talk2biomodels/agents/t2b_agent.py +104 -0
  30. aiagents4pharma/talk2biomodels/api/__init__.py +5 -0
  31. aiagents4pharma/talk2biomodels/api/ols.py +75 -0
  32. aiagents4pharma/talk2biomodels/api/uniprot.py +36 -0
  33. aiagents4pharma/talk2biomodels/configs/__init__.py +5 -0
  34. aiagents4pharma/talk2biomodels/configs/agents/__init__.py +5 -0
  35. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +3 -0
  36. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +14 -0
  37. aiagents4pharma/talk2biomodels/configs/app/__init__.py +0 -0
  38. aiagents4pharma/talk2biomodels/configs/app/frontend/__init__.py +0 -0
  39. aiagents4pharma/talk2biomodels/configs/app/frontend/default.yaml +72 -0
  40. aiagents4pharma/talk2biomodels/configs/config.yaml +7 -0
  41. aiagents4pharma/talk2biomodels/configs/tools/__init__.py +5 -0
  42. aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +3 -0
  43. aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +30 -0
  44. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +3 -0
  45. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +8 -0
  46. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +3 -0
  47. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +8 -0
  48. aiagents4pharma/talk2biomodels/install.md +63 -0
  49. aiagents4pharma/talk2biomodels/models/__init__.py +5 -0
  50. aiagents4pharma/talk2biomodels/models/basico_model.py +125 -0
  51. aiagents4pharma/talk2biomodels/models/sys_bio_model.py +60 -0
  52. aiagents4pharma/talk2biomodels/states/__init__.py +6 -0
  53. aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +49 -0
  54. aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
  55. aiagents4pharma/talk2biomodels/tests/__init__.py +3 -0
  56. aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
  57. aiagents4pharma/talk2biomodels/tests/test_api.py +31 -0
  58. aiagents4pharma/talk2biomodels/tests/test_ask_question.py +42 -0
  59. aiagents4pharma/talk2biomodels/tests/test_basico_model.py +67 -0
  60. aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +190 -0
  61. aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +92 -0
  62. aiagents4pharma/talk2biomodels/tests/test_integration.py +116 -0
  63. aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +35 -0
  64. aiagents4pharma/talk2biomodels/tests/test_param_scan.py +71 -0
  65. aiagents4pharma/talk2biomodels/tests/test_query_article.py +184 -0
  66. aiagents4pharma/talk2biomodels/tests/test_save_model.py +47 -0
  67. aiagents4pharma/talk2biomodels/tests/test_search_models.py +35 -0
  68. aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +44 -0
  69. aiagents4pharma/talk2biomodels/tests/test_steady_state.py +86 -0
  70. aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +67 -0
  71. aiagents4pharma/talk2biomodels/tools/__init__.py +17 -0
  72. aiagents4pharma/talk2biomodels/tools/ask_question.py +125 -0
  73. aiagents4pharma/talk2biomodels/tools/custom_plotter.py +165 -0
  74. aiagents4pharma/talk2biomodels/tools/get_annotation.py +342 -0
  75. aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +159 -0
  76. aiagents4pharma/talk2biomodels/tools/load_arguments.py +134 -0
  77. aiagents4pharma/talk2biomodels/tools/load_biomodel.py +44 -0
  78. aiagents4pharma/talk2biomodels/tools/parameter_scan.py +310 -0
  79. aiagents4pharma/talk2biomodels/tools/query_article.py +64 -0
  80. aiagents4pharma/talk2biomodels/tools/save_model.py +98 -0
  81. aiagents4pharma/talk2biomodels/tools/search_models.py +96 -0
  82. aiagents4pharma/talk2biomodels/tools/simulate_model.py +137 -0
  83. aiagents4pharma/talk2biomodels/tools/steady_state.py +187 -0
  84. aiagents4pharma/talk2biomodels/tools/utils.py +23 -0
  85. aiagents4pharma/talk2cells/README.md +1 -0
  86. aiagents4pharma/talk2cells/__init__.py +5 -0
  87. aiagents4pharma/talk2cells/agents/__init__.py +6 -0
  88. aiagents4pharma/talk2cells/agents/scp_agent.py +87 -0
  89. aiagents4pharma/talk2cells/states/__init__.py +6 -0
  90. aiagents4pharma/talk2cells/states/state_talk2cells.py +15 -0
  91. aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +22 -0
  92. aiagents4pharma/talk2cells/tools/__init__.py +6 -0
  93. aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +6 -0
  94. aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +27 -0
  95. aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +78 -0
  96. aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
  97. aiagents4pharma/talk2knowledgegraphs/Dockerfile +131 -0
  98. aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
  99. aiagents4pharma/talk2knowledgegraphs/__init__.py +5 -0
  100. aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +5 -0
  101. aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +99 -0
  102. aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +5 -0
  103. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +3 -0
  104. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +62 -0
  105. aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +5 -0
  106. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +3 -0
  107. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +79 -0
  108. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +13 -0
  109. aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +5 -0
  110. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +3 -0
  111. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +24 -0
  112. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
  113. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +33 -0
  114. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +3 -0
  115. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +43 -0
  116. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +3 -0
  117. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +9 -0
  118. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/__init__.py +3 -0
  119. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/default.yaml +61 -0
  120. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  121. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  122. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  123. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  124. aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +5 -0
  125. aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +607 -0
  126. aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +25 -0
  127. aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +212 -0
  128. aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +210 -0
  129. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
  130. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
  131. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
  132. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
  133. aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +180 -0
  134. aiagents4pharma/talk2knowledgegraphs/install.md +165 -0
  135. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +886 -0
  136. aiagents4pharma/talk2knowledgegraphs/states/__init__.py +5 -0
  137. aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +40 -0
  138. aiagents4pharma/talk2knowledgegraphs/tests/__init__.py +0 -0
  139. aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +318 -0
  140. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +248 -0
  141. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +33 -0
  142. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +86 -0
  143. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +125 -0
  144. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +257 -0
  145. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +1444 -0
  146. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +159 -0
  147. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +152 -0
  148. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +201 -0
  149. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_database_milvus_connection_manager.py +812 -0
  150. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +51 -0
  151. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +49 -0
  152. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +59 -0
  153. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +63 -0
  154. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py +47 -0
  155. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +40 -0
  156. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +94 -0
  157. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +70 -0
  158. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +45 -0
  159. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
  160. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +48 -0
  161. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +759 -0
  162. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +78 -0
  163. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +123 -0
  164. aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +11 -0
  165. aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +138 -0
  166. aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py +22 -0
  167. aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +965 -0
  168. aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
  169. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +291 -0
  170. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +123 -0
  171. aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +5 -0
  172. aiagents4pharma/talk2knowledgegraphs/utils/database/__init__.py +5 -0
  173. aiagents4pharma/talk2knowledgegraphs/utils/database/milvus_connection_manager.py +586 -0
  174. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +5 -0
  175. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +81 -0
  176. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +111 -0
  177. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +54 -0
  178. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +87 -0
  179. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +73 -0
  180. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +12 -0
  181. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +37 -0
  182. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +129 -0
  183. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +89 -0
  184. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +78 -0
  185. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +71 -0
  186. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +98 -0
  187. aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +5 -0
  188. aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +762 -0
  189. aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +298 -0
  190. aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +229 -0
  191. aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +67 -0
  192. aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +104 -0
  193. aiagents4pharma/talk2scholars/.dockerignore +13 -0
  194. aiagents4pharma/talk2scholars/Dockerfile +104 -0
  195. aiagents4pharma/talk2scholars/README.md +1 -0
  196. aiagents4pharma/talk2scholars/__init__.py +7 -0
  197. aiagents4pharma/talk2scholars/agents/__init__.py +13 -0
  198. aiagents4pharma/talk2scholars/agents/main_agent.py +89 -0
  199. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +96 -0
  200. aiagents4pharma/talk2scholars/agents/pdf_agent.py +101 -0
  201. aiagents4pharma/talk2scholars/agents/s2_agent.py +135 -0
  202. aiagents4pharma/talk2scholars/agents/zotero_agent.py +127 -0
  203. aiagents4pharma/talk2scholars/configs/__init__.py +7 -0
  204. aiagents4pharma/talk2scholars/configs/agents/__init__.py +7 -0
  205. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +7 -0
  206. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py +3 -0
  207. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +52 -0
  208. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
  209. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +19 -0
  210. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py +3 -0
  211. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +19 -0
  212. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py +3 -0
  213. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +44 -0
  214. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/__init__.py +3 -0
  215. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +19 -0
  216. aiagents4pharma/talk2scholars/configs/app/__init__.py +7 -0
  217. aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py +3 -0
  218. aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +72 -0
  219. aiagents4pharma/talk2scholars/configs/config.yaml +16 -0
  220. aiagents4pharma/talk2scholars/configs/tools/__init__.py +21 -0
  221. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py +3 -0
  222. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +26 -0
  223. aiagents4pharma/talk2scholars/configs/tools/paper_download/__init__.py +3 -0
  224. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  225. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py +3 -0
  226. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +62 -0
  227. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/__init__.py +3 -0
  228. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/default.yaml +12 -0
  229. aiagents4pharma/talk2scholars/configs/tools/search/__init__.py +3 -0
  230. aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +26 -0
  231. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/__init__.py +3 -0
  232. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +26 -0
  233. aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py +3 -0
  234. aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +57 -0
  235. aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py +3 -0
  236. aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
  237. aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
  238. aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
  239. aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
  240. aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
  241. aiagents4pharma/talk2scholars/install.md +122 -0
  242. aiagents4pharma/talk2scholars/state/__init__.py +7 -0
  243. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +98 -0
  244. aiagents4pharma/talk2scholars/tests/__init__.py +3 -0
  245. aiagents4pharma/talk2scholars/tests/test_agents_main_agent.py +256 -0
  246. aiagents4pharma/talk2scholars/tests/test_agents_paper_agents_download_agent.py +139 -0
  247. aiagents4pharma/talk2scholars/tests/test_agents_pdf_agent.py +114 -0
  248. aiagents4pharma/talk2scholars/tests/test_agents_s2_agent.py +198 -0
  249. aiagents4pharma/talk2scholars/tests/test_agents_zotero_agent.py +160 -0
  250. aiagents4pharma/talk2scholars/tests/test_s2_tools_display_dataframe.py +91 -0
  251. aiagents4pharma/talk2scholars/tests/test_s2_tools_query_dataframe.py +191 -0
  252. aiagents4pharma/talk2scholars/tests/test_states_state.py +38 -0
  253. aiagents4pharma/talk2scholars/tests/test_tools_paper_downloader.py +507 -0
  254. aiagents4pharma/talk2scholars/tests/test_tools_question_and_answer_tool.py +105 -0
  255. aiagents4pharma/talk2scholars/tests/test_tools_s2_multi.py +307 -0
  256. aiagents4pharma/talk2scholars/tests/test_tools_s2_retrieve.py +67 -0
  257. aiagents4pharma/talk2scholars/tests/test_tools_s2_search.py +286 -0
  258. aiagents4pharma/talk2scholars/tests/test_tools_s2_single.py +298 -0
  259. aiagents4pharma/talk2scholars/tests/test_utils_arxiv_downloader.py +469 -0
  260. aiagents4pharma/talk2scholars/tests/test_utils_base_paper_downloader.py +598 -0
  261. aiagents4pharma/talk2scholars/tests/test_utils_biorxiv_downloader.py +669 -0
  262. aiagents4pharma/talk2scholars/tests/test_utils_medrxiv_downloader.py +500 -0
  263. aiagents4pharma/talk2scholars/tests/test_utils_nvidia_nim_reranker.py +117 -0
  264. aiagents4pharma/talk2scholars/tests/test_utils_pdf_answer_formatter.py +67 -0
  265. aiagents4pharma/talk2scholars/tests/test_utils_pdf_batch_processor.py +92 -0
  266. aiagents4pharma/talk2scholars/tests/test_utils_pdf_collection_manager.py +173 -0
  267. aiagents4pharma/talk2scholars/tests/test_utils_pdf_document_processor.py +68 -0
  268. aiagents4pharma/talk2scholars/tests/test_utils_pdf_generate_answer.py +72 -0
  269. aiagents4pharma/talk2scholars/tests/test_utils_pdf_gpu_detection.py +129 -0
  270. aiagents4pharma/talk2scholars/tests/test_utils_pdf_paper_loader.py +116 -0
  271. aiagents4pharma/talk2scholars/tests/test_utils_pdf_rag_pipeline.py +88 -0
  272. aiagents4pharma/talk2scholars/tests/test_utils_pdf_retrieve_chunks.py +190 -0
  273. aiagents4pharma/talk2scholars/tests/test_utils_pdf_singleton_manager.py +159 -0
  274. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_normalization.py +121 -0
  275. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_store.py +406 -0
  276. aiagents4pharma/talk2scholars/tests/test_utils_pubmed_downloader.py +1007 -0
  277. aiagents4pharma/talk2scholars/tests/test_utils_read_helper_utils.py +106 -0
  278. aiagents4pharma/talk2scholars/tests/test_utils_s2_utils_ext_ids.py +403 -0
  279. aiagents4pharma/talk2scholars/tests/test_utils_tool_helper_utils.py +85 -0
  280. aiagents4pharma/talk2scholars/tests/test_utils_zotero_human_in_the_loop.py +266 -0
  281. aiagents4pharma/talk2scholars/tests/test_utils_zotero_path.py +496 -0
  282. aiagents4pharma/talk2scholars/tests/test_utils_zotero_pdf_downloader_utils.py +46 -0
  283. aiagents4pharma/talk2scholars/tests/test_utils_zotero_read.py +743 -0
  284. aiagents4pharma/talk2scholars/tests/test_utils_zotero_write.py +151 -0
  285. aiagents4pharma/talk2scholars/tools/__init__.py +9 -0
  286. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +12 -0
  287. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +442 -0
  288. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +22 -0
  289. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +207 -0
  290. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +336 -0
  291. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +313 -0
  292. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +196 -0
  293. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +323 -0
  294. aiagents4pharma/talk2scholars/tools/pdf/__init__.py +7 -0
  295. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +170 -0
  296. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +37 -0
  297. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  298. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +198 -0
  299. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  300. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  301. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
  302. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +59 -0
  303. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +150 -0
  304. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +97 -0
  305. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  306. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +113 -0
  307. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +197 -0
  308. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  309. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +86 -0
  310. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +150 -0
  311. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +327 -0
  312. aiagents4pharma/talk2scholars/tools/s2/__init__.py +21 -0
  313. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +110 -0
  314. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +111 -0
  315. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +233 -0
  316. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +128 -0
  317. aiagents4pharma/talk2scholars/tools/s2/search.py +101 -0
  318. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +102 -0
  319. aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +5 -0
  320. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +223 -0
  321. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +205 -0
  322. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +216 -0
  323. aiagents4pharma/talk2scholars/tools/zotero/__init__.py +7 -0
  324. aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +7 -0
  325. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +270 -0
  326. aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +74 -0
  327. aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +194 -0
  328. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +180 -0
  329. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +133 -0
  330. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +105 -0
  331. aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +162 -0
  332. aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +91 -0
  333. aiagents4pharma-0.0.0.dist-info/METADATA +335 -0
  334. aiagents4pharma-0.0.0.dist-info/RECORD +336 -0
  335. aiagents4pharma-0.0.0.dist-info/WHEEL +4 -0
  336. aiagents4pharma-0.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,198 @@
+ """
+ Batch processing utilities for adding multiple papers to vector store.
+ """
+
+ import concurrent.futures
+ import logging
+ import time
+ from typing import Any
+
+ from langchain_core.documents import Document
+
+ from .document_processor import load_and_split_pdf
+
+ logger = logging.getLogger(__name__)
+
+
+ def add_papers_batch(
+     papers_to_add: list[tuple[str, str, dict[str, Any]]],
+     vector_store: Any,
+     loaded_papers: set[str],
+     paper_metadata: dict[str, dict[str, Any]],
+     documents: dict[str, Document],
+     **kwargs: Any,
+ ) -> None:
+     """
+     Add multiple papers to the document store in parallel with batch embedding.
+
+     Args:
+         papers_to_add: List of tuples (paper_id, pdf_url, paper_metadata).
+         vector_store: The LangChain Milvus vector store instance.
+         loaded_papers: Set to track which papers are already loaded.
+         paper_metadata: Dict to store paper metadata after load.
+         documents: Dict to store document chunks.
+         config: (via kwargs) Configuration object.
+         metadata_fields: (via kwargs) List of metadata fields to include.
+         has_gpu: (via kwargs) Whether GPU is available.
+         max_workers: (via kwargs) Max PDF-loading threads (default 5).
+         batch_size: (via kwargs) Embedding batch size (default 100).
+     """
+     cfg = kwargs
+
+     if not papers_to_add:
+         logger.info("No papers to add")
+         return
+
+     to_process = [(pid, url, md) for pid, url, md in papers_to_add if pid not in loaded_papers]
+     if not to_process:
+         logger.info("Skipping %d already-loaded papers", len(papers_to_add))
+         logger.info("All %d papers are already loaded", len(papers_to_add))
+         return
+
+     logger.info(
+         "Starting PARALLEL batch processing of %d papers with %d workers (%s)",
+         len(to_process),
+         cfg.get("max_workers", 5),
+         "GPU acceleration" if cfg["has_gpu"] else "CPU processing",
+     )
+
+     chunks, ids, success = _parallel_load_and_split(
+         to_process,
+         cfg["config"],
+         cfg["metadata_fields"],
+         documents,
+         cfg.get("max_workers", 5),
+     )
+
+     if not chunks:
+         logger.warning("No chunks to add to vector store")
+         return
+
+     for pid, _, md in to_process:
+         if pid in success:
+             paper_metadata[pid] = md
+
+     try:
+         _batch_embed(
+             chunks,
+             ids,
+             vector_store,
+             cfg.get("batch_size", 100),
+             cfg["has_gpu"],
+         )
+     except Exception:
+         logger.error("Failed to add chunks to Milvus", exc_info=True)
+         raise
+
+     # finally mark papers as loaded
+     loaded_papers.update(success)
+
+
+ def _parallel_load_and_split(
+     papers: list[tuple[str, str, dict[str, Any]]],
+     config: Any,
+     metadata_fields: list[str],
+     documents: dict[str, Document],
+     max_workers: int,
+ ) -> tuple[list[Document], list[str], list[str]]:
+     """Load & split PDFs in parallel, preserving original logic."""
+     all_chunks: list[Document] = []
+     all_ids: list[str] = []
+     success: list[str] = []
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = {
+             executor.submit(
+                 load_and_split_pdf,
+                 pid,
+                 url,
+                 md,
+                 config,
+                 metadata_fields=metadata_fields,
+                 documents_dict=documents,
+             ): pid
+             for pid, url, md in papers
+         }
+         logger.info("Submitted %d PDF loading tasks", len(futures))
+
+         for idx, fut in enumerate(concurrent.futures.as_completed(futures), start=1):
+             pid = futures[fut]
+             chunks = fut.result()
+             ids = [f"{pid}_{i}" for i in range(len(chunks))]
+
+             all_chunks.extend(chunks)
+             all_ids.extend(ids)
+             success.append(pid)
+
+             logger.info(
+                 "Progress: %d/%d - Loaded paper %s (%d chunks)",
+                 idx,
+                 len(papers),
+                 pid,
+                 len(chunks),
+             )
+
+     return all_chunks, all_ids, success
+
+
+ def _batch_embed(
+     chunks: list[Document],
+     ids: list[str],
+     store: Any,
+     batch_size: int,
+     has_gpu: bool,
+ ) -> None:
+     """Embed chunks in batches and verify insertion exactly as before."""
+     start = time.time()
+     n = len(chunks)
+     logger.info(
+         "Starting BATCH EMBEDDING of %d chunks in batches of %d (%s)",
+         n,
+         batch_size,
+         "GPU" if has_gpu else "CPU",
+     )
+
+     for batch_num, start_idx in enumerate(range(0, n, batch_size), start=1):
+         end_idx = min(start_idx + batch_size, n)
+         logger.info(
+             "Embedding batch %d/%d (chunks %d-%d of %d) - %s",
+             batch_num,
+             (n + batch_size - 1) // batch_size,
+             start_idx + 1,
+             end_idx,
+             n,
+             "GPU" if has_gpu else "CPU",
+         )
+
+         store.add_documents(
+             documents=chunks[start_idx:end_idx],
+             ids=ids[start_idx:end_idx],
+         )
+
+         # Post-insert verification
+         col = store.col
+         col.flush()
+         count = col.num_entities
+         logger.info(
+             "Post-insert batch %d: collection has %d entities",
+             batch_num,
+             count,
+         )
+         if count:
+             logger.info(
+                 "Sample paper IDs: %s",
+                 [
+                     r.get("paper_id", "unknown")
+                     for r in col.query(expr="", output_fields=["paper_id"], limit=3)
+                 ],
+             )
+
+         logger.info("Successfully stored batch %d", batch_num)
+
+     elapsed = time.time() - start
+     logger.info(
+         "BATCH EMBEDDING COMPLETE: %d chunks in %.2f seconds (%.2f chunks/sec)",
+         n,
+         elapsed,
+         n / elapsed if elapsed > 0 else 0,
+     )
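
For orientation, a hedged usage sketch for add_papers_batch follows; it is not part of the package. It assumes a Milvus server at http://localhost:19530, an OpenAI API key for embeddings, and network access to a public arXiv PDF. The chunk_size/chunk_overlap values, the "Authors" metadata field, and the collection name are illustrative assumptions rather than values taken from the package configs; the docstring above only says the store is a LangChain Milvus instance, so the sketch builds one with langchain_milvus.

from types import SimpleNamespace

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

from aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor import add_papers_batch

# LangChain Milvus store; _batch_embed later uses its .col handle for flush/verification.
vector_store = Milvus(
    embedding_function=OpenAIEmbeddings(),
    collection_name="pdf_rag_documents",          # assumed collection name
    connection_args={"uri": "http://localhost:19530"},
)

loaded_papers: set[str] = set()        # updated in place with successfully loaded paper IDs
paper_metadata: dict[str, dict] = {}   # filled with per-paper metadata after loading
documents: dict = {}                   # filled with chunk Documents keyed "{paper_id}_{i}"

add_papers_batch(
    papers_to_add=[
        (
            "arxiv:1706.03762",                     # example paper ID
            "https://arxiv.org/pdf/1706.03762",
            {"Title": "Attention Is All You Need"},
        ),
    ],
    vector_store=vector_store,
    loaded_papers=loaded_papers,
    paper_metadata=paper_metadata,
    documents=documents,
    config=SimpleNamespace(chunk_size=1200, chunk_overlap=200),  # assumed splitter settings
    metadata_fields=["Authors"],   # hypothetical extra field to copy into chunk metadata
    has_gpu=False,
    max_workers=2,
    batch_size=50,
)

The three mutable containers mirror the per-session state kept by the PDF Q&A vector store: the function updates them in place rather than returning results.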
@@ -0,0 +1,172 @@
+ """
+ Collection Manager for Milvus
+ """
+
+ import logging
+ import os
+ import threading
+ from typing import Any
+
+ from pymilvus import (
+     Collection,
+     CollectionSchema,
+     DataType,
+     FieldSchema,
+     connections,
+     utility,
+ )
+
+ # Set up logging with configurable level
+ log_level = os.environ.get("LOG_LEVEL", "INFO")
+ logging.basicConfig(level=getattr(logging, log_level))
+ logger = logging.getLogger(__name__)
+ logger.setLevel(getattr(logging, log_level))
+
+ # Global cache for collections to avoid repeated creation checks
+ _collection_cache = {}
+ _cache_lock = threading.Lock()
+
+
+ def ensure_collection_exists(
+     collection_name: str, config: Any, index_params: dict[str, Any], has_gpu: bool
+ ) -> Collection:
+     """Ensure the Milvus collection exists before trying to sync or add documents."""
+
+     # Check cache first
+     with _cache_lock:
+         if collection_name in _collection_cache:
+             logger.debug("Returning cached collection: %s", collection_name)
+             return _collection_cache[collection_name]
+
+     try:
+         existing_collections = utility.list_collections()
+         if collection_name not in existing_collections:
+             logger.info(
+                 "Collection %s does not exist. Creating schema...",
+                 collection_name,
+             )
+
+             # Define schema
+             fields = [
+                 FieldSchema(
+                     name="id",
+                     dtype=DataType.VARCHAR,
+                     is_primary=True,
+                     auto_id=False,
+                     max_length=100,
+                 ),
+                 FieldSchema(
+                     name="embedding",
+                     dtype=DataType.FLOAT_VECTOR,
+                     dim=config.milvus.embedding_dim if config else 768,
+                 ),
+                 FieldSchema(
+                     name="text",
+                     dtype=DataType.VARCHAR,
+                     max_length=65535,
+                 ),
+                 FieldSchema(
+                     name="paper_id",
+                     dtype=DataType.VARCHAR,
+                     max_length=100,
+                 ),
+                 FieldSchema(
+                     name="title",
+                     dtype=DataType.VARCHAR,
+                     max_length=512,
+                 ),
+                 FieldSchema(
+                     name="chunk_id",
+                     dtype=DataType.INT64,
+                 ),
+                 FieldSchema(
+                     name="page",
+                     dtype=DataType.INT64,
+                 ),
+                 FieldSchema(
+                     name="source",
+                     dtype=DataType.VARCHAR,
+                     max_length=512,
+                 ),
+             ]
+
+             schema = CollectionSchema(
+                 fields=fields,
+                 description="RAG collection for embedded PDF chunks",
+                 enable_dynamic_field=True,
+             )
+
+             # Create collection
+             collection = Collection(
+                 name=collection_name,
+                 schema=schema,
+                 using="default",
+                 shards_num=2,
+             )
+             logger.info("Created collection: %s", collection_name)
+
+             # Create index on the embedding field with GPU/CPU optimization
+             logger.info(
+                 "Creating %s index on 'embedding' field for collection: %s",
+                 index_params["index_type"],
+                 collection_name,
+             )
+
+             collection.create_index(field_name="embedding", index_params=index_params)
+
+             index_type = index_params["index_type"]
+             logger.info(
+                 "Successfully created %s index on 'embedding' field for collection: %s",
+                 index_type,
+                 collection_name,
+             )
+
+         else:
+             logger.info("Collection %s already exists. Loading it.", collection_name)
+             collection = Collection(name=collection_name, using="default")
+
+         collection.load()
+
+         def debug_collection_state(collection, collection_name):
+             """Debug collection state for troubleshooting."""
+             logger.info("=== DEBUG COLLECTION STATE ===")
+             logger.info("Collection name: %s", collection_name)
+             logger.info("Collection schema: %s", collection.schema)
+             logger.info("Collection num_entities: %d", collection.num_entities)
+
+             # Check if collection is actually loaded
+             # logger.info("Is collection loaded: %s", collection.load)
+
+             # Check available indexes
+             indexes = collection.indexes
+             logger.info("Collection indexes: %s", [idx.field_name for idx in indexes])
+
+             # Try to get collection stats
+             logger.info("Collection statistics: %s", collection.num_entities)
+
+             logger.info("Active connections: %s", connections.list_connections())
+
+             logger.info("=== END DEBUG ===")
+
+         debug_collection_state(collection, collection_name)
+
+         # Log collection statistics with GPU/CPU info
+         num_entities = collection.num_entities
+         gpu_info = " (GPU accelerated)" if has_gpu else " (CPU only)"
+         logger.info(
+             "Collection %s is loaded and ready with %d entities%s",
+             collection_name,
+             num_entities,
+             gpu_info,
+         )
+
+         # Cache the collection
+         with _cache_lock:
+             _collection_cache[collection_name] = collection
+             logger.debug("Cached collection: %s", collection_name)
+
+         return collection  # Return the collection object
+
+     except Exception as e:
+         logger.error("Failed to ensure collection exists: %s", e, exc_info=True)
+         raise
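
A minimal sketch (not from the package) of driving ensure_collection_exists: it assumes a Milvus server reachable at localhost:19530 and uses ordinary HNSW index parameters chosen for illustration rather than values from the package configs. With config=None the embedding field falls back to 768 dimensions, as in the schema above.

from pymilvus import connections

from aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager import (
    ensure_collection_exists,
)

# Register the "default" connection alias that the Collection objects above rely on.
connections.connect(alias="default", host="localhost", port="19530")

index_params = {
    "index_type": "HNSW",      # a CPU index type; a GPU build might use e.g. GPU_IVF_FLAT
    "metric_type": "L2",
    "params": {"M": 16, "efConstruction": 200},
}

collection = ensure_collection_exists(
    collection_name="pdf_rag_documents",   # assumed collection name
    config=None,                           # falls back to dim=768 for the embedding field
    index_params=index_params,
    has_gpu=False,
)
print(collection.num_entities)

# A second call with the same name returns the module-level cached Collection.
assert ensure_collection_exists("pdf_rag_documents", None, index_params, False) is collection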
@@ -0,0 +1,76 @@
+ """
+ Document processing utilities for loading and splitting PDFs.
+ """
+
+ import logging
+ from typing import Any
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_core.documents import Document
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_and_split_pdf(
+     paper_id: str,
+     pdf_url: str,
+     paper_metadata: dict[str, Any],
+     config: Any,
+     **kwargs: Any,
+ ) -> list[Document]:
+     """
+     Load a PDF and split it into chunks.
+
+     Args:
+         paper_id: Unique identifier for the paper.
+         pdf_url: URL to the PDF.
+         paper_metadata: Metadata about the paper (e.g. Title, Authors, etc.).
+         config: Configuration object with `chunk_size` and `chunk_overlap` attributes.
+         metadata_fields: List of additional metadata keys to propagate into each
+             chunk (passed via kwargs).
+         documents_dict: Dictionary where split chunks will also be stored under keys
+             of the form "{paper_id}_{chunk_index}" (passed via kwargs).
+
+     Returns:
+         A list of Document chunks, each with updated metadata.
+     """
+     metadata_fields: list[str] = kwargs["metadata_fields"]
+     documents_dict: dict[str, Document] = kwargs["documents_dict"]
+
+     logger.info("Loading PDF for paper %s from %s", paper_id, pdf_url)
+
+     # Load pages
+     documents = PyPDFLoader(pdf_url).load()
+     logger.info("Loaded %d pages from paper %s", len(documents), paper_id)
+
+     if config is None:
+         raise ValueError("Configuration is required for text splitting in Vectorstore.")
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=config.chunk_size,
+         chunk_overlap=config.chunk_overlap,
+         separators=["\n\n", "\n", ". ", " ", ""],
+     )
+
+     # Split into chunks
+     chunks = splitter.split_documents(documents)
+     logger.info("Split paper %s into %d chunks", paper_id, len(chunks))
+
+     # Attach metadata & populate documents_dict
+     for i, chunk in enumerate(chunks):
+         chunk_id = f"{paper_id}_{i}"
+         chunk.metadata.update(
+             {
+                 "paper_id": paper_id,
+                 "title": paper_metadata.get("Title", "Unknown"),
+                 "chunk_id": i,
+                 "page": chunk.metadata.get("page", 0),
+                 "source": pdf_url,
+             }
+         )
+         for field in metadata_fields:
+             if field in paper_metadata and field not in chunk.metadata:
+                 chunk.metadata[field] = paper_metadata[field]
+         documents_dict[chunk_id] = chunk
+
+     return chunks
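
A hedged example of calling load_and_split_pdf directly (it is normally invoked through add_papers_batch). The arXiv URL, the metadata values, and the chunking settings are illustrative assumptions, and fetching the PDF requires network access.

from types import SimpleNamespace

from aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor import load_and_split_pdf

documents_dict = {}  # receives each chunk under the key "{paper_id}_{chunk_index}"

chunks = load_and_split_pdf(
    paper_id="arxiv:1810.04805",
    pdf_url="https://arxiv.org/pdf/1810.04805",
    paper_metadata={
        "Title": "BERT: Pre-training of Deep Bidirectional Transformers",
        "Authors": "Devlin et al.",
    },
    config=SimpleNamespace(chunk_size=1200, chunk_overlap=200),  # assumed splitter settings
    metadata_fields=["Authors"],  # copied into chunk metadata because it is in paper_metadata
    documents_dict=documents_dict,
)
print(len(chunks), chunks[0].metadata["title"], chunks[0].metadata["Authors"])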
@@ -0,0 +1,97 @@
+ """
+ Generate an answer for a question using retrieved chunks of documents.
+ """
+
+ import logging
+ import os
+ from typing import Any
+
+ import hydra
+ from langchain_core.documents import Document
+ from langchain_core.language_models.chat_models import BaseChatModel
+
+ # Set up logging with configurable level
+ log_level = os.environ.get("LOG_LEVEL", "INFO")
+ logging.basicConfig(level=getattr(logging, log_level))
+ logger = logging.getLogger(__name__)
+ logger.setLevel(getattr(logging, log_level))
+
+
+ def load_hydra_config() -> Any:
+     """
+     Load the configuration using Hydra and return the configuration for the Q&A tool.
+     """
+     with hydra.initialize(version_base=None, config_path="../../../configs"):
+         cfg = hydra.compose(
+             config_name="config",
+             overrides=["tools/question_and_answer=default"],
+         )
+         config = cfg.tools.question_and_answer
+         logger.debug("Loaded Question and Answer tool configuration.")
+         return config
+
+
+ def _build_context_and_sources(
+     retrieved_chunks: list[Document],
+ ) -> tuple[str, set[str]]:
+     """
+     Build the combined context string and set of paper_ids from retrieved chunks.
+     """
+     papers = {}
+     for doc in retrieved_chunks:
+         pid = doc.metadata.get("paper_id", "unknown")
+         papers.setdefault(pid, []).append(doc)
+     formatted = []
+     idx = 1
+     for pid, chunks in papers.items():
+         title = chunks[0].metadata.get("title", "Unknown")
+         formatted.append(f"[Document {idx}] From: '{title}' (ID: {pid})")
+         for chunk in chunks:
+             page = chunk.metadata.get("page", "unknown")
+             formatted.append(f"Page {page}: {chunk.page_content}")
+         idx += 1
+     context = "\n\n".join(formatted)
+     sources: set[str] = set()
+     for doc in retrieved_chunks:
+         pid = doc.metadata.get("paper_id")
+         if isinstance(pid, str):
+             sources.add(pid)
+     return context, sources
+
+
+ def generate_answer(
+     question: str,
+     retrieved_chunks: list[Document],
+     llm_model: BaseChatModel,
+     config: Any,
+ ) -> dict[str, Any]:
+     """
+     Generate an answer for a question using retrieved chunks.
+
+     Args:
+         question (str): The question to answer
+         retrieved_chunks (List[Document]): List of relevant document chunks
+         llm_model (BaseChatModel): Language model for generating answers
+         config (Any): Configuration for answer generation
+
+     Returns:
+         Dict[str, Any]: Dictionary with the answer and metadata
+     """
+     # Ensure the configuration is provided and has the prompt_template.
+     if config is None:
+         raise ValueError("Configuration for generate_answer is required.")
+     if "prompt_template" not in config:
+         raise ValueError("The prompt_template is missing from the configuration.")
+
+     # Build context and sources, then invoke LLM
+     context, paper_sources = _build_context_and_sources(retrieved_chunks)
+     prompt = config["prompt_template"].format(context=context, question=question)
+     response = llm_model.invoke(prompt)
+
+     # Return the response with metadata
+     return {
+         "output_text": response.content,
+         "sources": [doc.metadata for doc in retrieved_chunks],
+         "num_sources": len(retrieved_chunks),
+         "papers_used": list(paper_sources),
+     }
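
Because generate_answer only needs documents, a chat model, and a config carrying a prompt_template, it can be exercised offline. The sketch below is not from the package: it substitutes langchain-core's FakeListChatModel and a hand-written template for the Hydra config returned by load_hydra_config, and the document contents are invented for illustration.

from langchain_core.documents import Document
from langchain_core.language_models import FakeListChatModel

from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import generate_answer

chunks = [
    Document(
        page_content="The inhibitor reduced tumor volume by 40% in the treated cohort.",
        metadata={"paper_id": "paper_1", "title": "Example Oncology Paper", "page": 3},
    ),
    Document(
        page_content="No dose-limiting toxicity was observed at 10 mg/kg.",
        metadata={"paper_id": "paper_1", "title": "Example Oncology Paper", "page": 5},
    ),
]
config = {
    "prompt_template": "Answer from the context only.\n\nContext:\n{context}\n\nQuestion: {question}"
}
# Fake model that simply returns the canned string, standing in for a real BaseChatModel.
llm = FakeListChatModel(responses=["Tumor volume dropped by 40% with no dose-limiting toxicity."])

result = generate_answer("What did the inhibitor do?", chunks, llm, config)
print(result["output_text"])
print(result["papers_used"])   # ["paper_1"]
print(result["num_sources"])   # 2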
@@ -0,0 +1,59 @@
+ """
+ Create or retrieve a Vectorstore instance for PDF RAG.
+ """
+
+ import logging
+ import threading
+ from typing import Any
+
+ from langchain_core.embeddings import Embeddings
+
+ from .vector_store import Vectorstore
+
+ logger = logging.getLogger(__name__)
+
+ # Global cache for Vectorstore instances
+ _vectorstore_cache = {}
+ _cache_lock = threading.Lock()
+
+
+ def get_vectorstore(
+     embedding_model: Embeddings, config: Any, force_new: bool = False
+ ) -> "Vectorstore":
+     """
+     Factory function to get or create a Vectorstore instance.
+     Ensures the same instance is reused across the application.
+
+     Args:
+         embedding_model: The embedding model to use
+         config: Configuration object
+         force_new: Force creation of a new instance
+
+     Returns:
+         Vectorstore instance
+     """
+     collection_name = config.milvus.collection_name if config else "pdf_rag_documents"
+
+     with _cache_lock:
+         if force_new and collection_name in _vectorstore_cache:
+             del _vectorstore_cache[collection_name]
+             logger.info("Forced new Vectorstore instance for collection: %s", collection_name)
+
+         if collection_name not in _vectorstore_cache:
+             logger.info("Creating new Vectorstore instance for collection: %s", collection_name)
+             _vectorstore_cache[collection_name] = Vectorstore(
+                 embedding_model=embedding_model, config=config
+             )
+         else:
+             logger.info(
+                 "Reusing existing Vectorstore instance for collection: %s",
+                 collection_name,
+             )
+             # Update embedding model if different
+             existing = _vectorstore_cache[collection_name]
+             if existing.embedding_model != embedding_model:
+                 logger.warning("Embedding model changed, updating existing instance")
+                 existing.embedding_model = embedding_model
+                 existing.vector_store.embedding_function = embedding_model
+
+         return _vectorstore_cache[collection_name]
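
To illustrate the caching contract (a sketch, not package code): repeated calls keyed on the same config.milvus.collection_name return the same cached Vectorstore, while force_new=True rebuilds it. The sketch assumes the Q&A tool's Hydra config loaded by load_hydra_config (see generate_answer.py above) carries the milvus settings that Vectorstore and get_vectorstore read, and that a Milvus server plus embedding credentials are available.

from langchain_openai import OpenAIEmbeddings

from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import load_hydra_config
from aiagents4pharma.talk2scholars.tools.pdf.utils.get_vectorstore import get_vectorstore

cfg = load_hydra_config()   # assumed to expose cfg.milvus.collection_name and related settings
emb = OpenAIEmbeddings()    # any langchain_core Embeddings implementation works

store_a = get_vectorstore(emb, cfg)   # creates the Vectorstore and caches it
store_b = get_vectorstore(emb, cfg)   # cache hit: the same object is returned
assert store_a is store_b

fresh = get_vectorstore(emb, cfg, force_new=True)   # drops the cache entry and rebuilds
assert fresh is not store_a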