aiagents4pharma 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (336) hide show
  1. aiagents4pharma/__init__.py +11 -0
  2. aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
  3. aiagents4pharma/talk2aiagents4pharma/Dockerfile +133 -0
  4. aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
  5. aiagents4pharma/talk2aiagents4pharma/__init__.py +5 -0
  6. aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +6 -0
  7. aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +70 -0
  8. aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +5 -0
  9. aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +5 -0
  10. aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +29 -0
  11. aiagents4pharma/talk2aiagents4pharma/configs/app/__init__.py +0 -0
  12. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/__init__.py +0 -0
  13. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/default.yaml +102 -0
  14. aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +4 -0
  15. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
  16. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
  17. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
  18. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
  19. aiagents4pharma/talk2aiagents4pharma/install.md +154 -0
  20. aiagents4pharma/talk2aiagents4pharma/states/__init__.py +5 -0
  21. aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +18 -0
  22. aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +3 -0
  23. aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +312 -0
  24. aiagents4pharma/talk2biomodels/.dockerignore +13 -0
  25. aiagents4pharma/talk2biomodels/Dockerfile +104 -0
  26. aiagents4pharma/talk2biomodels/README.md +1 -0
  27. aiagents4pharma/talk2biomodels/__init__.py +5 -0
  28. aiagents4pharma/talk2biomodels/agents/__init__.py +6 -0
  29. aiagents4pharma/talk2biomodels/agents/t2b_agent.py +104 -0
  30. aiagents4pharma/talk2biomodels/api/__init__.py +5 -0
  31. aiagents4pharma/talk2biomodels/api/ols.py +75 -0
  32. aiagents4pharma/talk2biomodels/api/uniprot.py +36 -0
  33. aiagents4pharma/talk2biomodels/configs/__init__.py +5 -0
  34. aiagents4pharma/talk2biomodels/configs/agents/__init__.py +5 -0
  35. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +3 -0
  36. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +14 -0
  37. aiagents4pharma/talk2biomodels/configs/app/__init__.py +0 -0
  38. aiagents4pharma/talk2biomodels/configs/app/frontend/__init__.py +0 -0
  39. aiagents4pharma/talk2biomodels/configs/app/frontend/default.yaml +72 -0
  40. aiagents4pharma/talk2biomodels/configs/config.yaml +7 -0
  41. aiagents4pharma/talk2biomodels/configs/tools/__init__.py +5 -0
  42. aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +3 -0
  43. aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +30 -0
  44. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +3 -0
  45. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +8 -0
  46. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +3 -0
  47. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +8 -0
  48. aiagents4pharma/talk2biomodels/install.md +63 -0
  49. aiagents4pharma/talk2biomodels/models/__init__.py +5 -0
  50. aiagents4pharma/talk2biomodels/models/basico_model.py +125 -0
  51. aiagents4pharma/talk2biomodels/models/sys_bio_model.py +60 -0
  52. aiagents4pharma/talk2biomodels/states/__init__.py +6 -0
  53. aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +49 -0
  54. aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
  55. aiagents4pharma/talk2biomodels/tests/__init__.py +3 -0
  56. aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
  57. aiagents4pharma/talk2biomodels/tests/test_api.py +31 -0
  58. aiagents4pharma/talk2biomodels/tests/test_ask_question.py +42 -0
  59. aiagents4pharma/talk2biomodels/tests/test_basico_model.py +67 -0
  60. aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +190 -0
  61. aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +92 -0
  62. aiagents4pharma/talk2biomodels/tests/test_integration.py +116 -0
  63. aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +35 -0
  64. aiagents4pharma/talk2biomodels/tests/test_param_scan.py +71 -0
  65. aiagents4pharma/talk2biomodels/tests/test_query_article.py +184 -0
  66. aiagents4pharma/talk2biomodels/tests/test_save_model.py +47 -0
  67. aiagents4pharma/talk2biomodels/tests/test_search_models.py +35 -0
  68. aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +44 -0
  69. aiagents4pharma/talk2biomodels/tests/test_steady_state.py +86 -0
  70. aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +67 -0
  71. aiagents4pharma/talk2biomodels/tools/__init__.py +17 -0
  72. aiagents4pharma/talk2biomodels/tools/ask_question.py +125 -0
  73. aiagents4pharma/talk2biomodels/tools/custom_plotter.py +165 -0
  74. aiagents4pharma/talk2biomodels/tools/get_annotation.py +342 -0
  75. aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +159 -0
  76. aiagents4pharma/talk2biomodels/tools/load_arguments.py +134 -0
  77. aiagents4pharma/talk2biomodels/tools/load_biomodel.py +44 -0
  78. aiagents4pharma/talk2biomodels/tools/parameter_scan.py +310 -0
  79. aiagents4pharma/talk2biomodels/tools/query_article.py +64 -0
  80. aiagents4pharma/talk2biomodels/tools/save_model.py +98 -0
  81. aiagents4pharma/talk2biomodels/tools/search_models.py +96 -0
  82. aiagents4pharma/talk2biomodels/tools/simulate_model.py +137 -0
  83. aiagents4pharma/talk2biomodels/tools/steady_state.py +187 -0
  84. aiagents4pharma/talk2biomodels/tools/utils.py +23 -0
  85. aiagents4pharma/talk2cells/README.md +1 -0
  86. aiagents4pharma/talk2cells/__init__.py +5 -0
  87. aiagents4pharma/talk2cells/agents/__init__.py +6 -0
  88. aiagents4pharma/talk2cells/agents/scp_agent.py +87 -0
  89. aiagents4pharma/talk2cells/states/__init__.py +6 -0
  90. aiagents4pharma/talk2cells/states/state_talk2cells.py +15 -0
  91. aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +22 -0
  92. aiagents4pharma/talk2cells/tools/__init__.py +6 -0
  93. aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +6 -0
  94. aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +27 -0
  95. aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +78 -0
  96. aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
  97. aiagents4pharma/talk2knowledgegraphs/Dockerfile +131 -0
  98. aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
  99. aiagents4pharma/talk2knowledgegraphs/__init__.py +5 -0
  100. aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +5 -0
  101. aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +99 -0
  102. aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +5 -0
  103. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +3 -0
  104. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +62 -0
  105. aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +5 -0
  106. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +3 -0
  107. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +79 -0
  108. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +13 -0
  109. aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +5 -0
  110. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +3 -0
  111. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +24 -0
  112. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
  113. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +33 -0
  114. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +3 -0
  115. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +43 -0
  116. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +3 -0
  117. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +9 -0
  118. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/__init__.py +3 -0
  119. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/default.yaml +61 -0
  120. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  121. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  122. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  123. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  124. aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +5 -0
  125. aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +607 -0
  126. aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +25 -0
  127. aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +212 -0
  128. aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +210 -0
  129. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
  130. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
  131. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
  132. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
  133. aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +180 -0
  134. aiagents4pharma/talk2knowledgegraphs/install.md +165 -0
  135. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +886 -0
  136. aiagents4pharma/talk2knowledgegraphs/states/__init__.py +5 -0
  137. aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +40 -0
  138. aiagents4pharma/talk2knowledgegraphs/tests/__init__.py +0 -0
  139. aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +318 -0
  140. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +248 -0
  141. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +33 -0
  142. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +86 -0
  143. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +125 -0
  144. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +257 -0
  145. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +1444 -0
  146. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +159 -0
  147. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +152 -0
  148. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +201 -0
  149. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_database_milvus_connection_manager.py +812 -0
  150. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +51 -0
  151. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +49 -0
  152. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +59 -0
  153. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +63 -0
  154. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py +47 -0
  155. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +40 -0
  156. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +94 -0
  157. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +70 -0
  158. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +45 -0
  159. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
  160. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +48 -0
  161. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +759 -0
  162. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +78 -0
  163. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +123 -0
  164. aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +11 -0
  165. aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +138 -0
  166. aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py +22 -0
  167. aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +965 -0
  168. aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
  169. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +291 -0
  170. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +123 -0
  171. aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +5 -0
  172. aiagents4pharma/talk2knowledgegraphs/utils/database/__init__.py +5 -0
  173. aiagents4pharma/talk2knowledgegraphs/utils/database/milvus_connection_manager.py +586 -0
  174. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +5 -0
  175. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +81 -0
  176. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +111 -0
  177. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +54 -0
  178. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +87 -0
  179. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +73 -0
  180. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +12 -0
  181. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +37 -0
  182. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +129 -0
  183. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +89 -0
  184. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +78 -0
  185. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +71 -0
  186. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +98 -0
  187. aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +5 -0
  188. aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +762 -0
  189. aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +298 -0
  190. aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +229 -0
  191. aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +67 -0
  192. aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +104 -0
  193. aiagents4pharma/talk2scholars/.dockerignore +13 -0
  194. aiagents4pharma/talk2scholars/Dockerfile +104 -0
  195. aiagents4pharma/talk2scholars/README.md +1 -0
  196. aiagents4pharma/talk2scholars/__init__.py +7 -0
  197. aiagents4pharma/talk2scholars/agents/__init__.py +13 -0
  198. aiagents4pharma/talk2scholars/agents/main_agent.py +89 -0
  199. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +96 -0
  200. aiagents4pharma/talk2scholars/agents/pdf_agent.py +101 -0
  201. aiagents4pharma/talk2scholars/agents/s2_agent.py +135 -0
  202. aiagents4pharma/talk2scholars/agents/zotero_agent.py +127 -0
  203. aiagents4pharma/talk2scholars/configs/__init__.py +7 -0
  204. aiagents4pharma/talk2scholars/configs/agents/__init__.py +7 -0
  205. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +7 -0
  206. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py +3 -0
  207. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +52 -0
  208. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
  209. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +19 -0
  210. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py +3 -0
  211. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +19 -0
  212. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py +3 -0
  213. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +44 -0
  214. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/__init__.py +3 -0
  215. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +19 -0
  216. aiagents4pharma/talk2scholars/configs/app/__init__.py +7 -0
  217. aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py +3 -0
  218. aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +72 -0
  219. aiagents4pharma/talk2scholars/configs/config.yaml +16 -0
  220. aiagents4pharma/talk2scholars/configs/tools/__init__.py +21 -0
  221. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py +3 -0
  222. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +26 -0
  223. aiagents4pharma/talk2scholars/configs/tools/paper_download/__init__.py +3 -0
  224. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  225. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py +3 -0
  226. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +62 -0
  227. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/__init__.py +3 -0
  228. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/default.yaml +12 -0
  229. aiagents4pharma/talk2scholars/configs/tools/search/__init__.py +3 -0
  230. aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +26 -0
  231. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/__init__.py +3 -0
  232. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +26 -0
  233. aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py +3 -0
  234. aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +57 -0
  235. aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py +3 -0
  236. aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
  237. aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
  238. aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
  239. aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
  240. aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
  241. aiagents4pharma/talk2scholars/install.md +122 -0
  242. aiagents4pharma/talk2scholars/state/__init__.py +7 -0
  243. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +98 -0
  244. aiagents4pharma/talk2scholars/tests/__init__.py +3 -0
  245. aiagents4pharma/talk2scholars/tests/test_agents_main_agent.py +256 -0
  246. aiagents4pharma/talk2scholars/tests/test_agents_paper_agents_download_agent.py +139 -0
  247. aiagents4pharma/talk2scholars/tests/test_agents_pdf_agent.py +114 -0
  248. aiagents4pharma/talk2scholars/tests/test_agents_s2_agent.py +198 -0
  249. aiagents4pharma/talk2scholars/tests/test_agents_zotero_agent.py +160 -0
  250. aiagents4pharma/talk2scholars/tests/test_s2_tools_display_dataframe.py +91 -0
  251. aiagents4pharma/talk2scholars/tests/test_s2_tools_query_dataframe.py +191 -0
  252. aiagents4pharma/talk2scholars/tests/test_states_state.py +38 -0
  253. aiagents4pharma/talk2scholars/tests/test_tools_paper_downloader.py +507 -0
  254. aiagents4pharma/talk2scholars/tests/test_tools_question_and_answer_tool.py +105 -0
  255. aiagents4pharma/talk2scholars/tests/test_tools_s2_multi.py +307 -0
  256. aiagents4pharma/talk2scholars/tests/test_tools_s2_retrieve.py +67 -0
  257. aiagents4pharma/talk2scholars/tests/test_tools_s2_search.py +286 -0
  258. aiagents4pharma/talk2scholars/tests/test_tools_s2_single.py +298 -0
  259. aiagents4pharma/talk2scholars/tests/test_utils_arxiv_downloader.py +469 -0
  260. aiagents4pharma/talk2scholars/tests/test_utils_base_paper_downloader.py +598 -0
  261. aiagents4pharma/talk2scholars/tests/test_utils_biorxiv_downloader.py +669 -0
  262. aiagents4pharma/talk2scholars/tests/test_utils_medrxiv_downloader.py +500 -0
  263. aiagents4pharma/talk2scholars/tests/test_utils_nvidia_nim_reranker.py +117 -0
  264. aiagents4pharma/talk2scholars/tests/test_utils_pdf_answer_formatter.py +67 -0
  265. aiagents4pharma/talk2scholars/tests/test_utils_pdf_batch_processor.py +92 -0
  266. aiagents4pharma/talk2scholars/tests/test_utils_pdf_collection_manager.py +173 -0
  267. aiagents4pharma/talk2scholars/tests/test_utils_pdf_document_processor.py +68 -0
  268. aiagents4pharma/talk2scholars/tests/test_utils_pdf_generate_answer.py +72 -0
  269. aiagents4pharma/talk2scholars/tests/test_utils_pdf_gpu_detection.py +129 -0
  270. aiagents4pharma/talk2scholars/tests/test_utils_pdf_paper_loader.py +116 -0
  271. aiagents4pharma/talk2scholars/tests/test_utils_pdf_rag_pipeline.py +88 -0
  272. aiagents4pharma/talk2scholars/tests/test_utils_pdf_retrieve_chunks.py +190 -0
  273. aiagents4pharma/talk2scholars/tests/test_utils_pdf_singleton_manager.py +159 -0
  274. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_normalization.py +121 -0
  275. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_store.py +406 -0
  276. aiagents4pharma/talk2scholars/tests/test_utils_pubmed_downloader.py +1007 -0
  277. aiagents4pharma/talk2scholars/tests/test_utils_read_helper_utils.py +106 -0
  278. aiagents4pharma/talk2scholars/tests/test_utils_s2_utils_ext_ids.py +403 -0
  279. aiagents4pharma/talk2scholars/tests/test_utils_tool_helper_utils.py +85 -0
  280. aiagents4pharma/talk2scholars/tests/test_utils_zotero_human_in_the_loop.py +266 -0
  281. aiagents4pharma/talk2scholars/tests/test_utils_zotero_path.py +496 -0
  282. aiagents4pharma/talk2scholars/tests/test_utils_zotero_pdf_downloader_utils.py +46 -0
  283. aiagents4pharma/talk2scholars/tests/test_utils_zotero_read.py +743 -0
  284. aiagents4pharma/talk2scholars/tests/test_utils_zotero_write.py +151 -0
  285. aiagents4pharma/talk2scholars/tools/__init__.py +9 -0
  286. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +12 -0
  287. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +442 -0
  288. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +22 -0
  289. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +207 -0
  290. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +336 -0
  291. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +313 -0
  292. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +196 -0
  293. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +323 -0
  294. aiagents4pharma/talk2scholars/tools/pdf/__init__.py +7 -0
  295. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +170 -0
  296. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +37 -0
  297. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  298. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +198 -0
  299. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  300. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  301. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
  302. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +59 -0
  303. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +150 -0
  304. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +97 -0
  305. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  306. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +113 -0
  307. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +197 -0
  308. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  309. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +86 -0
  310. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +150 -0
  311. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +327 -0
  312. aiagents4pharma/talk2scholars/tools/s2/__init__.py +21 -0
  313. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +110 -0
  314. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +111 -0
  315. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +233 -0
  316. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +128 -0
  317. aiagents4pharma/talk2scholars/tools/s2/search.py +101 -0
  318. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +102 -0
  319. aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +5 -0
  320. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +223 -0
  321. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +205 -0
  322. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +216 -0
  323. aiagents4pharma/talk2scholars/tools/zotero/__init__.py +7 -0
  324. aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +7 -0
  325. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +270 -0
  326. aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +74 -0
  327. aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +194 -0
  328. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +180 -0
  329. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +133 -0
  330. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +105 -0
  331. aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +162 -0
  332. aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +91 -0
  333. aiagents4pharma-0.0.0.dist-info/METADATA +335 -0
  334. aiagents4pharma-0.0.0.dist-info/RECORD +336 -0
  335. aiagents4pharma-0.0.0.dist-info/WHEEL +4 -0
  336. aiagents4pharma-0.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,669 @@
1
+ """
2
+ Unit tests for BiorxivDownloader.
3
+ Tests CloudScraper integration, JSON API interaction, and PDF download with CloudFlare protection.
4
+ """
5
+
6
+ import unittest
7
+ from unittest.mock import Mock, patch
8
+
9
+ import requests
10
+
11
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.biorxiv_downloader import (
12
+ BiorxivDownloader,
13
+ )
14
+
15
+
16
+ class BiorxivDownloaderTestShim(BiorxivDownloader):
17
+ """biorxiv_downloader test shim to expose protected methods."""
18
+
19
+ __test__ = False
20
+
21
+ def set_scraper(self, scraper):
22
+ """set_scraper is a public method to set the scraper."""
23
+ self._scraper = scraper
24
+
25
+ def get_scraper_public(self):
26
+ """get_scraper_public is a public method to access the scraper."""
27
+ return self._get_scraper()
28
+
29
+ def visit_landing_page_public(self, scraper, pdf_url, identifier):
30
+ """call visit_landing_page with public access."""
31
+ return self._visit_landing_page(scraper, pdf_url, identifier)
32
+
33
+ def save_pdf_to_temp_public(self, response):
34
+ """save_pdf_to_temp_public is a public method to save PDF response."""
35
+ return self._save_pdf_to_temp(response)
36
+
37
+ def extract_filename_public(self, response, identifier):
38
+ """extract_filename_public is a public method to extract filename from response."""
39
+ return self._extract_filename(response, identifier)
40
+
41
+ def extract_basic_metadata_public(self, paper, identifier):
42
+ """extract_basic_metadata_public is a public method to extract basic metadata."""
43
+ return self._extract_basic_metadata(paper, identifier)
44
+
45
+ def extract_authors_public(self, authors_str):
46
+ """extract_authors_public is a public method to extract authors from a string."""
47
+ return self._extract_authors(authors_str)
48
+
49
+ def get_paper_identifier_info_public(self, paper):
50
+ """get_paper_identifier_info_public is a public method to get paper identifier info."""
51
+ return self._get_paper_identifier_info(paper)
52
+
53
+ def add_service_identifier_public(self, entry, identifier):
54
+ """add_service_identifier_public is a public method to add service identifier."""
55
+ self._add_service_identifier(entry, identifier)
56
+
57
+
58
+ class TestBiorxivDownloader(unittest.TestCase):
59
+ """Tests for the BiorxivDownloader class."""
60
+
61
+ @patch("cloudscraper.create_scraper")
62
+ def setUp(self, mock_create_scraper):
63
+ """Set up test fixtures."""
64
+ self.mock_config = Mock()
65
+ self.mock_config.api_url = "https://api.biorxiv.org/details"
66
+ self.mock_config.pdf_url_template = (
67
+ "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
68
+ )
69
+ self.mock_config.user_agent = "test-agent"
70
+ self.mock_config.cf_clearance_timeout = 10
71
+ self.mock_config.request_timeout = 30
72
+ self.mock_config.chunk_size = 8192
73
+ self.mock_config.session_reuse = True
74
+ self.mock_config.default_version = "1"
75
+ self.mock_config.browser_config = {"type": "custom"}
76
+
77
+ # Mock the scraper creation during initialization
78
+ mock_scraper = Mock()
79
+ mock_create_scraper.return_value = mock_scraper
80
+
81
+ self.downloader = BiorxivDownloaderTestShim(self.mock_config)
82
+ self.initial_scraper = mock_scraper
83
+ self.downloader.set_scraper(mock_scraper)
84
+
85
+ # Sample bioRxiv API response
86
+ self.sample_json_response = {
87
+ "collection": [
88
+ {
89
+ "title": "Test BioRxiv Paper",
90
+ "authors": "John Doe; Jane Smith",
91
+ "abstract": "This is a test abstract for bioRxiv paper.",
92
+ "date": "2023-01-01",
93
+ "category": "Biochemistry",
94
+ "version": "2",
95
+ "doi": "10.1101/2023.01.01.123456",
96
+ }
97
+ ]
98
+ }
99
+
100
+ def test_initialization(self):
101
+ """Test BiorxivDownloader initialization."""
102
+ self.assertEqual(self.downloader.api_url, "https://api.biorxiv.org/details")
103
+ self.assertEqual(
104
+ self.downloader.pdf_url_template,
105
+ "https://www.biorxiv.org/content/{doi}v{version}.full.pdf",
106
+ )
107
+ self.assertEqual(self.downloader.user_agent, "test-agent")
108
+ self.assertEqual(self.downloader.cf_clearance_timeout, 10)
109
+ self.assertIsNotNone(self.downloader.get_scraper_public())
110
+
111
+ def test_fetch_metadata_success(self):
112
+ """Test successful metadata fetching from bioRxiv API."""
113
+ mock_scraper = Mock()
114
+ mock_response = Mock()
115
+ mock_response.json.return_value = self.sample_json_response
116
+ mock_response.raise_for_status = Mock()
117
+ mock_scraper.get.return_value = mock_response
118
+
119
+ # Mock the existing scraper
120
+ self.downloader.set_scraper(mock_scraper)
121
+
122
+ result = self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
123
+
124
+ # Verify API call
125
+ expected_url = "https://api.biorxiv.org/details/biorxiv/10.1101/2023.01.01.123456/na/json"
126
+ mock_scraper.get.assert_called_once_with(expected_url, timeout=30)
127
+ mock_response.raise_for_status.assert_called_once()
128
+
129
+ # Verify JSON parsing
130
+ self.assertEqual(result, self.sample_json_response)
131
+
132
def test_fetch_metadata_network_error(self):
    """A RequestException raised by the scraper propagates out of fetch_metadata."""
    failing_scraper = Mock()
    failing_scraper.get.side_effect = requests.RequestException("Network error")
    self.downloader.set_scraper(failing_scraper)

    self.assertRaises(
        requests.RequestException,
        self.downloader.fetch_metadata,
        "10.1101/2023.01.01.123456",
    )
140
+
141
def test_fetch_metadata_no_collection_data(self):
    """fetch_metadata raises RuntimeError when the JSON payload is empty."""
    empty_reply = Mock()
    empty_reply.json.return_value = {}  # payload without any collection entry
    empty_reply.raise_for_status = Mock()

    scraper = Mock()
    scraper.get.return_value = empty_reply
    self.downloader.set_scraper(scraper)

    with self.assertRaises(RuntimeError) as raised:
        self.downloader.fetch_metadata("10.1101/2023.01.01.123456")

    self.assertIn("No collection data found", str(raised.exception))
154
+
155
def test_construct_pdf_url_variants(self):
    """construct_pdf_url: versioned record, empty metadata, record without version."""
    doi = "10.1101/2023.01.01.123456"
    scenarios = [
        # Version is read from the metadata record.
        (
            self.sample_json_response,
            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v2.full.pdf",
        ),
        # No collection at all -> empty URL.
        ({}, ""),
        # Record lacks a version -> the expected URL carries v1.
        (
            {"collection": [{"title": "Test Paper"}]},
            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
        ),
    ]

    for metadata, expected_url in scenarios:
        self.assertEqual(
            self.downloader.construct_pdf_url(metadata, doi),
            expected_url,
        )
175
+
176
@patch("tempfile.NamedTemporaryFile")
def test_download_pdf_to_temp_success(self, mock_tempfile):
    """A successful download returns (temp path, filename) and writes every chunk."""
    doi = "10.1101/2023.01.01.123456"
    pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
    chunks = [
        b"PDF content chunk 1",
        b"PDF content chunk 2",
    ]

    # Reply for the landing-page request.
    landing_reply = Mock()
    landing_reply.raise_for_status = Mock()

    # Reply for the streamed PDF request.
    pdf_reply = Mock()
    pdf_reply.raise_for_status = Mock()
    pdf_reply.iter_content.return_value = chunks
    pdf_reply.headers = {"Content-Disposition": 'attachment; filename="paper.pdf"'}

    # The scraper hands out the two replies in request order.
    scraper = Mock()
    scraper.get.side_effect = [landing_reply, pdf_reply]
    self.downloader.set_scraper(scraper)

    # Stand-in for the temporary file, usable as a context manager.
    temp_file = Mock()
    temp_file.name = "/tmp/test.pdf"
    temp_file.__enter__ = Mock(return_value=temp_file)
    temp_file.__exit__ = Mock(return_value=None)
    mock_tempfile.return_value = temp_file

    outcome = self.downloader.download_pdf_to_temp(pdf_url, doi)

    self.assertEqual(outcome, ("/tmp/test.pdf", "paper.pdf"))

    # The landing page (URL without the .full.pdf suffix) was requested.
    scraper.get.assert_any_call(
        "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1",
        timeout=30,
    )
    # The PDF itself was requested as a stream.
    scraper.get.assert_any_call(pdf_url, timeout=30, stream=True)

    # Every chunk reached the temporary file.
    for chunk in chunks:
        temp_file.write.assert_any_call(chunk)
221
+
222
def test_download_pdf_to_temp_error_variants(self):
    """download_pdf_to_temp returns None for an empty URL and on a request failure."""
    doi = "10.1101/x"

    # Empty URL.
    self.assertIsNone(self.downloader.download_pdf_to_temp("", doi))

    # Scraper whose every request raises.
    broken_scraper = Mock()
    broken_scraper.get.side_effect = requests.RequestException("Network error")
    self.downloader.set_scraper(broken_scraper)

    outcome = self.downloader.download_pdf_to_temp(
        "https://www.biorxiv.org/content/10.1101/xv1.full.pdf", doi
    )
    self.assertIsNone(outcome)
233
+
234
@patch("cloudscraper.create_scraper")
def test_get_scraper_new_and_existing(self, mock_create):
    """_get_scraper creates when missing and reuses when present.

    The reuse half injects a scraper that is distinct from the object the
    patched factory returns, and checks that the factory is not invoked
    again. Re-injecting the factory's own return value would let the
    identity assertion pass even if a brand-new scraper were created.
    """
    # New scraper: nothing cached, so the factory must be called once.
    self.downloader.set_scraper(None)
    new_scraper = Mock()
    mock_create.return_value = new_scraper
    got = self.downloader.get_scraper_public()
    self.assertIs(got, new_scraper)
    mock_create.assert_called_once_with(browser={"custom": "test-agent"}, delay=10)

    # Existing scraper: clear the factory's call history (its return value
    # is kept), inject a different object, and expect that object back.
    mock_create.reset_mock()
    existing_scraper = Mock()
    self.downloader.set_scraper(existing_scraper)
    got2 = self.downloader.get_scraper_public()
    self.assertIs(got2, existing_scraper)
    mock_create.assert_not_called()
249
+
250
def test_visit_landing_page_variants(self):
    """The landing page is requested only when the URL ends in .full.pdf."""
    doi = "10.1101/2023.01.01.123456"

    page_reply = Mock()
    page_reply.raise_for_status = Mock()
    scraper = Mock()
    scraper.get.return_value = page_reply

    # With the .full.pdf suffix: the suffix-free landing URL is fetched.
    self.downloader.visit_landing_page_public(
        scraper,
        "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
        doi,
    )
    scraper.get.assert_called_with(
        "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1",
        timeout=30,
    )

    # Without the suffix: no request is made at all.
    scraper.get.reset_mock()
    self.downloader.visit_landing_page_public(
        scraper,
        "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1",
        doi,
    )
    scraper.get.assert_not_called()
272
+
273
@patch("tempfile.NamedTemporaryFile")
def test_save_pdf_to_temp(self, mock_tempfile):
    """Streaming a response to a temp file writes real chunks and skips None."""
    streamed = Mock()
    # The None entry must not be written.
    streamed.iter_content.return_value = [b"chunk1", b"chunk2", None, b"chunk3"]

    temp_file = Mock()
    temp_file.name = "/tmp/saved.pdf"
    temp_file.__enter__ = Mock(return_value=temp_file)
    temp_file.__exit__ = Mock(return_value=None)
    mock_tempfile.return_value = temp_file

    saved_path = self.downloader.save_pdf_to_temp_public(streamed)

    self.assertEqual(saved_path, "/tmp/saved.pdf")

    # Exactly the three non-None chunks were written.
    for payload in (b"chunk1", b"chunk2", b"chunk3"):
        temp_file.write.assert_any_call(payload)
    self.assertEqual(temp_file.write.call_count, 3)
299
+
300
def test_extract_filename_variants(self):
    """Filename extraction for several headers plus the failing-regex path."""
    # (response headers, expected filename, make re.search raise?)
    scenarios = [
        (
            {"Content-Disposition": 'attachment; filename="test-paper.pdf"'},
            "test-paper.pdf",
            False,
        ),
        ({}, "default.pdf", False),
        ({"Content-Disposition": "invalid header format"}, "default.pdf", False),
        # Valid header, but re.search is forced to raise.
        (
            {"Content-Disposition": 'attachment; filename="test.pdf"'},
            "default.pdf",
            True,
        ),
    ]

    for headers, expected, raise_regex in scenarios:
        with self.subTest(headers=headers, expected=expected, raise_regex=raise_regex):
            response = Mock()
            response.headers = headers

            # The fallback name is pinned in every scenario.
            default_name_patch = patch.object(
                self.downloader,
                "get_default_filename",
                return_value="default.pdf",
            )

            if raise_regex:
                failing_regex_patch = patch(
                    "re.search",
                    side_effect=requests.RequestException("Regex error"),
                )
                with failing_regex_patch, default_name_patch:
                    got = self.downloader.extract_filename_public(response, "10.1101/test")
            else:
                with default_name_patch:
                    got = self.downloader.extract_filename_public(response, "10.1101/test")

            self.assertEqual(got, expected)
339
+
340
def test_extract_paper_metadata_success(self):
    """extract_paper_metadata merges the API record with a successful download."""
    download = ("/tmp/paper.pdf", "biorxiv_paper.pdf")  # (temp path, filename)

    extracted = self.downloader.extract_paper_metadata(
        self.sample_json_response,
        "10.1101/2023.01.01.123456",
        download,
    )

    self.assertEqual(
        extracted,
        {
            "Title": "Test BioRxiv Paper",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for bioRxiv paper.",
            "Publication Date": "2023-01-01",
            "DOI": "10.1101/2023.01.01.123456",
            "Category": "Biochemistry",
            "Version": "2",
            "source": "biorxiv",
            "server": "biorxiv",
            "URL": "/tmp/paper.pdf",
            "pdf_url": "/tmp/paper.pdf",
            "filename": "biorxiv_paper.pdf",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/paper.pdf",
        },
    )
367
+
368
def test_extract_paper_metadata_no_pdf_result(self):
    """Without a download result the record is marked download_failed."""
    extracted = self.downloader.extract_paper_metadata(
        self.sample_json_response,
        "10.1101/2023.01.01.123456",
        None,  # no PDF download result
    )

    # Basic metadata is still present; path fields are blank and the
    # filename is the DOI-derived default.
    expected_fields = {
        "Title": "Test BioRxiv Paper",
        "access_type": "download_failed",
        "URL": "",
        "pdf_url": "",
        "temp_file_path": "",
        "filename": "10_1101_2023_01_01_123456.pdf",
    }
    for field, value in expected_fields.items():
        self.assertEqual(extracted[field], value)
384
+
385
def test_extract_paper_metadata_no_collection(self):
    """extract_paper_metadata raises RuntimeError for metadata with no collection."""
    with self.assertRaises(RuntimeError) as raised:
        self.downloader.extract_paper_metadata({}, "10.1101/2023.01.01.123456", None)

    message = str(raised.exception)
    self.assertIn("No collection data found", message)
393
+
394
def test_extract_basic_metadata(self):
    """The basic-metadata helper maps one API record to the normalized fields."""
    record = self.sample_json_response["collection"][0]

    extracted = self.downloader.extract_basic_metadata_public(
        record, "10.1101/2023.01.01.123456"
    )

    self.assertEqual(
        extracted,
        {
            "Title": "Test BioRxiv Paper",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for bioRxiv paper.",
            "Publication Date": "2023-01-01",
            "DOI": "10.1101/2023.01.01.123456",
            "Category": "Biochemistry",
            "Version": "2",
            "source": "biorxiv",
            "server": "biorxiv",
        },
    )
413
+
414
def test_extract_authors_variants(self):
    """Author parsing splits a semicolon-separated string; empty input gives []."""
    parse = self.downloader.extract_authors_public

    parsed = parse("John Doe; Jane Smith; Bob Johnson")
    self.assertEqual(parsed, ["John Doe", "Jane Smith", "Bob Johnson"])

    self.assertEqual(parse(""), [])
421
+
422
def test_service_and_identifier_helpers(self):
    """Check the service name, the identifier name and the DOI-derived filename."""
    downloader = self.downloader

    self.assertEqual(downloader.get_service_name(), "bioRxiv")
    self.assertEqual(downloader.get_identifier_name(), "DOI")

    default_name = downloader.get_default_filename("10.1101/2023.01.01.123456")
    self.assertEqual(default_name, "10_1101_2023_01_01_123456.pdf")
430
+
431
def test_get_paper_identifier_info(self):
    """The identifier summary mentions the DOI, publication date and category."""
    summary = self.downloader.get_paper_identifier_info_public(
        {
            "DOI": "10.1101/2023.01.01.123456",
            "Publication Date": "2023-01-01",
            "Category": "Biology",
        }
    )

    for fragment in ("10.1101/2023.01.01.123456", "2023-01-01", "Biology"):
        self.assertIn(fragment, summary)
444
+
445
def test_add_service_identifier(self):
    """The helper writes the DOI and the server name into the given entry."""
    record = {}

    # The helper mutates the dict in place; its return value is not used.
    self.downloader.add_service_identifier_public(record, "10.1101/2023.01.01.123456")

    self.assertEqual(record["DOI"], "10.1101/2023.01.01.123456")
    self.assertEqual(record["server"], "biorxiv")
453
+
454
+
455
class TestBiorxivDownloaderIntegration(unittest.TestCase):
    """Workflow tests that chain the downloader's public steps together."""

    @patch("cloudscraper.create_scraper")
    def setUp(self, mock_create_scraper):
        """Build a shim downloader from a stub config and a canned API payload."""
        # Scraper creation during construction is served by the patched factory.
        mock_create_scraper.return_value = Mock()

        config = Mock()
        config.api_url = "https://api.biorxiv.org/details"
        config.pdf_url_template = "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
        config.user_agent = "test-agent"
        config.cf_clearance_timeout = 10
        config.request_timeout = 30
        config.chunk_size = 8192
        config.session_reuse = True
        config.default_version = "1"
        config.browser_config = {"type": "custom"}
        self.mock_config = config

        self.downloader = BiorxivDownloaderTestShim(self.mock_config)

        self.sample_response = {
            "collection": [
                {
                    "title": "Integration Test Paper",
                    "authors": "Test Author",
                    "abstract": "Integration test abstract.",
                    "date": "2023-01-01",
                    "category": "Biology",
                    "version": "1",
                    "doi": "10.1101/2023.01.01.123456",
                }
            ]
        }

    @patch("tempfile.NamedTemporaryFile")
    def test_full_paper_processing_workflow(self, mock_tempfile):
        """Run DOI -> metadata -> PDF URL -> download -> paper record end to end."""
        doi = "10.1101/2023.01.01.123456"

        # Reply to the metadata request.
        metadata_reply = Mock()
        metadata_reply.json.return_value = self.sample_response
        metadata_reply.raise_for_status = Mock()

        # Reply to the landing-page request.
        landing_reply = Mock()
        landing_reply.raise_for_status = Mock()

        # Reply to the PDF request; no headers, so no filename is supplied.
        pdf_reply = Mock()
        pdf_reply.raise_for_status = Mock()
        pdf_reply.iter_content.return_value = [b"PDF data"]
        pdf_reply.headers = {}

        # Replies are consumed in order: metadata, landing page, PDF.
        scraper = Mock()
        scraper.get.side_effect = [metadata_reply, landing_reply, pdf_reply]
        self.downloader.set_scraper(scraper)

        # Stand-in temporary file, usable as a context manager.
        temp_file = Mock()
        temp_file.name = "/tmp/integration.pdf"
        temp_file.__enter__ = Mock(return_value=temp_file)
        temp_file.__exit__ = Mock(return_value=None)
        mock_tempfile.return_value = temp_file

        # The four workflow steps, each feeding the next.
        metadata = self.downloader.fetch_metadata(doi)
        pdf_url = self.downloader.construct_pdf_url(metadata, doi)
        pdf_result = self.downloader.download_pdf_to_temp(pdf_url, doi)
        paper_data = self.downloader.extract_paper_metadata(metadata, doi, pdf_result)

        # Final record.
        self.assertEqual(paper_data["Title"], "Integration Test Paper")
        self.assertEqual(paper_data["Authors"], ["Test Author"])
        self.assertEqual(paper_data["access_type"], "open_access_downloaded")
        self.assertEqual(paper_data["temp_file_path"], "/tmp/integration.pdf")

        # Intermediate PDF URL.
        self.assertEqual(
            pdf_url,
            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
        )

        # One request each for metadata, landing page and PDF.
        self.assertEqual(scraper.get.call_count, 3)

    def test_workflow_with_existing_scraper(self):
        """An injected scraper serves both the metadata and the download requests."""
        doi = "10.1101/2023.01.01.123456"

        # Phase 1: every request answers with the metadata payload.
        metadata_reply = Mock()
        metadata_reply.json.return_value = self.sample_response
        metadata_reply.raise_for_status = Mock()

        scraper = Mock()
        scraper.get.return_value = metadata_reply
        self.downloader.set_scraper(scraper)

        metadata = self.downloader.fetch_metadata(doi)
        pdf_url = self.downloader.construct_pdf_url(metadata, doi)

        # Phase 2: replies for the landing page and the PDF.
        landing_reply = Mock()
        landing_reply.raise_for_status = Mock()
        pdf_reply = Mock()
        pdf_reply.raise_for_status = Mock()
        pdf_reply.iter_content.return_value = [b"data"]
        pdf_reply.headers = {}

        with patch("tempfile.NamedTemporaryFile"):
            # Forget the metadata call so only download requests are counted.
            scraper.reset_mock()
            scraper.get.side_effect = [landing_reply, pdf_reply]

            self.downloader.download_pdf_to_temp(pdf_url, doi)

        # Landing page + PDF = two requests on the same scraper.
        self.assertEqual(scraper.get.call_count, 2)
587
+
588
+
589
+ class TestBiorxivCloudFlareHandling(unittest.TestCase):
590
+ """Tests specific to CloudFlare protection handling."""
591
+
592
@patch("cloudscraper.create_scraper")
def setUp(self, mock_create_scraper):
    """Build a shim downloader whose config uses a browser-like user agent."""
    # Scraper creation during construction is served by the patched factory.
    mock_create_scraper.return_value = Mock()

    config = Mock()
    config.api_url = "https://api.biorxiv.org/details"
    config.pdf_url_template = "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
    config.user_agent = "Mozilla/5.0 (compatible; test-agent)"
    config.cf_clearance_timeout = 15
    config.request_timeout = 30
    config.chunk_size = 8192
    config.session_reuse = True
    config.default_version = "1"
    config.browser_config = {"type": "custom"}
    self.mock_config = config

    self.downloader = BiorxivDownloaderTestShim(self.mock_config)
613
+
614
@patch("cloudscraper.create_scraper")
def test_cloudscraper_configuration(self, mock_create_scraper):
    """A fresh scraper is created with the configured user agent and delay."""
    fresh_scraper = Mock()
    mock_create_scraper.return_value = fresh_scraper

    # Drop the cached scraper so the next lookup has to create one.
    self.downloader.set_scraper(None)

    obtained = self.downloader.get_scraper_public()

    mock_create_scraper.assert_called_once_with(
        browser={"custom": "Mozilla/5.0 (compatible; test-agent)"},
        delay=15,
    )
    self.assertEqual(obtained, fresh_scraper)
628
+
629
+ @patch("tempfile.NamedTemporaryFile")
630
+ def test_landing_page_visit_before_pdf_download(self, mock_tempfile):
631
+ """Test that landing page is visited before PDF download for CloudFlare bypass."""
632
+ mock_scraper = Mock()
633
+ self.downloader.set_scraper(mock_scraper)
634
+
635
+ # Mock responses
636
+ mock_landing_response = Mock()
637
+ mock_landing_response.raise_for_status = Mock()
638
+
639
+ mock_pdf_response = Mock()
640
+ mock_pdf_response.raise_for_status = Mock()
641
+ mock_pdf_response.iter_content.return_value = [b"PDF content"]
642
+ mock_pdf_response.headers = {}
643
+
644
+ mock_scraper.get.side_effect = [mock_landing_response, mock_pdf_response]
645
+
646
+ # Mock temp file
647
+ mock_temp_file = Mock()
648
+ mock_temp_file.name = "/tmp/test.pdf"
649
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
650
+ mock_temp_file.__exit__ = Mock(return_value=None)
651
+ mock_tempfile.return_value = mock_temp_file
652
+
653
+ pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
654
+ self.downloader.download_pdf_to_temp(pdf_url, "10.1101/2023.01.01.123456")
655
+
656
+ # Verify landing page was visited first
657
+ landing_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
658
+
659
+ calls = mock_scraper.get.call_args_list
660
+ self.assertEqual(len(calls), 2)
661
+
662
+ # First call should be to landing page
663
+ self.assertEqual(calls[0][0][0], landing_url)
664
+ self.assertEqual(calls[0][1]["timeout"], 30)
665
+
666
+ # Second call should be to PDF URL
667
+ self.assertEqual(calls[1][0][0], pdf_url)
668
+ self.assertEqual(calls[1][1]["timeout"], 30)
669
+ self.assertEqual(calls[1][1]["stream"], True)