aiagents4pharma 1.37.0__py3-none-any.whl → 1.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +12 -4
  2. aiagents4pharma/talk2scholars/configs/config.yaml +2 -0
  3. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +3 -0
  4. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +3 -0
  5. aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +1 -0
  6. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +33 -7
  7. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +151 -0
  8. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +151 -0
  9. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +59 -3
  10. aiagents4pharma/talk2scholars/tests/test_read_helper_utils.py +110 -0
  11. aiagents4pharma/talk2scholars/tests/test_s2_display.py +20 -1
  12. aiagents4pharma/talk2scholars/tests/test_s2_query.py +17 -0
  13. aiagents4pharma/talk2scholars/tests/test_state.py +25 -1
  14. aiagents4pharma/talk2scholars/tests/test_zotero_pdf_downloader_utils.py +46 -0
  15. aiagents4pharma/talk2scholars/tests/test_zotero_read.py +35 -40
  16. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +4 -1
  17. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +112 -0
  18. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +112 -0
  19. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +82 -41
  20. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +6 -2
  21. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +2 -1
  22. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +7 -3
  23. aiagents4pharma/talk2scholars/tools/s2/search.py +2 -1
  24. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +2 -1
  25. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +2 -0
  26. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +2 -0
  27. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +2 -0
  28. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +79 -136
  29. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +147 -0
  30. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +42 -9
  31. {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/METADATA +1 -1
  32. {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/RECORD +35 -26
  33. {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/WHEEL +1 -1
  34. {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/licenses/LICENSE +0 -0
  35. {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py

@@ -1,8 +1,10 @@
 """
-Tool for performing Q&A on PDF documents using retrieval augmented generation.
-This module provides functionality to load PDFs from URLs, split them into
-chunks, retrieve relevant segments via semantic search, and generate answers
-to user-provided questions using a language model chain.
+PDF Question & Answer Tool
+
+This LangGraph tool answers user questions by leveraging a pre-built FAISS vector store
+of embedded PDF document chunks. Given a question, it retrieves the most relevant text
+segments from the loaded PDFs, invokes an LLM for answer generation, and returns the
+response with source attribution.
 """
 
 import logging
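
The updated module docstring assumes a FAISS vector store of embedded PDF chunks already exists. As a rough illustration of how such a store can be pre-built with LangChain (this helper is not part of the package; the loader, splitter, and embedding choices are assumptions):

# Illustrative sketch only -- not package code. Pre-build a FAISS store from PDF chunks.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

def build_faiss_store(pdf_paths):
    """Load PDFs, split them into chunks, and embed the chunks into a FAISS index."""
    docs = []
    for path in pdf_paths:
        docs.extend(PyPDFLoader(path).load())
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    return FAISS.from_documents(splitter.split_documents(docs), OpenAIEmbeddings())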
@@ -52,19 +54,18 @@ def load_hydra_config() -> Any:
 
 class QuestionAndAnswerInput(BaseModel):
     """
-    Input schema for the PDF Question and Answer tool.
-
-    This schema defines the inputs required for querying academic or research-related
-    PDFs to answer a specific question using a language model and document retrieval.
+    Input schema for the PDF Q&A tool.
 
     Attributes:
-        question (str): The question to ask regarding the PDF content.
-        paper_ids (Optional[List[str]]): Optional list of specific paper IDs to query.
-            If not provided, the system will determine relevant papers automatically.
-        use_all_papers (bool): Whether to use all available papers for answering the question.
-            If True, the system will include all loaded papers regardless of relevance filtering.
-        tool_call_id (str): Unique identifier for the tool call, injected automatically.
-        state (dict): Shared application state, injected automatically.
+        question (str): Free-text question to answer based on PDF content.
+        paper_ids (Optional[List[str]]): If provided, restricts retrieval to these paper IDs.
+        use_all_papers (bool): If True, include all loaded papers without semantic ranking.
+        tool_call_id (str): Internal ID injected by LangGraph for this tool call.
+        state (dict): Shared agent state containing:
+            - 'article_data': dict of paper metadata with 'pdf_url' keys
+            - 'text_embedding_model': embedding model instance
+            - 'llm_model': chat/LLM instance
+            - 'vector_store': pre-built Vectorstore for retrieval
     """
 
     question: str = Field(description="The question to ask regarding the PDF content.")
@@ -119,6 +120,8 @@ class Vectorstore:
         self.documents: Dict[str, Document] = {}
         self.vector_store: Optional[VectorStore] = None
         self.paper_metadata: Dict[str, Dict[str, Any]] = {}
+        # Cache for document chunk embeddings to avoid recomputation
+        self.embeddings: Dict[str, Any] = {}
 
     def add_paper(
         self,
@@ -160,6 +163,10 @@ class Vectorstore:
         # Split documents and add metadata for each chunk
         chunks = splitter.split_documents(documents)
         logger.info("Split %s into %d chunks", paper_id, len(chunks))
+        # Embed and cache chunk embeddings
+        chunk_texts = [chunk.page_content for chunk in chunks]
+        chunk_embeddings = self.embedding_model.embed_documents(chunk_texts)
+        logger.info("Embedded %d chunks for paper %s", len(chunk_embeddings), paper_id)
 
         # Enhance document metadata
         for i, chunk in enumerate(chunks):
@@ -182,6 +189,9 @@ class Vectorstore:
            # Store chunk
            doc_id = f"{paper_id}_{i}"
            self.documents[doc_id] = chunk
+           # Cache embedding if available
+           if chunk_embeddings[i] is not None:
+               self.embeddings[doc_id] = chunk_embeddings[i]
 
        # Mark as loaded to prevent duplicate loading
        self.loaded_papers.add(paper_id)
@@ -295,12 +305,16 @@ class Vectorstore:
            logger.warning("No documents found after filtering by paper_ids.")
            return []
 
-       texts = [doc.page_content for doc in all_docs]
-
-       # Step 3: Batch embed all documents
-       logger.info("Starting batch embedding for %d chunks...", len(texts))
-       all_embeddings = self.embedding_model.embed_documents(texts)
-       logger.info("Completed embedding for %d chunks...", len(texts))
+       # Step 3: Retrieve or compute embeddings for all documents using cache
+       logger.info("Retrieving embeddings for %d chunks...", len(all_docs))
+       all_embeddings = []
+       for doc in all_docs:
+           doc_id = f"{doc.metadata['paper_id']}_{doc.metadata['chunk_id']}"
+           if doc_id not in self.embeddings:
+               logger.info("Embedding missing chunk %s", doc_id)
+               emb = self.embedding_model.embed_documents([doc.page_content])[0]
+               self.embeddings[doc_id] = emb
+           all_embeddings.append(self.embeddings[doc_id])
 
        # Step 4: Apply MMR
        mmr_indices = maximal_marginal_relevance(
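
The hunks above cache each chunk's embedding under a "{paper_id}_{chunk_id}" key when a paper is added, and the retrieval step reuses that cache so only missing chunks are re-embedded before MMR ranking. A minimal standalone sketch of the cache-or-compute pattern (embed_texts stands in for the embedding model's embed_documents; the names are illustrative, not package code):

# Illustrative sketch of the cache-or-compute pattern used by Vectorstore above.
from typing import Callable, Dict, List

embedding_cache: Dict[str, List[float]] = {}

def get_chunk_embedding(
    paper_id: str,
    chunk_id: int,
    text: str,
    embed_texts: Callable[[List[str]], List[List[float]]],
) -> List[float]:
    """Return the cached embedding for a chunk, computing it only on a cache miss."""
    doc_id = f"{paper_id}_{chunk_id}"
    if doc_id not in embedding_cache:
        embedding_cache[doc_id] = embed_texts([text])[0]  # embed only the missing chunk
    return embedding_cache[doc_id]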
@@ -392,6 +406,10 @@ def generate_answer(
    }
 
 
+# Shared pre-built Vectorstore for RAG (set externally, e.g., by Streamlit startup)
+prebuilt_vector_store: Optional[Vectorstore] = None
+
+
 @tool(args_schema=QuestionAndAnswerInput, parse_docstring=True)
 def question_and_answer(
     question: str,
@@ -401,30 +419,29 @@ def question_and_answer(
     use_all_papers: bool = False,
 ) -> Command[Any]:
     """
-    Answer a question using PDF content with advanced retrieval augmented generation.
+    Generate an answer to a user question using Retrieval-Augmented Generation (RAG) over PDFs.
 
-    This tool retrieves PDF documents from URLs, processes them using semantic search,
-    and generates an answer to the user's question based on the most relevant content.
-    It can work with multiple papers simultaneously and provides source attribution.
+    This tool expects that a FAISS vector store of PDF document chunks has already been built
+    and stored in shared state. It retrieves the most relevant chunks for the input question,
+    invokes an LLM to craft a response, and returns the answer with source attribution.
 
     Args:
-        question (str): The question to answer based on PDF content.
-        paper_ids (Optional[List[str]]): Optional list of specific paper IDs to query.
-        use_all_papers (bool): Whether to use all available papers.
-        tool_call_id (str): Unique identifier for the current tool call.
-        state (dict): Current state dictionary containing article data and required models.
-            Expected keys:
-            "article_data": Dictionary containing article metadata including PDF URLs
-            "text_embedding_model": Model for generating embeddings
-            "llm_model": Language model for generating answers
-            "vector_store": Optional Vectorstore instance
+        question (str): The free-text question to answer.
+        state (dict): Injected agent state mapping that must include:
+            - 'article_data': mapping of paper IDs to metadata (including 'pdf_url')
+            - 'text_embedding_model': the embedding model instance
+            - 'llm_model': the chat/LLM instance
+        tool_call_id (str): Internal identifier for this tool call.
+        paper_ids (Optional[List[str]]): Specific paper IDs to restrict retrieval (default: None).
+        use_all_papers (bool): If True, bypasses semantic ranking and includes all papers.
 
     Returns:
-        Dict[str, Any]: A dictionary wrapped in a Command that updates the conversation
-            with either the answer or an error message.
+        Command[Any]: A LangGraph Command that updates the conversation state:
+            - 'messages': a single ToolMessage containing the generated answer text.
 
     Raises:
-        ValueError: If required components are missing or if PDF processing fails.
+        ValueError: If required models or 'article_data' are missing from state.
+        RuntimeError: If no relevant document chunks can be retrieved.
     """
     # Load configuration
     config = load_hydra_config()
@@ -456,8 +473,13 @@ def question_and_answer(
        logger.error("%s: %s", call_id, error_msg)
        raise ValueError(error_msg)
 
-    # Always use a fresh in-memory document store for this Q&A call
-    vector_store = Vectorstore(embedding_model=text_embedding_model)
+    # Use shared pre-built Vectorstore if provided, else create a new one
+    if prebuilt_vector_store is not None:
+        vector_store = prebuilt_vector_store
+        logger.info("Using shared pre-built vector store from the memory")
+    else:
+        vector_store = Vectorstore(embedding_model=text_embedding_model)
+        logger.info("Initialized new vector store (no pre-built store found)")
 
     # Check if there are papers from different sources
     has_uploaded_papers = any(
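
The module-level prebuilt_vector_store added above is meant to be assigned by the host application before the tool runs, so the per-call store construction can be skipped. A hedged wiring sketch (my_embedding_model and the paper-loading step are placeholders; only the module path, the Vectorstore constructor, and the prebuilt_vector_store attribute come from this diff):

# Illustrative wiring sketch -- e.g., at Streamlit startup.
from aiagents4pharma.talk2scholars.tools.pdf import question_and_answer as qa

store = qa.Vectorstore(embedding_model=my_embedding_model)  # my_embedding_model: placeholder
# ... add papers to `store` here (see Vectorstore.add_paper above) ...
qa.prebuilt_vector_store = store  # question_and_answer() will now reuse this shared store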
@@ -478,8 +500,27 @@
        if isinstance(paper, dict)
    )
 
+   has_biorxiv_papers = any(
+       paper.get("source") == "biorxiv"
+       for paper in article_data.values()
+       if isinstance(paper, dict)
+   )
+
+   has_medrxiv_papers = any(
+       paper.get("source") == "medrxiv"
+       for paper in article_data.values()
+       if isinstance(paper, dict)
+   )
+
    # Choose papers to use
    selected_paper_ids = []
+   has_combimed_papers = (
+       has_uploaded_papers
+       or has_zotero_papers
+       or has_arxiv_papers
+       or has_biorxiv_papers
+       or has_medrxiv_papers
+   )
 
    if paper_ids:
        # Use explicitly specified papers
@@ -493,7 +534,7 @@
            "%s: None of the provided paper_ids %s were found", call_id, paper_ids
        )
 
-   elif use_all_papers or has_uploaded_papers or has_zotero_papers or has_arxiv_papers:
+   elif use_all_papers or has_combimed_papers:
        # Use all available papers if explicitly requested or if we have papers from any source
        selected_paper_ids = list(article_data.keys())
        logger.info(
aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py

@@ -66,8 +66,12 @@ def display_dataframe(
        NoPapersFoundError: If no entries exist under 'last_displayed_papers' in state.
    """
    logger.info("Displaying papers")
-   context_key = state.get("last_displayed_papers")
-   artifact = state.get(context_key)
+   context_val = state.get("last_displayed_papers")
+   # Support both key reference (str) and direct mapping
+   if isinstance(context_val, dict):
+       artifact = context_val
+   else:
+       artifact = state.get(context_val)
    if not artifact:
        logger.info("No papers found in state, raising NoPapersFoundError")
        raise NoPapersFoundError(
aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py

@@ -71,7 +71,8 @@ def get_multi_paper_recommendations(
    return Command(
        update={
            "multi_papers": results["papers"],
-           "last_displayed_papers": "multi_papers",
+           # Store the latest multi-paper results mapping directly for display
+           "last_displayed_papers": results["papers"],
            "messages": [
                ToolMessage(
                    content=results["content"],
aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py

@@ -49,13 +49,17 @@ def query_dataframe(question: str, state: Annotated[dict, InjectedState]) -> str
    """
    logger.info("Querying last displayed papers with question: %s", question)
    llm_model = state.get("llm_model")
-   if not state.get("last_displayed_papers"):
+   context_val = state.get("last_displayed_papers")
+   if not context_val:
        logger.info("No papers displayed so far, raising NoPapersFoundError")
        raise NoPapersFoundError(
            "No papers found. A search needs to be performed first."
        )
-   context_key = state.get("last_displayed_papers")
-   dic_papers = state.get(context_key)
+   # Support both key reference (str) and direct mapping
+   if isinstance(context_val, dict):
+       dic_papers = context_val
+   else:
+       dic_papers = state.get(context_val)
    df_papers = pd.DataFrame.from_dict(dic_papers, orient="index")
    df_agent = create_pandas_dataframe_agent(
        llm_model,
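
In both tools above, 'last_displayed_papers' may now hold either the papers mapping itself (the new behavior written by the search and recommendation tools) or, for backward compatibility, the name of another state key. A small sketch of that dual-form lookup (resolve_last_displayed is an illustrative name, not a package function):

# Illustrative sketch of the backward-compatible lookup shown in the hunks above.
def resolve_last_displayed(state: dict) -> dict:
    context_val = state.get("last_displayed_papers")
    if isinstance(context_val, dict):
        return context_val                   # new style: papers stored directly
    return state.get(context_val) or {}      # old style: follow the key reference

assert resolve_last_displayed({"last_displayed_papers": {"p1": {"Title": "A"}}})["p1"]["Title"] == "A"
assert resolve_last_displayed({"last_displayed_papers": "papers", "papers": {"p2": {}}}) == {"p2": {}}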
aiagents4pharma/talk2scholars/tools/s2/search.py

@@ -65,7 +65,8 @@ def search_tool(
    return Command(
        update={
            "papers": results["papers"],
-           "last_displayed_papers": "papers",
+           # Store the latest results mapping directly for display
+           "last_displayed_papers": results["papers"],
            "messages": [
                ToolMessage(
                    content=results["content"],
aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py

@@ -69,7 +69,8 @@ def get_single_paper_recommendations(
    return Command(
        update={
            "papers": results["papers"],
-           "last_displayed_papers": "papers",
+           # Store the latest single-paper results mapping directly for display
+           "last_displayed_papers": results["papers"],
            "messages": [
                ToolMessage(
                    content=results["content"],
aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py

@@ -143,6 +143,7 @@ class MultiPaperRecData:
                ],
                "URL": paper.get("url", "N/A"),
                "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
+               "doi": paper.get("externalIds", {}).get("DOI", "N/A"),
            }
            for paper in self.recommendations
            if paper.get("title") and paper.get("authors")
@@ -158,6 +159,7 @@ class MultiPaperRecData:
                f"{i+1}. {paper['Title']} ({paper['Year']}; "
                f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
                f"arXiv ID: {paper['arxiv_id']})"
+               f"doi: {paper['doi']})"
                for i, paper in enumerate(top_papers)
            ]
        )
aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py

@@ -125,6 +125,7 @@ class SearchData:
                ],
                "URL": paper.get("url", "N/A"),
                "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
+               "doi": paper.get("externalIds", {}).get("DOI", "N/A"),
            }
            for paper in self.papers
            if paper.get("title") and paper.get("authors")
@@ -140,6 +141,7 @@ class SearchData:
                f"{i+1}. {paper['Title']} ({paper['Year']}; "
                f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
                f"arXiv ID: {paper['arxiv_id']})"
+               f"doi: {paper['doi']})"
                for i, paper in enumerate(top_papers)
            ]
        )
aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py

@@ -136,6 +136,7 @@ class SinglePaperRecData:
                ],
                "URL": paper.get("url", "N/A"),
                "arxiv_id": paper.get("externalIds", {}).get("ArXiv", "N/A"),
+               "doi": paper.get("externalIds", {}).get("DOI", "N/A"),
            }
            for paper in self.recommendations
            if paper.get("title") and paper.get("authors")
@@ -151,6 +152,7 @@ class SinglePaperRecData:
                f"{i+1}. {paper['Title']} ({paper['Year']}; "
                f"semantic_scholar_paper_id: {paper['semantic_scholar_paper_id']}; "
                f"arXiv ID: {paper['arxiv_id']})"
+               f"doi: {paper['doi']})"
                for i, paper in enumerate(top_papers)
            ]
        )
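
The three helper hunks above add a DOI column pulled from Semantic Scholar's externalIds field. For example (the paper record below is made-up sample data, mirroring the access pattern in the diff):

# Sample extraction mirroring the helper changes above; the record is illustrative.
paper = {"title": "Example", "externalIds": {"ArXiv": "2401.00001", "DOI": "10.1000/example.doi"}}
doi = paper.get("externalIds", {}).get("DOI", "N/A")
print(doi)  # -> 10.1000/example.doi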
aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py

@@ -5,15 +5,14 @@ Utility for zotero read tool.
 """
 
 import logging
-import tempfile
-from typing import Any, Dict, List, Tuple, Optional
-import concurrent.futures
+from typing import Any, Dict, List
 
 import hydra
 import requests
 from pyzotero import zotero
 
 from .zotero_path import get_item_collections
+from .zotero_pdf_downloader import download_pdfs_in_parallel
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -30,12 +29,14 @@ class ZoteroSearchData:
        query: str,
        only_articles: bool,
        limit: int,
-       tool_call_id: str,
+       download_pdfs: bool = True,
+       **_kwargs,
    ):
        self.query = query
        self.only_articles = only_articles
        self.limit = limit
-       self.tool_call_id = tool_call_id
+       # Control whether to fetch PDF attachments now
+       self.download_pdfs = download_pdfs
        self.cfg = self._load_config()
        self.zot = self._init_zotero_client()
        self.item_to_collections = get_item_collections(self.zot)
@@ -105,89 +106,75 @@ class ZoteroSearchData:
 
        return items
 
-   def _download_zotero_pdf(self, attachment_key: str) -> Optional[Tuple[str, str]]:
-       """Download a PDF from Zotero by attachment key. Returns (file_path, filename) or None."""
-       zotero_pdf_url = (
-           f"https://api.zotero.org/users/{self.cfg.user_id}/items/"
-           f"{attachment_key}/file"
-       )
-       headers = {"Zotero-API-Key": self.cfg.api_key}
+   def _collect_item_attachments(self) -> Dict[str, str]:
+       """Collect PDF attachment keys for non-orphan items."""
+       item_attachments: Dict[str, str] = {}
+       for item_key, item_data in self.article_data.items():
+           if item_data.get("Type") == "orphan_attachment":
+               continue
+           try:
+               children = self.zot.children(item_key)
+               for child in children:
+                   data = child.get("data", {})
+                   if data.get("contentType") == "application/pdf":
+                       attachment_key = data.get("key")
+                       filename = data.get("filename", "unknown.pdf")
+                       if attachment_key:
+                           item_attachments[attachment_key] = item_key
+                           self.article_data[item_key]["filename"] = filename
+                       break
+           except Exception as e:
+               logger.error("Failed to get attachments for item %s: %s", item_key, e)
+       return item_attachments
+
+   def _process_orphaned_pdfs(self, orphaned_pdfs: Dict[str, str]) -> None:
+       """Download or record orphaned PDF attachments."""
+       if self.download_pdfs:
+           logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
+           results = download_pdfs_in_parallel(
+               self.session,
+               self.cfg.user_id,
+               self.cfg.api_key,
+               orphaned_pdfs,
+               chunk_size=getattr(self.cfg, "chunk_size", None),
+           )
+           for item_key, (file_path, filename, attachment_key) in results.items():
+               self.article_data[item_key]["filename"] = filename
+               self.article_data[item_key]["pdf_url"] = file_path
+               self.article_data[item_key]["attachment_key"] = attachment_key
+               logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
+       else:
+           logger.info("Skipping orphaned PDF downloads (download_pdfs=False)")
+           for attachment_key in orphaned_pdfs:
+               self.article_data[attachment_key]["attachment_key"] = attachment_key
+               self.article_data[attachment_key]["filename"] = (
+                   self.article_data[attachment_key].get("Title", attachment_key)
+               )
 
-       try:
-           # Use session for connection pooling
-           response = self.session.get(
-               zotero_pdf_url, headers=headers, stream=True, timeout=10
+   def _process_item_pdfs(self, item_attachments: Dict[str, str]) -> None:
+       """Download or record regular item PDF attachments."""
+       if self.download_pdfs:
+           logger.info(
+               "Downloading %d regular item PDFs in parallel", len(item_attachments)
            )
-           response.raise_for_status()
-
-           with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-               # Increased chunk size for better performance
-               for chunk in response.iter_content(chunk_size=16384):
-                   temp_file.write(chunk)
-               temp_file_path = temp_file.name
-
-           content_disp = response.headers.get("Content-Disposition", "")
-           filename = (
-               content_disp.split("filename=")[-1].strip('"')
-               if "filename=" in content_disp
-               else "downloaded.pdf"
+           results = download_pdfs_in_parallel(
+               self.session,
+               self.cfg.user_id,
+               self.cfg.api_key,
+               item_attachments,
+               chunk_size=getattr(self.cfg, "chunk_size", None),
            )
+       else:
+           logger.info("Skipping regular PDF downloads (download_pdfs=False)")
+           results = {}
+           for attachment_key, item_key in item_attachments.items():
+               self.article_data[item_key]["attachment_key"] = attachment_key
+       for item_key, (file_path, filename, attachment_key) in results.items():
+           self.article_data[item_key]["filename"] = filename
+           self.article_data[item_key]["pdf_url"] = file_path
+           self.article_data[item_key]["attachment_key"] = attachment_key
+           logger.info("Downloaded Zotero PDF to: %s", file_path)
 
-           return temp_file_path, filename
-
-       except Exception as e:
-           logger.error(
-               "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
-           )
-           return None
-
-   def _download_pdfs_in_parallel(
-       self, attachment_item_map: Dict[str, str]
-   ) -> Dict[str, Tuple[str, str, str]]:
-       """
-       Download multiple PDFs in parallel using ThreadPoolExecutor.
-
-       Args:
-           attachment_item_map: Dictionary mapping attachment keys to parent item keys
-
-       Returns:
-           Dictionary mapping parent item keys to (file_path, filename, attachment_key)
-       """
-       results = {}
-       max_workers = min(10, len(attachment_item_map))  # Set reasonable limit
-
-       if not attachment_item_map:
-           return results
-
-       with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-           # Create a dictionary mapping Future objects to attachment keys
-           future_to_key = {
-               executor.submit(self._download_zotero_pdf, attachment_key): (
-                   attachment_key,
-                   item_key,
-               )
-               for attachment_key, item_key in attachment_item_map.items()
-           }
-
-           for future in concurrent.futures.as_completed(future_to_key):
-               attachment_key, item_key = future_to_key[future]
-               try:
-                   result = future.result()
-                   if result:
-                       temp_file_path, resolved_filename = result
-                       results[item_key] = (
-                           temp_file_path,
-                           resolved_filename,
-                           attachment_key,
-                       )
-               except Exception as e:
-                   logger.error(
-                       "Failed to download PDF for key %s: %s", attachment_key, e
-                   )
-
-       return results
-
-   # pylint: disable=too-many-locals, too-many-branches
    def _filter_and_format_papers(self, items: List[Dict[str, Any]]) -> None:
        """Filter and format papers from Zotero items, including standalone PDFs."""
        filter_item_types = (
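
The in-class download helpers removed above now live in the new zotero_pdf_downloader module (file 29 in the list) and are invoked as download_pdfs_in_parallel(session, user_id, api_key, attachment_item_map, chunk_size=...), returning item_key -> (file_path, filename, attachment_key). Below is a hedged sketch of what such a helper could look like, reconstructed from the removed code and the call sites; the shipped module may differ.

# Illustrative reconstruction -- not the shipped zotero_pdf_downloader module.
import concurrent.futures
import tempfile
from typing import Dict, Optional, Tuple

import requests

def _download_one(session, user_id, api_key, attachment_key, chunk_size) -> Optional[Tuple[str, str]]:
    """Stream one Zotero attachment to a temp file; return (file_path, filename) or None."""
    url = f"https://api.zotero.org/users/{user_id}/items/{attachment_key}/file"
    try:
        response = session.get(url, headers={"Zotero-API-Key": api_key}, stream=True, timeout=10)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                temp_file.write(chunk)
            file_path = temp_file.name
        content_disp = response.headers.get("Content-Disposition", "")
        filename = (content_disp.split("filename=")[-1].strip('"')
                    if "filename=" in content_disp else "downloaded.pdf")
        return file_path, filename
    except requests.RequestException:
        return None

def download_pdfs_in_parallel(
    session: requests.Session,
    user_id: str,
    api_key: str,
    attachment_item_map: Dict[str, str],
    chunk_size: Optional[int] = None,
) -> Dict[str, Tuple[str, str, str]]:
    """Download attachment PDFs concurrently; map parent item_key -> (path, filename, attachment_key)."""
    results: Dict[str, Tuple[str, str, str]] = {}
    if not attachment_item_map:
        return results
    chunk_size = chunk_size or 16384
    max_workers = min(10, len(attachment_item_map))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(_download_one, session, user_id, api_key, key, chunk_size): (key, item)
            for key, item in attachment_item_map.items()
        }
        for future in concurrent.futures.as_completed(futures):
            attachment_key, item_key = futures[future]
            downloaded = future.result()
            if downloaded:
                results[item_key] = (downloaded[0], downloaded[1], attachment_key)
    return results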
@@ -196,8 +183,7 @@ class ZoteroSearchData:
        logger.debug("Filtering item types: %s", filter_item_types)
 
        # Maps to track attachments for batch processing
-       orphaned_pdfs = {}  # attachment_key -> item key (same for orphans)
-       item_attachments = {}  # item_key -> [attachment_keys]
+       orphaned_pdfs: Dict[str, str] = {}  # attachment_key -> item key (same for orphans)
 
        # First pass: process all items without downloading PDFs
        for item in items:
@@ -263,59 +249,16 @@ class ZoteroSearchData:
                "source": "zotero",
            }
 
-       # Second pass: collect attachment info for all items
-       for item_key, item_data in self.article_data.items():
-           if item_data["Type"] != "orphan_attachment":
-               try:
-                   children = self.zot.children(item_key)
-                   pdf_attachments = [
-                       child
-                       for child in children
-                       if isinstance(child, dict)
-                       and child.get("data", {}).get("contentType")
-                       == "application/pdf"
-                   ]
-
-                   if pdf_attachments:
-                       attachment = pdf_attachments[0]
-                       attachment_data = attachment.get("data", {})
-                       attachment_key = attachment_data.get("key")
-                       filename = attachment_data.get("filename", "unknown.pdf")
+       # Collect and process attachments
+       item_attachments = self._collect_item_attachments()
 
-                       if attachment_key:
-                           # Add to item attachments map
-                           item_attachments[attachment_key] = item_key
-                           # Add basic info
-                           self.article_data[item_key]["filename"] = filename
-               except Exception as e:
-                   logger.error(
-                       "Failed to get attachments for item %s: %s", item_key, e
-                   )
+       # Process orphaned PDFs
+       self._process_orphaned_pdfs(orphaned_pdfs)
 
-       # Now download all PDFs in parallel - first orphaned PDFs
-       logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
-       orphan_results = self._download_pdfs_in_parallel(orphaned_pdfs)
-
-       # Update orphan data
-       for item_key, (file_path, filename, attachment_key) in orphan_results.items():
-           self.article_data[item_key]["filename"] = filename
-           self.article_data[item_key]["pdf_url"] = file_path
-           self.article_data[item_key]["attachment_key"] = attachment_key
-           logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
-
-       # Download regular item attachments
-       logger.info(
-           "Downloading %d regular item PDFs in parallel", len(item_attachments)
-       )
-       item_results = self._download_pdfs_in_parallel(item_attachments)
-
-       # Update item data
-       for item_key, (file_path, filename, attachment_key) in item_results.items():
-           self.article_data[item_key]["filename"] = filename
-           self.article_data[item_key]["pdf_url"] = file_path
-           self.article_data[item_key]["attachment_key"] = attachment_key
-           logger.info("Downloaded Zotero PDF to: %s", file_path)
+       # Process regular item PDFs
+       self._process_item_pdfs(item_attachments)
 
+       # Ensure we have some results
        if not self.article_data:
            logger.error(
                "No matching papers returned from Zotero for query: '%s'", self.query