aiagents4pharma 1.36.0__py3-none-any.whl → 1.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +12 -4
  2. aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +2 -2
  3. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +7 -6
  4. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
  6. aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +1 -0
  7. aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +12 -11
  8. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +152 -0
  9. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +36 -65
  10. aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +1 -0
  11. aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
  12. aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +1 -0
  13. aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +292 -0
  14. aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +1 -0
  15. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +33 -7
  16. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +59 -3
  17. aiagents4pharma/talk2scholars/tests/test_read_helper_utils.py +110 -0
  18. aiagents4pharma/talk2scholars/tests/test_s2_display.py +20 -1
  19. aiagents4pharma/talk2scholars/tests/test_s2_query.py +17 -0
  20. aiagents4pharma/talk2scholars/tests/test_state.py +25 -1
  21. aiagents4pharma/talk2scholars/tests/test_zotero_pdf_downloader_utils.py +46 -0
  22. aiagents4pharma/talk2scholars/tests/test_zotero_read.py +35 -40
  23. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +62 -40
  24. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +6 -2
  25. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +2 -1
  26. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +7 -3
  27. aiagents4pharma/talk2scholars/tools/s2/search.py +2 -1
  28. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +2 -1
  29. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +79 -136
  30. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +147 -0
  31. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +42 -9
  32. {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/METADATA +2 -1
  33. {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/RECORD +36 -29
  34. {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/WHEEL +1 -1
  35. {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/licenses/LICENSE +0 -0
  36. {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,10 @@
 """
-Tool for performing Q&A on PDF documents using retrieval augmented generation.
-This module provides functionality to load PDFs from URLs, split them into
-chunks, retrieve relevant segments via semantic search, and generate answers
-to user-provided questions using a language model chain.
+PDF Question & Answer Tool
+
+This LangGraph tool answers user questions by leveraging a pre-built FAISS vector store
+of embedded PDF document chunks. Given a question, it retrieves the most relevant text
+segments from the loaded PDFs, invokes an LLM for answer generation, and returns the
+response with source attribution.
 """
 
 import logging
@@ -52,19 +54,18 @@ def load_hydra_config() -> Any:
 
 class QuestionAndAnswerInput(BaseModel):
     """
-    Input schema for the PDF Question and Answer tool.
-
-    This schema defines the inputs required for querying academic or research-related
-    PDFs to answer a specific question using a language model and document retrieval.
+    Input schema for the PDF Q&A tool.
 
     Attributes:
-        question (str): The question to ask regarding the PDF content.
-        paper_ids (Optional[List[str]]): Optional list of specific paper IDs to query.
-            If not provided, the system will determine relevant papers automatically.
-        use_all_papers (bool): Whether to use all available papers for answering the question.
-            If True, the system will include all loaded papers regardless of relevance filtering.
-        tool_call_id (str): Unique identifier for the tool call, injected automatically.
-        state (dict): Shared application state, injected automatically.
+        question (str): Free-text question to answer based on PDF content.
+        paper_ids (Optional[List[str]]): If provided, restricts retrieval to these paper IDs.
+        use_all_papers (bool): If True, include all loaded papers without semantic ranking.
+        tool_call_id (str): Internal ID injected by LangGraph for this tool call.
+        state (dict): Shared agent state containing:
+            - 'article_data': dict of paper metadata with 'pdf_url' keys
+            - 'text_embedding_model': embedding model instance
+            - 'llm_model': chat/LLM instance
+            - 'vector_store': pre-built Vectorstore for retrieval
     """
 
     question: str = Field(description="The question to ask regarding the PDF content.")
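Editor's note: for orientation, a minimal sketch of how this schema is exercised from an agent. The state keys mirror the docstring above; the model classes and explicit invocation style are illustrative assumptions, not part of this diff (LangGraph normally injects `tool_call_id` and `state`).

```python
# Illustrative only: model classes and values are placeholders.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

state = {
    "article_data": {"paper1": {"pdf_url": "https://example.org/paper1.pdf"}},
    "text_embedding_model": OpenAIEmbeddings(),
    "llm_model": ChatOpenAI(),
}

# LangGraph normally injects tool_call_id and state; passed explicitly here.
result = question_and_answer.invoke({
    "question": "What methods does paper1 use?",
    "tool_call_id": "call-1",
    "state": state,
})
```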
@@ -119,6 +120,8 @@ class Vectorstore:
         self.documents: Dict[str, Document] = {}
         self.vector_store: Optional[VectorStore] = None
         self.paper_metadata: Dict[str, Dict[str, Any]] = {}
+        # Cache for document chunk embeddings to avoid recomputation
+        self.embeddings: Dict[str, Any] = {}
 
     def add_paper(
         self,
@@ -160,6 +163,10 @@ class Vectorstore:
         # Split documents and add metadata for each chunk
         chunks = splitter.split_documents(documents)
         logger.info("Split %s into %d chunks", paper_id, len(chunks))
+        # Embed and cache chunk embeddings
+        chunk_texts = [chunk.page_content for chunk in chunks]
+        chunk_embeddings = self.embedding_model.embed_documents(chunk_texts)
+        logger.info("Embedded %d chunks for paper %s", len(chunk_embeddings), paper_id)
 
         # Enhance document metadata
         for i, chunk in enumerate(chunks):
@@ -182,6 +189,9 @@ class Vectorstore:
             # Store chunk
             doc_id = f"{paper_id}_{i}"
             self.documents[doc_id] = chunk
+            # Cache embedding if available
+            if chunk_embeddings[i] is not None:
+                self.embeddings[doc_id] = chunk_embeddings[i]
 
         # Mark as loaded to prevent duplicate loading
         self.loaded_papers.add(paper_id)
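Editor's note: the two additions above amount to an embed-once cache keyed by `{paper_id}_{chunk_id}`. A standalone sketch of the same pattern, with illustrative names rather than the package's own API:

```python
from typing import Any, Dict, List

class EmbeddingCache:
    """Embed each chunk once and reuse the vector on later retrievals."""

    def __init__(self, embedding_model: Any) -> None:
        self.embedding_model = embedding_model
        self.embeddings: Dict[str, List[float]] = {}

    def get(self, doc_id: str, text: str) -> List[float]:
        # Compute and store only on a cache miss.
        if doc_id not in self.embeddings:
            self.embeddings[doc_id] = self.embedding_model.embed_documents([text])[0]
        return self.embeddings[doc_id]
```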
@@ -295,12 +305,16 @@ class Vectorstore:
             logger.warning("No documents found after filtering by paper_ids.")
             return []
 
-        texts = [doc.page_content for doc in all_docs]
-
-        # Step 3: Batch embed all documents
-        logger.info("Starting batch embedding for %d chunks...", len(texts))
-        all_embeddings = self.embedding_model.embed_documents(texts)
-        logger.info("Completed embedding for %d chunks...", len(texts))
+        # Step 3: Retrieve or compute embeddings for all documents using cache
+        logger.info("Retrieving embeddings for %d chunks...", len(all_docs))
+        all_embeddings = []
+        for doc in all_docs:
+            doc_id = f"{doc.metadata['paper_id']}_{doc.metadata['chunk_id']}"
+            if doc_id not in self.embeddings:
+                logger.info("Embedding missing chunk %s", doc_id)
+                emb = self.embedding_model.embed_documents([doc.page_content])[0]
+                self.embeddings[doc_id] = emb
+            all_embeddings.append(self.embeddings[doc_id])
 
         # Step 4: Apply MMR
         mmr_indices = maximal_marginal_relevance(
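Editor's note: the cached vectors feed LangChain's `maximal_marginal_relevance`, which balances relevance to the query against redundancy among the selected chunks. A hedged usage sketch with toy vectors; the import path matches recent langchain-core releases and may differ in older ones, and the parameter values are illustrative:

```python
import numpy as np
from langchain_core.vectorstores.utils import maximal_marginal_relevance

# Toy vectors standing in for the cached chunk embeddings above.
query_embedding = np.array([0.1, 0.9])
all_embeddings = [[0.1, 0.9], [0.11, 0.89], [0.9, 0.1]]

# lambda_mult=1.0 favors pure relevance; 0.0 favors pure diversity.
mmr_indices = maximal_marginal_relevance(
    query_embedding, all_embeddings, lambda_mult=0.5, k=2
)
print(mmr_indices)  # indices of the selected, de-duplicated chunks
```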
@@ -392,6 +406,10 @@ def generate_answer(
     }
 
 
+# Shared pre-built Vectorstore for RAG (set externally, e.g., by Streamlit startup)
+prebuilt_vector_store: Optional[Vectorstore] = None
+
+
 @tool(args_schema=QuestionAndAnswerInput, parse_docstring=True)
 def question_and_answer(
     question: str,
@@ -401,30 +419,29 @@ def question_and_answer(
     use_all_papers: bool = False,
 ) -> Command[Any]:
     """
-    Answer a question using PDF content with advanced retrieval augmented generation.
+    Generate an answer to a user question using Retrieval-Augmented Generation (RAG) over PDFs.
 
-    This tool retrieves PDF documents from URLs, processes them using semantic search,
-    and generates an answer to the user's question based on the most relevant content.
-    It can work with multiple papers simultaneously and provides source attribution.
+    This tool expects that a FAISS vector store of PDF document chunks has already been built
+    and stored in shared state. It retrieves the most relevant chunks for the input question,
+    invokes an LLM to craft a response, and returns the answer with source attribution.
 
     Args:
-        question (str): The question to answer based on PDF content.
-        paper_ids (Optional[List[str]]): Optional list of specific paper IDs to query.
-        use_all_papers (bool): Whether to use all available papers.
-        tool_call_id (str): Unique identifier for the current tool call.
-        state (dict): Current state dictionary containing article data and required models.
-            Expected keys:
-            - "article_data": Dictionary containing article metadata including PDF URLs
-            - "text_embedding_model": Model for generating embeddings
-            - "llm_model": Language model for generating answers
-            - "vector_store": Optional Vectorstore instance
+        question (str): The free-text question to answer.
+        state (dict): Injected agent state mapping that must include:
+            - 'article_data': mapping of paper IDs to metadata (including 'pdf_url')
+            - 'text_embedding_model': the embedding model instance
+            - 'llm_model': the chat/LLM instance
+        tool_call_id (str): Internal identifier for this tool call.
+        paper_ids (Optional[List[str]]): Specific paper IDs to restrict retrieval (default: None).
+        use_all_papers (bool): If True, bypasses semantic ranking and includes all papers.
 
     Returns:
-        Dict[str, Any]: A dictionary wrapped in a Command that updates the conversation
-            with either the answer or an error message.
+        Command[Any]: A LangGraph Command that updates the conversation state:
+            - 'messages': a single ToolMessage containing the generated answer text.
 
     Raises:
-        ValueError: If required components are missing or if PDF processing fails.
+        ValueError: If required models or 'article_data' are missing from state.
+        RuntimeError: If no relevant document chunks can be retrieved.
     """
     # Load configuration
     config = load_hydra_config()
@@ -456,8 +473,13 @@ def question_and_answer(
         logger.error("%s: %s", call_id, error_msg)
         raise ValueError(error_msg)
 
-    # Always use a fresh in-memory document store for this Q&A call
-    vector_store = Vectorstore(embedding_model=text_embedding_model)
+    # Use shared pre-built Vectorstore if provided, else create a new one
+    if prebuilt_vector_store is not None:
+        vector_store = prebuilt_vector_store
+        logger.info("Using shared pre-built vector store from the memory")
+    else:
+        vector_store = Vectorstore(embedding_model=text_embedding_model)
+        logger.info("Initialized new vector store (no pre-built store found)")
 
     # Check if there are papers from different sources
     has_uploaded_papers = any(
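Editor's note: together with the module-level `prebuilt_vector_store` added earlier, this fallback lets a host app build the index once at startup and share it across tool calls. A hedged wiring sketch; the embedding model is a placeholder, and `add_paper`'s full signature is abbreviated in this diff:

```python
# Hypothetical startup wiring; embedding_model is a placeholder instance.
from aiagents4pharma.talk2scholars.tools.pdf import question_and_answer as qa

store = qa.Vectorstore(embedding_model=embedding_model)
# ... populate the store via store.add_paper(...) for each loaded PDF ...

# Later question_and_answer calls reuse this store instead of rebuilding.
qa.prebuilt_vector_store = store
```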
@@ -66,8 +66,12 @@ def display_dataframe(
         NoPapersFoundError: If no entries exist under 'last_displayed_papers' in state.
     """
     logger.info("Displaying papers")
-    context_key = state.get("last_displayed_papers")
-    artifact = state.get(context_key)
+    context_val = state.get("last_displayed_papers")
+    # Support both key reference (str) and direct mapping
+    if isinstance(context_val, dict):
+        artifact = context_val
+    else:
+        artifact = state.get(context_val)
     if not artifact:
         logger.info("No papers found in state, raising NoPapersFoundError")
         raise NoPapersFoundError(
@@ -71,7 +71,8 @@ def get_multi_paper_recommendations(
     return Command(
         update={
             "multi_papers": results["papers"],
-            "last_displayed_papers": "multi_papers",
+            # Store the latest multi-paper results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
@@ -49,13 +49,17 @@ def query_dataframe(question: str, state: Annotated[dict, InjectedState]) -> str
     """
     logger.info("Querying last displayed papers with question: %s", question)
     llm_model = state.get("llm_model")
-    if not state.get("last_displayed_papers"):
+    context_val = state.get("last_displayed_papers")
+    if not context_val:
         logger.info("No papers displayed so far, raising NoPapersFoundError")
         raise NoPapersFoundError(
             "No papers found. A search needs to be performed first."
         )
-    context_key = state.get("last_displayed_papers")
-    dic_papers = state.get(context_key)
+    # Support both key reference (str) and direct mapping
+    if isinstance(context_val, dict):
+        dic_papers = context_val
+    else:
+        dic_papers = state.get(context_val)
     df_papers = pd.DataFrame.from_dict(dic_papers, orient="index")
     df_agent = create_pandas_dataframe_agent(
         llm_model,
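Editor's note: both the display and query tools now accept `last_displayed_papers` either as a string key into state (the old convention) or as the papers mapping itself (what the search and recommendation tools below now store). A small self-contained sketch of the normalization, mirroring the branches above:

```python
def resolve_papers(state: dict) -> dict:
    """Return the papers mapping whether stored directly or referenced by key."""
    context_val = state.get("last_displayed_papers")
    if isinstance(context_val, dict):
        return context_val           # new style: mapping stored directly
    return state.get(context_val)    # old style: string key into state

# Old style still resolves...
assert resolve_papers({"last_displayed_papers": "papers", "papers": {"p1": {}}}) == {"p1": {}}
# ...and so does the new direct mapping.
assert resolve_papers({"last_displayed_papers": {"p1": {}}}) == {"p1": {}}
```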
@@ -65,7 +65,8 @@ def search_tool(
     return Command(
         update={
             "papers": results["papers"],
-            "last_displayed_papers": "papers",
+            # Store the latest results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
@@ -69,7 +69,8 @@ def get_single_paper_recommendations(
     return Command(
         update={
             "papers": results["papers"],
-            "last_displayed_papers": "papers",
+            # Store the latest single-paper results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
@@ -5,15 +5,14 @@ Utility for zotero read tool.
 """
 
 import logging
-import tempfile
-from typing import Any, Dict, List, Tuple, Optional
-import concurrent.futures
+from typing import Any, Dict, List
 
 import hydra
 import requests
 from pyzotero import zotero
 
 from .zotero_path import get_item_collections
+from .zotero_pdf_downloader import download_pdfs_in_parallel
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -30,12 +29,14 @@ class ZoteroSearchData:
         query: str,
         only_articles: bool,
         limit: int,
-        tool_call_id: str,
+        download_pdfs: bool = True,
+        **_kwargs,
     ):
         self.query = query
         self.only_articles = only_articles
         self.limit = limit
-        self.tool_call_id = tool_call_id
+        # Control whether to fetch PDF attachments now
+        self.download_pdfs = download_pdfs
         self.cfg = self._load_config()
         self.zot = self._init_zotero_client()
         self.item_to_collections = get_item_collections(self.zot)
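Editor's note: with this constructor change, callers opt out of eager PDF fetching via `download_pdfs`, and `**_kwargs` absorbs the now-removed `tool_call_id` for backward compatibility. A brief construction sketch; the query values are placeholders:

```python
# Metadata-only search: record attachment keys now, fetch files later.
search = ZoteroSearchData(
    query="CRISPR delivery",
    only_articles=True,
    limit=20,
    download_pdfs=False,
)
```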
@@ -105,89 +106,75 @@ class ZoteroSearchData:
 
         return items
 
-    def _download_zotero_pdf(self, attachment_key: str) -> Optional[Tuple[str, str]]:
-        """Download a PDF from Zotero by attachment key. Returns (file_path, filename) or None."""
-        zotero_pdf_url = (
-            f"https://api.zotero.org/users/{self.cfg.user_id}/items/"
-            f"{attachment_key}/file"
-        )
-        headers = {"Zotero-API-Key": self.cfg.api_key}
+    def _collect_item_attachments(self) -> Dict[str, str]:
+        """Collect PDF attachment keys for non-orphan items."""
+        item_attachments: Dict[str, str] = {}
+        for item_key, item_data in self.article_data.items():
+            if item_data.get("Type") == "orphan_attachment":
+                continue
+            try:
+                children = self.zot.children(item_key)
+                for child in children:
+                    data = child.get("data", {})
+                    if data.get("contentType") == "application/pdf":
+                        attachment_key = data.get("key")
+                        filename = data.get("filename", "unknown.pdf")
+                        if attachment_key:
+                            item_attachments[attachment_key] = item_key
+                            self.article_data[item_key]["filename"] = filename
+                        break
+            except Exception as e:
+                logger.error("Failed to get attachments for item %s: %s", item_key, e)
+        return item_attachments
+
+    def _process_orphaned_pdfs(self, orphaned_pdfs: Dict[str, str]) -> None:
+        """Download or record orphaned PDF attachments."""
+        if self.download_pdfs:
+            logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
+            results = download_pdfs_in_parallel(
+                self.session,
+                self.cfg.user_id,
+                self.cfg.api_key,
+                orphaned_pdfs,
+                chunk_size=getattr(self.cfg, "chunk_size", None),
+            )
+            for item_key, (file_path, filename, attachment_key) in results.items():
+                self.article_data[item_key]["filename"] = filename
+                self.article_data[item_key]["pdf_url"] = file_path
+                self.article_data[item_key]["attachment_key"] = attachment_key
+                logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
+        else:
+            logger.info("Skipping orphaned PDF downloads (download_pdfs=False)")
+            for attachment_key in orphaned_pdfs:
+                self.article_data[attachment_key]["attachment_key"] = attachment_key
+                self.article_data[attachment_key]["filename"] = (
+                    self.article_data[attachment_key].get("Title", attachment_key)
+                )
 
-        try:
-            # Use session for connection pooling
-            response = self.session.get(
-                zotero_pdf_url, headers=headers, stream=True, timeout=10
+    def _process_item_pdfs(self, item_attachments: Dict[str, str]) -> None:
+        """Download or record regular item PDF attachments."""
+        if self.download_pdfs:
+            logger.info(
+                "Downloading %d regular item PDFs in parallel", len(item_attachments)
             )
-            response.raise_for_status()
-
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
-                # Increased chunk size for better performance
-                for chunk in response.iter_content(chunk_size=16384):
-                    temp_file.write(chunk)
-                temp_file_path = temp_file.name
-
-            content_disp = response.headers.get("Content-Disposition", "")
-            filename = (
-                content_disp.split("filename=")[-1].strip('"')
-                if "filename=" in content_disp
-                else "downloaded.pdf"
+            results = download_pdfs_in_parallel(
+                self.session,
+                self.cfg.user_id,
+                self.cfg.api_key,
+                item_attachments,
+                chunk_size=getattr(self.cfg, "chunk_size", None),
            )
+        else:
+            logger.info("Skipping regular PDF downloads (download_pdfs=False)")
+            results = {}
+            for attachment_key, item_key in item_attachments.items():
+                self.article_data[item_key]["attachment_key"] = attachment_key
+        for item_key, (file_path, filename, attachment_key) in results.items():
+            self.article_data[item_key]["filename"] = filename
+            self.article_data[item_key]["pdf_url"] = file_path
+            self.article_data[item_key]["attachment_key"] = attachment_key
+            logger.info("Downloaded Zotero PDF to: %s", file_path)
 
-            return temp_file_path, filename
-
-        except Exception as e:
-            logger.error(
-                "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
-            )
-            return None
-
-    def _download_pdfs_in_parallel(
-        self, attachment_item_map: Dict[str, str]
-    ) -> Dict[str, Tuple[str, str, str]]:
-        """
-        Download multiple PDFs in parallel using ThreadPoolExecutor.
-
-        Args:
-            attachment_item_map: Dictionary mapping attachment keys to parent item keys
-
-        Returns:
-            Dictionary mapping parent item keys to (file_path, filename, attachment_key)
-        """
-        results = {}
-        max_workers = min(10, len(attachment_item_map))  # Set reasonable limit
-
-        if not attachment_item_map:
-            return results
-
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # Create a dictionary mapping Future objects to attachment keys
-            future_to_key = {
-                executor.submit(self._download_zotero_pdf, attachment_key): (
-                    attachment_key,
-                    item_key,
-                )
-                for attachment_key, item_key in attachment_item_map.items()
-            }
-
-            for future in concurrent.futures.as_completed(future_to_key):
-                attachment_key, item_key = future_to_key[future]
-                try:
-                    result = future.result()
-                    if result:
-                        temp_file_path, resolved_filename = result
-                        results[item_key] = (
-                            temp_file_path,
-                            resolved_filename,
-                            attachment_key,
-                        )
-                except Exception as e:
-                    logger.error(
-                        "Failed to download PDF for key %s: %s", attachment_key, e
-                    )
-
-        return results
-
-    # pylint: disable=too-many-locals, too-many-branches
     def _filter_and_format_papers(self, items: List[Dict[str, Any]]) -> None:
         """Filter and format papers from Zotero items, including standalone PDFs."""
         filter_item_types = (
@@ -196,8 +183,7 @@ class ZoteroSearchData:
         logger.debug("Filtering item types: %s", filter_item_types)
 
         # Maps to track attachments for batch processing
-        orphaned_pdfs = {}  # attachment_key -> item key (same for orphans)
-        item_attachments = {}  # item_key -> [attachment_keys]
+        orphaned_pdfs: Dict[str, str] = {}  # attachment_key -> item key (same for orphans)
 
         # First pass: process all items without downloading PDFs
         for item in items:
@@ -263,59 +249,16 @@ class ZoteroSearchData:
                 "source": "zotero",
             }
 
-        # Second pass: collect attachment info for all items
-        for item_key, item_data in self.article_data.items():
-            if item_data["Type"] != "orphan_attachment":
-                try:
-                    children = self.zot.children(item_key)
-                    pdf_attachments = [
-                        child
-                        for child in children
-                        if isinstance(child, dict)
-                        and child.get("data", {}).get("contentType")
-                        == "application/pdf"
-                    ]
-
-                    if pdf_attachments:
-                        attachment = pdf_attachments[0]
-                        attachment_data = attachment.get("data", {})
-                        attachment_key = attachment_data.get("key")
-                        filename = attachment_data.get("filename", "unknown.pdf")
+        # Collect and process attachments
+        item_attachments = self._collect_item_attachments()
 
-                        if attachment_key:
-                            # Add to item attachments map
-                            item_attachments[attachment_key] = item_key
-                            # Add basic info
-                            self.article_data[item_key]["filename"] = filename
-                except Exception as e:
-                    logger.error(
-                        "Failed to get attachments for item %s: %s", item_key, e
-                    )
+        # Process orphaned PDFs
+        self._process_orphaned_pdfs(orphaned_pdfs)
 
-        # Now download all PDFs in parallel - first orphaned PDFs
-        logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
-        orphan_results = self._download_pdfs_in_parallel(orphaned_pdfs)
-
-        # Update orphan data
-        for item_key, (file_path, filename, attachment_key) in orphan_results.items():
-            self.article_data[item_key]["filename"] = filename
-            self.article_data[item_key]["pdf_url"] = file_path
-            self.article_data[item_key]["attachment_key"] = attachment_key
-            logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
-
-        # Download regular item attachments
-        logger.info(
-            "Downloading %d regular item PDFs in parallel", len(item_attachments)
-        )
-        item_results = self._download_pdfs_in_parallel(item_attachments)
-
-        # Update item data
-        for item_key, (file_path, filename, attachment_key) in item_results.items():
-            self.article_data[item_key]["filename"] = filename
-            self.article_data[item_key]["pdf_url"] = file_path
-            self.article_data[item_key]["attachment_key"] = attachment_key
-            logger.info("Downloaded Zotero PDF to: %s", file_path)
+        # Process regular item PDFs
+        self._process_item_pdfs(item_attachments)
 
+        # Ensure we have some results
         if not self.article_data:
             logger.error(
                 "No matching papers returned from Zotero for query: '%s'", self.query
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Utility functions for downloading PDFs from Zotero.
+"""
+
+import logging
+import tempfile
+from typing import Optional, Tuple, Dict
+import concurrent.futures
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+def download_zotero_pdf(
+    session: requests.Session,
+    user_id: str,
+    api_key: str,
+    attachment_key: str,
+    **kwargs,
+) -> Optional[Tuple[str, str]]:
+    """
+    Download a PDF from Zotero by attachment key.
+
+    Args:
+        session: requests.Session for HTTP requests.
+        user_id: Zotero user ID.
+        api_key: Zotero API key.
+        attachment_key: Zotero attachment item key.
+        kwargs:
+            timeout (int): Request timeout in seconds (default: 10).
+            chunk_size (int, optional): Chunk size for streaming.
+
+    Returns:
+        Tuple of (local_file_path, filename) if successful, else None.
+    """
+    # Extract optional parameters
+    timeout = kwargs.get("timeout", 10)
+    chunk_size = kwargs.get("chunk_size")
+    # Log configured parameters for verification
+    logger.info("download_zotero_pdf params -> timeout=%s, chunk_size=%s", timeout, chunk_size)
+    # Log download start
+    logger.info(
+        "Downloading Zotero PDF for attachment %s from Zotero API", attachment_key
+    )
+    zotero_pdf_url = (
+        f"https://api.zotero.org/users/{user_id}/items/" f"{attachment_key}/file"
+    )
+    headers = {"Zotero-API-Key": api_key}
+
+    try:
+        response = session.get(
+            zotero_pdf_url, headers=headers, stream=True, timeout=timeout
+        )
+        response.raise_for_status()
+
+        # Download to a temporary file first
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                temp_file.write(chunk)
+            temp_file_path = temp_file.name
+            # Temp file written to %s
+            logger.info("Zotero PDF downloaded to temporary file: %s", temp_file_path)
+
+        # Determine filename from Content-Disposition header or default
+        if "filename=" in response.headers.get("Content-Disposition", ""):
+            filename = (
+                response.headers.get("Content-Disposition", "")
+                .split("filename=")[-1]
+                .strip('"')
+            )
+        else:
+            filename = "downloaded.pdf"
+
+        return temp_file_path, filename
+
+    except (requests.exceptions.RequestException, OSError) as e:
+        logger.error(
+            "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
+        )
+        return None
+
+
+def download_pdfs_in_parallel(
+    session: requests.Session,
+    user_id: str,
+    api_key: str,
+    attachment_item_map: Dict[str, str],
+    **kwargs,
+) -> Dict[str, Tuple[str, str, str]]:
+    """
+    Download multiple PDFs in parallel using ThreadPoolExecutor.
+
+    Args:
+        session: requests.Session for HTTP requests.
+        user_id: Zotero user ID.
+        api_key: Zotero API key.
+        attachment_item_map: Mapping of attachment_key to parent item_key.
+        kwargs:
+            max_workers (int, optional): Maximum number of worker threads (default: min(10, n)).
+            chunk_size (int, optional): Chunk size for streaming.
+
+    Returns:
+        Mapping of parent item_key to (local_file_path, filename, attachment_key).
+    """
+    # Extract optional parameters
+    max_workers = kwargs.get("max_workers")
+    chunk_size = kwargs.get("chunk_size")
+    # Log configured parameters for verification
+    logger.info(
+        "download_pdfs_in_parallel params -> max_workers=%s, chunk_size=%s",
+        max_workers,
+        chunk_size,
+    )
+    results: Dict[str, Tuple[str, str, str]] = {}
+    if not attachment_item_map:
+        return results
+
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=(
+            max_workers
+            if max_workers is not None
+            else min(10, len(attachment_item_map))
+        )
+    ) as executor:
+        future_to_keys = {
+            executor.submit(
+                download_zotero_pdf,
+                session,
+                user_id,
+                api_key,
+                attachment_key,
+                chunk_size=chunk_size,
+            ): (attachment_key, item_key)
+            for attachment_key, item_key in attachment_item_map.items()
+        }
+
+        for future in concurrent.futures.as_completed(future_to_keys):
+            attachment_key, item_key = future_to_keys[future]
+            try:
+                res = future.result()
+                if res:
+                    results[item_key] = (*res, attachment_key)
+            except (requests.exceptions.RequestException, OSError) as e:
+                logger.error("Failed to download PDF for key %s: %s", attachment_key, e)
+
+    return results
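Editor's note: a short usage sketch of the new helper module; the user ID, API key, and attachment keys below are placeholders:

```python
import requests

session = requests.Session()
attachment_map = {"ATTACHKEY1": "ITEMKEY1", "ATTACHKEY2": "ITEMKEY2"}

results = download_pdfs_in_parallel(
    session,
    "1234567",               # Zotero user ID (placeholder)
    "YOUR_ZOTERO_API_KEY",   # API key (placeholder)
    attachment_map,
    max_workers=4,
    chunk_size=16384,
)
for item_key, (path, filename, attachment_key) in results.items():
    print(f"{item_key}: {filename} saved at {path}")
```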