aiagents4pharma 1.36.0__py3-none-any.whl → 1.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +12 -4
- aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +2 -2
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +7 -6
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
- aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +12 -11
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +152 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +36 -65
- aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +292 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +1 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +33 -7
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +59 -3
- aiagents4pharma/talk2scholars/tests/test_read_helper_utils.py +110 -0
- aiagents4pharma/talk2scholars/tests/test_s2_display.py +20 -1
- aiagents4pharma/talk2scholars/tests/test_s2_query.py +17 -0
- aiagents4pharma/talk2scholars/tests/test_state.py +25 -1
- aiagents4pharma/talk2scholars/tests/test_zotero_pdf_downloader_utils.py +46 -0
- aiagents4pharma/talk2scholars/tests/test_zotero_read.py +35 -40
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +62 -40
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +6 -2
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +2 -1
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +7 -3
- aiagents4pharma/talk2scholars/tools/s2/search.py +2 -1
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +2 -1
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +79 -136
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +147 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +42 -9
- {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/METADATA +2 -1
- {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/RECORD +36 -29
- {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/WHEEL +1 -1
- {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.36.0.dist-info → aiagents4pharma-1.38.0.dist-info}/top_level.txt +0 -0
--- a/aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py
+++ b/aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py
@@ -1,8 +1,10 @@
 """
-
-
-
-
+PDF Question & Answer Tool
+
+This LangGraph tool answers user questions by leveraging a pre-built FAISS vector store
+of embedded PDF document chunks. Given a question, it retrieves the most relevant text
+segments from the loaded PDFs, invokes an LLM for answer generation, and returns the
+response with source attribution.
 """
 
 import logging
@@ -52,19 +54,18 @@ def load_hydra_config() -> Any:
 
 class QuestionAndAnswerInput(BaseModel):
     """
-    Input schema for the PDF
-
-    This schema defines the inputs required for querying academic or research-related
-    PDFs to answer a specific question using a language model and document retrieval.
+    Input schema for the PDF Q&A tool.
 
     Attributes:
-        question (str):
-        paper_ids (Optional[List[str]]):
-
-
-
-
-
+        question (str): Free-text question to answer based on PDF content.
+        paper_ids (Optional[List[str]]): If provided, restricts retrieval to these paper IDs.
+        use_all_papers (bool): If True, include all loaded papers without semantic ranking.
+        tool_call_id (str): Internal ID injected by LangGraph for this tool call.
+        state (dict): Shared agent state containing:
+            - 'article_data': dict of paper metadata with 'pdf_url' keys
+            - 'text_embedding_model': embedding model instance
+            - 'llm_model': chat/LLM instance
+            - 'vector_store': pre-built Vectorstore for retrieval
     """
 
     question: str = Field(description="The question to ask regarding the PDF content.")
@@ -119,6 +120,8 @@ class Vectorstore:
         self.documents: Dict[str, Document] = {}
         self.vector_store: Optional[VectorStore] = None
         self.paper_metadata: Dict[str, Dict[str, Any]] = {}
+        # Cache for document chunk embeddings to avoid recomputation
+        self.embeddings: Dict[str, Any] = {}
 
     def add_paper(
         self,
@@ -160,6 +163,10 @@ class Vectorstore:
         # Split documents and add metadata for each chunk
         chunks = splitter.split_documents(documents)
         logger.info("Split %s into %d chunks", paper_id, len(chunks))
+        # Embed and cache chunk embeddings
+        chunk_texts = [chunk.page_content for chunk in chunks]
+        chunk_embeddings = self.embedding_model.embed_documents(chunk_texts)
+        logger.info("Embedded %d chunks for paper %s", len(chunk_embeddings), paper_id)
 
         # Enhance document metadata
         for i, chunk in enumerate(chunks):
@@ -182,6 +189,9 @@ class Vectorstore:
             # Store chunk
             doc_id = f"{paper_id}_{i}"
             self.documents[doc_id] = chunk
+            # Cache embedding if available
+            if chunk_embeddings[i] is not None:
+                self.embeddings[doc_id] = chunk_embeddings[i]
 
         # Mark as loaded to prevent duplicate loading
         self.loaded_papers.add(paper_id)
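Taken together, the two hunks above make `add_paper` embed each chunk once and memoize the vectors under a `"{paper_id}_{chunk_id}"` key. A minimal sketch of the same embed-once pattern (the `EmbeddingCache` class below is illustrative, not part of the package):

```python
# Sketch of the embed-once cache pattern introduced above; any LangChain-style
# embedding model with embed_documents() works as the backend.
from typing import Any, Dict, List


class EmbeddingCache:
    """Cache chunk embeddings keyed by '<paper_id>_<chunk_id>'."""

    def __init__(self, embedding_model: Any) -> None:
        self.embedding_model = embedding_model
        self.embeddings: Dict[str, Any] = {}

    def add_chunks(self, paper_id: str, chunk_texts: List[str]) -> None:
        # One batched embed_documents call per paper, then cache per chunk.
        vectors = self.embedding_model.embed_documents(chunk_texts)
        for i, vec in enumerate(vectors):
            if vec is not None:
                self.embeddings[f"{paper_id}_{i}"] = vec
```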
@@ -295,12 +305,16 @@ class Vectorstore:
             logger.warning("No documents found after filtering by paper_ids.")
             return []
 
-
-
-
-
-
-
+        # Step 3: Retrieve or compute embeddings for all documents using cache
+        logger.info("Retrieving embeddings for %d chunks...", len(all_docs))
+        all_embeddings = []
+        for doc in all_docs:
+            doc_id = f"{doc.metadata['paper_id']}_{doc.metadata['chunk_id']}"
+            if doc_id not in self.embeddings:
+                logger.info("Embedding missing chunk %s", doc_id)
+                emb = self.embedding_model.embed_documents([doc.page_content])[0]
+                self.embeddings[doc_id] = emb
+            all_embeddings.append(self.embeddings[doc_id])
 
         # Step 4: Apply MMR
         mmr_indices = maximal_marginal_relevance(
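Step 3 fills cache misses on demand, then Step 4 feeds the cached vectors into maximal marginal relevance. A sketch of that call, reusing the hunk's `all_docs`/`all_embeddings` names (the `langchain_core` import path and signature are an assumption based on current LangChain releases):

```python
# Sketch: MMR re-ranking over the cached chunk embeddings assembled in Step 3.
import numpy as np
from langchain_core.vectorstores.utils import maximal_marginal_relevance

query_embedding = np.array(embedding_model.embed_query("What did the paper find?"))
mmr_indices = maximal_marginal_relevance(
    query_embedding,
    all_embeddings,   # cached chunk vectors from Step 3
    lambda_mult=0.5,  # 1.0 favors pure relevance, 0.0 favors diversity
    k=5,
)
top_docs = [all_docs[i] for i in mmr_indices]
```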
@@ -392,6 +406,10 @@ def generate_answer(
     }
 
 
+# Shared pre-built Vectorstore for RAG (set externally, e.g., by Streamlit startup)
+prebuilt_vector_store: Optional[Vectorstore] = None
+
+
 @tool(args_schema=QuestionAndAnswerInput, parse_docstring=True)
 def question_and_answer(
     question: str,
@@ -401,30 +419,29 @@ def question_and_answer(
     use_all_papers: bool = False,
 ) -> Command[Any]:
     """
-
+    Generate an answer to a user question using Retrieval-Augmented Generation (RAG) over PDFs.
 
-    This tool
-    and
-
+    This tool expects that a FAISS vector store of PDF document chunks has already been built
+    and stored in shared state. It retrieves the most relevant chunks for the input question,
+    invokes an LLM to craft a response, and returns the answer with source attribution.
 
     Args:
-        question (str): The question to answer
-
-
-
-
-
-
-
-            - "llm_model": Language model for generating answers
-            - "vector_store": Optional Vectorstore instance
+        question (str): The free-text question to answer.
+        state (dict): Injected agent state mapping that must include:
+            - 'article_data': mapping of paper IDs to metadata (including 'pdf_url')
+            - 'text_embedding_model': the embedding model instance
+            - 'llm_model': the chat/LLM instance
+        tool_call_id (str): Internal identifier for this tool call.
+        paper_ids (Optional[List[str]]): Specific paper IDs to restrict retrieval (default: None).
+        use_all_papers (bool): If True, bypasses semantic ranking and includes all papers.
 
     Returns:
-
-
+        Command[Any]: A LangGraph Command that updates the conversation state:
+            - 'messages': a single ToolMessage containing the generated answer text.
 
     Raises:
-        ValueError: If required
+        ValueError: If required models or 'article_data' are missing from state.
+        RuntimeError: If no relevant document chunks can be retrieved.
     """
     # Load configuration
     config = load_hydra_config()
@@ -456,8 +473,13 @@ def question_and_answer(
         logger.error("%s: %s", call_id, error_msg)
         raise ValueError(error_msg)
 
-    #
-
+    # Use shared pre-built Vectorstore if provided, else create a new one
+    if prebuilt_vector_store is not None:
+        vector_store = prebuilt_vector_store
+        logger.info("Using shared pre-built vector store from the memory")
+    else:
+        vector_store = Vectorstore(embedding_model=text_embedding_model)
+        logger.info("Initialized new vector store (no pre-built store found)")
 
     # Check if there are papers from different sources
     has_uploaded_papers = any(
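The module-level `prebuilt_vector_store` hook is meant to be assigned once at application startup so each tool call can skip re-indexing. A hypothetical startup sketch (the `add_paper` call signature is assumed for illustration; check the class for the real one):

```python
# Hypothetical Streamlit startup: build the store once, then publish it through
# the module-level prebuilt_vector_store hook added above.
from aiagents4pharma.talk2scholars.tools.pdf import question_and_answer as qa

store = qa.Vectorstore(embedding_model=text_embedding_model)
for paper_id, meta in article_data.items():
    if "pdf_url" in meta:
        store.add_paper(paper_id, meta["pdf_url"], meta)  # signature assumed
qa.prebuilt_vector_store = store  # later tool calls now reuse this store
```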
--- a/aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py
@@ -66,8 +66,12 @@ def display_dataframe(
         NoPapersFoundError: If no entries exist under 'last_displayed_papers' in state.
     """
     logger.info("Displaying papers")
-
-
+    context_val = state.get("last_displayed_papers")
+    # Support both key reference (str) and direct mapping
+    if isinstance(context_val, dict):
+        artifact = context_val
+    else:
+        artifact = state.get(context_val)
     if not artifact:
         logger.info("No papers found in state, raising NoPapersFoundError")
         raise NoPapersFoundError(
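`last_displayed_papers` previously held the name of another state entry; after this release it may hold the papers mapping itself, and the display/query tools accept both. A standalone sketch of the dual-format read (the helper function is for illustration; it is not part of the package):

```python
# Sketch of the dual-format resolution both display_dataframe and
# query_dataframe now perform.
def resolve_last_displayed(state: dict) -> dict:
    context_val = state.get("last_displayed_papers")
    if isinstance(context_val, dict):
        return context_val          # new style: the papers mapping itself
    return state.get(context_val)   # legacy style: a key naming another entry
```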
--- a/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
@@ -71,7 +71,8 @@ def get_multi_paper_recommendations(
     return Command(
         update={
             "multi_papers": results["papers"],
-
+            # Store the latest multi-paper results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
--- a/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
@@ -49,13 +49,17 @@ def query_dataframe(question: str, state: Annotated[dict, InjectedState]) -> str
     """
     logger.info("Querying last displayed papers with question: %s", question)
     llm_model = state.get("llm_model")
-
+    context_val = state.get("last_displayed_papers")
+    if not context_val:
         logger.info("No papers displayed so far, raising NoPapersFoundError")
         raise NoPapersFoundError(
             "No papers found. A search needs to be performed first."
         )
-
-
+    # Support both key reference (str) and direct mapping
+    if isinstance(context_val, dict):
+        dic_papers = context_val
+    else:
+        dic_papers = state.get(context_val)
     df_papers = pd.DataFrame.from_dict(dic_papers, orient="index")
     df_agent = create_pandas_dataframe_agent(
         llm_model,
--- a/aiagents4pharma/talk2scholars/tools/s2/search.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/search.py
@@ -65,7 +65,8 @@ def search_tool(
     return Command(
         update={
             "papers": results["papers"],
-
+            # Store the latest results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
--- a/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
@@ -69,7 +69,8 @@ def get_single_paper_recommendations(
     return Command(
         update={
             "papers": results["papers"],
-
+            # Store the latest single-paper results mapping directly for display
+            "last_displayed_papers": results["papers"],
             "messages": [
                 ToolMessage(
                     content=results["content"],
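All three S2 tools (search, single-paper, and multi-paper recommendations) now store the results mapping itself under `last_displayed_papers`. The removed lines are truncated in this diff; based on the str-handling branch kept in `display_dataframe`, the 1.36.0 value was presumably a key reference. An illustration of the shape change, with that old value inferred rather than taken from the diff:

```python
# Illustration of the state shape change (the 1.36.0 value is an inference).
update_old = {"papers": papers, "last_displayed_papers": "papers"}  # key reference
update_new = {"papers": papers, "last_displayed_papers": papers}    # direct mapping
```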
--- a/aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py
+++ b/aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py
@@ -5,15 +5,14 @@ Utility for zotero read tool.
 """
 
 import logging
-import
-from typing import Any, Dict, List, Tuple, Optional
-import concurrent.futures
+from typing import Any, Dict, List
 
 import hydra
 import requests
 from pyzotero import zotero
 
 from .zotero_path import get_item_collections
+from .zotero_pdf_downloader import download_pdfs_in_parallel
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -30,12 +29,14 @@ class ZoteroSearchData:
         query: str,
         only_articles: bool,
         limit: int,
-
+        download_pdfs: bool = True,
+        **_kwargs,
     ):
         self.query = query
         self.only_articles = only_articles
         self.limit = limit
-
+        # Control whether to fetch PDF attachments now
+        self.download_pdfs = download_pdfs
         self.cfg = self._load_config()
         self.zot = self._init_zotero_client()
         self.item_to_collections = get_item_collections(self.zot)
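The new `download_pdfs` flag lets callers defer PDF fetching to a later step (the `zotero_read` config gains a matching default, per the file list above). A construction sketch with placeholder arguments:

```python
# Sketch: metadata-only Zotero search, assuming the zotero_read tool constructs
# the helper like this (query values are placeholders).
search = ZoteroSearchData(
    query="CRISPR delivery",
    only_articles=True,
    limit=20,
    download_pdfs=False,  # record attachment keys now, download PDFs later
)
```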
@@ -105,89 +106,75 @@ class ZoteroSearchData:
 
         return items
 
-    def
-        """
-
-
-
-
-
+    def _collect_item_attachments(self) -> Dict[str, str]:
+        """Collect PDF attachment keys for non-orphan items."""
+        item_attachments: Dict[str, str] = {}
+        for item_key, item_data in self.article_data.items():
+            if item_data.get("Type") == "orphan_attachment":
+                continue
+            try:
+                children = self.zot.children(item_key)
+                for child in children:
+                    data = child.get("data", {})
+                    if data.get("contentType") == "application/pdf":
+                        attachment_key = data.get("key")
+                        filename = data.get("filename", "unknown.pdf")
+                        if attachment_key:
+                            item_attachments[attachment_key] = item_key
+                            self.article_data[item_key]["filename"] = filename
+                        break
+            except Exception as e:
+                logger.error("Failed to get attachments for item %s: %s", item_key, e)
+        return item_attachments
+
+    def _process_orphaned_pdfs(self, orphaned_pdfs: Dict[str, str]) -> None:
+        """Download or record orphaned PDF attachments."""
+        if self.download_pdfs:
+            logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
+            results = download_pdfs_in_parallel(
+                self.session,
+                self.cfg.user_id,
+                self.cfg.api_key,
+                orphaned_pdfs,
+                chunk_size=getattr(self.cfg, "chunk_size", None),
+            )
+            for item_key, (file_path, filename, attachment_key) in results.items():
+                self.article_data[item_key]["filename"] = filename
+                self.article_data[item_key]["pdf_url"] = file_path
+                self.article_data[item_key]["attachment_key"] = attachment_key
+                logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
+        else:
+            logger.info("Skipping orphaned PDF downloads (download_pdfs=False)")
+            for attachment_key in orphaned_pdfs:
+                self.article_data[attachment_key]["attachment_key"] = attachment_key
+                self.article_data[attachment_key]["filename"] = (
+                    self.article_data[attachment_key].get("Title", attachment_key)
+                )
 
-
-
-
-
+    def _process_item_pdfs(self, item_attachments: Dict[str, str]) -> None:
+        """Download or record regular item PDF attachments."""
+        if self.download_pdfs:
+            logger.info(
+                "Downloading %d regular item PDFs in parallel", len(item_attachments)
             )
-
-
-
-
-
-
-            temp_file_path = temp_file.name
-
-        content_disp = response.headers.get("Content-Disposition", "")
-        filename = (
-            content_disp.split("filename=")[-1].strip('"')
-            if "filename=" in content_disp
-            else "downloaded.pdf"
+            results = download_pdfs_in_parallel(
+                self.session,
+                self.cfg.user_id,
+                self.cfg.api_key,
+                item_attachments,
+                chunk_size=getattr(self.cfg, "chunk_size", None),
             )
+        else:
+            logger.info("Skipping regular PDF downloads (download_pdfs=False)")
+            results = {}
+            for attachment_key, item_key in item_attachments.items():
+                self.article_data[item_key]["attachment_key"] = attachment_key
+        for item_key, (file_path, filename, attachment_key) in results.items():
+            self.article_data[item_key]["filename"] = filename
+            self.article_data[item_key]["pdf_url"] = file_path
+            self.article_data[item_key]["attachment_key"] = attachment_key
+            logger.info("Downloaded Zotero PDF to: %s", file_path)
 
-        return temp_file_path, filename
-
-        except Exception as e:
-            logger.error(
-                "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
-            )
-            return None
-
-    def _download_pdfs_in_parallel(
-        self, attachment_item_map: Dict[str, str]
-    ) -> Dict[str, Tuple[str, str, str]]:
-        """
-        Download multiple PDFs in parallel using ThreadPoolExecutor.
-
-        Args:
-            attachment_item_map: Dictionary mapping attachment keys to parent item keys
-
-        Returns:
-            Dictionary mapping parent item keys to (file_path, filename, attachment_key)
-        """
-        results = {}
-        max_workers = min(10, len(attachment_item_map))  # Set reasonable limit
-
-        if not attachment_item_map:
-            return results
-
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # Create a dictionary mapping Future objects to attachment keys
-            future_to_key = {
-                executor.submit(self._download_zotero_pdf, attachment_key): (
-                    attachment_key,
-                    item_key,
-                )
-                for attachment_key, item_key in attachment_item_map.items()
-            }
-
-            for future in concurrent.futures.as_completed(future_to_key):
-                attachment_key, item_key = future_to_key[future]
-                try:
-                    result = future.result()
-                    if result:
-                        temp_file_path, resolved_filename = result
-                        results[item_key] = (
-                            temp_file_path,
-                            resolved_filename,
-                            attachment_key,
-                        )
-                except Exception as e:
-                    logger.error(
-                        "Failed to download PDF for key %s: %s", attachment_key, e
-                    )
-
-        return results
-
-    # pylint: disable=too-many-locals, too-many-branches
     def _filter_and_format_papers(self, items: List[Dict[str, Any]]) -> None:
         """Filter and format papers from Zotero items, including standalone PDFs."""
         filter_item_types = (
@@ -196,8 +183,7 @@ class ZoteroSearchData:
         logger.debug("Filtering item types: %s", filter_item_types)
 
         # Maps to track attachments for batch processing
-        orphaned_pdfs = {}  # attachment_key -> item key (same for orphans)
-        item_attachments = {}  # item_key -> [attachment_keys]
+        orphaned_pdfs: Dict[str, str] = {}  # attachment_key -> item key (same for orphans)
 
         # First pass: process all items without downloading PDFs
         for item in items:
@@ -263,59 +249,16 @@ class ZoteroSearchData:
                 "source": "zotero",
             }
 
-            #
-
-            if item_data["Type"] != "orphan_attachment":
-                try:
-                    children = self.zot.children(item_key)
-                    pdf_attachments = [
-                        child
-                        for child in children
-                        if isinstance(child, dict)
-                        and child.get("data", {}).get("contentType")
-                        == "application/pdf"
-                    ]
-
-                    if pdf_attachments:
-                        attachment = pdf_attachments[0]
-                        attachment_data = attachment.get("data", {})
-                        attachment_key = attachment_data.get("key")
-                        filename = attachment_data.get("filename", "unknown.pdf")
+        # Collect and process attachments
+        item_attachments = self._collect_item_attachments()
 
-
-
-                        item_attachments[attachment_key] = item_key
-                        # Add basic info
-                        self.article_data[item_key]["filename"] = filename
-                except Exception as e:
-                    logger.error(
-                        "Failed to get attachments for item %s: %s", item_key, e
-                    )
+        # Process orphaned PDFs
+        self._process_orphaned_pdfs(orphaned_pdfs)
 
-        #
-
-        orphan_results = self._download_pdfs_in_parallel(orphaned_pdfs)
-
-        # Update orphan data
-        for item_key, (file_path, filename, attachment_key) in orphan_results.items():
-            self.article_data[item_key]["filename"] = filename
-            self.article_data[item_key]["pdf_url"] = file_path
-            self.article_data[item_key]["attachment_key"] = attachment_key
-            logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
-
-        # Download regular item attachments
-        logger.info(
-            "Downloading %d regular item PDFs in parallel", len(item_attachments)
-        )
-        item_results = self._download_pdfs_in_parallel(item_attachments)
-
-        # Update item data
-        for item_key, (file_path, filename, attachment_key) in item_results.items():
-            self.article_data[item_key]["filename"] = filename
-            self.article_data[item_key]["pdf_url"] = file_path
-            self.article_data[item_key]["attachment_key"] = attachment_key
-            logger.info("Downloaded Zotero PDF to: %s", file_path)
+        # Process regular item PDFs
+        self._process_item_pdfs(item_attachments)
 
+        # Ensure we have some results
         if not self.article_data:
             logger.error(
                 "No matching papers returned from Zotero for query: '%s'", self.query
--- /dev/null
+++ b/aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Utility functions for downloading PDFs from Zotero.
+"""
+
+import logging
+import tempfile
+from typing import Optional, Tuple, Dict
+import concurrent.futures
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+def download_zotero_pdf(
+    session: requests.Session,
+    user_id: str,
+    api_key: str,
+    attachment_key: str,
+    **kwargs,
+) -> Optional[Tuple[str, str]]:
+    """
+    Download a PDF from Zotero by attachment key.
+
+    Args:
+        session: requests.Session for HTTP requests.
+        user_id: Zotero user ID.
+        api_key: Zotero API key.
+        attachment_key: Zotero attachment item key.
+        kwargs:
+            timeout (int): Request timeout in seconds (default: 10).
+            chunk_size (int, optional): Chunk size for streaming.
+
+    Returns:
+        Tuple of (local_file_path, filename) if successful, else None.
+    """
+    # Extract optional parameters
+    timeout = kwargs.get("timeout", 10)
+    chunk_size = kwargs.get("chunk_size")
+    # Log configured parameters for verification
+    logger.info("download_zotero_pdf params -> timeout=%s, chunk_size=%s", timeout, chunk_size)
+    # Log download start
+    logger.info(
+        "Downloading Zotero PDF for attachment %s from Zotero API", attachment_key
+    )
+    zotero_pdf_url = (
+        f"https://api.zotero.org/users/{user_id}/items/" f"{attachment_key}/file"
+    )
+    headers = {"Zotero-API-Key": api_key}
+
+    try:
+        response = session.get(
+            zotero_pdf_url, headers=headers, stream=True, timeout=timeout
+        )
+        response.raise_for_status()
+
+        # Download to a temporary file first
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                temp_file.write(chunk)
+            temp_file_path = temp_file.name
+        # Temp file written to %s
+        logger.info("Zotero PDF downloaded to temporary file: %s", temp_file_path)
+
+        # Determine filename from Content-Disposition header or default
+        if "filename=" in response.headers.get("Content-Disposition", ""):
+            filename = (
+                response.headers.get("Content-Disposition", "")
+                .split("filename=")[-1]
+                .strip('"')
+            )
+        else:
+            filename = "downloaded.pdf"
+
+        return temp_file_path, filename
+
+    except (requests.exceptions.RequestException, OSError) as e:
+        logger.error(
+            "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
+        )
+        return None
+
+
+def download_pdfs_in_parallel(
+    session: requests.Session,
+    user_id: str,
+    api_key: str,
+    attachment_item_map: Dict[str, str],
+    **kwargs,
+) -> Dict[str, Tuple[str, str, str]]:
+    """
+    Download multiple PDFs in parallel using ThreadPoolExecutor.
+
+    Args:
+        session: requests.Session for HTTP requests.
+        user_id: Zotero user ID.
+        api_key: Zotero API key.
+        attachment_item_map: Mapping of attachment_key to parent item_key.
+        kwargs:
+            max_workers (int, optional): Maximum number of worker threads (default: min(10, n)).
+            chunk_size (int, optional): Chunk size for streaming.
+
+    Returns:
+        Mapping of parent item_key to (local_file_path, filename, attachment_key).
+    """
+    # Extract optional parameters
+    max_workers = kwargs.get("max_workers")
+    chunk_size = kwargs.get("chunk_size")
+    # Log configured parameters for verification
+    logger.info(
+        "download_pdfs_in_parallel params -> max_workers=%s, chunk_size=%s",
+        max_workers,
+        chunk_size,
+    )
+    results: Dict[str, Tuple[str, str, str]] = {}
+    if not attachment_item_map:
+        return results
+
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=(
+            max_workers
+            if max_workers is not None
+            else min(10, len(attachment_item_map))
+        )
+    ) as executor:
+        future_to_keys = {
+            executor.submit(
+                download_zotero_pdf,
+                session,
+                user_id,
+                api_key,
+                attachment_key,
+                chunk_size=chunk_size,
+            ): (attachment_key, item_key)
+            for attachment_key, item_key in attachment_item_map.items()
+        }
+
+        for future in concurrent.futures.as_completed(future_to_keys):
+            attachment_key, item_key = future_to_keys[future]
+            try:
+                res = future.result()
+                if res:
+                    results[item_key] = (*res, attachment_key)
+            except (requests.exceptions.RequestException, OSError) as e:
+                logger.error("Failed to download PDF for key %s: %s", attachment_key, e)
+
+    return results
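The new module is usable on its own, outside the Zotero helper classes. A usage sketch grounded in the signatures above (IDs and keys are placeholders):

```python
# Usage sketch for the new zotero_pdf_downloader module.
import requests
from aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_pdf_downloader import (
    download_pdfs_in_parallel,
)

session = requests.Session()
attachment_map = {"ABCD1234": "ITEM5678"}  # attachment_key -> parent item_key
results = download_pdfs_in_parallel(
    session,
    "1234567",        # placeholder Zotero user ID
    "your-api-key",   # placeholder API key
    attachment_map,
    max_workers=4,    # optional kwargs forwarded to the executor/stream
    chunk_size=8192,
)
for item_key, (path, filename, attachment_key) in results.items():
    print(item_key, path, filename, attachment_key)
```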