aiagents4pharma 1.40.0__py3-none-any.whl → 1.41.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
  2. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
  3. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
  4. aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
  5. aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
  6. aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
  7. aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
  8. aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
  9. aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
  10. aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
  11. aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
  12. aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
  13. aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
  14. aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
  15. aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
  16. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
  17. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
  18. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
  19. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
  20. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
  21. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
  22. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  23. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
  24. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  25. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  26. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
  27. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
  35. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
  36. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
  37. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
  38. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
  39. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
  40. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
  41. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/METADATA +27 -115
  42. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/RECORD +45 -23
  43. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
  44. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/WHEEL +0 -0
  45. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/licenses/LICENSE +0 -0
  46. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/top_level.txt +0 -0
--- a/aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py
+++ b/aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py
@@ -1,5 +1,8 @@
 """
-Vectorstore class for managing document embeddings and retrieval.
+Vectorstore class for managing PDF embeddings with Milvus.
+Manages GPU normalization and similarity search and MMR operations.
+With automatic handling of COSINE to IP conversion for GPU compatibility.
+Supports both GPU and CPU configurations.
 """
 
 import logging
@@ -7,13 +10,18 @@ import os
 import time
 from typing import Any, Dict, List, Optional
 
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.vectorstores import FAISS
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.vectorstores import VectorStore
+from langchain_milvus import Milvus
 
+from .collection_manager import ensure_collection_exists
+from .gpu_detection import (
+    detect_nvidia_gpu,
+    get_optimal_index_config,
+    log_index_configuration,
+)
+from .singleton_manager import VectorstoreSingleton
+from .vector_normalization import wrap_embedding_model_if_needed
 
 # Set up logging with configurable level
 log_level = os.environ.get("LOG_LEVEL", "INFO")
@@ -24,8 +32,8 @@ logger.setLevel(getattr(logging, log_level))
 
 class Vectorstore:
     """
-    A class for managing document embeddings and retrieval.
-    Provides unified access to documents across multiple papers.
+    Enhanced Vectorstore class with GPU normalization support.
+    Automatically handles COSINE -> IP conversion for GPU compatibility.
     """
 
     def __init__(
@@ -35,13 +43,13 @@ class Vectorstore:
         config: Any = None,
     ):
         """
-        Initialize the document store.
+        Initialize the document store with Milvus and GPU optimization.
 
         Args:
            embedding_model: The embedding model to use
-           metadata_fields: Fields to include in document metadata for filtering/retrieval
+           metadata_fields: Fields to include in document metadata
+           config: Configuration object containing Milvus connection details
         """
-        self.embedding_model = embedding_model
         self.config = config
         self.metadata_fields = metadata_fields or [
            "title",
@@ -50,113 +58,286 @@ class Vectorstore:
            "chunk_id",
        ]
        self.initialization_time = time.time()
-        logger.info("Vectorstore initialized at: %s", self.initialization_time)
+
+        # GPU detection with config override (SINGLE CALL)
+        self.has_gpu = detect_nvidia_gpu(config)
+
+        # Additional check for force CPU mode
+        if (
+            config
+            and hasattr(config, "gpu_detection")
+            and getattr(config.gpu_detection, "force_cpu_mode", False)
+        ):
+            logger.info("Running in forced CPU mode (config override)")
+            self.has_gpu = False
+
+        # Determine if we want to use COSINE similarity
+        self.use_cosine = True  # Default preference
+        if config and hasattr(config, "similarity_metric"):
+            self.use_cosine = getattr(config.similarity_metric, "use_cosine", True)
+
+        # Wrap embedding model with normalization if needed for GPU
+        self.original_embedding_model = embedding_model
+        self.embedding_model = wrap_embedding_model_if_needed(
+            embedding_model, self.has_gpu, self.use_cosine
+        )
+
+        # Configure index parameters AFTER determining GPU usage and normalization
+        embedding_dim = config.milvus.embedding_dim if config else 768
+        self.index_params, self.search_params = get_optimal_index_config(
+            self.has_gpu, embedding_dim, self.use_cosine
+        )
+
+        # Log the configuration
+        log_index_configuration(self.index_params, self.search_params, self.use_cosine)
 
        # Track loaded papers to prevent duplicate loading
        self.loaded_papers = set()
-        self.vector_store_class = FAISS
-        logger.info("Using FAISS vector store")
 
-        # Store for initialized documents
+        # Initialize Milvus connection parameters with environment variable fallback
+        self.connection_args = {
+            "host": (
+                config.milvus.host if config else os.getenv("MILVUS_HOST", "127.0.0.1")
+            ),
+            "port": (
+                config.milvus.port if config else int(os.getenv("MILVUS_PORT", "19530"))
+            ),
+        }
+        # Log the connection parameters being used
+        logger.info(
+            "Using Milvus connection: %s:%s",
+            self.connection_args["host"],
+            self.connection_args["port"],
+        )
+        self.collection_name = (
+            config.milvus.collection_name if config else "pdf_rag_documents"
+        )
+        self.db_name = config.milvus.db_name if config else "pdf_rag_db"
+
+        # Get singleton instance
+        self._singleton = VectorstoreSingleton()
+
+        # Connect to Milvus (reuses existing connection if available)
+        self._connect_milvus()
+
+        # Create collection with proper metric type
+        self.collection = ensure_collection_exists(
+            self.collection_name, self.config, self.index_params, self.has_gpu
+        )
+
+        # Initialize the LangChain Milvus vector store
+        self.vector_store = self._initialize_vector_store()
+
+        # Load existing papers AFTER vector store is ready
+        self._load_existing_paper_ids()
+
+        # CRITICAL: Load collection into memory/GPU after any existing data is identified
+        logger.info(
+            "Calling _ensure_collection_loaded() for %s processing...",
+            "GPU" if self.has_gpu else "CPU",
+        )
+        self._ensure_collection_loaded()
+
+        # Store for document metadata (keeping for compatibility)
        self.documents: Dict[str, Document] = {}
-        self.vector_store: Optional[VectorStore] = None
        self.paper_metadata: Dict[str, Dict[str, Any]] = {}
-        # Cache for document chunk embeddings to avoid recomputation
-        self.embeddings: Dict[str, Any] = {}
 
-    def add_paper(
-        self,
-        paper_id: str,
-        pdf_url: str,
-        paper_metadata: Dict[str, Any],
-    ) -> None:
-        """
-        Add a paper to the document store.
+        # Log final configuration
+        metric_info = (
+            "IP (normalized for COSINE)"
+            if self.has_gpu and self.use_cosine
+            else self.index_params["metric_type"]
+        )
 
-        Args:
-            paper_id: Unique identifier for the paper
-            pdf_url: URL to the PDF
-            paper_metadata: Metadata about the paper
-        """
-        # Skip if already loaded
-        if paper_id in self.loaded_papers:
-            logger.info("Paper %s already loaded, skipping", paper_id)
-            return
+        logger.info(
+            "Milvus vector store initialized with collection: %s (GPU: %s, Metric: %s)",
+            self.collection_name,
+            "enabled" if self.has_gpu else "disabled",
+            metric_info,
+        )
+
+    def _connect_milvus(self) -> None:
+        """Establish connection to Milvus server using singleton."""
+        self._singleton.get_connection(
+            self.connection_args["host"], self.connection_args["port"], self.db_name
+        )
+
+    def _initialize_vector_store(self) -> Milvus:
+        """Initialize or load the Milvus vector store with proper embedding model."""
+        # Use the wrapped embedding model (with normalization if needed)
+        vector_store = self._singleton.get_vector_store(
+            self.collection_name, self.embedding_model, self.connection_args
+        )
+
+        return vector_store
 
-        logger.info("Loading paper %s from %s", paper_id, pdf_url)
+    def _load_existing_paper_ids(self):
+        """Load already embedded paper IDs using LangChain's collection access."""
+        logger.info("Checking for existing papers via LangChain collection...")
 
-        # Store paper metadata
-        self.paper_metadata[paper_id] = paper_metadata
+        # Access the collection through LangChain's wrapper
+        langchain_collection = getattr(self.vector_store, "col", None)
 
-        # Load the PDF and split into chunks according to Hydra config
-        loader = PyPDFLoader(pdf_url)
-        documents = loader.load()
-        logger.info("Loaded %d pages from %s", len(documents), paper_id)
+        if langchain_collection is None:
+            langchain_collection = getattr(self.vector_store, "collection", None)
 
-        # Create text splitter according to provided configuration
-        if self.config is None:
-            raise ValueError(
-                "Configuration is required for text splitting in Vectorstore."
+        if langchain_collection is None:
+            logger.warning(
+                "No LangChain collection found, proceeding with empty loaded_papers"
            )
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=self.config.chunk_size,
-            chunk_overlap=self.config.chunk_overlap,
-            separators=["\n\n", "\n", ". ", " ", ""],
-        )
+            return
+
+        # Force flush and check entity count
+        langchain_collection.flush()
+        num_entities = langchain_collection.num_entities
 
-        # Split documents and add metadata for each chunk
-        chunks = splitter.split_documents(documents)
-        logger.info("Split %s into %d chunks", paper_id, len(chunks))
-        # Embed and cache chunk embeddings
-        chunk_texts = [chunk.page_content for chunk in chunks]
-        chunk_embeddings = self.embedding_model.embed_documents(chunk_texts)
-        logger.info("Embedded %d chunks for paper %s", len(chunk_embeddings), paper_id)
-
-        # Enhance document metadata
-        for i, chunk in enumerate(chunks):
-            # Add paper metadata to each chunk
-            chunk.metadata.update(
-                {
-                    "paper_id": paper_id,
-                    "title": paper_metadata.get("Title", "Unknown"),
-                    "chunk_id": i,
-                    # Keep existing page number if available
-                    "page": chunk.metadata.get("page", 0),
-                }
+        logger.info("LangChain collection entity count: %d", num_entities)
+
+        if num_entities > 0:
+            logger.info("Loading existing paper IDs from LangChain collection...")
+
+            results = langchain_collection.query(
+                expr="",  # No filter - get all
+                output_fields=["paper_id"],
+                limit=16384,  # Max limit
+                consistency_level="Strong",
            )
 
-            # Add any additional metadata fields
-            for field in self.metadata_fields:
-                if field in paper_metadata and field not in chunk.metadata:
-                    chunk.metadata[field] = paper_metadata[field]
+            # Extract unique paper IDs
+            existing_paper_ids = set(result["paper_id"] for result in results)
+            self.loaded_papers.update(existing_paper_ids)
 
-            # Store chunk
-            doc_id = f"{paper_id}_{i}"
-            self.documents[doc_id] = chunk
-            # Cache embedding if available
-            if chunk_embeddings[i] is not None:
-                self.embeddings[doc_id] = chunk_embeddings[i]
+            logger.info("Found %d unique papers in collection", len(existing_paper_ids))
+        else:
+            logger.info("Collection is empty - no existing papers")
 
-        # Mark as loaded to prevent duplicate loading
-        self.loaded_papers.add(paper_id)
-        logger.info("Added %d chunks from paper %s", len(chunks), paper_id)
+    def similarity_search(self, query: str, **kwargs: Any) -> List[Document]:
+        """
+        Perform similarity search on the vector store.
+        Query embedding will be automatically normalized if using GPU with COSINE.
+        Keyword args:
+            k: int = 4
+            filter: Optional[Dict[str, Any]] = None
+            plus any other kwargs to pass through to the underlying vector_store.
+        """
+        # Extract our parameters
+        k: int = kwargs.pop("k", 4)
+        filter_: Optional[Dict[str, Any]] = kwargs.pop("filter", None)
+
+        # Build Milvus expr from filter_, if present
+        expr = None
+        if filter_:
+            conditions = []
+            for key, value in filter_.items():
+                if isinstance(value, str):
+                    conditions.append(f'{key} == "{value}"')
+                elif isinstance(value, list):
+                    vals = ", ".join(
+                        f'"{v}"' if isinstance(v, str) else str(v) for v in value
+                    )
+                    conditions.append(f"{key} in [{vals}]")
+                else:
+                    conditions.append(f"{key} == {value}")
+            expr = " and ".join(conditions)
+
+        # Delegate to the wrapped store
+        return self.vector_store.similarity_search(
+            query=query, k=k, expr=expr, **kwargs
+        )
 
-    def build_vector_store(self) -> None:
+    def max_marginal_relevance_search(
+        self, query: str, **kwargs: Any
+    ) -> List[Document]:
        """
-        Build the vector store from all loaded documents.
-        Should be called after all papers are added.
+        Perform MMR search on the vector store.
+        Query embedding will be automatically normalized if using GPU with COSINE.
+        Keyword args:
+            k: int = 4
+            fetch_k: int = 20
+            lambda_mult: float = 0.5
+            filter: Optional[Dict[str, Any]] = None
+            plus any other kwargs to pass through.
        """
-        if not self.documents:
-            logger.warning("No documents added to build vector store")
-            return
+        # Extract our parameters
+        k: int = kwargs.pop("k", 4)
+        fetch_k: int = kwargs.pop("fetch_k", 20)
+        lambda_mult: float = kwargs.pop("lambda_mult", 0.5)
+        filter_: Optional[Dict[str, Any]] = kwargs.pop("filter", None)
 
-        if self.vector_store is not None:
-            logger.info("Vector store already built, skipping")
-            return
+        # Build Milvus expr from filter_, if present
+        expr = None
+        if filter_:
+            conditions = []
+            for key, value in filter_.items():
+                if isinstance(value, str):
+                    conditions.append(f'{key} == "{value}"')
+                elif isinstance(value, list):
+                    vals = ", ".join(
+                        f'"{v}"' if isinstance(v, str) else str(v) for v in value
+                    )
+                    conditions.append(f"{key} in [{vals}]")
+                else:
+                    conditions.append(f"{key} == {value}")
+            expr = " and ".join(conditions)
 
-        # Create vector store from documents
-        documents_list = list(self.documents.values())
-        self.vector_store = self.vector_store_class.from_documents(
-            documents=documents_list, embedding=self.embedding_model
+        # Delegate to the wrapped store
+        return self.vector_store.max_marginal_relevance_search(
+            query=query,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            expr=expr,
+            **kwargs,
        )
-        logger.info("Built vector store with %d documents", len(documents_list))
+
+    def _ensure_collection_loaded(self):
+        """Ensure collection is loaded into memory/GPU after data insertion."""
+        # Get the collection
+        collection = getattr(self.vector_store, "col", None)
+        if collection is None:
+            collection = getattr(self.vector_store, "collection", None)
+
+        if collection is None:
+            logger.warning("Cannot access collection for loading")
+            return
+
+        # Force flush to ensure we see all data
+        logger.info("Flushing collection to ensure data visibility...")
+        collection.flush()
+
+        # Check entity count after flush
+        num_entities = collection.num_entities
+        logger.info("Collection entity count after flush: %d", num_entities)
+
+        if num_entities > 0:
+            hardware_type = "GPU" if self.has_gpu else "CPU"
+            logger.info(
+                "Loading collection with %d entities into %s memory...",
+                num_entities,
+                hardware_type,
+            )
+
+            # Load collection into memory (CPU or GPU)
+            collection.load()
+
+            # Verify loading was successful
+            final_count = collection.num_entities
+            logger.info(
+                "Collection successfully loaded into %s memory with %d entities",
+                hardware_type,
+                final_count,
+            )
+        else:
+            logger.info("Collection is empty, skipping load operation")
+
+    def get_embedding_info(self) -> Dict[str, Any]:
+        """Get information about the embedding configuration."""
+        return {
+            "has_gpu": self.has_gpu,
+            "use_cosine": self.use_cosine,
+            "metric_type": self.index_params["metric_type"],
+            "index_type": self.index_params["index_type"],
+            "normalization_enabled": hasattr(self.embedding_model, "normalize_for_gpu"),
+            "original_model_type": type(self.original_embedding_model).__name__,
+            "wrapped_model_type": type(self.embedding_model).__name__,
+        }
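
The key mechanism in the rewritten vector_store.py lives in the `wrap_embedding_model_if_needed` helper imported above: Milvus GPU indexes generally accept the IP (inner product) metric but not COSINE, so when a GPU is detected and COSINE is requested, embeddings are L2-normalized and indexed under IP instead. For unit vectors the two metrics agree, which is the whole trick. A minimal sketch of that identity (illustrative code, not part of the package):

    import numpy as np

    # Illustrative only: after L2 normalization, inner product equals
    # cosine similarity, so an IP-metric index preserves COSINE rankings.
    def l2_normalize(v: np.ndarray) -> np.ndarray:
        """Scale a vector to unit length."""
        return v / np.linalg.norm(v)

    rng = np.random.default_rng(0)
    a, b = rng.random(768), rng.random(768)

    cosine = (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
    ip_normalized = l2_normalize(a) @ l2_normalize(b)
    assert np.isclose(cosine, ip_normalized)

The `filter` dict accepted by `similarity_search` and `max_marginal_relevance_search` is translated into a Milvus boolean `expr` the same way in both methods; for example, `{"paper_id": ["p1", "p2"], "page": 3}` becomes the expression `paper_id in ["p1", "p2"] and page == 3`.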
--- a/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
@@ -65,25 +65,28 @@ def get_multi_paper_recommendations(
     year: Optional[str] = None,
 ) -> Command[Any]:
     """
-    Return recommended papers based on multiple Semantic Scholar paper IDs.
+    Recommend related research papers using the Semantic Scholar API.
 
-    This tool accepts a list of Semantic Scholar paper IDs and returns a set of
-    recommended papers by aggregating related works (citations and references)
-    from each input paper.
+    This tool is designed to suggest relevant papers based on a list of
+    input Semantic Scholar paper IDs.
+
+    It fetches citations and references for each input paper and aggregates
+    them to generate a set of
+    recommended papers.
 
     Args:
         paper_ids (List[str]): List of 40-character Semantic Scholar paper IDs.
-            Provide at least two IDs.
+            Provide at least two IDs to improve the relevance of recommendations.
         tool_call_id (str): Internal tool call identifier injected by the system.
-        limit (int, optional): Maximum total number of recommendations to return. Defaults to 10.
-        year (str, optional): Publication year filter; supports formats: 'YYYY',
-            'YYYY-', '-YYYY', 'YYYY:YYYY'. Defaults to None.
+        limit (int, optional): Maximum number of recommendations to return. Defaults to 10.
+        year (str, optional): Filter recommendations by publication year.
+            Supports formats: 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'. Defaults to None.
 
     Returns:
         Command: A Command object containing:
            - multi_papers: List of recommended papers.
            - last_displayed_papers: Same list for display purposes.
-           - messages: List containing a ToolMessage with recommendations details.
+           - messages: List containing a ToolMessage with recommendation details.
     """
     # Create recommendation data object to organize variables
     rec_data = MultiPaperRecData(paper_ids, limit, year, tool_call_id)
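
The `year` filter formats documented here ('YYYY', 'YYYY-', '-YYYY', 'YYYY:YYYY') are shared with the single-paper tool below. A hypothetical validator sketch (not part of the package) that accepts exactly these four forms:

    import re

    # Hypothetical helper: matches 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'.
    YEAR_FILTER = re.compile(r"^(\d{4}|\d{4}-|-\d{4}|\d{4}:\d{4})$")

    def is_valid_year_filter(year: str) -> bool:
        return bool(YEAR_FILTER.match(year))

    assert is_valid_year_filter("2020")       # exactly 2020
    assert is_valid_year_filter("2020-")      # 2020 or later
    assert is_valid_year_filter("-2020")      # up to 2020
    assert is_valid_year_filter("2018:2022")  # inclusive range
    assert not is_valid_year_filter("20")     # too short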
--- a/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
@@ -87,7 +87,6 @@ class QueryDataFrameInput(BaseModel):
     "query_dataframe",
     args_schema=QueryDataFrameInput,
     parse_docstring=True,
-    return_direct=True,
 )
 def query_dataframe(
     question: str,
--- a/aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py
@@ -50,22 +50,23 @@ def retrieve_semantic_scholar_paper_id(
     tool_call_id: str,
 ) -> Command[Any]:
     """
-    Search for a paper by title on Semantic Scholar and return its unique paper ID.
+    Retrieve a Semantic Scholar paper ID using a paper title.
 
-    This tool issues a GET request to the Semantic Scholar API to find the best match
-    for the given paper title, then returns the paper's Semantic Scholar ID.
+    This tool searches Semantic Scholar for the best match to the provided paper title
+    and returns the corresponding unique paper ID. It is intended to support downstream
+    tasks such as recommendations, metadata lookups, or citation graph queries.
 
-    Use when you have a known title (full or partial) and need the Semantic Scholar ID
-    to fetch additional metadata or perform downstream lookups. Do not use this tool
-    for broad literature searches; for general search use the `search` tool.
+    Use this tool when you know the full or partial title of a paper and need its
+    Semantic Scholar ID.
+    For broad literature searches or topic-based queries, use a general `search` tool instead.
 
     Args:
-        paper_title (str): The title of the paper to look up.
+        paper_title (str): The full or partial title of the paper to look up.
         tool_call_id (str): LangGraph-injected identifier for this tool call.
 
     Returns:
         Command: A structured response containing a ToolMessage whose content is
-        the Semantic Scholar paper ID string (e.g., 'abc123xyz').
+            the Semantic Scholar paper ID string (e.g., 'abc123xyz').
 
     Raises:
         ValueError: If no matching paper is found for the given title.
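
The behavior the docstring describes (a best-match title search returning a paper ID) maps onto the public Semantic Scholar Graph API. A minimal standalone sketch of such a lookup (illustrative; the package's own request code is not part of this diff):

    import requests

    def lookup_paper_id(paper_title: str) -> str:
        # Query the public Graph API for the single best title match.
        response = requests.get(
            "https://api.semanticscholar.org/graph/v1/paper/search",
            params={"query": paper_title, "fields": "paperId,title", "limit": 1},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json().get("data", [])
        if not data:
            raise ValueError(f"No matching paper found for: {paper_title}")
        return data[0]["paperId"]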
--- a/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
@@ -60,17 +60,17 @@ def get_single_paper_recommendations(
     year: Optional[str] = None,
 ) -> Command[Any]:
     """
-    Return recommended papers for a single Semantic Scholar paper ID.
+    Recommend related research papers using the Semantic Scholar API for a single paper ID.
 
-    This tool accepts a single Semantic Scholar paper ID and returns related works
-    by aggregating citations and references.
+    This tool is designed to suggest relevant papers based on one input Semantic Scholar paper ID.
+    It fetches citations and references for the given paper and returns a set of recommended works.
 
     Args:
         paper_id (str): 40-character Semantic Scholar paper ID.
         tool_call_id (str): Internal tool call identifier injected by the system.
         limit (int, optional): Maximum number of recommendations to return. Defaults to 5.
-        year (str, optional): Publication year filter; supports 'YYYY', 'YYYY-',
-            '-YYYY', 'YYYY:YYYY'. Defaults to None.
+        year (str, optional): Filter recommendations by publication year.
+            Supports formats: 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'. Defaults to None.
 
     Returns:
         Command: A Command object containing: