nexus-cli 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nexus/research/pdf.py ADDED
@@ -0,0 +1,497 @@
1
+ """PDF extraction and search for Nexus CLI."""
2
+
3
+ import re
4
+ import shutil
5
+ import subprocess
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass
class PDFDocument:
    """Extracted content and basic metadata for a single PDF file."""

    path: str
    filename: str
    title: str = ""
    text: str = ""
    page_count: int = 0
    size_bytes: int = 0

    def to_dict(self) -> dict:
        """Return a serialisable summary of the document.

        The title falls back to the filename when empty, and the full text
        is replaced by a preview of at most 500 characters (followed by
        "..." when the text was longer than that).
        """
        preview = self.text
        if len(preview) > 500:
            preview = preview[:500] + "..."

        return dict(
            path=self.path,
            filename=self.filename,
            title=self.title or self.filename,
            page_count=self.page_count,
            size_bytes=self.size_bytes,
            text_preview=preview,
        )
31
+
32
+
33
@dataclass
class PDFSearchResult:
    """One hit produced by a PDF search, with its surrounding context."""

    path: str
    filename: str
    page: int
    context: str
    match_text: str = ""

    def to_dict(self) -> dict:
        """Return the result as a plain dictionary keyed by field name."""
        field_names = ("path", "filename", "page", "context", "match_text")
        return {name: getattr(self, name) for name in field_names}
52
+
53
+
54
class PDFExtractor:
    """Extract text from PDF files using the ``pdftotext`` command-line tool.

    The extractor is configured with a list of directories that are scanned
    recursively for PDF files. Text extraction shells out to ``pdftotext``
    (and ``pdfinfo`` for page counts); directory listing and filename search
    work even when those tools are not installed.
    """

    # Match the ".pdf" suffix in any letter case so that files such as
    # "scan.PDF" are found even where glob matching is case-sensitive.
    _PDF_GLOB = "*.[pP][dD][fF]"

    # Sentinel texts stored in PDFDocument.text when extraction fails.
    _TIMEOUT_MARKER = "[Extraction timed out]"
    _FAILURE_PREFIX = "[Extraction failed: "

    # Single-pass character fixes: drop soft hyphens and expand the Unicode
    # presentation-form ligatures that PDF fonts commonly emit.
    _CHAR_FIXES = str.maketrans(
        {
            "\u00ad": None,
            "\ufb00": "ff",
            "\ufb01": "fi",
            "\ufb02": "fl",
            "\ufb03": "ffi",
            "\ufb04": "ffl",
        }
    )

    def __init__(self, directories: list[Path] | None = None):
        """Initialize PDF extractor.

        Args:
            directories: List of directories to search for PDFs
        """
        self.directories = [Path(d).expanduser() for d in (directories or [])]
        self._pdftotext_path = shutil.which("pdftotext")

    def available(self) -> bool:
        """Check if pdftotext is available."""
        return self._pdftotext_path is not None

    def _iter_pdfs(self, directory: Path):
        """Yield every PDF path found recursively under ``directory``.

        Yields nothing when the directory does not exist.
        """
        if not directory.exists():
            return
        yield from directory.rglob(self._PDF_GLOB)

    def _iter_unique_pdfs(self, directories: list[Path]):
        """Yield PDFs under ``directories``, skipping paths already yielded.

        Overlapping directories (for example a folder and one of its
        sub-folders) would otherwise report the same file more than once.
        """
        seen = set()
        for directory in directories:
            for pdf_path in self._iter_pdfs(directory):
                key = str(pdf_path)
                if key not in seen:
                    seen.add(key)
                    yield pdf_path

    @classmethod
    def _is_error_marker(cls, text: str) -> bool:
        """Return True if ``text`` is one of the extraction-failure sentinels."""
        return text == cls._TIMEOUT_MARKER or text.startswith(cls._FAILURE_PREFIX)

    def pdf_count(self) -> int:
        """Count total PDFs in configured directories."""
        return sum(1 for _ in self._iter_unique_pdfs(self.directories))

    @staticmethod
    def _parse_page_ranges(pages: str | None) -> list[tuple[int | None, int | None]]:
        """Parse a page specification into ``(first, last)`` ranges.

        Accepts comma-separated items, each either a single page ("3") or a
        range ("1-5", "2-" for "from page 2", "-5" for "up to page 5").
        Items that cannot be parsed are ignored. When nothing usable remains
        the result is ``[(None, None)]``, meaning "the whole document".
        """
        ranges = []
        for part in (pages or "").split(","):
            part = part.strip()
            first_text, dash, last_text = part.partition("-")
            first_text = first_text.strip()
            last_text = last_text.strip()

            if dash:
                valid = bool(first_text or last_text) and all(
                    token == "" or token.isdecimal()
                    for token in (first_text, last_text)
                )
                if valid:
                    ranges.append(
                        (
                            int(first_text) if first_text else None,
                            int(last_text) if last_text else None,
                        )
                    )
            elif part.isdecimal():
                ranges.append((int(part), int(part)))

        return ranges or [(None, None)]

    def _run_pdftotext(self, cmd: list[str]) -> str:
        """Run one pdftotext command and return its output or a failure marker."""
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                # pdftotext emits UTF-8 by default; decode it explicitly so the
                # result does not depend on the locale, and never fail on a
                # stray undecodable byte.
                encoding="utf-8",
                errors="replace",
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            return self._TIMEOUT_MARKER
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            return f"{self._FAILURE_PREFIX}{e}]"

        # A non-zero exit with no output means the tool could not read the
        # file; surface its message instead of returning an empty document.
        if result.returncode != 0 and not result.stdout.strip():
            reason = result.stderr.strip() or (
                f"pdftotext exited with status {result.returncode}"
            )
            return f"{self._FAILURE_PREFIX}{reason}]"

        return result.stdout

    def extract(
        self,
        pdf_path: Path,
        pages: str | None = None,
        layout: bool = False,
    ) -> PDFDocument:
        """Extract text from a PDF file.

        Args:
            pdf_path: Path to PDF file
            pages: Optional page selection: a range ("1-5", "2-", "-5"), a
                single page ("3"), or a comma-separated list of those
                ("1,3,5"). Unparseable selections are ignored and the whole
                document is extracted.
            layout: Preserve layout (slower but better for tables)

        Returns:
            PDFDocument with extracted text. If extraction fails, ``text``
            holds a bracketed marker such as "[Extraction timed out]".

        Raises:
            FileNotFoundError: If PDF doesn't exist
            RuntimeError: If pdftotext not available
        """
        pdf_path = Path(pdf_path).expanduser()

        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        if not self.available():
            raise RuntimeError("pdftotext not installed. Run: brew install poppler")

        # -raw gives better text flow in multi-column papers than the default.
        base_cmd = [self._pdftotext_path, "-layout" if layout else "-raw"]

        # pdftotext only accepts one contiguous -f/-l range per invocation,
        # so a comma-separated selection is extracted one range at a time.
        chunks = []
        for first_page, last_page in self._parse_page_ranges(pages):
            cmd = list(base_cmd)
            if first_page:
                cmd.extend(["-f", str(first_page)])
            if last_page:
                cmd.extend(["-l", str(last_page)])
            cmd.extend([str(pdf_path), "-"])

            chunk = self._run_pdftotext(cmd)
            if self._is_error_marker(chunk):
                # Report the failure rather than a partial document.
                chunks = [chunk]
                break
            chunks.append(chunk)

        text = self._clean_text("\n".join(chunks))

        return PDFDocument(
            path=str(pdf_path),
            filename=pdf_path.name,
            title=self._extract_title(text, pdf_path),
            text=text,
            page_count=self._get_page_count(pdf_path),
            size_bytes=pdf_path.stat().st_size,
        )

    def _clean_text(self, text: str) -> str:
        """Clean extracted text for better readability.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text. Empty input and extraction-failure markers are
            returned unchanged.
        """
        if not text or self._is_error_marker(text):
            return text

        # Drop soft hyphens and expand ligatures (ff, fi, fl, ffi, ffl).
        text = text.translate(self._CHAR_FIXES)

        # Remove common PDF artifacts before collapsing spaces, so removing
        # one does not leave a double space behind.
        text = re.sub(r"\(cid:\d+\)", "", text)

        # Fix hyphenated words split across lines
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)

        # Remove multiple spaces
        text = re.sub(r" +", " ", text)

        # Trim every line and drop the empty ones; this also collapses any
        # run of blank lines, so no separate newline pass is needed.
        stripped = (line.strip() for line in text.split("\n"))
        return "\n".join(line for line in stripped if line)

    def _extract_title(self, text: str, pdf_path: Path) -> str:
        """Extract title from PDF text intelligently.

        The first of the leading ten lines that looks like a title wins:
        longer than 15 and shorter than 200 characters, starting with an
        uppercase letter and not ending with a full stop.

        Args:
            text: Extracted text
            pdf_path: Path to PDF

        Returns:
            Best guess at title, or the file stem when none is found
        """
        if not text or self._is_error_marker(text):
            return pdf_path.stem

        for raw_line in text.strip().split("\n")[:10]:
            line = raw_line.strip()

            # The length window also rejects blank lines, page numbers and
            # short running heads.
            if not 15 < len(line) < 200:
                continue
            if line.endswith(".") or not line[0].isupper():
                continue

            title = re.sub(r"\s+", " ", line)
            # Remove trailing punctuation except ?!
            return re.sub(r"[,;:]$", "", title)

        # Fallback to filename
        return pdf_path.stem

    def _get_page_count(self, pdf_path: Path) -> int:
        """Get page count using pdfinfo.

        Returns 0 when pdfinfo is not installed, fails, or reports no
        parseable "Pages:" line.
        """
        pdfinfo = shutil.which("pdfinfo")
        if not pdfinfo:
            return 0

        try:
            result = subprocess.run(
                [pdfinfo, str(pdf_path)],
                capture_output=True,
                encoding="utf-8",
                errors="replace",
                timeout=10,
            )
            for line in result.stdout.split("\n"):
                if line.startswith("Pages:"):
                    return int(line.partition(":")[2].strip())
        except (OSError, ValueError, subprocess.SubprocessError):
            # Page count is best-effort metadata; fall through to 0.
            pass

        return 0

    def search(
        self,
        query: str,
        limit: int = 20,
        directories: list[Path] | None = None,
        search_depth: int = 5,
        max_files: int = 50,
    ) -> list[PDFSearchResult]:
        """Search for text in PDFs with improved context extraction.

        This implementation uses a two-pass approach:
        1. Quick filename matching
        2. Deep content search with better context

        Args:
            query: Search query. Treated as a case-insensitive regular
                expression; if it is not a valid expression it is matched
                literally instead.
            limit: Maximum results
            directories: Override directories to search
            search_depth: Number of pages to search (default: 5)
            max_files: Maximum number of PDFs whose content is extracted
                (default: 50), to avoid long searches

        Returns:
            List of PDFSearchResults with ranked matches
        """
        if not query or limit <= 0:
            return []

        if directories:
            search_dirs = [Path(d).expanduser() for d in directories]
        else:
            search_dirs = self.directories

        try:
            pattern = re.compile(query, re.IGNORECASE)
        except re.error:
            # Queries such as "report(" are not valid regexes; match them
            # as plain text rather than failing the search.
            pattern = re.compile(re.escape(query), re.IGNORECASE)

        # Pass 1: Quick filename matching (scored lower)
        filename_matches = []
        matched_paths = set()
        for pdf_path in self._iter_unique_pdfs(search_dirs):
            if len(filename_matches) >= limit * 2:  # Get extra for ranking
                break

            if pattern.search(pdf_path.name):
                matched_paths.add(str(pdf_path))
                filename_matches.append(
                    (
                        PDFSearchResult(
                            path=str(pdf_path),
                            filename=pdf_path.name,
                            page=0,
                            context=f"Filename match: {pdf_path.name}",
                            match_text=query,
                        ),
                        0.5,  # Lower score for filename-only match
                    )
                )

        # Pass 2: Deep content search (scored higher)
        content_matches = []
        if self.available():
            searched_count = 0

            for pdf_path in self._iter_unique_pdfs(search_dirs):
                if searched_count >= max_files:
                    break

                # Skip if already matched by filename
                if str(pdf_path) in matched_paths:
                    continue

                searched_count += 1

                try:
                    # Extract first few pages
                    doc = self.extract(pdf_path, pages=f"1-{search_depth}")
                except (OSError, RuntimeError):
                    # File vanished or became unreadable; keep searching.
                    continue

                # Never report a hit inside an extraction-failure message.
                if self._is_error_marker(doc.text):
                    continue

                matches = list(pattern.finditer(doc.text))
                if not matches:
                    continue

                # Take the first match as the representative one
                match = matches[0]
                context = self._extract_context(doc.text, match, window=150)

                # Relevance: more matches, an early match and a title match
                # each raise the score.
                score = 1.0 + 0.1 * min(len(matches) - 1, 5)
                if match.start() < 500:  # Early in document
                    score += 0.2
                if doc.title and pattern.search(doc.title):
                    score += 0.3

                content_matches.append(
                    (
                        PDFSearchResult(
                            path=str(pdf_path),
                            filename=pdf_path.name,
                            page=1,  # Approximate
                            context=context,
                            match_text=match.group(),
                        ),
                        score,
                    )
                )

        # Combine and rank results
        all_matches = content_matches + filename_matches
        all_matches.sort(key=lambda x: x[1], reverse=True)

        # Return top results without scores
        return [r[0] for r in all_matches[:limit]]

    def _extract_context(self, text: str, match: re.Match, window: int = 150) -> str:
        """Extract intelligent context around a match.

        Starts from ``window`` characters either side of the match and
        widens each side to the nearest sentence or line boundary, giving
        up once a side has grown beyond ``2 * window`` characters.

        Args:
            text: Full text
            match: Regex match object
            window: Character window size

        Returns:
            Context string, with "..." on each side that was truncated
        """
        boundaries = ".!?\n"
        start = max(0, match.start() - window)
        end = min(len(text), match.end() + window)

        # Expand to sentence boundaries if possible
        while start > 0 and text[start] not in boundaries:
            start -= 1
            if match.start() - start > window * 2:
                break

        while end < len(text) and text[end] not in boundaries:
            end += 1
            if end - match.end() > window * 2:
                break

        truncated_left = start > 0
        truncated_right = end < len(text)

        # The boundary character found on the left terminates the previous
        # sentence; leave it out so the context does not begin with ". ".
        if start < match.start() and text[start] in boundaries:
            start += 1

        context = re.sub(r"\s+", " ", text[start:end].strip())

        # Add ellipsis if truncated
        if truncated_left:
            context = "..." + context
        if truncated_right:
            context = context + "..."

        return context

    def list_pdfs(self, limit: int = 100) -> list[dict]:
        """List all PDFs in configured directories.

        Args:
            limit: Maximum files to list

        Returns:
            List of PDF info dicts, newest modification time first
        """
        pdfs = []

        for pdf_path in self._iter_unique_pdfs(self.directories):
            try:
                stat = pdf_path.stat()
            except OSError:
                # Unreadable or vanished file; leave it out of the listing.
                continue

            pdfs.append(
                {
                    "path": str(pdf_path),
                    "filename": pdf_path.name,
                    "size_bytes": stat.st_size,
                    "modified": stat.st_mtime,
                    "directory": str(pdf_path.parent),
                }
            )

        # Sort before truncating so the result really is the newest files,
        # not whichever files the directory walk happened to reach first.
        pdfs.sort(key=lambda info: info["modified"], reverse=True)

        return pdfs[: max(limit, 0)]

    def summarize_directories(self) -> list[dict]:
        """Summarize PDF counts per directory.

        Returns:
            List of directory summaries
        """
        summaries = []

        for directory in self.directories:
            exists = directory.exists()
            count = sum(1 for _ in self._iter_pdfs(directory)) if exists else 0
            summaries.append(
                {
                    "directory": str(directory),
                    "count": count,
                    "exists": exists,
                }
            )

        return summaries