nexus-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexus/__init__.py +8 -0
- nexus/cli.py +1914 -0
- nexus/integrations/__init__.py +0 -0
- nexus/knowledge/__init__.py +13 -0
- nexus/knowledge/search.py +233 -0
- nexus/knowledge/vault.py +662 -0
- nexus/research/__init__.py +12 -0
- nexus/research/pdf.py +497 -0
- nexus/research/zotero.py +521 -0
- nexus/teaching/__init__.py +14 -0
- nexus/teaching/courses.py +388 -0
- nexus/teaching/quarto.py +385 -0
- nexus/utils/__init__.py +0 -0
- nexus/utils/config.py +157 -0
- nexus/writing/__init__.py +12 -0
- nexus/writing/bibliography.py +339 -0
- nexus/writing/manuscript.py +397 -0
- nexus_cli-0.3.0.dist-info/METADATA +369 -0
- nexus_cli-0.3.0.dist-info/RECORD +21 -0
- nexus_cli-0.3.0.dist-info/WHEEL +4 -0
- nexus_cli-0.3.0.dist-info/entry_points.txt +2 -0
nexus/research/pdf.py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
"""PDF extraction and search for Nexus CLI."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class PDFDocument:
    """A PDF document with extracted content."""

    # Location of the PDF, stored as a string.
    path: str
    # File name component of ``path``.
    filename: str
    # Best-guess title; ``to_dict`` falls back to ``filename`` when empty.
    title: str = ""
    # Extracted text (may be empty).
    text: str = ""
    page_count: int = 0
    size_bytes: int = 0

    def to_dict(self) -> dict:
        """Convert to dictionary.

        The full text is not included; only a preview of at most 500
        characters is emitted, followed by ``"..."`` when it was truncated.

        Returns:
            Dictionary with ``path``, ``filename``, ``title``, ``page_count``,
            ``size_bytes`` and ``text_preview`` keys.
        """
        if len(self.text) > 500:
            preview = self.text[:500] + "..."
        else:
            preview = self.text

        return {
            "path": self.path,
            "filename": self.filename,
            "title": self.title or self.filename,
            "page_count": self.page_count,
            "size_bytes": self.size_bytes,
            "text_preview": preview,
        }
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class PDFSearchResult:
    """A search result from PDF content."""

    # Location of the matching PDF, stored as a string.
    path: str
    # File name component of ``path``.
    filename: str
    # Page number associated with the match.
    page: int
    # Snippet of text shown around the match.
    context: str
    # The text that matched the query.
    match_text: str = ""

    def to_dict(self) -> dict:
        """Convert to dictionary.

        Returns:
            Dictionary with ``path``, ``filename``, ``page``, ``context`` and
            ``match_text`` keys holding the corresponding field values.
        """
        return {
            "path": self.path,
            "filename": self.filename,
            "page": self.page,
            "context": self.context,
            "match_text": self.match_text,
        }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PDFExtractor:
|
|
55
|
+
"""Extract text from PDF files using pdftotext."""
|
|
56
|
+
|
|
57
|
+
def __init__(self, directories: list[Path] | None = None):
|
|
58
|
+
"""Initialize PDF extractor.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
directories: List of directories to search for PDFs
|
|
62
|
+
"""
|
|
63
|
+
self.directories = [Path(d).expanduser() for d in (directories or [])]
|
|
64
|
+
self._pdftotext_path = shutil.which("pdftotext")
|
|
65
|
+
|
|
66
|
+
def available(self) -> bool:
|
|
67
|
+
"""Check if pdftotext is available."""
|
|
68
|
+
return self._pdftotext_path is not None
|
|
69
|
+
|
|
70
|
+
def pdf_count(self) -> int:
|
|
71
|
+
"""Count total PDFs in configured directories."""
|
|
72
|
+
count = 0
|
|
73
|
+
for directory in self.directories:
|
|
74
|
+
if directory.exists():
|
|
75
|
+
count += len(list(directory.rglob("*.pdf")))
|
|
76
|
+
return count
|
|
77
|
+
|
|
78
|
+
def extract(
|
|
79
|
+
self,
|
|
80
|
+
pdf_path: Path,
|
|
81
|
+
pages: str | None = None,
|
|
82
|
+
layout: bool = False,
|
|
83
|
+
) -> PDFDocument:
|
|
84
|
+
"""Extract text from a PDF file.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
pdf_path: Path to PDF file
|
|
88
|
+
pages: Optional page range (e.g., "1-5" or "1,3,5")
|
|
89
|
+
layout: Preserve layout (slower but better for tables)
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
PDFDocument with extracted text
|
|
93
|
+
|
|
94
|
+
Raises:
|
|
95
|
+
FileNotFoundError: If PDF doesn't exist
|
|
96
|
+
RuntimeError: If pdftotext not available
|
|
97
|
+
"""
|
|
98
|
+
pdf_path = Path(pdf_path).expanduser()
|
|
99
|
+
|
|
100
|
+
if not pdf_path.exists():
|
|
101
|
+
raise FileNotFoundError(f"PDF not found: {pdf_path}")
|
|
102
|
+
|
|
103
|
+
if not self.available():
|
|
104
|
+
raise RuntimeError("pdftotext not installed. Run: brew install poppler")
|
|
105
|
+
|
|
106
|
+
# Build command
|
|
107
|
+
cmd = [self._pdftotext_path]
|
|
108
|
+
|
|
109
|
+
if layout:
|
|
110
|
+
cmd.append("-layout")
|
|
111
|
+
else:
|
|
112
|
+
# Use -raw for better text flow in multi-column papers
|
|
113
|
+
cmd.append("-raw")
|
|
114
|
+
|
|
115
|
+
# Handle page range
|
|
116
|
+
first_page = None
|
|
117
|
+
last_page = None
|
|
118
|
+
if pages:
|
|
119
|
+
if "-" in pages:
|
|
120
|
+
parts = pages.split("-")
|
|
121
|
+
first_page = int(parts[0])
|
|
122
|
+
last_page = int(parts[1]) if len(parts) > 1 and parts[1] else None
|
|
123
|
+
elif pages.isdigit():
|
|
124
|
+
first_page = int(pages)
|
|
125
|
+
last_page = int(pages)
|
|
126
|
+
|
|
127
|
+
if first_page:
|
|
128
|
+
cmd.extend(["-f", str(first_page)])
|
|
129
|
+
if last_page:
|
|
130
|
+
cmd.extend(["-l", str(last_page)])
|
|
131
|
+
|
|
132
|
+
cmd.extend([str(pdf_path), "-"])
|
|
133
|
+
|
|
134
|
+
try:
|
|
135
|
+
result = subprocess.run(
|
|
136
|
+
cmd,
|
|
137
|
+
capture_output=True,
|
|
138
|
+
text=True,
|
|
139
|
+
timeout=60,
|
|
140
|
+
)
|
|
141
|
+
text = result.stdout
|
|
142
|
+
except subprocess.TimeoutExpired:
|
|
143
|
+
text = "[Extraction timed out]"
|
|
144
|
+
except Exception as e:
|
|
145
|
+
text = f"[Extraction failed: {e}]"
|
|
146
|
+
|
|
147
|
+
# Clean and improve text quality
|
|
148
|
+
text = self._clean_text(text)
|
|
149
|
+
|
|
150
|
+
# Get page count using pdfinfo if available
|
|
151
|
+
page_count = self._get_page_count(pdf_path)
|
|
152
|
+
|
|
153
|
+
# Try to extract title intelligently
|
|
154
|
+
title = self._extract_title(text, pdf_path)
|
|
155
|
+
|
|
156
|
+
return PDFDocument(
|
|
157
|
+
path=str(pdf_path),
|
|
158
|
+
filename=pdf_path.name,
|
|
159
|
+
title=title,
|
|
160
|
+
text=text,
|
|
161
|
+
page_count=page_count,
|
|
162
|
+
size_bytes=pdf_path.stat().st_size,
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
def _clean_text(self, text: str) -> str:
|
|
166
|
+
"""Clean extracted text for better readability.
|
|
167
|
+
|
|
168
|
+
Args:
|
|
169
|
+
text: Raw extracted text
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
Cleaned text
|
|
173
|
+
"""
|
|
174
|
+
if not text or text.startswith("["):
|
|
175
|
+
return text
|
|
176
|
+
|
|
177
|
+
# Remove multiple spaces
|
|
178
|
+
text = re.sub(r" +", " ", text)
|
|
179
|
+
|
|
180
|
+
# Remove excessive newlines (more than 2)
|
|
181
|
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
182
|
+
|
|
183
|
+
# Fix hyphenated words split across lines
|
|
184
|
+
text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
|
|
185
|
+
|
|
186
|
+
# Remove soft hyphens
|
|
187
|
+
text = text.replace("\u00ad", "")
|
|
188
|
+
|
|
189
|
+
# Fix ligatures
|
|
190
|
+
text = text.replace("ff", "ff")
|
|
191
|
+
text = text.replace("fi", "fi")
|
|
192
|
+
text = text.replace("fl", "fl")
|
|
193
|
+
text = text.replace("ffi", "ffi")
|
|
194
|
+
text = text.replace("ffl", "ffl")
|
|
195
|
+
|
|
196
|
+
# Remove common PDF artifacts
|
|
197
|
+
text = re.sub(r"\(cid:\d+\)", "", text)
|
|
198
|
+
|
|
199
|
+
# Normalize whitespace
|
|
200
|
+
lines = []
|
|
201
|
+
for line in text.split("\n"):
|
|
202
|
+
line = line.strip()
|
|
203
|
+
if line:
|
|
204
|
+
lines.append(line)
|
|
205
|
+
|
|
206
|
+
return "\n".join(lines)
|
|
207
|
+
|
|
208
|
+
def _extract_title(self, text: str, pdf_path: Path) -> str:
|
|
209
|
+
"""Extract title from PDF text intelligently.
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
text: Extracted text
|
|
213
|
+
pdf_path: Path to PDF
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
Best guess at title
|
|
217
|
+
"""
|
|
218
|
+
if not text or text.startswith("["):
|
|
219
|
+
return pdf_path.stem
|
|
220
|
+
|
|
221
|
+
lines = text.strip().split("\n")
|
|
222
|
+
|
|
223
|
+
# Look for title in first few lines
|
|
224
|
+
for i, line in enumerate(lines[:10]):
|
|
225
|
+
line = line.strip()
|
|
226
|
+
|
|
227
|
+
# Skip empty lines
|
|
228
|
+
if not line:
|
|
229
|
+
continue
|
|
230
|
+
|
|
231
|
+
# Skip common headers (page numbers, running heads)
|
|
232
|
+
if re.match(r"^\d+$", line):
|
|
233
|
+
continue
|
|
234
|
+
if len(line) < 10:
|
|
235
|
+
continue
|
|
236
|
+
|
|
237
|
+
# Title is usually capitalized and reasonably long
|
|
238
|
+
if len(line) > 15 and len(line) < 200 and not line.endswith(".") and line[0].isupper():
|
|
239
|
+
# Clean up title
|
|
240
|
+
title = re.sub(r"\s+", " ", line)
|
|
241
|
+
|
|
242
|
+
# Remove trailing punctuation except ?!
|
|
243
|
+
title = re.sub(r"[,;:]$", "", title)
|
|
244
|
+
|
|
245
|
+
return title
|
|
246
|
+
|
|
247
|
+
# Fallback to filename
|
|
248
|
+
return pdf_path.stem
|
|
249
|
+
|
|
250
|
+
def _get_page_count(self, pdf_path: Path) -> int:
|
|
251
|
+
"""Get page count using pdfinfo."""
|
|
252
|
+
pdfinfo = shutil.which("pdfinfo")
|
|
253
|
+
if not pdfinfo:
|
|
254
|
+
return 0
|
|
255
|
+
|
|
256
|
+
try:
|
|
257
|
+
result = subprocess.run(
|
|
258
|
+
[pdfinfo, str(pdf_path)],
|
|
259
|
+
capture_output=True,
|
|
260
|
+
text=True,
|
|
261
|
+
timeout=10,
|
|
262
|
+
)
|
|
263
|
+
for line in result.stdout.split("\n"):
|
|
264
|
+
if line.startswith("Pages:"):
|
|
265
|
+
return int(line.split(":")[1].strip())
|
|
266
|
+
except Exception:
|
|
267
|
+
pass
|
|
268
|
+
|
|
269
|
+
return 0
|
|
270
|
+
|
|
271
|
+
def search(
|
|
272
|
+
self,
|
|
273
|
+
query: str,
|
|
274
|
+
limit: int = 20,
|
|
275
|
+
directories: list[Path] | None = None,
|
|
276
|
+
search_depth: int = 5,
|
|
277
|
+
) -> list[PDFSearchResult]:
|
|
278
|
+
"""Search for text in PDFs with improved context extraction.
|
|
279
|
+
|
|
280
|
+
This implementation uses a two-pass approach:
|
|
281
|
+
1. Quick filename matching
|
|
282
|
+
2. Deep content search with better context
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
query: Search query
|
|
286
|
+
limit: Maximum results
|
|
287
|
+
directories: Override directories to search
|
|
288
|
+
search_depth: Number of pages to search (default: 5)
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
List of PDFSearchResults with ranked matches
|
|
292
|
+
"""
|
|
293
|
+
search_dirs = directories or self.directories
|
|
294
|
+
results = []
|
|
295
|
+
|
|
296
|
+
# Compile case-insensitive pattern
|
|
297
|
+
pattern = re.compile(query, re.IGNORECASE)
|
|
298
|
+
|
|
299
|
+
# Pass 1: Quick filename matching (scored lower)
|
|
300
|
+
filename_matches = []
|
|
301
|
+
for directory in search_dirs:
|
|
302
|
+
if not directory.exists():
|
|
303
|
+
continue
|
|
304
|
+
|
|
305
|
+
for pdf_path in directory.rglob("*.pdf"):
|
|
306
|
+
if len(filename_matches) >= limit * 2: # Get extra for ranking
|
|
307
|
+
break
|
|
308
|
+
|
|
309
|
+
# Check filename
|
|
310
|
+
if pattern.search(pdf_path.name):
|
|
311
|
+
filename_matches.append(
|
|
312
|
+
(
|
|
313
|
+
PDFSearchResult(
|
|
314
|
+
path=str(pdf_path),
|
|
315
|
+
filename=pdf_path.name,
|
|
316
|
+
page=0,
|
|
317
|
+
context=f"Filename match: {pdf_path.name}",
|
|
318
|
+
match_text=query,
|
|
319
|
+
),
|
|
320
|
+
0.5, # Lower score for filename-only match
|
|
321
|
+
)
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
# Pass 2: Deep content search (scored higher)
|
|
325
|
+
content_matches = []
|
|
326
|
+
if self.available():
|
|
327
|
+
searched_count = 0
|
|
328
|
+
max_search = 50 # Limit to avoid long searches
|
|
329
|
+
|
|
330
|
+
for directory in search_dirs:
|
|
331
|
+
if not directory.exists():
|
|
332
|
+
continue
|
|
333
|
+
|
|
334
|
+
for pdf_path in directory.rglob("*.pdf"):
|
|
335
|
+
if searched_count >= max_search:
|
|
336
|
+
break
|
|
337
|
+
|
|
338
|
+
# Skip if already matched by filename
|
|
339
|
+
if any(r[0].path == str(pdf_path) for r in filename_matches):
|
|
340
|
+
continue
|
|
341
|
+
|
|
342
|
+
searched_count += 1
|
|
343
|
+
|
|
344
|
+
try:
|
|
345
|
+
# Extract first few pages
|
|
346
|
+
doc = self.extract(pdf_path, pages=f"1-{search_depth}")
|
|
347
|
+
|
|
348
|
+
# Find all matches in content
|
|
349
|
+
matches = list(pattern.finditer(doc.text))
|
|
350
|
+
|
|
351
|
+
if matches:
|
|
352
|
+
# Take the first/best match
|
|
353
|
+
match = matches[0]
|
|
354
|
+
|
|
355
|
+
# Extract smart context (sentence or paragraph)
|
|
356
|
+
context = self._extract_context(doc.text, match, window=150)
|
|
357
|
+
|
|
358
|
+
# Calculate relevance score based on:
|
|
359
|
+
# - Number of matches
|
|
360
|
+
# - Position in document
|
|
361
|
+
# - Match in title vs body
|
|
362
|
+
score = 1.0
|
|
363
|
+
if len(matches) > 1:
|
|
364
|
+
score += 0.1 * min(len(matches) - 1, 5)
|
|
365
|
+
if match.start() < 500: # Early in document
|
|
366
|
+
score += 0.2
|
|
367
|
+
if doc.title and query.lower() in doc.title.lower():
|
|
368
|
+
score += 0.3
|
|
369
|
+
|
|
370
|
+
content_matches.append(
|
|
371
|
+
(
|
|
372
|
+
PDFSearchResult(
|
|
373
|
+
path=str(pdf_path),
|
|
374
|
+
filename=pdf_path.name,
|
|
375
|
+
page=1, # Approximate
|
|
376
|
+
context=context,
|
|
377
|
+
match_text=match.group(),
|
|
378
|
+
),
|
|
379
|
+
score,
|
|
380
|
+
)
|
|
381
|
+
)
|
|
382
|
+
except Exception:
|
|
383
|
+
continue
|
|
384
|
+
|
|
385
|
+
# Combine and rank results
|
|
386
|
+
all_matches = content_matches + filename_matches
|
|
387
|
+
all_matches.sort(key=lambda x: x[1], reverse=True)
|
|
388
|
+
|
|
389
|
+
# Return top results without scores
|
|
390
|
+
return [r[0] for r in all_matches[:limit]]
|
|
391
|
+
|
|
392
|
+
def _extract_context(self, text: str, match: re.Match, window: int = 150) -> str:
|
|
393
|
+
"""Extract intelligent context around a match.
|
|
394
|
+
|
|
395
|
+
Tries to extract a complete sentence or paragraph.
|
|
396
|
+
|
|
397
|
+
Args:
|
|
398
|
+
text: Full text
|
|
399
|
+
match: Regex match object
|
|
400
|
+
window: Character window size
|
|
401
|
+
|
|
402
|
+
Returns:
|
|
403
|
+
Context string
|
|
404
|
+
"""
|
|
405
|
+
start = max(0, match.start() - window)
|
|
406
|
+
end = min(len(text), match.end() + window)
|
|
407
|
+
|
|
408
|
+
# Expand to sentence boundaries if possible
|
|
409
|
+
while start > 0 and text[start] not in ".!?\n":
|
|
410
|
+
start -= 1
|
|
411
|
+
if match.start() - start > window * 2:
|
|
412
|
+
break
|
|
413
|
+
|
|
414
|
+
while end < len(text) and text[end] not in ".!?\n":
|
|
415
|
+
end += 1
|
|
416
|
+
if end - match.end() > window * 2:
|
|
417
|
+
break
|
|
418
|
+
|
|
419
|
+
context = text[start:end].strip()
|
|
420
|
+
|
|
421
|
+
# Clean up
|
|
422
|
+
context = re.sub(r"\s+", " ", context)
|
|
423
|
+
|
|
424
|
+
# Add ellipsis if truncated
|
|
425
|
+
if start > 0:
|
|
426
|
+
context = "..." + context
|
|
427
|
+
if end < len(text):
|
|
428
|
+
context = context + "..."
|
|
429
|
+
|
|
430
|
+
return context
|
|
431
|
+
|
|
432
|
+
def list_pdfs(self, limit: int = 100) -> list[dict]:
|
|
433
|
+
"""List all PDFs in configured directories.
|
|
434
|
+
|
|
435
|
+
Args:
|
|
436
|
+
limit: Maximum files to list
|
|
437
|
+
|
|
438
|
+
Returns:
|
|
439
|
+
List of PDF info dicts
|
|
440
|
+
"""
|
|
441
|
+
pdfs = []
|
|
442
|
+
|
|
443
|
+
for directory in self.directories:
|
|
444
|
+
if not directory.exists():
|
|
445
|
+
continue
|
|
446
|
+
|
|
447
|
+
for pdf_path in directory.rglob("*.pdf"):
|
|
448
|
+
if len(pdfs) >= limit:
|
|
449
|
+
break
|
|
450
|
+
|
|
451
|
+
try:
|
|
452
|
+
stat = pdf_path.stat()
|
|
453
|
+
pdfs.append(
|
|
454
|
+
{
|
|
455
|
+
"path": str(pdf_path),
|
|
456
|
+
"filename": pdf_path.name,
|
|
457
|
+
"size_bytes": stat.st_size,
|
|
458
|
+
"modified": stat.st_mtime,
|
|
459
|
+
"directory": str(pdf_path.parent),
|
|
460
|
+
}
|
|
461
|
+
)
|
|
462
|
+
except Exception:
|
|
463
|
+
continue
|
|
464
|
+
|
|
465
|
+
# Sort by modification time, newest first
|
|
466
|
+
pdfs.sort(key=lambda x: x.get("modified", 0), reverse=True)
|
|
467
|
+
|
|
468
|
+
return pdfs[:limit]
|
|
469
|
+
|
|
470
|
+
def summarize_directories(self) -> list[dict]:
|
|
471
|
+
"""Summarize PDF counts per directory.
|
|
472
|
+
|
|
473
|
+
Returns:
|
|
474
|
+
List of directory summaries
|
|
475
|
+
"""
|
|
476
|
+
summaries = []
|
|
477
|
+
|
|
478
|
+
for directory in self.directories:
|
|
479
|
+
if directory.exists():
|
|
480
|
+
count = len(list(directory.rglob("*.pdf")))
|
|
481
|
+
summaries.append(
|
|
482
|
+
{
|
|
483
|
+
"directory": str(directory),
|
|
484
|
+
"count": count,
|
|
485
|
+
"exists": True,
|
|
486
|
+
}
|
|
487
|
+
)
|
|
488
|
+
else:
|
|
489
|
+
summaries.append(
|
|
490
|
+
{
|
|
491
|
+
"directory": str(directory),
|
|
492
|
+
"count": 0,
|
|
493
|
+
"exists": False,
|
|
494
|
+
}
|
|
495
|
+
)
|
|
496
|
+
|
|
497
|
+
return summaries
|