okb-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
okb/ingest.py ADDED
@@ -0,0 +1,1589 @@
1
+ """
2
+ Document ingestion pipeline with contextual chunking.
3
+
4
+ Collects documents, chunks them with context, generates embeddings via Modal,
5
+ and stores them in pgvector.
6
+
7
+ Usage:
8
+ python ingest.py ~/notes ~/projects/docs
9
+ python ingest.py ~/notes --metadata '{"project": "personal"}'
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import fnmatch
16
+ import hashlib
17
+ import json
18
+ import os
19
+ import re
20
+ import sys
21
+ from collections.abc import Generator
22
+ from dataclasses import dataclass, field
23
+ from datetime import UTC, datetime
24
+ from pathlib import Path
25
+
26
+ import psycopg
27
+ import yaml
28
+ from pgvector.psycopg import register_vector
29
+ from psycopg.rows import dict_row
30
+
31
+ from .config import config
32
+
33
+
34
+ def read_text_with_fallback(
35
+ path: Path, encodings: tuple[str, ...] = ("utf-8", "windows-1252", "latin-1")
36
+ ) -> str:
37
+ """Read text file trying multiple encodings in order."""
38
+ for encoding in encodings:
39
+ try:
40
+ return path.read_text(encoding=encoding)
41
+ except UnicodeDecodeError:
42
+ continue
43
+ # Last resort: read with errors replaced
44
+ return path.read_text(encoding="utf-8", errors="replace")
45
+
46
+
47
+ def matches_pattern(filename: str, patterns: list[str]) -> str | None:
48
+ """Check if filename matches any pattern. Returns matched pattern or None."""
49
+ for pattern in patterns:
50
+ if fnmatch.fnmatch(filename, pattern) or fnmatch.fnmatch(filename.lower(), pattern.lower()):
51
+ return pattern
52
+ return None
53
+
54
+
55
+ # Patterns for detecting secrets in content
56
+ SECRET_PATTERNS = [
57
+ (re.compile(r"-----BEGIN [A-Z ]* PRIVATE KEY-----"), "private key"),
58
+ (re.compile(r"AKIA[0-9A-Z]{16}"), "AWS access key"),
59
+ (re.compile(r"ghp_[a-zA-Z0-9]{36}"), "GitHub personal access token"),
60
+ (re.compile(r"gho_[a-zA-Z0-9]{36}"), "GitHub OAuth token"),
61
+ (re.compile(r"sk-[a-zA-Z0-9]{48}"), "OpenAI API key"),
62
+ (re.compile(r"sk-ant-api[a-zA-Z0-9-]{80,}"), "Anthropic API key"),
63
+ ]
64
+
65
+
66
+ def scan_content_for_secrets(content: str) -> str | None:
67
+ """Scan content for potential secrets. Returns description if found, None otherwise."""
68
+ # Only check first 10KB to avoid slow scans on large files
69
+ sample = content[:10240]
70
+ for pattern, description in SECRET_PATTERNS:
71
+ if pattern.search(sample):
72
+ return description
73
+ return None
74
+
75
+
76
+ def is_minified(content: str, max_line_length: int = 1000) -> bool:
77
+ """Detect if content appears to be minified JS/CSS."""
78
+ lines = content.split("\n", 10) # Only check first few lines
79
+ if not lines:
80
+ return False
81
+ # Check if any of the first lines is extremely long
82
+ for line in lines[:5]:
83
+ if len(line) > max_line_length:
84
+ # Also check it's not just a long string/comment - minified has lots of punctuation
85
+ if line.count(";") > 20 or line.count(",") > 50 or line.count("{") > 20:
86
+ return True
87
+ return False
88
+
89
+
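As an illustrative aside (not part of the shipped file), the two content checks above can be exercised directly; the key below is a placeholder built to match the AWS pattern, and the module is assumed to be importable as okb.ingest:

from okb.ingest import is_minified, scan_content_for_secrets

fake_key = "AKIA" + "X" * 16  # placeholder: "AKIA" plus 16 uppercase chars trips the AWS pattern
print(scan_content_for_secrets(f"aws_key = {fake_key}"))   # -> "AWS access key"
print(scan_content_for_secrets("nothing sensitive here"))  # -> None

# A single long, punctuation-heavy line reads as minified JS/CSS.
blob = ";".join(f"var x{i}=1" for i in range(200))
print(is_minified(blob))  # -> True
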
90
+ class FileSkipReason:
91
+ """Result of file skip check."""
92
+
93
+ def __init__(self, should_skip: bool, reason: str = "", is_security: bool = False):
94
+ self.should_skip = should_skip
95
+ self.reason = reason
96
+ self.is_security = is_security # True for blocked (security), False for skipped (low-value)
97
+
98
+
99
+ def check_file_skip(path: Path, content: str | None = None) -> FileSkipReason:
100
+ """
101
+ Check if a file should be skipped or blocked.
102
+
103
+ Returns FileSkipReason with details.
104
+ """
105
+ filename = path.name
106
+
107
+ # Check block patterns (security)
108
+ if matched := matches_pattern(filename, config.block_patterns):
109
+ return FileSkipReason(True, f"matches block pattern '{matched}'", is_security=True)
110
+
111
+ # Check skip patterns (low-value)
112
+ if matched := matches_pattern(filename, config.skip_patterns):
113
+ return FileSkipReason(True, f"matches skip pattern '{matched}'", is_security=False)
114
+
115
+ # Content-based checks (if content provided and scanning enabled)
116
+ if content is not None and config.scan_content:
117
+ # Check for secrets
118
+ if secret_type := scan_content_for_secrets(content):
119
+ return FileSkipReason(True, f"contains {secret_type}", is_security=True)
120
+
121
+ # Check for minified JS/CSS
122
+ if path.suffix in (".js", ".css") and is_minified(
123
+ content, config.max_line_length_for_minified
124
+ ):
125
+ return FileSkipReason(True, "appears to be minified", is_security=False)
126
+
127
+ return FileSkipReason(False)
128
+
129
+
130
+ @dataclass
131
+ class DocumentMetadata:
132
+ """Metadata extracted from document or provided externally."""
133
+
134
+ tags: list[str] = field(default_factory=list)
135
+ project: str | None = None
136
+ category: str | None = None
137
+ status: str | None = None
138
+ extra: dict = field(default_factory=dict)
139
+
140
+ @classmethod
141
+ def from_frontmatter(cls, frontmatter: dict) -> DocumentMetadata:
142
+ """Create from YAML frontmatter."""
143
+ extra = {
144
+ k: v
145
+ for k, v in frontmatter.items()
146
+ if k not in {"tags", "project", "category", "status"}
147
+ }
148
+ if doc_date := extract_document_date(frontmatter):
149
+ extra["document_date"] = doc_date
150
+ return cls(
151
+ tags=frontmatter.get("tags", []),
152
+ project=frontmatter.get("project"),
153
+ category=frontmatter.get("category"),
154
+ status=frontmatter.get("status"),
155
+ extra=extra,
156
+ )
157
+
158
+ def to_dict(self) -> dict:
159
+ """Convert to JSON-serializable dict."""
160
+ result = {}
161
+ if self.tags:
162
+ result["tags"] = self.tags
163
+ if self.project:
164
+ result["project"] = self.project
165
+ if self.category:
166
+ result["category"] = self.category
167
+ if self.status:
168
+ result["status"] = self.status
169
+ if self.extra:
170
+ result.update(self.extra)
171
+ return result
172
+
173
+
174
+ @dataclass
175
+ class Document:
176
+ """A document to be indexed."""
177
+
178
+ source_path: str
179
+ source_type: str
180
+ title: str
181
+ content: str
182
+ metadata: DocumentMetadata = field(default_factory=DocumentMetadata)
183
+ sections: list[tuple[str, str]] = field(default_factory=list) # (header, content)
184
+
185
+ # Structured fields for actionable items (tasks, events, emails)
186
+ due_date: datetime | None = None # Task deadlines
187
+ event_start: datetime | None = None # Calendar event start
188
+ event_end: datetime | None = None # Calendar event end
189
+ status: str | None = None # 'pending', 'completed', 'cancelled', etc.
190
+ priority: int | None = None # 1-5 scale (1=highest)
191
+
192
+
193
+ @dataclass
194
+ class Chunk:
195
+ """A chunk ready for embedding."""
196
+
197
+ content: str # Original text (for display)
198
+ embedding_text: str # Contextualized text (for embedding)
199
+ chunk_index: int
200
+ token_count: int
201
+ metadata: dict = field(default_factory=dict)
202
+
203
+
204
+ def content_hash(content: str) -> str:
205
+ """Generate hash for deduplication/change detection."""
206
+ return hashlib.sha256(content.encode()).hexdigest()[:16]
207
+
208
+
209
+ def extract_document_date(metadata: dict) -> str | None:
210
+ """Extract document date from frontmatter/metadata, trying common field names."""
211
+ date_fields = ["date", "created", "modified", "updated", "last_modified", "pubdate"]
212
+ for field_name in date_fields:
213
+ if value := metadata.get(field_name):
214
+ if hasattr(value, "isoformat"):
215
+ return value.isoformat()
216
+ if isinstance(value, str):
217
+ return value
218
+ return None
219
+
220
+
221
+ def extract_frontmatter(content: str) -> tuple[dict, str]:
222
+ """
223
+ Extract YAML frontmatter from markdown content.
224
+
225
+ Returns (frontmatter_dict, remaining_content).
226
+ """
227
+ if not content.startswith("---"):
228
+ return {}, content
229
+
230
+ # Find closing ---
231
+ end_match = re.search(r"\n---\s*\n", content[3:])
232
+ if not end_match:
233
+ return {}, content
234
+
235
+ frontmatter_text = content[3 : end_match.start() + 3]
236
+ remaining = content[end_match.end() + 3 :]
237
+
238
+ try:
239
+ frontmatter = yaml.safe_load(frontmatter_text) or {}
240
+ return frontmatter, remaining
241
+ except yaml.YAMLError:
242
+ return {}, content
243
+
244
+
245
+ def extract_sections_markdown(content: str) -> list[tuple[str, str]]:
246
+ """
247
+ Extract sections from markdown content.
248
+
249
+ Returns list of (header, section_content) tuples.
250
+ """
251
+ # Split by headers (any level)
252
+ parts = re.split(r"(^#{1,6}\s+.+$)", content, flags=re.MULTILINE)
253
+
254
+ sections = []
255
+ current_header = None
256
+
257
+ for part in parts:
258
+ if re.match(r"^#{1,6}\s+", part):
259
+ current_header = part.strip().lstrip("#").strip()
260
+ elif part.strip():
261
+ sections.append((current_header, part.strip()))
262
+
263
+ return sections
264
+
265
+
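A quick sketch of what the two markdown helpers return for a short note (illustrative only, assuming the package is importable as okb.ingest):

from okb.ingest import extract_frontmatter, extract_sections_markdown

note = """---
tags: [notes]
project: demo
---
# Title

Intro paragraph.

## Details

More text.
"""
meta, body = extract_frontmatter(note)
print(meta)                             # -> {'tags': ['notes'], 'project': 'demo'}
print(extract_sections_markdown(body))  # -> [('Title', 'Intro paragraph.'), ('Details', 'More text.')]
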
266
+ def extract_org_metadata(content: str) -> tuple[dict, str]:
267
+ """
268
+ Extract org-mode metadata from file header.
269
+
270
+ Parses #+KEY: value lines at the start of the file.
271
+ Returns (metadata_dict, remaining_content).
272
+ """
273
+ metadata = {}
274
+ lines = content.split("\n")
275
+ body_start = 0
276
+
277
+ for i, line in enumerate(lines):
278
+ match = re.match(r"^#\+(\w+):\s*(.*)$", line, re.IGNORECASE)
279
+ if match:
280
+ key = match.group(1).lower()
281
+ value = match.group(2).strip()
282
+ if key in metadata:
283
+ # Handle multiple values (e.g., multiple #+TAGS lines)
284
+ if isinstance(metadata[key], list):
285
+ metadata[key].append(value)
286
+ else:
287
+ metadata[key] = [metadata[key], value]
288
+ else:
289
+ metadata[key] = value
290
+ body_start = i + 1
291
+ elif line.strip() and not line.startswith("#"):
292
+ # Stop at first non-metadata, non-comment line
293
+ break
294
+
295
+ remaining = "\n".join(lines[body_start:])
296
+ return metadata, remaining
297
+
298
+
299
+ def extract_org_tags(header: str) -> tuple[str, list[str]]:
300
+ """
301
+ Extract tags from an org header line.
302
+
303
+ Org tags appear at end of header like: * Header text :tag1:tag2:
304
+ Returns (header_without_tags, list_of_tags).
305
+ """
306
+ match = re.search(r"\s+(:[:\w]+:)\s*$", header)
307
+ if match:
308
+ tag_str = match.group(1)
309
+ tags = [t for t in tag_str.split(":") if t]
310
+ header_clean = header[: match.start()].strip()
311
+ return header_clean, tags
312
+ return header, []
313
+
314
+
315
+ def extract_sections_org(content: str) -> list[tuple[str, str]]:
316
+ """
317
+ Extract sections from org-mode content.
318
+
319
+ Org headers use * (one or more) at start of line.
320
+ Returns list of (header, section_content) tuples.
321
+ """
322
+ # Split by org headers (any level)
323
+ parts = re.split(r"(^\*+\s+.+$)", content, flags=re.MULTILINE)
324
+
325
+ sections = []
326
+ current_header = None
327
+
328
+ for part in parts:
329
+ if re.match(r"^\*+\s+", part):
330
+ # Remove leading stars and any TODO keywords
331
+ header = re.sub(r"^\*+\s+", "", part)
332
+ # Remove common TODO keywords
333
+ header = re.sub(r"^(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+", "", header)
334
+ # Extract and remove tags
335
+ header, _ = extract_org_tags(header)
336
+ current_header = header.strip()
337
+ elif part.strip():
338
+ # Skip property drawers
339
+ clean_part = re.sub(r":PROPERTIES:.*?:END:", "", part, flags=re.DOTALL)
340
+ if clean_part.strip():
341
+ sections.append((current_header, clean_part.strip()))
342
+
343
+ return sections
344
+
345
+
346
+ # Org-mode TODO keywords (common defaults)
347
+ ORG_TODO_KEYWORDS = {"TODO", "DONE", "WAITING", "CANCELLED", "NEXT", "SOMEDAY"}
348
+ ORG_DONE_KEYWORDS = {"DONE", "CANCELLED"}
349
+
350
+
351
+ @dataclass
352
+ class OrgTodoItem:
353
+ """Represents a parsed org-mode TODO item."""
354
+
355
+ heading: str # The heading text (without stars, keyword, priority, tags)
356
+ raw_heading: str # Original heading line for source_path anchor
357
+ level: int # Number of stars
358
+ keyword: str | None # TODO, DONE, etc.
359
+ priority: str | None # A, B, C
360
+ tags: list[str]
361
+ deadline: datetime | None
362
+ scheduled: datetime | None
363
+ closed: datetime | None
364
+ content: str # Body text under this heading
365
+
366
+
367
+ def parse_org_timestamp(ts: str) -> datetime | None:
368
+ """Parse org-mode timestamp like <2024-01-15 Mon> or [2024-01-15 Mon 10:30]."""
369
+ # Strip brackets
370
+ ts = ts.strip("<>[]")
371
+ # Try various formats
372
+ formats = [
373
+ "%Y-%m-%d %a %H:%M", # <2024-01-15 Mon 10:30>
374
+ "%Y-%m-%d %a", # <2024-01-15 Mon>
375
+ "%Y-%m-%d %H:%M", # <2024-01-15 10:30>
376
+ "%Y-%m-%d", # <2024-01-15>
377
+ ]
378
+ for fmt in formats:
379
+ try:
380
+ dt = datetime.strptime(ts, fmt)
381
+ return dt.replace(tzinfo=UTC)
382
+ except ValueError:
383
+ continue
384
+ # Try just the date part
385
+ match = re.match(r"(\d{4}-\d{2}-\d{2})", ts)
386
+ if match:
387
+ try:
388
+ return datetime.strptime(match.group(1), "%Y-%m-%d").replace(tzinfo=UTC)
389
+ except ValueError:
390
+ pass
391
+ return None
392
+
393
+
394
+ def extract_org_todo_items(content: str) -> list[OrgTodoItem]:
395
+ """
396
+ Extract TODO items from org-mode content.
397
+
398
+ Parses headings with TODO keywords and extracts:
399
+ - Status (TODO/DONE/etc.)
400
+ - Priority ([#A]/[#B]/[#C])
401
+ - Tags (:tag1:tag2:)
402
+ - DEADLINE/SCHEDULED/CLOSED timestamps
403
+ - Body content
404
+ """
405
+ items = []
406
+ lines = content.split("\n")
407
+
408
+ i = 0
409
+ while i < len(lines):
410
+ line = lines[i]
411
+
412
+ # Match org heading with optional TODO keyword
413
+ # Pattern: *+ [KEYWORD] [#PRIORITY] Title :tags:
414
+ heading_match = re.match(
415
+ r"^(\*+)\s+" # Stars
416
+ r"(?:(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+)?" # Optional keyword
417
+ r"(?:\[#([ABC])\]\s+)?" # Optional priority
418
+ r"(.+)$", # Rest of heading
419
+ line,
420
+ )
421
+
422
+ if heading_match:
423
+ level = len(heading_match.group(1))
424
+ keyword = heading_match.group(2)
425
+ priority = heading_match.group(3)
426
+ rest = heading_match.group(4)
427
+
428
+ # Only process items with TODO keywords
429
+ if keyword:
430
+ # Extract tags from end of heading
431
+ heading_text, tags = extract_org_tags(rest)
432
+
433
+ # Collect body content until next heading of same or higher level
434
+ body_lines = []
435
+ deadline = None
436
+ scheduled = None
437
+ closed = None
438
+ i += 1
439
+
440
+ while i < len(lines):
441
+ next_line = lines[i]
442
+ # Check for next heading of same or higher level
443
+ next_heading = re.match(r"^(\*+)\s+", next_line)
444
+ if next_heading and len(next_heading.group(1)) <= level:
445
+ break
446
+
447
+ # Check for planning line (DEADLINE, SCHEDULED, CLOSED)
448
+ if re.match(r"^\s*(DEADLINE|SCHEDULED|CLOSED):", next_line):
449
+ if dl := re.search(r"DEADLINE:\s*(<[^>]+>)", next_line):
450
+ deadline = parse_org_timestamp(dl.group(1))
451
+ if sc := re.search(r"SCHEDULED:\s*(<[^>]+>)", next_line):
452
+ scheduled = parse_org_timestamp(sc.group(1))
453
+ if cl := re.search(r"CLOSED:\s*(\[[^\]]+\])", next_line):
454
+ closed = parse_org_timestamp(cl.group(1))
455
+ # Skip property drawers
456
+ elif next_line.strip() == ":PROPERTIES:":
457
+ while i < len(lines) and lines[i].strip() != ":END:":
458
+ i += 1
459
+ elif next_line.strip() and not next_line.strip().startswith(":"):
460
+ body_lines.append(next_line)
461
+
462
+ i += 1
463
+
464
+ items.append(
465
+ OrgTodoItem(
466
+ heading=heading_text.strip(),
467
+ raw_heading=line,
468
+ level=level,
469
+ keyword=keyword,
470
+ priority=priority,
471
+ tags=tags,
472
+ deadline=deadline,
473
+ scheduled=scheduled,
474
+ closed=closed,
475
+ content="\n".join(body_lines).strip(),
476
+ )
477
+ )
478
+ continue
479
+
480
+ i += 1
481
+
482
+ return items
483
+
484
+
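A minimal sketch of TODO extraction on an in-memory org snippet (illustrative only; the dates, tags, and headings are made up):

from okb.ingest import extract_org_todo_items, parse_org_timestamp

org = """* TODO [#A] Ship release :work:
DEADLINE: <2024-01-15 Mon>
Write the changelog first.
* DONE Update docs
"""
items = extract_org_todo_items(org)
print(items[0].heading, items[0].priority, items[0].tags)  # -> Ship release A ['work']
print(items[0].deadline)                                   # -> 2024-01-15 00:00:00+00:00
print(items[1].keyword)                                    # -> DONE

print(parse_org_timestamp("<2024-03-01 Fri 09:30>"))       # -> 2024-03-01 09:30:00+00:00
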
485
+ def org_todo_to_document(
486
+ item: OrgTodoItem,
487
+ file_path: Path,
488
+ file_metadata: DocumentMetadata,
489
+ ) -> Document:
490
+ """Convert an OrgTodoItem to a Document with structured fields."""
491
+ # Build org-mode link-style source path: file.org::*Heading
492
+ # Use the heading text (not raw) for cleaner anchors
493
+ anchor = f"*{item.keyword} {item.heading}" if item.keyword else f"*{item.heading}"
494
+ source_path = f"{file_path.resolve()}::{anchor}"
495
+
496
+ # Map org priority to numeric (A=1, B=2, C=3)
497
+ priority_map = {"A": 1, "B": 2, "C": 3}
498
+ priority = priority_map.get(item.priority) if item.priority else None
499
+ # SOMEDAY items get lowest priority
500
+ if item.keyword == "SOMEDAY":
501
+ priority = 5
502
+
503
+ # Map org keyword to status
504
+ status = "completed" if item.keyword in ORG_DONE_KEYWORDS else "pending"
505
+
506
+ # Use deadline or scheduled as due_date
507
+ due_date = item.deadline or item.scheduled
508
+
509
+ # Merge file tags with item tags
510
+ tags = list(file_metadata.tags) + item.tags
511
+
512
+ metadata = DocumentMetadata(
513
+ tags=tags,
514
+ project=file_metadata.project,
515
+ category=file_metadata.category,
516
+ )
517
+
518
+ # Build content with context
519
+ content_parts = [item.heading]
520
+ if item.content:
521
+ content_parts.append(item.content)
522
+ content = "\n\n".join(content_parts)
523
+
524
+ return Document(
525
+ source_path=source_path,
526
+ source_type="org-todo",
527
+ title=item.heading,
528
+ content=content,
529
+ metadata=metadata,
530
+ due_date=due_date,
531
+ status=status,
532
+ priority=priority,
533
+ )
534
+
535
+
536
+ def extract_code_context(content: str, file_ext: str) -> dict:
537
+ """
538
+ Extract structural context from code files.
539
+
540
+ Returns dict with classes, functions, imports found.
541
+ """
542
+ context = {
543
+ "classes": [],
544
+ "functions": [],
545
+ "imports": [],
546
+ }
547
+
548
+ if file_ext == ".py":
549
+ # Python classes and functions
550
+ context["classes"] = re.findall(r"^class\s+(\w+)", content, re.MULTILINE)
551
+ context["functions"] = re.findall(r"^def\s+(\w+)", content, re.MULTILINE)
552
+ # Top-level imports
553
+ imports = re.findall(r"^(?:from\s+(\S+)|import\s+(\S+))", content, re.MULTILINE)
554
+ context["imports"] = [i[0] or i[1] for i in imports][:10] # Limit
555
+
556
+ elif file_ext in {".js", ".ts", ".jsx", ".tsx"}:
557
+ # JavaScript/TypeScript
558
+ context["classes"] = re.findall(r"class\s+(\w+)", content)
559
+ context["functions"] = re.findall(
560
+ r"(?:function\s+(\w+)|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\()",
561
+ content,
562
+ )
563
+ context["functions"] = [f[0] or f[1] for f in context["functions"]]
564
+ context["imports"] = re.findall(r"from\s+['\"]([^'\"]+)['\"]", content)[:10]
565
+
566
+ return {k: v for k, v in context.items() if v}
567
+
568
+
569
+ def infer_project_from_path(path: Path) -> str | None:
570
+ """
571
+ Infer project name from file path.
572
+
573
+ Looks for common patterns like:
574
+ - ~/projects/{project}/...
575
+ - ~/code/{project}/...
576
+ - ~/notes/projects/{project}/...
577
+ """
578
+ parts = path.parts
579
+ project_indicators = {"projects", "code", "repos", "src"}
580
+
581
+ for i, part in enumerate(parts):
582
+ if part.lower() in project_indicators and i + 1 < len(parts):
583
+ return parts[i + 1]
584
+
585
+ return None
586
+
587
+
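For illustration (not part of the module), the structural helpers applied to a tiny Python snippet and a made-up path:

from pathlib import Path
from okb.ingest import extract_code_context, infer_project_from_path

source = "import os\n\nclass Loader:\n    pass\n\ndef run():\n    pass\n"
print(extract_code_context(source, ".py"))
# -> {'classes': ['Loader'], 'functions': ['run'], 'imports': ['os']}

print(infer_project_from_path(Path("/home/me/projects/myapp/docs/notes.md")))
# -> myapp
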
588
+ def build_embedding_context(
589
+ chunk_text: str,
590
+ doc_title: str,
591
+ source_path: str,
592
+ source_type: str,
593
+ section_header: str | None = None,
594
+ metadata: DocumentMetadata | None = None,
595
+ code_context: dict | None = None,
596
+ ) -> str:
597
+ """
598
+ Build contextualized text for embedding.
599
+
600
+ This is what the embedding model sees. The original chunk_text
601
+ is stored separately for display.
602
+ """
603
+ parts = []
604
+
605
+ # Document identity
606
+ parts.append(f"Document: {doc_title}")
607
+
608
+ # Source type context
609
+ if source_type == "code":
610
+ path = Path(source_path)
611
+ parts.append(f"File: {path.name}")
612
+ if code_context:
613
+ if classes := code_context.get("classes"):
614
+ parts.append(f"Classes: {', '.join(classes[:5])}")
615
+ if functions := code_context.get("functions"):
616
+ parts.append(f"Functions: {', '.join(functions[:5])}")
617
+
618
+ # Project from metadata or path
619
+ project = None
620
+ if metadata and metadata.project:
621
+ project = metadata.project
622
+ else:
623
+ project = infer_project_from_path(Path(source_path))
624
+
625
+ if project:
626
+ parts.append(f"Project: {project}")
627
+
628
+ # Section context for long documents
629
+ if section_header:
630
+ parts.append(f"Section: {section_header}")
631
+
632
+ # Tags/topics from metadata
633
+ if metadata and metadata.tags:
634
+ parts.append(f"Topics: {', '.join(metadata.tags[:5])}")
635
+
636
+ if metadata and metadata.category:
637
+ parts.append(f"Category: {metadata.category}")
638
+
639
+ # The actual content
640
+ parts.append(f"Content: {chunk_text}")
641
+
642
+ return "\n".join(parts)
643
+
644
+
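A sketch of the contextual wrapper the embedder sees, with invented titles and paths; only the Content line holds the text that is stored for display:

from okb.ingest import DocumentMetadata, build_embedding_context

print(build_embedding_context(
    chunk_text="Chunks are stored in Postgres with pgvector.",
    doc_title="Architecture notes",
    source_path="/home/me/projects/okb/docs/architecture.md",
    source_type="markdown",
    section_header="Storage",
    metadata=DocumentMetadata(tags=["search", "postgres"], category="design"),
))
# Document: Architecture notes
# Project: okb
# Section: Storage
# Topics: search, postgres
# Category: design
# Content: Chunks are stored in Postgres with pgvector.
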
645
+ def chunk_text(
646
+ text: str,
647
+ chunk_size: int = config.chunk_size,
648
+ chunk_overlap: int = config.chunk_overlap,
649
+ ) -> Generator[tuple[int, str], None, None]:
650
+ """
651
+ Split text into overlapping chunks.
652
+
653
+ Tries to break at paragraph/sentence boundaries.
654
+ Uses approximate token count (4 chars ≈ 1 token).
655
+ """
656
+ char_size = chunk_size * config.chars_per_token
657
+ char_overlap = chunk_overlap * config.chars_per_token
658
+
659
+ if len(text) <= char_size:
660
+ yield 0, text
661
+ return
662
+
663
+ # Split into paragraphs
664
+ paragraphs = re.split(r"\n\n+", text)
665
+
666
+ current_chunk = ""
667
+ chunk_index = 0
668
+
669
+ for para in paragraphs:
670
+ para = para.strip()
671
+ if not para:
672
+ continue
673
+
674
+ if len(current_chunk) + len(para) + 2 <= char_size:
675
+ current_chunk += para + "\n\n"
676
+ else:
677
+ if current_chunk.strip():
678
+ yield chunk_index, current_chunk.strip()
679
+ chunk_index += 1
680
+ # Keep overlap
681
+ overlap = current_chunk[-char_overlap:] if len(current_chunk) > char_overlap else ""
682
+ current_chunk = overlap + para + "\n\n"
683
+ else:
684
+ # Single paragraph too large - split by sentences
685
+ sentences = re.split(r"(?<=[.!?])\s+", para)
686
+ for sentence in sentences:
687
+ if len(current_chunk) + len(sentence) + 1 <= char_size:
688
+ current_chunk += sentence + " "
689
+ else:
690
+ if current_chunk.strip():
691
+ yield chunk_index, current_chunk.strip()
692
+ chunk_index += 1
693
+ overlap = (
694
+ current_chunk[-char_overlap:]
695
+ if len(current_chunk) > char_overlap
696
+ else ""
697
+ )
698
+ current_chunk = overlap + sentence + " "
699
+ else:
700
+ # Single sentence too large - hard split
701
+ yield chunk_index, sentence[:char_size]
702
+ chunk_index += 1
703
+ current_chunk = (
704
+ sentence[-char_overlap:] if len(sentence) > char_overlap else ""
705
+ )
706
+
707
+ if current_chunk.strip():
708
+ yield chunk_index, current_chunk.strip()
709
+
710
+
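A small illustration of the chunker with explicit sizes (sizes are approximate tokens; the character budget comes from config.chars_per_token, roughly 4 chars per token per the docstring):

from okb.ingest import chunk_text

text = "\n\n".join(f"Paragraph {i}. " + "word " * 40 for i in range(10))
for index, piece in chunk_text(text, chunk_size=100, chunk_overlap=20):
    print(index, len(piece))  # a handful of overlapping chunks
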
711
+ def parse_markdown(path: Path, extra_metadata: dict | None = None) -> Document:
712
+ """Parse a markdown (.md) file into a Document."""
713
+ content = read_text_with_fallback(path)
714
+
715
+ # Extract frontmatter
716
+ frontmatter, body = extract_frontmatter(content)
717
+ metadata = DocumentMetadata.from_frontmatter(frontmatter)
718
+
719
+ # Merge extra metadata
720
+ if extra_metadata:
721
+ if "tags" in extra_metadata:
722
+ metadata.tags.extend(extra_metadata["tags"])
723
+ if "project" in extra_metadata:
724
+ metadata.project = extra_metadata["project"]
725
+ if "category" in extra_metadata:
726
+ metadata.category = extra_metadata["category"]
727
+
728
+ # Extract title
729
+ title_match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
730
+ title = title_match.group(1) if title_match else path.stem
731
+
732
+ # Extract sections
733
+ sections = extract_sections_markdown(body)
734
+
735
+ return Document(
736
+ source_path=str(path.resolve()),
737
+ source_type="markdown",
738
+ title=title,
739
+ content=content,
740
+ metadata=metadata,
741
+ sections=sections,
742
+ )
743
+
744
+
745
+ def parse_org(path: Path, extra_metadata: dict | None = None) -> Document:
746
+ """Parse an org-mode (.org) file into a Document (file only, no TODO extraction)."""
747
+ content = read_text_with_fallback(path)
748
+
749
+ # Extract org metadata (#+KEY: value lines)
750
+ org_meta, body = extract_org_metadata(content)
751
+
752
+ # Build DocumentMetadata from org metadata
753
+ tags = []
754
+ # #+FILETAGS: :tag1:tag2:
755
+ if filetags := org_meta.get("filetags"):
756
+ tags.extend([t for t in filetags.split(":") if t])
757
+ # #+TAGS: tag1 tag2
758
+ if tag_str := org_meta.get("tags"):
759
+ if isinstance(tag_str, list):
760
+ for t in tag_str:
761
+ tags.extend(t.split())
762
+ else:
763
+ tags.extend(tag_str.split())
764
+
765
+ metadata = DocumentMetadata(
766
+ tags=tags,
767
+ project=org_meta.get("project"),
768
+ category=org_meta.get("category"),
769
+ )
770
+
771
+ # Merge extra metadata
772
+ if extra_metadata:
773
+ if "tags" in extra_metadata:
774
+ metadata.tags.extend(extra_metadata["tags"])
775
+ if "project" in extra_metadata:
776
+ metadata.project = extra_metadata["project"]
777
+ if "category" in extra_metadata:
778
+ metadata.category = extra_metadata["category"]
779
+
780
+ # Extract title from #+TITLE or first header
781
+ title = org_meta.get("title")
782
+ if not title:
783
+ title_match = re.search(r"^\*+\s+(.+)$", body, re.MULTILINE)
784
+ if title_match:
785
+ title, _ = extract_org_tags(title_match.group(1))
786
+ # Remove TODO keywords from title
787
+ title = re.sub(r"^(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+", "", title)
788
+ else:
789
+ title = path.stem
790
+
791
+ # Extract sections
792
+ sections = extract_sections_org(body)
793
+
794
+ return Document(
795
+ source_path=str(path.resolve()),
796
+ source_type="org",
797
+ title=title,
798
+ content=content,
799
+ metadata=metadata,
800
+ sections=sections,
801
+ )
802
+
803
+
804
+ def parse_org_with_todos(path: Path, extra_metadata: dict | None = None) -> list[Document]:
805
+ """
806
+ Parse an org-mode file into multiple Documents.
807
+
808
+ Returns:
809
+ - The file itself as one Document (source_type='org')
810
+ - Each TODO item as a separate Document (source_type='org-todo')
811
+ """
812
+ # Parse the file document
813
+ file_doc = parse_org(path, extra_metadata)
814
+
815
+ # Extract TODO items
816
+ content = read_text_with_fallback(path)
817
+ todo_items = extract_org_todo_items(content)
818
+
819
+ # Convert TODO items to Documents
820
+ todo_docs = [org_todo_to_document(item, path, file_doc.metadata) for item in todo_items]
821
+
822
+ # File document first, then TODO documents
823
+ return [file_doc] + todo_docs
824
+
825
+
826
+ def parse_text(path: Path, extra_metadata: dict | None = None) -> Document:
827
+ """Parse a plain text file into a Document (no special parsing)."""
828
+ content = read_text_with_fallback(path)
829
+
830
+ metadata = DocumentMetadata()
831
+ if extra_metadata:
832
+ metadata = DocumentMetadata(
833
+ tags=extra_metadata.get("tags", []),
834
+ project=extra_metadata.get("project"),
835
+ category=extra_metadata.get("category"),
836
+ )
837
+
838
+ return Document(
839
+ source_path=str(path.resolve()),
840
+ source_type="text",
841
+ title=path.stem,
842
+ content=content,
843
+ metadata=metadata,
844
+ sections=[], # No section parsing for raw text
845
+ )
846
+
847
+
848
+ def parse_code(path: Path, extra_metadata: dict | None = None) -> Document:
849
+ """Parse a code file into a Document."""
850
+ content = read_text_with_fallback(path)
851
+
852
+ metadata = DocumentMetadata()
853
+ if extra_metadata:
854
+ metadata = DocumentMetadata(
855
+ tags=extra_metadata.get("tags", []),
856
+ project=extra_metadata.get("project"),
857
+ category=extra_metadata.get("category"),
858
+ )
859
+
860
+ # Auto-tag by language
861
+ lang_tags = {
862
+ ".py": "python",
863
+ ".js": "javascript",
864
+ ".ts": "typescript",
865
+ ".sql": "sql",
866
+ ".sh": "bash",
867
+ ".yaml": "yaml",
868
+ ".yml": "yaml",
869
+ }
870
+ if lang := lang_tags.get(path.suffix):
871
+ if lang not in metadata.tags:
872
+ metadata.tags.append(lang)
873
+
874
+ return Document(
875
+ source_path=str(path.resolve()),
876
+ source_type="code",
877
+ title=path.name,
878
+ content=content,
879
+ metadata=metadata,
880
+ )
881
+
882
+
883
+ def is_url(s: str) -> bool:
884
+ """Check if a string looks like a URL."""
885
+ return s.startswith(("http://", "https://"))
886
+
887
+
888
+ def parse_url(url: str, extra_metadata: dict | None = None) -> Document:
889
+ """Fetch and parse content from a URL using trafilatura."""
890
+ try:
891
+ import trafilatura
892
+ except ImportError:
893
+ raise ImportError(
894
+ "trafilatura is required for URL ingestion. Install with: pip install local-kb[web]"
895
+ )
896
+
897
+ # Fetch and extract content
898
+ downloaded = trafilatura.fetch_url(url)
899
+ if downloaded is None:
900
+ raise ValueError(f"Failed to fetch URL: {url}")
901
+
902
+ # Extract text content and metadata
903
+ result = trafilatura.extract(
904
+ downloaded,
905
+ include_comments=False,
906
+ include_tables=True,
907
+ output_format="txt",
908
+ )
909
+ if result is None:
910
+ raise ValueError(f"Failed to extract content from URL: {url}")
911
+
912
+ # Get metadata separately
913
+ meta = trafilatura.extract_metadata(downloaded)
914
+
915
+ # Build document metadata
916
+ metadata = DocumentMetadata()
917
+ if extra_metadata:
918
+ metadata = DocumentMetadata(
919
+ tags=extra_metadata.get("tags", []),
920
+ project=extra_metadata.get("project"),
921
+ category=extra_metadata.get("category"),
922
+ )
923
+
924
+ # Add URL-specific metadata
925
+ if meta:
926
+ if meta.title:
927
+ metadata.extra["original_title"] = meta.title
928
+ if meta.author:
929
+ metadata.extra["author"] = meta.author
930
+ if meta.date:
931
+ metadata.extra["document_date"] = meta.date
932
+ if meta.sitename:
933
+ metadata.extra["site"] = meta.sitename
934
+ if meta.description:
935
+ metadata.extra["description"] = meta.description
936
+
937
+ # Use fetched timestamp
938
+ metadata.extra["fetched_at"] = datetime.now(UTC).isoformat()
939
+
940
+ # Determine title
941
+ title = meta.title if meta and meta.title else url
942
+
943
+ return Document(
944
+ source_path=url,
945
+ source_type="web",
946
+ title=title,
947
+ content=result,
948
+ metadata=metadata,
949
+ sections=extract_sections_markdown(result), # trafilatura output has markdown-like headers
950
+ )
951
+
952
+
953
+ def parse_pdf_date(pdf_date: str | None) -> str | None:
954
+ """Parse PDF date format (D:YYYYMMDDHHmmSS+TZ) to ISO format."""
955
+ if not pdf_date:
956
+ return None
957
+ # Strip optional 'D:' prefix
958
+ if pdf_date.startswith("D:"):
959
+ pdf_date = pdf_date[2:]
960
+ try:
961
+ # Basic format: YYYYMMDDHHMMSS
962
+ if len(pdf_date) >= 14:
963
+ dt = datetime.strptime(pdf_date[:14], "%Y%m%d%H%M%S")
964
+ return dt.isoformat()
965
+ elif len(pdf_date) >= 8:
966
+ dt = datetime.strptime(pdf_date[:8], "%Y%m%d")
967
+ return dt.isoformat()
968
+ except ValueError:
969
+ pass
970
+ return None
971
+
972
+
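Illustrative values for the PDF date parser (the inputs are made up but follow the PDF D: format):

from okb.ingest import parse_pdf_date

print(parse_pdf_date("D:20240115103000+00'00'"))  # -> 2024-01-15T10:30:00
print(parse_pdf_date("20230704"))                 # -> 2023-07-04T00:00:00
print(parse_pdf_date(None))                       # -> None
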
973
+ def parse_pdf(path: Path, extra_metadata: dict | None = None) -> Document:
974
+ """Parse a PDF file into a Document using PyMuPDF."""
975
+ try:
976
+ import fitz # PyMuPDF
977
+ except ImportError:
978
+ raise ImportError(
979
+ "pymupdf is required for PDF ingestion. Install with: pip install local-kb[pdf]"
980
+ )
981
+
982
+ doc = fitz.open(path)
983
+ metadata = DocumentMetadata()
984
+
985
+ # Extract PDF metadata
986
+ pdf_meta = doc.metadata
987
+ if pdf_meta:
988
+ if pdf_meta.get("title"):
989
+ metadata.extra["original_title"] = pdf_meta["title"]
990
+ if pdf_meta.get("author"):
991
+ metadata.extra["author"] = pdf_meta["author"]
992
+ if pdf_meta.get("subject"):
993
+ metadata.extra["subject"] = pdf_meta["subject"]
994
+ if pdf_meta.get("keywords"):
995
+ # Keywords often comma-separated
996
+ keywords = [k.strip() for k in pdf_meta["keywords"].split(",") if k.strip()]
997
+ metadata.tags.extend(keywords)
998
+ # Parse creation date
999
+ if doc_date := parse_pdf_date(pdf_meta.get("creationDate")):
1000
+ metadata.extra["document_date"] = doc_date
1001
+
1002
+ # Merge extra metadata
1003
+ if extra_metadata:
1004
+ if "tags" in extra_metadata:
1005
+ metadata.tags.extend(extra_metadata["tags"])
1006
+ if "project" in extra_metadata:
1007
+ metadata.project = extra_metadata["project"]
1008
+ if "category" in extra_metadata:
1009
+ metadata.category = extra_metadata["category"]
1010
+
1011
+ # Extract text page by page as sections
1012
+ sections = []
1013
+ full_text_parts = []
1014
+ for page_num in range(len(doc)):
1015
+ page = doc[page_num]
1016
+ text = page.get_text().strip()
1017
+ if text:
1018
+ sections.append((f"Page {page_num + 1}", text))
1019
+ full_text_parts.append(text)
1020
+
1021
+ doc.close()
1022
+
1023
+ # Skip if no text extracted (likely scanned image)
1024
+ if not full_text_parts:
1025
+ raise ValueError(f"No text extracted from {path.name} - may be a scanned image (needs OCR)")
1026
+
1027
+ # Determine title
1028
+ title = pdf_meta.get("title") if pdf_meta else None
1029
+ if not title:
1030
+ title = path.stem
1031
+
1032
+ return Document(
1033
+ source_path=str(path.resolve()),
1034
+ source_type="pdf",
1035
+ title=title,
1036
+ content="\n\n".join(full_text_parts),
1037
+ metadata=metadata,
1038
+ sections=sections,
1039
+ )
1040
+
1041
+
1042
+ def parse_docx(path: Path, extra_metadata: dict | None = None) -> Document:
1043
+ """Parse a DOCX file into a Document using python-docx."""
1044
+ try:
1045
+ import docx
1046
+ except ImportError:
1047
+ raise ImportError(
1048
+ "python-docx is required for DOCX ingestion. Install with: pip install local-kb[docx]"
1049
+ )
1050
+
1051
+ doc = docx.Document(path)
1052
+ metadata = DocumentMetadata()
1053
+
1054
+ # Extract core properties
1055
+ core = doc.core_properties
1056
+ if core:
1057
+ if core.title:
1058
+ metadata.extra["original_title"] = core.title
1059
+ if core.author:
1060
+ metadata.extra["author"] = core.author
1061
+ if core.keywords:
1062
+ # Keywords often comma or semicolon separated
1063
+ for sep in [",", ";"]:
1064
+ if sep in core.keywords:
1065
+ keywords = [k.strip() for k in core.keywords.split(sep) if k.strip()]
1066
+ metadata.tags.extend(keywords)
1067
+ break
1068
+ else:
1069
+ # Single keyword or space-separated
1070
+ metadata.tags.extend(core.keywords.split())
1071
+ # Get document date (prefer created, fall back to modified)
1072
+ if core.created:
1073
+ metadata.extra["document_date"] = core.created.isoformat()
1074
+ elif core.modified:
1075
+ metadata.extra["document_date"] = core.modified.isoformat()
1076
+
1077
+ # Merge extra metadata
1078
+ if extra_metadata:
1079
+ if "tags" in extra_metadata:
1080
+ metadata.tags.extend(extra_metadata["tags"])
1081
+ if "project" in extra_metadata:
1082
+ metadata.project = extra_metadata["project"]
1083
+ if "category" in extra_metadata:
1084
+ metadata.category = extra_metadata["category"]
1085
+
1086
+ # Extract paragraphs with heading detection
1087
+ sections = []
1088
+ current_heading = None
1089
+ current_content = []
1090
+ full_text_parts = []
1091
+
1092
+ for para in doc.paragraphs:
1093
+ text = para.text.strip()
1094
+ if not text:
1095
+ continue
1096
+
1097
+ full_text_parts.append(text)
1098
+
1099
+ # Check if paragraph is a heading
1100
+ if para.style and para.style.name and para.style.name.startswith("Heading"):
1101
+ # Save previous section
1102
+ if current_content:
1103
+ sections.append((current_heading, "\n\n".join(current_content)))
1104
+ current_content = []
1105
+ current_heading = text
1106
+ else:
1107
+ current_content.append(text)
1108
+
1109
+ # Don't forget the last section
1110
+ if current_content:
1111
+ sections.append((current_heading, "\n\n".join(current_content)))
1112
+
1113
+ # Determine title
1114
+ title = core.title if core and core.title else path.stem
1115
+
1116
+ return Document(
1117
+ source_path=str(path.resolve()),
1118
+ source_type="docx",
1119
+ title=title,
1120
+ content="\n\n".join(full_text_parts),
1121
+ metadata=metadata,
1122
+ sections=sections,
1123
+ )
1124
+
1125
+
1126
+ def is_text_file(path: Path) -> bool:
1127
+ """Check if a file appears to be text (not binary)."""
1128
+ try:
1129
+ with open(path, "rb") as f:
1130
+ chunk = f.read(8192)
1131
+ # Check for null bytes (binary indicator)
1132
+ if b"\x00" in chunk:
1133
+ return False
1134
+ # Try to decode as UTF-8
1135
+ try:
1136
+ chunk.decode("utf-8")
1137
+ return True
1138
+ except UnicodeDecodeError:
1139
+ # Try other common encodings
1140
+ for encoding in ("windows-1252", "latin-1"):
1141
+ try:
1142
+ chunk.decode(encoding)
1143
+ return True
1144
+ except UnicodeDecodeError:
1145
+ continue
1146
+ return False
1147
+ except OSError:
1148
+ return False
1149
+
1150
+
1151
+ def parse_document(
1152
+ path: Path, extra_metadata: dict | None = None, force: bool = False
1153
+ ) -> list[Document]:
1154
+ """Parse a file into one or more Documents.
1155
+
1156
+ Some file types (e.g., org-mode) produce multiple documents:
1157
+ - The file itself (for semantic search)
1158
+ - Individual actionable items like TODOs (for structured queries)
1159
+
1160
+ Checks plugin registry first, then falls back to built-in parsers.
1161
+ If force=True, parse unknown extensions as text/code (for explicitly provided files).
1162
+ """
1163
+ # Check plugin registry first
1164
+ from .plugins.registry import PluginRegistry
1165
+
1166
+ if parser := PluginRegistry.get_parser_for_file(path):
1167
+ return [parser.parse(path, extra_metadata)]
1168
+
1169
+ # Fall back to built-in parsers
1170
+ if path.suffix == ".md":
1171
+ return [parse_markdown(path, extra_metadata)]
1172
+ elif path.suffix == ".org":
1173
+ # Org files produce multiple documents: file + TODO items
1174
+ return parse_org_with_todos(path, extra_metadata)
1175
+ elif path.suffix == ".pdf":
1176
+ return [parse_pdf(path, extra_metadata)]
1177
+ elif path.suffix == ".docx":
1178
+ return [parse_docx(path, extra_metadata)]
1179
+ elif path.suffix in config.document_extensions:
1180
+ return [parse_text(path, extra_metadata)]
1181
+ elif path.suffix in config.code_extensions:
1182
+ return [parse_code(path, extra_metadata)]
1183
+ elif force:
1184
+ return [parse_code(path, extra_metadata)]
1185
+ else:
1186
+ raise ValueError(f"Unsupported file type: {path.suffix}")
1187
+
1188
+
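A sketch of the dispatcher on a hypothetical org file (the path is only an example); .org files come back as the file document plus one document per TODO heading:

from pathlib import Path
from okb.ingest import parse_document

for doc in parse_document(Path("~/notes/tasks.org").expanduser()):
    print(doc.source_type, doc.title, doc.status, doc.due_date)
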
1189
+ def collect_documents(
1190
+ root: Path,
1191
+ extra_metadata: dict | None = None,
1192
+ ) -> Generator[Document, None, None]:
1193
+ """Recursively collect documents from a directory, pruning ignored directories."""
1194
+ print(f"Scanning {root}...", file=sys.stderr, flush=True)
1195
+ scanned = 0
1196
+ collected = 0
1197
+ skipped_ext = 0
1198
+
1199
+ for dirpath, dirnames, filenames in os.walk(root, topdown=True):
1200
+ # Prune ignored directories in-place (modifying dirnames affects traversal)
1201
+ dirnames[:] = [
1202
+ d for d in dirnames if not d.startswith(".") and d not in config.skip_directories
1203
+ ]
1204
+
1205
+ for filename in filenames:
1206
+ path = Path(dirpath) / filename
1207
+
1208
+ scanned += 1
1209
+ if scanned % 500 == 0:
1210
+ print(
1211
+ f" {scanned} files scanned, {collected} documents found...",
1212
+ file=sys.stderr,
1213
+ flush=True,
1214
+ )
1215
+
1216
+ if path.suffix not in config.all_extensions:
1217
+ skipped_ext += 1
1218
+ continue
1219
+
1220
+ # Check filename-based skip/block patterns first (before reading content)
1221
+ skip_check = check_file_skip(path)
1222
+ if skip_check.should_skip:
1223
+ prefix = "BLOCKED" if skip_check.is_security else "Skipping"
1224
+ print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
1225
+ continue
1226
+
1227
+ try:
1228
+ docs = parse_document(path, extra_metadata)
1229
+ if not docs:
1230
+ continue
1231
+
1232
+ # Content-based checks on the primary (file) document
1233
+ primary_doc = docs[0]
1234
+ if config.scan_content:
1235
+ skip_check = check_file_skip(path, primary_doc.content)
1236
+ if skip_check.should_skip:
1237
+ prefix = "BLOCKED" if skip_check.is_security else "Skipping"
1238
+ print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
1239
+ continue
1240
+
1241
+ # Capture file mtime for staleness tracking
1242
+ mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
1243
+ mtime_iso = mtime.isoformat()
1244
+
1245
+ # Yield all documents from this file
1246
+ for doc in docs:
1247
+ doc.metadata.extra["file_modified_at"] = mtime_iso
1248
+ collected += 1
1249
+ yield doc
1250
+ except Exception as e:
1251
+ print(f"Error parsing {path}: {e}", file=sys.stderr)
1252
+
1253
+ if scanned >= 1000:
1254
+ print(
1255
+ f"Scan complete: {scanned} files, {skipped_ext} wrong extension",
1256
+ file=sys.stderr,
1257
+ flush=True,
1258
+ )
1259
+
1260
+
1261
+ def create_chunks(doc: Document) -> list[Chunk]:
1262
+ """
1263
+ Create contextual chunks from a document.
1264
+
1265
+ Each chunk includes:
1266
+ - content: original text (for display)
1267
+ - embedding_text: contextualized text (for embedding)
1268
+ """
1269
+ chunks = []
1270
+
1271
+ # For code files, extract structural context once
1272
+ code_context = None
1273
+ if doc.source_type == "code":
1274
+ ext = Path(doc.source_path).suffix
1275
+ code_context = extract_code_context(doc.content, ext)
1276
+
1277
+ # Chunk by sections if available, otherwise whole document
1278
+ if doc.sections:
1279
+ chunk_index = 0
1280
+ for section_header, section_content in doc.sections:
1281
+ for _, section_chunk in chunk_text_generator(section_content):
1282
+ if not section_chunk.strip():
1283
+ continue # Skip empty chunks
1284
+ embedding_text = build_embedding_context(
1285
+ chunk_text=section_chunk,
1286
+ doc_title=doc.title,
1287
+ source_path=doc.source_path,
1288
+ source_type=doc.source_type,
1289
+ section_header=section_header,
1290
+ metadata=doc.metadata,
1291
+ code_context=code_context,
1292
+ )
1293
+
1294
+ chunks.append(
1295
+ Chunk(
1296
+ content=section_chunk,
1297
+ embedding_text=embedding_text,
1298
+ chunk_index=chunk_index,
1299
+ token_count=len(section_chunk) // config.chars_per_token,
1300
+ metadata={"section": section_header} if section_header else {},
1301
+ )
1302
+ )
1303
+ chunk_index += 1
1304
+ else:
1305
+ for chunk_index, chunk_content in chunk_text(doc.content):
1306
+ if not chunk_content.strip():
1307
+ continue # Skip empty chunks
1308
+ embedding_text = build_embedding_context(
1309
+ chunk_text=chunk_content,
1310
+ doc_title=doc.title,
1311
+ source_path=doc.source_path,
1312
+ source_type=doc.source_type,
1313
+ metadata=doc.metadata,
1314
+ code_context=code_context,
1315
+ )
1316
+
1317
+ chunks.append(
1318
+ Chunk(
1319
+ content=chunk_content,
1320
+ embedding_text=embedding_text,
1321
+ chunk_index=chunk_index,
1322
+ token_count=len(chunk_content) // config.chars_per_token,
1323
+ )
1324
+ )
1325
+
1326
+ return chunks
1327
+
1328
+
1329
+ # Alias for the generator to avoid name collision
1330
+ chunk_text_generator = chunk_text
1331
+
1332
+
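A sketch of chunk creation end to end, using any local markdown file as input (README.md here is just an example): content is what gets displayed later, embedding_text is what gets embedded.

from pathlib import Path
from okb.ingest import create_chunks, parse_markdown

doc = parse_markdown(Path("README.md"))
for chunk in create_chunks(doc)[:2]:
    print("stored:  ", chunk.content[:80])
    print("embedded:", chunk.embedding_text[:120])
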
1333
+ class Ingester:
1334
+ """Handles document ingestion into pgvector."""
1335
+
1336
+ def __init__(self, db_url: str, use_modal: bool = True):
1337
+ self.db_url = db_url
1338
+ self.use_modal = use_modal
1339
+ self._embedder = None
1340
+
1341
+ @property
1342
+ def embedder(self):
1343
+ """Lazy-load embedder, falling back to local if Modal unavailable."""
1344
+ if self._embedder is None:
1345
+ if self.use_modal:
1346
+ try:
1347
+ import modal
1348
+
1349
+ self._embedder = modal.Cls.from_name("knowledge-embedder", "Embedder")()
1350
+ except Exception as e:
1351
+ print(f"Modal unavailable ({e}), using local CPU embedding", file=sys.stderr)
1352
+ self.use_modal = False # Update flag for embed_batch call path
1353
+
1354
+ if not self.use_modal:
1355
+ from .local_embedder import embed_document
1356
+
1357
+ class LocalEmbedder:
1358
+ def embed_batch(self, texts):
1359
+ return [embed_document(t) for t in texts]
1360
+
1361
+ self._embedder = LocalEmbedder()
1362
+ return self._embedder
1363
+
1364
+ def ingest_documents(self, documents: list[Document], batch_size: int = 50):
1365
+ """
1366
+ Ingest documents into the database.
1367
+
1368
+ 1. Check for existing documents (by hash)
1369
+ 2. Create contextual chunks
1370
+ 3. Generate embeddings via Modal (or local)
1371
+ 4. Store in pgvector
1372
+
1373
+ For files that produce multiple documents (e.g., org with TODOs):
1374
+ - Primary document: file.org (source_type='org')
1375
+ - Derived documents: file.org::*TODO ... (source_type='org-todo')
1376
+ When a primary document changes, all derived documents are deleted first.
1377
+ """
1378
+ with psycopg.connect(self.db_url, row_factory=dict_row) as conn:
1379
+ register_vector(conn)
1380
+
1381
+ # Track which primary files we've already cleaned up derived docs for
1382
+ cleaned_derived = set()
1383
+
1384
+ for doc in documents:
1385
+ doc_hash = content_hash(doc.content)
1386
+
1387
+ # Determine if this is a derived document (has :: in path)
1388
+ is_derived = "::" in doc.source_path
1389
+ if is_derived:
1390
+ base_path = doc.source_path.split("::")[0]
1391
+ else:
1392
+ base_path = doc.source_path
1393
+
1394
+ # Check if document exists and unchanged (FOR UPDATE to prevent race)
1395
+ existing = conn.execute(
1396
+ "SELECT id FROM documents WHERE content_hash = %s FOR UPDATE",
1397
+ (doc_hash,),
1398
+ ).fetchone()
1399
+
1400
+ if existing:
1401
+ # Content unchanged - but update file_modified_at if present
1402
+ new_mtime = doc.metadata.extra.get("file_modified_at")
1403
+ if new_mtime:
1404
+ conn.execute(
1405
+ """UPDATE documents
1406
+ SET metadata = jsonb_set(metadata, '{file_modified_at}', to_jsonb(%s::text))
1407
+ WHERE id = %s""",
1408
+ (new_mtime, existing["id"]),
1409
+ )
1410
+ conn.commit()
1411
+ print(f"Skipping (unchanged): {doc.source_path}")
1412
+ continue
1413
+
1414
+ # For primary documents: also delete any derived documents
1415
+ if not is_derived and base_path not in cleaned_derived:
1416
+ deleted = conn.execute(
1417
+ "DELETE FROM documents WHERE source_path LIKE %s RETURNING id",
1418
+ (base_path + "::%",),
1419
+ ).fetchall()
1420
+ if deleted:
1421
+ print(f" Deleted {len(deleted)} derived documents from {base_path}")
1422
+ cleaned_derived.add(base_path)
1423
+
1424
+ # Check if same path exists with different hash (FOR UPDATE to prevent race)
1425
+ old_doc = conn.execute(
1426
+ "SELECT id FROM documents WHERE source_path = %s FOR UPDATE",
1427
+ (doc.source_path,),
1428
+ ).fetchone()
1429
+
1430
+ if old_doc:
1431
+ print(f"Updating: {doc.source_path}")
1432
+ conn.execute("DELETE FROM documents WHERE id = %s", (old_doc["id"],))
1433
+ else:
1434
+ print(f"Ingesting: {doc.source_path}")
1435
+
1436
+ # Insert document (ON CONFLICT handles duplicate content from different paths)
1437
+ result = conn.execute(
1438
+ """
1439
+ INSERT INTO documents (
1440
+ source_path, source_type, title, content, metadata, content_hash,
1441
+ due_date, event_start, event_end, status, priority
1442
+ )
1443
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
1444
+ ON CONFLICT (content_hash) DO NOTHING
1445
+ RETURNING id
1446
+ """,
1447
+ (
1448
+ doc.source_path,
1449
+ doc.source_type,
1450
+ doc.title,
1451
+ doc.content,
1452
+ psycopg.types.json.Json(doc.metadata.to_dict()),
1453
+ doc_hash,
1454
+ doc.due_date,
1455
+ doc.event_start,
1456
+ doc.event_end,
1457
+ doc.status,
1458
+ doc.priority,
1459
+ ),
1460
+ ).fetchone()
1461
+
1462
+ if result is None:
1463
+ print(f" Skipping (duplicate content): {doc.source_path}")
1464
+ continue
1465
+
1466
+ doc_id = result["id"]
1467
+
1468
+ # Create chunks
1469
+ chunks = create_chunks(doc)
1470
+
1471
+ if not chunks:
1472
+ conn.commit()
1473
+ continue
1474
+
1475
+ # Generate embeddings (batch to avoid OOM on GPU)
1476
+ embedding_texts = [c.embedding_text for c in chunks]
1477
+ embed_batch_size = 100 # Max texts per GPU call
1478
+
1479
+ print(f" Generating embeddings for {len(chunks)} chunks...")
1480
+ if self.use_modal:
1481
+ embeddings = []
1482
+ for i in range(0, len(embedding_texts), embed_batch_size):
1483
+ batch = embedding_texts[i : i + embed_batch_size]
1484
+ embeddings.extend(self.embedder.embed_batch.remote(batch))
1485
+ else:
1486
+ embeddings = self.embedder.embed_batch(embedding_texts)
1487
+
1488
+ # Insert chunks with embeddings
1489
+ for chunk, embedding in zip(chunks, embeddings):
1490
+ conn.execute(
1491
+ """
1492
+ INSERT INTO chunks
1493
+ (document_id, chunk_index, content, embedding_text, embedding, token_count, metadata)
1494
+ VALUES (%s, %s, %s, %s, %s, %s, %s)
1495
+ """,
1496
+ (
1497
+ doc_id,
1498
+ chunk.chunk_index,
1499
+ chunk.content,
1500
+ chunk.embedding_text,
1501
+ embedding,
1502
+ chunk.token_count,
1503
+ psycopg.types.json.Json(chunk.metadata),
1504
+ ),
1505
+ )
1506
+
1507
+ conn.commit()
1508
+ print(f" → {len(chunks)} chunks indexed")
1509
+
1510
+ def delete_document(self, source_path: str):
1511
+ """Remove a document and its chunks."""
1512
+ with psycopg.connect(self.db_url) as conn:
1513
+ result = conn.execute(
1514
+ "DELETE FROM documents WHERE source_path = %s RETURNING id",
1515
+ (source_path,),
1516
+ ).fetchone()
1517
+ conn.commit()
1518
+ return result is not None
1519
+
1520
+
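Programmatic use is also possible; a minimal sketch with a made-up connection string and file path, assuming the documents/chunks schema already exists and the local embedder is available (use_modal=False skips Modal entirely):

from pathlib import Path
from okb.ingest import Ingester, parse_document

ingester = Ingester("postgresql://localhost/kb", use_modal=False)
ingester.ingest_documents(parse_document(Path("notes/meeting.md")))  # hypothetical file
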
1521
+ def main():
1522
+ """CLI entry point."""
1523
+ parser = argparse.ArgumentParser(
1524
+ description="Ingest documents into knowledge base",
1525
+ formatter_class=argparse.RawDescriptionHelpFormatter,
1526
+ epilog="""
1527
+ Examples:
1528
+ python ingest.py ~/notes
1529
+ python ingest.py ~/projects/myapp --metadata '{"project": "myapp"}'
1530
+ python ingest.py document.md --local # Use CPU embedding
1531
+ """,
1532
+ )
1533
+ parser.add_argument("paths", nargs="+", type=Path, help="Files or directories to ingest")
1534
+ parser.add_argument(
1535
+ "--metadata",
1536
+ type=json.loads,
1537
+ default={},
1538
+ help='JSON metadata to attach (e.g., \'{"project": "myapp"}\')',
1539
+ )
1540
+ parser.add_argument(
1541
+ "--db-url",
1542
+ default=config.db_url,
1543
+ help="Database URL",
1544
+ )
1545
+ parser.add_argument(
1546
+ "--local",
1547
+ action="store_true",
1548
+ help="Use local CPU embedding instead of Modal",
1549
+ )
1550
+
1551
+ args = parser.parse_args()
1552
+
1553
+ ingester = Ingester(args.db_url, use_modal=not args.local)
1554
+
1555
+ # Collect documents
1556
+ documents = []
1557
+ for path in args.paths:
1558
+ path = path.resolve()
1559
+ if path.is_dir():
1560
+ documents.extend(collect_documents(path, args.metadata))
1561
+ elif path.is_file():
1562
+ # Check security patterns first
1563
+ skip_check = check_file_skip(path)
1564
+ if skip_check.should_skip:
1565
+ prefix = "BLOCKED" if skip_check.is_security else "Skipping"
1566
+ print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
1567
+ continue
1568
+
1569
+ # For explicitly provided files, try to parse even with unknown extension
1570
+ # Always allow .pdf and .docx even if not in config (user may have old config)
1571
+ if path.suffix in config.all_extensions or path.suffix in (".pdf", ".docx"):
1572
+ documents.extend(parse_document(path, args.metadata))
1573
+ elif is_text_file(path):
1574
+ print(f"Parsing as text: {path}", file=sys.stderr)
1575
+ documents.extend(parse_document(path, args.metadata, force=True))
1576
+ else:
1577
+ print(f"Skipping binary file: {path}", file=sys.stderr)
1578
+
1579
+ if not documents:
1580
+ print("No documents found to ingest")
1581
+ return
1582
+
1583
+ print(f"Found {len(documents)} documents to process")
1584
+ ingester.ingest_documents(documents)
1585
+ print("Done!")
1586
+
1587
+
1588
+ if __name__ == "__main__":
1589
+ main()