okb-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- okb/__init__.py +3 -0
- okb/cli.py +1272 -0
- okb/config.py +661 -0
- okb/data/init.sql +92 -0
- okb/http_server.py +463 -0
- okb/ingest.py +1589 -0
- okb/llm/__init__.py +86 -0
- okb/llm/base.py +83 -0
- okb/llm/cache.py +217 -0
- okb/llm/filter.py +187 -0
- okb/llm/providers.py +322 -0
- okb/local_embedder.py +87 -0
- okb/mcp_server.py +1393 -0
- okb/migrate.py +53 -0
- okb/migrations/0001.initial-schema.sql +91 -0
- okb/migrations/0002.sync-state.sql +22 -0
- okb/migrations/0003.structured-fields.sql +22 -0
- okb/migrations/0004.tokens.sql +13 -0
- okb/migrations/0005.database-metadata.sql +19 -0
- okb/migrations/0006.llm-cache.sql +13 -0
- okb/modal_embedder.py +120 -0
- okb/modal_llm.py +178 -0
- okb/plugins/__init__.py +8 -0
- okb/plugins/base.py +110 -0
- okb/plugins/registry.py +123 -0
- okb/plugins/sources/__init__.py +5 -0
- okb/plugins/sources/dropbox_paper.py +188 -0
- okb/plugins/sources/github.py +484 -0
- okb/rescan.py +227 -0
- okb/scripts/__init__.py +1 -0
- okb/scripts/watch.py +206 -0
- okb/tokens.py +277 -0
- okb-1.0.0.dist-info/METADATA +397 -0
- okb-1.0.0.dist-info/RECORD +36 -0
- okb-1.0.0.dist-info/WHEEL +4 -0
- okb-1.0.0.dist-info/entry_points.txt +9 -0
okb/ingest.py
ADDED
@@ -0,0 +1,1589 @@
"""
Document ingestion pipeline with contextual chunking.

Collects documents, chunks them with context, generates embeddings via Modal,
and stores in pgvector.

Usage:
    python ingest.py ~/notes ~/projects/docs
    python ingest.py ~/notes --metadata '{"project": "personal"}'
"""

from __future__ import annotations

import argparse
import fnmatch
import hashlib
import json
import os
import re
import sys
from collections.abc import Generator
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path

import psycopg
import yaml
from pgvector.psycopg import register_vector
from psycopg.rows import dict_row

from .config import config


def read_text_with_fallback(
    path: Path, encodings: tuple[str, ...] = ("utf-8", "windows-1252", "latin-1")
) -> str:
    """Read text file trying multiple encodings in order."""
    for encoding in encodings:
        try:
            return path.read_text(encoding=encoding)
        except UnicodeDecodeError:
            continue
    # Last resort: read with errors replaced
    return path.read_text(encoding="utf-8", errors="replace")


def matches_pattern(filename: str, patterns: list[str]) -> str | None:
    """Check if filename matches any pattern. Returns matched pattern or None."""
    for pattern in patterns:
        if fnmatch.fnmatch(filename, pattern) or fnmatch.fnmatch(filename.lower(), pattern.lower()):
            return pattern
    return None


# Patterns for detecting secrets in content
SECRET_PATTERNS = [
    (re.compile(r"-----BEGIN [A-Z ]* PRIVATE KEY-----"), "private key"),
    (re.compile(r"AKIA[0-9A-Z]{16}"), "AWS access key"),
    (re.compile(r"ghp_[a-zA-Z0-9]{36}"), "GitHub personal access token"),
    (re.compile(r"gho_[a-zA-Z0-9]{36}"), "GitHub OAuth token"),
    (re.compile(r"sk-[a-zA-Z0-9]{48}"), "OpenAI API key"),
    (re.compile(r"sk-ant-api[a-zA-Z0-9-]{80,}"), "Anthropic API key"),
]

def scan_content_for_secrets(content: str) -> str | None:
    """Scan content for potential secrets. Returns description if found, None otherwise."""
    # Only check first 10KB to avoid slow scans on large files
    sample = content[:10240]
    for pattern, description in SECRET_PATTERNS:
        if pattern.search(sample):
            return description
    return None


def is_minified(content: str, max_line_length: int = 1000) -> bool:
    """Detect if content appears to be minified JS/CSS."""
    lines = content.split("\n", 10)  # Only check first few lines
    if not lines:
        return False
    # Check if any of the first lines is extremely long
    for line in lines[:5]:
        if len(line) > max_line_length:
            # Also check it's not just a long string/comment - minified has lots of punctuation
            if line.count(";") > 20 or line.count(",") > 50 or line.count("{") > 20:
                return True
    return False


class FileSkipReason:
    """Result of file skip check."""

    def __init__(self, should_skip: bool, reason: str = "", is_security: bool = False):
        self.should_skip = should_skip
        self.reason = reason
        self.is_security = is_security  # True for blocked (security), False for skipped (low-value)


def check_file_skip(path: Path, content: str | None = None) -> FileSkipReason:
    """
    Check if a file should be skipped or blocked.

    Returns FileSkipReason with details.
    """
    filename = path.name

    # Check block patterns (security)
    if matched := matches_pattern(filename, config.block_patterns):
        return FileSkipReason(True, f"matches block pattern '{matched}'", is_security=True)

    # Check skip patterns (low-value)
    if matched := matches_pattern(filename, config.skip_patterns):
        return FileSkipReason(True, f"matches skip pattern '{matched}'", is_security=False)

    # Content-based checks (if content provided and scanning enabled)
    if content is not None and config.scan_content:
        # Check for secrets
        if secret_type := scan_content_for_secrets(content):
            return FileSkipReason(True, f"contains {secret_type}", is_security=True)

        # Check for minified JS/CSS
        if path.suffix in (".js", ".css") and is_minified(
            content, config.max_line_length_for_minified
        ):
            return FileSkipReason(True, "appears to be minified", is_security=False)

    return FileSkipReason(False)


@dataclass
class DocumentMetadata:
    """Metadata extracted from document or provided externally."""

    tags: list[str] = field(default_factory=list)
    project: str | None = None
    category: str | None = None
    status: str | None = None
    extra: dict = field(default_factory=dict)

    @classmethod
    def from_frontmatter(cls, frontmatter: dict) -> DocumentMetadata:
        """Create from YAML frontmatter."""
        extra = {
            k: v
            for k, v in frontmatter.items()
            if k not in {"tags", "project", "category", "status"}
        }
        if doc_date := extract_document_date(frontmatter):
            extra["document_date"] = doc_date
        return cls(
            tags=frontmatter.get("tags", []),
            project=frontmatter.get("project"),
            category=frontmatter.get("category"),
            status=frontmatter.get("status"),
            extra=extra,
        )

    def to_dict(self) -> dict:
        """Convert to JSON-serializable dict."""
        result = {}
        if self.tags:
            result["tags"] = self.tags
        if self.project:
            result["project"] = self.project
        if self.category:
            result["category"] = self.category
        if self.status:
            result["status"] = self.status
        if self.extra:
            result.update(self.extra)
        return result


@dataclass
class Document:
    """A document to be indexed."""

    source_path: str
    source_type: str
    title: str
    content: str
    metadata: DocumentMetadata = field(default_factory=DocumentMetadata)
    sections: list[tuple[str, str]] = field(default_factory=list)  # (header, content)

    # Structured fields for actionable items (tasks, events, emails)
    due_date: datetime | None = None  # Task deadlines
    event_start: datetime | None = None  # Calendar event start
    event_end: datetime | None = None  # Calendar event end
    status: str | None = None  # 'pending', 'completed', 'cancelled', etc.
    priority: int | None = None  # 1-5 scale (1=highest)


@dataclass
class Chunk:
    """A chunk ready for embedding."""

    content: str  # Original text (for display)
    embedding_text: str  # Contextualized text (for embedding)
    chunk_index: int
    token_count: int
    metadata: dict = field(default_factory=dict)

def content_hash(content: str) -> str:
    """Generate hash for deduplication/change detection."""
    return hashlib.sha256(content.encode()).hexdigest()[:16]


def extract_document_date(metadata: dict) -> str | None:
    """Extract document date from frontmatter/metadata, trying common field names."""
    date_fields = ["date", "created", "modified", "updated", "last_modified", "pubdate"]
    for field_name in date_fields:
        if value := metadata.get(field_name):
            if hasattr(value, "isoformat"):
                return value.isoformat()
            if isinstance(value, str):
                return value
    return None


def extract_frontmatter(content: str) -> tuple[dict, str]:
    """
    Extract YAML frontmatter from markdown content.

    Returns (frontmatter_dict, remaining_content).
    """
    if not content.startswith("---"):
        return {}, content

    # Find closing ---
    end_match = re.search(r"\n---\s*\n", content[3:])
    if not end_match:
        return {}, content

    frontmatter_text = content[3 : end_match.start() + 3]
    remaining = content[end_match.end() + 3 :]

    try:
        frontmatter = yaml.safe_load(frontmatter_text) or {}
        return frontmatter, remaining
    except yaml.YAMLError:
        return {}, content


def extract_sections_markdown(content: str) -> list[tuple[str, str]]:
    """
    Extract sections from markdown content.

    Returns list of (header, section_content) tuples.
    """
    # Split by headers (any level)
    parts = re.split(r"(^#{1,6}\s+.+$)", content, flags=re.MULTILINE)

    sections = []
    current_header = None

    for part in parts:
        if re.match(r"^#{1,6}\s+", part):
            current_header = part.strip().lstrip("#").strip()
        elif part.strip():
            sections.append((current_header, part.strip()))

    return sections


def extract_org_metadata(content: str) -> tuple[dict, str]:
    """
    Extract org-mode metadata from file header.

    Parses #+KEY: value lines at the start of the file.
    Returns (metadata_dict, remaining_content).
    """
    metadata = {}
    lines = content.split("\n")
    body_start = 0

    for i, line in enumerate(lines):
        match = re.match(r"^#\+(\w+):\s*(.*)$", line, re.IGNORECASE)
        if match:
            key = match.group(1).lower()
            value = match.group(2).strip()
            if key in metadata:
                # Handle multiple values (e.g., multiple #+TAGS lines)
                if isinstance(metadata[key], list):
                    metadata[key].append(value)
                else:
                    metadata[key] = [metadata[key], value]
            else:
                metadata[key] = value
            body_start = i + 1
        elif line.strip() and not line.startswith("#"):
            # Stop at first non-metadata, non-comment line
            break

    remaining = "\n".join(lines[body_start:])
    return metadata, remaining


def extract_org_tags(header: str) -> tuple[str, list[str]]:
    """
    Extract tags from an org header line.

    Org tags appear at end of header like: * Header text :tag1:tag2:
    Returns (header_without_tags, list_of_tags).
    """
    match = re.search(r"\s+(:[:\w]+:)\s*$", header)
    if match:
        tag_str = match.group(1)
        tags = [t for t in tag_str.split(":") if t]
        header_clean = header[: match.start()].strip()
        return header_clean, tags
    return header, []


def extract_sections_org(content: str) -> list[tuple[str, str]]:
    """
    Extract sections from org-mode content.

    Org headers use * (one or more) at start of line.
    Returns list of (header, section_content) tuples.
    """
    # Split by org headers (any level)
    parts = re.split(r"(^\*+\s+.+$)", content, flags=re.MULTILINE)

    sections = []
    current_header = None

    for part in parts:
        if re.match(r"^\*+\s+", part):
            # Remove leading stars and any TODO keywords
            header = re.sub(r"^\*+\s+", "", part)
            # Remove common TODO keywords
            header = re.sub(r"^(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+", "", header)
            # Extract and remove tags
            header, _ = extract_org_tags(header)
            current_header = header.strip()
        elif part.strip():
            # Skip property drawers
            clean_part = re.sub(r":PROPERTIES:.*?:END:", "", part, flags=re.DOTALL)
            if clean_part.strip():
                sections.append((current_header, clean_part.strip()))

    return sections


# Org-mode TODO keywords (common defaults)
ORG_TODO_KEYWORDS = {"TODO", "DONE", "WAITING", "CANCELLED", "NEXT", "SOMEDAY"}
ORG_DONE_KEYWORDS = {"DONE", "CANCELLED"}


@dataclass
class OrgTodoItem:
    """Represents a parsed org-mode TODO item."""

    heading: str  # The heading text (without stars, keyword, priority, tags)
    raw_heading: str  # Original heading line for source_path anchor
    level: int  # Number of stars
    keyword: str | None  # TODO, DONE, etc.
    priority: str | None  # A, B, C
    tags: list[str]
    deadline: datetime | None
    scheduled: datetime | None
    closed: datetime | None
    content: str  # Body text under this heading

def parse_org_timestamp(ts: str) -> datetime | None:
    """Parse org-mode timestamp like <2024-01-15 Mon> or [2024-01-15 Mon 10:30]."""
    # Strip brackets
    ts = ts.strip("<>[]")
    # Try various formats
    formats = [
        "%Y-%m-%d %a %H:%M",  # <2024-01-15 Mon 10:30>
        "%Y-%m-%d %a",  # <2024-01-15 Mon>
        "%Y-%m-%d %H:%M",  # <2024-01-15 10:30>
        "%Y-%m-%d",  # <2024-01-15>
    ]
    for fmt in formats:
        try:
            dt = datetime.strptime(ts, fmt)
            return dt.replace(tzinfo=UTC)
        except ValueError:
            continue
    # Try just the date part
    match = re.match(r"(\d{4}-\d{2}-\d{2})", ts)
    if match:
        try:
            return datetime.strptime(match.group(1), "%Y-%m-%d").replace(tzinfo=UTC)
        except ValueError:
            pass
    return None


def extract_org_todo_items(content: str) -> list[OrgTodoItem]:
    """
    Extract TODO items from org-mode content.

    Parses headings with TODO keywords and extracts:
    - Status (TODO/DONE/etc.)
    - Priority ([#A]/[#B]/[#C])
    - Tags (:tag1:tag2:)
    - DEADLINE/SCHEDULED/CLOSED timestamps
    - Body content
    """
    items = []
    lines = content.split("\n")

    i = 0
    while i < len(lines):
        line = lines[i]

        # Match org heading with optional TODO keyword
        # Pattern: *+ [KEYWORD] [#PRIORITY] Title :tags:
        heading_match = re.match(
            r"^(\*+)\s+"  # Stars
            r"(?:(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+)?"  # Optional keyword
            r"(?:\[#([ABC])\]\s+)?"  # Optional priority
            r"(.+)$",  # Rest of heading
            line,
        )

        if heading_match:
            level = len(heading_match.group(1))
            keyword = heading_match.group(2)
            priority = heading_match.group(3)
            rest = heading_match.group(4)

            # Only process items with TODO keywords
            if keyword:
                # Extract tags from end of heading
                heading_text, tags = extract_org_tags(rest)

                # Collect body content until next heading of same or higher level
                body_lines = []
                deadline = None
                scheduled = None
                closed = None
                i += 1

                while i < len(lines):
                    next_line = lines[i]
                    # Check for next heading of same or higher level
                    next_heading = re.match(r"^(\*+)\s+", next_line)
                    if next_heading and len(next_heading.group(1)) <= level:
                        break

                    # Check for planning line (DEADLINE, SCHEDULED, CLOSED)
                    if re.match(r"^\s*(DEADLINE|SCHEDULED|CLOSED):", next_line):
                        if dl := re.search(r"DEADLINE:\s*(<[^>]+>)", next_line):
                            deadline = parse_org_timestamp(dl.group(1))
                        if sc := re.search(r"SCHEDULED:\s*(<[^>]+>)", next_line):
                            scheduled = parse_org_timestamp(sc.group(1))
                        if cl := re.search(r"CLOSED:\s*(\[[^\]]+\])", next_line):
                            closed = parse_org_timestamp(cl.group(1))
                    # Skip property drawers
                    elif next_line.strip() == ":PROPERTIES:":
                        while i < len(lines) and lines[i].strip() != ":END:":
                            i += 1
                    elif next_line.strip() and not next_line.strip().startswith(":"):
                        body_lines.append(next_line)

                    i += 1

                items.append(
                    OrgTodoItem(
                        heading=heading_text.strip(),
                        raw_heading=line,
                        level=level,
                        keyword=keyword,
                        priority=priority,
                        tags=tags,
                        deadline=deadline,
                        scheduled=scheduled,
                        closed=closed,
                        content="\n".join(body_lines).strip(),
                    )
                )
                continue

        i += 1

    return items

def org_todo_to_document(
    item: OrgTodoItem,
    file_path: Path,
    file_metadata: DocumentMetadata,
) -> Document:
    """Convert an OrgTodoItem to a Document with structured fields."""
    # Build org-mode link-style source path: file.org::*Heading
    # Use the heading text (not raw) for cleaner anchors
    anchor = f"*{item.keyword} {item.heading}" if item.keyword else f"*{item.heading}"
    source_path = f"{file_path.resolve()}::{anchor}"

    # Map org priority to numeric (A=1, B=2, C=3)
    priority_map = {"A": 1, "B": 2, "C": 3}
    priority = priority_map.get(item.priority) if item.priority else None
    # SOMEDAY items get lowest priority
    if item.keyword == "SOMEDAY":
        priority = 5

    # Map org keyword to status
    status = "completed" if item.keyword in ORG_DONE_KEYWORDS else "pending"

    # Use deadline or scheduled as due_date
    due_date = item.deadline or item.scheduled

    # Merge file tags with item tags
    tags = list(file_metadata.tags) + item.tags

    metadata = DocumentMetadata(
        tags=tags,
        project=file_metadata.project,
        category=file_metadata.category,
    )

    # Build content with context
    content_parts = [item.heading]
    if item.content:
        content_parts.append(item.content)
    content = "\n\n".join(content_parts)

    return Document(
        source_path=source_path,
        source_type="org-todo",
        title=item.heading,
        content=content,
        metadata=metadata,
        due_date=due_date,
        status=status,
        priority=priority,
    )


def extract_code_context(content: str, file_ext: str) -> dict:
    """
    Extract structural context from code files.

    Returns dict with classes, functions, imports found.
    """
    context = {
        "classes": [],
        "functions": [],
        "imports": [],
    }

    if file_ext == ".py":
        # Python classes and functions
        context["classes"] = re.findall(r"^class\s+(\w+)", content, re.MULTILINE)
        context["functions"] = re.findall(r"^def\s+(\w+)", content, re.MULTILINE)
        # Top-level imports
        imports = re.findall(r"^(?:from\s+(\S+)|import\s+(\S+))", content, re.MULTILINE)
        context["imports"] = [i[0] or i[1] for i in imports][:10]  # Limit

    elif file_ext in {".js", ".ts", ".jsx", ".tsx"}:
        # JavaScript/TypeScript
        context["classes"] = re.findall(r"class\s+(\w+)", content)
        context["functions"] = re.findall(
            r"(?:function\s+(\w+)|(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\()",
            content,
        )
        context["functions"] = [f[0] or f[1] for f in context["functions"]]
        context["imports"] = re.findall(r"from\s+['\"]([^'\"]+)['\"]", content)[:10]

    return {k: v for k, v in context.items() if v}


def infer_project_from_path(path: Path) -> str | None:
    """
    Infer project name from file path.

    Looks for common patterns like:
    - ~/projects/{project}/...
    - ~/code/{project}/...
    - ~/notes/projects/{project}/...
    """
    parts = path.parts
    project_indicators = {"projects", "code", "repos", "src"}

    for i, part in enumerate(parts):
        if part.lower() in project_indicators and i + 1 < len(parts):
            return parts[i + 1]

    return None


def build_embedding_context(
    chunk_text: str,
    doc_title: str,
    source_path: str,
    source_type: str,
    section_header: str | None = None,
    metadata: DocumentMetadata | None = None,
    code_context: dict | None = None,
) -> str:
    """
    Build contextualized text for embedding.

    This is what the embedding model sees. The original chunk_text
    is stored separately for display.
    """
    parts = []

    # Document identity
    parts.append(f"Document: {doc_title}")

    # Source type context
    if source_type == "code":
        path = Path(source_path)
        parts.append(f"File: {path.name}")
        if code_context:
            if classes := code_context.get("classes"):
                parts.append(f"Classes: {', '.join(classes[:5])}")
            if functions := code_context.get("functions"):
                parts.append(f"Functions: {', '.join(functions[:5])}")

    # Project from metadata or path
    project = None
    if metadata and metadata.project:
        project = metadata.project
    else:
        project = infer_project_from_path(Path(source_path))

    if project:
        parts.append(f"Project: {project}")

    # Section context for long documents
    if section_header:
        parts.append(f"Section: {section_header}")

    # Tags/topics from metadata
    if metadata and metadata.tags:
        parts.append(f"Topics: {', '.join(metadata.tags[:5])}")

    if metadata and metadata.category:
        parts.append(f"Category: {metadata.category}")

    # The actual content
    parts.append(f"Content: {chunk_text}")

    return "\n".join(parts)

def chunk_text(
    text: str,
    chunk_size: int = config.chunk_size,
    chunk_overlap: int = config.chunk_overlap,
) -> Generator[tuple[int, str], None, None]:
    """
    Split text into overlapping chunks.

    Tries to break at paragraph/sentence boundaries.
    Uses approximate token count (4 chars ≈ 1 token).
    """
    char_size = chunk_size * config.chars_per_token
    char_overlap = chunk_overlap * config.chars_per_token

    if len(text) <= char_size:
        yield 0, text
        return

    # Split into paragraphs
    paragraphs = re.split(r"\n\n+", text)

    current_chunk = ""
    chunk_index = 0

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) + 2 <= char_size:
            current_chunk += para + "\n\n"
        else:
            if current_chunk.strip():
                yield chunk_index, current_chunk.strip()
                chunk_index += 1
                # Keep overlap
                overlap = current_chunk[-char_overlap:] if len(current_chunk) > char_overlap else ""
                current_chunk = overlap + para + "\n\n"
            else:
                # Single paragraph too large - split by sentences
                sentences = re.split(r"(?<=[.!?])\s+", para)
                for sentence in sentences:
                    if len(current_chunk) + len(sentence) + 1 <= char_size:
                        current_chunk += sentence + " "
                    else:
                        if current_chunk.strip():
                            yield chunk_index, current_chunk.strip()
                            chunk_index += 1
                            overlap = (
                                current_chunk[-char_overlap:]
                                if len(current_chunk) > char_overlap
                                else ""
                            )
                            current_chunk = overlap + sentence + " "
                        else:
                            # Single sentence too large - hard split
                            yield chunk_index, sentence[:char_size]
                            chunk_index += 1
                            current_chunk = (
                                sentence[-char_overlap:] if len(sentence) > char_overlap else ""
                            )

    if current_chunk.strip():
        yield chunk_index, current_chunk.strip()


def parse_markdown(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse a markdown (.md) file into a Document."""
    content = read_text_with_fallback(path)

    # Extract frontmatter
    frontmatter, body = extract_frontmatter(content)
    metadata = DocumentMetadata.from_frontmatter(frontmatter)

    # Merge extra metadata
    if extra_metadata:
        if "tags" in extra_metadata:
            metadata.tags.extend(extra_metadata["tags"])
        if "project" in extra_metadata:
            metadata.project = extra_metadata["project"]
        if "category" in extra_metadata:
            metadata.category = extra_metadata["category"]

    # Extract title
    title_match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
    title = title_match.group(1) if title_match else path.stem

    # Extract sections
    sections = extract_sections_markdown(body)

    return Document(
        source_path=str(path.resolve()),
        source_type="markdown",
        title=title,
        content=content,
        metadata=metadata,
        sections=sections,
    )


def parse_org(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse an org-mode (.org) file into a Document (file only, no TODO extraction)."""
    content = read_text_with_fallback(path)

    # Extract org metadata (#+KEY: value lines)
    org_meta, body = extract_org_metadata(content)

    # Build DocumentMetadata from org metadata
    tags = []
    # #+FILETAGS: :tag1:tag2:
    if filetags := org_meta.get("filetags"):
        tags.extend([t for t in filetags.split(":") if t])
    # #+TAGS: tag1 tag2
    if tag_str := org_meta.get("tags"):
        if isinstance(tag_str, list):
            for t in tag_str:
                tags.extend(t.split())
        else:
            tags.extend(tag_str.split())

    metadata = DocumentMetadata(
        tags=tags,
        project=org_meta.get("project"),
        category=org_meta.get("category"),
    )

    # Merge extra metadata
    if extra_metadata:
        if "tags" in extra_metadata:
            metadata.tags.extend(extra_metadata["tags"])
        if "project" in extra_metadata:
            metadata.project = extra_metadata["project"]
        if "category" in extra_metadata:
            metadata.category = extra_metadata["category"]

    # Extract title from #+TITLE or first header
    title = org_meta.get("title")
    if not title:
        title_match = re.search(r"^\*+\s+(.+)$", body, re.MULTILINE)
        if title_match:
            title, _ = extract_org_tags(title_match.group(1))
            # Remove TODO keywords from title
            title = re.sub(r"^(TODO|DONE|WAITING|CANCELLED|NEXT|SOMEDAY)\s+", "", title)
        else:
            title = path.stem

    # Extract sections
    sections = extract_sections_org(body)

    return Document(
        source_path=str(path.resolve()),
        source_type="org",
        title=title,
        content=content,
        metadata=metadata,
        sections=sections,
    )

def parse_org_with_todos(path: Path, extra_metadata: dict | None = None) -> list[Document]:
    """
    Parse an org-mode file into multiple Documents.

    Returns:
    - The file itself as one Document (source_type='org')
    - Each TODO item as a separate Document (source_type='org-todo')
    """
    # Parse the file document
    file_doc = parse_org(path, extra_metadata)

    # Extract TODO items
    content = read_text_with_fallback(path)
    todo_items = extract_org_todo_items(content)

    # Convert TODO items to Documents
    todo_docs = [org_todo_to_document(item, path, file_doc.metadata) for item in todo_items]

    # File document first, then TODO documents
    return [file_doc] + todo_docs


def parse_text(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse a plain text file into a Document (no special parsing)."""
    content = read_text_with_fallback(path)

    metadata = DocumentMetadata()
    if extra_metadata:
        metadata = DocumentMetadata(
            tags=extra_metadata.get("tags", []),
            project=extra_metadata.get("project"),
            category=extra_metadata.get("category"),
        )

    return Document(
        source_path=str(path.resolve()),
        source_type="text",
        title=path.stem,
        content=content,
        metadata=metadata,
        sections=[],  # No section parsing for raw text
    )


def parse_code(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse a code file into a Document."""
    content = read_text_with_fallback(path)

    metadata = DocumentMetadata()
    if extra_metadata:
        metadata = DocumentMetadata(
            tags=extra_metadata.get("tags", []),
            project=extra_metadata.get("project"),
            category=extra_metadata.get("category"),
        )

    # Auto-tag by language
    lang_tags = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".sql": "sql",
        ".sh": "bash",
        ".yaml": "yaml",
        ".yml": "yaml",
    }
    if lang := lang_tags.get(path.suffix):
        if lang not in metadata.tags:
            metadata.tags.append(lang)

    return Document(
        source_path=str(path.resolve()),
        source_type="code",
        title=path.name,
        content=content,
        metadata=metadata,
    )


def is_url(s: str) -> bool:
    """Check if a string looks like a URL."""
    return s.startswith(("http://", "https://"))


def parse_url(url: str, extra_metadata: dict | None = None) -> Document:
    """Fetch and parse content from a URL using trafilatura."""
    try:
        import trafilatura
    except ImportError:
        raise ImportError(
            "trafilatura is required for URL ingestion. Install with: pip install local-kb[web]"
        )

    # Fetch and extract content
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        raise ValueError(f"Failed to fetch URL: {url}")

    # Extract text content and metadata
    result = trafilatura.extract(
        downloaded,
        include_comments=False,
        include_tables=True,
        output_format="txt",
    )
    if result is None:
        raise ValueError(f"Failed to extract content from URL: {url}")

    # Get metadata separately
    meta = trafilatura.extract_metadata(downloaded)

    # Build document metadata
    metadata = DocumentMetadata()
    if extra_metadata:
        metadata = DocumentMetadata(
            tags=extra_metadata.get("tags", []),
            project=extra_metadata.get("project"),
            category=extra_metadata.get("category"),
        )

    # Add URL-specific metadata
    if meta:
        if meta.title:
            metadata.extra["original_title"] = meta.title
        if meta.author:
            metadata.extra["author"] = meta.author
        if meta.date:
            metadata.extra["document_date"] = meta.date
        if meta.sitename:
            metadata.extra["site"] = meta.sitename
        if meta.description:
            metadata.extra["description"] = meta.description

    # Use fetched timestamp
    metadata.extra["fetched_at"] = datetime.now(UTC).isoformat()

    # Determine title
    title = meta.title if meta and meta.title else url

    return Document(
        source_path=url,
        source_type="web",
        title=title,
        content=result,
        metadata=metadata,
        sections=extract_sections_markdown(result),  # trafilatura output has markdown-like headers
    )


def parse_pdf_date(pdf_date: str | None) -> str | None:
    """Parse PDF date format (D:YYYYMMDDHHmmSS+TZ) to ISO format."""
    if not pdf_date:
        return None
    # Strip optional 'D:' prefix
    if pdf_date.startswith("D:"):
        pdf_date = pdf_date[2:]
    try:
        # Basic format: YYYYMMDDHHMMSS
        if len(pdf_date) >= 14:
            dt = datetime.strptime(pdf_date[:14], "%Y%m%d%H%M%S")
            return dt.isoformat()
        elif len(pdf_date) >= 8:
            dt = datetime.strptime(pdf_date[:8], "%Y%m%d")
            return dt.isoformat()
    except ValueError:
        pass
    return None

def parse_pdf(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse a PDF file into a Document using PyMuPDF."""
    try:
        import fitz  # PyMuPDF
    except ImportError:
        raise ImportError(
            "pymupdf is required for PDF ingestion. Install with: pip install local-kb[pdf]"
        )

    doc = fitz.open(path)
    metadata = DocumentMetadata()

    # Extract PDF metadata
    pdf_meta = doc.metadata
    if pdf_meta:
        if pdf_meta.get("title"):
            metadata.extra["original_title"] = pdf_meta["title"]
        if pdf_meta.get("author"):
            metadata.extra["author"] = pdf_meta["author"]
        if pdf_meta.get("subject"):
            metadata.extra["subject"] = pdf_meta["subject"]
        if pdf_meta.get("keywords"):
            # Keywords often comma-separated
            keywords = [k.strip() for k in pdf_meta["keywords"].split(",") if k.strip()]
            metadata.tags.extend(keywords)
        # Parse creation date
        if doc_date := parse_pdf_date(pdf_meta.get("creationDate")):
            metadata.extra["document_date"] = doc_date

    # Merge extra metadata
    if extra_metadata:
        if "tags" in extra_metadata:
            metadata.tags.extend(extra_metadata["tags"])
        if "project" in extra_metadata:
            metadata.project = extra_metadata["project"]
        if "category" in extra_metadata:
            metadata.category = extra_metadata["category"]

    # Extract text page by page as sections
    sections = []
    full_text_parts = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text().strip()
        if text:
            sections.append((f"Page {page_num + 1}", text))
            full_text_parts.append(text)

    doc.close()

    # Skip if no text extracted (likely scanned image)
    if not full_text_parts:
        raise ValueError(f"No text extracted from {path.name} - may be a scanned image (needs OCR)")

    # Determine title
    title = pdf_meta.get("title") if pdf_meta else None
    if not title:
        title = path.stem

    return Document(
        source_path=str(path.resolve()),
        source_type="pdf",
        title=title,
        content="\n\n".join(full_text_parts),
        metadata=metadata,
        sections=sections,
    )


def parse_docx(path: Path, extra_metadata: dict | None = None) -> Document:
    """Parse a DOCX file into a Document using python-docx."""
    try:
        import docx
    except ImportError:
        raise ImportError(
            "python-docx is required for DOCX ingestion. Install with: pip install local-kb[docx]"
        )

    doc = docx.Document(path)
    metadata = DocumentMetadata()

    # Extract core properties
    core = doc.core_properties
    if core:
        if core.title:
            metadata.extra["original_title"] = core.title
        if core.author:
            metadata.extra["author"] = core.author
        if core.keywords:
            # Keywords often comma or semicolon separated
            for sep in [",", ";"]:
                if sep in core.keywords:
                    keywords = [k.strip() for k in core.keywords.split(sep) if k.strip()]
                    metadata.tags.extend(keywords)
                    break
            else:
                # Single keyword or space-separated
                metadata.tags.extend(core.keywords.split())
        # Get document date (prefer created, fall back to modified)
        if core.created:
            metadata.extra["document_date"] = core.created.isoformat()
        elif core.modified:
            metadata.extra["document_date"] = core.modified.isoformat()

    # Merge extra metadata
    if extra_metadata:
        if "tags" in extra_metadata:
            metadata.tags.extend(extra_metadata["tags"])
        if "project" in extra_metadata:
            metadata.project = extra_metadata["project"]
        if "category" in extra_metadata:
            metadata.category = extra_metadata["category"]

    # Extract paragraphs with heading detection
    sections = []
    current_heading = None
    current_content = []
    full_text_parts = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue

        full_text_parts.append(text)

        # Check if paragraph is a heading
        if para.style and para.style.name and para.style.name.startswith("Heading"):
            # Save previous section
            if current_content:
                sections.append((current_heading, "\n\n".join(current_content)))
                current_content = []
            current_heading = text
        else:
            current_content.append(text)

    # Don't forget the last section
    if current_content:
        sections.append((current_heading, "\n\n".join(current_content)))

    # Determine title
    title = core.title if core and core.title else path.stem

    return Document(
        source_path=str(path.resolve()),
        source_type="docx",
        title=title,
        content="\n\n".join(full_text_parts),
        metadata=metadata,
        sections=sections,
    )

def is_text_file(path: Path) -> bool:
    """Check if a file appears to be text (not binary)."""
    try:
        with open(path, "rb") as f:
            chunk = f.read(8192)
        # Check for null bytes (binary indicator)
        if b"\x00" in chunk:
            return False
        # Try to decode as UTF-8
        try:
            chunk.decode("utf-8")
            return True
        except UnicodeDecodeError:
            # Try other common encodings
            for encoding in ("windows-1252", "latin-1"):
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False
    except OSError:
        return False


def parse_document(
    path: Path, extra_metadata: dict | None = None, force: bool = False
) -> list[Document]:
    """Parse a file into one or more Documents.

    Some file types (e.g., org-mode) produce multiple documents:
    - The file itself (for semantic search)
    - Individual actionable items like TODOs (for structured queries)

    Checks plugin registry first, then falls back to built-in parsers.
    If force=True, parse unknown extensions as text/code (for explicitly provided files).
    """
    # Check plugin registry first
    from .plugins.registry import PluginRegistry

    if parser := PluginRegistry.get_parser_for_file(path):
        return [parser.parse(path, extra_metadata)]

    # Fall back to built-in parsers
    if path.suffix == ".md":
        return [parse_markdown(path, extra_metadata)]
    elif path.suffix == ".org":
        # Org files produce multiple documents: file + TODO items
        return parse_org_with_todos(path, extra_metadata)
    elif path.suffix == ".pdf":
        return [parse_pdf(path, extra_metadata)]
    elif path.suffix == ".docx":
        return [parse_docx(path, extra_metadata)]
    elif path.suffix in config.document_extensions:
        return [parse_text(path, extra_metadata)]
    elif path.suffix in config.code_extensions:
        return [parse_code(path, extra_metadata)]
    elif force:
        return [parse_code(path, extra_metadata)]
    else:
        raise ValueError(f"Unsupported file type: {path.suffix}")


def collect_documents(
    root: Path,
    extra_metadata: dict | None = None,
) -> Generator[Document, None, None]:
    """Recursively collect documents from a directory, pruning ignored directories."""
    print(f"Scanning {root}...", file=sys.stderr, flush=True)
    scanned = 0
    collected = 0
    skipped_ext = 0

    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        # Prune ignored directories in-place (modifying dirnames affects traversal)
        dirnames[:] = [
            d for d in dirnames if not d.startswith(".") and d not in config.skip_directories
        ]

        for filename in filenames:
            path = Path(dirpath) / filename

            scanned += 1
            if scanned % 500 == 0:
                print(
                    f" {scanned} files scanned, {collected} documents found...",
                    file=sys.stderr,
                    flush=True,
                )

            if path.suffix not in config.all_extensions:
                skipped_ext += 1
                continue

            # Check filename-based skip/block patterns first (before reading content)
            skip_check = check_file_skip(path)
            if skip_check.should_skip:
                prefix = "BLOCKED" if skip_check.is_security else "Skipping"
                print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
                continue

            try:
                docs = parse_document(path, extra_metadata)
                if not docs:
                    continue

                # Content-based checks on the primary (file) document
                primary_doc = docs[0]
                if config.scan_content:
                    skip_check = check_file_skip(path, primary_doc.content)
                    if skip_check.should_skip:
                        prefix = "BLOCKED" if skip_check.is_security else "Skipping"
                        print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
                        continue

                # Capture file mtime for staleness tracking
                mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
                mtime_iso = mtime.isoformat()

                # Yield all documents from this file
                for doc in docs:
                    doc.metadata.extra["file_modified_at"] = mtime_iso
                    collected += 1
                    yield doc
            except Exception as e:
                print(f"Error parsing {path}: {e}", file=sys.stderr)

    if scanned >= 1000:
        print(
            f"Scan complete: {scanned} files, {skipped_ext} wrong extension",
            file=sys.stderr,
            flush=True,
        )

def create_chunks(doc: Document) -> list[Chunk]:
    """
    Create contextual chunks from a document.

    Each chunk includes:
    - content: original text (for display)
    - embedding_text: contextualized text (for embedding)
    """
    chunks = []

    # For code files, extract structural context once
    code_context = None
    if doc.source_type == "code":
        ext = Path(doc.source_path).suffix
        code_context = extract_code_context(doc.content, ext)

    # Chunk by sections if available, otherwise whole document
    if doc.sections:
        chunk_index = 0
        for section_header, section_content in doc.sections:
            for _, section_chunk in chunk_text_generator(section_content):
                if not section_chunk.strip():
                    continue  # Skip empty chunks
                embedding_text = build_embedding_context(
                    chunk_text=section_chunk,
                    doc_title=doc.title,
                    source_path=doc.source_path,
                    source_type=doc.source_type,
                    section_header=section_header,
                    metadata=doc.metadata,
                    code_context=code_context,
                )

                chunks.append(
                    Chunk(
                        content=section_chunk,
                        embedding_text=embedding_text,
                        chunk_index=chunk_index,
                        token_count=len(section_chunk) // config.chars_per_token,
                        metadata={"section": section_header} if section_header else {},
                    )
                )
                chunk_index += 1
    else:
        for chunk_index, chunk_content in chunk_text(doc.content):
            if not chunk_content.strip():
                continue  # Skip empty chunks
            embedding_text = build_embedding_context(
                chunk_text=chunk_content,
                doc_title=doc.title,
                source_path=doc.source_path,
                source_type=doc.source_type,
                metadata=doc.metadata,
                code_context=code_context,
            )

            chunks.append(
                Chunk(
                    content=chunk_content,
                    embedding_text=embedding_text,
                    chunk_index=chunk_index,
                    token_count=len(chunk_content) // config.chars_per_token,
                )
            )

    return chunks


# Alias for the generator to avoid name collision
chunk_text_generator = chunk_text


class Ingester:
    """Handles document ingestion into pgvector."""

    def __init__(self, db_url: str, use_modal: bool = True):
        self.db_url = db_url
        self.use_modal = use_modal
        self._embedder = None

    @property
    def embedder(self):
        """Lazy-load embedder, falling back to local if Modal unavailable."""
        if self._embedder is None:
            if self.use_modal:
                try:
                    import modal

                    self._embedder = modal.Cls.from_name("knowledge-embedder", "Embedder")()
                except Exception as e:
                    print(f"Modal unavailable ({e}), using local CPU embedding", file=sys.stderr)
                    self.use_modal = False  # Update flag for embed_batch call path

            if not self.use_modal:
                from .local_embedder import embed_document

                class LocalEmbedder:
                    def embed_batch(self, texts):
                        return [embed_document(t) for t in texts]

                self._embedder = LocalEmbedder()
        return self._embedder

    def ingest_documents(self, documents: list[Document], batch_size: int = 50):
        """
        Ingest documents into the database.

        1. Check for existing documents (by hash)
        2. Create contextual chunks
        3. Generate embeddings via Modal (or local)
        4. Store in pgvector

        For files that produce multiple documents (e.g., org with TODOs):
        - Primary document: file.org (source_type='org')
        - Derived documents: file.org::*TODO ... (source_type='org-todo')
        When a primary document changes, all derived documents are deleted first.
        """
        with psycopg.connect(self.db_url, row_factory=dict_row) as conn:
            register_vector(conn)

            # Track which primary files we've already cleaned up derived docs for
            cleaned_derived = set()

            for doc in documents:
                doc_hash = content_hash(doc.content)

                # Determine if this is a derived document (has :: in path)
                is_derived = "::" in doc.source_path
                if is_derived:
                    base_path = doc.source_path.split("::")[0]
                else:
                    base_path = doc.source_path

                # Check if document exists and unchanged (FOR UPDATE to prevent race)
                existing = conn.execute(
                    "SELECT id FROM documents WHERE content_hash = %s FOR UPDATE",
                    (doc_hash,),
                ).fetchone()

                if existing:
                    # Content unchanged - but update file_modified_at if present
                    new_mtime = doc.metadata.extra.get("file_modified_at")
                    if new_mtime:
                        conn.execute(
                            """UPDATE documents
                            SET metadata = jsonb_set(metadata, '{file_modified_at}', to_jsonb(%s::text))
                            WHERE id = %s""",
                            (new_mtime, existing["id"]),
                        )
                        conn.commit()
                    print(f"Skipping (unchanged): {doc.source_path}")
                    continue

                # For primary documents: also delete any derived documents
                if not is_derived and base_path not in cleaned_derived:
                    deleted = conn.execute(
                        "DELETE FROM documents WHERE source_path LIKE %s RETURNING id",
                        (base_path + "::%",),
                    ).fetchall()
                    if deleted:
                        print(f" Deleted {len(deleted)} derived documents from {base_path}")
                    cleaned_derived.add(base_path)

                # Check if same path exists with different hash (FOR UPDATE to prevent race)
                old_doc = conn.execute(
                    "SELECT id FROM documents WHERE source_path = %s FOR UPDATE",
                    (doc.source_path,),
                ).fetchone()

                if old_doc:
                    print(f"Updating: {doc.source_path}")
                    conn.execute("DELETE FROM documents WHERE id = %s", (old_doc["id"],))
                else:
                    print(f"Ingesting: {doc.source_path}")

                # Insert document (ON CONFLICT handles duplicate content from different paths)
                result = conn.execute(
                    """
                    INSERT INTO documents (
                        source_path, source_type, title, content, metadata, content_hash,
                        due_date, event_start, event_end, status, priority
                    )
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (content_hash) DO NOTHING
                    RETURNING id
                    """,
                    (
                        doc.source_path,
                        doc.source_type,
                        doc.title,
                        doc.content,
                        psycopg.types.json.Json(doc.metadata.to_dict()),
                        doc_hash,
                        doc.due_date,
                        doc.event_start,
                        doc.event_end,
                        doc.status,
                        doc.priority,
                    ),
                ).fetchone()

                if result is None:
                    print(f" Skipping (duplicate content): {doc.source_path}")
                    continue

                doc_id = result["id"]

                # Create chunks
                chunks = create_chunks(doc)

                if not chunks:
                    conn.commit()
                    continue

                # Generate embeddings (batch to avoid OOM on GPU)
                embedding_texts = [c.embedding_text for c in chunks]
                embed_batch_size = 100  # Max texts per GPU call

                print(f" Generating embeddings for {len(chunks)} chunks...")
                if self.use_modal:
                    embeddings = []
                    for i in range(0, len(embedding_texts), embed_batch_size):
                        batch = embedding_texts[i : i + embed_batch_size]
                        embeddings.extend(self.embedder.embed_batch.remote(batch))
                else:
                    embeddings = self.embedder.embed_batch(embedding_texts)

                # Insert chunks with embeddings
                for chunk, embedding in zip(chunks, embeddings):
                    conn.execute(
                        """
                        INSERT INTO chunks
                        (document_id, chunk_index, content, embedding_text, embedding, token_count, metadata)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        """,
                        (
                            doc_id,
                            chunk.chunk_index,
                            chunk.content,
                            chunk.embedding_text,
                            embedding,
                            chunk.token_count,
                            psycopg.types.json.Json(chunk.metadata),
                        ),
                    )

                conn.commit()
                print(f" → {len(chunks)} chunks indexed")

    def delete_document(self, source_path: str):
        """Remove a document and its chunks."""
        with psycopg.connect(self.db_url) as conn:
            result = conn.execute(
                "DELETE FROM documents WHERE source_path = %s RETURNING id",
                (source_path,),
            ).fetchone()
            conn.commit()
            return result is not None

def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Ingest documents into knowledge base",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python ingest.py ~/notes
  python ingest.py ~/projects/myapp --metadata '{"project": "myapp"}'
  python ingest.py document.md --local  # Use CPU embedding
        """,
    )
    parser.add_argument("paths", nargs="+", type=Path, help="Files or directories to ingest")
    parser.add_argument(
        "--metadata",
        type=json.loads,
        default={},
        help='JSON metadata to attach (e.g., \'{"project": "myapp"}\')',
    )
    parser.add_argument(
        "--db-url",
        default=config.db_url,
        help="Database URL",
    )
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use local CPU embedding instead of Modal",
    )

    args = parser.parse_args()

    ingester = Ingester(args.db_url, use_modal=not args.local)

    # Collect documents
    documents = []
    for path in args.paths:
        path = path.resolve()
        if path.is_dir():
            documents.extend(collect_documents(path, args.metadata))
        elif path.is_file():
            # Check security patterns first
            skip_check = check_file_skip(path)
            if skip_check.should_skip:
                prefix = "BLOCKED" if skip_check.is_security else "Skipping"
                print(f"{prefix}: {path} ({skip_check.reason})", file=sys.stderr)
                continue

            # For explicitly provided files, try to parse even with unknown extension
            # Always allow .pdf and .docx even if not in config (user may have old config)
            if path.suffix in config.all_extensions or path.suffix in (".pdf", ".docx"):
                documents.extend(parse_document(path, args.metadata))
            elif is_text_file(path):
                print(f"Parsing as text: {path}", file=sys.stderr)
                documents.extend(parse_document(path, args.metadata, force=True))
            else:
                print(f"Skipping binary file: {path}", file=sys.stderr)

    if not documents:
        print("No documents found to ingest")
        return

    print(f"Found {len(documents)} documents to process")
    ingester.ingest_documents(documents)
    print("Done!")


if __name__ == "__main__":
    main()