agmem 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

@@ -0,0 +1,538 @@
+ """
+ Progressive Disclosure Search - SQLite FTS5 based multi-layer search.
+
+ Implements 3-tier search to minimize token usage while maximizing relevance:
+ - Layer 1: Lightweight Index (metadata + first line)
+ - Layer 2: Timeline Context (file summaries by date)
+ - Layer 3: Full Details (complete file content)
+ """
+
+ import hashlib
+ import json
+ import os
+ import sqlite3
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+
+
+ @dataclass
+ class IndexEntry:
+     """A single entry in the search index."""
+
+     file_hash: str
+     path: str
+     filename: str
+     memory_type: str
+     first_line: str
+     modified_time: str
+     size_bytes: int
+     commit_hash: Optional[str] = None
+     metadata: Optional[Dict[str, Any]] = None
+
+
+ @dataclass
+ class SearchResult:
+     """A search result with relevance info."""
+
+     path: str
+     filename: str
+     memory_type: str
+     first_line: str
+     snippet: str
+     score: float
+     modified_time: str
+     size_bytes: int
+
+
+ @dataclass
+ class TimelineEntry:
+     """A timeline entry grouping files by date."""
+
+     date: str
+     file_count: int
+     files: List[Dict[str, str]]
+     summary: Optional[str] = None
+
+
+ class SearchIndex:
+     """SQLite FTS5 based search index for memory files."""
+
+     SCHEMA = """
+     -- Main index table
+     CREATE TABLE IF NOT EXISTS file_index (
+         id INTEGER PRIMARY KEY AUTOINCREMENT,
+         file_hash TEXT UNIQUE NOT NULL,
+         path TEXT NOT NULL,
+         filename TEXT NOT NULL,
+         memory_type TEXT NOT NULL,
+         first_line TEXT,
+         content_preview TEXT,
+         modified_time TEXT NOT NULL,
+         size_bytes INTEGER NOT NULL,
+         commit_hash TEXT,
+         metadata_json TEXT,
+         indexed_at TEXT NOT NULL
+     );
+
+     -- FTS5 virtual table for full-text search (standalone)
+     CREATE VIRTUAL TABLE IF NOT EXISTS file_fts USING fts5(
+         file_hash,
+         path,
+         filename,
+         first_line,
+         content_preview
+     );
+
+     -- Indexes for common queries
+     CREATE INDEX IF NOT EXISTS idx_memory_type ON file_index(memory_type);
+     CREATE INDEX IF NOT EXISTS idx_modified_time ON file_index(modified_time);
+     CREATE INDEX IF NOT EXISTS idx_path ON file_index(path);
+     CREATE INDEX IF NOT EXISTS idx_file_hash ON file_index(file_hash);
+
+     -- Timeline view helper table
+     CREATE TABLE IF NOT EXISTS timeline_cache (
+         date TEXT PRIMARY KEY,
+         file_count INTEGER,
+         files_json TEXT,
+         updated_at TEXT
+     );
+     """
+
+     def __init__(self, mem_dir: Path):
+         self.mem_dir = Path(mem_dir)
+         self.db_path = self.mem_dir / "search_index.db"
+         self._conn: Optional[sqlite3.Connection] = None
+
+     def _get_connection(self) -> sqlite3.Connection:
+         """Get or create SQLite connection."""
+         if self._conn is None:
+             self.mem_dir.mkdir(parents=True, exist_ok=True)
+             self._conn = sqlite3.connect(str(self.db_path))
+             self._conn.row_factory = sqlite3.Row
+             # Enable FTS5
+             self._conn.execute("PRAGMA journal_mode=WAL")
+             self._init_schema()
+         return self._conn
+
+     def _init_schema(self) -> None:
+         """Initialize database schema."""
+         conn = self._conn
+         cursor = conn.cursor()
+         for statement in self.SCHEMA.split(";"):
+             stmt = statement.strip()
+             if stmt:
+                 try:
+                     cursor.execute(stmt)
+                 except sqlite3.OperationalError:
+                     pass  # Table may already exist
+         conn.commit()
+
+     def close(self) -> None:
+         """Close database connection."""
+         if self._conn:
+             self._conn.close()
+             self._conn = None
+
+     # --- Indexing ---
+
+     def index_file(self, path: Path, content: str, commit_hash: Optional[str] = None) -> str:
+         """Index a single file. Returns the file hash."""
+         conn = self._get_connection()
+
+         # Calculate file hash
+         file_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
+
+         # Extract metadata
+         filename = path.name
+         memory_type = self._extract_memory_type(path)
+         first_line = self._extract_first_line(content)
+         content_preview = content[:500]  # First 500 chars for FTS
+         stat = path.stat() if path.exists() else None
+         modified_time = (
+             datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat()
+             if stat
+             else datetime.now(timezone.utc).isoformat()
+         )
+         size_bytes = stat.st_size if stat else len(content.encode())
+
+         # Parse YAML frontmatter for metadata
+         metadata = self._extract_frontmatter(content)
+
+         # Insert or replace in main table
+         cursor = conn.cursor()
+
+         # Delete existing entry if present (for proper FTS sync)
+         cursor.execute("DELETE FROM file_fts WHERE file_hash = ?", (file_hash,))
+         cursor.execute("DELETE FROM file_index WHERE file_hash = ?", (file_hash,))
+
+         cursor.execute(
+             """
+             INSERT INTO file_index
+             (file_hash, path, filename, memory_type, first_line, content_preview, modified_time, size_bytes, commit_hash, metadata_json, indexed_at)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """,
+             (
+                 file_hash,
+                 str(path),
+                 filename,
+                 memory_type,
+                 first_line,
+                 content_preview,
+                 modified_time,
+                 size_bytes,
+                 commit_hash,
+                 json.dumps(metadata) if metadata else None,
+                 datetime.now(timezone.utc).isoformat(),
+             ),
+         )
+
+         # Insert into FTS index
+         cursor.execute(
+             """
+             INSERT INTO file_fts (file_hash, path, filename, first_line, content_preview)
+             VALUES (?, ?, ?, ?, ?)
+             """,
+             (file_hash, str(path), filename, first_line, content_preview),
+         )
+
+         conn.commit()
+         return file_hash
+
+     def index_directory(self, current_dir: Path) -> int:
+         """Recursively index all files in current/ directory. Returns count of indexed files."""
+         count = 0
+         for memory_type in ["episodic", "semantic", "procedural"]:
+             type_dir = current_dir / memory_type
+             if not type_dir.exists():
+                 continue
+
+             for filepath in type_dir.rglob("*"):
+                 if filepath.is_file():
+                     try:
+                         content = filepath.read_text(encoding="utf-8", errors="replace")
+                         self.index_file(filepath, content)
+                         count += 1
+                     except Exception:
+                         pass
+
+         self._update_timeline_cache()
+         return count
+
+     def _extract_memory_type(self, path: Path) -> str:
+         """Extract memory type from path."""
+         parts = path.parts
+         for memory_type in ["episodic", "semantic", "procedural"]:
+             if memory_type in parts:
+                 return memory_type
+         return "unknown"
+
+     def _extract_first_line(self, content: str) -> str:
+         """Extract meaningful first line from content."""
+         lines = content.strip().split("\n")
+         for line in lines:
+             line = line.strip()
+             # Skip frontmatter delimiters and empty lines
+             if line and line != "---" and not line.startswith("#"):
+                 return line[:200]
+             # Use first heading if present
+             if line.startswith("#"):
+                 return line.lstrip("#").strip()[:200]
+         return lines[0][:200] if lines else ""
+
+     def _extract_frontmatter(self, content: str) -> Optional[Dict[str, Any]]:
+         """Extract YAML frontmatter if present."""
+         if not content.startswith("---"):
+             return None
+
+         try:
+             end = content.find("---", 3)
+             if end == -1:
+                 return None
+             frontmatter = content[3:end].strip()
+
+             # Simple YAML parsing (key: value only)
+             metadata = {}
+             for line in frontmatter.split("\n"):
+                 if ":" in line:
+                     key, value = line.split(":", 1)
+                     metadata[key.strip()] = value.strip().strip('"').strip("'")
+             return metadata
+         except Exception:
+             return None
+
+     def _update_timeline_cache(self) -> None:
+         """Update the timeline cache table."""
+         conn = self._get_connection()
+         cursor = conn.cursor()
+
+         # Group files by date
+         cursor.execute(
+             """
+             SELECT DATE(modified_time) as date, COUNT(*) as count,
+                    GROUP_CONCAT(path || '|' || filename || '|' || memory_type, ';;') as files
+             FROM file_index
+             GROUP BY DATE(modified_time)
+             ORDER BY date DESC
+             """
+         )
+
+         for row in cursor.fetchall():
+             files_list = []
+             if row["files"]:
+                 for file_str in row["files"].split(";;"):
+                     parts = file_str.split("|")
+                     if len(parts) >= 3:
+                         files_list.append(
+                             {
+                                 "path": parts[0],
+                                 "filename": parts[1],
+                                 "memory_type": parts[2],
+                             }
+                         )
+
+             cursor.execute(
+                 """
+                 INSERT OR REPLACE INTO timeline_cache (date, file_count, files_json, updated_at)
+                 VALUES (?, ?, ?, ?)
+                 """,
+                 (
+                     row["date"],
+                     row["count"],
+                     json.dumps(files_list),
+                     datetime.now(timezone.utc).isoformat(),
+                 ),
+             )
+
+         conn.commit()
+
+     # --- Layer 1: Lightweight Index Search ---
+
+     def search_index(
+         self,
+         query: str,
+         memory_type: Optional[str] = None,
+         limit: int = 20,
+     ) -> List[SearchResult]:
+         """Layer 1: Search the lightweight index. Returns metadata + first line only."""
+         conn = self._get_connection()
+         cursor = conn.cursor()
+
+         # Build FTS5 query
+         fts_query = self._build_fts_query(query)
+
+         sql = """
+             SELECT f.path, f.filename, f.memory_type, f.first_line, f.modified_time, f.size_bytes,
+                    bm25(file_fts) as score,
+                    snippet(file_fts, 4, '<b>', '</b>', '...', 32) as snippet
+             FROM file_fts
+             JOIN file_index f ON file_fts.file_hash = f.file_hash
+             WHERE file_fts MATCH ?
+         """
+         params: List[Any] = [fts_query]
+
+         if memory_type:
+             sql += " AND f.memory_type = ?"
+             params.append(memory_type)
+
+         sql += " ORDER BY score LIMIT ?"
+         params.append(limit)
+
+         cursor.execute(sql, params)
+
+         results = []
+         for row in cursor.fetchall():
+             results.append(
+                 SearchResult(
+                     path=row["path"],
+                     filename=row["filename"],
+                     memory_type=row["memory_type"],
+                     first_line=row["first_line"],
+                     snippet=row["snippet"] or "",
+                     score=abs(row["score"]),
+                     modified_time=row["modified_time"],
+                     size_bytes=row["size_bytes"],
+                 )
+             )
+
+         return results
+
+     def _build_fts_query(self, query: str) -> str:
+         """Build FTS5 query from user query."""
+         # Simple tokenization - split on spaces, add wildcard for prefix matching
+         tokens = query.strip().split()
+         if not tokens:
+             return "*"
+
+         # For single token, use prefix match
+         if len(tokens) == 1:
+             return f'"{tokens[0]}"*'
+
+         # For multiple tokens, use AND with prefix match on last token
+         parts = [f'"{t}"' for t in tokens[:-1]]
+         parts.append(f'"{tokens[-1]}"*')
+         return " AND ".join(parts)
+
+     # --- Layer 2: Timeline Context ---
+
+     def get_timeline(
+         self,
+         start_date: Optional[str] = None,
+         end_date: Optional[str] = None,
+         limit: int = 10,
+     ) -> List[TimelineEntry]:
+         """Layer 2: Get timeline of files grouped by date."""
+         conn = self._get_connection()
+         cursor = conn.cursor()
+
+         sql = "SELECT date, file_count, files_json FROM timeline_cache"
+         conditions = []
+         params: List[Any] = []
+
+         if start_date:
+             conditions.append("date >= ?")
+             params.append(start_date)
+         if end_date:
+             conditions.append("date <= ?")
+             params.append(end_date)
+
+         if conditions:
+             sql += " WHERE " + " AND ".join(conditions)
+
+         sql += " ORDER BY date DESC LIMIT ?"
+         params.append(limit)
+
+         cursor.execute(sql, params)
+
+         results = []
+         for row in cursor.fetchall():
+             files = json.loads(row["files_json"]) if row["files_json"] else []
+             results.append(
+                 TimelineEntry(
+                     date=row["date"],
+                     file_count=row["file_count"],
+                     files=files,
+                 )
+             )
+
+         return results
+
+     def get_context_around(
+         self,
+         path: str,
+         window_hours: int = 24,
+     ) -> List[Dict[str, Any]]:
+         """Get files modified around the same time as a given file."""
+         conn = self._get_connection()
+         cursor = conn.cursor()
+
+         # Get the file's modification time
+         cursor.execute("SELECT modified_time FROM file_index WHERE path = ?", (path,))
+         row = cursor.fetchone()
+         if not row:
+             return []
+
+         base_time = row["modified_time"]
+
+         # Find files within the time window
+         cursor.execute(
+             """
+             SELECT path, filename, memory_type, first_line, modified_time
+             FROM file_index
+             WHERE ABS(JULIANDAY(modified_time) - JULIANDAY(?)) * 24 <= ?
+               AND path != ?
+             ORDER BY ABS(JULIANDAY(modified_time) - JULIANDAY(?))
+             LIMIT 20
+             """,
+             (base_time, window_hours, path, base_time),
+         )
+
+         return [dict(row) for row in cursor.fetchall()]
+
+     # --- Layer 3: Full Details ---
+
+     def get_full_details(self, paths: List[str]) -> List[Dict[str, Any]]:
+         """Layer 3: Get full file details for specific paths."""
+         results = []
+         for path in paths:
+             filepath = Path(path)
+             if not filepath.exists():
+                 continue
+
+             try:
+                 content = filepath.read_text(encoding="utf-8", errors="replace")
+                 results.append(
+                     {
+                         "path": str(path),
+                         "filename": filepath.name,
+                         "content": content,
+                         "size_bytes": len(content.encode()),
+                     }
+                 )
+             except Exception:
+                 pass
+
+         return results
+
+     # --- Statistics ---
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get index statistics."""
+         conn = self._get_connection()
+         cursor = conn.cursor()
+
+         cursor.execute("SELECT COUNT(*) as total FROM file_index")
+         total = cursor.fetchone()["total"]
+
+         cursor.execute(
+             """
+             SELECT memory_type, COUNT(*) as count
+             FROM file_index
+             GROUP BY memory_type
+             """
+         )
+         by_type = {row["memory_type"]: row["count"] for row in cursor.fetchall()}
+
+         cursor.execute("SELECT SUM(size_bytes) as total_size FROM file_index")
+         total_size = cursor.fetchone()["total_size"] or 0
+
+         return {
+             "total_files": total,
+             "by_type": by_type,
+             "total_size_bytes": total_size,
+             "db_path": str(self.db_path),
+         }
+
+
+ # --- Token Cost Estimation ---
+
+
+ def estimate_token_cost(text: str) -> int:
+     """Estimate token count for text (rough approximation)."""
+     # Rough estimate: ~4 characters per token
+     return len(text) // 4
+
+
+ def layer1_cost(results: List[SearchResult]) -> int:
+     """Estimate token cost for Layer 1 results."""
+     total = 0
+     for r in results:
+         total += estimate_token_cost(r.path + r.first_line + r.snippet)
+     return total
+
+
+ def layer2_cost(timeline: List[TimelineEntry]) -> int:
+     """Estimate token cost for Layer 2 results."""
+     total = 0
+     for t in timeline:
+         total += estimate_token_cost(str(t.files))
+     return total
+
+
+ def layer3_cost(details: List[Dict[str, Any]]) -> int:
+     """Estimate token cost for Layer 3 results."""
+     total = 0
+     for d in details:
+         total += estimate_token_cost(d.get("content", ""))
+     return total
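
For orientation, a minimal usage sketch of the SearchIndex API added in this release. The memory directory location and the query strings below are illustrative assumptions, not taken from the package; only the class, methods, and cost helpers shown in the diff above are used.

    from pathlib import Path

    mem_dir = Path(".mem")                      # hypothetical memory store location
    index = SearchIndex(mem_dir)
    index.index_directory(mem_dir / "current")  # expects episodic/, semantic/, procedural/ subdirs

    # Layer 1: lightweight metadata search (cheapest in tokens)
    hits = index.search_index("fts5 schema", memory_type="semantic", limit=10)
    print("layer 1 ~tokens:", layer1_cost(hits))

    # Layer 2: timeline context grouped by date
    timeline = index.get_timeline(limit=5)
    print("layer 2 ~tokens:", layer2_cost(timeline))

    # Layer 3: full content only for the most promising paths
    details = index.get_full_details([h.path for h in hits[:3]])
    print("layer 3 ~tokens:", layer3_cost(details))

    index.close()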