cosma_backend-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cosma_backend/__init__.py +14 -0
  2. cosma_backend/__main__.py +4 -0
  3. cosma_backend/api/__init__.py +29 -0
  4. cosma_backend/api/files.py +154 -0
  5. cosma_backend/api/index.py +114 -0
  6. cosma_backend/api/models.py +28 -0
  7. cosma_backend/api/search.py +166 -0
  8. cosma_backend/api/status.py +28 -0
  9. cosma_backend/api/updates.py +67 -0
  10. cosma_backend/api/watch.py +156 -0
  11. cosma_backend/app.py +192 -0
  12. cosma_backend/db/__init__.py +2 -0
  13. cosma_backend/db/database.py +638 -0
  14. cosma_backend/discoverer/__init__.py +1 -0
  15. cosma_backend/discoverer/discoverer.py +34 -0
  16. cosma_backend/embedder/__init__.py +1 -0
  17. cosma_backend/embedder/embedder.py +637 -0
  18. cosma_backend/logging.py +73 -0
  19. cosma_backend/models/__init__.py +3 -0
  20. cosma_backend/models/file.py +169 -0
  21. cosma_backend/models/status.py +10 -0
  22. cosma_backend/models/update.py +202 -0
  23. cosma_backend/models/watch.py +132 -0
  24. cosma_backend/pipeline/__init__.py +2 -0
  25. cosma_backend/pipeline/pipeline.py +222 -0
  26. cosma_backend/schema.sql +319 -0
  27. cosma_backend/searcher/__init__.py +1 -0
  28. cosma_backend/searcher/searcher.py +397 -0
  29. cosma_backend/summarizer/__init__.py +44 -0
  30. cosma_backend/summarizer/summarizer.py +1075 -0
  31. cosma_backend/utils/bundled.py +24 -0
  32. cosma_backend/utils/pubsub.py +31 -0
  33. cosma_backend/utils/sse.py +92 -0
  34. cosma_backend/watcher/__init__.py +1 -0
  35. cosma_backend/watcher/awatchdog.py +80 -0
  36. cosma_backend/watcher/watcher.py +257 -0
  37. cosma_backend-0.1.0.dist-info/METADATA +23 -0
  38. cosma_backend-0.1.0.dist-info/RECORD +39 -0
  39. cosma_backend-0.1.0.dist-info/WHEEL +4 -0
cosma_backend/db/database.py
@@ -0,0 +1,638 @@
+ from __future__ import annotations
+
+ import datetime
+ import logging
+ import struct
+ from sqlite3 import Row
+ from types import TracebackType
+ from typing import TYPE_CHECKING, Optional, Self, Type
+
+ import asqlite
+ import numpy as np
+ import sqlite_vec
+
+ from cosma_backend.logging import sm
+ from cosma_backend.models import File
+ from cosma_backend.models.watch import WatchedDirectory
+ from cosma_backend.utils.bundled import get_bundled_file_text
+
+ if TYPE_CHECKING:
+     from sqlite3 import Connection as Sqlite3Connection
+
+ # The schema file is bundled with the distribution
+ SCHEMA_FILE = "./schema.sql"
+
+ logger = logging.getLogger(__name__)
+
+
+ def to_timestamp(dt: datetime.datetime | None) -> int | None:
+     return int(dt.timestamp()) if dt else None
+
+
+ class Database:
+     pool: asqlite.Pool
+     _closed: bool
+
+     # ====== Python Magic Functions ======
+     # __aenter__ and __aexit__ implement the async context manager protocol
+
+     def __init__(self, pool: asqlite.Pool):
+         self.pool = pool
+         self._closed = False
+
+     @classmethod
+     async def from_path(cls, path: str) -> Self:
+         def init_conn(conn: Sqlite3Connection):
+             # Load the sqlite_vec extension into each new connection
+             conn.enable_load_extension(True)
+             sqlite_vec.load(conn)
+             conn.enable_load_extension(False)
+
+         pool = await asqlite.create_pool(path, init=init_conn)
+
+         # Perform migrations (or create tables) from the bundled schema
+         schema = get_bundled_file_text(SCHEMA_FILE)
+
+         async with pool.acquire() as conn:
+             await conn.executescript(schema)
+
+         return cls(pool)
+
+     async def close(self):
+         await self.pool.close()
+         self._closed = True
+
+     async def __aenter__(self) -> Self:
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_value: Optional[BaseException],
+         traceback: Optional[TracebackType],
+     ) -> None:
+         if not self._closed:
+             await self.pool.close()
+             self._closed = True
+
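Because __aenter__ hands back the instance, a Database can be driven with `async with`; a minimal usage sketch (illustrative, not part of the packaged code):

    import asyncio

    async def main() -> None:
        # from_path creates the pool and runs the bundled schema.sql
        db = await Database.from_path("files.db")
        async with db:
            ...  # issue queries while the pool is open
        # __aexit__ closes the pool exactly once

    asyncio.run(main())
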
+     # ====== Helper Functions ======
+     def acquire(self) -> asqlite._AcquireProxyContextManager:
+         return self.pool.acquire()
+
+     # ====== Database Logic ======
+     async def insert_file(self, file: File):
+         SQL = """INSERT INTO files (filename, extension, created, modified, summary)
+         VALUES (?, ?, ?, ?, ?)
+         RETURNING id;
+         """
+
+         async with self.acquire() as conn:
+             async with conn.execute(SQL, (
+                 file.filename,
+                 file.extension,
+                 to_timestamp(file.created),
+                 to_timestamp(file.modified),
+                 file.summary,
+             )) as cursor:
+                 row = await cursor.fetchone()
+                 file.id = row[0]
+
+     async def get_file_by_path(self, file_path: str) -> Optional[File]:
+         """Get a file by its path."""
+         SQL = "SELECT * FROM files WHERE file_path = ?"
+         async with self.acquire() as conn:
+             async with conn.execute(SQL, (file_path,)) as cursor:
+                 row = await cursor.fetchone()
+                 if row:
+                     return File.from_row(row)
+                 return None
+
+     async def get_file_by_hash(self, content_hash: str) -> Optional[File]:
+         """Get a file by its content hash."""
+         SQL = "SELECT * FROM files WHERE content_hash = ?"
+         async with self.acquire() as conn:
+             async with conn.execute(SQL, (content_hash,)) as cursor:
+                 row = await cursor.fetchone()
+                 if row:
+                     return File.from_row(row)
+                 return None
+
+     async def upsert_file(self, file_data: File) -> int:
+         """
+         Insert or update a file record.
+         Returns the file ID.
+         """
+         # Check whether the path is already indexed
+         async with self.acquire() as conn:
+             existing = await conn.fetchone(
+                 "SELECT 1 FROM files WHERE file_path = ?",
+                 (file_data.file_path,)
+             )
+
+         if existing:
+             # Update
+             SQL = """
+             UPDATE files
+             SET filename=?, extension=?, file_size=?, created=?, modified=?, accessed=?,
+                 content_type=?, content_hash=?,
+                 summary=?, title=?, status=?, processing_error=?,
+                 parsed_at=?, summarized_at=?, embedded_at=?, updated_at=(strftime('%s', 'now'))
+             WHERE file_path=?
+             RETURNING id
+             """
+             params = (
+                 file_data.filename, file_data.extension, file_data.file_size,
+                 to_timestamp(file_data.created), to_timestamp(file_data.modified), to_timestamp(file_data.accessed),
+                 file_data.content_type, file_data.content_hash,
+                 file_data.summary, file_data.title, file_data.status.name, file_data.processing_error,
+                 to_timestamp(file_data.parsed_at), to_timestamp(file_data.summarized_at), to_timestamp(file_data.embedded_at),
+                 file_data.file_path
+             )
+         else:
+             # Insert
+             SQL = """
+             INSERT INTO files (
+                 file_path, filename, extension, file_size, created, modified, accessed,
+                 content_type, content_hash, summary, title,
+                 status, processing_error, parsed_at, summarized_at, embedded_at
+             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             RETURNING id
+             """
+             params = (
+                 file_data.file_path, file_data.filename, file_data.extension,
+                 file_data.file_size, to_timestamp(file_data.created), to_timestamp(file_data.modified), to_timestamp(file_data.accessed),
+                 file_data.content_type, file_data.content_hash,
+                 file_data.summary, file_data.title, file_data.status.name, file_data.processing_error,
+                 to_timestamp(file_data.parsed_at), to_timestamp(file_data.summarized_at), to_timestamp(file_data.embedded_at)
+             )
+
+         async with self.acquire() as conn:
+             async with conn.execute(SQL, params) as cursor:
+                 row = await cursor.fetchone()
+                 file_data.id = row[0]
+
+             # Update keywords in the file_keywords table
+             if file_data.keywords:
+                 # Delete existing keywords
+                 await conn.execute(
+                     "DELETE FROM file_keywords WHERE file_id = ?",
+                     (file_data.id,)
+                 )
+
+                 # Insert new keywords
+                 for keyword in file_data.keywords:
+                     await conn.execute(
+                         "INSERT INTO file_keywords (file_id, keyword) VALUES (?, ?)",
+                         (file_data.id, keyword)
+                     )
+
+         return file_data.id
+
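A sketch of the intended upsert flow, assuming File.from_path (the constructor used by this package's Discoverer) fills in the stat-based metadata fields referenced above:

    from pathlib import Path

    async def index_one(db: Database, path: Path) -> int:
        file = File.from_path(path)
        # Inserts on first sight, updates on re-index; keywords, when
        # present, are rewritten as part of the same call
        return await db.upsert_file(file)
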
+     # ====== Vector Embedding Operations ======
+
+     def _serialize_vector(self, vector: np.ndarray) -> bytes:
+         """Serialize a numpy array to bytes for sqlite-vec storage."""
+         # Ensure the vector is float32
+         vector = vector.astype(np.float32)
+         # Pack as raw float bytes
+         return struct.pack(f"{len(vector)}f", *vector)
+
+     def _deserialize_vector(self, blob: bytes, dimensions: int) -> np.ndarray:
+         """Deserialize bytes from sqlite-vec to a numpy array."""
+         # Unpack bytes into a float tuple
+         values = struct.unpack(f"{dimensions}f", blob)
+         return np.array(values, dtype=np.float32)
+
+     def _normalize_embedding_dimensions(self, embedding: np.ndarray, target_dimensions: int = 1536) -> np.ndarray:
+         """
+         Normalize an embedding to the target dimension count by zero-padding or truncating.
+
+         Args:
+             embedding: Input embedding vector
+             target_dimensions: Target dimension count (default 1536)
+
+         Returns:
+             Normalized embedding vector
+         """
+         current_dims = embedding.shape[0]
+
+         if current_dims == target_dimensions:
+             return embedding
+         if current_dims < target_dimensions:
+             # Pad with zeros
+             padded = np.zeros(target_dimensions, dtype=embedding.dtype)
+             padded[:current_dims] = embedding
+             return padded
+         # Truncate to target dimensions
+         return embedding[:target_dimensions]
+
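The two struct helpers are inverses, and the padding mirrors _normalize_embedding_dimensions; a self-contained round-trip check (sketch, independent of the database):

    import struct

    import numpy as np

    vec = np.random.rand(384).astype(np.float32)
    blob = struct.pack(f"{len(vec)}f", *vec)  # native-endian float32 bytes
    restored = np.array(struct.unpack(f"{len(vec)}f", blob), dtype=np.float32)
    assert np.allclose(vec, restored)

    # Zero-pad a 384-dim vector to the 1536-dim storage size
    padded = np.zeros(1536, dtype=np.float32)
    padded[:len(vec)] = vec
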
+     async def upsert_file_embeddings(self, file: File) -> None:
+         """
+         Insert or update the embedding for a file.
+
+         Args:
+             file: File whose id, embedding, embedding_model, and
+                 embedding_dimensions fields are used
+         """
+         logger.debug(sm("Inserting embedding", file_id=file.id, model=file.embedding_model, dimensions=file.embedding_dimensions))
+
+         # Normalize the embedding to 1536 dimensions for consistent storage
+         normalized_embedding = self._normalize_embedding_dimensions(file.embedding)
+
+         # Serialize the embedding
+         embedding_blob = self._serialize_vector(normalized_embedding)
+
+         async with self.acquire() as conn:
+             # Delete any existing embedding first (vec0 virtual tables
+             # don't support INSERT OR REPLACE)
+             await conn.execute(
+                 "DELETE FROM file_embeddings WHERE file_id = ?",
+                 (file.id,)
+             )
+
+             # Insert into the vec0 table; the blob is padded to 1536 dims,
+             # while embedding_dimensions records the model's true size so
+             # reads can trim the padding back off
+             await conn.execute(
+                 """
+                 INSERT INTO file_embeddings(file_id, embedding_model, embedding_dimensions, embedding)
+                 VALUES (?, ?, ?, ?)
+                 """,
+                 (file.id, file.embedding_model, file.embedding_dimensions, embedding_blob)
+             )
+
+         logger.info(sm("Embedding inserted successfully", file_id=file.id))
+
+     async def search_similar_files(self, query_embedding: np.ndarray, limit: int = 10, threshold: float | None = None, directory: str | None = None) -> list[tuple[File, float]]:
+         """
+         Search for files similar to the query embedding.
+
+         Args:
+             query_embedding: Query vector as a numpy array
+             limit: Maximum number of results
+             threshold: Optional similarity threshold (lower is more similar)
+             directory: Optional directory path to limit search scope
+
+         Returns:
+             List of tuples (File, distance)
+         """
+         logger.debug(sm("Searching similar files", limit=limit, threshold=threshold, directory=directory))
+
+         # Normalize and serialize the query embedding
+         normalized_embedding = self._normalize_embedding_dimensions(query_embedding)
+         query_blob = self._serialize_vector(normalized_embedding)
+
+         # Build the SQL query with the k parameter for KNN search
+         SQL = """
+         SELECT
+             f.*,
+             GROUP_CONCAT(fk.keyword, '||') as keywords_str,
+             distance
+         FROM file_embeddings
+         INNER JOIN files f ON file_embeddings.file_id = f.id
+         LEFT JOIN file_keywords fk ON f.id = fk.file_id
+         WHERE embedding MATCH ? AND k = ?
+         """
+
+         params: list = [query_blob, limit]
+
+         if threshold is not None:
+             SQL += " AND distance <= ?"
+             params.append(threshold)
+
+         if directory is not None:
+             SQL += " AND (f.file_path LIKE ? || '/%' OR f.file_path = ?)"
+             params.extend([directory, directory])
+
+         SQL += """
+         GROUP BY f.id
+         ORDER BY distance
+         LIMIT ?
+         """
+
+         params.append(limit)
+
+         async with self.acquire() as conn:
+             rows = await conn.fetchall(SQL, tuple(params))
+
+         results = []
+         for row in rows:
+             file = File.from_row(row)
+             distance = row["distance"]
+             results.append((file, distance))
+
+         logger.info(sm("Found similar files", count=len(results)))
+         return results
+
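A hedged usage sketch; the query vector would come from the same model that produced the stored embeddings (the embedder's API is not shown in this hunk), and the directory path is illustrative:

    import numpy as np

    async def find_similar(db: Database, query_vec: np.ndarray) -> None:
        results = await db.search_similar_files(query_vec, limit=5, directory="/home/user/docs")
        for file, distance in results:
            # Lower distance means more similar
            print(f"{distance:6.3f}  {file.file_path}")
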
+     async def get_file_embedding(self, file_id: str) -> tuple[np.ndarray, str, int] | None:
+         """
+         Get the embedding vector for a file.
+
+         Args:
+             file_id: ID of the file
+
+         Returns:
+             Tuple of (embedding, model_name, dimensions) or None
+         """
+         SQL = """
+         SELECT embedding, embedding_model, embedding_dimensions
+         FROM file_embeddings
+         WHERE file_id = ?
+         """
+
+         async with self.acquire() as conn:
+             row = await conn.fetchone(SQL, (file_id,))
+
+         if not row:
+             return None
+
+         # Vectors are padded to 1536 dimensions before storage, so
+         # deserialize at 1536 and trim back to the model's true size
+         embedding = self._deserialize_vector(row["embedding"], 1536)
+
+         original_dimensions = row["embedding_dimensions"]
+         if original_dimensions < 1536:
+             embedding = embedding[:original_dimensions]
+
+         return (embedding, row["embedding_model"], original_dimensions)
+
+     async def delete_embedding(self, file_id: str) -> bool:
+         """
+         Delete the embedding for a file.
+
+         Args:
+             file_id: ID of the file
+
+         Returns:
+             True if deleted, False if not found
+         """
+         SQL = "DELETE FROM file_embeddings WHERE file_id = ?"
+
+         async with self.acquire() as conn:
+             cursor = await conn.execute(SQL, (file_id,))
+             rows_affected = cursor.get_cursor().rowcount
+
+             if rows_affected > 0:
+                 logger.info(sm("Embedding deleted", file_id=file_id))
+                 return True
+
+             return False
+
+     async def delete_file(self, file_path: str) -> File | None:
+         """
+         Delete a file record.
+
+         Args:
+             file_path: Path of the file to delete
+
+         Returns:
+             The deleted File, or None if not found
+         """
+         async with self.acquire() as conn:
+             # Delete the file record and return it
+             row = await conn.fetchone("DELETE FROM files WHERE file_path = ? RETURNING *", (file_path,))
+
+             if not row:
+                 return None
+
+             return File.from_row(row)
+
+     async def add_watched_directory(self, watched_dir: WatchedDirectory) -> int:
+         """
+         Add a directory to the watched_directories table.
+
+         Args:
+             watched_dir: WatchedDirectory instance to add
+
+         Returns:
+             The ID of the watched directory record
+         """
+         SQL = """
+         INSERT INTO watched_directories (path, recursive, file_pattern, last_scan)
+         VALUES (?, ?, ?, (strftime('%s', 'now')))
+         ON CONFLICT(path) DO UPDATE SET
+             is_active = 1,
+             recursive = excluded.recursive,
+             file_pattern = excluded.file_pattern,
+             last_scan = (strftime('%s', 'now'))
+         RETURNING id
+         """
+
+         async with self.acquire() as conn:
+             async with conn.execute(SQL, (
+                 watched_dir.path_str,
+                 1 if watched_dir.recursive else 0,
+                 watched_dir.file_pattern
+             )) as cursor:
+                 row = await cursor.fetchone()
+                 watched_dir.id = row[0]
+                 logger.info(sm("Watched directory added", path=watched_dir.path_str, id=watched_dir.id, recursive=watched_dir.recursive))
+                 return watched_dir.id
+
+     async def get_watched_directories(self, active_only: bool = True) -> list[WatchedDirectory]:
+         """
+         Get all watched directories from the database.
+
+         Args:
+             active_only: If True, only return active directories (default: True)
+
+         Returns:
+             List of WatchedDirectory instances
+         """
+         SQL = """
+         SELECT id, path, is_active, recursive, file_pattern, last_scan, created_at, updated_at
+         FROM watched_directories
+         """
+
+         if active_only:
+             SQL += " WHERE is_active = 1"
+
+         SQL += " ORDER BY created_at"
+
+         async with self.acquire() as conn:
+             rows = await conn.fetchall(SQL)
+
+         directories = []
+         for row in rows:
+             watched_dir = WatchedDirectory.from_row(row)
+             directories.append(watched_dir)
+
+         logger.info(sm("Retrieved watched directories", count=len(directories), active_only=active_only))
+         return directories
+
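A small sketch of reading the watch list back; only fields visible in this file (id, path_str, recursive, file_pattern) are used:

    async def show_watches(db: Database) -> None:
        # ON CONFLICT(path) in add_watched_directory means re-adding a path
        # simply reactivates it and refreshes last_scan
        for wd in await db.get_watched_directories(active_only=True):
            print(wd.id, wd.path_str, wd.recursive, wd.file_pattern)
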
+     async def delete_watched_directory(self, job_id: int) -> WatchedDirectory | None:
+         """
+         Delete a watched directory by ID and clean up all associated files.
+
+         Args:
+             job_id: ID of the watched directory to delete
+
+         Returns:
+             WatchedDirectory instance if deleted, None if not found
+         """
+         async with self.acquire() as conn:
+             # First, get the watched directory to retrieve its path
+             get_dir_sql = "SELECT * FROM watched_directories WHERE id = ?"
+             row = await conn.fetchone(get_dir_sql, (job_id,))
+
+             if not row:
+                 logger.warning(sm("Watched directory not found for deletion", job_id=job_id))
+                 return None
+
+             watched_dir = WatchedDirectory.from_row(row)
+             directory_path = watched_dir.path_str
+
+             # Delete all files under this directory; the LIKE pattern matches
+             # both direct children and files in subdirectories
+             delete_files_sql = """
+             DELETE FROM files
+             WHERE file_path LIKE ? || '/%' OR file_path = ?
+             RETURNING id, file_path
+             """
+             deleted_files = await conn.fetchall(delete_files_sql, (directory_path, directory_path))
+
+             # Delete the watched directory itself
+             delete_dir_sql = "DELETE FROM watched_directories WHERE id = ?"
+             await conn.execute(delete_dir_sql, (job_id,))
+
+             logger.info(sm(
+                 "Watched directory and associated files deleted",
+                 job_id=job_id,
+                 path=directory_path,
+                 files_deleted=len(deleted_files)
+             ))
+
+             return watched_dir
+
+     async def keyword_search(self, query: str, limit: int = 20, directory: str | None = None) -> list[tuple[File, float]]:
+         """
+         Perform keyword search using FTS5 with BM25 ranking.
+
+         Args:
+             query: Search query
+             limit: Maximum number of results
+             directory: Optional directory path to limit search scope
+
+         Returns:
+             List of tuples (File, relevance_score)
+         """
+         # Quote the whole query as a single FTS5 phrase, escaping any
+         # embedded double quotes
+         escaped_query = query.replace('"', '""')
+         sanitized_query = f'"{escaped_query}"'
+         logger.debug(sm("Performing keyword search", query=sanitized_query, limit=limit, directory=directory))
+
+         # FTS5 query with BM25 ranking
+         # Unquoted queries could use advanced syntax like: "housing AND (apartment OR lease)"
+         # Default BM25 parameters: k1=1.2 (term frequency saturation), b=0.75 (length normalization)
+         SQL = """
+         SELECT
+             f.*,
+             bm25(files_fts) AS relevance_score
+         FROM files_fts
+         JOIN files f ON f.id = files_fts.rowid
+         WHERE files_fts MATCH ?
+         """
+
+         params: list = [sanitized_query]
+
+         if directory is not None:
+             SQL += " AND (f.file_path LIKE ? || '/%' OR f.file_path = ?)"
+             params.extend([directory, directory])
+
+         SQL += """
+         ORDER BY rank
+         LIMIT ?;
+         """
+
+         params.append(limit)
+
+         async with self.acquire() as conn:
+             try:
+                 async with conn.execute(SQL, tuple(params)) as cursor:
+                     rows = await cursor.fetchall()
+             except Exception as e:
+                 logger.error(sm("SQL query failed", error=str(e), query=sanitized_query))
+                 raise
+
+         results = []
+         for row in rows:
+             file = File.from_row(row)
+             # BM25 scores are negative (less negative = better match);
+             # convert to a positive score
+             relevance_score = abs(row["relevance_score"])
+             results.append((file, relevance_score))
+
+         logger.info(sm("Keyword search completed", count=len(results)))
+         return results
+
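Usage sketch; note the method quotes the whole query as one FTS5 phrase, so operators such as AND/OR inside the string are matched literally rather than interpreted:

    async def search_notes(db: Database) -> None:
        results = await db.keyword_search("apartment lease", limit=10)
        for file, score in results:
            # score is abs(bm25(...)); larger here means a stronger match
            print(f"{score:7.3f}  {file.file_path}")
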
+     async def update_file_timestamp(self, file_path: str) -> bool:
+         """
+         Update the updated_at timestamp for a file to the current time.
+         This is used to track which files are still present in the filesystem.
+
+         Args:
+             file_path: Path of the file to update
+
+         Returns:
+             True if updated, False if the file was not found
+         """
+         SQL = "UPDATE files SET updated_at = (strftime('%s', 'now')) WHERE file_path = ?"
+
+         async with self.acquire() as conn:
+             cursor = await conn.execute(SQL, (file_path,))
+             rows_affected = cursor.get_cursor().rowcount
+
+             if rows_affected > 0:
+                 logger.debug(sm("File timestamp updated", file_path=file_path))
+                 return True
+
+             return False
+
614
+ async def delete_files_not_updated_since(self, timestamp: datetime.datetime, directory_path: str) -> list[Row]:
615
+ """
616
+ Delete files that have not been updated since the given timestamp within a specific directory.
617
+ This is used to remove files that are no longer present in the filesystem.
618
+
619
+ Args:
620
+ timestamp: datetime
621
+ directory_path: Directory path to limit deletion scope (only files under this path will be deleted)
622
+
623
+ Returns:
624
+ List of file paths that were deleted
625
+ """
626
+ directory_pattern = f"{directory_path}/%"
627
+
628
+ async with self.acquire() as conn:
629
+ # Delete the files (cascading deletes will handle embeddings and keywords)
630
+ SQL_DELETE = "DELETE FROM files WHERE updated_at < ? AND (file_path LIKE ? OR file_path = ?) RETURNING *"
631
+ rows = await conn.fetchall(SQL_DELETE, (to_timestamp(timestamp), directory_pattern, directory_path))
632
+
633
+ logger.info(sm("Deleted stale files", count=len(rows), timestamp=timestamp, directory=directory_path))
634
+ return rows
635
+
636
+
637
+ async def connect(path: str) -> Database:
638
+ return await Database.from_path(path)
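update_file_timestamp and delete_files_not_updated_since together support a mark-and-sweep refresh of a watched directory; a sketch combining them with the Discoverer from this package (paths illustrative):

    import datetime

    from cosma_backend.discoverer import Discoverer

    async def prune_directory(db: Database, root: str) -> None:
        scan_start = datetime.datetime.now()
        # Mark: touch the row of every file still on disk
        for file in Discoverer().files_in(root):
            await db.update_file_timestamp(file.file_path)
        # Sweep: rows not touched since scan_start no longer exist on disk
        stale = await db.delete_files_not_updated_since(scan_start, root)
        print(f"removed {len(stale)} stale records")
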
cosma_backend/discoverer/__init__.py
@@ -0,0 +1 @@
+ from .discoverer import Discoverer as Discoverer
cosma_backend/discoverer/discoverer.py
@@ -0,0 +1,34 @@
+ from collections.abc import Generator
+ from pathlib import Path
+
+ from cosma_backend.models import File
+
+
+ class Discoverer:
+     """A simple file discoverer that recursively finds files under a given path."""
+
+     def files_in(self, path: str | Path) -> Generator[File, None, None]:
+         """
+         Discover all files under the specified path.
+
+         Args:
+             path: The root path to start discovery from
+
+         Yields:
+             File: Each file found under the specified path
+         """
+         root = Path(path)
+
+         if not root.exists():
+             return
+
+         if root.is_file():
+             yield File.from_path(root)
+             return
+
+         # Recursively discover all files
+         for item in root.rglob("*"):
+             if item.is_file():
+                 yield File.from_path(item)
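A minimal driver for the discoverer (illustrative path; filename and extension are the File fields also stored in the files table):

    from pathlib import Path

    from cosma_backend.discoverer import Discoverer

    for file in Discoverer().files_in(Path.home() / "documents"):
        print(file.filename, file.extension)
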
cosma_backend/embedder/__init__.py
@@ -0,0 +1 @@
+ from .embedder import AutoEmbedder as AutoEmbedder