cosma_backend-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cosma_backend/__init__.py +14 -0
  2. cosma_backend/__main__.py +4 -0
  3. cosma_backend/api/__init__.py +29 -0
  4. cosma_backend/api/files.py +154 -0
  5. cosma_backend/api/index.py +114 -0
  6. cosma_backend/api/models.py +28 -0
  7. cosma_backend/api/search.py +166 -0
  8. cosma_backend/api/status.py +28 -0
  9. cosma_backend/api/updates.py +67 -0
  10. cosma_backend/api/watch.py +156 -0
  11. cosma_backend/app.py +192 -0
  12. cosma_backend/db/__init__.py +2 -0
  13. cosma_backend/db/database.py +638 -0
  14. cosma_backend/discoverer/__init__.py +1 -0
  15. cosma_backend/discoverer/discoverer.py +34 -0
  16. cosma_backend/embedder/__init__.py +1 -0
  17. cosma_backend/embedder/embedder.py +637 -0
  18. cosma_backend/logging.py +73 -0
  19. cosma_backend/models/__init__.py +3 -0
  20. cosma_backend/models/file.py +169 -0
  21. cosma_backend/models/status.py +10 -0
  22. cosma_backend/models/update.py +202 -0
  23. cosma_backend/models/watch.py +132 -0
  24. cosma_backend/pipeline/__init__.py +2 -0
  25. cosma_backend/pipeline/pipeline.py +222 -0
  26. cosma_backend/schema.sql +319 -0
  27. cosma_backend/searcher/__init__.py +1 -0
  28. cosma_backend/searcher/searcher.py +397 -0
  29. cosma_backend/summarizer/__init__.py +44 -0
  30. cosma_backend/summarizer/summarizer.py +1075 -0
  31. cosma_backend/utils/bundled.py +24 -0
  32. cosma_backend/utils/pubsub.py +31 -0
  33. cosma_backend/utils/sse.py +92 -0
  34. cosma_backend/watcher/__init__.py +1 -0
  35. cosma_backend/watcher/awatchdog.py +80 -0
  36. cosma_backend/watcher/watcher.py +257 -0
  37. cosma_backend-0.1.0.dist-info/METADATA +23 -0
  38. cosma_backend-0.1.0.dist-info/RECORD +39 -0
  39. cosma_backend-0.1.0.dist-info/WHEEL +4 -0
cosma_backend/pipeline/pipeline.py
@@ -0,0 +1,222 @@
+ from __future__ import annotations
+
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Iterator, Optional
+ import logging
+
+ from backend.db import Database
+ from backend.logging import sm
+ from backend.models import File
+ from backend.models.status import ProcessingStatus
+ from backend.models.update import Update
+ from backend.discoverer import Discoverer
+ from backend.parser import FileParser
+ from backend.summarizer import AutoSummarizer
+ from backend.embedder import AutoEmbedder
+ from backend.utils.pubsub import Hub
+
+ logger = logging.getLogger(__name__)
+
+
+ class PipelineResult:
+     """Results from processing a batch of files."""
+     def __init__(self):
+         self.discovered = 0
+         self.skipped = 0
+         self.parsed = 0
+         self.summarized = 0
+         self.embedded = 0
+         self.failed = 0
+         self.errors: list[tuple[str, str]] = []  # (file_path, error)
+
+
+ class Pipeline:
+     """
+     Main pipeline orchestrator. Processes files through:
+     Discovery → Parsing → Summarization → Embedding
+     """
+
+     def __init__(
+         self,
+         db: Database,
+         updates_hub: Optional[Hub] = None,
+         discoverer: Optional[Discoverer] = None,
+         parser: Optional[FileParser] = None,
+         summarizer: Optional[AutoSummarizer] = None,
+         embedder: Optional[AutoEmbedder] = None,
+     ):
+         self.db = db
+         self.updates_hub = updates_hub
+         self.discoverer = discoverer or Discoverer()
+         self.parser = parser or FileParser()
+         self.summarizer = summarizer or AutoSummarizer()
+         self.embedder = embedder or AutoEmbedder()
+
+     async def process_directory(self, path: str | Path):
+         """
+         Process all files in a directory through the full pipeline.
+         After processing, deletes any files from the database that weren't seen
+         (i.e., files that no longer exist in the filesystem).
+
+         Args:
+             path: Root directory to process
+
+         Returns:
+             PipelineResult with statistics
+         """
+         # result = PipelineResult()
+
+         # Publish directory processing started
+         self._publish_update(Update.directory_processing_started(str(path)))
+
+         started_processing = datetime.now(timezone.utc)
+
+         # Stage 1: Discovery
+         logger.info(f"Discovering files in {path}")
+         for file in self.discoverer.files_in(path):
+             # result.discovered += 1
+
+             try:
+                 # Update the timestamp to mark this file as still present in the filesystem
+                 await self.db.update_file_timestamp(file.file_path)
+
+                 # Check if file needs processing
+                 if await self._should_skip_file(file):
+                     logger.info(sm("Skipping processing file", file=file))
+                     self._publish_update(Update.file_skipped(
+                         file.file_path,
+                         file.filename,
+                         reason="already processed"
+                     ))
+                     # result.skipped += 1
+                     continue
+
+                 # Process the file through the pipeline
+                 await self.process_file(file)
+
+             except Exception:
+                 continue
+
+         try:
+             logger.info(sm("Deleting files no longer present in filesystem", started_processing=started_processing, path=str(path)))
+             rows = await self.db.delete_files_not_updated_since(started_processing, str(path))
+             logger.info(sm("Deleted unused files", count=len(rows)))
+         except Exception as e:
+             logger.error(sm("Error while deleting unused files", error=str(e)))
+
+         logger.info(sm("Completed processing directory", directory=str(path)))
+         self._publish_update(Update.directory_processing_completed(str(path)))
+
+
+     async def process_file(self, file: File):
+         """
+         Process a single file through the pipeline.
+
+         Args:
+             file_path: Path to the file
+
+         Returns:
+             Processed File or None if failed
+         """
+         # if result is None:
+         #     result = PipelineResult()
+
+         try:
+             # Stage 1: Parse
+             self._publish_update(Update.file_parsing(file.file_path, file.filename))
+             await self.parser.parse_file(file)
+             self._publish_update(Update.file_parsed(file.file_path, file.filename))
+
+             # Check if file hash is different before proceeding
+             if not await self._has_file_changed(file):
+                 logger.info(sm("Skipping processing file, hashed not changed", file=file))
+                 self._publish_update(Update.file_skipped(
+                     file.file_path,
+                     file.filename,
+                     reason="content not changed"
+                 ))
+                 return
+
+             # Stage 2: Summarize
+             self._publish_update(Update.file_summarizing(file.file_path, file.filename))
+             await self.summarizer.summarize(file)
+             await self._save_to_db(file)
+             self._publish_update(Update.file_summarized(file.file_path, file.filename))
+
+             # Stage 3: Embed (if embedder is available)
+             self._publish_update(Update.file_embedding(file.file_path, file.filename))
+             await self.embedder.embed(file)
+             # embeddings need special care when saving
+             await self._save_embeddings(file)
+             self._publish_update(Update.file_embedded(file.file_path, file.filename))
+
+             # Mark as complete
+             self._publish_update(Update.file_complete(file.file_path, file.filename))
+
+         except Exception as e:
+             # result.failed += 1
+             # result.errors.append((str(file_path), str(e)))
+             logger.error(sm("Pipeline failed for file", file=file, error=e))
+
+             # Publish failure update
+             self._publish_update(Update.file_failed(
+                 file.file_path,
+                 file.filename,
+                 error=str(e)
+             ))
+
+             # Save failed state to DB if we have file_data
+             file.status = ProcessingStatus.FAILED
+             file.processing_error = str(e)
+             await self._save_to_db(file)
+
+             raise e
+
+     async def is_supported(self, file: File) -> bool:
+         """Check if a file is supported for processing"""
+         return self.parser.is_supported(file)
+
+     async def _should_skip_file(self, file: File) -> bool:
+         """Check if file should be skipped based on DB state."""
+         if not await self.is_supported(file):
+             return False
+
+         saved_file = await self.db.get_file_by_path(file.file_path)
+
+         if not saved_file or saved_file.status not in (ProcessingStatus.COMPLETE, ProcessingStatus.FAILED):
+             logger.info(sm("Should skip", file=file, status=saved_file.status if saved_file else "No saved file"))
+             return False
+
+         saved_modified = saved_file.modified.replace(microsecond=0)
+         current_modified = file.modified.replace(microsecond=0)
+
+         logger.info(sm("Should skip", file=file, saved_modified=saved_modified, current_modified=current_modified))
+
+         return saved_modified == current_modified
+
+
+
+     async def _has_file_changed(self, file: File) -> bool:
+         """Check if file has been changed based on hash."""
+         saved_file = await self.db.get_file_by_path(file.file_path)
+
+         logger.info(sm("Saved file", saved_file=saved_file, status=saved_file.status if saved_file else "N/A"))
+
+         if not saved_file or saved_file.status is not ProcessingStatus.COMPLETE:
+             return True
+
+         return saved_file.content_hash != file.content_hash
+
+     async def _save_to_db(self, file: File) -> None:
+         """Save file data to database."""
+         await self.db.upsert_file(file)
+
+     async def _save_embeddings(self, file: File) -> None:
+         """Save file embeddings to database."""
+         await self._save_to_db(file)
+         await self.db.upsert_file_embeddings(file)
+
+     def _publish_update(self, update: Any):
+         if self.updates_hub:
+             self.updates_hub.publish(update)
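
For orientation, here is a minimal sketch of how this pipeline could be driven end to end. It relies only on the constructor and process_directory shown above; the Database construction (a plain path argument), the cosma_backend.* import paths (the module's own imports reference a backend namespace, so the effective path may differ), and the asyncio entry point are assumptions for illustration, not part of the published code.

    # Illustrative usage sketch -- not part of the package.
    # Assumes Database accepts a SQLite path and that the wheel exposes these
    # names under the cosma_backend namespace shown in the file list above.
    import asyncio

    from cosma_backend.db import Database
    from cosma_backend.pipeline import Pipeline
    from cosma_backend.utils.pubsub import Hub


    async def main() -> None:
        db = Database("files.db")   # assumption: constructor takes a database path
        hub = Hub()                 # optional pub/sub hub for progress updates
        pipeline = Pipeline(db, updates_hub=hub)

        # Parses, summarizes, and embeds new or changed files under the directory,
        # then prunes database rows for files that no longer exist on disk.
        await pipeline.process_directory("/path/to/documents")


    if __name__ == "__main__":
        asyncio.run(main())
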
cosma_backend/schema.sql
@@ -0,0 +1,319 @@
+ -- =============================================================================
+ -- File Processing and Organization Database Schema
+ -- =============================================================================
+ --
+
+ -- =============================================================================
+ -- Watched Directories Table
+ -- =============================================================================
+
+ -- Table for tracking directories that are being monitored for file changes
+ CREATE TABLE IF NOT EXISTS watched_directories (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     path TEXT NOT NULL UNIQUE,
+     is_active INTEGER DEFAULT 1 CHECK (is_active IN (0, 1)),
+     recursive INTEGER DEFAULT 1 CHECK (recursive IN (0, 1)),
+     file_pattern TEXT, -- Optional glob pattern for filtering files (e.g., "*.pdf")
+     last_scan INTEGER,
+     created_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL,
+     updated_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL
+ );
+
+ -- Index for watched directories
+ CREATE INDEX IF NOT EXISTS idx_watched_directories_is_active ON watched_directories(is_active);
+ CREATE INDEX IF NOT EXISTS idx_watched_directories_path ON watched_directories(path);
+
+ -- Trigger for updating watched_directories timestamp
+ CREATE TRIGGER IF NOT EXISTS update_watched_directories_timestamp
+ AFTER UPDATE ON watched_directories
+ FOR EACH ROW
+ BEGIN
+     UPDATE watched_directories SET updated_at = (strftime('%s', 'now')) WHERE id = NEW.id;
+ END;
+
+ -- =============================================================================
+
+ -- =============================================================================
+ -- Files Table
+ -- =============================================================================
+
+ -- Main files table with comprehensive metadata
+ CREATE TABLE IF NOT EXISTS files (
+     -- Primary key
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+
+     -- Stage 0: Discovery (file system metadata) - Required fields
+     file_path TEXT NOT NULL,
+     filename TEXT NOT NULL,
+     extension TEXT NOT NULL,
+     file_size INTEGER NOT NULL,
+     created INTEGER NOT NULL,
+     modified INTEGER NOT NULL,
+     accessed INTEGER NOT NULL,
+
+     -- Stage 1: Parsing (content extraction)
+     content_type TEXT,
+     content_hash TEXT,
+     parsed_at INTEGER,
+
+     -- Stage 2: Summarization (AI processing)
+     summary TEXT, -- AI-generated summary
+     title TEXT,
+     summarized_at INTEGER,
+
+     -- Stage 3: Embedding (vector representation)
+     embedded_at INTEGER,
+
+     -- Meta
+     status TEXT DEFAULT 'DISCOVERED' CHECK (status IN ('DISCOVERED', 'PARSED', 'SUMMARIZED', 'COMPLETE', 'FAILED')),
+     processing_error TEXT,
+
+     -- File owner and permissions (if available)
+     owner TEXT,
+     permissions TEXT,
+
+     -- System timestamps
+     created_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL,
+     updated_at INTEGER DEFAULT (strftime('%s', 'now')) NOT NULL
+ );
+
+ -- =============================================================================
+ -- Vector Embeddings Table (using sqlite-vec)
+ -- =============================================================================
+
+ -- Virtual table for storing file embeddings
+ -- Note: Adjust the dimension (e.g., float[384], float[768], float[1536])
+ -- based on your embedding model's output size
+ CREATE VIRTUAL TABLE IF NOT EXISTS file_embeddings USING vec0(
+     file_id INTEGER PRIMARY KEY, -- Links to files.id
+     embedding_model TEXT,
+     embedding_dimensions INTEGER,
+     embedding float[1536]
+ );
+
+ CREATE TRIGGER IF NOT EXISTS delete_file_embeddings
+ AFTER DELETE ON files
+ BEGIN
+     DELETE FROM file_embeddings WHERE file_id = OLD.id;
+ END;
+
+ -- =============================================================================
+ -- Keywords Table
+ -- =============================================================================
+
+ -- Keywords table (many-to-many relationship with files)
+ CREATE TABLE IF NOT EXISTS file_keywords (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     file_id INTEGER NOT NULL,
+     keyword TEXT NOT NULL,
+
+     -- Indexes for performance
+     FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE,
+     UNIQUE(file_id, keyword) -- Prevent duplicate keywords per file
+ );
+
+ -- =============================================================================
+ -- Full-Text Search Table (using FTS5)
+ -- =============================================================================
+
+ -- Create a view that combines summary and keywords for each file
+ CREATE VIEW IF NOT EXISTS files_searchable AS
+ SELECT
+     f.id,
+     f.summary,
+     GROUP_CONCAT(fk.keyword, ' ') AS keywords
+ FROM files f
+ LEFT JOIN file_keywords fk ON f.id = fk.file_id
+ GROUP BY f.id;
+
+ -- Create the contentless FTS5 table
+ CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+     file_path,
+     title,
+     summary,
+     keywords,
+     content='',
+     contentless_delete=1 -- Use this for UPDATE/DELETE support
+ );
+
+ -- Triggers to keep FTS index synchronized with your data
+ CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         new.id,
+         new.file_path,
+         new.title,
+         new.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM file_keywords fk
+     WHERE fk.file_id = new.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
+     DELETE FROM files_fts WHERE rowid = old.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
+     DELETE FROM files_fts WHERE rowid = old.id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         new.id,
+         new.file_path,
+         new.title,
+         new.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM file_keywords fk
+     WHERE fk.file_id = new.id;
+ END;
+
+ -- Trigger for keyword changes
+ CREATE TRIGGER IF NOT EXISTS file_keywords_ai AFTER INSERT ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = new.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = new.file_id
+     GROUP BY f.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS file_keywords_ad AFTER DELETE ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = old.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = old.file_id
+     GROUP BY f.id;
+ END;
+
+ CREATE TRIGGER IF NOT EXISTS file_keywords_au AFTER UPDATE ON file_keywords BEGIN
+     DELETE FROM files_fts WHERE rowid = old.file_id;
+     INSERT INTO files_fts(rowid, file_path, title, summary, keywords)
+     SELECT
+         f.id,
+         f.file_path,
+         f.title,
+         f.summary,
+         GROUP_CONCAT(fk.keyword, ' ')
+     FROM files f
+     LEFT JOIN file_keywords fk ON f.id = fk.file_id
+     WHERE f.id = old.file_id
+     GROUP BY f.id;
+ END;
+
+ -- =============================================================================
+ -- Processing Statistics Table
+ -- =============================================================================
+
+ -- Processing statistics table
+ CREATE TABLE IF NOT EXISTS processing_stats (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     session_id TEXT NOT NULL, -- Processing session identifier
+     total_files INTEGER DEFAULT 0,
+     processed_files INTEGER DEFAULT 0,
+     failed_files INTEGER DEFAULT 0,
+     skipped_files INTEGER DEFAULT 0,
+     processing_time_seconds REAL,
+     started_at INTEGER NOT NULL,
+     completed_at INTEGER,
+     status TEXT DEFAULT 'running' CHECK (status IN ('running', 'completed', 'failed', 'cancelled'))
+ );
+
+ -- =============================================================================
+ -- Indexes for Performance
+ -- =============================================================================
+
+ -- Main files table indexes
+ CREATE INDEX IF NOT EXISTS idx_files_extension ON files(extension);
+ CREATE INDEX IF NOT EXISTS idx_files_content_hash ON files(content_hash);
+ CREATE INDEX IF NOT EXISTS idx_files_status ON files(status);
+ CREATE INDEX IF NOT EXISTS idx_files_created_at ON files(created_at);
+ CREATE INDEX IF NOT EXISTS idx_files_file_path ON files(file_path);
+ CREATE INDEX IF NOT EXISTS idx_files_filename ON files(filename);
+
+ -- Keywords table indexes
+ CREATE INDEX IF NOT EXISTS idx_keywords_file_id ON file_keywords(file_id);
+ CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON file_keywords(keyword);
+
+ -- Processing stats indexes
+ CREATE INDEX IF NOT EXISTS idx_stats_session_id ON processing_stats(session_id);
+ CREATE INDEX IF NOT EXISTS idx_stats_started_at ON processing_stats(started_at);
+
+ -- =============================================================================
+ -- Triggers for Automatic Timestamp Updates
+ -- =============================================================================
+
+ -- Update the updated_at timestamp when files are modified
+ CREATE TRIGGER IF NOT EXISTS update_files_timestamp
+ AFTER UPDATE ON files
+ FOR EACH ROW
+ BEGIN
+     UPDATE files SET updated_at = (strftime('%s', 'now')) WHERE id = NEW.id;
+ END;
+
+ -- =============================================================================
+ -- Views for Common Queries
+ -- =============================================================================
+
+ -- View for files with their keywords
+ CREATE VIEW IF NOT EXISTS files_with_keywords AS
+ SELECT
+     f.*,
+     GROUP_CONCAT(fk.keyword, ', ') as keywords
+ FROM files f
+ LEFT JOIN file_keywords fk ON f.id = fk.file_id
+ GROUP BY f.id;
+
+ -- View for processing summary
+ CREATE VIEW IF NOT EXISTS processing_summary AS
+ SELECT
+     status,
+     COUNT(*) as count,
+     AVG(file_size) as avg_file_size,
+     SUM(file_size) as total_size
+ FROM files
+ WHERE status IS NOT NULL
+ GROUP BY status;
+
+ -- View for recent activity
+ CREATE VIEW IF NOT EXISTS recent_activity AS
+ SELECT
+     id,
+     filename,
+     extension,
+     status,
+     datetime(created_at, 'unixepoch') as created_date,
+     datetime(parsed_at, 'unixepoch') as parsed_date,
+     datetime(summarized_at, 'unixepoch') as summarized_date,
+     datetime(embedded_at, 'unixepoch') as embedded_date
+ FROM files
+ ORDER BY created_at DESC
+ LIMIT 100;
+
+ -- =============================================================================
+ -- Initial Data (Optional)
+ -- =============================================================================
+
+ -- Insert initial processing session if none exists
+ INSERT OR IGNORE INTO processing_stats (
+     id,
+     session_id,
+     started_at,
+     status
+ ) VALUES (
+     1,
+     'initial_session',
+     (strftime('%s', 'now')),
+     'completed'
+ );
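
The schema pairs a contentless FTS5 index (files_fts) with a sqlite-vec vec0 table (file_embeddings), so lexical and vector retrieval are two separate lookups joined back to files. Below is a minimal Python sketch of both query shapes; the sqlite_vec helper calls (load, serialize_float32), the database path handling, and the KNN "MATCH ... ORDER BY distance LIMIT k" form are assumptions about sqlite-vec's API, not queries shipped in this package.

    # Illustrative query sketch -- not shipped in the package.
    # Assumes the sqlite-vec loadable extension is available via the `sqlite_vec`
    # Python package and that the query embedding matches the float[1536] column.
    import sqlite3

    import sqlite_vec  # assumption: provides load() and serialize_float32()


    def search(db_path: str, fts_query: str, query_embedding: list[float]):
        conn = sqlite3.connect(db_path)
        conn.enable_load_extension(True)
        sqlite_vec.load(conn)   # register the vec0 virtual table module
        conn.enable_load_extension(False)

        # Lexical search: FTS5 match joined back to file metadata.
        lexical = conn.execute(
            """
            SELECT f.file_path, f.title, f.summary
            FROM files_fts
            JOIN files f ON f.id = files_fts.rowid
            WHERE files_fts MATCH ?
            ORDER BY rank
            LIMIT 20
            """,
            (fts_query,),
        ).fetchall()

        # Vector search: k nearest neighbours from the vec0 table,
        # with the query embedding bound as a float32 BLOB parameter.
        vector = conn.execute(
            """
            SELECT f.file_path, knn.distance
            FROM (
                SELECT file_id, distance
                FROM file_embeddings
                WHERE embedding MATCH ?
                ORDER BY distance
                LIMIT 20
            ) AS knn
            JOIN files f ON f.id = knn.file_id
            """,
            (sqlite_vec.serialize_float32(query_embedding),),
        ).fetchall()

        conn.close()
        return lexical, vector
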
cosma_backend/searcher/__init__.py
@@ -0,0 +1 @@
+ from .searcher import HybridSearcher as HybridSearcher
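This re-export means downstream code can import the searcher at the package level; a sketch assuming the wheel's cosma_backend namespace (the code's internal imports use a backend namespace, so the effective path may differ):

    # Illustrative import only -- HybridSearcher's constructor arguments are not
    # shown in this diff, so none are guessed here.
    from cosma_backend.searcher import HybridSearcher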