cosma-backend 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. cosma_backend/__init__.py +14 -0
  2. cosma_backend/__main__.py +4 -0
  3. cosma_backend/api/__init__.py +29 -0
  4. cosma_backend/api/files.py +154 -0
  5. cosma_backend/api/index.py +114 -0
  6. cosma_backend/api/models.py +28 -0
  7. cosma_backend/api/search.py +166 -0
  8. cosma_backend/api/status.py +28 -0
  9. cosma_backend/api/updates.py +67 -0
  10. cosma_backend/api/watch.py +156 -0
  11. cosma_backend/app.py +192 -0
  12. cosma_backend/db/__init__.py +2 -0
  13. cosma_backend/db/database.py +638 -0
  14. cosma_backend/discoverer/__init__.py +1 -0
  15. cosma_backend/discoverer/discoverer.py +34 -0
  16. cosma_backend/embedder/__init__.py +1 -0
  17. cosma_backend/embedder/embedder.py +637 -0
  18. cosma_backend/logging.py +73 -0
  19. cosma_backend/models/__init__.py +3 -0
  20. cosma_backend/models/file.py +169 -0
  21. cosma_backend/models/status.py +10 -0
  22. cosma_backend/models/update.py +202 -0
  23. cosma_backend/models/watch.py +132 -0
  24. cosma_backend/pipeline/__init__.py +2 -0
  25. cosma_backend/pipeline/pipeline.py +222 -0
  26. cosma_backend/schema.sql +319 -0
  27. cosma_backend/searcher/__init__.py +1 -0
  28. cosma_backend/searcher/searcher.py +397 -0
  29. cosma_backend/summarizer/__init__.py +44 -0
  30. cosma_backend/summarizer/summarizer.py +1075 -0
  31. cosma_backend/utils/bundled.py +24 -0
  32. cosma_backend/utils/pubsub.py +31 -0
  33. cosma_backend/utils/sse.py +92 -0
  34. cosma_backend/watcher/__init__.py +1 -0
  35. cosma_backend/watcher/awatchdog.py +80 -0
  36. cosma_backend/watcher/watcher.py +257 -0
  37. cosma_backend-0.1.0.dist-info/METADATA +23 -0
  38. cosma_backend-0.1.0.dist-info/RECORD +39 -0
  39. cosma_backend-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,397 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ @File : searcher.py
4
+ @Time : 2025/07/14
5
+ @Author :
6
+ @Version : 1.0
7
+ @Contact :
8
+ @License :
9
+ @Desc : Hybrid search combining semantic similarity and keyword matching
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+
14
+ import logging
15
+
16
+ from backend.db.database import Database
17
+ from backend.embedder.embedder import AutoEmbedder
18
+ from backend.logging import sm
19
+ from backend.models import File
20
+
21
+ # Configure logger
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class SearchError(Exception):
    """Raised when a search operation cannot be completed."""
27
+
28
+
29
@dataclass
class SearchResult:
    """
    A single search hit: the matched file plus per-backend and combined scores.
    """
    # Metadata record of the matched file.
    file_metadata: File
    # Distance from semantic search (lower is better); None when no semantic hit.
    semantic_score: float | None = None
    # Keyword match score (higher is better); None when no keyword hit.
    keyword_score: float | None = None
    # Weighted blend of the two scores, derived in __post_init__.
    combined_score: float = 0.0
    # One of "semantic", "keyword", "hybrid"; stays "unknown" when no score set.
    match_type: str = "unknown"

    def __post_init__(self):
        """Derive match_type and combined_score from whichever scores are present."""
        has_semantic = self.semantic_score is not None
        has_keyword = self.keyword_score is not None
        # Distance is "lower is better": clamp (1 - distance) at zero to get a similarity.
        similarity = max(0, 1 - self.semantic_score) if has_semantic else 0.0
        if has_semantic and has_keyword:
            self.match_type = "hybrid"
            # Fixed 70/30 blend of semantic similarity and keyword score.
            self.combined_score = (0.7 * similarity) + (0.3 * self.keyword_score)
        elif has_semantic:
            self.match_type = "semantic"
            self.combined_score = similarity
        elif has_keyword:
            self.match_type = "keyword"
            self.combined_score = self.keyword_score
        else:
            self.combined_score = 0.0

    def to_json(self) -> dict:
        """
        Convert SearchResult to a JSON-serializable dictionary.

        Returns:
            Dictionary representation of the search result.
        """
        return {
            "file_path": str(self.file_metadata.path),
            "filename": str(self.file_metadata.filename),
            "semantic_score": self.semantic_score,
            "keyword_score": self.keyword_score,
            "combined_score": self.combined_score,
            "match_type": self.match_type
        }
72
+
73
+
74
class HybridSearcher:
    """
    Hybrid search engine combining semantic similarity and keyword matching.

    Semantic candidates come from vector similarity over stored embeddings;
    keyword candidates come from the database's FTS5 index. Both result sets
    are merged and ranked by a weighted additive score.
    """

    def __init__(self, db: Database, embedder: AutoEmbedder | None = None) -> None:
        """
        Initialize hybrid searcher.

        Args:
            db: Database instance
            embedder: Embedder for generating query embeddings; a default
                AutoEmbedder is constructed when omitted.
        """
        self.db = db
        self.embedder = embedder or AutoEmbedder()
        logger.info(sm("HybridSearcher initialized"))

    @staticmethod
    def _file_key(file_metadata: File):
        """Return a stable dict key identifying a file across both back ends."""
        if hasattr(file_metadata, "id"):
            return file_metadata.id
        # FIX: this fallback previously read `file_metadata.file_path`, an
        # attribute the File model does not appear to expose (to_json uses
        # `.path`), which would have raised AttributeError when `id` is absent.
        return hash(file_metadata.path)

    async def search(self,
                     query: str,
                     limit: int = 20,
                     semantic_weight: float = 0.7,
                     keyword_weight: float = 0.3,
                     semantic_threshold: float = 2.0,
                     include_metadata: bool = True,
                     directory: str | None = None) -> list[SearchResult]:
        """
        Perform hybrid search combining semantic and keyword matching.

        Args:
            query: Search query
            limit: Maximum number of results
            semantic_weight: Weight for semantic similarity (0-1)
            keyword_weight: Weight for keyword matching (0-1)
            semantic_threshold: Maximum distance for semantic matches
            include_metadata: Include file metadata in results
                (NOTE(review): currently unused — results always carry metadata)
            directory: Optional directory path to limit search scope

        Returns:
            List of SearchResult objects sorted by combined score (descending)
        """
        import math  # hoisted out of the per-result loop (was re-imported each iteration)

        logger.info(sm("Performing hybrid search",
                       query=query,
                       limit=limit,
                       semantic_weight=semantic_weight,
                       keyword_weight=keyword_weight,
                       directory=directory))

        # Normalize weights so they sum to 1 (guards against a 0/0 split).
        total_weight = semantic_weight + keyword_weight
        if total_weight > 0:
            semantic_weight /= total_weight
            keyword_weight /= total_weight

        semantic_results: dict = {}
        keyword_results: dict = {}

        # 1. Semantic search — best effort: a failure degrades to keyword-only.
        try:
            semantic_matches = await self._semantic_search(query, limit * 2, semantic_threshold, directory)
            for file_metadata, distance in semantic_matches:
                semantic_results[self._file_key(file_metadata)] = (file_metadata, distance)

            logger.debug(sm("Semantic search completed", results=len(semantic_results)))
        except Exception as e:
            logger.warning(sm("Semantic search failed", error=str(e)))

        # 2. Keyword search — best effort: a failure degrades to semantic-only.
        try:
            keyword_matches = await self._keyword_search(query, limit * 2, directory)
            for file_metadata, score in keyword_matches:
                keyword_results[self._file_key(file_metadata)] = (file_metadata, score)

            logger.debug(sm("Keyword search completed", results=len(keyword_results)))
        except Exception as e:
            logger.warning(sm("Keyword search failed", error=str(e)))

        # 3. Merge the two candidate sets and score each file once.
        combined_results = []
        for file_id in set(semantic_results) | set(keyword_results):
            semantic_data = semantic_results.get(file_id)
            keyword_data = keyword_results.get(file_id)

            # Prefer metadata from the semantic hit (assumed more complete).
            if semantic_data:
                file_metadata, semantic_score = semantic_data
            else:
                file_metadata = keyword_data[0]
                semantic_score = None
            keyword_score = keyword_data[1] if keyword_data else None

            result = SearchResult(
                file_metadata=file_metadata,
                semantic_score=semantic_score,
                keyword_score=keyword_score
            )

            # Additive scoring. Exponential decay converts a distance
            # (lower = better) into a similarity in (0, 1].
            # FIX: the caller-supplied weights were previously ignored here —
            # both components were hard-coded to a 0.5 scale, so e.g.
            # semantic_weight=0.0 still let semantic hits contribute.
            semantic_component = 0.0
            keyword_component = 0.0
            if result.semantic_score is not None:
                semantic_component = math.exp(-result.semantic_score) * semantic_weight
            if result.keyword_score is not None:
                keyword_component = result.keyword_score * keyword_weight

            result.combined_score = semantic_component + keyword_component

            # Classify the match by which components actually contributed.
            if semantic_component > 0 and keyword_component > 0:
                result.match_type = "hybrid"
            elif keyword_component > 0:
                result.match_type = "keyword"
            elif semantic_component > 0:
                result.match_type = "semantic"
            else:
                result.match_type = "none"

            combined_results.append(result)

        # Best scores first, then truncate to the requested page size.
        combined_results.sort(key=lambda r: r.combined_score, reverse=True)
        final_results = combined_results[:limit]

        logger.info(sm("Hybrid search completed",
                       total_results=len(final_results),
                       semantic_matches=len(semantic_results),
                       keyword_matches=len(keyword_results)))

        return final_results

    async def _semantic_search(self, query: str, limit: int, threshold: float, directory: str | None = None) -> list[tuple]:
        """Embed the query and look up nearest files; returns (File, distance) tuples."""
        if not self.embedder:
            return []

        try:
            query_embedding = self.embedder.embed_text(query)

            return await self.db.search_similar_files(
                query_embedding=query_embedding,
                limit=limit,
                threshold=threshold,
                directory=directory
            )
        except Exception as e:
            logger.exception(sm("Semantic search failed", error=str(e)))
            return []

    async def _keyword_search(self, query: str, limit: int, directory: str | None = None) -> list[tuple]:
        """Perform keyword search using SQLite FTS5; returns (File, score) tuples."""
        try:
            # FTS5 relevance scores are already suitable for ranking as-is.
            return await self.db.keyword_search(query, limit * 2, directory)
        except Exception as e:
            logger.exception(sm("Keyword search failed", error=str(e)))
            return []

    async def search_similar_to_file(self,
                                     file_id: int,
                                     limit: int = 10,
                                     threshold: float = 0.8) -> list[SearchResult]:
        """
        Find files similar to a given file using its embedding.

        Args:
            file_id: ID of the reference file
            limit: Maximum number of results
            threshold: Similarity threshold

        Returns:
            List of similar files (the reference file itself is excluded)

        Raises:
            SearchError: if the file has no stored embedding or the lookup fails
        """
        logger.info(sm("Searching for similar files", file_id=file_id))

        try:
            embedding_data = await self.db.get_file_embedding(file_id)
            if not embedding_data:
                msg = f"No embedding found for file {file_id}"
                raise SearchError(msg)

            embedding, model_name, dimensions = embedding_data

            results = await self.db.search_similar_files(
                query_embedding=embedding,
                limit=limit + 1,  # +1 to account for the file itself
                threshold=threshold
            )

            # Convert to SearchResult objects, excluding the original file.
            search_results = []
            for file_metadata, distance in results:
                if hasattr(file_metadata, "id") and file_metadata.id != file_id:
                    search_results.append(SearchResult(
                        file_metadata=file_metadata,
                        semantic_score=distance,
                        match_type="semantic"
                    ))

            search_results = search_results[:limit]

            logger.info(sm("Found similar files",
                           file_id=file_id,
                           similar_count=len(search_results)))

            return search_results

        except SearchError:
            # FIX: don't double-wrap our own error (it previously got re-wrapped
            # as "Failed to find similar files: No embedding found...").
            raise
        except Exception as e:
            logger.exception(sm("Similar file search failed",
                                file_id=file_id,
                                error=str(e)))
            msg = f"Failed to find similar files: {e!s}"
            # FIX: chain the cause so the original traceback is preserved.
            raise SearchError(msg) from e

    async def get_search_suggestions(self, query: str, limit: int = 5) -> list[str]:
        """
        Get search suggestions based on existing keywords and file titles.

        Args:
            query: Partial query
            limit: Maximum number of suggestions

        Returns:
            List of suggested search terms (alphabetically sorted); empty on error
        """
        logger.debug(sm("Getting search suggestions", query=query))

        try:
            # Scan a bounded slice of the corpus for prefix matches.
            files = await self.db.get_files(limit=1000)

            suggestions = set()
            query_lower = query.lower()

            for file_metadata in files:
                # Keywords that start with the partial query.
                if file_metadata.keywords:
                    for keyword in file_metadata.keywords:
                        if keyword.lower().startswith(query_lower):
                            suggestions.add(keyword)

                # Title words (longer than 2 chars) that start with the query.
                if file_metadata.title:
                    for word in file_metadata.title.split():
                        if word.lower().startswith(query_lower) and len(word) > 2:
                            suggestions.add(word)

                # Stop early once we have a comfortable oversample.
                if len(suggestions) >= limit * 2:
                    break

            # Simple alphabetical ranking for now.
            sorted_suggestions = sorted(suggestions)[:limit]

            logger.debug(sm("Generated search suggestions",
                            query=query,
                            suggestions=len(sorted_suggestions)))

            return sorted_suggestions

        except Exception as e:
            logger.exception(sm("Failed to get search suggestions",
                                query=query,
                                error=str(e)))
            return []
369
+
370
+
371
# Convenience function for simple search
async def search_files(db: Database,
                       query: str,
                       limit: int = 20,
                       search_type: str = "hybrid") -> list[SearchResult]:
    """
    Convenience function for file search.

    Args:
        db: Database instance
        query: Search query
        limit: Maximum results
        search_type: "hybrid", "semantic", or "keyword"

    Returns:
        List of search results

    Raises:
        ValueError: if search_type is not one of the supported modes
    """
    # Map each mode onto the weight overrides it needs; "hybrid" keeps defaults.
    presets = {
        "hybrid": {},
        "semantic": {"semantic_weight": 1.0, "keyword_weight": 0.0},
        "keyword": {"semantic_weight": 0.0, "keyword_weight": 1.0},
    }
    try:
        overrides = presets[search_type]
    except KeyError:
        msg = f"Invalid search_type: {search_type}"
        raise ValueError(msg) from None

    searcher = HybridSearcher(db)
    return await searcher.search(query, limit, semantic_threshold=1.5, **overrides)
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env python
2
+ # -*-coding:utf-8 -*-
3
+ '''
4
+ @File : __init__.py
5
+ @Time : 2025/07/06 10:43:12
6
+ @Author : Ethan Pan
7
+ @Version : 1.0
8
+ @Contact : epan@cs.wisc.edu
9
+ @License : (C)Copyright 2025, Ethan Pan
10
+ @Desc : Summarizer module for AI-powered file summarization
11
+ '''
12
+
13
+ from .summarizer import (
14
+ AutoSummarizer,
15
+ OllamaSummarizer,
16
+ OnlineSummarizer,
17
+ BaseSummarizer,
18
+ SummarizerError,
19
+ AIProviderError,
20
+ summarize_file,
21
+ get_available_providers,
22
+ is_summarizer_available,
23
+ )
24
+
25
+ __all__ = [
26
+ "AutoSummarizer",
27
+ "OllamaSummarizer",
28
+ "OnlineSummarizer",
29
+ "BaseSummarizer",
30
+ "SummarizerError",
31
+ "AIProviderError",
32
+ "summarize_file",
33
+ "get_available_providers",
34
+ "is_summarizer_available",
35
+ ]
36
+
37
+ # NOTE: The design sketched in these early planning notes is now implemented
38
+ # in summarizer.py, which exposes AutoSummarizer (plus Ollama/Online provider
39
+ # classes) and the module-level summarize_file() convenience function.
40
+ #
41
+ # Typical use (see summarizer.py for the exact API):
42
+ # summarizer = AutoSummarizer(...)
43
+ # summarizer.summarize_file(file)
44
+