cosma-backend 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosma_backend/__init__.py +14 -0
- cosma_backend/__main__.py +4 -0
- cosma_backend/api/__init__.py +29 -0
- cosma_backend/api/files.py +154 -0
- cosma_backend/api/index.py +114 -0
- cosma_backend/api/models.py +28 -0
- cosma_backend/api/search.py +166 -0
- cosma_backend/api/status.py +28 -0
- cosma_backend/api/updates.py +67 -0
- cosma_backend/api/watch.py +156 -0
- cosma_backend/app.py +192 -0
- cosma_backend/db/__init__.py +2 -0
- cosma_backend/db/database.py +638 -0
- cosma_backend/discoverer/__init__.py +1 -0
- cosma_backend/discoverer/discoverer.py +34 -0
- cosma_backend/embedder/__init__.py +1 -0
- cosma_backend/embedder/embedder.py +637 -0
- cosma_backend/logging.py +73 -0
- cosma_backend/models/__init__.py +3 -0
- cosma_backend/models/file.py +169 -0
- cosma_backend/models/status.py +10 -0
- cosma_backend/models/update.py +202 -0
- cosma_backend/models/watch.py +132 -0
- cosma_backend/pipeline/__init__.py +2 -0
- cosma_backend/pipeline/pipeline.py +222 -0
- cosma_backend/schema.sql +319 -0
- cosma_backend/searcher/__init__.py +1 -0
- cosma_backend/searcher/searcher.py +397 -0
- cosma_backend/summarizer/__init__.py +44 -0
- cosma_backend/summarizer/summarizer.py +1075 -0
- cosma_backend/utils/bundled.py +24 -0
- cosma_backend/utils/pubsub.py +31 -0
- cosma_backend/utils/sse.py +92 -0
- cosma_backend/watcher/__init__.py +1 -0
- cosma_backend/watcher/awatchdog.py +80 -0
- cosma_backend/watcher/watcher.py +257 -0
- cosma_backend-0.1.0.dist-info/METADATA +23 -0
- cosma_backend-0.1.0.dist-info/RECORD +39 -0
- cosma_backend-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
@File : searcher.py
|
|
4
|
+
@Time : 2025/07/14
|
|
5
|
+
@Author :
|
|
6
|
+
@Version : 1.0
|
|
7
|
+
@Contact :
|
|
8
|
+
@License :
|
|
9
|
+
@Desc : Hybrid search combining semantic similarity and keyword matching
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
from backend.db.database import Database
|
|
17
|
+
from backend.embedder.embedder import AutoEmbedder
|
|
18
|
+
from backend.logging import sm
|
|
19
|
+
from backend.models import File
|
|
20
|
+
|
|
21
|
+
# Configure logger
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SearchError(Exception):
    """Raised when a search operation cannot be completed."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SearchResult:
    """A single search hit carrying the matched file plus per-method scores."""

    # Metadata of the matched file.
    file_metadata: File
    # Distance from semantic search (lower is better); None when the file
    # was not matched semantically.
    semantic_score: float | None = None
    # Keyword relevance (higher is better); None when not matched by keywords.
    keyword_score: float | None = None
    # Weighted score used for ranking.
    combined_score: float = 0.0
    # One of: "semantic", "keyword", "hybrid", "unknown".
    match_type: str = "unknown"

    def __post_init__(self):
        """Derive match_type and an initial combined score from the raw scores."""
        has_semantic = self.semantic_score is not None
        has_keyword = self.keyword_score is not None

        if has_semantic and has_keyword:
            # Distance -> similarity, then blend 70/30 with the keyword score.
            similarity = max(0, 1 - self.semantic_score)
            self.match_type = "hybrid"
            self.combined_score = (0.7 * similarity) + (0.3 * self.keyword_score)
        elif has_semantic:
            self.match_type = "semantic"
            self.combined_score = max(0, 1 - self.semantic_score)
        elif has_keyword:
            self.match_type = "keyword"
            self.combined_score = self.keyword_score
        else:
            self.combined_score = 0.0

    def to_json(self) -> dict:
        """
        Convert SearchResult to JSON-serializable dictionary.

        Returns:
            Dictionary representation of the search result
        """
        return {
            "file_path": str(self.file_metadata.path),
            "filename": str(self.file_metadata.filename),
            "semantic_score": self.semantic_score,
            "keyword_score": self.keyword_score,
            "combined_score": self.combined_score,
            "match_type": self.match_type,
        }
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class HybridSearcher:
    """
    Hybrid search engine combining semantic similarity and keyword matching.

    Semantic matches come from embedding distance against the vector index;
    keyword matches come from the database's FTS5 index. Both result sets
    are merged and ranked by a weighted additive score.
    """

    def __init__(self, db: Database, embedder: AutoEmbedder | None = None) -> None:
        """
        Initialize hybrid searcher.

        Args:
            db: Database instance
            embedder: Embedder for generating query embeddings; a default
                AutoEmbedder is constructed when omitted.
        """
        self.db = db
        self.embedder = embedder or AutoEmbedder()
        logger.info(sm("HybridSearcher initialized"))

    async def search(self,
                     query: str,
                     limit: int = 20,
                     semantic_weight: float = 0.7,
                     keyword_weight: float = 0.3,
                     semantic_threshold: float = 2.0,
                     include_metadata: bool = True,
                     directory: str | None = None) -> list[SearchResult]:
        """
        Perform hybrid search combining semantic and keyword matching.

        Args:
            query: Search query
            limit: Maximum number of results
            semantic_weight: Weight for semantic similarity (0-1)
            keyword_weight: Weight for keyword matching (0-1)
            semantic_threshold: Maximum distance for semantic matches
            include_metadata: Unused; kept for backward compatibility.
            directory: Optional directory path to limit search scope

        Returns:
            List of SearchResult objects sorted by combined score (descending)
        """
        import math  # hoisted here; was previously re-imported inside the scoring loop

        logger.info(sm("Performing hybrid search",
                       query=query,
                       limit=limit,
                       semantic_weight=semantic_weight,
                       keyword_weight=keyword_weight,
                       directory=directory))

        # Normalize weights so they sum to 1 when at least one is positive.
        total_weight = semantic_weight + keyword_weight
        if total_weight > 0:
            semantic_weight /= total_weight
            keyword_weight /= total_weight

        # Candidate pools from each method, keyed by file id.
        semantic_results = {}
        keyword_results = {}

        # 1. Semantic search (skipped when its weight is zero, e.g. keyword-only mode).
        if semantic_weight > 0:
            try:
                semantic_matches = await self._semantic_search(query, limit * 2, semantic_threshold, directory)
                for file_metadata, distance in semantic_matches:
                    # BUGFIX: fall back to hashing .path — File exposes .path
                    # (see SearchResult.to_json); the old .file_path fallback
                    # would raise AttributeError.
                    file_id = file_metadata.id if hasattr(file_metadata, "id") else hash(file_metadata.path)
                    semantic_results[file_id] = (file_metadata, distance)

                logger.debug(sm("Semantic search completed", results=len(semantic_results)))
            except Exception as e:
                # Best-effort: a failing method degrades to the other one.
                logger.warning(sm("Semantic search failed", error=str(e)))

        # 2. Keyword search (skipped when its weight is zero, e.g. semantic-only mode).
        if keyword_weight > 0:
            try:
                keyword_matches = await self._keyword_search(query, limit * 2, directory)
                for file_metadata, score in keyword_matches:
                    file_id = file_metadata.id if hasattr(file_metadata, "id") else hash(file_metadata.path)
                    keyword_results[file_id] = (file_metadata, score)

                logger.debug(sm("Keyword search completed", results=len(keyword_results)))
            except Exception as e:
                logger.warning(sm("Keyword search failed", error=str(e)))

        # 3. Combine the two candidate pools.
        combined_results = []
        all_file_ids = set(semantic_results.keys()) | set(keyword_results.keys())

        for file_id in all_file_ids:
            semantic_data = semantic_results.get(file_id)
            keyword_data = keyword_results.get(file_id)

            # Get file metadata (prefer from semantic search for completeness).
            if semantic_data:
                file_metadata, semantic_score = semantic_data
            else:
                file_metadata = keyword_data[0]
                semantic_score = None

            keyword_score = keyword_data[1] if keyword_data else None

            result = SearchResult(
                file_metadata=file_metadata,
                semantic_score=semantic_score,
                keyword_score=keyword_score
            )

            # Additive scoring: each component is scaled by its normalized
            # weight. BUGFIX: the previous code normalized the weights but
            # then ignored them, hard-coding a 0.5 scale for both components,
            # so semantic-only / keyword-only modes did not actually work.
            semantic_component = 0.0
            keyword_component = 0.0

            if result.semantic_score is not None:
                # Convert distance (lower=better) to similarity (higher=better)
                # with exponential decay to emphasize closer matches.
                semantic_similarity = math.exp(-result.semantic_score)  # in (0, 1]
                semantic_component = semantic_similarity * semantic_weight

            if result.keyword_score is not None:
                keyword_component = result.keyword_score * keyword_weight

            # Combined score: sum of components (not a weighted average).
            result.combined_score = semantic_component + keyword_component

            # Determine match type from which components actually contributed.
            if semantic_component > 0 and keyword_component > 0:
                result.match_type = "hybrid"
            elif keyword_component > 0:
                result.match_type = "keyword"
            elif semantic_component > 0:
                result.match_type = "semantic"
            else:
                result.match_type = "none"

            combined_results.append(result)

        # Sort by combined score (descending) and truncate to the requested size.
        combined_results.sort(key=lambda x: x.combined_score, reverse=True)
        final_results = combined_results[:limit]

        logger.info(sm("Hybrid search completed",
                       total_results=len(final_results),
                       semantic_matches=len(semantic_results),
                       keyword_matches=len(keyword_results)))

        return final_results

    async def _semantic_search(self, query: str, limit: int, threshold: float, directory: str | None = None) -> list[tuple]:
        """Embed the query and return (File, distance) pairs from the vector index."""
        if not self.embedder:
            return []

        try:
            query_embedding = self.embedder.embed_text(query)

            return await self.db.search_similar_files(
                query_embedding=query_embedding,
                limit=limit,
                threshold=threshold,
                directory=directory
            )

        except Exception as e:
            logger.exception(sm("Semantic search failed", error=str(e)))
            return []

    async def _keyword_search(self, query: str, limit: int, directory: str | None = None) -> list[tuple]:
        """Return (File, score) pairs from the database's FTS5 keyword index."""
        try:
            # Fetch extra candidates so the caller has room to merge and re-rank.
            # FTS5 relevance scores are already suitable for ranking.
            return await self.db.keyword_search(query, limit * 2, directory)

        except Exception as e:
            logger.exception(sm("Keyword search failed", error=str(e)))
            return []

    async def search_similar_to_file(self,
                                     file_id: int,
                                     limit: int = 10,
                                     threshold: float = 0.8) -> list[SearchResult]:
        """
        Find files similar to a given file using its embedding.

        Args:
            file_id: ID of the reference file
            limit: Maximum number of results
            threshold: Similarity threshold

        Returns:
            List of similar files

        Raises:
            SearchError: If the file has no stored embedding or the lookup fails.
        """
        logger.info(sm("Searching for similar files", file_id=file_id))

        try:
            # Get the reference file's stored embedding.
            embedding_data = await self.db.get_file_embedding(file_id)
            if not embedding_data:
                msg = f"No embedding found for file {file_id}"
                raise SearchError(msg)

            embedding, _model_name, _dimensions = embedding_data

            # +1 to account for the reference file itself appearing in results.
            results = await self.db.search_similar_files(
                query_embedding=embedding,
                limit=limit + 1,
                threshold=threshold
            )

            # Convert to SearchResult objects, excluding the original file.
            search_results = []
            for file_metadata, distance in results:
                if hasattr(file_metadata, "id") and file_metadata.id != file_id:
                    search_results.append(SearchResult(
                        file_metadata=file_metadata,
                        semantic_score=distance,
                        match_type="semantic"
                    ))

            search_results = search_results[:limit]

            logger.info(sm("Found similar files",
                           file_id=file_id,
                           similar_count=len(search_results)))

            return search_results

        except SearchError:
            # BUGFIX: don't re-wrap our own "no embedding" error in the
            # generic handler below; propagate it unchanged.
            raise
        except Exception as e:
            logger.exception(sm("Similar file search failed",
                                file_id=file_id,
                                error=str(e)))
            msg = f"Failed to find similar files: {e!s}"
            # Chain the cause for easier debugging.
            raise SearchError(msg) from e

    async def get_search_suggestions(self, query: str, limit: int = 5) -> list[str]:
        """
        Get search suggestions based on existing keywords and file titles.

        Args:
            query: Partial query
            limit: Maximum number of suggestions

        Returns:
            List of suggested search terms (empty on failure; best-effort)
        """
        logger.debug(sm("Getting search suggestions", query=query))

        try:
            # Scan a bounded set of files to extract keywords and title words.
            files = await self.db.get_files(limit=1000)  # Reasonable limit

            suggestions = set()
            query_lower = query.lower()

            for file_metadata in files:
                # Add matching keywords.
                if file_metadata.keywords:
                    for keyword in file_metadata.keywords:
                        if keyword.lower().startswith(query_lower):
                            suggestions.add(keyword)

                # Add matching title words (skip very short words).
                if file_metadata.title:
                    for word in file_metadata.title.split():
                        if word.lower().startswith(query_lower) and len(word) > 2:
                            suggestions.add(word)

                # Stop early once we have a comfortable surplus to pick from.
                if len(suggestions) >= limit * 2:
                    break

            # Sort suggestions by relevance (simple alphabetical for now).
            sorted_suggestions = sorted(suggestions)[:limit]

            logger.debug(sm("Generated search suggestions",
                            query=query,
                            suggestions=len(sorted_suggestions)))

            return sorted_suggestions

        except Exception as e:
            logger.exception(sm("Failed to get search suggestions",
                                query=query,
                                error=str(e)))
            return []
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# Convenience function for simple search
|
|
372
|
+
async def search_files(db: Database,
                       query: str,
                       limit: int = 20,
                       search_type: str = "hybrid") -> list[SearchResult]:
    """
    Convenience function for file search.

    Args:
        db: Database instance
        query: Search query
        limit: Maximum results
        search_type: "hybrid", "semantic", or "keyword"

    Returns:
        List of search results

    Raises:
        ValueError: If search_type is not one of the supported modes.
    """
    searcher = HybridSearcher(db)

    # Map each search mode onto the (semantic_weight, keyword_weight) pair
    # forwarded to HybridSearcher.search. "hybrid" uses the searcher's
    # default 0.7 / 0.3 split.
    mode_weights = {
        "hybrid": (0.7, 0.3),
        "semantic": (1.0, 0.0),
        "keyword": (0.0, 1.0),
    }

    if search_type not in mode_weights:
        msg = f"Invalid search_type: {search_type}"
        raise ValueError(msg)

    sem_weight, kw_weight = mode_weights[search_type]
    return await searcher.search(query,
                                 limit,
                                 semantic_weight=sem_weight,
                                 keyword_weight=kw_weight,
                                 semantic_threshold=1.5)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*-coding:utf-8 -*-
|
|
3
|
+
'''
|
|
4
|
+
@File : __init__.py
|
|
5
|
+
@Time : 2025/07/06 10:43:12
|
|
6
|
+
@Author : Ethan Pan
|
|
7
|
+
@Version : 1.0
|
|
8
|
+
@Contact : epan@cs.wisc.edu
|
|
9
|
+
@License : (C)Copyright 2025, Ethan Pan
|
|
10
|
+
@Desc : Summarizer module for AI-powered file summarization
|
|
11
|
+
'''
|
|
12
|
+
|
|
13
|
+
from .summarizer import (
|
|
14
|
+
AutoSummarizer,
|
|
15
|
+
OllamaSummarizer,
|
|
16
|
+
OnlineSummarizer,
|
|
17
|
+
BaseSummarizer,
|
|
18
|
+
SummarizerError,
|
|
19
|
+
AIProviderError,
|
|
20
|
+
summarize_file,
|
|
21
|
+
get_available_providers,
|
|
22
|
+
is_summarizer_available,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Public API of the summarizer package (names re-exported from .summarizer).
__all__ = [
    "AutoSummarizer",
    "OllamaSummarizer",
    "OnlineSummarizer",
    "BaseSummarizer",
    "SummarizerError",
    "AIProviderError",
    "summarize_file",
    "get_available_providers",
    "is_summarizer_available",
]

# NOTE(review): the planning notes below predate the implementation; the
# sketched API now exists in .summarizer (e.g. AutoSummarizer exposing file
# summarization) and is re-exported above. Original notes kept for context:
#
# Put summarizing code in here.
# Maybe expose a Summarizer class that contains all initialization logic
# and has a method for summarizing a file?
#
# Something like:
# summarizer = Summarizer(...)
# summarizer.summarize_file(file)
|
|
44
|
+
|