fast-resume 1.12.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_resume/__init__.py +5 -0
- fast_resume/adapters/__init__.py +25 -0
- fast_resume/adapters/base.py +263 -0
- fast_resume/adapters/claude.py +209 -0
- fast_resume/adapters/codex.py +216 -0
- fast_resume/adapters/copilot.py +176 -0
- fast_resume/adapters/copilot_vscode.py +326 -0
- fast_resume/adapters/crush.py +341 -0
- fast_resume/adapters/opencode.py +333 -0
- fast_resume/adapters/vibe.py +188 -0
- fast_resume/assets/claude.png +0 -0
- fast_resume/assets/codex.png +0 -0
- fast_resume/assets/copilot-cli.png +0 -0
- fast_resume/assets/copilot-vscode.png +0 -0
- fast_resume/assets/crush.png +0 -0
- fast_resume/assets/opencode.png +0 -0
- fast_resume/assets/vibe.png +0 -0
- fast_resume/cli.py +327 -0
- fast_resume/config.py +30 -0
- fast_resume/index.py +758 -0
- fast_resume/logging_config.py +57 -0
- fast_resume/query.py +264 -0
- fast_resume/search.py +281 -0
- fast_resume/tui/__init__.py +58 -0
- fast_resume/tui/app.py +629 -0
- fast_resume/tui/filter_bar.py +128 -0
- fast_resume/tui/modal.py +73 -0
- fast_resume/tui/preview.py +396 -0
- fast_resume/tui/query.py +86 -0
- fast_resume/tui/results_table.py +178 -0
- fast_resume/tui/search_input.py +117 -0
- fast_resume/tui/styles.py +302 -0
- fast_resume/tui/utils.py +160 -0
- fast_resume-1.12.8.dist-info/METADATA +545 -0
- fast_resume-1.12.8.dist-info/RECORD +38 -0
- fast_resume-1.12.8.dist-info/WHEEL +4 -0
- fast_resume-1.12.8.dist-info/entry_points.txt +3 -0
- fast_resume-1.12.8.dist-info/licenses/LICENSE +21 -0
fast_resume/index.py
ADDED
|
@@ -0,0 +1,758 @@
|
|
|
1
|
+
"""Tantivy full-text search index for sessions."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timedelta
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import tantivy
|
|
11
|
+
|
|
12
|
+
from .adapters.base import Session
|
|
13
|
+
from .config import INDEX_DIR, SCHEMA_VERSION
|
|
14
|
+
from .query import DateFilter, DateOp, Filter
|
|
15
|
+
|
|
16
|
+
# Version file to detect schema changes
|
|
17
|
+
# Name of the marker file kept inside the index directory; _write_version()
# stores SCHEMA_VERSION in it as text and _check_version() reads it back.
_VERSION_FILE = ".schema_version"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
|
|
21
|
+
class IndexStats:
|
|
22
|
+
"""Statistics about the index contents."""
|
|
23
|
+
|
|
24
|
+
total_sessions: int
|
|
25
|
+
sessions_by_agent: dict[str, int]
|
|
26
|
+
total_messages: int
|
|
27
|
+
oldest_session: datetime | None
|
|
28
|
+
newest_session: datetime | None
|
|
29
|
+
top_directories: list[
|
|
30
|
+
tuple[str, int, int]
|
|
31
|
+
] # (directory, sessions, messages) tuples
|
|
32
|
+
index_size_bytes: int
|
|
33
|
+
# Time breakdown
|
|
34
|
+
sessions_today: int
|
|
35
|
+
sessions_this_week: int
|
|
36
|
+
sessions_this_month: int
|
|
37
|
+
sessions_older: int
|
|
38
|
+
# Content metrics
|
|
39
|
+
total_content_chars: int
|
|
40
|
+
avg_content_chars: int
|
|
41
|
+
avg_messages_per_session: float
|
|
42
|
+
# Activity patterns
|
|
43
|
+
sessions_by_weekday: dict[str, int] # Mon, Tue, etc.
|
|
44
|
+
sessions_by_hour: dict[int, int] # 0-23
|
|
45
|
+
# Daily activity (date string -> (sessions, messages))
|
|
46
|
+
daily_activity: list[tuple[str, int, int]] # (date, sessions, messages)
|
|
47
|
+
# Per-agent raw data
|
|
48
|
+
messages_by_agent: dict[str, int] | None = None
|
|
49
|
+
content_chars_by_agent: dict[str, int] | None = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TantivyIndex:
|
|
53
|
+
"""Manages a Tantivy full-text search index for sessions.
|
|
54
|
+
|
|
55
|
+
This is the single source of truth for session data.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def __init__(self, index_path: Path = INDEX_DIR) -> None:
    """Remember where the index lives; nothing is opened or created here.

    Args:
        index_path: Directory that holds (or will hold) the index files.
    """
    self.index_path = index_path
    # Marker file inside the index directory that records the schema version.
    self._version_file = index_path / _VERSION_FILE
    # Both handles are filled in lazily by _ensure_index().
    self._schema: tantivy.Schema | None = None
    self._index: tantivy.Index | None = None
|
|
63
|
+
|
|
64
|
+
def _build_schema(self) -> tantivy.Schema:
    """Describe the fields stored for every session document.

    Fields are registered in a fixed order: id, title, directory, agent,
    content, timestamp, message_count, mtime, yolo.

    Returns:
        The built Tantivy schema.
    """
    builder = tantivy.SchemaBuilder()

    # (field name, tokenizer override). The "raw" tokenizer is used for
    # id (exact term matching), directory (regex substring matching) and
    # agent (preserves hyphens); title and content keep the default
    # tokenizer for full-text search.
    text_fields = (
        ("id", "raw"),
        ("title", None),
        ("directory", "raw"),
        ("agent", "raw"),
        ("content", None),
    )
    for field_name, tokenizer in text_fields:
        if tokenizer is None:
            builder.add_text_field(field_name, stored=True)
        else:
            builder.add_text_field(field_name, stored=True, tokenizer_name=tokenizer)

    # Session timestamp: indexed as well as stored so range queries work.
    builder.add_float_field("timestamp", stored=True, indexed=True)
    # Number of messages in the session.
    builder.add_integer_field("message_count", stored=True)
    # Source file modification time, used for incremental updates.
    builder.add_float_field("mtime", stored=True)
    # Whether the session was started with auto-approve/skip-permissions.
    builder.add_boolean_field("yolo", stored=True)

    return builder.build()
|
|
86
|
+
|
|
87
|
+
def _check_version(self) -> bool:
    """Report whether the on-disk index was built with the current schema.

    Returns:
        True only when the version file can be read, parses as an integer
        and equals SCHEMA_VERSION. A missing, unreadable or malformed
        version file yields False.
    """
    try:
        # EAFP: a missing file raises FileNotFoundError (an OSError), so a
        # separate exists() probe is unnecessary, and any filesystem error
        # is handled in one place instead of escaping from the probe.
        recorded = int(self._version_file.read_text().strip())
    except (ValueError, OSError):
        return False
    return recorded == SCHEMA_VERSION
|
|
96
|
+
|
|
97
|
+
def _write_version(self) -> None:
    """Store SCHEMA_VERSION in the version file, creating its directory if needed."""
    marker = self._version_file
    marker.parent.mkdir(parents=True, exist_ok=True)
    marker.write_text(f"{SCHEMA_VERSION}")
|
|
101
|
+
|
|
102
|
+
def _clear(self) -> None:
    """Forget the loaded index and delete its directory from disk, if present."""
    # Drop the in-memory handles first, then remove the files.
    self._index = self._schema = None
    directory = self.index_path
    if not directory.exists():
        return
    shutil.rmtree(directory)
|
|
108
|
+
|
|
109
|
+
def _ensure_index(self) -> tantivy.Index:
    """Return the open index, opening or creating it on first use.

    An existing directory for which _check_version() fails is wiped first,
    so the index is recreated empty with the current schema.

    Returns:
        The cached tantivy.Index instance.
    """
    if self._index is not None:
        return self._index

    # Schema changed (or version unknown): discard the old index entirely.
    if self.index_path.exists() and not self._check_version():
        self._clear()

    self._schema = self._build_schema()

    creating = not self.index_path.exists()
    if creating:
        self.index_path.mkdir(parents=True, exist_ok=True)

    self._index = tantivy.Index(self._schema, path=str(self.index_path))

    if creating:
        # Only a freshly created index gets its version stamped.
        self._write_version()

    return self._index
|
|
130
|
+
|
|
131
|
+
def get_known_sessions(self) -> dict[str, tuple[float, str]]:
    """Collect the id, mtime and agent of every indexed session.

    Returns:
        Dict mapping session_id to (mtime, agent). Empty when the index
        directory is missing, its schema version does not match, or it
        holds no documents. Documents lacking any of the three values are
        left out.
    """
    if not (self.index_path.exists() and self._check_version()):
        return {}

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    doc_total = searcher.num_docs
    if doc_total == 0:
        return {}

    # Match-all query with the limit set to the document count.
    hits = searcher.search(tantivy.Query.all_query(), limit=doc_total).hits

    known: dict[str, tuple[float, str]] = {}
    for _, address in hits:
        doc = searcher.doc(address)
        sid, mtime, agent = (doc.get_first(name) for name in ("id", "mtime", "agent"))
        if sid and mtime is not None and agent:
            known[sid] = (mtime, agent)
    return known
|
|
162
|
+
|
|
163
|
+
def get_all_sessions(self) -> list[Session]:
    """Load every session stored in the index.

    Returns:
        List of Session objects in no particular order. Empty when the
        index directory is missing, its schema version does not match, or
        it holds no documents. Documents that cannot be converted are
        skipped.
    """
    if not (self.index_path.exists() and self._check_version()):
        return []

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    doc_total = searcher.num_docs
    if doc_total == 0:
        return []

    # Match-all query with the limit set to the document count.
    hits = searcher.search(tantivy.Query.all_query(), limit=doc_total).hits

    converted = (self._doc_to_session(searcher.doc(address)) for _, address in hits)
    return [session for session in converted if session]
|
|
192
|
+
|
|
193
|
+
def get_session_count(self, agent_filter: str | None = None) -> int:
    """Count the sessions in the index.

    Args:
        agent_filter: If provided, only count sessions for this agent.

    Returns:
        The number of matching sessions; 0 when the index directory is
        missing or its schema version does not match.
    """
    if not (self.index_path.exists() and self._check_version()):
        return 0

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    if agent_filter is not None:
        # Exact term match on the agent field.
        agent_query = tantivy.Query.term_query(index.schema, "agent", agent_filter)
        # Tantivy requires limit > 0, use count property for total matches
        return searcher.search(agent_query, limit=1).count  # type: ignore[attr-defined]

    return searcher.num_docs
|
|
214
|
+
|
|
215
|
+
def get_stats(self) -> IndexStats:
    """Compute aggregate statistics by scanning every document in the index.

    Returns:
        An IndexStats snapshot. When the index directory is missing, its
        schema version does not match, or it holds no documents, every
        counter is zero (index_size_bytes is still measured for an
        existing but empty index).
    """
    empty_stats = IndexStats(
        total_sessions=0,
        sessions_by_agent={},
        total_messages=0,
        oldest_session=None,
        newest_session=None,
        top_directories=[],
        index_size_bytes=0,
        sessions_today=0,
        sessions_this_week=0,
        sessions_this_month=0,
        sessions_older=0,
        total_content_chars=0,
        avg_content_chars=0,
        avg_messages_per_session=0.0,
        sessions_by_weekday={},
        sessions_by_hour={},
        daily_activity=[],
    )

    if not self.index_path.exists() or not self._check_version():
        return empty_stats

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()
    num_docs = searcher.num_docs

    if num_docs == 0:
        empty_stats.index_size_bytes = self._get_index_size()
        return empty_stats

    # Per-key tallies filled in while scanning the documents.
    agent_counts: Counter[str] = Counter()
    agent_messages: Counter[str] = Counter()
    agent_content_chars: Counter[str] = Counter()
    dir_counts: Counter[str] = Counter()
    dir_messages: Counter[str] = Counter()
    weekday_counts: Counter[str] = Counter()
    hour_counts: Counter[int] = Counter()
    daily_sessions: Counter[str] = Counter()
    daily_messages: Counter[str] = Counter()
    total_messages = 0
    total_content_chars = 0
    oldest_ts: float | None = None
    newest_ts: float | None = None

    # Bucket boundaries, in naive local time like datetime.fromtimestamp().
    now = datetime.now()
    today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
    # weekday() is 0 for Monday, so this is Monday 00:00 of the current week.
    week_start = today_start - timedelta(days=today_start.weekday())
    month_start = today_start.replace(day=1)

    sessions_today = 0
    sessions_this_week = 0
    sessions_this_month = 0
    sessions_older = 0

    weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    results = searcher.search(tantivy.Query.all_query(), limit=num_docs).hits

    for _score, doc_address in results:
        doc = searcher.doc(doc_address)

        agent = doc.get_first("agent")
        if agent:
            agent_counts[agent] += 1

        directory = doc.get_first("directory")
        if directory:
            dir_counts[directory] += 1

        msg_count = doc.get_first("message_count")
        if msg_count:
            total_messages += msg_count
            if directory:
                dir_messages[directory] += msg_count
            if agent:
                agent_messages[agent] += msg_count

        content = doc.get_first("content")
        if content:
            content_len = len(content)
            total_content_chars += content_len
            if agent:
                agent_content_chars[agent] += content_len

        timestamp = doc.get_first("timestamp")
        if timestamp is None:
            continue

        if oldest_ts is None or timestamp < oldest_ts:
            oldest_ts = timestamp
        if newest_ts is None or timestamp > newest_ts:
            newest_ts = timestamp

        # Time breakdown. The today/week/month buckets are cumulative, not
        # exclusive: a session from today also counts toward this week and
        # this month. sessions_older counts everything before the first of
        # the current month, even if it also falls in the current week.
        dt = datetime.fromtimestamp(timestamp)
        if dt >= today_start:
            sessions_today += 1
        if dt >= week_start:
            sessions_this_week += 1
        if dt >= month_start:
            sessions_this_month += 1
        else:
            sessions_older += 1

        # Activity patterns
        weekday_counts[weekday_names[dt.weekday()]] += 1
        hour_counts[dt.hour] += 1

        # Daily activity
        date_str = dt.strftime("%Y-%m-%d")
        daily_sessions[date_str] += 1
        if msg_count:
            daily_messages[date_str] += msg_count

    # Build daily activity list sorted by date (ISO strings sort chronologically).
    all_dates = sorted(set(daily_sessions) | set(daily_messages))
    daily_activity = [
        (d, daily_sessions.get(d, 0), daily_messages.get(d, 0)) for d in all_dates
    ]

    return IndexStats(
        total_sessions=num_docs,
        sessions_by_agent=dict(agent_counts),
        total_messages=total_messages,
        # Compare against None explicitly: a timestamp of 0.0 is falsy but
        # still a valid value and must not be reported as "no session".
        oldest_session=(
            datetime.fromtimestamp(oldest_ts) if oldest_ts is not None else None
        ),
        newest_session=(
            datetime.fromtimestamp(newest_ts) if newest_ts is not None else None
        ),
        top_directories=[
            (d, count, dir_messages[d]) for d, count in dir_counts.most_common(10)
        ],
        index_size_bytes=self._get_index_size(),
        sessions_today=sessions_today,
        sessions_this_week=sessions_this_week,
        sessions_this_month=sessions_this_month,
        sessions_older=sessions_older,
        total_content_chars=total_content_chars,
        avg_content_chars=total_content_chars // num_docs if num_docs else 0,
        avg_messages_per_session=total_messages / num_docs if num_docs else 0.0,
        sessions_by_weekday={d: weekday_counts.get(d, 0) for d in weekday_names},
        sessions_by_hour=dict(hour_counts),
        daily_activity=daily_activity,
        messages_by_agent=dict(agent_messages),
        content_chars_by_agent=dict(agent_content_chars),
    )
|
|
364
|
+
|
|
365
|
+
def _get_index_size(self) -> int:
    """Add up the sizes of all files found recursively under the index directory.

    Returns:
        Total size in bytes, or 0 when the directory does not exist.
    """
    root = self.index_path
    if not root.exists():
        return 0
    return sum(entry.stat().st_size for entry in root.rglob("*") if entry.is_file())
|
|
374
|
+
|
|
375
|
+
def _doc_to_session(self, doc: tantivy.Document) -> Session | None:
    """Rebuild a Session from a stored Tantivy document.

    Args:
        doc: Document as returned by the searcher.

    Returns:
        The Session, or None when the document has no id, no timestamp,
        or any error occurs during conversion. Other missing fields fall
        back to empty/zero/False defaults.
    """
    try:
        sid = doc.get_first("id")
        if not sid:
            return None

        raw_timestamp = doc.get_first("timestamp")
        if raw_timestamp is None:
            return None

        def stored(field, fallback):
            # Falsy stored values (missing, empty, zero) become the fallback.
            return doc.get_first(field) or fallback

        return Session(
            id=sid,
            agent=stored("agent", ""),
            title=stored("title", ""),
            directory=stored("directory", ""),
            timestamp=datetime.fromtimestamp(raw_timestamp),
            content=stored("content", ""),
            message_count=stored("message_count", 0),
            mtime=stored("mtime", 0.0),
            yolo=stored("yolo", False),
        )
    except Exception:
        # Best effort: an unconvertible document is simply skipped by callers.
        return None
|
|
401
|
+
|
|
402
|
+
def delete_sessions(self, session_ids: list[str]) -> None:
    """Delete the documents whose id matches any of the given session ids.

    Args:
        session_ids: Ids to remove. An empty list is a no-op and does not
            open the index.
    """
    if session_ids:
        writer = self._ensure_index().writer()
        for doomed_id in session_ids:
            writer.delete_documents_by_term("id", doomed_id)
        # One commit for the whole batch.
        writer.commit()
|
|
412
|
+
|
|
413
|
+
def add_sessions(self, sessions: list[Session]) -> None:
    """Add sessions to the index and commit.

    Documents already stored under the same id are not removed here.

    Args:
        sessions: Sessions to index. An empty list is a no-op and does not
            open the index.
    """
    if not sessions:
        return

    writer = self._ensure_index().writer()
    for session in sessions:
        writer.add_document(self._session_to_document(session))
    writer.commit()

def update_sessions(self, sessions: list[Session]) -> None:
    """Update sessions in the index (delete then add in a single transaction).

    Args:
        sessions: New versions of the sessions. An empty list is a no-op
            and does not open the index.
    """
    if not sessions:
        return

    writer = self._ensure_index().writer()
    # Delete existing documents first
    for session in sessions:
        writer.delete_documents_by_term("id", session.id)
    # Add new versions
    for session in sessions:
        writer.add_document(self._session_to_document(session))
    # Deletes and adds share one writer and one commit.
    writer.commit()

@staticmethod
def _session_to_document(session: Session) -> tantivy.Document:
    """Build the Tantivy document for one session.

    Shared by add_sessions and update_sessions so both always write the
    same set of fields.
    """
    return tantivy.Document(
        id=session.id,
        title=session.title,
        directory=session.directory,
        agent=session.agent,
        content=session.content,
        # Stored as a float (seconds) so range queries can filter by date.
        timestamp=session.timestamp.timestamp(),
        message_count=session.message_count,
        mtime=session.mtime,
        yolo=session.yolo,
    )
|
|
462
|
+
|
|
463
|
+
def search(
    self,
    query: str,
    agent_filter: Filter | None = None,
    directory_filter: Filter | None = None,
    date_filter: DateFilter | None = None,
    limit: int = 100,
) -> list[tuple[str, float]]:
    """Run a search and return (session_id, score) pairs.

    The text part is a hybrid of an exact parsed query (boosted 5x) and a
    fuzzy query (edit distance 1) for typo tolerance. Every filter is
    turned into a Tantivy query and combined with the text query using
    Must, so all conditions have to hold:
    - agent_filter: term / term-set query for includes, MustNot for excludes
    - directory_filter: regex query for substring matching
    - date_filter: range query on the timestamp field

    Args:
        query: Free-text query; blank means "no text condition".
        agent_filter: Optional include/exclude filter on the agent field.
        directory_filter: Optional include/exclude filter on the directory.
        date_filter: Optional date restriction.
        limit: Maximum number of hits requested from Tantivy.

    Returns:
        List of (session_id, score) pairs. Empty when building or running
        the query raises any exception.
    """
    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()
    schema = index.schema

    try:
        clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

        if query.strip():
            hybrid = self._build_hybrid_query(query, index, schema)
            clauses.append((tantivy.Occur.Must, hybrid))

        # Each builder returns None when its filter imposes no condition.
        optional_filters = (
            self._build_agent_filter_query(agent_filter, schema),
            self._build_directory_filter_query(directory_filter, schema),
            self._build_date_filter_query(date_filter, schema),
        )
        clauses.extend(
            (tantivy.Occur.Must, filter_query)
            for filter_query in optional_filters
            if filter_query
        )

        # No text and no filters: match every document.
        combined = (
            tantivy.Query.boolean_query(clauses)
            if clauses
            else tantivy.Query.all_query()
        )
        hits = searcher.search(combined, limit).hits

        return [
            (sid, score)
            for score, address in hits
            if (sid := searcher.doc(address).get_first("id"))
        ]
    except Exception:
        # If query fails, return empty results
        return []
|
|
530
|
+
|
|
531
|
+
def _build_agent_filter_query(
    self,
    agent_filter: Filter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate an agent filter into a Tantivy query.

    Included agents are OR-ed together (a document matches if its agent is
    any of them); every excluded agent is rejected via MustNot. Includes
    and excludes may be combined.

    Args:
        agent_filter: Filter with include/exclude agent names, or None.
        schema: Schema of the index being queried.

    Returns:
        The query, or None when the filter is absent or lists nothing.
    """
    if not agent_filter:
        return None

    clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

    wanted = agent_filter.include
    if wanted:
        if len(wanted) == 1:
            # A single agent only needs a plain term query.
            positive = tantivy.Query.term_query(schema, "agent", wanted[0])
        else:
            # Several agents: term-set query matches any of them.
            positive = tantivy.Query.term_set_query(schema, "agent", wanted)
        clauses.append((tantivy.Occur.Must, positive))

    rejected = [
        (tantivy.Occur.MustNot, tantivy.Query.term_query(schema, "agent", name))
        for name in agent_filter.exclude
    ]

    if not clauses and not rejected:
        return None

    if not clauses:
        # Only exclusions were given: start from "everything" so there is
        # a positive clause to exclude from.
        clauses.append((tantivy.Occur.Must, tantivy.Query.all_query()))

    return tantivy.Query.boolean_query(clauses + rejected)
|
|
576
|
+
|
|
577
|
+
def _build_directory_filter_query(
    self,
    directory_filter: Filter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate a directory filter into a Tantivy regex-based query.

    Each pattern is matched as a literal substring of the directory, with
    the case-insensitive flag set in the regex. Included patterns are
    OR-ed together; every excluded pattern is rejected via MustNot.

    Args:
        directory_filter: Filter with include/exclude substrings, or None.
        schema: Schema of the index being queried.

    Returns:
        The query, or None when the filter is absent or lists nothing.
    """
    if not directory_filter:
        return None

    def containing(fragment: str) -> tantivy.Query:
        # Escape the fragment so it is matched literally, then allow any
        # text before and after it.
        return tantivy.Query.regex_query(
            schema, "directory", f"(?i).*{re.escape(fragment)}.*"
        )

    clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

    if directory_filter.include:
        alternatives = [containing(pattern) for pattern in directory_filter.include]
        if len(alternatives) == 1:
            positive = alternatives[0]
        else:
            # Any one of the included substrings is enough.
            positive = tantivy.Query.boolean_query(
                [(tantivy.Occur.Should, alt) for alt in alternatives]
            )
        clauses.append((tantivy.Occur.Must, positive))

    rejected = [
        (tantivy.Occur.MustNot, containing(pattern))
        for pattern in directory_filter.exclude
    ]

    if not clauses and not rejected:
        return None

    if not clauses:
        # Only exclusions were given: start from "everything" so there is
        # a positive clause to exclude from.
        clauses.append((tantivy.Occur.Must, tantivy.Query.all_query()))

    return tantivy.Query.boolean_query(clauses + rejected)
|
|
627
|
+
|
|
628
|
+
def _build_date_filter_query(
    self,
    date_filter: DateFilter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate a date filter into a range query on the timestamp field.

    Supports:
    - date:<1h (sessions newer than 1 hour)
    - date:>1d (sessions older than 1 day)
    - date:today (sessions from today)
    - date:yesterday (sessions from yesterday only)
    - Negation via date:!today or -date:today

    Args:
        date_filter: Parsed date filter, or None.
        schema: Schema of the index being queried.

    Returns:
        The query, or None when there is no filter, the operator is not
        recognised, or an exact value is neither "today" nor "yesterday".
    """
    if not date_filter:
        return None

    cutoff_ts = date_filter.cutoff.timestamp()
    # Infinite floats stand in for "no bound on this side".
    unbounded_future = float("inf")
    unbounded_past = float("-inf")

    # Work out (lower, upper, include_upper); the lower bound is always
    # inclusive.
    op = date_filter.op
    if op == DateOp.LESS_THAN:
        # Newer than the cutoff: timestamp >= cutoff.
        bounds = (cutoff_ts, unbounded_future, True)
    elif op == DateOp.GREATER_THAN:
        # Older than the cutoff: timestamp < cutoff.
        bounds = (unbounded_past, cutoff_ts, False)
    elif op == DateOp.EXACT:
        keyword = date_filter.value.lower()
        if keyword == "today":
            # From the cutoff onwards.
            bounds = (cutoff_ts, unbounded_future, True)
        elif keyword == "yesterday":
            # Exactly one day: cutoff <= timestamp < cutoff + 1 day.
            day_after = (date_filter.cutoff + timedelta(days=1)).timestamp()
            bounds = (cutoff_ts, day_after, False)
        else:
            # Unknown exact date: impose no condition.
            return None
    else:
        return None

    lower, upper, include_upper = bounds
    window = tantivy.Query.range_query(
        schema,
        "timestamp",
        tantivy.FieldType.Float,
        lower_bound=lower,
        upper_bound=upper,
        include_lower=True,
        include_upper=include_upper,
    )

    if not date_filter.negated:
        return window

    # Negation: everything except the documents inside the window.
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Must, tantivy.Query.all_query()),
            (tantivy.Occur.MustNot, window),
        ]
    )
|
|
711
|
+
|
|
712
|
+
def _build_hybrid_query(
    self,
    query: str,
    index: tantivy.Index,
    schema: tantivy.Schema,
) -> tantivy.Query:
    """Build a hybrid query combining exact and fuzzy matching.

    The exact part is the parsed query over title and content, boosted 5x
    so exact matches rank above fuzzy ones. The fuzzy part requires every
    whitespace-separated term to match title or content within edit
    distance 1 (prefix matching enabled), which gives typo tolerance.

    Args:
        query: Raw user query text.
        index: Index used to parse the exact query.
        schema: Schema of that index.

    Returns:
        A query matching documents that satisfy the exact part or the
        fuzzy part; only the exact part when the query has no terms.
    """
    # Exact match query (boosted) - uses BM25 scoring
    exact_query = index.parse_query(query, ["title", "content"])
    boosted_exact = tantivy.Query.boost_query(exact_query, 5.0)

    # Fuzzy term queries compare the given text with indexed terms as-is;
    # they do not run the field's tokenizer. title and content use the
    # default tokenizer, which lowercases terms, so lowercase the query
    # terms too. Otherwise each capital letter costs an edit and
    # capitalised words lose their typo tolerance.
    # str.split() never yields empty strings, so no emptiness check is needed.
    fuzzy_parts: list[tuple[tantivy.Occur, tantivy.Query]] = []
    for term in query.lower().split():
        fuzzy_title = tantivy.Query.fuzzy_term_query(
            schema, "title", term, distance=1, prefix=True
        )
        fuzzy_content = tantivy.Query.fuzzy_term_query(
            schema, "content", term, distance=1, prefix=True
        )
        # Either field can match this term...
        either_field = tantivy.Query.boolean_query(
            [
                (tantivy.Occur.Should, fuzzy_title),
                (tantivy.Occur.Should, fuzzy_content),
            ]
        )
        # ...but every term has to match somewhere.
        fuzzy_parts.append((tantivy.Occur.Must, either_field))

    if not fuzzy_parts:
        return boosted_exact

    # Combine: exact OR fuzzy (either can match, but exact scores higher)
    fuzzy_query = tantivy.Query.boolean_query(fuzzy_parts)
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Should, boosted_exact),
            (tantivy.Occur.Should, fuzzy_query),
        ]
    )
|