fast-resume 1.12.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fast_resume/index.py ADDED
@@ -0,0 +1,758 @@
1
+ """Tantivy full-text search index for sessions."""
2
+
3
+ import re
4
+ import shutil
5
+ from collections import Counter
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, timedelta
8
+ from pathlib import Path
9
+
10
+ import tantivy
11
+
12
+ from .adapters.base import Session
13
+ from .config import INDEX_DIR, SCHEMA_VERSION
14
+ from .query import DateFilter, DateOp, Filter
15
+
16
+ # File name (inside the index directory) that records the schema version the index was built with
17
+ _VERSION_FILE = ".schema_version"
18
+
19
+
20
+ @dataclass
21
+ class IndexStats:
22
+ """Statistics about the index contents."""
23
+
24
+ total_sessions: int
25
+ sessions_by_agent: dict[str, int]
26
+ total_messages: int
27
+ oldest_session: datetime | None
28
+ newest_session: datetime | None
29
+ top_directories: list[
30
+ tuple[str, int, int]
31
+ ] # (directory, sessions, messages) tuples
32
+ index_size_bytes: int
33
+ # Time breakdown
34
+ sessions_today: int
35
+ sessions_this_week: int
36
+ sessions_this_month: int
37
+ sessions_older: int
38
+ # Content metrics
39
+ total_content_chars: int
40
+ avg_content_chars: int
41
+ avg_messages_per_session: float
42
+ # Activity patterns
43
+ sessions_by_weekday: dict[str, int] # Mon, Tue, etc.
44
+ sessions_by_hour: dict[int, int] # 0-23
45
+ # Daily activity (date string -> (sessions, messages))
46
+ daily_activity: list[tuple[str, int, int]] # (date, sessions, messages)
47
+ # Per-agent raw data
48
+ messages_by_agent: dict[str, int] | None = None
49
+ content_chars_by_agent: dict[str, int] | None = None
50
+
51
+
52
+ class TantivyIndex:
53
+ """Manages a Tantivy full-text search index for sessions.
54
+
55
+ This is the single source of truth for session data.
56
+ """
57
+
58
def __init__(self, index_path: Path = INDEX_DIR) -> None:
    """Record where the index lives; nothing is opened or created here.

    Args:
        index_path: Directory that holds (or will hold) the Tantivy index.
    """
    # Index and schema handles start out empty; _ensure_index() fills them in.
    self._schema: tantivy.Schema | None = None
    self._index: tantivy.Index | None = None
    self.index_path = index_path
    # Marker file that stores the schema version next to the index files.
    self._version_file = index_path / _VERSION_FILE
63
+
64
def _build_schema(self) -> tantivy.Schema:
    """Assemble the Tantivy schema that describes one session document.

    Fields are registered in this order: id, title, directory, agent,
    content, timestamp, message_count, mtime, yolo.
    """
    builder = tantivy.SchemaBuilder()

    # Session id: stored, "raw" tokenizer so the whole value is one exact term.
    builder.add_text_field("id", stored=True, tokenizer_name="raw")
    # Title: stored and searchable with the default tokenizer.
    builder.add_text_field("title", stored=True)
    # Directory: stored, "raw" tokenizer so regex queries see the full path.
    builder.add_text_field("directory", stored=True, tokenizer_name="raw")
    # Agent: stored, "raw" tokenizer so hyphenated names stay intact for filtering.
    builder.add_text_field("agent", stored=True, tokenizer_name="raw")
    # Content: stored and searchable full text.
    builder.add_text_field("content", stored=True)
    # Timestamp: stored and indexed so range queries can be run against it.
    builder.add_float_field("timestamp", stored=True, indexed=True)
    # Message count: stored integer.
    builder.add_integer_field("message_count", stored=True)
    # Source file modification time, used for incremental updates.
    builder.add_float_field("mtime", stored=True)
    # Yolo flag: session was started with auto-approve/skip-permissions.
    builder.add_boolean_field("yolo", stored=True)

    schema = builder.build()
    return schema
86
+
87
+ def _check_version(self) -> bool:
88
+ """Check if index version matches current schema version."""
89
+ if not self._version_file.exists():
90
+ return False
91
+ try:
92
+ stored_version = int(self._version_file.read_text().strip())
93
+ return stored_version == SCHEMA_VERSION
94
+ except (ValueError, OSError):
95
+ return False
96
+
97
def _write_version(self) -> None:
    """Persist SCHEMA_VERSION to the version file, creating parent directories."""
    marker = self._version_file
    marker.parent.mkdir(parents=True, exist_ok=True)
    contents = str(SCHEMA_VERSION)
    marker.write_text(contents)
101
+
102
+ def _clear(self) -> None:
103
+ """Clear the index directory."""
104
+ self._index = None
105
+ self._schema = None
106
+ if self.index_path.exists():
107
+ shutil.rmtree(self.index_path)
108
+
109
def _ensure_index(self) -> tantivy.Index:
    """Return the cached index handle, opening or creating it on first use.

    An existing directory whose version check fails is wiped first, so the
    index is rebuilt from scratch after a schema change.
    """
    cached = self._index
    if cached is not None:
        return cached

    # Stale or unversioned directory: discard it before building the schema.
    stale = self.index_path.exists() and not self._check_version()
    if stale:
        self._clear()

    self._schema = self._build_schema()

    # Decide whether this is a fresh index before touching the directory.
    is_new = not self.index_path.exists()
    if is_new:
        self.index_path.mkdir(parents=True, exist_ok=True)

    self._index = tantivy.Index(self._schema, path=str(self.index_path))

    if is_new:
        # Only a newly created index gets its version marker written.
        self._write_version()

    return self._index
130
+
131
def get_known_sessions(self) -> dict[str, tuple[float, str]]:
    """Map every indexed session id to its stored mtime and agent.

    Returns:
        ``{session_id: (mtime, agent)}``. Empty when the index directory is
        missing, fails the version check, or contains no documents.
        Documents lacking an id, an mtime or an agent are left out.
    """
    usable = self.index_path.exists() and self._check_version()
    if not usable:
        return {}

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    doc_total = searcher.num_docs
    if doc_total == 0:
        return {}

    # Fetch every document in one match-all search.
    hits = searcher.search(tantivy.Query.all_query(), limit=doc_total).hits

    known: dict[str, tuple[float, str]] = {}
    for _, address in hits:
        doc = searcher.doc(address)
        sid = doc.get_first("id")
        mtime = doc.get_first("mtime")
        agent = doc.get_first("agent")
        if not sid or mtime is None or not agent:
            continue
        known[sid] = (mtime, agent)

    return known
162
+
163
def get_all_sessions(self) -> list[Session]:
    """Load every session stored in the index.

    Returns:
        Session objects in search-result order (no explicit sorting). Empty
        when the index directory is missing, fails the version check, or
        holds no documents. Documents that cannot be converted are skipped.
    """
    usable = self.index_path.exists() and self._check_version()
    if not usable:
        return []

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    doc_total = searcher.num_docs
    if doc_total == 0:
        return []

    # Fetch every document in one match-all search.
    hits = searcher.search(tantivy.Query.all_query(), limit=doc_total).hits

    # Convert lazily, then keep only the documents that produced a session.
    converted = (
        self._doc_to_session(searcher.doc(address)) for _, address in hits
    )
    return [session for session in converted if session]
192
+
193
def get_session_count(self, agent_filter: str | None = None) -> int:
    """Count the sessions held in the index.

    Args:
        agent_filter: When given, count only sessions whose ``agent`` field
            equals this value.

    Returns:
        The number of matching sessions; 0 when the index directory is
        missing or fails the version check.
    """
    if not (self.index_path.exists() and self._check_version()):
        return 0

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    if agent_filter is not None:
        agent_query = tantivy.Query.term_query(index.schema, "agent", agent_filter)
        # Tantivy rejects limit=0, so request a single hit and read the total
        # number of matches from the result's count.
        return searcher.search(agent_query, limit=1).count  # type: ignore[attr-defined]

    return searcher.num_docs
214
+
215
def get_stats(self) -> IndexStats:
    """Scan every indexed session and aggregate usage statistics.

    Returns:
        An IndexStats snapshot. Every counter is zero/empty when the index
        directory is missing, fails the version check, or holds no
        documents (in the last case ``index_size_bytes`` is still filled
        in).

    Notes:
        The today/week/month counters are cumulative: a session from today
        also counts toward this week and this month. ``sessions_older``
        counts sessions dated before the first day of the current month.
        All date arithmetic uses naive local time.
    """
    empty_stats = IndexStats(
        total_sessions=0,
        sessions_by_agent={},
        total_messages=0,
        oldest_session=None,
        newest_session=None,
        top_directories=[],
        index_size_bytes=0,
        sessions_today=0,
        sessions_this_week=0,
        sessions_this_month=0,
        sessions_older=0,
        total_content_chars=0,
        avg_content_chars=0,
        avg_messages_per_session=0.0,
        sessions_by_weekday={},
        sessions_by_hour={},
        daily_activity=[],
    )

    if not self.index_path.exists() or not self._check_version():
        return empty_stats

    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()

    if searcher.num_docs == 0:
        empty_stats.index_size_bytes = self._get_index_size()
        return empty_stats

    # Per-key accumulators filled while walking the documents.
    agent_counts: Counter[str] = Counter()
    agent_messages: Counter[str] = Counter()
    agent_content_chars: Counter[str] = Counter()
    dir_counts: Counter[str] = Counter()
    dir_messages: Counter[str] = Counter()
    weekday_counts: Counter[str] = Counter()
    hour_counts: Counter[int] = Counter()
    daily_sessions: Counter[str] = Counter()
    daily_messages: Counter[str] = Counter()
    total_messages = 0
    total_content_chars = 0
    oldest_ts: float | None = None
    newest_ts: float | None = None

    # Time boundaries: local midnight today, Monday of this week, and the
    # first day of this month.
    now = datetime.now()
    today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
    week_start = today_start - timedelta(days=today_start.weekday())
    month_start = today_start.replace(day=1)

    sessions_today = 0
    sessions_this_week = 0
    sessions_this_month = 0
    sessions_older = 0

    weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    all_query = tantivy.Query.all_query()
    results = searcher.search(all_query, limit=searcher.num_docs).hits

    for _score, doc_address in results:
        doc = searcher.doc(doc_address)

        agent = doc.get_first("agent")
        if agent:
            agent_counts[agent] += 1

        directory = doc.get_first("directory")
        if directory:
            dir_counts[directory] += 1

        msg_count = doc.get_first("message_count")
        if msg_count:
            total_messages += msg_count
            if directory:
                dir_messages[directory] += msg_count
            if agent:
                agent_messages[agent] += msg_count

        content = doc.get_first("content")
        if content:
            content_len = len(content)
            total_content_chars += content_len
            if agent:
                agent_content_chars[agent] += content_len

        timestamp = doc.get_first("timestamp")
        if timestamp is None:
            # Without a timestamp the session cannot be placed in any
            # time-based bucket.
            continue

        if oldest_ts is None or timestamp < oldest_ts:
            oldest_ts = timestamp
        if newest_ts is None or timestamp > newest_ts:
            newest_ts = timestamp

        # Time breakdown
        dt = datetime.fromtimestamp(timestamp)
        if dt >= today_start:
            sessions_today += 1
        if dt >= week_start:
            sessions_this_week += 1
        if dt >= month_start:
            sessions_this_month += 1
        else:
            sessions_older += 1

        # Activity patterns
        weekday_counts[weekday_names[dt.weekday()]] += 1
        hour_counts[dt.hour] += 1

        # Daily activity
        date_str = dt.strftime("%Y-%m-%d")
        daily_sessions[date_str] += 1
        if msg_count:
            daily_messages[date_str] += msg_count

    num_docs = searcher.num_docs

    # Build the daily activity list sorted by date (ISO strings sort
    # chronologically).
    all_dates = sorted(set(daily_sessions) | set(daily_messages))
    daily_activity = [
        (d, daily_sessions.get(d, 0), daily_messages.get(d, 0)) for d in all_dates
    ]

    # Compare against None rather than relying on truthiness: a timestamp of
    # 0.0 (the Unix epoch) is a valid value and must not be reported as
    # "no sessions".
    oldest_session = (
        datetime.fromtimestamp(oldest_ts) if oldest_ts is not None else None
    )
    newest_session = (
        datetime.fromtimestamp(newest_ts) if newest_ts is not None else None
    )

    return IndexStats(
        total_sessions=num_docs,
        sessions_by_agent=dict(agent_counts),
        total_messages=total_messages,
        oldest_session=oldest_session,
        newest_session=newest_session,
        top_directories=[
            (d, count, dir_messages[d]) for d, count in dir_counts.most_common(10)
        ],
        index_size_bytes=self._get_index_size(),
        sessions_today=sessions_today,
        sessions_this_week=sessions_this_week,
        sessions_this_month=sessions_this_month,
        sessions_older=sessions_older,
        total_content_chars=total_content_chars,
        avg_content_chars=total_content_chars // num_docs if num_docs else 0,
        avg_messages_per_session=total_messages / num_docs if num_docs else 0.0,
        sessions_by_weekday={d: weekday_counts.get(d, 0) for d in weekday_names},
        sessions_by_hour=dict(hour_counts),
        daily_activity=daily_activity,
        messages_by_agent=dict(agent_messages),
        content_chars_by_agent=dict(agent_content_chars),
    )
364
+
365
+ def _get_index_size(self) -> int:
366
+ """Get total size of the index directory in bytes."""
367
+ if not self.index_path.exists():
368
+ return 0
369
+ total = 0
370
+ for f in self.index_path.rglob("*"):
371
+ if f.is_file():
372
+ total += f.stat().st_size
373
+ return total
374
+
375
def _doc_to_session(self, doc: tantivy.Document) -> Session | None:
    """Rebuild a Session from a stored Tantivy document.

    Returns:
        The Session, or None when the document has no id, has no timestamp,
        or any step of the conversion raises.
    """
    first = doc.get_first
    try:
        sid = first("id")
        if not sid:
            return None

        raw_ts = first("timestamp")
        if raw_ts is None:
            return None

        body = first("content") or ""

        # Missing optional fields fall back to empty / zero / False values.
        fields = {
            "id": sid,
            "agent": first("agent") or "",
            "title": first("title") or "",
            "directory": first("directory") or "",
            "timestamp": datetime.fromtimestamp(raw_ts),
            "content": body,
            "message_count": first("message_count") or 0,
            "mtime": first("mtime") or 0.0,
            "yolo": first("yolo") or False,
        }
        return Session(**fields)
    except Exception:
        # Best effort: a document that cannot be converted is dropped.
        return None
401
+
402
def delete_sessions(self, session_ids: list[str]) -> None:
    """Delete the documents whose ``id`` matches any of the given session ids.

    Does nothing (and does not open the index) when the list is empty.
    """
    if session_ids:
        writer = self._ensure_index().writer()
        for session_id in session_ids:
            writer.delete_documents_by_term("id", session_id)
        # One commit covers all deletions.
        writer.commit()
412
+
413
def add_sessions(self, sessions: list[Session]) -> None:
    """Index the given sessions as new documents and commit once at the end.

    Does nothing (and does not open the index) when the list is empty.
    Existing documents with the same id are not removed here.
    """
    if not sessions:
        return

    writer = self._ensure_index().writer()
    for item in sessions:
        fields = dict(
            id=item.id,
            title=item.title,
            directory=item.directory,
            agent=item.agent,
            content=item.content,
            # Stored as a float (seconds since the epoch).
            timestamp=item.timestamp.timestamp(),
            message_count=item.message_count,
            mtime=item.mtime,
            yolo=item.yolo,
        )
        writer.add_document(tantivy.Document(**fields))
    writer.commit()
435
+
436
def update_sessions(self, sessions: list[Session]) -> None:
    """Replace the indexed copies of the given sessions in a single commit.

    All deletions by id are queued first, then the new documents are added,
    and one commit applies both. Does nothing when the list is empty.
    """
    if not sessions:
        return

    writer = self._ensure_index().writer()

    # Pass 1: queue removal of whatever is currently stored under each id.
    for stale in sessions:
        writer.delete_documents_by_term("id", stale.id)

    # Pass 2: queue the fresh versions.
    for item in sessions:
        fields = dict(
            id=item.id,
            title=item.title,
            directory=item.directory,
            agent=item.agent,
            content=item.content,
            # Stored as a float (seconds since the epoch).
            timestamp=item.timestamp.timestamp(),
            message_count=item.message_count,
            mtime=item.mtime,
            yolo=item.yolo,
        )
        writer.add_document(tantivy.Document(**fields))

    writer.commit()
462
+
463
def search(
    self,
    query: str,
    agent_filter: Filter | None = None,
    directory_filter: Filter | None = None,
    date_filter: DateFilter | None = None,
    limit: int = 100,
) -> list[tuple[str, float]]:
    """Run a search and return ``(session_id, score)`` pairs.

    The text query (when not blank) is turned into a hybrid exact + fuzzy
    query by ``_build_hybrid_query``. Each filter is translated into its own
    Tantivy query and combined with the text query as a required clause, so
    all filtering happens inside Tantivy:

    - agent_filter: see ``_build_agent_filter_query``
    - directory_filter: see ``_build_directory_filter_query``
    - date_filter: see ``_build_date_filter_query``

    With no text and no filters, every document matches.

    Args:
        query: Free-text search string; blank means "no text constraint".
        agent_filter: Optional include/exclude filter on the agent field.
        directory_filter: Optional include/exclude filter on the directory field.
        date_filter: Optional constraint on the timestamp field.
        limit: Maximum number of hits requested from Tantivy.

    Returns:
        Matching ``(session_id, score)`` pairs in result order, or an empty
        list if building or running the query raises.
    """
    index = self._ensure_index()
    index.reload()
    searcher = index.searcher()
    schema = index.schema

    must = tantivy.Occur.Must
    try:
        clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

        if query.strip():
            clauses.append((must, self._build_hybrid_query(query, index, schema)))

        # Each builder returns None when its filter imposes no constraint.
        filter_builders = (
            (self._build_agent_filter_query, agent_filter),
            (self._build_directory_filter_query, directory_filter),
            (self._build_date_filter_query, date_filter),
        )
        for build, active_filter in filter_builders:
            built = build(active_filter, schema)
            if built:
                clauses.append((must, built))

        if clauses:
            combined = tantivy.Query.boolean_query(clauses)
        else:
            # Nothing to constrain on: match every document.
            combined = tantivy.Query.all_query()

        hits = searcher.search(combined, limit).hits

        # Keep only hits whose document carries an id.
        return [
            (sid, score)
            for score, address in hits
            if (sid := searcher.doc(address).get_first("id"))
        ]
    except Exception:
        # A failing query yields no results rather than an error.
        return []
530
+
531
def _build_agent_filter_query(
    self,
    agent_filter: Filter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate an agent include/exclude filter into a Tantivy query.

    - One included agent becomes a term query; several become a term-set
      query (any of them may match).
    - Every excluded agent becomes a MustNot term clause.
    - With excludes only, a match-all clause is used as the base to
      exclude from.

    Returns:
        The boolean query, or None when there is no filter or it has
        neither includes nor excludes.
    """
    if not agent_filter:
        return None

    included = agent_filter.include
    excluded = agent_filter.exclude
    clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

    if included:
        if len(included) == 1:
            wanted = tantivy.Query.term_query(schema, "agent", included[0])
        else:
            wanted = tantivy.Query.term_set_query(schema, "agent", included)
        clauses.append((tantivy.Occur.Must, wanted))
    elif excluded:
        # MustNot clauses need something positive to subtract from.
        clauses.append((tantivy.Occur.Must, tantivy.Query.all_query()))

    clauses.extend(
        (tantivy.Occur.MustNot, tantivy.Query.term_query(schema, "agent", name))
        for name in excluded
    )

    if not clauses:
        return None
    return tantivy.Query.boolean_query(clauses)
576
+
577
def _build_directory_filter_query(
    self,
    directory_filter: Filter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate a directory include/exclude filter into a Tantivy query.

    Each value is matched as a case-insensitive substring of the
    ``directory`` field via a regex query (the value is regex-escaped and
    wrapped in ``.*``).

    - Included values are OR-ed together and required.
    - Every excluded value becomes a MustNot clause.
    - With excludes only, a match-all clause is used as the base to
      exclude from.

    Returns:
        The boolean query, or None when there is no filter or it has
        neither includes nor excludes.
    """
    if not directory_filter:
        return None

    def contains(fragment: str) -> tantivy.Query:
        # Case-insensitive "directory contains fragment" regex query.
        return tantivy.Query.regex_query(
            schema, "directory", f"(?i).*{re.escape(fragment)}.*"
        )

    included = directory_filter.include
    excluded = directory_filter.exclude
    clauses: list[tuple[tantivy.Occur, tantivy.Query]] = []

    if included:
        alternatives = [(tantivy.Occur.Should, contains(p)) for p in included]
        if len(alternatives) == 1:
            # A single alternative needs no wrapping boolean query.
            wanted = alternatives[0][1]
        else:
            wanted = tantivy.Query.boolean_query(alternatives)
        clauses.append((tantivy.Occur.Must, wanted))
    elif excluded:
        # MustNot clauses need something positive to subtract from.
        clauses.append((tantivy.Occur.Must, tantivy.Query.all_query()))

    clauses.extend((tantivy.Occur.MustNot, contains(p)) for p in excluded)

    return tantivy.Query.boolean_query(clauses) if clauses else None
627
+
628
def _build_date_filter_query(
    self,
    date_filter: DateFilter | None,
    schema: tantivy.Schema,
) -> tantivy.Query | None:
    """Translate a date filter into a range query on the ``timestamp`` field.

    Mapping from operator to range (``cutoff`` is ``date_filter.cutoff``):

    - LESS_THAN: ``timestamp >= cutoff`` (sessions newer than the cutoff)
    - GREATER_THAN: ``timestamp < cutoff`` (sessions older than the cutoff)
    - EXACT "today": ``timestamp >= cutoff``
    - EXACT "yesterday": ``cutoff <= timestamp < cutoff + 1 day``

    A negated filter is expressed as "match all, but not the range".

    Returns:
        The query, or None when there is no filter, the operator is not one
        of the above, or an EXACT value is neither "today" nor "yesterday".
    """
    if not date_filter:
        return None

    cutoff_ts = date_filter.cutoff.timestamp()
    op = date_filter.op

    # Work out (lower, upper, include_upper); the lower bound is always
    # inclusive. Unbounded sides use +/- infinity.
    if op == DateOp.LESS_THAN:
        bounds = (cutoff_ts, float("inf"), True)
    elif op == DateOp.GREATER_THAN:
        bounds = (float("-inf"), cutoff_ts, False)
    elif op == DateOp.EXACT:
        keyword = date_filter.value.lower()
        if keyword == "today":
            bounds = (cutoff_ts, float("inf"), True)
        elif keyword == "yesterday":
            day_end_ts = (date_filter.cutoff + timedelta(days=1)).timestamp()
            bounds = (cutoff_ts, day_end_ts, False)
        else:
            # Unrecognised exact value: impose no date constraint.
            return None
    else:
        return None

    lower, upper, upper_inclusive = bounds
    range_query = tantivy.Query.range_query(
        schema,
        "timestamp",
        tantivy.FieldType.Float,
        lower_bound=lower,
        upper_bound=upper,
        include_lower=True,
        include_upper=upper_inclusive,
    )

    if not date_filter.negated:
        return range_query

    # Negation: everything except what falls inside the range.
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Must, tantivy.Query.all_query()),
            (tantivy.Occur.MustNot, range_query),
        ]
    )
711
+
712
def _build_hybrid_query(
    self,
    query: str,
    index: tantivy.Index,
    schema: tantivy.Schema,
) -> tantivy.Query:
    """Build a query that accepts either an exact or a fuzzy match.

    The exact part is the user's query parsed against ``title`` and
    ``content`` and boosted 5x so exact hits rank above fuzzy ones. The
    fuzzy part requires every whitespace-separated term to match ``title``
    or ``content`` as a prefix within edit distance 1.

    Args:
        query: Raw user query text.
        index: Index used to parse the exact query.
        schema: Schema used to build the fuzzy term queries.

    Returns:
        ``exact OR fuzzy`` when the query contains at least one term,
        otherwise just the boosted exact query.

    Raises:
        Whatever ``index.parse_query`` raises for a query it cannot parse.
    """
    # Exact match query (boosted).
    exact_query = index.parse_query(query, ["title", "content"])
    boosted_exact = tantivy.Query.boost_query(exact_query, 5.0)

    # Fuzzy term queries bypass the tokenizer, while ``title``/``content``
    # are indexed with the default (lowercasing) tokenizer. Lowercase the
    # terms here so an uppercase letter does not consume the single edit
    # that is meant to absorb a typo.
    fuzzy_parts: list[tuple[tantivy.Occur, tantivy.Query]] = []
    for term in query.lower().split():
        fuzzy_title = tantivy.Query.fuzzy_term_query(
            schema, "title", term, distance=1, prefix=True
        )
        fuzzy_content = tantivy.Query.fuzzy_term_query(
            schema, "content", term, distance=1, prefix=True
        )
        # Either field may satisfy this term...
        term_query = tantivy.Query.boolean_query(
            [
                (tantivy.Occur.Should, fuzzy_title),
                (tantivy.Occur.Should, fuzzy_content),
            ]
        )
        # ...but every term must be satisfied.
        fuzzy_parts.append((tantivy.Occur.Must, term_query))

    if not fuzzy_parts:
        return boosted_exact

    # Combine: exact OR fuzzy (either can match, exact scores higher).
    fuzzy_query = tantivy.Query.boolean_query(fuzzy_parts)
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Should, boosted_exact),
            (tantivy.Occur.Should, fuzzy_query),
        ]
    )