chunksilo-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of chunksilo has been flagged as a potentially problematic release.

chunksilo/search.py ADDED
@@ -0,0 +1,784 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: Apache-2.0
+ """
+ Core search pipeline for ChunkSilo.
+
+ Contains all retrieval logic independent of the MCP server.
+ Used by both the MCP server (server.py) and the CLI (cli.py).
+ """
+ import os
+ import time
+ import math
+ import logging
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any
+
+ from llama_index.core import StorageContext, load_index_from_storage, Settings
+ from llama_index.core.schema import TextNode, NodeWithScore
+ from llama_index.embeddings.fastembed import FastEmbedEmbedding
+ from .index import get_heading_store
+ try:
+     from llama_index.readers.confluence import ConfluenceReader
+     import requests  # Available when llama-index-readers-confluence is installed
+ except ImportError:
+     ConfluenceReader = None
+     requests = None
+
+ # TEMPORARY FIX: Patch Confluence HTML parser to handle syntax highlighting spans
+ # Remove when upstream issue is fixed (see confluence_html_formatter.py)
+ if ConfluenceReader is not None:
+     try:
+         from .confluence_html_formatter import patch_confluence_reader
+         patch_confluence_reader()
+     except ImportError:
+         pass
+
+ from .cfgload import load_config
+
+ logger = logging.getLogger(__name__)
+
+
+ def _init_config(config_path: Path | None = None) -> dict[str, Any]:
+     """Load and return configuration."""
+     return load_config(config_path)
+
+
+ # Module-level config (loaded on first use)
+ _config: dict[str, Any] | None = None
+
+
+ def _get_config() -> dict[str, Any]:
+     """Get the module config, loading defaults if not yet initialized."""
+     global _config
+     if _config is None:
+         _config = _init_config()
+     return _config
+
+
+ # Global caches
+ _index_cache = None
+ _embed_model_initialized = False
+ _reranker_model = None
+ _bm25_retriever_cache = None
+ _configured_directories_cache: list[Path] | None = None
+
+ # Common English stopwords to filter from Confluence CQL queries
+ CONFLUENCE_STOPWORDS = frozenset({
+     # Articles
+     "a", "an", "the",
+     # Prepositions
+     "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+     # Conjunctions
+     "and", "or", "but", "if", "then", "so",
+     # Pronouns
+     "i", "me", "my", "we", "us", "our", "you", "your", "he", "she", "it", "they", "them",
+     # Common verbs
+     "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did",
+     "can", "could", "will", "would", "should", "may", "might", "must",
+     # Question words
+     "what", "when", "where", "which", "who", "whom", "whose", "why", "how",
+     # Other common words
+     "this", "that", "these", "those", "here", "there", "all", "any", "each", "some", "no", "not",
+     "about", "into", "over", "after", "before", "between", "under", "again", "just", "only", "also",
+ })
+
+
+ def _setup_offline_mode(config: dict[str, Any]) -> None:
+     """Configure offline mode for HuggingFace libraries if enabled."""
+     offline_mode = config["retrieval"]["offline"]
+     if offline_mode:
+         os.environ["HF_HUB_OFFLINE"] = "1"
+         os.environ["TRANSFORMERS_OFFLINE"] = "1"
+         os.environ["HF_DATASETS_OFFLINE"] = "1"
+         cache_dir_abs = Path(config["storage"]["model_cache_dir"]).resolve()
+         os.environ["HF_HOME"] = str(cache_dir_abs)
+         os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)
+         os.environ["HF_DATASETS_CACHE"] = str(cache_dir_abs)
+
+
+ def _setup_ssl(config: dict[str, Any]) -> str | None:
+     """Configure SSL/TLS CA bundle if specified. Returns the CA bundle path or None."""
+     ca_bundle_path = config["ssl"]["ca_bundle_path"] or None
+     if ca_bundle_path:
+         ca_path = Path(ca_bundle_path)
+         if ca_path.exists():
+             os.environ["REQUESTS_CA_BUNDLE"] = str(ca_path.resolve())
+             os.environ["SSL_CERT_FILE"] = str(ca_path.resolve())
+             logger.info(f"CA bundle configured: {ca_path.resolve()}")
+         else:
+             logger.warning(f"CA bundle path does not exist: {ca_path.resolve()}")
+     return ca_bundle_path
+
+
+ def _get_configured_directories(config: dict[str, Any]) -> list[Path]:
+     """Get list of configured data directories for path resolution."""
+     global _configured_directories_cache
+
+     if _configured_directories_cache is not None:
+         return _configured_directories_cache
+
+     dirs: list[Path] = []
+     for entry in config.get("indexing", {}).get("directories", []):
+         if isinstance(entry, str):
+             dirs.append(Path(entry))
+         elif isinstance(entry, dict) and entry.get("enabled", True):
+             path_str = entry.get("path")
+             if path_str:
+                 dirs.append(Path(path_str))
+
+     _configured_directories_cache = dirs if dirs else []
+     return _configured_directories_cache
+
+
+ def _resolve_file_uri(file_path: str, config: dict[str, Any]) -> str | None:
+     """Resolve a file path to a file:// URI."""
+     try:
+         file_path_obj = Path(str(file_path))
+
+         if file_path_obj.is_absolute():
+             if file_path_obj.exists():
+                 return f"file://{file_path_obj.resolve()}"
+             return f"file://{file_path_obj}"
+
+         for data_dir in _get_configured_directories(config):
+             candidate = data_dir / file_path_obj
+             if candidate.exists():
+                 return f"file://{candidate.resolve()}"
+
+         return f"file://{file_path_obj.resolve()}"
+     except Exception:
+         return None
+
+
+ def _build_heading_path(headings: list[dict], char_start: int | None) -> tuple[str | None, list[str]]:
+     """Build a heading path for the given character position within a document."""
+     if not headings or char_start is None:
+         return None, []
+
+     current_idx = None
+     for idx, heading in enumerate(headings):
+         heading_pos = heading.get("position", 0)
+         if heading_pos <= char_start:
+             current_idx = idx
+         else:
+             break
+
+     if current_idx is None:
+         return None, []
+
+     path = [h.get("text", "") for h in headings[: current_idx + 1] if h.get("text")]
+     current_heading_text = path[-1] if path else None
+     return current_heading_text, path
+
+
+ def _char_offset_to_line(char_offset: int | None, line_offsets: list[int] | None) -> int | None:
+     """Convert a character offset to a line number (1-indexed)."""
+     if char_offset is None or not line_offsets:
+         return None
+
+     left, right = 0, len(line_offsets) - 1
+     while left < right:
+         mid = (left + right + 1) // 2
+         if line_offsets[mid] <= char_offset:
+             left = mid
+         else:
+             right = mid - 1
+
+     return left + 1
+
+
+ def _get_cached_model_path(cache_dir: Path, model_name: str) -> Path | None:
+     """Get the cached model directory path using huggingface_hub's snapshot_download."""
+     try:
+         from huggingface_hub import snapshot_download
+         from fastembed import TextEmbedding
+         models = TextEmbedding.list_supported_models()
+         model_info = [m for m in models if m.get("model") == model_name]
+         if model_info:
+             hf_source = model_info[0].get("sources", {}).get("hf")
+             if hf_source:
+                 cache_dir_abs = cache_dir.resolve()
+                 model_dir = snapshot_download(
+                     repo_id=hf_source,
+                     local_files_only=True,
+                     cache_dir=str(cache_dir_abs)
+                 )
+                 return Path(model_dir).resolve()
+     except (ImportError, Exception):
+         pass
+     return None
+
+
+ def _ensure_embed_model(config: dict[str, Any]) -> None:
+     """Ensure the embedding model is initialized."""
+     global _embed_model_initialized
+
+     if _embed_model_initialized:
+         return
+
+     model_name = config["retrieval"]["embed_model_name"]
+     cache_dir = Path(config["storage"]["model_cache_dir"])
+     offline_mode = config["retrieval"]["offline"]
+
+     cached_model_path = _get_cached_model_path(cache_dir, model_name)
+     if cached_model_path and offline_mode:
+         logger.info(f"Loading embedding model from cache: {cached_model_path}")
+         embed_model = FastEmbedEmbedding(
+             model_name=model_name,
+             cache_dir=str(cache_dir),
+             specific_model_path=str(cached_model_path)
+         )
+     else:
+         embed_model = FastEmbedEmbedding(
+             model_name=model_name,
+             cache_dir=str(cache_dir),
+         )
+     logger.info("Embedding model initialized successfully")
+     Settings.embed_model = embed_model
+     _embed_model_initialized = True
+
+
+ def _ensure_reranker(config: dict[str, Any]):
+     """Load the FlashRank reranking model."""
+     global _reranker_model
+
+     if _reranker_model is not None:
+         return _reranker_model
+
+     try:
+         from flashrank import Ranker
+     except ImportError as exc:
+         raise ImportError(
+             "flashrank is required for reranking. Install with: pip install chunksilo"
+         ) from exc
+
+     model_name = config["retrieval"]["rerank_model_name"]
+     cache_dir = Path(config["storage"]["model_cache_dir"])
+     offline_mode = config["retrieval"]["offline"]
+
+     model_mapping = {
+         "cross-encoder/ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",
+         "ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",
+     }
+     if model_name in model_mapping:
+         model_name = model_mapping[model_name]
+     elif model_name.startswith("cross-encoder/"):
+         base_name = model_name.replace("cross-encoder/", "")
+         if "L-6" in base_name:
+             model_name = base_name.replace("L-6", "L-12")
+         else:
+             model_name = base_name
+
+     try:
+         _reranker_model = Ranker(model_name=model_name, cache_dir=str(cache_dir))
+     except Exception as exc:
+         if offline_mode:
+             raise FileNotFoundError(
+                 f"Rerank model '{model_name}' not available in cache directory {cache_dir}. "
+                 "Download it before running in offline mode."
+             ) from exc
+         raise
+
+     logger.info(f"Rerank model '{model_name}' loaded successfully")
+     return _reranker_model
+
+
+ def _ensure_bm25_retriever(config: dict[str, Any]):
+     """Load the BM25 retriever for file name matching."""
+     global _bm25_retriever_cache
+
+     if _bm25_retriever_cache is not None:
+         return _bm25_retriever_cache
+
+     storage_dir = Path(config["storage"]["storage_dir"])
+     bm25_index_dir = storage_dir / "bm25_index"
+
+     if not bm25_index_dir.exists():
+         logger.warning(f"BM25 index not found at {bm25_index_dir}. Run indexing to create it.")
+         return None
+
+     try:
+         from llama_index.retrievers.bm25 import BM25Retriever
+         logger.info(f"Loading BM25 index from {bm25_index_dir}")
+         _bm25_retriever_cache = BM25Retriever.from_persist_dir(str(bm25_index_dir))
+         logger.info("BM25 retriever loaded successfully")
+         return _bm25_retriever_cache
+     except Exception as e:
+         logger.error(f"Failed to load BM25 retriever: {e}")
+         return None
+
+
+ def _format_bm25_matches(bm25_nodes: list[NodeWithScore], config: dict[str, Any]) -> list[dict[str, Any]]:
+     """Format BM25 file name matches for the response."""
+     matched_files = []
+     for node in bm25_nodes:
+         if node.score is None or node.score <= 0:
+             continue
+         metadata = node.node.metadata or {}
+         file_path = metadata.get("file_path", "")
+         source_uri = _resolve_file_uri(file_path, config) if file_path else None
+         matched_files.append({
+             "uri": source_uri,
+             "score": round(float(node.score), 4),
+         })
+     return matched_files[:5]
+
+
+ def _preprocess_query(query: str) -> str:
+     """Preprocess queries with basic normalization."""
+     if not query or not query.strip():
+         return query
+
+     original_query = query
+     query = " ".join(query.split())
+     query = query.rstrip(".,!?;")
+     processed = query.strip()
+     return processed if processed else original_query
+
+
+ def _prepare_confluence_query_terms(query: str) -> list[str]:
+     """Prepare query terms for Confluence CQL search."""
+     words = query.strip().lower().split()
+     meaningful = [w for w in words if w not in CONFLUENCE_STOPWORDS and len(w) >= 2]
+     return [w.replace('"', '\\"') for w in meaningful]
+
+
+ def _get_confluence_page_dates(
+     base_url: str, page_id: str, username: str, api_token: str,
+     ca_bundle_path: str | None = None
+ ) -> dict[str, str]:
+     """Fetch creation and modification dates for a Confluence page."""
+     if requests is None:
+         return {}
+
+     try:
+         url = f"{base_url.rstrip('/')}/wiki/api/v2/pages/{page_id}"
+         response = requests.get(
+             url,
+             auth=(username, api_token),
+             timeout=5.0,
+             verify=ca_bundle_path if ca_bundle_path else True,
+         )
+         if response.status_code == 200:
+             data = response.json()
+             result = {}
+             if "createdAt" in data:
+                 try:
+                     dt = datetime.fromisoformat(data["createdAt"].replace("Z", "+00:00"))
+                     result["creation_date"] = dt.strftime("%Y-%m-%d")
+                 except Exception:
+                     pass
+             if "version" in data and "createdAt" in data["version"]:
+                 try:
+                     dt = datetime.fromisoformat(data["version"]["createdAt"].replace("Z", "+00:00"))
+                     result["last_modified_date"] = dt.strftime("%Y-%m-%d")
+                 except Exception:
+                     pass
+             return result
+     except Exception as e:
+         logger.debug(f"Failed to fetch Confluence page dates: {e}")
+     return {}
+
+
+ def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
+     """Search Confluence for documents matching the query using CQL."""
+     base_url = config["confluence"]["url"]
+     if not base_url:
+         logger.warning("Confluence search skipped: confluence.url not set in config")
+         return []
+
+     if ConfluenceReader is None:
+         logger.warning("llama-index-readers-confluence not installed, skipping Confluence search")
+         return []
+
+     username = config["confluence"]["username"]
+     api_token = config["confluence"]["api_token"]
+     max_results = config["confluence"]["max_results"]
+     ca_bundle_path = config["ssl"]["ca_bundle_path"] or None
+
+     if not (base_url and username and api_token):
+         missing = []
+         if not username:
+             missing.append("confluence.username")
+         if not api_token:
+             missing.append("confluence.api_token")
+         logger.warning(f"Confluence search skipped: missing {', '.join(missing)} in config")
+         return []
+
+     try:
+         reader = ConfluenceReader(base_url=base_url, user_name=username, api_token=api_token)
+         query_terms = _prepare_confluence_query_terms(query)
+
+         if not query_terms:
+             escaped = query.strip().replace('"', '\\"')
+             if not escaped:
+                 logger.warning("Confluence search skipped: empty query after processing")
+                 return []
+             cql = f'text ~ "{escaped}" AND type = "page"'
+         elif len(query_terms) == 1:
+             cql = f'text ~ "{query_terms[0]}" AND type = "page"'
+         else:
+             text_conditions = ' OR '.join([f'text ~ "{term}"' for term in query_terms])
+             cql = f'({text_conditions}) AND type = "page"'
+
+         documents = reader.load_data(cql=cql, max_num_results=max_results)
+
+         nodes: list[NodeWithScore] = []
+         for doc in documents:
+             metadata = doc.metadata.copy()
+             metadata["source"] = "Confluence"
+             if "title" in metadata:
+                 metadata["file_name"] = metadata["title"]
+
+             page_id = metadata.get("page_id")
+             if page_id:
+                 date_info = _get_confluence_page_dates(base_url, page_id, username, api_token, ca_bundle_path)
+                 metadata.update(date_info)
+
+             node = TextNode(text=doc.text, metadata=metadata)
+             nodes.append(NodeWithScore(node=node, score=0.0))
+
+         return nodes
+
+     except Exception as e:
+         logger.error(f"Failed to search Confluence: {e}", exc_info=True)
+         return []
+
+
+ def load_llamaindex_index(config: dict[str, Any] | None = None):
+     """Load the LlamaIndex from storage."""
+     if config is None:
+         config = _get_config()
+     global _index_cache
+
+     if _index_cache is not None:
+         return _index_cache
+
+     storage_dir = Path(config["storage"]["storage_dir"])
+     if not storage_dir.exists():
+         raise FileNotFoundError(
+             f"Storage directory {storage_dir} does not exist. "
+             "Please run indexing first."
+         )
+
+     logger.info("Loading LlamaIndex from storage...")
+     _ensure_embed_model(config)
+     storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
+     index = load_index_from_storage(storage_context)
+     _index_cache = index
+     return index
+
+
+ def _parse_date(date_str: str) -> datetime | None:
+     """Parse date string in YYYY-MM-DD format."""
+     try:
+         return datetime.strptime(date_str, "%Y-%m-%d")
+     except (ValueError, TypeError):
+         return None
+
+
+ def _filter_nodes_by_date(
+     nodes: list[NodeWithScore],
+     date_from: str | None,
+     date_to: str | None
+ ) -> list[NodeWithScore]:
+     """Filter nodes by date range."""
+     if not date_from and not date_to:
+         return nodes
+
+     from_dt = _parse_date(date_from) if date_from else None
+     to_dt = _parse_date(date_to) if date_to else None
+
+     filtered = []
+     for node in nodes:
+         metadata = node.node.metadata or {}
+         doc_date_str = metadata.get("last_modified_date") or metadata.get("creation_date")
+         if not doc_date_str:
+             filtered.append(node)
+             continue
+
+         doc_date = _parse_date(doc_date_str)
+         if not doc_date:
+             filtered.append(node)
+             continue
+
+         if from_dt and doc_date < from_dt:
+             continue
+         if to_dt and doc_date > to_dt:
+             continue
+
+         filtered.append(node)
+
+     return filtered
+
+
+ def _apply_recency_boost(
+     nodes: list[NodeWithScore],
+     boost_weight: float,
+     half_life_days: int = 365
+ ) -> list[NodeWithScore]:
+     """Apply time-decay boost to nodes based on document recency."""
+     if not nodes or boost_weight <= 0:
+         return nodes
+
+     today = datetime.now()
+     boosted_nodes = []
+
+     for node in nodes:
+         metadata = node.node.metadata or {}
+         doc_date_str = metadata.get("last_modified_date") or metadata.get("creation_date")
+         base_score = node.score if node.score is not None else 0.5
+
+         if not doc_date_str:
+             boosted_nodes.append(NodeWithScore(node=node.node, score=base_score))
+             continue
+
+         doc_date = _parse_date(doc_date_str)
+         if not doc_date:
+             boosted_nodes.append(NodeWithScore(node=node.node, score=base_score))
+             continue
+
+         age_days = (today - doc_date).days
+         if age_days < 0:
+             age_days = 0
+
+         decay_rate = math.log(2) / half_life_days
+         recency_factor = math.exp(-decay_rate * age_days)
+         boosted_score = base_score * (1 + boost_weight * recency_factor)
+
+         boosted_nodes.append(NodeWithScore(node=node.node, score=boosted_score))
+
+     boosted_nodes.sort(key=lambda x: x.score or 0, reverse=True)
+     return boosted_nodes
+
+
+ def run_search(
+     query: str,
+     date_from: str | None = None,
+     date_to: str | None = None,
+     config_path: Path | None = None,
+ ) -> dict[str, Any]:
+     """Execute the full search pipeline and return structured results.
+
+     This is the shared implementation used by both the MCP tool and CLI.
+
+     Args:
+         query: Search query text
+         date_from: Optional start date filter (YYYY-MM-DD, inclusive)
+         date_to: Optional end date filter (YYYY-MM-DD, inclusive)
+         config_path: Optional path to config.yaml
+
+     Returns:
+         Structured response dict with matched_files, chunks, etc.
+     """
+     config = _init_config(config_path) if config_path else _get_config()
+
+     # Setup environment on first call
+     _setup_offline_mode(config)
+     _setup_ssl(config)
+
+     start_time = time.time()
+
+     try:
+         enhanced_query = _preprocess_query(query)
+
+         # Load index
+         index = load_llamaindex_index(config)
+
+         # Stage 1a: Vector search
+         embed_top_k = config["retrieval"]["embed_top_k"]
+         retriever = index.as_retriever(similarity_top_k=embed_top_k)
+         vector_nodes = retriever.retrieve(enhanced_query)
+
+         # Stage 1b: BM25 file name search
+         matched_files: list[dict[str, Any]] = []
+         bm25_retriever = _ensure_bm25_retriever(config)
+         if bm25_retriever:
+             try:
+                 bm25_matches = bm25_retriever.retrieve(enhanced_query)
+                 if bm25_matches:
+                     matched_files = _format_bm25_matches(bm25_matches, config)
+                     logger.info(f"BM25 matched {len(matched_files)} files (from {len(bm25_matches)} candidates)")
+             except Exception as e:
+                 logger.error(f"BM25 search failed: {e}")
+
+         nodes = vector_nodes
+
+         # Search Confluence (with timeout)
+         confluence_nodes: list[NodeWithScore] = []
+         confluence_timeout = config["confluence"]["timeout"]
+         if config["confluence"]["url"]:
+             try:
+                 with ThreadPoolExecutor(max_workers=1) as executor:
+                     future = executor.submit(_search_confluence, enhanced_query, config)
+                     confluence_nodes = future.result(timeout=confluence_timeout)
+                 logger.info(f"Confluence search returned {len(confluence_nodes)} entries")
+             except FuturesTimeoutError:
+                 logger.warning(f"Confluence search timed out after {confluence_timeout}s")
+             except Exception as e:
+                 logger.error(f"Error during Confluence search: {e}")
+
+         if confluence_nodes:
+             nodes.extend(confluence_nodes)
+
+         # Apply date filtering
+         if date_from or date_to:
+             original_count = len(nodes)
+             nodes = _filter_nodes_by_date(nodes, date_from, date_to)
+             logger.info(f"Date filtering: {original_count} -> {len(nodes)} nodes")
+
+         # Apply recency boost
+         recency_boost = config["retrieval"]["recency_boost"]
+         recency_half_life = config["retrieval"]["recency_half_life_days"]
+         if recency_boost > 0:
+             nodes = _apply_recency_boost(nodes, recency_boost, recency_half_life)
+
+         # Cap candidates before reranking
+         rerank_candidates = config["retrieval"]["rerank_candidates"]
+         if len(nodes) > rerank_candidates:
+             logger.info(f"Capping rerank candidates: {len(nodes)} -> {rerank_candidates}")
+             nodes = nodes[:rerank_candidates]
+
+         # Stage 2: Rerank
+         rerank_top_k = config["retrieval"]["rerank_top_k"]
+         rerank_scores: dict[int, float] = {}
+         if nodes:
+             rerank_limit = max(1, min(rerank_top_k, len(nodes)))
+             try:
+                 reranker = _ensure_reranker(config)
+                 passages = [{"text": node.node.get_content() or ""} for node in nodes]
+
+                 from flashrank import RerankRequest
+                 rerank_request = RerankRequest(query=enhanced_query, passages=passages)
+                 reranked_results = reranker.rerank(rerank_request)
+
+                 text_to_indices: dict[str, list[tuple[int, NodeWithScore]]] = {}
+                 for idx, node in enumerate(nodes):
+                     node_text = node.node.get_content() or ""
+                     if node_text not in text_to_indices:
+                         text_to_indices[node_text] = []
+                     text_to_indices[node_text].append((idx, node))
+
+                 reranked_nodes = []
+                 seen_indices: set[int] = set()
+                 for result in reranked_results:
+                     doc_text = result.get("text", "")
+                     score = result.get("score", 0.0)
+
+                     if doc_text in text_to_indices:
+                         for idx, node in text_to_indices[doc_text]:
+                             if idx not in seen_indices:
+                                 reranked_nodes.append(node)
+                                 rerank_scores[id(node)] = float(score)
+                                 seen_indices.add(idx)
+                                 break
+
+                 for idx, node in enumerate(nodes):
+                     if idx not in seen_indices:
+                         reranked_nodes.append(node)
+
+                 nodes = reranked_nodes[:rerank_limit]
+             except Exception as e:
+                 logger.error(f"Reranking failed, falling back to vector search order: {e}")
+                 nodes = nodes[:rerank_limit]
+
+         # Filter by score threshold
+         score_threshold = config["retrieval"]["score_threshold"]
+         if score_threshold > 0:
+             nodes = [
+                 node for node in nodes
+                 if rerank_scores.get(id(node), 0.0) >= score_threshold
+             ]
+
+         # Format chunks
+         chunks = []
+         for node in nodes:
+             metadata = dict(node.node.metadata or {})
+             chunk_text = node.node.get_content()
+
+             file_path = (
+                 metadata.get("file_path")
+                 or metadata.get("file_name")
+                 or metadata.get("source")
+             )
+             original_source = metadata.get("source")
+
+             # Build heading path
+             headings = metadata.get("document_headings") or metadata.get("headings") or []
+             if not headings and file_path:
+                 headings = get_heading_store().get_headings(str(file_path))
+             char_start = getattr(node.node, "start_char_idx", None)
+             heading_text = metadata.get("heading")
+             heading_path: list[str] = []
+             if isinstance(headings, list) and headings:
+                 if heading_text is None and char_start is not None:
+                     heading_text, heading_path = _build_heading_path(headings, char_start)
+             meta_heading_path = metadata.get("heading_path")
+             if not heading_path and meta_heading_path:
+                 heading_path = list(meta_heading_path)
+             if heading_text and (not heading_path or heading_path[-1] != heading_text):
+                 heading_path = heading_path + [heading_text] if heading_path else [heading_text]
+
+             # Build URI
+             source_uri = None
+             if original_source == "Confluence":
+                 confluence_url = config["confluence"]["url"]
+                 page_id = metadata.get("page_id")
+                 if confluence_url and page_id:
+                     source_uri = f"{confluence_url.rstrip('/')}/pages/viewpage.action?pageId={page_id}"
+                 elif confluence_url:
+                     title = metadata.get("title", metadata.get("file_name", ""))
+                     if title:
+                         from urllib.parse import quote
+                         encoded_title = quote(title.replace(" ", "+"))
+                         source_uri = f"{confluence_url.rstrip('/')}/spaces/~{encoded_title}"
+             elif file_path:
+                 source_uri = _resolve_file_uri(file_path, config)
+
+             page_number = (
+                 metadata.get("page_label")
+                 or metadata.get("page_number")
+                 or metadata.get("page")
+             )
+
+             line_number = None
+             line_offsets = metadata.get("line_offsets")
+             if line_offsets and char_start is not None:
+                 line_number = _char_offset_to_line(char_start, line_offsets)
+
+             location = {
+                 "uri": source_uri,
+                 "page": page_number,
+                 "line": line_number,
+                 "heading_path": heading_path if heading_path else None,
+             }
+
+             score_value = rerank_scores.get(id(node), getattr(node, "score", None))
+             chunk_data = {
+                 "text": chunk_text,
+                 "score": round(float(score_value), 3) if score_value is not None else 0.0,
+                 "location": location,
+             }
+             chunks.append(chunk_data)
+
+         elapsed = time.time() - start_time
+
+         return {
+             "matched_files": matched_files,
+             "num_matched_files": len(matched_files),
+             "chunks": chunks,
+             "num_chunks": len(chunks),
+             "query": query,
+             "retrieval_time": f"{elapsed:.2f}s",
+         }
+
+     except Exception as e:
+         logger.error(f"Error in search: {e}", exc_info=True)
+         return {
+             "matched_files": [],
+             "chunks": [],
+             "error": str(e),
+             "query": query,
+         }
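
For reference, the public entry point added in this file is run_search. A minimal usage sketch based on the function signature and return shape shown above; it assumes the package is installed, an index has already been built by chunksilo's indexing step (not shown in this diff), and the default config locations apply. The query string and date used below are made-up example values.

from chunksilo.search import run_search

results = run_search("embedding model offline cache", date_from="2024-01-01")

# File-name matches from the BM25 pass
for match in results["matched_files"]:
    print(match["uri"], match["score"])

# Reranked content chunks with their source locations
for chunk in results["chunks"]:
    location = chunk["location"]
    print(chunk["score"], location["uri"], location["page"], location["heading_path"])

The returned dict mirrors the structure built at the end of run_search: matched_files and chunks plus num_matched_files, num_chunks, query, and retrieval_time, or an error key if the pipeline raised.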