chunksilo-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksilo/__init__.py +4 -0
- chunksilo/__main__.py +3 -0
- chunksilo/cfgload.py +163 -0
- chunksilo/cli.py +124 -0
- chunksilo/confluence_html_formatter.py +96 -0
- chunksilo/index.py +1420 -0
- chunksilo/search.py +784 -0
- chunksilo/server.py +110 -0
- chunksilo-2.0.0.dist-info/METADATA +366 -0
- chunksilo-2.0.0.dist-info/RECORD +15 -0
- chunksilo-2.0.0.dist-info/WHEEL +5 -0
- chunksilo-2.0.0.dist-info/entry_points.txt +3 -0
- chunksilo-2.0.0.dist-info/licenses/LICENSE +191 -0
- chunksilo-2.0.0.dist-info/licenses/NOTICE +33 -0
- chunksilo-2.0.0.dist-info/top_level.txt +1 -0
chunksilo/search.py
ADDED
@@ -0,0 +1,784 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Core search pipeline for ChunkSilo.

Contains all retrieval logic independent of the MCP server.
Used by both the MCP server (server.py) and the CLI (cli.py).
"""
import os
import time
import math
import logging
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
from datetime import datetime
from pathlib import Path
from typing import Any

from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from .index import get_heading_store
try:
    from llama_index.readers.confluence import ConfluenceReader
    import requests  # Available when llama-index-readers-confluence is installed
except ImportError:
    ConfluenceReader = None
    requests = None

# TEMPORARY FIX: Patch Confluence HTML parser to handle syntax-highlighting spans.
# Remove when the upstream issue is fixed (see confluence_html_formatter.py).
if ConfluenceReader is not None:
    try:
        from .confluence_html_formatter import patch_confluence_reader
        patch_confluence_reader()
    except ImportError:
        pass

from .cfgload import load_config

logger = logging.getLogger(__name__)


def _init_config(config_path: Path | None = None) -> dict[str, Any]:
    """Load and return configuration."""
    return load_config(config_path)


# Module-level config (loaded on first use)
_config: dict[str, Any] | None = None


def _get_config() -> dict[str, Any]:
    """Get the module config, loading defaults if not yet initialized."""
    global _config
    if _config is None:
        _config = _init_config()
    return _config


# Global caches
_index_cache = None
_embed_model_initialized = False
_reranker_model = None
_bm25_retriever_cache = None
_configured_directories_cache: list[Path] | None = None

# Common English stopwords to filter from Confluence CQL queries
CONFLUENCE_STOPWORDS = frozenset({
    # Articles
    "a", "an", "the",
    # Prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    # Conjunctions
    "and", "or", "but", "if", "then", "so",
    # Pronouns
    "i", "me", "my", "we", "us", "our", "you", "your", "he", "she", "it", "they", "them",
    # Common verbs
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "can", "could", "will", "would", "should", "may", "might", "must",
    # Question words
    "what", "when", "where", "which", "who", "whom", "whose", "why", "how",
    # Other common words
    "this", "that", "these", "those", "here", "there", "all", "any", "each", "some", "no", "not",
    "about", "into", "over", "after", "before", "between", "under", "again", "just", "only", "also",
})


def _setup_offline_mode(config: dict[str, Any]) -> None:
    """Configure offline mode for HuggingFace libraries if enabled."""
    offline_mode = config["retrieval"]["offline"]
    if offline_mode:
        os.environ["HF_HUB_OFFLINE"] = "1"
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        os.environ["HF_DATASETS_OFFLINE"] = "1"
        cache_dir_abs = Path(config["storage"]["model_cache_dir"]).resolve()
        os.environ["HF_HOME"] = str(cache_dir_abs)
        os.environ["HF_HUB_CACHE"] = str(cache_dir_abs)
        os.environ["HF_DATASETS_CACHE"] = str(cache_dir_abs)


def _setup_ssl(config: dict[str, Any]) -> str | None:
    """Configure SSL/TLS CA bundle if specified. Returns the CA bundle path or None."""
    ca_bundle_path = config["ssl"]["ca_bundle_path"] or None
    if ca_bundle_path:
        ca_path = Path(ca_bundle_path)
        if ca_path.exists():
            os.environ["REQUESTS_CA_BUNDLE"] = str(ca_path.resolve())
            os.environ["SSL_CERT_FILE"] = str(ca_path.resolve())
            logger.info(f"CA bundle configured: {ca_path.resolve()}")
        else:
            logger.warning(f"CA bundle path does not exist: {ca_path.resolve()}")
    return ca_bundle_path
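
# Editor's sketch: the two setup helpers above only read a few config keys.
# Based on the lookups in this module (key names from the code, values purely
# illustrative), the relevant part of config.yaml would look roughly like:
#
#   retrieval:
#     offline: true
#   storage:
#     model_cache_dir: ./models
#   ssl:
#     ca_bundle_path: /etc/ssl/certs/corp-ca.pem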


def _get_configured_directories(config: dict[str, Any]) -> list[Path]:
    """Get list of configured data directories for path resolution."""
    global _configured_directories_cache

    if _configured_directories_cache is not None:
        return _configured_directories_cache

    dirs: list[Path] = []
    for entry in config.get("indexing", {}).get("directories", []):
        if isinstance(entry, str):
            dirs.append(Path(entry))
        elif isinstance(entry, dict) and entry.get("enabled", True):
            path_str = entry.get("path")
            if path_str:
                dirs.append(Path(path_str))

    _configured_directories_cache = dirs
    return _configured_directories_cache


def _resolve_file_uri(file_path: str, config: dict[str, Any]) -> str | None:
    """Resolve a file path to a file:// URI."""
    try:
        file_path_obj = Path(str(file_path))

        if file_path_obj.is_absolute():
            if file_path_obj.exists():
                return f"file://{file_path_obj.resolve()}"
            return f"file://{file_path_obj}"

        for data_dir in _get_configured_directories(config):
            candidate = data_dir / file_path_obj
            if candidate.exists():
                return f"file://{candidate.resolve()}"

        return f"file://{file_path_obj.resolve()}"
    except Exception:
        return None
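
# Example (editor's note, illustrative paths): with
# indexing.directories = ["/srv/docs"], a relative metadata path like
# "guides/setup.md" resolves to "file:///srv/docs/guides/setup.md" when that
# file exists; absolute paths are returned as-is, and unresolvable relative
# paths fall back to resolution against the current working directory.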


def _build_heading_path(headings: list[dict], char_start: int | None) -> tuple[str | None, list[str]]:
    """Build a heading path for the given character position within a document."""
    if not headings or char_start is None:
        return None, []

    current_idx = None
    for idx, heading in enumerate(headings):
        heading_pos = heading.get("position", 0)
        if heading_pos <= char_start:
            current_idx = idx
        else:
            break

    if current_idx is None:
        return None, []

    path = [h.get("text", "") for h in headings[: current_idx + 1] if h.get("text")]
    current_heading_text = path[-1] if path else None
    return current_heading_text, path
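
# Example (editor's note): for headings
#   [{"text": "Install", "position": 0}, {"text": "Linux", "position": 120}]
# a chunk starting at char 150 yields ("Linux", ["Install", "Linux"]). The
# path is every heading seen so far in document order, not a strict
# ancestor chain.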


def _char_offset_to_line(char_offset: int | None, line_offsets: list[int] | None) -> int | None:
    """Convert a character offset to a line number (1-indexed)."""
    if char_offset is None or not line_offsets:
        return None

    left, right = 0, len(line_offsets) - 1
    while left < right:
        mid = (left + right + 1) // 2
        if line_offsets[mid] <= char_offset:
            left = mid
        else:
            right = mid - 1

    return left + 1
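
# Editor's note: the loop is an upper-bound binary search for the last line
# whose starting offset is <= char_offset. Example: with
# line_offsets = [0, 10, 25], char_offset 12 lands on index 1 and returns
# line 2 (the line covering characters 10..24).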


def _get_cached_model_path(cache_dir: Path, model_name: str) -> Path | None:
    """Get the cached model directory path using huggingface_hub's snapshot_download."""
    try:
        from huggingface_hub import snapshot_download
        from fastembed import TextEmbedding
        models = TextEmbedding.list_supported_models()
        model_info = [m for m in models if m.get("model") == model_name]
        if model_info:
            hf_source = model_info[0].get("sources", {}).get("hf")
            if hf_source:
                cache_dir_abs = cache_dir.resolve()
                model_dir = snapshot_download(
                    repo_id=hf_source,
                    local_files_only=True,
                    cache_dir=str(cache_dir_abs)
                )
                return Path(model_dir).resolve()
    except Exception:  # Exception already subsumes ImportError
        pass
    return None


def _ensure_embed_model(config: dict[str, Any]) -> None:
    """Ensure the embedding model is initialized."""
    global _embed_model_initialized

    if _embed_model_initialized:
        return

    model_name = config["retrieval"]["embed_model_name"]
    cache_dir = Path(config["storage"]["model_cache_dir"])
    offline_mode = config["retrieval"]["offline"]

    cached_model_path = _get_cached_model_path(cache_dir, model_name)
    if cached_model_path and offline_mode:
        logger.info(f"Loading embedding model from cache: {cached_model_path}")
        embed_model = FastEmbedEmbedding(
            model_name=model_name,
            cache_dir=str(cache_dir),
            specific_model_path=str(cached_model_path)
        )
    else:
        embed_model = FastEmbedEmbedding(
            model_name=model_name,
            cache_dir=str(cache_dir),
        )
    logger.info("Embedding model initialized successfully")
    Settings.embed_model = embed_model
    _embed_model_initialized = True
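
# Editor's note: Settings.embed_model is LlamaIndex's global default, so the
# model configured here is picked up by load_index_from_storage() and the
# retriever in run_search() without being passed around explicitly.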


def _ensure_reranker(config: dict[str, Any]):
    """Load the FlashRank reranking model."""
    global _reranker_model

    if _reranker_model is not None:
        return _reranker_model

    try:
        from flashrank import Ranker
    except ImportError as exc:
        raise ImportError(
            "flashrank is required for reranking. Install with: pip install chunksilo"
        ) from exc

    model_name = config["retrieval"]["rerank_model_name"]
    cache_dir = Path(config["storage"]["model_cache_dir"])
    offline_mode = config["retrieval"]["offline"]

    model_mapping = {
        "cross-encoder/ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",
        "ms-marco-MiniLM-L-6-v2": "ms-marco-MiniLM-L-12-v2",
    }
    if model_name in model_mapping:
        model_name = model_mapping[model_name]
    elif model_name.startswith("cross-encoder/"):
        base_name = model_name.replace("cross-encoder/", "")
        if "L-6" in base_name:
            model_name = base_name.replace("L-6", "L-12")
        else:
            model_name = base_name

    try:
        _reranker_model = Ranker(model_name=model_name, cache_dir=str(cache_dir))
    except Exception as exc:
        if offline_mode:
            raise FileNotFoundError(
                f"Rerank model '{model_name}' not available in cache directory {cache_dir}. "
                "Download it before running in offline mode."
            ) from exc
        raise

    logger.info(f"Rerank model '{model_name}' loaded successfully")
    return _reranker_model


def _ensure_bm25_retriever(config: dict[str, Any]):
    """Load the BM25 retriever for file name matching."""
    global _bm25_retriever_cache

    if _bm25_retriever_cache is not None:
        return _bm25_retriever_cache

    storage_dir = Path(config["storage"]["storage_dir"])
    bm25_index_dir = storage_dir / "bm25_index"

    if not bm25_index_dir.exists():
        logger.warning(f"BM25 index not found at {bm25_index_dir}. Run indexing to create it.")
        return None

    try:
        from llama_index.retrievers.bm25 import BM25Retriever
        logger.info(f"Loading BM25 index from {bm25_index_dir}")
        _bm25_retriever_cache = BM25Retriever.from_persist_dir(str(bm25_index_dir))
        logger.info("BM25 retriever loaded successfully")
        return _bm25_retriever_cache
    except Exception as e:
        logger.error(f"Failed to load BM25 retriever: {e}")
        return None


def _format_bm25_matches(bm25_nodes: list[NodeWithScore], config: dict[str, Any]) -> list[dict[str, Any]]:
    """Format BM25 file name matches for the response."""
    matched_files = []
    for node in bm25_nodes:
        if node.score is None or node.score <= 0:
            continue
        metadata = node.node.metadata or {}
        file_path = metadata.get("file_path", "")
        source_uri = _resolve_file_uri(file_path, config) if file_path else None
        matched_files.append({
            "uri": source_uri,
            "score": round(float(node.score), 4),
        })
    return matched_files[:5]
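
# Example output shape (editor's note, illustrative values): at most five
# entries like
#   [{"uri": "file:///srv/docs/setup.md", "score": 7.4321}]
# with zero- and None-scored candidates already dropped.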


def _preprocess_query(query: str) -> str:
    """Preprocess queries with basic normalization."""
    if not query or not query.strip():
        return query

    original_query = query
    query = " ".join(query.split())
    query = query.rstrip(".,!?;")
    processed = query.strip()
    return processed if processed else original_query
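
# Example (editor's note): "  How do I   reset my token?? " normalizes to
# "How do I reset my token": whitespace is collapsed and trailing punctuation
# stripped, with a fallback to the original query if nothing remains.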


def _prepare_confluence_query_terms(query: str) -> list[str]:
    """Prepare query terms for Confluence CQL search."""
    words = query.strip().lower().split()
    meaningful = [w for w in words if w not in CONFLUENCE_STOPWORDS and len(w) >= 2]
    return [w.replace('"', '\\"') for w in meaningful]
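
# Example (editor's note): "how do I rotate api tokens" keeps
# ["rotate", "api", "tokens"] after stopword and single-character filtering,
# which _search_confluence below turns into CQL like
#   (text ~ "rotate" OR text ~ "api" OR text ~ "tokens") AND type = "page"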


def _get_confluence_page_dates(
    base_url: str, page_id: str, username: str, api_token: str,
    ca_bundle_path: str | None = None
) -> dict[str, str]:
    """Fetch creation and modification dates for a Confluence page."""
    if requests is None:
        return {}

    try:
        url = f"{base_url.rstrip('/')}/wiki/api/v2/pages/{page_id}"
        response = requests.get(
            url,
            auth=(username, api_token),
            timeout=5.0,
            verify=ca_bundle_path if ca_bundle_path else True,
        )
        if response.status_code == 200:
            data = response.json()
            result = {}
            if "createdAt" in data:
                try:
                    dt = datetime.fromisoformat(data["createdAt"].replace("Z", "+00:00"))
                    result["creation_date"] = dt.strftime("%Y-%m-%d")
                except Exception:
                    pass
            if "version" in data and "createdAt" in data["version"]:
                try:
                    dt = datetime.fromisoformat(data["version"]["createdAt"].replace("Z", "+00:00"))
                    result["last_modified_date"] = dt.strftime("%Y-%m-%d")
                except Exception:
                    pass
            return result
    except Exception as e:
        logger.debug(f"Failed to fetch Confluence page dates: {e}")
    return {}


def _search_confluence(query: str, config: dict[str, Any]) -> list[NodeWithScore]:
    """Search Confluence for documents matching the query using CQL."""
    base_url = config["confluence"]["url"]
    if not base_url:
        logger.warning("Confluence search skipped: confluence.url not set in config")
        return []

    if ConfluenceReader is None:
        logger.warning("llama-index-readers-confluence not installed, skipping Confluence search")
        return []

    username = config["confluence"]["username"]
    api_token = config["confluence"]["api_token"]
    max_results = config["confluence"]["max_results"]
    ca_bundle_path = config["ssl"]["ca_bundle_path"] or None

    if not (base_url and username and api_token):
        missing = []
        if not username:
            missing.append("confluence.username")
        if not api_token:
            missing.append("confluence.api_token")
        logger.warning(f"Confluence search skipped: missing {', '.join(missing)} in config")
        return []

    try:
        reader = ConfluenceReader(base_url=base_url, user_name=username, api_token=api_token)
        query_terms = _prepare_confluence_query_terms(query)

        if not query_terms:
            escaped = query.strip().replace('"', '\\"')
            if not escaped:
                logger.warning("Confluence search skipped: empty query after processing")
                return []
            cql = f'text ~ "{escaped}" AND type = "page"'
        elif len(query_terms) == 1:
            cql = f'text ~ "{query_terms[0]}" AND type = "page"'
        else:
            text_conditions = ' OR '.join([f'text ~ "{term}"' for term in query_terms])
            cql = f'({text_conditions}) AND type = "page"'

        documents = reader.load_data(cql=cql, max_num_results=max_results)

        nodes: list[NodeWithScore] = []
        for doc in documents:
            metadata = doc.metadata.copy()
            metadata["source"] = "Confluence"
            if "title" in metadata:
                metadata["file_name"] = metadata["title"]

            page_id = metadata.get("page_id")
            if page_id:
                date_info = _get_confluence_page_dates(base_url, page_id, username, api_token, ca_bundle_path)
                metadata.update(date_info)

            node = TextNode(text=doc.text, metadata=metadata)
            nodes.append(NodeWithScore(node=node, score=0.0))

        return nodes

    except Exception as e:
        logger.error(f"Failed to search Confluence: {e}", exc_info=True)
        return []


def load_llamaindex_index(config: dict[str, Any] | None = None):
    """Load the LlamaIndex from storage."""
    if config is None:
        config = _get_config()
    global _index_cache

    if _index_cache is not None:
        return _index_cache

    storage_dir = Path(config["storage"]["storage_dir"])
    if not storage_dir.exists():
        raise FileNotFoundError(
            f"Storage directory {storage_dir} does not exist. "
            "Please run indexing first."
        )

    logger.info("Loading LlamaIndex from storage...")
    _ensure_embed_model(config)
    storage_context = StorageContext.from_defaults(persist_dir=str(storage_dir))
    index = load_index_from_storage(storage_context)
    _index_cache = index
    return index


def _parse_date(date_str: str) -> datetime | None:
    """Parse date string in YYYY-MM-DD format."""
    try:
        return datetime.strptime(date_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        return None


def _filter_nodes_by_date(
    nodes: list[NodeWithScore],
    date_from: str | None,
    date_to: str | None
) -> list[NodeWithScore]:
    """Filter nodes by date range."""
    if not date_from and not date_to:
        return nodes

    from_dt = _parse_date(date_from) if date_from else None
    to_dt = _parse_date(date_to) if date_to else None

    filtered = []
    for node in nodes:
        metadata = node.node.metadata or {}
        doc_date_str = metadata.get("last_modified_date") or metadata.get("creation_date")
        if not doc_date_str:
            filtered.append(node)
            continue

        doc_date = _parse_date(doc_date_str)
        if not doc_date:
            filtered.append(node)
            continue

        if from_dt and doc_date < from_dt:
            continue
        if to_dt and doc_date > to_dt:
            continue

        filtered.append(node)

    return filtered
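
# Example (editor's note): with date_from="2024-01-01", a node dated
# 2023-12-31 is dropped, a node dated 2024-03-15 is kept, and nodes without a
# parseable date pass through unfiltered rather than being excluded.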


def _apply_recency_boost(
    nodes: list[NodeWithScore],
    boost_weight: float,
    half_life_days: int = 365
) -> list[NodeWithScore]:
    """Apply time-decay boost to nodes based on document recency."""
    if not nodes or boost_weight <= 0:
        return nodes

    today = datetime.now()
    boosted_nodes = []

    for node in nodes:
        metadata = node.node.metadata or {}
        doc_date_str = metadata.get("last_modified_date") or metadata.get("creation_date")
        base_score = node.score if node.score is not None else 0.5

        if not doc_date_str:
            boosted_nodes.append(NodeWithScore(node=node.node, score=base_score))
            continue

        doc_date = _parse_date(doc_date_str)
        if not doc_date:
            boosted_nodes.append(NodeWithScore(node=node.node, score=base_score))
            continue

        age_days = (today - doc_date).days
        if age_days < 0:
            age_days = 0

        decay_rate = math.log(2) / half_life_days
        recency_factor = math.exp(-decay_rate * age_days)
        boosted_score = base_score * (1 + boost_weight * recency_factor)

        boosted_nodes.append(NodeWithScore(node=node.node, score=boosted_score))

    boosted_nodes.sort(key=lambda x: x.score or 0, reverse=True)
    return boosted_nodes
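
# Worked example (editor's note): recency_factor = exp(-ln(2) * age / half_life)
# halves every half_life_days. With boost_weight=0.3 and half_life_days=365, a
# document modified today scores base * 1.3, a one-year-old document scores
# base * 1.15, and very old documents approach the unboosted base score.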


def run_search(
    query: str,
    date_from: str | None = None,
    date_to: str | None = None,
    config_path: Path | None = None,
) -> dict[str, Any]:
    """Execute the full search pipeline and return structured results.

    This is the shared implementation used by both the MCP tool and CLI.

    Args:
        query: Search query text
        date_from: Optional start date filter (YYYY-MM-DD, inclusive)
        date_to: Optional end date filter (YYYY-MM-DD, inclusive)
        config_path: Optional path to config.yaml

    Returns:
        Structured response dict with matched_files, chunks, etc.
    """
    config = _init_config(config_path) if config_path else _get_config()

    # Setup environment on first call
    _setup_offline_mode(config)
    _setup_ssl(config)

    start_time = time.time()

    try:
        enhanced_query = _preprocess_query(query)

        # Load index
        index = load_llamaindex_index(config)

        # Stage 1a: Vector search
        embed_top_k = config["retrieval"]["embed_top_k"]
        retriever = index.as_retriever(similarity_top_k=embed_top_k)
        vector_nodes = retriever.retrieve(enhanced_query)

        # Stage 1b: BM25 file name search
        matched_files: list[dict[str, Any]] = []
        bm25_retriever = _ensure_bm25_retriever(config)
        if bm25_retriever:
            try:
                bm25_matches = bm25_retriever.retrieve(enhanced_query)
                if bm25_matches:
                    matched_files = _format_bm25_matches(bm25_matches, config)
                    logger.info(f"BM25 matched {len(matched_files)} files (from {len(bm25_matches)} candidates)")
            except Exception as e:
                logger.error(f"BM25 search failed: {e}")

        nodes = vector_nodes

        # Search Confluence (with timeout)
        confluence_nodes: list[NodeWithScore] = []
        confluence_timeout = config["confluence"]["timeout"]
        if config["confluence"]["url"]:
            try:
                with ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(_search_confluence, enhanced_query, config)
                    confluence_nodes = future.result(timeout=confluence_timeout)
                    logger.info(f"Confluence search returned {len(confluence_nodes)} entries")
            except FuturesTimeoutError:
                logger.warning(f"Confluence search timed out after {confluence_timeout}s")
            except Exception as e:
                logger.error(f"Error during Confluence search: {e}")

        if confluence_nodes:
            nodes.extend(confluence_nodes)

        # Apply date filtering
        if date_from or date_to:
            original_count = len(nodes)
            nodes = _filter_nodes_by_date(nodes, date_from, date_to)
            logger.info(f"Date filtering: {original_count} -> {len(nodes)} nodes")

        # Apply recency boost
        recency_boost = config["retrieval"]["recency_boost"]
        recency_half_life = config["retrieval"]["recency_half_life_days"]
        if recency_boost > 0:
            nodes = _apply_recency_boost(nodes, recency_boost, recency_half_life)

        # Cap candidates before reranking
        rerank_candidates = config["retrieval"]["rerank_candidates"]
        if len(nodes) > rerank_candidates:
            logger.info(f"Capping rerank candidates: {len(nodes)} -> {rerank_candidates}")
            nodes = nodes[:rerank_candidates]

        # Stage 2: Rerank
        rerank_top_k = config["retrieval"]["rerank_top_k"]
        rerank_scores: dict[int, float] = {}
        if nodes:
            rerank_limit = max(1, min(rerank_top_k, len(nodes)))
            try:
                reranker = _ensure_reranker(config)
                passages = [{"text": node.node.get_content() or ""} for node in nodes]

                from flashrank import RerankRequest
                rerank_request = RerankRequest(query=enhanced_query, passages=passages)
                reranked_results = reranker.rerank(rerank_request)
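
                # Editor's note: FlashRank returns the input passages (dicts
                # with "text" plus a "score") in relevance order. The block
                # below maps each scored text back to its original node,
                # using seen_indices so duplicate chunk texts are consumed
                # one at a time instead of all receiving the same score.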
                text_to_indices: dict[str, list[tuple[int, NodeWithScore]]] = {}
                for idx, node in enumerate(nodes):
                    node_text = node.node.get_content() or ""
                    if node_text not in text_to_indices:
                        text_to_indices[node_text] = []
                    text_to_indices[node_text].append((idx, node))

                reranked_nodes = []
                seen_indices: set[int] = set()
                for result in reranked_results:
                    doc_text = result.get("text", "")
                    score = result.get("score", 0.0)

                    if doc_text in text_to_indices:
                        for idx, node in text_to_indices[doc_text]:
                            if idx not in seen_indices:
                                reranked_nodes.append(node)
                                rerank_scores[id(node)] = float(score)
                                seen_indices.add(idx)
                                break

                for idx, node in enumerate(nodes):
                    if idx not in seen_indices:
                        reranked_nodes.append(node)

                nodes = reranked_nodes[:rerank_limit]
            except Exception as e:
                logger.error(f"Reranking failed, falling back to vector search order: {e}")
                nodes = nodes[:rerank_limit]

        # Filter by score threshold
        score_threshold = config["retrieval"]["score_threshold"]
        if score_threshold > 0:
            nodes = [
                node for node in nodes
                if rerank_scores.get(id(node), 0.0) >= score_threshold
            ]

        # Format chunks
        chunks = []
        for node in nodes:
            metadata = dict(node.node.metadata or {})
            chunk_text = node.node.get_content()

            file_path = (
                metadata.get("file_path")
                or metadata.get("file_name")
                or metadata.get("source")
            )
            original_source = metadata.get("source")

            # Build heading path
            headings = metadata.get("document_headings") or metadata.get("headings") or []
            if not headings and file_path:
                headings = get_heading_store().get_headings(str(file_path))
            char_start = getattr(node.node, "start_char_idx", None)
            heading_text = metadata.get("heading")
            heading_path: list[str] = []
            if isinstance(headings, list) and headings:
                if heading_text is None and char_start is not None:
                    heading_text, heading_path = _build_heading_path(headings, char_start)
            meta_heading_path = metadata.get("heading_path")
            if not heading_path and meta_heading_path:
                heading_path = list(meta_heading_path)
            if heading_text and (not heading_path or heading_path[-1] != heading_text):
                heading_path = heading_path + [heading_text] if heading_path else [heading_text]

            # Build URI
            source_uri = None
            if original_source == "Confluence":
                confluence_url = config["confluence"]["url"]
                page_id = metadata.get("page_id")
                if confluence_url and page_id:
                    source_uri = f"{confluence_url.rstrip('/')}/pages/viewpage.action?pageId={page_id}"
                elif confluence_url:
                    title = metadata.get("title", metadata.get("file_name", ""))
                    if title:
                        from urllib.parse import quote
                        encoded_title = quote(title.replace(" ", "+"))
                        source_uri = f"{confluence_url.rstrip('/')}/spaces/~{encoded_title}"
            elif file_path:
                source_uri = _resolve_file_uri(file_path, config)

            page_number = (
                metadata.get("page_label")
                or metadata.get("page_number")
                or metadata.get("page")
            )

            line_number = None
            line_offsets = metadata.get("line_offsets")
            if line_offsets and char_start is not None:
                line_number = _char_offset_to_line(char_start, line_offsets)

            location = {
                "uri": source_uri,
                "page": page_number,
                "line": line_number,
                "heading_path": heading_path if heading_path else None,
            }

            score_value = rerank_scores.get(id(node), getattr(node, "score", None))
            chunk_data = {
                "text": chunk_text,
                "score": round(float(score_value), 3) if score_value is not None else 0.0,
                "location": location,
            }
            chunks.append(chunk_data)

        elapsed = time.time() - start_time

        return {
            "matched_files": matched_files,
            "num_matched_files": len(matched_files),
            "chunks": chunks,
            "num_chunks": len(chunks),
            "query": query,
            "retrieval_time": f"{elapsed:.2f}s",
        }

    except Exception as e:
        logger.error(f"Error in search: {e}", exc_info=True)
        return {
            "matched_files": [],
            "chunks": [],
            "error": str(e),
            "query": query,
        }
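
# --- Example usage (editor's sketch, not part of the published module) ---
# Assuming an index has already been built under storage.storage_dir, the
# pipeline can be exercised end to end without the MCP server:
#
#   from chunksilo.search import run_search
#   results = run_search("how do I enable offline mode?", date_from="2024-01-01")
#   print(results["num_chunks"], results["retrieval_time"])
#   for chunk in results["chunks"]:
#       print(chunk["score"], chunk["location"]["uri"])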