gnosisllm-knowledge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
"""OpenSearch knowledge searcher implementation.
|
|
2
|
+
|
|
3
|
+
Uses OpenSearch neural search - embeddings are generated automatically
|
|
4
|
+
by the deployed ML model. No Python-side embedding generation needed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
|
|
14
|
+
from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
|
|
15
|
+
from gnosisllm_knowledge.core.domain.search import (
|
|
16
|
+
SearchMode,
|
|
17
|
+
SearchQuery,
|
|
18
|
+
SearchResult,
|
|
19
|
+
SearchResultItem,
|
|
20
|
+
)
|
|
21
|
+
from gnosisllm_knowledge.core.exceptions import SearchError
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
    # Imported only for static type checking; `from __future__ import annotations`
    # keeps the annotation lazy, so opensearch-py is not required at runtime
    # merely to import this module.
    from opensearchpy import AsyncOpenSearch

# Module-level logger following the stdlib convention (named after the module).
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OpenSearchKnowledgeSearcher:
    """OpenSearch knowledge searcher.

    Implements the IKnowledgeSearcher protocol for semantic, keyword,
    and hybrid search over knowledge documents.

    Uses OpenSearch neural search for semantic queries - the deployed
    ML model handles embedding generation automatically, so no
    Python-side embedding generation is needed.

    Example:
        ```python
        config = OpenSearchConfig.from_env()
        client = AsyncOpenSearch(hosts=[config.url])
        searcher = OpenSearchKnowledgeSearcher(client, config)

        query = SearchQuery(text="how to configure", mode=SearchMode.HYBRID)
        results = await searcher.search(query, "my-index")
        ```
    """

    def __init__(
        self,
        client: AsyncOpenSearch,
        config: OpenSearchConfig,
    ) -> None:
        """Initialize the searcher.

        Args:
            client: OpenSearch async client.
            config: OpenSearch configuration (includes model_id).
        """
        self._client = client
        self._config = config

    def _resolve_model_id(
        self,
        query: SearchQuery,
        options: dict[str, Any],
        operation: str = "semantic search",
    ) -> str:
        """Resolve the ML model id from per-call options or config.

        Args:
            query: Query being executed (used only for error context).
            options: Per-call options that may override ``config.model_id``.
            operation: Human-readable operation name for the error message.

        Returns:
            The model id to use for neural query clauses.

        Raises:
            SearchError: If no model id is supplied or configured.
        """
        model_id = options.get("model_id", self._config.model_id)
        if not model_id:
            raise SearchError(
                message=f"model_id required for {operation}",
                details={"query": query.text[:100]},
            )
        return model_id

    async def search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute search query using the specified mode.

        Unknown modes fall back to hybrid search.

        Args:
            query: Search query with mode and parameters.
            index_name: Index to search.
            **options: Additional search options.

        Returns:
            Search results.
        """
        # Dispatch table instead of an if/elif chain; hybrid is the fallback
        # for any mode without a dedicated handler.
        handlers = {
            SearchMode.SEMANTIC: self.semantic_search,
            SearchMode.KEYWORD: self.keyword_search,
            SearchMode.HYBRID: self.hybrid_search,
        }
        handler = handlers.get(query.mode, self.hybrid_search)
        return await handler(query, index_name, **options)

    async def semantic_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute semantic (neural) search.

        OpenSearch handles embedding generation via the deployed model.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: Additional options (``model_id`` overrides config).

        Returns:
            Search results.

        Raises:
            SearchError: If no model id is available or the search fails.
        """
        start_time = time.perf_counter()

        model_id = self._resolve_model_id(query, options, "semantic search")

        builder = QueryBuilder(
            query,
            model_id=model_id,
            embedding_field=self._config.embedding_field,
        )
        os_query = builder.build_semantic_query()

        return await self._execute_search(query, os_query, index_name, start_time)

    async def keyword_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute keyword (BM25) search.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: Additional options (unused; accepted for interface parity).

        Returns:
            Search results.

        Raises:
            SearchError: If the search request fails.
        """
        start_time = time.perf_counter()

        # Keyword search needs no model: BM25 runs directly on indexed text.
        builder = QueryBuilder(query)
        os_query = builder.build_keyword_query()

        return await self._execute_search(query, os_query, index_name, start_time)

    async def hybrid_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute hybrid search (semantic + keyword).

        Uses OpenSearch hybrid query with search pipeline for
        score normalization.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: semantic_weight, keyword_weight, model_id.

        Returns:
            Search results.

        Raises:
            SearchError: If no model id is available or the search fails.
        """
        start_time = time.perf_counter()

        # Fail fast with a clear message rather than letting OpenSearch reject
        # a neural clause with a null model id deep inside query execution.
        model_id = self._resolve_model_id(query, options, "hybrid search")

        # Relative weighting of the two sub-queries; defaults favor semantic.
        semantic_weight = options.get("semantic_weight", 0.7)
        keyword_weight = options.get("keyword_weight", 0.3)

        builder = QueryBuilder(
            query,
            model_id=model_id,
            embedding_field=self._config.embedding_field,
        )
        os_query = builder.build_hybrid_query(
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight,
        )

        # The search pipeline (if configured) normalizes and combines the
        # sub-query scores server-side.
        search_params: dict[str, Any] = {}
        if self._config.search_pipeline_name:
            search_params["search_pipeline"] = self._config.search_pipeline_name

        return await self._execute_search(
            query, os_query, index_name, start_time, **search_params
        )

    async def get_similar_documents(
        self,
        doc_id: str,
        index_name: str,
        limit: int = 10,
    ) -> SearchResult:
        """Find documents similar to a given document.

        Args:
            doc_id: Document ID to find similar documents for.
            index_name: Index to search.
            limit: Maximum results.

        Returns:
            Search results with similar documents.

        Raises:
            SearchError: If the search request fails.
        """
        start_time = time.perf_counter()

        # Empty query text: similarity comes from the more-like-this clause,
        # not from user-provided terms.
        query = SearchQuery(text="", limit=limit)
        builder = QueryBuilder(query)
        os_query = builder.build_more_like_this_query(doc_id)

        return await self._execute_search(query, os_query, index_name, start_time)

    async def multi_search(
        self,
        queries: list[SearchQuery],
        index_name: str,
    ) -> list[SearchResult]:
        """Execute multiple searches in a single request.

        Args:
            queries: List of search queries.
            index_name: Index to search.

        Returns:
            List of search results, in the same order as ``queries``.

        Raises:
            SearchError: If any individual search in the batch fails.
        """
        if not queries:
            return []

        start_time = time.perf_counter()

        # msearch body alternates header lines ({"index": ...}) with query bodies.
        msearch_body: list[dict[str, Any]] = []
        for query in queries:
            builder = QueryBuilder(
                query,
                model_id=self._config.model_id,
                embedding_field=self._config.embedding_field,
            )

            if query.mode == SearchMode.SEMANTIC:
                os_query = builder.build_semantic_query()
            elif query.mode == SearchMode.KEYWORD:
                os_query = builder.build_keyword_query()
            else:
                os_query = builder.build_hybrid_query()

            msearch_body.append({"index": index_name})
            msearch_body.append(os_query)

        response = await self._client.msearch(body=msearch_body)

        results: list[SearchResult] = []
        for i, resp in enumerate(response.get("responses", [])):
            # msearch reports per-item failures inline; surface them instead of
            # silently returning an empty result for the failed query.
            if "error" in resp:
                raise SearchError(
                    message=f"Multi-search item {i} failed",
                    details={
                        "query": queries[i].text[:100],
                        "index": index_name,
                        "error": resp["error"],
                    },
                )
            duration_ms = (time.perf_counter() - start_time) * 1000
            results.append(self._parse_response(queries[i], resp, duration_ms))

        return results

    async def _execute_search(
        self,
        query: SearchQuery,
        os_query: dict[str, Any],
        index_name: str,
        start_time: float,
        **params: Any,
    ) -> SearchResult:
        """Execute search and parse results.

        Args:
            query: Original search query.
            os_query: OpenSearch query dictionary.
            index_name: Index to search.
            start_time: Search start time (``time.perf_counter()`` value).
            **params: Additional search parameters (e.g. ``search_pipeline``).

        Returns:
            Parsed search results.

        Raises:
            SearchError: If the request to OpenSearch fails.
        """
        try:
            response = await self._client.search(
                index=index_name,
                body=os_query,
                **params,
            )

            duration_ms = (time.perf_counter() - start_time) * 1000
            return self._parse_response(query, response, duration_ms)

        except Exception as e:
            # Lazy %-style args avoid formatting cost when the level is disabled.
            logger.error("Search failed: %s", e)
            raise SearchError(
                message=f"Search failed: {e}",
                details={"query": query.text[:100], "index": index_name},
                cause=e,
            ) from e

    def _parse_response(
        self,
        query: SearchQuery,
        response: dict[str, Any],
        duration_ms: float,
    ) -> SearchResult:
        """Parse OpenSearch response into SearchResult.

        Args:
            query: Original query.
            response: OpenSearch response.
            duration_ms: Search duration in milliseconds.

        Returns:
            Parsed search result.
        """
        hits = response.get("hits", {})
        total = hits.get("total", {})
        # "total" is a dict ({"value": N, "relation": ...}) on modern clusters
        # but may be a bare int with rest_total_hits_as_int.
        total_hits = total.get("value", 0) if isinstance(total, dict) else total
        max_score = hits.get("max_score")

        items: list[SearchResultItem] = []
        search_after_token = None

        for hit in hits.get("hits", []):
            source = hit.get("_source", {})

            # Extract highlights (content snippets and an optional title variant).
            highlights = None
            highlighted_title = None
            if "highlight" in hit:
                highlight_data = hit["highlight"]
                highlights = highlight_data.get("content", [])
                title_highlights = highlight_data.get("title", [])
                if title_highlights:
                    highlighted_title = title_highlights[0]

            item = SearchResultItem(
                doc_id=hit.get("_id", ""),
                content=source.get("content", ""),
                score=hit.get("_score", 0.0),
                title=source.get("title"),
                url=source.get("url"),
                source=source.get("source"),
                collection_id=source.get("collection_id"),
                highlights=highlights,
                highlighted_title=highlighted_title,
                metadata=source.get("metadata"),
                chunk_index=source.get("chunk_index"),
                total_chunks=source.get("total_chunks"),
                explanation=hit.get("_explanation") if query.explain else None,
            )
            items.append(item)

            # Track search_after for cursor pagination: the last hit's sort
            # values become the next page's cursor.
            if "sort" in hit:
                search_after_token = hit["sort"]

        return SearchResult(
            query=query.text,
            mode=query.mode,
            items=items,
            total_hits=total_hits,
            duration_ms=duration_ms,
            max_score=max_score,
            search_after_token=search_after_token,
            has_more=len(items) == query.limit and total_hits > query.offset + len(items),
        )
|