gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,383 @@
1
+ """OpenSearch knowledge searcher implementation.
2
+
3
+ Uses OpenSearch neural search - embeddings are generated automatically
4
+ by the deployed ML model. No Python-side embedding generation needed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import time
11
+ from typing import TYPE_CHECKING, Any
12
+
13
+ from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
14
+ from gnosisllm_knowledge.backends.opensearch.queries import QueryBuilder
15
+ from gnosisllm_knowledge.core.domain.search import (
16
+ SearchMode,
17
+ SearchQuery,
18
+ SearchResult,
19
+ SearchResultItem,
20
+ )
21
+ from gnosisllm_knowledge.core.exceptions import SearchError
22
+
23
+ if TYPE_CHECKING:
24
+ from opensearchpy import AsyncOpenSearch
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class OpenSearchKnowledgeSearcher:
    """OpenSearch knowledge searcher.

    Implements the IKnowledgeSearcher protocol for semantic, keyword,
    and hybrid search over knowledge documents.

    Uses OpenSearch neural search for semantic queries - the deployed
    ML model handles embedding generation automatically; no Python-side
    embedding generation is performed.

    Example:
        ```python
        config = OpenSearchConfig.from_env()
        client = AsyncOpenSearch(hosts=[config.url])
        searcher = OpenSearchKnowledgeSearcher(client, config)

        query = SearchQuery(text="how to configure", mode=SearchMode.HYBRID)
        results = await searcher.search(query, "my-index")
        ```
    """

    def __init__(
        self,
        client: AsyncOpenSearch,
        config: OpenSearchConfig,
    ) -> None:
        """Initialize the searcher.

        Args:
            client: OpenSearch async client.
            config: OpenSearch configuration (includes model_id).
        """
        self._client = client
        self._config = config

    async def search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute search query using the specified mode.

        Dispatches to the mode-specific method; any unrecognized mode
        falls back to hybrid search.

        Args:
            query: Search query with mode and parameters.
            index_name: Index to search.
            **options: Additional search options.

        Returns:
            Search results.
        """
        mode = query.mode

        if mode == SearchMode.SEMANTIC:
            return await self.semantic_search(query, index_name, **options)
        if mode == SearchMode.KEYWORD:
            return await self.keyword_search(query, index_name, **options)
        # HYBRID, or any unknown mode: default to hybrid.
        return await self.hybrid_search(query, index_name, **options)

    async def semantic_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute semantic (neural) search.

        OpenSearch handles embedding generation via the deployed model.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: Additional options (``model_id`` overrides config).

        Returns:
            Search results.

        Raises:
            SearchError: If no model_id is available, or the search fails.
        """
        start_time = time.perf_counter()

        model_id = options.get("model_id", self._config.model_id)
        if not model_id:
            raise SearchError(
                message="model_id required for semantic search",
                details={"query": query.text[:100]},
            )

        # Build and execute the neural query; the model embeds the text
        # server-side against the configured embedding field.
        builder = QueryBuilder(
            query,
            model_id=model_id,
            embedding_field=self._config.embedding_field,
        )
        os_query = builder.build_semantic_query()

        return await self._execute_search(
            query, os_query, index_name, start_time
        )

    async def keyword_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute keyword (BM25) search.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: Additional options (unused; accepted for interface
                symmetry with the other search methods).

        Returns:
            Search results.

        Raises:
            SearchError: If the search fails.
        """
        start_time = time.perf_counter()

        builder = QueryBuilder(query)
        os_query = builder.build_keyword_query()

        return await self._execute_search(
            query, os_query, index_name, start_time
        )

    async def hybrid_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Execute hybrid search (semantic + keyword).

        Uses OpenSearch hybrid query with a search pipeline for
        score normalization, when one is configured.

        Args:
            query: Search query.
            index_name: Index to search.
            **options: semantic_weight, keyword_weight, model_id.

        Returns:
            Search results.

        Raises:
            SearchError: If no model_id is available, or the search fails.
        """
        start_time = time.perf_counter()

        model_id = options.get("model_id", self._config.model_id)
        # Fix: the hybrid query contains a neural clause, so it needs a
        # model_id just like semantic search. Previously a missing model
        # surfaced as an opaque backend error from OpenSearch.
        if not model_id:
            raise SearchError(
                message="model_id required for hybrid search",
                details={"query": query.text[:100]},
            )

        # Relative weights for the semantic vs. keyword sub-queries.
        semantic_weight = options.get("semantic_weight", 0.7)
        keyword_weight = options.get("keyword_weight", 0.3)

        builder = QueryBuilder(
            query,
            model_id=model_id,
            embedding_field=self._config.embedding_field,
        )
        os_query = builder.build_hybrid_query(
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight,
        )

        # Route through the score-normalization pipeline if configured.
        search_params: dict[str, Any] = {}
        if self._config.search_pipeline_name:
            search_params["search_pipeline"] = self._config.search_pipeline_name

        return await self._execute_search(
            query, os_query, index_name, start_time, **search_params
        )

    async def get_similar_documents(
        self,
        doc_id: str,
        index_name: str,
        limit: int = 10,
    ) -> SearchResult:
        """Find documents similar to a given document.

        Uses a more-like-this query seeded by the source document's ID.

        Args:
            doc_id: Document ID to find similar documents for.
            index_name: Index to search.
            limit: Maximum results.

        Returns:
            Search results with similar documents.

        Raises:
            SearchError: If the search fails.
        """
        start_time = time.perf_counter()

        # Empty text: the MLT query is driven by doc_id, not query text.
        query = SearchQuery(text="", limit=limit)
        builder = QueryBuilder(query)
        os_query = builder.build_more_like_this_query(doc_id)

        return await self._execute_search(
            query, os_query, index_name, start_time
        )

    async def multi_search(
        self,
        queries: list[SearchQuery],
        index_name: str,
    ) -> list[SearchResult]:
        """Execute multiple searches in a single msearch request.

        Args:
            queries: List of search queries.
            index_name: Index to search.

        Returns:
            List of search results, one per input query, in order.

        Raises:
            SearchError: If any individual search in the batch failed.
        """
        if not queries:
            return []

        start_time = time.perf_counter()
        msearch_body: list[dict[str, Any]] = []

        for query in queries:
            builder = QueryBuilder(
                query,
                model_id=self._config.model_id,
                embedding_field=self._config.embedding_field,
            )

            if query.mode == SearchMode.SEMANTIC:
                os_query = builder.build_semantic_query()
            elif query.mode == SearchMode.KEYWORD:
                os_query = builder.build_keyword_query()
            else:
                os_query = builder.build_hybrid_query()

            # msearch alternates header and body lines per sub-search.
            msearch_body.append({"index": index_name})
            msearch_body.append(os_query)

        response = await self._client.msearch(body=msearch_body)

        # Duration is loop-invariant: all sub-searches share one request.
        duration_ms = (time.perf_counter() - start_time) * 1000

        results: list[SearchResult] = []
        for i, resp in enumerate(response.get("responses", [])):
            # Fix: msearch reports per-item failures inline; previously an
            # errored item was silently parsed as an empty result set.
            if "error" in resp:
                raise SearchError(
                    message=f"Search failed: {resp['error']}",
                    details={
                        "query": queries[i].text[:100],
                        "index": index_name,
                    },
                )
            results.append(self._parse_response(queries[i], resp, duration_ms))

        return results

    async def _execute_search(
        self,
        query: SearchQuery,
        os_query: dict[str, Any],
        index_name: str,
        start_time: float,
        **params: Any,
    ) -> SearchResult:
        """Execute search and parse results.

        Args:
            query: Original search query.
            os_query: OpenSearch query dictionary.
            index_name: Index to search.
            start_time: Search start time (``time.perf_counter`` value).
            **params: Additional search parameters (e.g. search_pipeline).

        Returns:
            Parsed search results.

        Raises:
            SearchError: Wrapping any client/transport failure.
        """
        try:
            response = await self._client.search(
                index=index_name,
                body=os_query,
                **params,
            )
        except Exception as e:
            # logger.exception captures the traceback; lazy %-args avoid
            # formatting cost when the log level is disabled.
            logger.exception("Search failed on index %s", index_name)
            raise SearchError(
                message=f"Search failed: {e}",
                details={"query": query.text[:100], "index": index_name},
                cause=e,
            ) from e

        duration_ms = (time.perf_counter() - start_time) * 1000
        return self._parse_response(query, response, duration_ms)

    def _parse_response(
        self,
        query: SearchQuery,
        response: dict[str, Any],
        duration_ms: float,
    ) -> SearchResult:
        """Parse OpenSearch response into SearchResult.

        Args:
            query: Original query.
            response: OpenSearch response.
            duration_ms: Search duration in milliseconds.

        Returns:
            Parsed search result.
        """
        hits = response.get("hits", {})
        total = hits.get("total", {})
        # "total" is {"value": N, "relation": ...} on modern clusters but a
        # bare int on older ones; handle both.
        total_hits = total.get("value", 0) if isinstance(total, dict) else total
        max_score = hits.get("max_score")

        items: list[SearchResultItem] = []
        search_after_token = None

        for hit in hits.get("hits", []):
            source = hit.get("_source", {})

            # Extract highlights for content and title, when requested.
            highlights = None
            highlighted_title = None
            if "highlight" in hit:
                highlight_data = hit["highlight"]
                highlights = highlight_data.get("content", [])
                title_highlights = highlight_data.get("title", [])
                if title_highlights:
                    highlighted_title = title_highlights[0]

            item = SearchResultItem(
                doc_id=hit.get("_id", ""),
                content=source.get("content", ""),
                score=hit.get("_score", 0.0),
                title=source.get("title"),
                url=source.get("url"),
                source=source.get("source"),
                collection_id=source.get("collection_id"),
                highlights=highlights,
                highlighted_title=highlighted_title,
                metadata=source.get("metadata"),
                chunk_index=source.get("chunk_index"),
                total_chunks=source.get("total_chunks"),
                explanation=hit.get("_explanation") if query.explain else None,
            )
            items.append(item)

            # Track the last hit's sort values for search_after pagination.
            if "sort" in hit:
                search_after_token = hit["sort"]

        return SearchResult(
            query=query.text,
            mode=query.mode,
            items=items,
            total_hits=total_hits,
            duration_ms=duration_ms,
            max_score=max_score,
            search_after_token=search_after_token,
            has_more=len(items) == query.limit and total_hits > query.offset + len(items),
        )