gnosisllm-knowledge 0.4.0__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/PKG-INFO +1 -1
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/pyproject.toml +1 -1
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/knowledge.py +8 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/memory/searcher.py +24 -3
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/searcher.py +63 -2
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/searcher.py +20 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/indexing.py +16 -1
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/search.py +7 -10
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/streaming_pipeline.py +6 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/README.md +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/memory.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/memory/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/memory/indexer.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/agentic.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/indexer.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/mappings.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/memory/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/memory/client.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/memory/config.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/memory/setup.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/opensearch/queries.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/fixed.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/sentence.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/app.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/agentic.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/load.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/memory.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/search.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/setup.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/display/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/display/service.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/utils/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/utils/config.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/discovery.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/document.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/memory.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/result.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/search.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/domain/source.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/events/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/events/emitter.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/events/types.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/exceptions.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/agentic.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/chunker.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/fetcher.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/indexer.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/loader.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/memory.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/setup.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/interfaces/streaming.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/streaming/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/streaming/pipeline.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/config.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/http.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/neoreader.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/neoreader_discovery.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/__init__.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/base.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/discovery.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/discovery_streaming.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/factory.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/sitemap.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/sitemap_streaming.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/website.py +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/py.typed +0 -0
- {gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gnosisllm-knowledge
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Enterprise-grade knowledge loading, indexing, and search for Python
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: knowledge-base,rag,semantic-search,vector-search,opensearch,llm,embeddings,enterprise
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/knowledge.py
RENAMED
|
@@ -119,6 +119,7 @@ class Knowledge:
|
|
|
119
119
|
loader_factory: LoaderFactory | None = None,
|
|
120
120
|
default_index: str | None = None,
|
|
121
121
|
events: EventEmitter | None = None,
|
|
122
|
+
config: OpenSearchConfig | None = None,
|
|
122
123
|
) -> None:
|
|
123
124
|
"""Initialize Knowledge with components.
|
|
124
125
|
|
|
@@ -131,6 +132,7 @@ class Knowledge:
|
|
|
131
132
|
loader_factory: Optional loader factory.
|
|
132
133
|
default_index: Default index name.
|
|
133
134
|
events: Optional event emitter.
|
|
135
|
+
config: Optional OpenSearch config for settings like batch sizes.
|
|
134
136
|
|
|
135
137
|
Note:
|
|
136
138
|
Embeddings are generated automatically by OpenSearch ingest pipeline.
|
|
@@ -144,6 +146,7 @@ class Knowledge:
|
|
|
144
146
|
self._loader_factory = loader_factory
|
|
145
147
|
self._default_index = default_index
|
|
146
148
|
self._events = events or EventEmitter()
|
|
149
|
+
self._config = config
|
|
147
150
|
|
|
148
151
|
# Initialize services lazily
|
|
149
152
|
self._indexing_service: KnowledgeIndexingService | None = None
|
|
@@ -262,6 +265,7 @@ class Knowledge:
|
|
|
262
265
|
fetcher=fetcher,
|
|
263
266
|
loader_factory=loader_factory,
|
|
264
267
|
default_index=config.knowledge_index_name,
|
|
268
|
+
config=config,
|
|
265
269
|
)
|
|
266
270
|
|
|
267
271
|
@classmethod
|
|
@@ -431,11 +435,15 @@ class Knowledge:
|
|
|
431
435
|
events=self._events,
|
|
432
436
|
)
|
|
433
437
|
|
|
438
|
+
# Get batch size from config or use default
|
|
439
|
+
batch_size = self._config.indexing_batch_size if self._config else 10
|
|
440
|
+
|
|
434
441
|
return await service.load_and_index(
|
|
435
442
|
source=source,
|
|
436
443
|
index_name=index,
|
|
437
444
|
collection_id=collection_id,
|
|
438
445
|
source_id=source_id,
|
|
446
|
+
batch_size=batch_size,
|
|
439
447
|
**options,
|
|
440
448
|
)
|
|
441
449
|
|
|
@@ -395,16 +395,37 @@ class MemorySearcher:
|
|
|
395
395
|
"offset": offset,
|
|
396
396
|
}
|
|
397
397
|
|
|
398
|
-
def count(
|
|
399
|
-
|
|
398
|
+
async def count(
|
|
399
|
+
self,
|
|
400
|
+
index_name: str,
|
|
401
|
+
collection_id: str | None = None,
|
|
402
|
+
source_id: str | None = None,
|
|
403
|
+
) -> int:
|
|
404
|
+
"""Count documents in index with optional filters.
|
|
400
405
|
|
|
401
406
|
Args:
|
|
402
407
|
index_name: Index to count.
|
|
408
|
+
collection_id: Filter by collection.
|
|
409
|
+
source_id: Filter by source.
|
|
403
410
|
|
|
404
411
|
Returns:
|
|
405
412
|
Document count.
|
|
406
413
|
"""
|
|
407
|
-
|
|
414
|
+
# Use efficient O(1) count when no filters
|
|
415
|
+
if not collection_id and not source_id:
|
|
416
|
+
return self._indexer.count(index_name)
|
|
417
|
+
|
|
418
|
+
# With filters, iterate over index values (memory backend is for testing only)
|
|
419
|
+
index_data = self._indexer._indices.get(index_name, {})
|
|
420
|
+
count = 0
|
|
421
|
+
for doc in index_data.values():
|
|
422
|
+
if collection_id and doc.get("collection_id") != collection_id:
|
|
423
|
+
continue
|
|
424
|
+
if source_id and doc.get("source_id") != source_id:
|
|
425
|
+
continue
|
|
426
|
+
count += 1
|
|
427
|
+
|
|
428
|
+
return count
|
|
408
429
|
|
|
409
430
|
async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
|
|
410
431
|
"""Get unique collections with document counts.
|
|
@@ -109,6 +109,11 @@ class OpenSearchConfig:
|
|
|
109
109
|
bulk_batch_size: int = 500
|
|
110
110
|
bulk_max_concurrent: int = 3
|
|
111
111
|
|
|
112
|
+
# === Indexing Service ===
|
|
113
|
+
# Batch size for progressive indexing during load operations
|
|
114
|
+
# Documents are indexed in batches of this size as they stream in
|
|
115
|
+
indexing_batch_size: int = 10
|
|
116
|
+
|
|
112
117
|
@property
|
|
113
118
|
def url(self) -> str:
|
|
114
119
|
"""Get the full OpenSearch URL."""
|
|
@@ -213,4 +218,6 @@ class OpenSearchConfig:
|
|
|
213
218
|
# === Bulk Indexing ===
|
|
214
219
|
bulk_batch_size=int(os.getenv("OPENSEARCH_BULK_BATCH_SIZE", "500")),
|
|
215
220
|
bulk_max_concurrent=int(os.getenv("OPENSEARCH_BULK_MAX_CONCURRENT", "3")),
|
|
221
|
+
# === Indexing Service ===
|
|
222
|
+
indexing_batch_size=int(os.getenv("GNOSISLLM_INDEXING_BATCH_SIZE", "10")),
|
|
216
223
|
)
|
|
@@ -236,12 +236,14 @@ class OpenSearchKnowledgeSearcher:
|
|
|
236
236
|
self,
|
|
237
237
|
queries: list[SearchQuery],
|
|
238
238
|
index_name: str,
|
|
239
|
+
**options: Any,
|
|
239
240
|
) -> list[SearchResult]:
|
|
240
241
|
"""Execute multiple searches in a single request.
|
|
241
242
|
|
|
242
243
|
Args:
|
|
243
244
|
queries: List of search queries.
|
|
244
245
|
index_name: Index to search.
|
|
246
|
+
**options: Additional options (text_field, embedding_field, model_id).
|
|
245
247
|
|
|
246
248
|
Returns:
|
|
247
249
|
List of search results.
|
|
@@ -252,11 +254,15 @@ class OpenSearchKnowledgeSearcher:
|
|
|
252
254
|
start_time = time.perf_counter()
|
|
253
255
|
msearch_body = []
|
|
254
256
|
|
|
257
|
+
# Allow overriding embedding field and model_id via options
|
|
258
|
+
embedding_field = options.get("embedding_field", self._config.embedding_field)
|
|
259
|
+
model_id = options.get("model_id", self._config.model_id)
|
|
260
|
+
|
|
255
261
|
for query in queries:
|
|
256
262
|
builder = QueryBuilder(
|
|
257
263
|
query,
|
|
258
|
-
model_id=self._config.model_id,
|
|
259
|
-
embedding_field=self._config.embedding_field,
|
|
264
|
+
model_id=model_id,
|
|
265
|
+
embedding_field=embedding_field,
|
|
260
266
|
)
|
|
261
267
|
|
|
262
268
|
if query.mode == SearchMode.SEMANTIC:
|
|
@@ -506,6 +512,61 @@ class OpenSearchKnowledgeSearcher:
|
|
|
506
512
|
"error": str(e),
|
|
507
513
|
}
|
|
508
514
|
|
|
515
|
+
async def count(
|
|
516
|
+
self,
|
|
517
|
+
index_name: str,
|
|
518
|
+
collection_id: str | None = None,
|
|
519
|
+
source_id: str | None = None,
|
|
520
|
+
) -> int:
|
|
521
|
+
"""Count documents in index with optional filters.
|
|
522
|
+
|
|
523
|
+
Uses native _count API instead of search for efficiency and to avoid
|
|
524
|
+
hybrid search issues with empty queries.
|
|
525
|
+
|
|
526
|
+
Args:
|
|
527
|
+
index_name: Index to query.
|
|
528
|
+
collection_id: Filter by collection.
|
|
529
|
+
source_id: Filter by source.
|
|
530
|
+
|
|
531
|
+
Returns:
|
|
532
|
+
Document count.
|
|
533
|
+
"""
|
|
534
|
+
try:
|
|
535
|
+
# Check if index exists first
|
|
536
|
+
exists = await self._client.indices.exists(index=index_name)
|
|
537
|
+
if not exists:
|
|
538
|
+
logger.debug(f"Index {index_name} does not exist, returning count 0")
|
|
539
|
+
return 0
|
|
540
|
+
|
|
541
|
+
# Build query with optional filters
|
|
542
|
+
query: dict[str, Any] = {"match_all": {}}
|
|
543
|
+
|
|
544
|
+
filters = []
|
|
545
|
+
if collection_id:
|
|
546
|
+
filters.append({"term": {"collection_id": collection_id}})
|
|
547
|
+
if source_id:
|
|
548
|
+
filters.append({"term": {"source_id": source_id}})
|
|
549
|
+
|
|
550
|
+
if filters:
|
|
551
|
+
query = {"bool": {"filter": filters}}
|
|
552
|
+
|
|
553
|
+
# Use native _count API
|
|
554
|
+
response = await self._client.count(
|
|
555
|
+
index=index_name,
|
|
556
|
+
body={"query": query},
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
count = response.get("count", 0)
|
|
560
|
+
logger.debug(f"Count for {index_name}: {count} (collection={collection_id}, source={source_id})")
|
|
561
|
+
return count
|
|
562
|
+
|
|
563
|
+
except Exception as e:
|
|
564
|
+
logger.error(f"Failed to count documents in {index_name}: {e}")
|
|
565
|
+
raise SearchError(
|
|
566
|
+
message=f"Count failed: {e}",
|
|
567
|
+
details={"index": index_name, "collection_id": collection_id, "source_id": source_id},
|
|
568
|
+
) from e
|
|
569
|
+
|
|
509
570
|
async def list_documents(
|
|
510
571
|
self,
|
|
511
572
|
index_name: str,
|
|
@@ -249,10 +249,9 @@ class OpenSearchSetupAdapter:
|
|
|
249
249
|
self._model_id = self._config.model_id
|
|
250
250
|
|
|
251
251
|
# Step 4: Create ingest pipeline
|
|
252
|
-
#
|
|
253
|
-
#
|
|
254
|
-
|
|
255
|
-
if self._model_id and is_global_setup:
|
|
252
|
+
# Create pipeline for any setup that has a model deployed
|
|
253
|
+
# Each index_prefix namespace gets its own pipeline
|
|
254
|
+
if self._model_id:
|
|
256
255
|
try:
|
|
257
256
|
await self._create_ingest_pipeline()
|
|
258
257
|
pipeline_name = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
|
|
@@ -261,35 +260,33 @@ class OpenSearchSetupAdapter:
|
|
|
261
260
|
errors.append(f"Failed to create ingest pipeline: {e}")
|
|
262
261
|
logger.error(f"Failed to create ingest pipeline: {e}")
|
|
263
262
|
|
|
264
|
-
# Step 5: Create search pipeline
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
logger.error(f"Failed to create search pipeline: {e}")
|
|
263
|
+
# Step 5: Create search pipeline for hybrid search
|
|
264
|
+
try:
|
|
265
|
+
await self._create_search_pipeline()
|
|
266
|
+
pipeline_name = self._config.search_pipeline_name or f"{self._config.index_prefix}-search-pipeline"
|
|
267
|
+
steps_completed.append(f"Created search pipeline: {pipeline_name}")
|
|
268
|
+
except Exception as e:
|
|
269
|
+
errors.append(f"Failed to create search pipeline: {e}")
|
|
270
|
+
logger.error(f"Failed to create search pipeline: {e}")
|
|
273
271
|
|
|
274
|
-
# Step 6: Create index template
|
|
275
|
-
# Template covers all
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
template_body = get_index_template(self._config)
|
|
272
|
+
# Step 6: Create index template for this namespace
|
|
273
|
+
# Template covers all {index_prefix}-* indices
|
|
274
|
+
try:
|
|
275
|
+
template_name = f"{self._config.index_prefix}-template"
|
|
276
|
+
template_body = get_index_template(self._config)
|
|
280
277
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
278
|
+
# Set default pipeline for auto-index creation within this namespace
|
|
279
|
+
default_pipeline = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
|
|
280
|
+
template_body["template"]["settings"]["index"]["default_pipeline"] = default_pipeline
|
|
284
281
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
282
|
+
await self._client.indices.put_index_template(
|
|
283
|
+
name=template_name,
|
|
284
|
+
body=template_body,
|
|
285
|
+
)
|
|
286
|
+
steps_completed.append(f"Created index template: {template_name}")
|
|
287
|
+
except Exception as e:
|
|
288
|
+
errors.append(f"Failed to create index template: {e}")
|
|
289
|
+
logger.error(f"Failed to create index template: {e}")
|
|
293
290
|
|
|
294
291
|
# Step 7: Create knowledge index
|
|
295
292
|
try:
|
|
@@ -298,9 +295,8 @@ class OpenSearchSetupAdapter:
|
|
|
298
295
|
|
|
299
296
|
if not exists:
|
|
300
297
|
settings = get_knowledge_index_settings(self._config)
|
|
301
|
-
# Add default pipeline
|
|
302
|
-
|
|
303
|
-
pipeline_name = self._config.ingest_pipeline_name or "gnosisllm-ingest-pipeline"
|
|
298
|
+
# Add default pipeline for this namespace
|
|
299
|
+
pipeline_name = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
|
|
304
300
|
settings["index"]["default_pipeline"] = pipeline_name
|
|
305
301
|
|
|
306
302
|
await self._client.indices.create(
|
|
@@ -185,3 +185,23 @@ class IKnowledgeSearcher(Protocol):
|
|
|
185
185
|
List of SearchResults in same order as queries.
|
|
186
186
|
"""
|
|
187
187
|
...
|
|
188
|
+
|
|
189
|
+
async def count(
|
|
190
|
+
self,
|
|
191
|
+
index_name: str,
|
|
192
|
+
collection_id: str | None = None,
|
|
193
|
+
source_id: str | None = None,
|
|
194
|
+
) -> int:
|
|
195
|
+
"""Count documents in index with optional filters.
|
|
196
|
+
|
|
197
|
+
Uses native count API instead of search for efficiency.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
index_name: Target index name.
|
|
201
|
+
collection_id: Filter by collection.
|
|
202
|
+
source_id: Filter by source.
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
Document count.
|
|
206
|
+
"""
|
|
207
|
+
...
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/indexing.py
RENAMED
|
@@ -119,6 +119,16 @@ class KnowledgeIndexingService:
|
|
|
119
119
|
source_id = source_id or str(uuid.uuid4())
|
|
120
120
|
document_defaults = options.pop("document_defaults", {})
|
|
121
121
|
|
|
122
|
+
# Extract metadata from document_defaults to merge with doc.metadata later
|
|
123
|
+
# This allows callers to pass custom metadata (e.g., parent_collection_id)
|
|
124
|
+
# without conflicting with the explicit metadata= parameter
|
|
125
|
+
extra_metadata = document_defaults.pop("metadata", {})
|
|
126
|
+
|
|
127
|
+
# Ensure index exists with correct mappings before indexing
|
|
128
|
+
# This prevents OpenSearch from auto-creating the index with dynamic mapping
|
|
129
|
+
# which would map keyword fields (like collection_id) as text fields
|
|
130
|
+
await self._indexer.ensure_index(index_name)
|
|
131
|
+
|
|
122
132
|
# Emit batch started event
|
|
123
133
|
await self._events.emit_async(
|
|
124
134
|
BatchStartedEvent(
|
|
@@ -139,6 +149,8 @@ class KnowledgeIndexingService:
|
|
|
139
149
|
# Note: Loader already chunks content, so we don't re-chunk here
|
|
140
150
|
async for doc in self._loader.load_streaming(source, **options):
|
|
141
151
|
# Enrich document with collection info
|
|
152
|
+
# Merge doc.metadata with extra_metadata from document_defaults
|
|
153
|
+
merged_metadata = {**doc.metadata, **extra_metadata}
|
|
142
154
|
enriched_doc = Document(
|
|
143
155
|
content=doc.content,
|
|
144
156
|
source=source,
|
|
@@ -151,7 +163,7 @@ class KnowledgeIndexingService:
|
|
|
151
163
|
total_chunks=doc.total_chunks,
|
|
152
164
|
parent_doc_id=doc.parent_doc_id,
|
|
153
165
|
status=DocumentStatus.INDEXED,
|
|
154
|
-
metadata=doc.metadata,
|
|
166
|
+
metadata=merged_metadata,
|
|
155
167
|
**document_defaults,
|
|
156
168
|
)
|
|
157
169
|
|
|
@@ -223,6 +235,9 @@ class KnowledgeIndexingService:
|
|
|
223
235
|
Returns:
|
|
224
236
|
Index result.
|
|
225
237
|
"""
|
|
238
|
+
# Ensure index exists with correct mappings before indexing
|
|
239
|
+
await self._indexer.ensure_index(index_name)
|
|
240
|
+
|
|
226
241
|
total_indexed = 0
|
|
227
242
|
total_failed = 0
|
|
228
243
|
errors: list[str] = []
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/search.py
RENAMED
|
@@ -328,6 +328,9 @@ class KnowledgeSearchService:
|
|
|
328
328
|
) -> int:
|
|
329
329
|
"""Count documents in index.
|
|
330
330
|
|
|
331
|
+
Uses native count API instead of search for efficiency and to avoid
|
|
332
|
+
hybrid search issues with empty queries.
|
|
333
|
+
|
|
331
334
|
Note:
|
|
332
335
|
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
333
336
|
at the API layer by using separate indices per account.
|
|
@@ -344,18 +347,12 @@ class KnowledgeSearchService:
|
|
|
344
347
|
if not index:
|
|
345
348
|
raise SearchError(message="No index specified")
|
|
346
349
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
collection_ids=[collection_id] if collection_id else None,
|
|
352
|
-
source_ids=[source_id] if source_id else None,
|
|
350
|
+
return await self._searcher.count(
|
|
351
|
+
index_name=index,
|
|
352
|
+
collection_id=collection_id,
|
|
353
|
+
source_id=source_id,
|
|
353
354
|
)
|
|
354
355
|
|
|
355
|
-
# Use a simple match_all to get total count
|
|
356
|
-
result = await self._searcher.search(query, index)
|
|
357
|
-
return result.total_hits
|
|
358
|
-
|
|
359
356
|
async def get_collections(
|
|
360
357
|
self,
|
|
361
358
|
index_name: str | None = None,
|
|
@@ -173,6 +173,12 @@ class StreamingIndexingPipeline:
|
|
|
173
173
|
DeprecationWarning,
|
|
174
174
|
stacklevel=2,
|
|
175
175
|
)
|
|
176
|
+
|
|
177
|
+
# Ensure index exists with correct mappings before indexing
|
|
178
|
+
# This prevents OpenSearch from auto-creating the index with dynamic mapping
|
|
179
|
+
# which would map keyword fields (like collection_id) as text fields
|
|
180
|
+
await self._indexer.ensure_index(index_name)
|
|
181
|
+
|
|
176
182
|
start_time = time.time()
|
|
177
183
|
self._progress = StreamingProgress(current_phase="starting")
|
|
178
184
|
await self._emit_progress()
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/__init__.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/api/memory.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/backends/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/__init__.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/fixed.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/chunking/sentence.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/commands/load.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/cli/utils/config.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/events/types.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/core/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/__init__.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/config.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/fetchers/http.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/__init__.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/base.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/discovery.py
RENAMED
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/factory.py
RENAMED
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/sitemap.py
RENAMED
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/loaders/website.py
RENAMED
|
File without changes
|
|
File without changes
|
{gnosisllm_knowledge-0.4.0 → gnosisllm_knowledge-0.4.4}/src/gnosisllm_knowledge/services/__init__.py
RENAMED
|
File without changes
|