gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
```diff
@@ -2,6 +2,10 @@
 
 Uses OpenSearch neural search - embeddings are generated automatically
 by the deployed ML model. No Python-side embedding generation needed.
+
+Note: This module is tenant-agnostic. Multi-tenancy should be handled
+at the API layer by using separate indices per account (e.g.,
+`knowledge-{account_id}`) rather than filtering by account_id.
 """
 
 from __future__ import annotations
```
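The new docstring note pushes tenant isolation into index naming rather than query filters. A minimal sketch of what that convention could look like at the API layer; the helper name, the validation rule, and the `knowledge` prefix default are illustrative assumptions, not part of this package:

```python
import re

def index_for_account(account_id: str, prefix: str = "knowledge") -> str:
    """Hypothetical helper: map an account to its own index, following the
    `knowledge-{account_id}` convention suggested by the docstring."""
    # OpenSearch index names must be lowercase; reject unexpected input
    # outright so two tenants can never collapse into one index.
    if not re.fullmatch(r"[a-z0-9][a-z0-9-]*", account_id):
        raise ValueError(f"invalid account id: {account_id!r}")
    return f"{prefix}-{account_id}"

# index_for_account("acme-42") -> "knowledge-acme-42"
```

One consequence of this design: because isolation lives in the index name, a query that forgets a filter can never leak across tenants the way a forgotten `account_id` term filter could.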
```diff
@@ -296,12 +300,25 @@ class OpenSearchKnowledgeSearcher:
             Parsed search results.
         """
         try:
+            logger.debug(f"OpenSearch query: {os_query}")
+            logger.debug(f"Query include_highlights: {query.include_highlights}")
+
             response = await self._client.search(
                 index=index_name,
                 body=os_query,
                 **params,
             )
 
+            # Debug: Log first hit to see if highlights are present
+            hits = response.get("hits", {}).get("hits", [])
+            if hits:
+                first_hit = hits[0]
+                logger.debug(f"First hit keys: {first_hit.keys()}")
+                if "highlight" in first_hit:
+                    logger.debug(f"Highlight data: {first_hit['highlight']}")
+                else:
+                    logger.debug("No 'highlight' key in response hit")
+
             duration_ms = (time.perf_counter() - start_time) * 1000
             return self._parse_response(query, response, duration_ms)
 
```
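This hunk only adds tracing: it logs the outgoing query and whether each hit carries a `highlight` key, which OpenSearch includes on a hit only when the request body asks for fragments. For reference, a minimal sketch of such a body; the `content` field name is an assumption based on the `content_embedding` exclusion elsewhere in this diff:

```python
# A query body that requests highlight fragments for a `content` field.
# Hits in the response then carry the "highlight" key that the new
# debug logging checks for.
os_query = {
    "query": {"match": {"content": "neural search"}},
    "highlight": {
        "pre_tags": ["<em>"],       # tag wrapped around each match
        "post_tags": ["</em>"],
        "fields": {"content": {}},  # per-field options; empty = defaults
    },
}
```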
```diff
@@ -381,3 +398,224 @@ class OpenSearchKnowledgeSearcher:
             search_after_token=search_after_token,
             has_more=len(items) == query.limit and total_hits > query.offset + len(items),
         )
+
+    async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
+        """Get unique collections with document counts via aggregation.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist")
+                return []
+
+            # Aggregation query for unique collection_ids with counts
+            # Also aggregate collection_name for display
+            query = {
+                "size": 0,
+                "aggs": {
+                    "collections": {
+                        "terms": {
+                            "field": "collection_id",
+                            "size": 1000,  # Max collections to return
+                        },
+                        "aggs": {
+                            "collection_name": {
+                                "terms": {
+                                    "field": "collection_name",
+                                    "size": 1,
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            response = await self._client.search(index=index_name, body=query)
+
+            collections = []
+            buckets = response.get("aggregations", {}).get("collections", {}).get("buckets", [])
+
+            for bucket in buckets:
+                collection_id = bucket.get("key")
+                if not collection_id:
+                    continue
+
+                doc_count = bucket.get("doc_count", 0)
+
+                # Get collection name from nested agg or use ID as fallback
+                name_buckets = bucket.get("collection_name", {}).get("buckets", [])
+                collection_name = name_buckets[0].get("key") if name_buckets else collection_id
+
+                collections.append({
+                    "id": collection_id,
+                    "name": collection_name or collection_id,
+                    "document_count": doc_count,
+                })
+
+            logger.debug(f"Found {len(collections)} collections in {index_name}")
+            return collections
+
+        except Exception as e:
+            logger.error(f"Failed to get collections from {index_name}: {e}")
+            return []
+
```
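`get_collections` nests a single-bucket `collection_name` terms aggregation under each `collection_id` bucket to recover a display name, and caps the outer aggregation at 1,000 buckets, so accounts with more collections are silently truncated. A usage sketch, assuming `searcher` is an already-configured `OpenSearchKnowledgeSearcher`:

```python
import asyncio

async def show_collections(searcher) -> None:
    # Each entry has "id", "name", and "document_count" keys.
    for c in await searcher.get_collections("knowledge-acme-42"):
        print(f"{c['id']}: {c['name']} ({c['document_count']} docs)")

# asyncio.run(show_collections(searcher))
```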
```diff
+    async def get_stats(self, index_name: str) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            Dictionary with document_count and index info.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                return {
+                    "document_count": 0,
+                    "index_name": index_name,
+                    "exists": False,
+                }
+
+            # Get index stats
+            stats = await self._client.indices.stats(index=index_name)
+            index_stats = stats.get("indices", {}).get(index_name, {})
+            primaries = index_stats.get("primaries", {})
+            docs = primaries.get("docs", {})
+
+            return {
+                "document_count": docs.get("count", 0),
+                "index_name": index_name,
+                "exists": True,
+                "size_bytes": primaries.get("store", {}).get("size_in_bytes", 0),
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to get stats for {index_name}: {e}")
+            return {
+                "document_count": 0,
+                "index_name": index_name,
+                "error": str(e),
+            }
+
```
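`get_stats` reads the `primaries` section of the stats response, so replica shards are not double-counted in either the document count or `size_bytes`. A usage sketch under the same `searcher` assumption; note the three possible shapes of the return value (exists, missing index, error):

```python
async def show_stats(searcher) -> None:
    stats = await searcher.get_stats("knowledge-acme-42")
    if stats.get("exists") and "error" not in stats:
        size_mib = stats["size_bytes"] / 1_048_576
        print(f"{stats['index_name']}: {stats['document_count']} docs, {size_mib:.1f} MiB")
    else:
        # Covers both the missing-index and the error dictionaries.
        print(f"{stats['index_name']}: unavailable ({stats.get('error', 'index missing')})")
```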
```diff
+    async def list_documents(
+        self,
+        index_name: str,
+        *,
+        source_id: str | None = None,
+        collection_id: str | None = None,
+        limit: int = 50,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """List documents with optional filters.
+
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
+        Args:
+            index_name: Index to query (use tenant-specific name for isolation).
+            source_id: Optional source ID filter.
+            collection_id: Optional collection ID filter.
+            limit: Maximum documents to return.
+            offset: Number of documents to skip.
+
+        Returns:
+            Dictionary with documents, total, limit, offset.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist")
+                return {
+                    "documents": [],
+                    "total": 0,
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+            # Build filter clauses
+            filters: list[dict[str, Any]] = []
+
+            if source_id:
+                filters.append({"term": {"source_id": source_id}})
+
+            if collection_id:
+                filters.append({"term": {"collection_id": collection_id}})
+
+            # Build query with match_all and filters
+            query: dict[str, Any] = {
+                "size": limit,
+                "from": offset,
+                "sort": [
+                    {"created_at": {"order": "desc", "unmapped_type": "date"}},
+                    {"_id": {"order": "asc"}},
+                ],
+                "_source": {
+                    "excludes": ["content_embedding"],
+                },
+            }
+
+            if filters:
+                query["query"] = {
+                    "bool": {
+                        "must": [{"match_all": {}}],
+                        "filter": filters,
+                    }
+                }
+            else:
+                query["query"] = {"match_all": {}}
+
+            response = await self._client.search(index=index_name, body=query)
+
+            hits = response.get("hits", {})
+            total = hits.get("total", {})
+            total_hits = total.get("value", 0) if isinstance(total, dict) else total
+
+            documents = []
+            for hit in hits.get("hits", []):
+                source = hit.get("_source", {})
+                doc = {
+                    "id": hit.get("_id", ""),
+                    "title": source.get("title"),
+                    "url": source.get("url"),
+                    "content_preview": (source.get("content", ""))[:200],
+                    "content": source.get("content"),
+                    "chunk_index": source.get("chunk_index"),
+                    "total_chunks": source.get("total_chunks"),
+                    "source_id": source.get("source_id"),
+                    "collection_id": source.get("collection_id"),
+                    "created_at": source.get("created_at"),
+                    "metadata": source.get("metadata"),
+                }
+                documents.append(doc)
+
+            logger.debug(
+                f"Listed {len(documents)} documents from {index_name} "
+                f"(total: {total_hits}, source_id: {source_id})"
+            )
+
+            return {
+                "documents": documents,
+                "total": total_hits,
+                "limit": limit,
+                "offset": offset,
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to list documents from {index_name}: {e}")
+            return {
+                "documents": [],
+                "total": 0,
+                "limit": limit,
+                "offset": offset,
+            }
```
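`list_documents` uses plain `from`/`size` pagination with a deterministic `created_at` plus `_id` sort, and strips the `content_embedding` vector from `_source` to keep payloads small. Because OpenSearch rejects `from + size` beyond `index.max_result_window` (10,000 by default), this suits shallow listings. A paging sketch, again assuming a configured `searcher`:

```python
async def iter_documents(searcher, index_name: str, page_size: int = 50):
    """Async generator that walks every page exposed by list_documents."""
    offset = 0
    while True:
        page = await searcher.list_documents(index_name, limit=page_size, offset=offset)
        if not page["documents"]:
            return
        for doc in page["documents"]:
            yield doc
        offset += len(page["documents"])
        if offset >= page["total"]:
            return

# async for doc in iter_documents(searcher, "knowledge-acme-42"): ...
```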