gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,10 @@
2
2
 
3
3
  Uses OpenSearch neural search - embeddings are generated automatically
4
4
  by the deployed ML model. No Python-side embedding generation needed.
5
+
6
+ Note: This module is tenant-agnostic. Multi-tenancy should be handled
7
+ at the API layer by using separate indices per account (e.g.,
8
+ `knowledge-{account_id}`) rather than filtering by account_id.
5
9
  """
6
10
 
7
11
  from __future__ import annotations
@@ -296,12 +300,25 @@ class OpenSearchKnowledgeSearcher:
296
300
  Parsed search results.
297
301
  """
298
302
  try:
303
+ logger.debug(f"OpenSearch query: {os_query}")
304
+ logger.debug(f"Query include_highlights: {query.include_highlights}")
305
+
299
306
  response = await self._client.search(
300
307
  index=index_name,
301
308
  body=os_query,
302
309
  **params,
303
310
  )
304
311
 
312
+ # Debug: Log first hit to see if highlights are present
313
+ hits = response.get("hits", {}).get("hits", [])
314
+ if hits:
315
+ first_hit = hits[0]
316
+ logger.debug(f"First hit keys: {first_hit.keys()}")
317
+ if "highlight" in first_hit:
318
+ logger.debug(f"Highlight data: {first_hit['highlight']}")
319
+ else:
320
+ logger.debug("No 'highlight' key in response hit")
321
+
305
322
  duration_ms = (time.perf_counter() - start_time) * 1000
306
323
  return self._parse_response(query, response, duration_ms)
307
324
 
@@ -381,3 +398,224 @@ class OpenSearchKnowledgeSearcher:
381
398
  search_after_token=search_after_token,
382
399
  has_more=len(items) == query.limit and total_hits > query.offset + len(items),
383
400
  )
401
+
402
+ async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
403
+ """Get unique collections with document counts via aggregation.
404
+
405
+ Args:
406
+ index_name: Index to query.
407
+
408
+ Returns:
409
+ List of collections with id, name, and document_count.
410
+ """
411
+ try:
412
+ # Check if index exists
413
+ exists = await self._client.indices.exists(index=index_name)
414
+ if not exists:
415
+ logger.debug(f"Index {index_name} does not exist")
416
+ return []
417
+
418
+ # Aggregation query for unique collection_ids with counts
419
+ # Also aggregate collection_name for display
420
+ query = {
421
+ "size": 0,
422
+ "aggs": {
423
+ "collections": {
424
+ "terms": {
425
+ "field": "collection_id",
426
+ "size": 1000, # Max collections to return
427
+ },
428
+ "aggs": {
429
+ "collection_name": {
430
+ "terms": {
431
+ "field": "collection_name",
432
+ "size": 1,
433
+ }
434
+ }
435
+ }
436
+ }
437
+ }
438
+ }
439
+
440
+ response = await self._client.search(index=index_name, body=query)
441
+
442
+ collections = []
443
+ buckets = response.get("aggregations", {}).get("collections", {}).get("buckets", [])
444
+
445
+ for bucket in buckets:
446
+ collection_id = bucket.get("key")
447
+ if not collection_id:
448
+ continue
449
+
450
+ doc_count = bucket.get("doc_count", 0)
451
+
452
+ # Get collection name from nested agg or use ID as fallback
453
+ name_buckets = bucket.get("collection_name", {}).get("buckets", [])
454
+ collection_name = name_buckets[0].get("key") if name_buckets else collection_id
455
+
456
+ collections.append({
457
+ "id": collection_id,
458
+ "name": collection_name or collection_id,
459
+ "document_count": doc_count,
460
+ })
461
+
462
+ logger.debug(f"Found {len(collections)} collections in {index_name}")
463
+ return collections
464
+
465
+ except Exception as e:
466
+ logger.error(f"Failed to get collections from {index_name}: {e}")
467
+ return []
468
+
469
+ async def get_stats(self, index_name: str) -> dict[str, Any]:
470
+ """Get index statistics.
471
+
472
+ Args:
473
+ index_name: Index to query.
474
+
475
+ Returns:
476
+ Dictionary with document_count and index info.
477
+ """
478
+ try:
479
+ # Check if index exists
480
+ exists = await self._client.indices.exists(index=index_name)
481
+ if not exists:
482
+ return {
483
+ "document_count": 0,
484
+ "index_name": index_name,
485
+ "exists": False,
486
+ }
487
+
488
+ # Get index stats
489
+ stats = await self._client.indices.stats(index=index_name)
490
+ index_stats = stats.get("indices", {}).get(index_name, {})
491
+ primaries = index_stats.get("primaries", {})
492
+ docs = primaries.get("docs", {})
493
+
494
+ return {
495
+ "document_count": docs.get("count", 0),
496
+ "index_name": index_name,
497
+ "exists": True,
498
+ "size_bytes": primaries.get("store", {}).get("size_in_bytes", 0),
499
+ }
500
+
501
+ except Exception as e:
502
+ logger.error(f"Failed to get stats for {index_name}: {e}")
503
+ return {
504
+ "document_count": 0,
505
+ "index_name": index_name,
506
+ "error": str(e),
507
+ }
508
+
509
+ async def list_documents(
510
+ self,
511
+ index_name: str,
512
+ *,
513
+ source_id: str | None = None,
514
+ collection_id: str | None = None,
515
+ limit: int = 50,
516
+ offset: int = 0,
517
+ ) -> dict[str, Any]:
518
+ """List documents with optional filters.
519
+
520
+ Note:
521
+ This method is tenant-agnostic. Multi-tenancy should be handled
522
+ at the API layer by using separate indices per account.
523
+
524
+ Args:
525
+ index_name: Index to query (use tenant-specific name for isolation).
526
+ source_id: Optional source ID filter.
527
+ collection_id: Optional collection ID filter.
528
+ limit: Maximum documents to return.
529
+ offset: Number of documents to skip.
530
+
531
+ Returns:
532
+ Dictionary with documents, total, limit, offset.
533
+ """
534
+ try:
535
+ # Check if index exists
536
+ exists = await self._client.indices.exists(index=index_name)
537
+ if not exists:
538
+ logger.debug(f"Index {index_name} does not exist")
539
+ return {
540
+ "documents": [],
541
+ "total": 0,
542
+ "limit": limit,
543
+ "offset": offset,
544
+ }
545
+
546
+ # Build filter clauses
547
+ filters: list[dict[str, Any]] = []
548
+
549
+ if source_id:
550
+ filters.append({"term": {"source_id": source_id}})
551
+
552
+ if collection_id:
553
+ filters.append({"term": {"collection_id": collection_id}})
554
+
555
+ # Build query with match_all and filters
556
+ query: dict[str, Any] = {
557
+ "size": limit,
558
+ "from": offset,
559
+ "sort": [
560
+ {"created_at": {"order": "desc", "unmapped_type": "date"}},
561
+ {"_id": {"order": "asc"}},
562
+ ],
563
+ "_source": {
564
+ "excludes": ["content_embedding"],
565
+ },
566
+ }
567
+
568
+ if filters:
569
+ query["query"] = {
570
+ "bool": {
571
+ "must": [{"match_all": {}}],
572
+ "filter": filters,
573
+ }
574
+ }
575
+ else:
576
+ query["query"] = {"match_all": {}}
577
+
578
+ response = await self._client.search(index=index_name, body=query)
579
+
580
+ hits = response.get("hits", {})
581
+ total = hits.get("total", {})
582
+ total_hits = total.get("value", 0) if isinstance(total, dict) else total
583
+
584
+ documents = []
585
+ for hit in hits.get("hits", []):
586
+ source = hit.get("_source", {})
587
+ doc = {
588
+ "id": hit.get("_id", ""),
589
+ "title": source.get("title"),
590
+ "url": source.get("url"),
591
+ "content_preview": (source.get("content", ""))[:200],
592
+ "content": source.get("content"),
593
+ "chunk_index": source.get("chunk_index"),
594
+ "total_chunks": source.get("total_chunks"),
595
+ "source_id": source.get("source_id"),
596
+ "collection_id": source.get("collection_id"),
597
+ "created_at": source.get("created_at"),
598
+ "metadata": source.get("metadata"),
599
+ }
600
+ documents.append(doc)
601
+
602
+ logger.debug(
603
+ f"Listed {len(documents)} documents from {index_name} "
604
+ f"(total: {total_hits}, source_id: {source_id})"
605
+ )
606
+
607
+ return {
608
+ "documents": documents,
609
+ "total": total_hits,
610
+ "limit": limit,
611
+ "offset": offset,
612
+ }
613
+
614
+ except Exception as e:
615
+ logger.error(f"Failed to list documents from {index_name}: {e}")
616
+ return {
617
+ "documents": [],
618
+ "total": 0,
619
+ "limit": limit,
620
+ "offset": offset,
621
+ }