gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +502 -32
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  7. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  8. gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
  9. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  10. gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
  11. gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
  12. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  14. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  15. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  16. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  17. gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
  18. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  19. gnosisllm_knowledge/cli/app.py +436 -31
  20. gnosisllm_knowledge/cli/commands/agentic.py +26 -9
  21. gnosisllm_knowledge/cli/commands/load.py +169 -19
  22. gnosisllm_knowledge/cli/commands/memory.py +733 -0
  23. gnosisllm_knowledge/cli/commands/search.py +9 -10
  24. gnosisllm_knowledge/cli/commands/setup.py +49 -23
  25. gnosisllm_knowledge/cli/display/service.py +43 -0
  26. gnosisllm_knowledge/cli/utils/config.py +62 -4
  27. gnosisllm_knowledge/core/domain/__init__.py +54 -0
  28. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  29. gnosisllm_knowledge/core/domain/document.py +19 -19
  30. gnosisllm_knowledge/core/domain/memory.py +440 -0
  31. gnosisllm_knowledge/core/domain/result.py +11 -3
  32. gnosisllm_knowledge/core/domain/search.py +12 -25
  33. gnosisllm_knowledge/core/domain/source.py +11 -12
  34. gnosisllm_knowledge/core/events/__init__.py +8 -0
  35. gnosisllm_knowledge/core/events/types.py +198 -5
  36. gnosisllm_knowledge/core/exceptions.py +227 -0
  37. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  38. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  39. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  40. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  41. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  42. gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
  43. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  44. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  45. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  46. gnosisllm_knowledge/fetchers/config.py +27 -0
  47. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  48. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  49. gnosisllm_knowledge/loaders/__init__.py +5 -1
  50. gnosisllm_knowledge/loaders/base.py +3 -4
  51. gnosisllm_knowledge/loaders/discovery.py +338 -0
  52. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  53. gnosisllm_knowledge/loaders/factory.py +46 -0
  54. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  55. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  56. gnosisllm_knowledge/services/indexing.py +100 -93
  57. gnosisllm_knowledge/services/search.py +84 -31
  58. gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
  59. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
  60. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  62. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  63. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -2,6 +2,10 @@
2
2
 
3
3
  Fetches content, chunks it for optimal embedding, and indexes
4
4
  into OpenSearch with automatic embedding generation via ingest pipeline.
5
+
6
+ Note:
7
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
8
+ isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
5
9
  """
6
10
 
7
11
  from __future__ import annotations
@@ -26,6 +30,14 @@ from gnosisllm_knowledge.chunking.sentence import SentenceChunker
26
30
  from gnosisllm_knowledge.cli.display.service import RichDisplayService
27
31
  from gnosisllm_knowledge.cli.utils.config import CliConfig
28
32
  from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
33
+ from gnosisllm_knowledge.core.events.emitter import EventEmitter
34
+ from gnosisllm_knowledge.core.events.types import (
35
+ DiscoveryCompletedEvent,
36
+ DiscoveryFailedEvent,
37
+ DiscoveryProgressEvent,
38
+ DiscoveryStartedEvent,
39
+ EventType,
40
+ )
29
41
  from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
30
42
  from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
31
43
  from gnosisllm_knowledge.loaders.factory import LoaderFactory
@@ -39,7 +51,6 @@ async def load_command(
39
51
  source: str,
40
52
  source_type: str | None = None,
41
53
  index_name: str = "knowledge",
42
- account_id: str | None = None,
43
54
  collection_id: str | None = None,
44
55
  source_id: str | None = None,
45
56
  batch_size: int = 100,
@@ -47,15 +58,22 @@ async def load_command(
47
58
  force: bool = False,
48
59
  dry_run: bool = False,
49
60
  verbose: bool = False,
61
+ discovery: bool = False,
62
+ max_depth: int = 3,
63
+ max_pages: int = 100,
64
+ same_domain: bool = True,
50
65
  ) -> None:
51
66
  """Execute the load command.
52
67
 
68
+ Note:
69
+ Multi-tenancy is achieved through index isolation. Use tenant-specific
70
+ index names instead (e.g., --index knowledge-tenant-123).
71
+
53
72
  Args:
54
73
  display: Display service for output.
55
74
  source: URL or sitemap to load content from.
56
- source_type: Source type (website, sitemap) or auto-detect.
57
- index_name: Target index name.
58
- account_id: Multi-tenant account ID.
75
+ source_type: Source type (website, sitemap, discovery) or auto-detect.
76
+ index_name: Target index name (use tenant-specific name for isolation).
59
77
  collection_id: Collection grouping ID.
60
78
  source_id: Source identifier (defaults to URL).
61
79
  batch_size: Documents per indexing batch.
@@ -63,6 +81,10 @@ async def load_command(
63
81
  force: Delete existing source documents first.
64
82
  dry_run: Preview without indexing.
65
83
  verbose: Show per-document progress.
84
+ discovery: Use discovery loader (website crawling) instead of single URL.
85
+ max_depth: Maximum crawl depth for discovery (default: 3).
86
+ max_pages: Maximum pages to discover (default: 100).
87
+ same_domain: Only crawl URLs on the same domain (default: True).
66
88
  """
67
89
  # Load configuration
68
90
  cli_config = CliConfig.from_env()
@@ -70,10 +92,18 @@ async def load_command(
70
92
  # Auto-detect source type
71
93
  detected_type = source_type
72
94
  if not detected_type:
73
- if "sitemap" in source.lower() or source.endswith(".xml"):
95
+ if discovery:
96
+ detected_type = "discovery"
97
+ elif "sitemap" in source.lower() or source.endswith(".xml"):
74
98
  detected_type = "sitemap"
75
99
  else:
76
100
  detected_type = "website"
101
+ elif discovery and detected_type != "discovery":
102
+ # --discovery flag overrides explicit type for website URLs
103
+ display.warning(
104
+ f"Using discovery loader (--discovery flag overrides --type {detected_type})"
105
+ )
106
+ detected_type = "discovery"
77
107
 
78
108
  # Default source_id to URL
79
109
  final_source_id = source_id or source
@@ -85,18 +115,28 @@ async def load_command(
85
115
  )
86
116
 
87
117
  # Show configuration
118
+ is_auto_detected = not source_type and not discovery
119
+ type_suffix = " (auto-detected)" if is_auto_detected else ""
88
120
  config_rows = [
89
121
  ("Source", source[:50] + "..." if len(source) > 50 else source),
90
- ("Type", f"{detected_type} {'(auto-detected)' if not source_type else ''}"),
122
+ ("Type", f"{detected_type}{type_suffix}"),
91
123
  ("Target Index", index_name),
92
124
  ("Batch Size", str(batch_size)),
93
- ("Max URLs", str(max_urls) if detected_type == "sitemap" else "N/A"),
125
+ ]
126
+
127
+ # Add type-specific configuration
128
+ if detected_type == "sitemap":
129
+ config_rows.append(("Max URLs", str(max_urls)))
130
+ elif detected_type == "discovery":
131
+ config_rows.append(("Max Depth", str(max_depth)))
132
+ config_rows.append(("Max Pages", str(max_pages)))
133
+ config_rows.append(("Same Domain", "Yes" if same_domain else "No"))
134
+
135
+ config_rows.extend([
94
136
  ("Neoreader", cli_config.neoreader_host),
95
137
  ("OpenSearch", f"{cli_config.opensearch_host}:{cli_config.opensearch_port}"),
96
- ]
138
+ ])
97
139
 
98
- if account_id:
99
- config_rows.append(("Account ID", account_id))
100
140
  if collection_id:
101
141
  config_rows.append(("Collection ID", collection_id))
102
142
  if force:
@@ -119,9 +159,16 @@ async def load_command(
119
159
  display.warning(f"Cannot connect to Neoreader at {cli_config.neoreader_host}")
120
160
  display.info("Continuing with fallback HTTP fetcher...")
121
161
 
162
+ # Create event emitter for discovery progress tracking
163
+ event_emitter = EventEmitter()
164
+
122
165
  # Create loader
123
166
  chunker = SentenceChunker()
124
- loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
167
+ loader_factory = LoaderFactory(
168
+ fetcher=fetcher,
169
+ chunker=chunker,
170
+ event_emitter=event_emitter,
171
+ )
125
172
 
126
173
  try:
127
174
  loader = loader_factory.create(detected_type)
@@ -151,9 +198,48 @@ async def load_command(
151
198
  )
152
199
  sys.exit(1)
153
200
 
154
- # Load documents
201
+ # Build loader options for discovery
202
+ loader_options: dict = {}
203
+ if detected_type == "discovery":
204
+ loader_options = {
205
+ "max_depth": max_depth,
206
+ "max_pages": max_pages,
207
+ "same_domain": same_domain,
208
+ }
209
+
210
+ # Load documents with discovery progress display
155
211
  documents: list[Document] = []
156
212
  url_count = 0
213
+ discovery_state: dict = {"started": False, "completed": False, "job_id": None}
214
+
215
+ # Register discovery event handlers for Rich display
216
+ def _on_discovery_started(event: DiscoveryStartedEvent) -> None:
217
+ discovery_state["started"] = True
218
+ discovery_state["job_id"] = event.job_id
219
+
220
+ def _on_discovery_progress(event: DiscoveryProgressEvent) -> None:
221
+ # Update will be handled in the progress context
222
+ discovery_state["percent"] = event.percent
223
+ discovery_state["pages_crawled"] = event.pages_crawled
224
+ discovery_state["urls_discovered"] = event.urls_discovered
225
+ discovery_state["current_depth"] = event.current_depth
226
+ discovery_state["message"] = event.message
227
+
228
+ def _on_discovery_completed(event: DiscoveryCompletedEvent) -> None:
229
+ discovery_state["completed"] = True
230
+ discovery_state["urls_count"] = event.urls_count
231
+ discovery_state["duration_seconds"] = event.duration_seconds
232
+
233
+ def _on_discovery_failed(event: DiscoveryFailedEvent) -> None:
234
+ discovery_state["failed"] = True
235
+ discovery_state["error"] = event.error
236
+
237
+ # Register discovery event handlers
238
+ if detected_type == "discovery":
239
+ event_emitter.add_handler(EventType.DISCOVERY_STARTED, _on_discovery_started)
240
+ event_emitter.add_handler(EventType.DISCOVERY_PROGRESS, _on_discovery_progress)
241
+ event_emitter.add_handler(EventType.DISCOVERY_COMPLETED, _on_discovery_completed)
242
+ event_emitter.add_handler(EventType.DISCOVERY_FAILED, _on_discovery_failed)
157
243
 
158
244
  with Progress(
159
245
  SpinnerColumn(),
@@ -164,9 +250,32 @@ async def load_command(
164
250
  TimeElapsedColumn(),
165
251
  console=display.console,
166
252
  ) as progress:
167
- load_task = progress.add_task("Loading content...", total=None)
253
+ # Add task for discovery phase (if applicable)
254
+ if detected_type == "discovery":
255
+ discovery_task = progress.add_task(
256
+ "Discovering URLs...",
257
+ total=max_pages,
258
+ )
259
+
260
+ load_task = progress.add_task("Loading content...", total=None, visible=False)
261
+
262
+ async for doc in loader.load_streaming(source, **loader_options):
263
+ # Update discovery progress if available
264
+ if detected_type == "discovery":
265
+ if discovery_state.get("started") and not discovery_state.get("completed"):
266
+ pages = discovery_state.get("pages_crawled", 0)
267
+ urls = discovery_state.get("urls_discovered", 0)
268
+ depth = discovery_state.get("current_depth", 0)
269
+ progress.update(
270
+ discovery_task,
271
+ completed=pages,
272
+ description=f"Discovering... (depth {depth}, {urls} URLs found)",
273
+ )
274
+ elif discovery_state.get("completed"):
275
+ # Hide discovery task and show load task
276
+ progress.update(discovery_task, visible=False)
277
+ progress.update(load_task, visible=True)
168
278
 
169
- async for doc in loader.load_streaming(source):
170
279
  documents.append(doc)
171
280
  url_count += 1
172
281
  progress.update(load_task, advance=1, description=f"Loading... ({url_count} docs)")
@@ -176,6 +285,13 @@ async def load_command(
176
285
 
177
286
  progress.update(load_task, completed=url_count)
178
287
 
288
+ # Show discovery summary if applicable
289
+ if detected_type == "discovery" and discovery_state.get("completed"):
290
+ display.success(
291
+ f"Discovered {discovery_state.get('urls_count', 0)} URLs "
292
+ f"in {discovery_state.get('duration_seconds', 0):.1f}s"
293
+ )
294
+
179
295
  display.success(f"Loaded {len(documents)} documents")
180
296
 
181
297
  if not documents:
@@ -223,7 +339,6 @@ async def load_command(
223
339
  url=doc.url,
224
340
  title=doc.title,
225
341
  source=final_source_id,
226
- account_id=account_id,
227
342
  collection_id=collection_id,
228
343
  source_id=final_source_id,
229
344
  metadata=doc.metadata,
@@ -238,7 +353,6 @@ async def load_command(
238
353
  url=doc.url,
239
354
  title=doc.title,
240
355
  source=final_source_id,
241
- account_id=account_id,
242
356
  collection_id=collection_id,
243
357
  source_id=final_source_id,
244
358
  chunk_index=i,
@@ -267,8 +381,11 @@ async def load_command(
267
381
  )
268
382
 
269
383
  try:
270
- # Create indexer config
384
+ # Create indexer config from environment, preserving k-NN and other settings
385
+ # This ensures proper vector mappings are used when creating indices
386
+ base_config = OpenSearchConfig.from_env()
271
387
  opensearch_config = OpenSearchConfig(
388
+ # CLI/CliConfig overrides
272
389
  host=cli_config.opensearch_host,
273
390
  port=cli_config.opensearch_port,
274
391
  username=cli_config.opensearch_username,
@@ -276,7 +393,21 @@ async def load_command(
276
393
  use_ssl=cli_config.opensearch_use_ssl,
277
394
  verify_certs=cli_config.opensearch_verify_certs,
278
395
  model_id=cli_config.opensearch_model_id,
279
- ingest_pipeline_name=cli_config.opensearch_pipeline_name,
396
+ ingest_pipeline_name=cli_config.opensearch_ingest_pipeline_name,
397
+ # Preserve env-based k-NN settings for proper index mappings
398
+ embedding_model=base_config.embedding_model,
399
+ embedding_dimension=base_config.embedding_dimension,
400
+ embedding_field=base_config.embedding_field,
401
+ knn_engine=base_config.knn_engine,
402
+ knn_space_type=base_config.knn_space_type,
403
+ knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
404
+ knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
405
+ knn_algo_param_m=base_config.knn_algo_param_m,
406
+ # Preserve other settings
407
+ index_prefix=base_config.index_prefix,
408
+ number_of_shards=base_config.number_of_shards,
409
+ number_of_replicas=base_config.number_of_replicas,
410
+ search_pipeline_name=base_config.search_pipeline_name,
280
411
  )
281
412
 
282
413
  indexer = OpenSearchIndexer(client, opensearch_config)
@@ -314,6 +445,7 @@ async def load_command(
314
445
 
315
446
  indexed_count = 0
316
447
  failed_count = 0
448
+ all_errors: list[dict] = []
317
449
 
318
450
  with Progress(
319
451
  SpinnerColumn(),
@@ -333,6 +465,8 @@ async def load_command(
333
465
  result = await indexer.bulk_index(batch, index_name, batch_size=batch_size)
334
466
  indexed_count += result.indexed_count
335
467
  failed_count += result.failed_count
468
+ if result.errors:
469
+ all_errors.extend(result.errors)
336
470
 
337
471
  progress.update(index_task, advance=len(batch))
338
472
 
@@ -354,12 +488,28 @@ async def load_command(
354
488
  style="success",
355
489
  )
356
490
  else:
491
+ # Build error details section
492
+ error_details = ""
493
+ if all_errors:
494
+ error_details = "\n\n[bold red]Error Details:[/bold red]\n"
495
+ for i, err in enumerate(all_errors[:5], 1): # Show first 5 errors
496
+ if isinstance(err, dict):
497
+ error_type = err.get("error", {}).get("type", "unknown") if isinstance(err.get("error"), dict) else str(err.get("error", "unknown"))
498
+ error_reason = err.get("error", {}).get("reason", "No reason provided") if isinstance(err.get("error"), dict) else str(err.get("error", "No details"))
499
+ doc_id = err.get("_id", "unknown")
500
+ error_details += f" {i}. [dim]Doc {doc_id}:[/dim] {error_type} - {error_reason}\n"
501
+ else:
502
+ error_details += f" {i}. {err}\n"
503
+ if len(all_errors) > 5:
504
+ error_details += f" ... and {len(all_errors) - 5} more errors\n"
505
+
357
506
  display.panel(
358
507
  f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
359
508
  f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
360
509
  f"Documents Indexed: [green]{indexed_count}[/green]\n"
361
510
  f"Documents Failed: [red]{failed_count}[/red]\n"
362
- f"Index: [cyan]{index_name}[/cyan]",
511
+ f"Index: [cyan]{index_name}[/cyan]"
512
+ f"{error_details}",
363
513
  title="Loading Complete (with errors)",
364
514
  style="warning",
365
515
  )