gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -249,10 +249,9 @@ class OpenSearchSetupAdapter:
249
249
  self._model_id = self._config.model_id
250
250
 
251
251
  # Step 4: Create ingest pipeline
252
- # Only create ingest pipeline for global setup (not per-account)
253
- # Account indices should use the global pipeline to ensure consistent model
254
- is_global_setup = self._config.index_prefix == "gnosisllm"
255
- if self._model_id and is_global_setup:
252
+ # Create pipeline for any setup that has a model deployed
253
+ # Each index_prefix namespace gets its own pipeline
254
+ if self._model_id:
256
255
  try:
257
256
  await self._create_ingest_pipeline()
258
257
  pipeline_name = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
@@ -261,35 +260,33 @@ class OpenSearchSetupAdapter:
261
260
  errors.append(f"Failed to create ingest pipeline: {e}")
262
261
  logger.error(f"Failed to create ingest pipeline: {e}")
263
262
 
264
- # Step 5: Create search pipeline (only for global setup)
265
- if is_global_setup:
266
- try:
267
- await self._create_search_pipeline()
268
- pipeline_name = self._config.search_pipeline_name or f"{self._config.index_prefix}-search-pipeline"
269
- steps_completed.append(f"Created search pipeline: {pipeline_name}")
270
- except Exception as e:
271
- errors.append(f"Failed to create search pipeline: {e}")
272
- logger.error(f"Failed to create search pipeline: {e}")
263
+ # Step 5: Create search pipeline for hybrid search
264
+ try:
265
+ await self._create_search_pipeline()
266
+ pipeline_name = self._config.search_pipeline_name or f"{self._config.index_prefix}-search-pipeline"
267
+ steps_completed.append(f"Created search pipeline: {pipeline_name}")
268
+ except Exception as e:
269
+ errors.append(f"Failed to create search pipeline: {e}")
270
+ logger.error(f"Failed to create search pipeline: {e}")
273
271
 
274
- # Step 6: Create index template (only for global setup)
275
- # Template covers all gnosisllm-* indices including per-account indices
276
- if is_global_setup:
277
- try:
278
- template_name = f"{self._config.index_prefix}-template"
279
- template_body = get_index_template(self._config)
272
+ # Step 6: Create index template for this namespace
273
+ # Template covers all {index_prefix}-* indices
274
+ try:
275
+ template_name = f"{self._config.index_prefix}-template"
276
+ template_body = get_index_template(self._config)
280
277
 
281
- # Ensure template has global pipeline for auto-index creation
282
- global_pipeline = self._config.ingest_pipeline_name or "gnosisllm-ingest-pipeline"
283
- template_body["template"]["settings"]["index"]["default_pipeline"] = global_pipeline
278
+ # Set default pipeline for auto-index creation within this namespace
279
+ default_pipeline = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
280
+ template_body["template"]["settings"]["index"]["default_pipeline"] = default_pipeline
284
281
 
285
- await self._client.indices.put_index_template(
286
- name=template_name,
287
- body=template_body,
288
- )
289
- steps_completed.append(f"Created index template: {template_name}")
290
- except Exception as e:
291
- errors.append(f"Failed to create index template: {e}")
292
- logger.error(f"Failed to create index template: {e}")
282
+ await self._client.indices.put_index_template(
283
+ name=template_name,
284
+ body=template_body,
285
+ )
286
+ steps_completed.append(f"Created index template: {template_name}")
287
+ except Exception as e:
288
+ errors.append(f"Failed to create index template: {e}")
289
+ logger.error(f"Failed to create index template: {e}")
293
290
 
294
291
  # Step 7: Create knowledge index
295
292
  try:
@@ -298,9 +295,8 @@ class OpenSearchSetupAdapter:
298
295
 
299
296
  if not exists:
300
297
  settings = get_knowledge_index_settings(self._config)
301
- # Add default pipeline - always use global pipeline for consistency
302
- # This ensures all accounts use the same embedding model
303
- pipeline_name = self._config.ingest_pipeline_name or "gnosisllm-ingest-pipeline"
298
+ # Add default pipeline for this namespace
299
+ pipeline_name = self._config.ingest_pipeline_name or f"{self._config.index_prefix}-ingest-pipeline"
304
300
  settings["index"]["default_pipeline"] = pipeline_name
305
301
 
306
302
  await self._client.indices.create(
@@ -1,6 +1,11 @@
1
1
  """GnosisLLM Knowledge CLI Application.
2
2
 
3
3
  Main entry point assembling all CLI commands with enterprise-grade UX.
4
+
5
+ Note:
6
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
7
+ isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
8
+ Use --index to target tenant-specific indices.
4
9
  """
5
10
 
6
11
  from __future__ import annotations
@@ -147,17 +152,13 @@ def load(
147
152
  typer.Option(
148
153
  "--type",
149
154
  "-t",
150
- help="Source type: website, sitemap (auto-detects if not specified).",
155
+ help="Source type: website, sitemap, discovery (auto-detects if not specified).",
151
156
  ),
152
157
  ] = None,
153
158
  index: Annotated[
154
159
  str,
155
- typer.Option("--index", "-i", help="Target index name."),
160
+ typer.Option("--index", "-i", help="Target index name (use tenant-specific name for multi-tenancy)."),
156
161
  ] = "knowledge",
157
- account_id: Annotated[
158
- Optional[str],
159
- typer.Option("--account-id", "-a", help="Multi-tenant account ID."),
160
- ] = None,
161
162
  collection_id: Annotated[
162
163
  Optional[str],
163
164
  typer.Option("--collection-id", "-c", help="Collection grouping ID."),
@@ -186,16 +187,50 @@ def load(
186
187
  bool,
187
188
  typer.Option("--verbose", "-V", help="Show per-document progress."),
188
189
  ] = False,
190
+ discovery: Annotated[
191
+ bool,
192
+ typer.Option(
193
+ "--discovery",
194
+ "-D",
195
+ help="Use discovery loader to crawl and discover all URLs from the website.",
196
+ ),
197
+ ] = False,
198
+ max_depth: Annotated[
199
+ int,
200
+ typer.Option("--max-depth", help="Maximum crawl depth for discovery (default: 3)."),
201
+ ] = 3,
202
+ max_pages: Annotated[
203
+ int,
204
+ typer.Option("--max-pages", help="Maximum pages to discover (default: 100)."),
205
+ ] = 100,
206
+ same_domain: Annotated[
207
+ bool,
208
+ typer.Option(
209
+ "--same-domain/--any-domain",
210
+ help="Only crawl URLs on the same domain (default: same domain only).",
211
+ ),
212
+ ] = True,
189
213
  ) -> None:
190
214
  """Load and index content from URLs or sitemaps.
191
215
 
192
216
  Fetches content, chunks it for optimal embedding, and indexes
193
217
  into OpenSearch with automatic embedding generation.
194
218
 
219
+ [bold]Multi-tenancy:[/bold]
220
+ Use --index with tenant-specific index names for isolation
221
+ (e.g., --index knowledge-{account_id}). Each tenant's data
222
+ is stored in a separate index for complete isolation.
223
+
224
+ [bold]Discovery Mode:[/bold]
225
+ Use --discovery to crawl and discover all URLs from a website
226
+ before loading. This is useful for sites without a sitemap.
227
+
195
228
  [bold]Example:[/bold]
196
229
  $ gnosisllm-knowledge load https://docs.example.com/intro
197
230
  $ gnosisllm-knowledge load https://example.com/sitemap.xml --type sitemap
198
231
  $ gnosisllm-knowledge load https://docs.example.com/sitemap.xml --max-urls 500
232
+ $ gnosisllm-knowledge load https://docs.example.com --discovery --max-depth 5
233
+ $ gnosisllm-knowledge load https://docs.example.com --index knowledge-tenant-123
199
234
  """
200
235
  from gnosisllm_knowledge.cli.commands.load import load_command
201
236
 
@@ -205,7 +240,6 @@ def load(
205
240
  source=source,
206
241
  source_type=source_type,
207
242
  index_name=index,
208
- account_id=account_id,
209
243
  collection_id=collection_id,
210
244
  source_id=source_id,
211
245
  batch_size=batch_size,
@@ -213,6 +247,10 @@ def load(
213
247
  force=force,
214
248
  dry_run=dry_run,
215
249
  verbose=verbose,
250
+ discovery=discovery,
251
+ max_depth=max_depth,
252
+ max_pages=max_pages,
253
+ same_domain=same_domain,
216
254
  )
217
255
  )
218
256
 
@@ -238,7 +276,7 @@ def search(
238
276
  ] = "hybrid",
239
277
  index: Annotated[
240
278
  str,
241
- typer.Option("--index", "-i", help="Index to search."),
279
+ typer.Option("--index", "-i", help="Index to search (use tenant-specific name for multi-tenancy)."),
242
280
  ] = "knowledge",
243
281
  limit: Annotated[
244
282
  int,
@@ -248,10 +286,6 @@ def search(
248
286
  int,
249
287
  typer.Option("--offset", "-o", help="Pagination offset."),
250
288
  ] = 0,
251
- account_id: Annotated[
252
- Optional[str],
253
- typer.Option("--account-id", "-a", help="Filter by account ID."),
254
- ] = None,
255
289
  collection_ids: Annotated[
256
290
  Optional[str],
257
291
  typer.Option("--collection-ids", "-c", help="Filter by collection IDs (comma-separated)."),
@@ -289,10 +323,16 @@ def search(
289
323
  - [cyan]hybrid[/cyan]: Combined semantic + keyword (default, best results)
290
324
  - [cyan]agentic[/cyan]: AI-powered search with reasoning
291
325
 
326
+ [bold]Multi-tenancy:[/bold]
327
+ Use --index with tenant-specific index names for isolation
328
+ (e.g., --index knowledge-{account_id}). Each tenant's data
329
+ is stored in a separate index for complete isolation.
330
+
292
331
  [bold]Example:[/bold]
293
332
  $ gnosisllm-knowledge search "how to configure auth"
294
333
  $ gnosisllm-knowledge search "API reference" --mode semantic --limit 10
295
334
  $ gnosisllm-knowledge search --interactive
335
+ $ gnosisllm-knowledge search "query" --index knowledge-tenant-123
296
336
  """
297
337
  from gnosisllm_knowledge.cli.commands.search import search_command
298
338
 
@@ -304,7 +344,6 @@ def search(
304
344
  index_name=index,
305
345
  limit=limit,
306
346
  offset=offset,
307
- account_id=account_id,
308
347
  collection_ids=collection_ids,
309
348
  source_ids=source_ids,
310
349
  min_score=min_score,
@@ -451,7 +490,7 @@ def agentic_setup(
451
490
  def agentic_chat(
452
491
  index: Annotated[
453
492
  str,
454
- typer.Option("--index", "-i", help="Index to search."),
493
+ typer.Option("--index", "-i", help="Index to search (use tenant-specific name for multi-tenancy)."),
455
494
  ] = "knowledge",
456
495
  agent_type: Annotated[
457
496
  str,
@@ -461,10 +500,6 @@ def agentic_chat(
461
500
  help="Agent type: flow or conversational (default).",
462
501
  ),
463
502
  ] = "conversational",
464
- account_id: Annotated[
465
- Optional[str],
466
- typer.Option("--account-id", "-a", help="Filter by account ID."),
467
- ] = None,
468
503
  collection_ids: Annotated[
469
504
  Optional[str],
470
505
  typer.Option("--collection-ids", "-c", help="Filter by collection IDs (comma-separated)."),
@@ -479,10 +514,15 @@ def agentic_chat(
479
514
  Start a conversation with the AI-powered knowledge assistant.
480
515
  The agent remembers context for multi-turn dialogue.
481
516
 
517
+ [bold]Multi-tenancy:[/bold]
518
+ Use --index with tenant-specific index names for isolation
519
+ (e.g., --index knowledge-{account_id}).
520
+
482
521
  [bold]Example:[/bold]
483
522
  $ gnosisllm-knowledge agentic chat
484
523
  $ gnosisllm-knowledge agentic chat --type flow
485
524
  $ gnosisllm-knowledge agentic chat --verbose
525
+ $ gnosisllm-knowledge agentic chat --index knowledge-tenant-123
486
526
  """
487
527
  from gnosisllm_knowledge.cli.commands.agentic import agentic_chat_command
488
528
 
@@ -491,7 +531,6 @@ def agentic_chat(
491
531
  display=display,
492
532
  index_name=index,
493
533
  agent_type=agent_type,
494
- account_id=account_id,
495
534
  collection_ids=collection_ids,
496
535
  verbose=verbose,
497
536
  )
@@ -4,6 +4,10 @@ Commands:
4
4
  - setup: Configure agents in OpenSearch
5
5
  - chat: Interactive agentic chat session
6
6
  - status: Show agent configuration status
7
+
8
+ Note:
9
+ This library is tenant-agnostic. Multi-tenancy is achieved through index
10
+ isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
7
11
  """
8
12
 
9
13
  from __future__ import annotations
@@ -202,17 +206,19 @@ async def agentic_chat_command(
202
206
  display: RichDisplayService,
203
207
  index_name: str = "knowledge",
204
208
  agent_type: str = "conversational",
205
- account_id: str | None = None,
206
209
  collection_ids: str | None = None,
207
210
  verbose: bool = False,
208
211
  ) -> None:
209
212
  """Interactive agentic chat session.
210
213
 
214
+ Note:
215
+ Multi-tenancy is achieved through index isolation. Use tenant-specific
216
+ index names instead of an account ID filter (e.g., --index knowledge-tenant-123).
217
+
211
218
  Args:
212
219
  display: Display service for output.
213
- index_name: Index to search.
220
+ index_name: Index to search (use tenant-specific name for isolation).
214
221
  agent_type: Agent type ('flow' or 'conversational').
215
- account_id: Filter by account ID.
216
222
  collection_ids: Filter by collection IDs (comma-separated).
217
223
  verbose: Show reasoning steps.
218
224
  """
@@ -242,7 +248,6 @@ async def agentic_chat_command(
242
248
  if agent_type == "conversational":
243
249
  return await searcher.create_conversation(
244
250
  name="CLI Chat Session",
245
- account_id=account_id,
246
251
  )
247
252
  return None
248
253
 
@@ -291,7 +296,6 @@ async def agentic_chat_command(
291
296
  agent_type=AgentType.CONVERSATIONAL if agent_type == "conversational" else AgentType.FLOW,
292
297
  conversation_id=conversation_id,
293
298
  collection_ids=collection_list,
294
- account_id=account_id,
295
299
  include_reasoning=verbose,
296
300
  )
297
301
 
@@ -395,7 +399,6 @@ async def agentic_search_command(
395
399
  query: str,
396
400
  index_name: str = "knowledge",
397
401
  agent_type: str = "flow",
398
- account_id: str | None = None,
399
402
  collection_ids: str | None = None,
400
403
  source_ids: str | None = None,
401
404
  limit: int = 5,
@@ -404,12 +407,15 @@ async def agentic_search_command(
404
407
  ) -> dict[str, Any] | None:
405
408
  """Execute agentic search.
406
409
 
410
+ Note:
411
+ Multi-tenancy is achieved through index isolation. Use tenant-specific
412
+ index names instead of an account ID filter (e.g., --index knowledge-tenant-123).
413
+
407
414
  Args:
408
415
  display: Display service for output.
409
416
  query: Search query text.
410
- index_name: Index to search.
417
+ index_name: Index to search (use tenant-specific name for isolation).
411
418
  agent_type: Agent type ('flow' or 'conversational').
412
- account_id: Filter by account ID.
413
419
  collection_ids: Filter by collection IDs (comma-separated).
414
420
  source_ids: Filter by source IDs (comma-separated).
415
421
  limit: Maximum source documents to retrieve.
@@ -447,12 +453,12 @@ async def agentic_search_command(
447
453
  )
448
454
 
449
455
  # Build query
456
+ # Note: account_id is no longer accepted or passed here - use index isolation (tenant-specific index names) instead
450
457
  agentic_query = AgenticSearchQuery(
451
458
  text=query,
452
459
  agent_type=AgentType.CONVERSATIONAL if agent_type == "conversational" else AgentType.FLOW,
453
460
  collection_ids=collection_list,
454
461
  source_ids=source_list,
455
- account_id=account_id,
456
462
  limit=limit,
457
463
  include_reasoning=verbose,
458
464
  )