gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. gnosisllm_knowledge/api/knowledge.py +225 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  6. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  7. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  8. gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
  9. gnosisllm_knowledge/cli/app.py +58 -19
  10. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  11. gnosisllm_knowledge/cli/commands/load.py +169 -19
  12. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  13. gnosisllm_knowledge/cli/commands/search.py +9 -10
  14. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  15. gnosisllm_knowledge/cli/utils/config.py +4 -4
  16. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  17. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  18. gnosisllm_knowledge/core/domain/document.py +14 -19
  19. gnosisllm_knowledge/core/domain/search.py +10 -25
  20. gnosisllm_knowledge/core/domain/source.py +11 -12
  21. gnosisllm_knowledge/core/events/__init__.py +8 -0
  22. gnosisllm_knowledge/core/events/types.py +122 -5
  23. gnosisllm_knowledge/core/exceptions.py +93 -0
  24. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  25. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  26. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  27. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  28. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  29. gnosisllm_knowledge/fetchers/config.py +27 -0
  30. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  31. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  32. gnosisllm_knowledge/loaders/__init__.py +5 -1
  33. gnosisllm_knowledge/loaders/discovery.py +338 -0
  34. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  35. gnosisllm_knowledge/loaders/factory.py +46 -0
  36. gnosisllm_knowledge/services/indexing.py +35 -20
  37. gnosisllm_knowledge/services/search.py +37 -20
  38. gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
  39. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
  40. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  42. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  43. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -9,7 +9,11 @@ from typing import Any
9
9
  from gnosisllm_knowledge.core.events.emitter import EventEmitter
10
10
  from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
11
11
  from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
12
+ from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
13
+ from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
14
+ from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
12
15
  from gnosisllm_knowledge.loaders.base import BaseLoader
16
+ from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
13
17
  from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
14
18
  from gnosisllm_knowledge.loaders.website import WebsiteLoader
15
19
 
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
20
24
  ]
21
25
 
22
26
 
27
+ def _create_discovery_loader(
28
+ fetcher: IContentFetcher,
29
+ chunker: ITextChunker,
30
+ config: dict[str, Any] | None,
31
+ event_emitter: EventEmitter | None,
32
+ ) -> DiscoveryLoader:
33
+ """Factory function for creating DiscoveryLoader instances.
34
+
35
+ Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
36
+ is a NeoreaderContentFetcher, reuses its config to ensure consistency.
37
+ Otherwise, creates config from environment variables.
38
+
39
+ Args:
40
+ fetcher: Content fetcher for retrieving URL content.
41
+ chunker: Text chunker for splitting content.
42
+ config: Optional configuration dictionary.
43
+ event_emitter: Optional event emitter for progress events.
44
+
45
+ Returns:
46
+ Configured DiscoveryLoader instance.
47
+ """
48
+ # Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
49
+ if isinstance(fetcher, NeoreaderContentFetcher):
50
+ neoreader_config = fetcher.config
51
+ else:
52
+ neoreader_config = NeoreaderConfig.from_env()
53
+
54
+ discovery_client = NeoreaderDiscoveryClient(neoreader_config)
55
+ return DiscoveryLoader(
56
+ fetcher=fetcher,
57
+ chunker=chunker,
58
+ discovery_client=discovery_client,
59
+ config=config,
60
+ event_emitter=event_emitter,
61
+ )
62
+
63
+
23
64
  class LoaderFactory:
24
65
  """Factory for creating content loaders (Registry Pattern).
25
66
 
@@ -29,6 +70,7 @@ class LoaderFactory:
29
70
  Built-in loaders:
30
71
  - website: Single URL loading
31
72
  - sitemap: Sitemap XML with recursive discovery
73
+ - discovery: Website crawling via Neo Reader Discovery API
32
74
 
33
75
  Example:
34
76
  ```python
@@ -40,6 +82,9 @@ class LoaderFactory:
40
82
  # Explicit type
41
83
  loader = factory.create("sitemap", config={"max_urls": 500})
42
84
 
85
+ # Discovery loader for full website crawling
86
+ loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
87
+
43
88
  # Register custom loader
44
89
  factory.register("custom", MyCustomLoader)
45
90
  ```
@@ -76,6 +121,7 @@ class LoaderFactory:
76
121
  """Register built-in loader types."""
77
122
  self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
78
123
  self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
124
+ self.register("discovery", _create_discovery_loader)
79
125
 
80
126
  def register(self, name: str, creator: LoaderCreator) -> None:
81
127
  """Register a loader type.
@@ -1,4 +1,13 @@
1
- """Knowledge indexing service."""
1
+ """Knowledge indexing service.
2
+
3
+ This service orchestrates the document ingestion pipeline from source to index,
4
+ including loading, chunking, and indexing.
5
+
6
+ Note:
7
+ This service is tenant-agnostic. Multi-tenancy should be handled at the
8
+ API layer by using separate indices per account (e.g.,
9
+ `knowledge-{account_id}`) rather than filtering by account_id.
10
+ """
2
11
 
3
12
  from __future__ import annotations
4
13
 
@@ -82,7 +91,6 @@ class KnowledgeIndexingService:
82
91
  source: str,
83
92
  index_name: str,
84
93
  *,
85
- account_id: str | None = None,
86
94
  collection_id: str | None = None,
87
95
  source_id: str | None = None,
88
96
  batch_size: int = 100,
@@ -93,10 +101,13 @@ class KnowledgeIndexingService:
93
101
  Uses streaming to process and index documents as they're fetched,
94
102
  avoiding memory issues with large sitemaps.
95
103
 
104
+ Note:
105
+ This method is tenant-agnostic. Multi-tenancy should be handled
106
+ at the API layer by using separate indices per account.
107
+
96
108
  Args:
97
109
  source: Source URL or path.
98
- index_name: Target index name.
99
- account_id: Account ID for multi-tenancy.
110
+ index_name: Target index name (use tenant-specific name for isolation).
100
111
  collection_id: Collection ID.
101
112
  source_id: Source ID (auto-generated if not provided).
102
113
  batch_size: Documents per batch for indexing.
@@ -127,14 +138,13 @@ class KnowledgeIndexingService:
127
138
  # Stream documents and index in batches as they arrive
128
139
  # Note: Loader already chunks content, so we don't re-chunk here
129
140
  async for doc in self._loader.load_streaming(source, **options):
130
- # Enrich document with tenant info
141
+ # Enrich document with collection info
131
142
  enriched_doc = Document(
132
143
  content=doc.content,
133
144
  source=source,
134
145
  doc_id=doc.doc_id,
135
146
  url=doc.url,
136
147
  title=doc.title,
137
- account_id=account_id,
138
148
  collection_id=collection_id,
139
149
  source_id=source_id,
140
150
  chunk_index=doc.chunk_index,
@@ -230,8 +240,8 @@ class KnowledgeIndexingService:
230
240
  doc_id=f"{doc.doc_id}-chunk-{i}",
231
241
  url=doc.url,
232
242
  title=doc.title,
233
- account_id=doc.account_id,
234
243
  collection_id=doc.collection_id,
244
+ collection_name=doc.collection_name,
235
245
  source_id=doc.source_id,
236
246
  chunk_index=i,
237
247
  total_chunks=len(chunks),
@@ -271,14 +281,16 @@ class KnowledgeIndexingService:
271
281
  self,
272
282
  source_id: str,
273
283
  index_name: str,
274
- account_id: str | None = None,
275
284
  ) -> int:
276
285
  """Delete all documents from a source.
277
286
 
287
+ Note:
288
+ This method is tenant-agnostic. Multi-tenancy should be handled
289
+ at the API layer by using separate indices per account.
290
+
278
291
  Args:
279
292
  source_id: Source ID to delete.
280
- index_name: Index name.
281
- account_id: Optional account filter.
293
+ index_name: Index name (use tenant-specific name for isolation).
282
294
 
283
295
  Returns:
284
296
  Count of deleted documents.
@@ -287,21 +299,23 @@ class KnowledgeIndexingService:
287
299
  build_delete_by_source_query,
288
300
  )
289
301
 
290
- query = build_delete_by_source_query(source_id, account_id)
302
+ query = build_delete_by_source_query(source_id)
291
303
  return await self._indexer.delete_by_query(query, index_name)
292
304
 
293
305
  async def delete_collection(
294
306
  self,
295
307
  collection_id: str,
296
308
  index_name: str,
297
- account_id: str | None = None,
298
309
  ) -> int:
299
310
  """Delete all documents from a collection.
300
311
 
312
+ Note:
313
+ This method is tenant-agnostic. Multi-tenancy should be handled
314
+ at the API layer by using separate indices per account.
315
+
301
316
  Args:
302
317
  collection_id: Collection ID to delete.
303
- index_name: Index name.
304
- account_id: Optional account filter.
318
+ index_name: Index name (use tenant-specific name for isolation).
305
319
 
306
320
  Returns:
307
321
  Count of deleted documents.
@@ -310,7 +324,7 @@ class KnowledgeIndexingService:
310
324
  build_delete_by_collection_query,
311
325
  )
312
326
 
313
- query = build_delete_by_collection_query(collection_id, account_id)
327
+ query = build_delete_by_collection_query(collection_id)
314
328
  return await self._indexer.delete_by_query(query, index_name)
315
329
 
316
330
  async def reindex_source(
@@ -319,17 +333,19 @@ class KnowledgeIndexingService:
319
333
  source_id: str,
320
334
  index_name: str,
321
335
  *,
322
- account_id: str | None = None,
323
336
  collection_id: str | None = None,
324
337
  **options: Any,
325
338
  ) -> IndexResult:
326
339
  """Reindex a source by deleting and re-loading.
327
340
 
341
+ Note:
342
+ This method is tenant-agnostic. Multi-tenancy should be handled
343
+ at the API layer by using separate indices per account.
344
+
328
345
  Args:
329
346
  source: Source URL or path.
330
347
  source_id: Existing source ID.
331
- index_name: Index name.
332
- account_id: Account ID.
348
+ index_name: Index name (use tenant-specific name for isolation).
333
349
  collection_id: Collection ID.
334
350
  **options: Additional options.
335
351
 
@@ -337,13 +353,12 @@ class KnowledgeIndexingService:
337
353
  Index result.
338
354
  """
339
355
  # Delete existing documents
340
- await self.delete_source(source_id, index_name, account_id)
356
+ await self.delete_source(source_id, index_name)
341
357
 
342
358
  # Re-index
343
359
  return await self.load_and_index(
344
360
  source=source,
345
361
  index_name=index_name,
346
- account_id=account_id,
347
362
  collection_id=collection_id,
348
363
  source_id=source_id,
349
364
  **options,
@@ -1,4 +1,12 @@
1
- """Knowledge search service."""
1
+ """Knowledge search service.
2
+
3
+ This service provides a high-level interface for searching knowledge documents
4
+ using semantic, keyword, and hybrid search modes.
5
+
6
+ Note:
7
+ This service is tenant-agnostic. Multi-tenancy should be handled at the
8
+ API layer by using separate indices per account (e.g., knowledge-{account_id}).
9
+ """
2
10
 
3
11
  from __future__ import annotations
4
12
 
@@ -70,7 +78,6 @@ class KnowledgeSearchService:
70
78
  mode: SearchMode = SearchMode.HYBRID,
71
79
  limit: int = 10,
72
80
  offset: int = 0,
73
- account_id: str | None = None,
74
81
  collection_ids: list[str] | None = None,
75
82
  source_ids: list[str] | None = None,
76
83
  min_score: float | None = None,
@@ -78,13 +85,16 @@ class KnowledgeSearchService:
78
85
  ) -> SearchResult:
79
86
  """Search for knowledge documents.
80
87
 
88
+ Note:
89
+ This method is tenant-agnostic. Multi-tenancy should be handled
90
+ at the API layer by using separate indices per account.
91
+
81
92
  Args:
82
93
  query: Search query text.
83
94
  index_name: Index to search (uses default if not provided).
84
95
  mode: Search mode (semantic, keyword, hybrid).
85
96
  limit: Maximum results.
86
97
  offset: Result offset for pagination.
87
- account_id: Account ID for multi-tenancy.
88
98
  collection_ids: Filter by collection IDs.
89
99
  source_ids: Filter by source IDs.
90
100
  min_score: Minimum score threshold.
@@ -105,7 +115,6 @@ class KnowledgeSearchService:
105
115
  mode=mode,
106
116
  limit=limit,
107
117
  offset=offset,
108
- account_id=account_id,
109
118
  collection_ids=collection_ids,
110
119
  source_ids=source_ids,
111
120
  min_score=min_score,
@@ -133,17 +142,19 @@ class KnowledgeSearchService:
133
142
  *,
134
143
  index_name: str | None = None,
135
144
  limit: int = 10,
136
- account_id: str | None = None,
137
145
  collection_ids: list[str] | None = None,
138
146
  **options: Any,
139
147
  ) -> SearchResult:
140
148
  """Execute semantic (vector) search.
141
149
 
150
+ Note:
151
+ This method is tenant-agnostic. Multi-tenancy should be handled
152
+ at the API layer by using separate indices per account.
153
+
142
154
  Args:
143
155
  query: Search query text.
144
156
  index_name: Index to search.
145
157
  limit: Maximum results.
146
- account_id: Account ID for multi-tenancy.
147
158
  collection_ids: Filter by collection IDs.
148
159
  **options: Additional options.
149
160
 
@@ -155,7 +166,6 @@ class KnowledgeSearchService:
155
166
  index_name=index_name,
156
167
  mode=SearchMode.SEMANTIC,
157
168
  limit=limit,
158
- account_id=account_id,
159
169
  collection_ids=collection_ids,
160
170
  **options,
161
171
  )
@@ -166,17 +176,19 @@ class KnowledgeSearchService:
166
176
  *,
167
177
  index_name: str | None = None,
168
178
  limit: int = 10,
169
- account_id: str | None = None,
170
179
  collection_ids: list[str] | None = None,
171
180
  **options: Any,
172
181
  ) -> SearchResult:
173
182
  """Execute keyword (BM25) search.
174
183
 
184
+ Note:
185
+ This method is tenant-agnostic. Multi-tenancy should be handled
186
+ at the API layer by using separate indices per account.
187
+
175
188
  Args:
176
189
  query: Search query text.
177
190
  index_name: Index to search.
178
191
  limit: Maximum results.
179
- account_id: Account ID for multi-tenancy.
180
192
  collection_ids: Filter by collection IDs.
181
193
  **options: Additional options.
182
194
 
@@ -188,7 +200,6 @@ class KnowledgeSearchService:
188
200
  index_name=index_name,
189
201
  mode=SearchMode.KEYWORD,
190
202
  limit=limit,
191
- account_id=account_id,
192
203
  collection_ids=collection_ids,
193
204
  **options,
194
205
  )
@@ -199,7 +210,6 @@ class KnowledgeSearchService:
199
210
  *,
200
211
  index_name: str | None = None,
201
212
  limit: int = 10,
202
- account_id: str | None = None,
203
213
  collection_ids: list[str] | None = None,
204
214
  semantic_weight: float = 0.7,
205
215
  keyword_weight: float = 0.3,
@@ -207,11 +217,14 @@ class KnowledgeSearchService:
207
217
  ) -> SearchResult:
208
218
  """Execute hybrid search (semantic + keyword).
209
219
 
220
+ Note:
221
+ This method is tenant-agnostic. Multi-tenancy should be handled
222
+ at the API layer by using separate indices per account.
223
+
210
224
  Args:
211
225
  query: Search query text.
212
226
  index_name: Index to search.
213
227
  limit: Maximum results.
214
- account_id: Account ID for multi-tenancy.
215
228
  collection_ids: Filter by collection IDs.
216
229
  semantic_weight: Weight for semantic score.
217
230
  keyword_weight: Weight for keyword score.
@@ -225,7 +238,6 @@ class KnowledgeSearchService:
225
238
  index_name=index_name,
226
239
  mode=SearchMode.HYBRID,
227
240
  limit=limit,
228
- account_id=account_id,
229
241
  collection_ids=collection_ids,
230
242
  semantic_weight=semantic_weight,
231
243
  keyword_weight=keyword_weight,
@@ -264,17 +276,19 @@ class KnowledgeSearchService:
264
276
  index_name: str | None = None,
265
277
  mode: SearchMode = SearchMode.HYBRID,
266
278
  limit: int = 10,
267
- account_id: str | None = None,
268
279
  **options: Any,
269
280
  ) -> list[SearchResult]:
270
281
  """Execute multiple searches in parallel.
271
282
 
283
+ Note:
284
+ This method is tenant-agnostic. Multi-tenancy should be handled
285
+ at the API layer by using separate indices per account.
286
+
272
287
  Args:
273
288
  queries: List of query texts.
274
289
  index_name: Index to search.
275
290
  mode: Search mode.
276
291
  limit: Maximum results per query.
277
- account_id: Account ID for multi-tenancy.
278
292
  **options: Additional options.
279
293
 
280
294
  Returns:
@@ -289,7 +303,6 @@ class KnowledgeSearchService:
289
303
  text=query,
290
304
  mode=mode,
291
305
  limit=limit,
292
- account_id=account_id,
293
306
  )
294
307
  for query in queries
295
308
  ]
@@ -310,15 +323,19 @@ class KnowledgeSearchService:
310
323
  async def count(
311
324
  self,
312
325
  index_name: str | None = None,
313
- account_id: str | None = None,
314
326
  collection_id: str | None = None,
327
+ source_id: str | None = None,
315
328
  ) -> int:
316
329
  """Count documents in index.
317
330
 
331
+ Note:
332
+ This method is tenant-agnostic. Multi-tenancy should be handled
333
+ at the API layer by using separate indices per account.
334
+
318
335
  Args:
319
336
  index_name: Index to count.
320
- account_id: Filter by account.
321
337
  collection_id: Filter by collection.
338
+ source_id: Filter by source (for source deletion confirmation).
322
339
 
323
340
  Returns:
324
341
  Document count.
@@ -327,12 +344,12 @@ class KnowledgeSearchService:
327
344
  if not index:
328
345
  raise SearchError(message="No index specified")
329
346
 
330
- # Build count query
347
+ # Build count query with optional filters
331
348
  query = SearchQuery(
332
349
  text="",
333
350
  limit=0,
334
- account_id=account_id,
335
351
  collection_ids=[collection_id] if collection_id else None,
352
+ source_ids=[source_id] if source_id else None,
336
353
  )
337
354
 
338
355
  # Use a simple match_all to get total count
@@ -2,12 +2,19 @@
2
2
 
3
3
  This module provides the StreamingIndexingPipeline that orchestrates
4
4
  the load -> index pipeline with guaranteed bounded memory usage.
5
+
6
+ Note:
7
+ This module is tenant-agnostic. Multi-tenancy should be handled at the
8
+ API layer by using separate indices per account (e.g.,
9
+ gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
10
+ The account_id parameters are deprecated and will be ignored.
5
11
  """
6
12
 
7
13
  from __future__ import annotations
8
14
 
9
15
  import logging
10
16
  import time
17
+ import warnings
11
18
  from dataclasses import dataclass, field
12
19
  from typing import TYPE_CHECKING, Any
13
20
 
@@ -141,10 +148,16 @@ class StreamingIndexingPipeline:
141
148
  ) -> IndexResult:
142
149
  """Execute the streaming pipeline.
143
150
 
151
+ Note:
152
+ This method is tenant-agnostic. Multi-tenancy should be handled
153
+ at the API layer by using separate indices per account. The
154
+ account_id parameter is deprecated and will be ignored.
155
+
144
156
  Args:
145
157
  source: Sitemap URL.
146
158
  index_name: Target OpenSearch index.
147
- account_id: For multi-tenancy filtering.
159
+ account_id: Deprecated. This parameter is ignored.
160
+ Use index isolation (separate index per account) instead.
148
161
  collection_id: Collection within account.
149
162
  collection_name: Collection name for display.
150
163
  source_id: Source identifier.
@@ -153,6 +166,13 @@ class StreamingIndexingPipeline:
153
166
  Returns:
154
167
  Aggregated index result.
155
168
  """
169
+ if account_id is not None:
170
+ warnings.warn(
171
+ "account_id parameter is deprecated and will be ignored. "
172
+ "Use index isolation (separate index per account) instead.",
173
+ DeprecationWarning,
174
+ stacklevel=2,
175
+ )
156
176
  start_time = time.time()
157
177
  self._progress = StreamingProgress(current_phase="starting")
158
178
  await self._emit_progress()
@@ -167,7 +187,6 @@ class StreamingIndexingPipeline:
167
187
  self._enrich_document(
168
188
  doc,
169
189
  source=source,
170
- account_id=account_id,
171
190
  collection_id=collection_id,
172
191
  collection_name=collection_name,
173
192
  source_id=source_id,
@@ -248,31 +267,44 @@ class StreamingIndexingPipeline:
248
267
  self,
249
268
  doc: Document,
250
269
  source: str,
251
- account_id: str | None,
252
270
  collection_id: str | None,
253
271
  collection_name: str | None,
254
272
  source_id: str | None,
273
+ account_id: str | None = None,
255
274
  ) -> Document:
256
- """Add tenant and source info to document.
275
+ """Add source info to document.
276
+
277
+ Note:
278
+ This method is tenant-agnostic. Multi-tenancy should be handled
279
+ at the API layer by using separate indices per account. The
280
+ account_id parameter is deprecated and will be ignored.
257
281
 
258
282
  Args:
259
283
  doc: Original document.
260
284
  source: Source URL.
261
- account_id: Account identifier.
262
285
  collection_id: Collection identifier.
263
286
  collection_name: Collection name for display.
264
287
  source_id: Source identifier.
288
+ account_id: Deprecated. This parameter is ignored.
289
+ Use index isolation (separate index per account) instead.
265
290
 
266
291
  Returns:
267
- New Document with tenant info.
292
+ New Document with source info.
268
293
  """
294
+ if account_id is not None:
295
+ warnings.warn(
296
+ "account_id parameter is deprecated and will be ignored. "
297
+ "Use index isolation (separate index per account) instead.",
298
+ DeprecationWarning,
299
+ stacklevel=2,
300
+ )
301
+
269
302
  return Document(
270
303
  content=doc.content,
271
304
  source=source,
272
305
  doc_id=doc.doc_id,
273
306
  url=doc.url,
274
307
  title=doc.title,
275
- account_id=account_id,
276
308
  collection_id=collection_id,
277
309
  collection_name=collection_name,
278
310
  source_id=source_id,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gnosisllm-knowledge
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Enterprise-grade knowledge loading, indexing, and search for Python
5
5
  License: MIT
6
6
  Keywords: knowledge-base,rag,semantic-search,vector-search,opensearch,llm,embeddings,enterprise
@@ -46,7 +46,7 @@ Enterprise-grade knowledge loading, indexing, and semantic search library for Py
46
46
  - **Multiple Loaders**: Load content from websites, sitemaps, and files
47
47
  - **Intelligent Chunking**: Sentence-aware text splitting with configurable overlap
48
48
  - **OpenSearch Backend**: Production-ready with k-NN vector search
49
- - **Multi-Tenancy**: Built-in support for account and collection isolation
49
+ - **Multi-Tenancy**: Index isolation for complete tenant separation (tenant-agnostic library)
50
50
  - **Event-Driven**: Observer pattern for progress tracking and monitoring
51
51
  - **SOLID Architecture**: Clean, maintainable, and extensible codebase
52
52
 
@@ -144,14 +144,15 @@ gnosisllm-knowledge load <URL> [OPTIONS]
144
144
 
145
145
  Options:
146
146
  --type Source type: website, sitemap (auto-detects)
147
- --index Target index name (default: knowledge)
148
- --account-id Multi-tenant account ID
147
+ --index Target index name (e.g., knowledge-tenant-123)
149
148
  --collection-id Collection grouping ID
150
149
  --batch-size Documents per batch (default: 100)
151
150
  --max-urls Max URLs from sitemap (default: 1000)
152
151
  --dry-run Preview without indexing
153
152
  ```
154
153
 
154
+ Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names (e.g., `--index knowledge-tenant-123`).
155
+
155
156
  ### Search
156
157
 
157
158
  Search indexed content with multiple modes:
@@ -161,14 +162,15 @@ gnosisllm-knowledge search <QUERY> [OPTIONS]
161
162
 
162
163
  Options:
163
164
  --mode Search mode: semantic, keyword, hybrid, agentic
164
- --index Index to search (default: knowledge)
165
+ --index Index to search (e.g., knowledge-tenant-123)
165
166
  --limit Max results (default: 5)
166
- --account-id Filter by account
167
167
  --collection-ids Filter by collections (comma-separated)
168
168
  --json Output as JSON for scripting
169
169
  --interactive Interactive search session
170
170
  ```
171
171
 
172
+ Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names.
173
+
172
174
  ## Architecture
173
175
 
174
176
  ```
@@ -319,22 +321,40 @@ agent_body = {
319
321
 
320
322
  ## Multi-Tenancy
321
323
 
324
+ This library is **tenant-agnostic**. Multi-tenancy is achieved through **index isolation** - each tenant gets their own OpenSearch index.
325
+
322
326
  ```python
323
- # Load with tenant isolation
327
+ # The calling application (e.g., API) constructs tenant-specific index names
328
+ index_name = f"knowledge-{account_id}"
329
+
330
+ # Create Knowledge instance for the tenant
331
+ knowledge = Knowledge.from_opensearch(
332
+ host="localhost",
333
+ port=9200,
334
+ index_prefix=index_name, # knowledge-tenant-123
335
+ )
336
+
337
+ # Load content to tenant's isolated index
324
338
  await knowledge.load(
325
339
  source="https://docs.example.com/sitemap.xml",
326
- account_id="tenant-123",
327
340
  collection_id="docs",
328
341
  )
329
342
 
330
- # Search within tenant
343
+ # Search within tenant's index (no account_id filter needed)
331
344
  results = await knowledge.search(
332
345
  "query",
333
- account_id="tenant-123",
334
346
  collection_ids=["docs"],
335
347
  )
336
348
  ```
337
349
 
350
+ **Note**: For audit purposes, you can store `account_id` in document metadata:
351
+ ```python
352
+ await knowledge.load(
353
+ source="https://docs.example.com/sitemap.xml",
354
+ document_defaults={"metadata": {"account_id": "tenant-123"}},
355
+ )
356
+ ```
357
+
338
358
  ## Agentic Memory
339
359
 
340
360
  Conversational memory with automatic fact extraction using OpenSearch's ML Memory plugin.