gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/loaders/factory.py

@@ -9,7 +9,11 @@ from typing import Any
 from gnosisllm_knowledge.core.events.emitter import EventEmitter
 from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
 from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
+from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
 from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
 from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
 from gnosisllm_knowledge.loaders.website import WebsiteLoader
 
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
 ]
 
 
+def _create_discovery_loader(
+    fetcher: IContentFetcher,
+    chunker: ITextChunker,
+    config: dict[str, Any] | None,
+    event_emitter: EventEmitter | None,
+) -> DiscoveryLoader:
+    """Factory function for creating DiscoveryLoader instances.
+
+    Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
+    is a NeoreaderContentFetcher, reuses its config to ensure consistency.
+    Otherwise, creates config from environment variables.
+
+    Args:
+        fetcher: Content fetcher for retrieving URL content.
+        chunker: Text chunker for splitting content.
+        config: Optional configuration dictionary.
+        event_emitter: Optional event emitter for progress events.
+
+    Returns:
+        Configured DiscoveryLoader instance.
+    """
+    # Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
+    if isinstance(fetcher, NeoreaderContentFetcher):
+        neoreader_config = fetcher.config
+    else:
+        neoreader_config = NeoreaderConfig.from_env()
+
+    discovery_client = NeoreaderDiscoveryClient(neoreader_config)
+    return DiscoveryLoader(
+        fetcher=fetcher,
+        chunker=chunker,
+        discovery_client=discovery_client,
+        config=config,
+        event_emitter=event_emitter,
+    )
+
+
 class LoaderFactory:
     """Factory for creating content loaders (Registry Pattern).
 
@@ -29,6 +70,7 @@ class LoaderFactory:
     Built-in loaders:
     - website: Single URL loading
    - sitemap: Sitemap XML with recursive discovery
+    - discovery: Website crawling via Neo Reader Discovery API
 
     Example:
         ```python
@@ -40,6 +82,9 @@ class LoaderFactory:
        # Explicit type
         loader = factory.create("sitemap", config={"max_urls": 500})
 
+        # Discovery loader for full website crawling
+        loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
+
         # Register custom loader
         factory.register("custom", MyCustomLoader)
         ```
@@ -76,6 +121,7 @@ class LoaderFactory:
         """Register built-in loader types."""
         self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
         self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
+        self.register("discovery", _create_discovery_loader)
 
     def register(self, name: str, creator: LoaderCreator) -> None:
         """Register a loader type.
gnosisllm_knowledge/services/indexing.py

@@ -1,4 +1,13 @@
-"""Knowledge indexing service."""
+"""Knowledge indexing service.
+
+This service orchestrates the document ingestion pipeline from source to index,
+including loading, chunking, and indexing.
+
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    `knowledge-{account_id}`) rather than filtering by account_id.
+"""
 
 from __future__ import annotations
 
@@ -82,7 +91,6 @@ class KnowledgeIndexingService:
         source: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         source_id: str | None = None,
         batch_size: int = 100,
@@ -93,10 +101,13 @@
         Uses streaming to process and index documents as they're fetched,
         avoiding memory issues with large sitemaps.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             source: Source URL or path.
-            index_name: Target index name.
-            account_id: Account ID for multi-tenancy.
+            index_name: Target index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
             batch_size: Documents per batch for indexing.
@@ -108,6 +119,16 @@
         source_id = source_id or str(uuid.uuid4())
         document_defaults = options.pop("document_defaults", {})
 
+        # Extract metadata from document_defaults to merge with doc.metadata later
+        # This allows callers to pass custom metadata (e.g., parent_collection_id)
+        # without conflicting with the explicit metadata= parameter
+        extra_metadata = document_defaults.pop("metadata", {})
+
+        # Ensure index exists with correct mappings before indexing
+        # This prevents OpenSearch from auto-creating the index with dynamic mapping
+        # which would map keyword fields (like collection_id) as text fields
+        await self._indexer.ensure_index(index_name)
+
         # Emit batch started event
         await self._events.emit_async(
             BatchStartedEvent(
@@ -127,21 +148,22 @@
         # Stream documents and index in batches as they arrive
         # Note: Loader already chunks content, so we don't re-chunk here
         async for doc in self._loader.load_streaming(source, **options):
-            # Enrich document with tenant info
+            # Enrich document with collection info
+            # Merge doc.metadata with extra_metadata from document_defaults
+            merged_metadata = {**doc.metadata, **extra_metadata}
             enriched_doc = Document(
                 content=doc.content,
                 source=source,
                 doc_id=doc.doc_id,
                 url=doc.url,
                 title=doc.title,
-                account_id=account_id,
                 collection_id=collection_id,
                 source_id=source_id,
                 chunk_index=doc.chunk_index,
                 total_chunks=doc.total_chunks,
                 parent_doc_id=doc.parent_doc_id,
                 status=DocumentStatus.INDEXED,
-                metadata=doc.metadata,
+                metadata=merged_metadata,
                 **document_defaults,
             )
 
@@ -213,6 +235,9 @@
         Returns:
            Index result.
         """
+        # Ensure index exists with correct mappings before indexing
+        await self._indexer.ensure_index(index_name)
+
         total_indexed = 0
         total_failed = 0
         errors: list[str] = []
@@ -230,8 +255,8 @@
                     doc_id=f"{doc.doc_id}-chunk-{i}",
                     url=doc.url,
                     title=doc.title,
-                    account_id=doc.account_id,
                     collection_id=doc.collection_id,
+                    collection_name=doc.collection_name,
                     source_id=doc.source_id,
                     chunk_index=i,
                     total_chunks=len(chunks),
@@ -271,14 +296,16 @@
         self,
         source_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a source.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             source_id: Source ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
 
         Returns:
             Count of deleted documents.
@@ -287,21 +314,23 @@
             build_delete_by_source_query,
         )
 
-        query = build_delete_by_source_query(source_id, account_id)
+        query = build_delete_by_source_query(source_id)
         return await self._indexer.delete_by_query(query, index_name)
 
     async def delete_collection(
         self,
         collection_id: str,
         index_name: str,
-        account_id: str | None = None,
     ) -> int:
         """Delete all documents from a collection.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             collection_id: Collection ID to delete.
-            index_name: Index name.
-            account_id: Optional account filter.
+            index_name: Index name (use tenant-specific name for isolation).
 
         Returns:
             Count of deleted documents.
@@ -310,7 +339,7 @@
             build_delete_by_collection_query,
         )
 
-        query = build_delete_by_collection_query(collection_id, account_id)
+        query = build_delete_by_collection_query(collection_id)
         return await self._indexer.delete_by_query(query, index_name)
 
     async def reindex_source(
@@ -319,17 +348,19 @@
         source_id: str,
         index_name: str,
         *,
-        account_id: str | None = None,
         collection_id: str | None = None,
         **options: Any,
     ) -> IndexResult:
         """Reindex a source by deleting and re-loading.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             source: Source URL or path.
             source_id: Existing source ID.
-            index_name: Index name.
-            account_id: Account ID.
+            index_name: Index name (use tenant-specific name for isolation).
             collection_id: Collection ID.
             **options: Additional options.
 
@@ -337,13 +368,12 @@
             Index result.
         """
         # Delete existing documents
-        await self.delete_source(source_id, index_name, account_id)
+        await self.delete_source(source_id, index_name)
 
         # Re-index
         return await self.load_and_index(
             source=source,
             index_name=index_name,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
             **options,
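
The removal of account_id across the indexing service means callers now isolate tenants by index name. A hedged sketch of the new calling convention, assuming an already constructed KnowledgeIndexingService and a hypothetical sitemap URL:

```python
from gnosisllm_knowledge.services.indexing import KnowledgeIndexingService


async def ingest_for_tenant(indexing: KnowledgeIndexingService, account_id: str) -> None:
    # Tenant isolation now comes from the index name, not an account_id argument.
    await indexing.load_and_index(
        source="https://docs.example.com/sitemap.xml",  # hypothetical source
        index_name=f"knowledge-{account_id}",           # per-account index, as the docstring suggests
        collection_id="docs",
        batch_size=100,
    )
```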
gnosisllm_knowledge/services/search.py

@@ -1,4 +1,12 @@
-"""Knowledge search service."""
+"""Knowledge search service.
+
+This service provides a high-level interface for searching knowledge documents
+using semantic, keyword, and hybrid search modes.
+
+Note:
+    This service is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g., knowledge-{account_id}).
+"""
 
 from __future__ import annotations
 
@@ -70,7 +78,6 @@
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
         offset: int = 0,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         source_ids: list[str] | None = None,
         min_score: float | None = None,
@@ -78,13 +85,16 @@
     ) -> SearchResult:
         """Search for knowledge documents.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search (uses default if not provided).
             mode: Search mode (semantic, keyword, hybrid).
             limit: Maximum results.
             offset: Result offset for pagination.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             source_ids: Filter by source IDs.
             min_score: Minimum score threshold.
@@ -105,7 +115,6 @@
             mode=mode,
             limit=limit,
             offset=offset,
-            account_id=account_id,
             collection_ids=collection_ids,
             source_ids=source_ids,
             min_score=min_score,
@@ -133,17 +142,19 @@
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute semantic (vector) search.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
 
@@ -155,7 +166,6 @@
             index_name=index_name,
             mode=SearchMode.SEMANTIC,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -166,17 +176,19 @@
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         **options: Any,
     ) -> SearchResult:
         """Execute keyword (BM25) search.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             **options: Additional options.
 
@@ -188,7 +200,6 @@
             index_name=index_name,
             mode=SearchMode.KEYWORD,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             **options,
         )
@@ -199,7 +210,6 @@
         *,
         index_name: str | None = None,
         limit: int = 10,
-        account_id: str | None = None,
         collection_ids: list[str] | None = None,
         semantic_weight: float = 0.7,
         keyword_weight: float = 0.3,
@@ -207,11 +217,14 @@
     ) -> SearchResult:
         """Execute hybrid search (semantic + keyword).
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             query: Search query text.
             index_name: Index to search.
             limit: Maximum results.
-            account_id: Account ID for multi-tenancy.
             collection_ids: Filter by collection IDs.
             semantic_weight: Weight for semantic score.
             keyword_weight: Weight for keyword score.
@@ -225,7 +238,6 @@
             index_name=index_name,
             mode=SearchMode.HYBRID,
             limit=limit,
-            account_id=account_id,
             collection_ids=collection_ids,
             semantic_weight=semantic_weight,
             keyword_weight=keyword_weight,
@@ -264,17 +276,19 @@
         index_name: str | None = None,
         mode: SearchMode = SearchMode.HYBRID,
         limit: int = 10,
-        account_id: str | None = None,
         **options: Any,
     ) -> list[SearchResult]:
         """Execute multiple searches in parallel.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             queries: List of query texts.
             index_name: Index to search.
             mode: Search mode.
             limit: Maximum results per query.
-            account_id: Account ID for multi-tenancy.
             **options: Additional options.
 
         Returns:
@@ -289,7 +303,6 @@
                 text=query,
                 mode=mode,
                 limit=limit,
-                account_id=account_id,
             )
             for query in queries
         ]
@@ -310,15 +323,22 @@
     async def count(
         self,
         index_name: str | None = None,
-        account_id: str | None = None,
         collection_id: str | None = None,
+        source_id: str | None = None,
     ) -> int:
         """Count documents in index.
 
+        Uses native count API instead of search for efficiency and to avoid
+        hybrid search issues with empty queries.
+
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account.
+
         Args:
             index_name: Index to count.
-            account_id: Filter by account.
             collection_id: Filter by collection.
+            source_id: Filter by source (for source deletion confirmation).
 
         Returns:
             Document count.
@@ -327,18 +347,12 @@
         if not index:
             raise SearchError(message="No index specified")
 
-        # Build count query
-        query = SearchQuery(
-            text="",
-            limit=0,
-            account_id=account_id,
-            collection_ids=[collection_id] if collection_id else None,
+        return await self._searcher.count(
+            index_name=index,
+            collection_id=collection_id,
+            source_id=source_id,
         )
 
-        # Use a simple match_all to get total count
-        result = await self._searcher.search(query, index)
-        return result.total_hits
-
     async def get_collections(
         self,
         index_name: str | None = None,
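
The count() rewrite delegates to the backend's native count API and gains a source_id filter. Below is a sketch of the updated call sites, assuming an existing KnowledgeSearchService instance and the same per-account index naming convention as above:

```python
from gnosisllm_knowledge.services.search import KnowledgeSearchService


async def tenant_counts(
    search: KnowledgeSearchService, account_id: str, source_id: str
) -> tuple[int, int]:
    index = f"knowledge-{account_id}"  # tenant isolation via index name
    in_collection = await search.count(index_name=index, collection_id="docs")
    # The source_id filter is intended for confirming that a source deletion
    # actually removed its documents (should return 0 afterwards).
    from_source = await search.count(index_name=index, source_id=source_id)
    return in_collection, from_source
```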
gnosisllm_knowledge/services/streaming_pipeline.py

@@ -2,12 +2,19 @@
 
 This module provides the StreamingIndexingPipeline that orchestrates
 the load -> index pipeline with guaranteed bounded memory usage.
+
+Note:
+    This module is tenant-agnostic. Multi-tenancy should be handled at the
+    API layer by using separate indices per account (e.g.,
+    gnosisllm-{account_id}-knowledge) rather than filtering by account_id.
+    The account_id parameters are deprecated and will be ignored.
 """
 
 from __future__ import annotations
 
 import logging
 import time
+import warnings
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
 
@@ -141,10 +148,16 @@
     ) -> IndexResult:
         """Execute the streaming pipeline.
 
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account. The
+            account_id parameter is deprecated and will be ignored.
+
         Args:
             source: Sitemap URL.
             index_name: Target OpenSearch index.
-            account_id: For multi-tenancy filtering.
+            account_id: Deprecated. This parameter is ignored.
+                Use index isolation (separate index per account) instead.
             collection_id: Collection within account.
             collection_name: Collection name for display.
             source_id: Source identifier.
@@ -153,6 +166,19 @@
         Returns:
             Aggregated index result.
         """
+        if account_id is not None:
+            warnings.warn(
+                "account_id parameter is deprecated and will be ignored. "
+                "Use index isolation (separate index per account) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        # Ensure index exists with correct mappings before indexing
+        # This prevents OpenSearch from auto-creating the index with dynamic mapping
+        # which would map keyword fields (like collection_id) as text fields
+        await self._indexer.ensure_index(index_name)
+
         start_time = time.time()
         self._progress = StreamingProgress(current_phase="starting")
         await self._emit_progress()
@@ -167,7 +193,6 @@
                 self._enrich_document(
                     doc,
                     source=source,
-                    account_id=account_id,
                     collection_id=collection_id,
                     collection_name=collection_name,
                     source_id=source_id,
@@ -248,31 +273,44 @@
         self,
         doc: Document,
         source: str,
-        account_id: str | None,
         collection_id: str | None,
         collection_name: str | None,
         source_id: str | None,
+        account_id: str | None = None,
     ) -> Document:
-        """Add tenant and source info to document.
+        """Add source info to document.
+
+        Note:
+            This method is tenant-agnostic. Multi-tenancy should be handled
+            at the API layer by using separate indices per account. The
+            account_id parameter is deprecated and will be ignored.
 
         Args:
             doc: Original document.
             source: Source URL.
-            account_id: Account identifier.
             collection_id: Collection identifier.
             collection_name: Collection name for display.
             source_id: Source identifier.
+            account_id: Deprecated. This parameter is ignored.
+                Use index isolation (separate index per account) instead.
 
         Returns:
-            New Document with tenant info.
+            New Document with source info.
         """
+        if account_id is not None:
+            warnings.warn(
+                "account_id parameter is deprecated and will be ignored. "
+                "Use index isolation (separate index per account) instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
         return Document(
            content=doc.content,
             source=source,
             doc_id=doc.doc_id,
             url=doc.url,
             title=doc.title,
-            account_id=account_id,
             collection_id=collection_id,
             collection_name=collection_name,
             source_id=source_id,
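
The streaming pipeline keeps account_id in its signatures but warns and ignores it. The self-contained sketch below mirrors that guard with a hypothetical helper, _warn_deprecated_account_id, to show what legacy callers will observe:

```python
import warnings


def _warn_deprecated_account_id(account_id: str | None) -> None:
    # Hypothetical helper mirroring the inline guard added in this diff.
    if account_id is not None:
        warnings.warn(
            "account_id parameter is deprecated and will be ignored. "
            "Use index isolation (separate index per account) instead.",
            DeprecationWarning,
            stacklevel=2,
        )


# Legacy callers that still pass account_id keep working but see a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _warn_deprecated_account_id("acme")

assert caught and issubclass(caught[0].category, DeprecationWarning)
```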