gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff compares the contents of the two package versions as published to their public registry. It is provided for informational purposes only.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
Selected hunks follow, grouped by file.

gnosisllm_knowledge/cli/commands/load.py:

```diff
@@ -2,6 +2,10 @@
 
 Fetches content, chunks it for optimal embedding, and indexes
 into OpenSearch with automatic embedding generation via ingest pipeline.
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
 """
 
 from __future__ import annotations
```
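The tenant-isolation note above replaces the `account_id` filter removed throughout this release. A minimal sketch of the naming scheme the docstring suggests; `tenant_index` is a hypothetical helper, not part of the package:

```python
# Hypothetical helper illustrating the index-isolation pattern from the
# docstring note; "knowledge-{account_id}" is the scheme it suggests.
def tenant_index(account_id: str, prefix: str = "knowledge") -> str:
    """Build a per-tenant index name, e.g. "knowledge-acme"."""
    return f"{prefix}-{account_id}"


assert tenant_index("acme") == "knowledge-acme"
```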
```diff
@@ -26,6 +30,14 @@ from gnosisllm_knowledge.chunking.sentence import SentenceChunker
 from gnosisllm_knowledge.cli.display.service import RichDisplayService
 from gnosisllm_knowledge.cli.utils.config import CliConfig
 from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
+from gnosisllm_knowledge.core.events.emitter import EventEmitter
+from gnosisllm_knowledge.core.events.types import (
+    DiscoveryCompletedEvent,
+    DiscoveryFailedEvent,
+    DiscoveryProgressEvent,
+    DiscoveryStartedEvent,
+    EventType,
+)
 from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
 from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
 from gnosisllm_knowledge.loaders.factory import LoaderFactory
@@ -39,7 +51,6 @@ async def load_command(
     source: str,
     source_type: str | None = None,
     index_name: str = "knowledge",
-    account_id: str | None = None,
     collection_id: str | None = None,
     source_id: str | None = None,
     batch_size: int = 100,
@@ -47,15 +58,22 @@
     force: bool = False,
     dry_run: bool = False,
     verbose: bool = False,
+    discovery: bool = False,
+    max_depth: int = 3,
+    max_pages: int = 100,
+    same_domain: bool = True,
 ) -> None:
     """Execute the load command.
 
+    Note:
+        Multi-tenancy is achieved through index isolation. Use tenant-specific
+        index names instead (e.g., --index knowledge-tenant-123).
+
     Args:
         display: Display service for output.
         source: URL or sitemap to load content from.
-        source_type: Source type (website, sitemap) or auto-detect.
-        index_name: Target index name.
-        account_id: Multi-tenant account ID.
+        source_type: Source type (website, sitemap, discovery) or auto-detect.
+        index_name: Target index name (use tenant-specific name for isolation).
         collection_id: Collection grouping ID.
         source_id: Source identifier (defaults to URL).
         batch_size: Documents per indexing batch.
@@ -63,6 +81,10 @@
         force: Delete existing source documents first.
         dry_run: Preview without indexing.
         verbose: Show per-document progress.
+        discovery: Use discovery loader (website crawling) instead of single URL.
+        max_depth: Maximum crawl depth for discovery (default: 3).
+        max_pages: Maximum pages to discover (default: 100).
+        same_domain: Only crawl URLs on the same domain (default: True).
     """
     # Load configuration
     cli_config = CliConfig.from_env()
```
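Based on the signature above, a call using the new discovery options might look like the following sketch; the `display` service and URL are placeholders:

```python
# Sketch only: parameters match the 0.4.3 signature shown above.
await load_command(
    display,                             # Rich display service, created elsewhere
    source="https://docs.example.com",   # placeholder URL
    index_name="knowledge-tenant-123",   # tenant-specific index, per the note
    discovery=True,                      # crawl the site instead of one URL
    max_depth=2,                         # follow links at most two levels deep
    max_pages=50,                        # hard cap on crawled pages
    same_domain=True,                    # stay on docs.example.com
)
```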
```diff
@@ -70,10 +92,18 @@
     # Auto-detect source type
     detected_type = source_type
     if not detected_type:
-        if "sitemap" in source.lower() or source.endswith(".xml"):
+        if discovery:
+            detected_type = "discovery"
+        elif "sitemap" in source.lower() or source.endswith(".xml"):
             detected_type = "sitemap"
         else:
             detected_type = "website"
+    elif discovery and detected_type != "discovery":
+        # --discovery flag overrides explicit type for website URLs
+        display.warning(
+            f"Using discovery loader (--discovery flag overrides --type {detected_type})"
+        )
+        detected_type = "discovery"
 
     # Default source_id to URL
     final_source_id = source_id or source
@@ -85,18 +115,28 @@
     )
 
     # Show configuration
+    is_auto_detected = not source_type and not discovery
+    type_suffix = " (auto-detected)" if is_auto_detected else ""
     config_rows = [
         ("Source", source[:50] + "..." if len(source) > 50 else source),
-        ("Type", f"{detected_type}
+        ("Type", f"{detected_type}{type_suffix}"),
         ("Target Index", index_name),
         ("Batch Size", str(batch_size)),
-
+    ]
+
+    # Add type-specific configuration
+    if detected_type == "sitemap":
+        config_rows.append(("Max URLs", str(max_urls)))
+    elif detected_type == "discovery":
+        config_rows.append(("Max Depth", str(max_depth)))
+        config_rows.append(("Max Pages", str(max_pages)))
+        config_rows.append(("Same Domain", "Yes" if same_domain else "No"))
+
+    config_rows.extend([
         ("Neoreader", cli_config.neoreader_host),
         ("OpenSearch", f"{cli_config.opensearch_host}:{cli_config.opensearch_port}"),
-    ]
+    ])
 
-    if account_id:
-        config_rows.append(("Account ID", account_id))
     if collection_id:
         config_rows.append(("Collection ID", collection_id))
     if force:
@@ -119,9 +159,16 @@
     display.warning(f"Cannot connect to Neoreader at {cli_config.neoreader_host}")
     display.info("Continuing with fallback HTTP fetcher...")
 
+    # Create event emitter for discovery progress tracking
+    event_emitter = EventEmitter()
+
     # Create loader
     chunker = SentenceChunker()
-    loader_factory = LoaderFactory(
+    loader_factory = LoaderFactory(
+        fetcher=fetcher,
+        chunker=chunker,
+        event_emitter=event_emitter,
+    )
 
     try:
         loader = loader_factory.create(detected_type)
```
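The factory now receives the emitter at construction time so discovery loaders can report progress. Usage, per the hunk above:

```python
# Per the diff: the emitter is injected once, then loaders are created by type.
factory = LoaderFactory(fetcher=fetcher, chunker=chunker, event_emitter=event_emitter)
loader = factory.create("discovery")  # "website" and "sitemap" also supported
```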
```diff
@@ -151,9 +198,48 @@
         )
         sys.exit(1)
 
-    #
+    # Build loader options for discovery
+    loader_options: dict = {}
+    if detected_type == "discovery":
+        loader_options = {
+            "max_depth": max_depth,
+            "max_pages": max_pages,
+            "same_domain": same_domain,
+        }
+
+    # Load documents with discovery progress display
     documents: list[Document] = []
     url_count = 0
+    discovery_state: dict = {"started": False, "completed": False, "job_id": None}
+
+    # Register discovery event handlers for Rich display
+    def _on_discovery_started(event: DiscoveryStartedEvent) -> None:
+        discovery_state["started"] = True
+        discovery_state["job_id"] = event.job_id
+
+    def _on_discovery_progress(event: DiscoveryProgressEvent) -> None:
+        # Update will be handled in the progress context
+        discovery_state["percent"] = event.percent
+        discovery_state["pages_crawled"] = event.pages_crawled
+        discovery_state["urls_discovered"] = event.urls_discovered
+        discovery_state["current_depth"] = event.current_depth
+        discovery_state["message"] = event.message
+
+    def _on_discovery_completed(event: DiscoveryCompletedEvent) -> None:
+        discovery_state["completed"] = True
+        discovery_state["urls_count"] = event.urls_count
+        discovery_state["duration_seconds"] = event.duration_seconds
+
+    def _on_discovery_failed(event: DiscoveryFailedEvent) -> None:
+        discovery_state["failed"] = True
+        discovery_state["error"] = event.error
+
+    # Register discovery event handlers
+    if detected_type == "discovery":
+        event_emitter.add_handler(EventType.DISCOVERY_STARTED, _on_discovery_started)
+        event_emitter.add_handler(EventType.DISCOVERY_PROGRESS, _on_discovery_progress)
+        event_emitter.add_handler(EventType.DISCOVERY_COMPLETED, _on_discovery_completed)
+        event_emitter.add_handler(EventType.DISCOVERY_FAILED, _on_discovery_failed)
 
     with Progress(
         SpinnerColumn(),
```
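Outside the CLI, the same event API can drive any progress display. A minimal sketch using only calls that appear in the diff:

```python
from gnosisllm_knowledge.core.events.emitter import EventEmitter
from gnosisllm_knowledge.core.events.types import DiscoveryProgressEvent, EventType

emitter = EventEmitter()


def on_progress(event: DiscoveryProgressEvent) -> None:
    # Fields mirror those read by the CLI handlers above.
    print(f"depth {event.current_depth}: {event.pages_crawled} pages crawled, "
          f"{event.urls_discovered} URLs found")


emitter.add_handler(EventType.DISCOVERY_PROGRESS, on_progress)
```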
```diff
@@ -164,9 +250,32 @@
         TimeElapsedColumn(),
         console=display.console,
     ) as progress:
-
+        # Add task for discovery phase (if applicable)
+        if detected_type == "discovery":
+            discovery_task = progress.add_task(
+                "Discovering URLs...",
+                total=max_pages,
+            )
+
+        load_task = progress.add_task("Loading content...", total=None, visible=False)
+
+        async for doc in loader.load_streaming(source, **loader_options):
+            # Update discovery progress if available
+            if detected_type == "discovery":
+                if discovery_state.get("started") and not discovery_state.get("completed"):
+                    pages = discovery_state.get("pages_crawled", 0)
+                    urls = discovery_state.get("urls_discovered", 0)
+                    depth = discovery_state.get("current_depth", 0)
+                    progress.update(
+                        discovery_task,
+                        completed=pages,
+                        description=f"Discovering... (depth {depth}, {urls} URLs found)",
+                    )
+                elif discovery_state.get("completed"):
+                    # Hide discovery task and show load task
+                    progress.update(discovery_task, visible=False)
+                    progress.update(load_task, visible=True)
 
-        async for doc in loader.load_streaming(source):
             documents.append(doc)
             url_count += 1
             progress.update(load_task, advance=1, description=f"Loading... ({url_count} docs)")
```
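Consuming the stream directly follows the same shape as the loop above; `doc.url` and `doc.title` are the fields the CLI itself reads. A sketch, to run inside an async function:

```python
# Discovery options are plain keyword arguments, as in the CLI loop above.
loader_options = {"max_depth": 2, "max_pages": 50, "same_domain": True}

async for doc in loader.load_streaming("https://docs.example.com", **loader_options):
    print(doc.url, doc.title)  # placeholder handling
```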
```diff
@@ -176,6 +285,13 @@
 
         progress.update(load_task, completed=url_count)
 
+    # Show discovery summary if applicable
+    if detected_type == "discovery" and discovery_state.get("completed"):
+        display.success(
+            f"Discovered {discovery_state.get('urls_count', 0)} URLs "
+            f"in {discovery_state.get('duration_seconds', 0):.1f}s"
+        )
+
     display.success(f"Loaded {len(documents)} documents")
 
     if not documents:
@@ -223,7 +339,6 @@
                 url=doc.url,
                 title=doc.title,
                 source=final_source_id,
-                account_id=account_id,
                 collection_id=collection_id,
                 source_id=final_source_id,
                 metadata=doc.metadata,
@@ -238,7 +353,6 @@
                 url=doc.url,
                 title=doc.title,
                 source=final_source_id,
-                account_id=account_id,
                 collection_id=collection_id,
                 source_id=final_source_id,
                 chunk_index=i,
@@ -267,8 +381,11 @@
     )
 
     try:
-        # Create indexer config
+        # Create indexer config from environment, preserving k-NN and other settings
+        # This ensures proper vector mappings are used when creating indices
+        base_config = OpenSearchConfig.from_env()
         opensearch_config = OpenSearchConfig(
+            # CLI/CliConfig overrides
             host=cli_config.opensearch_host,
             port=cli_config.opensearch_port,
             username=cli_config.opensearch_username,
@@ -276,7 +393,21 @@
             use_ssl=cli_config.opensearch_use_ssl,
             verify_certs=cli_config.opensearch_verify_certs,
             model_id=cli_config.opensearch_model_id,
-            ingest_pipeline_name=cli_config.
+            ingest_pipeline_name=cli_config.opensearch_ingest_pipeline_name,
+            # Preserve env-based k-NN settings for proper index mappings
+            embedding_model=base_config.embedding_model,
+            embedding_dimension=base_config.embedding_dimension,
+            embedding_field=base_config.embedding_field,
+            knn_engine=base_config.knn_engine,
+            knn_space_type=base_config.knn_space_type,
+            knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
+            knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
+            knn_algo_param_m=base_config.knn_algo_param_m,
+            # Preserve other settings
+            index_prefix=base_config.index_prefix,
+            number_of_shards=base_config.number_of_shards,
+            number_of_replicas=base_config.number_of_replicas,
+            search_pipeline_name=base_config.search_pipeline_name,
         )
 
         indexer = OpenSearchIndexer(client, opensearch_config)
```
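The pattern in this hunk, reduced to its essence: environment values become the defaults, and only CLI-supplied connection fields override them. A sketch using fields shown in the diff:

```python
# Env-derived defaults first, then explicit CLI overrides on top.
base = OpenSearchConfig.from_env()
config = OpenSearchConfig(
    host="localhost",                              # CLI override
    port=9200,                                     # CLI override
    knn_engine=base.knn_engine,                    # preserved from env
    embedding_dimension=base.embedding_dimension,  # preserved from env
)
```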
```diff
@@ -314,6 +445,7 @@
 
         indexed_count = 0
         failed_count = 0
+        all_errors: list[dict] = []
 
         with Progress(
             SpinnerColumn(),
@@ -333,6 +465,8 @@
             result = await indexer.bulk_index(batch, index_name, batch_size=batch_size)
             indexed_count += result.indexed_count
             failed_count += result.failed_count
+            if result.errors:
+                all_errors.extend(result.errors)
 
             progress.update(index_task, advance=len(batch))
 
@@ -354,12 +488,28 @@
                 style="success",
             )
         else:
+            # Build error details section
+            error_details = ""
+            if all_errors:
+                error_details = "\n\n[bold red]Error Details:[/bold red]\n"
+                for i, err in enumerate(all_errors[:5], 1):  # Show first 5 errors
+                    if isinstance(err, dict):
+                        error_type = err.get("error", {}).get("type", "unknown") if isinstance(err.get("error"), dict) else str(err.get("error", "unknown"))
+                        error_reason = err.get("error", {}).get("reason", "No reason provided") if isinstance(err.get("error"), dict) else str(err.get("error", "No details"))
+                        doc_id = err.get("_id", "unknown")
+                        error_details += f"  {i}. [dim]Doc {doc_id}:[/dim] {error_type} - {error_reason}\n"
+                    else:
+                        error_details += f"  {i}. {err}\n"
+                if len(all_errors) > 5:
+                    error_details += f"  ... and {len(all_errors) - 5} more errors\n"
+
             display.panel(
                 f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
                 f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
                 f"Documents Indexed: [green]{indexed_count}[/green]\n"
                 f"Documents Failed: [red]{failed_count}[/red]\n"
-                f"Index: [cyan]{index_name}[/cyan]",
+                f"Index: [cyan]{index_name}[/cyan]"
+                f"{error_details}",
                 title="Loading Complete (with errors)",
                 style="warning",
             )
```
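The error-rendering code above assumes the item shape OpenSearch's bulk API reports: an `error` object with `type` and `reason`, plus the document `_id`. For example:

```python
# One failed bulk item in the format the parser above handles:
err = {
    "_id": "doc-42",
    "error": {
        "type": "mapper_parsing_exception",
        "reason": "failed to parse field [embedding]",
    },
}
# Rendered as: "1. Doc doc-42: mapper_parsing_exception - failed to parse field [embedding]"
```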
gnosisllm_knowledge/cli/commands/memory.py:

```diff
@@ -608,6 +608,14 @@ async def session_list_command(
         limit: Maximum sessions to return.
         json_output: Output as JSON.
     """
+    # Show warning about OpenSearch sessions bug
+    if not json_output:
+        display.warning(
+            "[yellow]⚠ Known Issue:[/yellow] Sessions have a bug in OpenSearch 3.4.0. "
+            "The sessions index is not auto-created. See docs/memory.md for details."
+        )
+        display.newline()
+
     cli_config = CliConfig.from_env()
     memory = Memory.from_config(_create_memory_config(cli_config))
 
@@ -638,6 +646,8 @@ async def session_list_command(
                 for s in sessions
             ],
            "total": len(sessions),
+            "warning": "Sessions have a known bug in OpenSearch 3.4.0. "
+            "The sessions index is not auto-created. See docs/memory.md for details.",
         }
         print(json.dumps(output, indent=2))
         return
```
gnosisllm_knowledge/cli/commands/search.py:

```diff
@@ -5,6 +5,10 @@ Supports multiple search modes:
 - keyword: Traditional BM25 text matching
 - hybrid: Combined semantic + keyword (default, best results)
 - agentic: AI-powered search with reasoning and answer generation
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
 """
 
 from __future__ import annotations
@@ -44,7 +48,6 @@ async def search_command(
     index_name: str = "knowledge",
     limit: int = 5,
     offset: int = 0,
-    account_id: str | None = None,
     collection_ids: str | None = None,
     source_ids: str | None = None,
     min_score: float = 0.0,
@@ -55,14 +58,17 @@
 ) -> None:
     """Execute the search command.
 
+    Note:
+        Multi-tenancy is achieved through index isolation. Use tenant-specific
+        index names instead (e.g., --index knowledge-tenant-123).
+
     Args:
         display: Display service for output.
         query: Search query text.
         mode: Search mode (semantic, keyword, hybrid, agentic).
-        index_name: Index to search.
+        index_name: Index to search (use tenant-specific name for isolation).
         limit: Maximum results to return.
         offset: Pagination offset.
-        account_id: Filter by account ID.
         collection_ids: Filter by collection IDs (comma-separated).
         source_ids: Filter by source IDs (comma-separated).
         min_score: Minimum score threshold.
@@ -86,7 +92,6 @@ async def search_command(
         query=query or "",
         index_name=index_name,
         agent_type="flow",  # Default to flow for single queries
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         limit=limit,
@@ -117,7 +122,6 @@ async def search_command(
         index_name=index_name,
         mode=mode,
         limit=limit,
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         min_score=min_score,
@@ -146,7 +150,6 @@ async def search_command(
         index_name=index_name,
         limit=limit,
         offset=offset,
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         min_score=min_score,
@@ -167,7 +170,6 @@ async def _execute_search(
     index_name: str,
     limit: int,
     offset: int,
-    account_id: str | None,
     collection_ids: str | None,
     source_ids: str | None,
     min_score: float,
@@ -214,7 +216,6 @@ async def _execute_search(
         mode=_get_search_mode(mode),
         limit=limit,
         offset=offset,
-        account_id=account_id,
         collection_ids=collection_list,
         source_ids=source_list,
         min_score=min_score,
@@ -315,7 +316,6 @@ async def _interactive_search(
     index_name: str,
     mode: str,
     limit: int,
-    account_id: str | None,
     collection_ids: str | None,
     source_ids: str | None,
     min_score: float,
@@ -396,7 +396,6 @@ async def _interactive_search(
         mode=_get_search_mode(mode),
         limit=limit,
         offset=0,
-        account_id=account_id,
         collection_ids=collection_list,
         source_ids=source_list,
         min_score=min_score,
```
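For callers migrating from 0.3.0, the removed `account_id` filter maps onto the index name. A hypothetical wrapper (the `searcher.search` shape is illustrative, not the package's exact API):

```python
# Hypothetical migration shim: tenant scoping moves into the index name.
def search_for_tenant(searcher, query: str, tenant_id: str, **kwargs):
    # 0.3.0: searcher.search(query, index_name="knowledge", account_id=tenant_id)
    # 0.4.3: one index per tenant, no account_id parameter.
    return searcher.search(query, index_name=f"knowledge-{tenant_id}", **kwargs)
```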
gnosisllm_knowledge/cli/commands/setup.py:

```diff
@@ -90,8 +90,11 @@ async def setup_command(
 
     display.newline()
 
-    # Create OpenSearch config
+    # Create OpenSearch config from environment, then override with CLI args
+    # This ensures all env vars (including pipeline names) are respected
+    base_config = OpenSearchConfig.from_env()
     opensearch_config = OpenSearchConfig(
+        # CLI overrides (if provided)
         host=final_host,
         port=final_port,
         username=final_username,
@@ -101,6 +104,27 @@
         openai_api_key=cli_config.openai_api_key,
         embedding_model=cli_config.openai_embedding_model,
         embedding_dimension=cli_config.openai_embedding_dimension,
+        # Preserve env-based config for pipelines and other settings
+        ingest_pipeline_name=base_config.ingest_pipeline_name,
+        search_pipeline_name=base_config.search_pipeline_name,
+        index_prefix=base_config.index_prefix,
+        model_id=base_config.model_id,
+        model_group_id=base_config.model_group_id,
+        embedding_field=base_config.embedding_field,
+        # k-NN settings
+        knn_engine=base_config.knn_engine,
+        knn_space_type=base_config.knn_space_type,
+        knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
+        knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
+        knn_algo_param_m=base_config.knn_algo_param_m,
+        # Index settings
+        number_of_shards=base_config.number_of_shards,
+        number_of_replicas=base_config.number_of_replicas,
+        refresh_interval=base_config.refresh_interval,
+        # Agentic settings
+        agentic_llm_model=base_config.agentic_llm_model,
+        agentic_max_iterations=base_config.agentic_max_iterations,
+        agentic_timeout_seconds=base_config.agentic_timeout_seconds,
     )
 
     # Create OpenSearch client
```
gnosisllm_knowledge/cli/utils/config.py:

```diff
@@ -27,7 +27,7 @@ class CliConfig:
     opensearch_verify_certs: bool = False
     opensearch_model_id: str | None = None
     opensearch_index_name: str = "knowledge"
-
+    opensearch_ingest_pipeline_name: str = "gnosisllm-ingest-pipeline"
     opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
 
     # OpenAI
@@ -78,11 +78,11 @@ class CliConfig:
             == "true",
             opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
             opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
-
-            "
+            opensearch_ingest_pipeline_name=os.getenv(
+                "OPENSEARCH_INGEST_PIPELINE", "gnosisllm-ingest-pipeline"
             ),
             opensearch_search_pipeline_name=os.getenv(
-                "
+                "OPENSEARCH_SEARCH_PIPELINE", "gnosisllm-search-pipeline"
             ),
             openai_api_key=os.getenv("OPENAI_API_KEY"),
             openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
```
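The new field and its `from_env` wiring read the environment variables named in the hunk; a quick check of the defaults:

```python
import os

# Variable names and defaults are the ones shown in the diff above.
os.environ.pop("OPENSEARCH_INGEST_PIPELINE", None)  # fall back to defaults
os.environ.pop("OPENSEARCH_SEARCH_PIPELINE", None)

from gnosisllm_knowledge.cli.utils.config import CliConfig

cfg = CliConfig.from_env()
assert cfg.opensearch_ingest_pipeline_name == "gnosisllm-ingest-pipeline"
assert cfg.opensearch_search_pipeline_name == "gnosisllm-search-pipeline"
```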
gnosisllm_knowledge/core/domain/__init__.py:

```diff
@@ -1,5 +1,12 @@
 """Domain models - Value objects and entities."""
 
+from gnosisllm_knowledge.core.domain.discovery import (
+    DiscoveredURL,
+    DiscoveryConfig,
+    DiscoveryJobStatus,
+    DiscoveryProgress,
+    DiscoveryStats,
+)
 from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
 from gnosisllm_knowledge.core.domain.memory import (
     ContainerConfig,
@@ -41,6 +48,12 @@ from gnosisllm_knowledge.core.domain.search import (
 from gnosisllm_knowledge.core.domain.source import SourceConfig
 
 __all__ = [
+    # Discovery
+    "DiscoveredURL",
+    "DiscoveryConfig",
+    "DiscoveryJobStatus",
+    "DiscoveryProgress",
+    "DiscoveryStats",
     # Document
     "Document",
     "DocumentStatus",
```
|