gnosisllm-knowledge 0.2.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry; it is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/cli/commands/load.py
@@ -0,0 +1,369 @@
"""Load command for indexing content from URLs or sitemaps.

Fetches content, chunks it for optimal embedding, and indexes
into OpenSearch with automatic embedding generation via ingest pipeline.
"""

from __future__ import annotations

import sys
from typing import TYPE_CHECKING

from opensearchpy import AsyncOpenSearch
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
)

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
from gnosisllm_knowledge.backends.opensearch.indexer import OpenSearchIndexer
from gnosisllm_knowledge.chunking.sentence import SentenceChunker
from gnosisllm_knowledge.cli.display.service import RichDisplayService
from gnosisllm_knowledge.cli.utils.config import CliConfig
from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
from gnosisllm_knowledge.loaders.factory import LoaderFactory

if TYPE_CHECKING:
    pass


async def load_command(
    display: RichDisplayService,
    source: str,
    source_type: str | None = None,
    index_name: str = "knowledge",
    account_id: str | None = None,
    collection_id: str | None = None,
    source_id: str | None = None,
    batch_size: int = 100,
    max_urls: int = 1000,
    force: bool = False,
    dry_run: bool = False,
    verbose: bool = False,
) -> None:
    """Execute the load command.

    Args:
        display: Display service for output.
        source: URL or sitemap to load content from.
        source_type: Source type (website, sitemap) or auto-detect.
        index_name: Target index name.
        account_id: Multi-tenant account ID.
        collection_id: Collection grouping ID.
        source_id: Source identifier (defaults to URL).
        batch_size: Documents per indexing batch.
        max_urls: Maximum URLs to process from sitemap.
        force: Delete existing source documents first.
        dry_run: Preview without indexing.
        verbose: Show per-document progress.
    """
    # Load configuration
    cli_config = CliConfig.from_env()

    # Auto-detect source type
    detected_type = source_type
    if not detected_type:
        if "sitemap" in source.lower() or source.endswith(".xml"):
            detected_type = "sitemap"
        else:
            detected_type = "website"

    # Default source_id to URL
    final_source_id = source_id or source

    # Display header
    display.header(
        "GnosisLLM Knowledge Loader",
        f"Loading from: {source[:60]}{'...' if len(source) > 60 else ''}",
    )

    # Show configuration
    config_rows = [
        ("Source", source[:50] + "..." if len(source) > 50 else source),
        ("Type", f"{detected_type} {'(auto-detected)' if not source_type else ''}"),
        ("Target Index", index_name),
        ("Batch Size", str(batch_size)),
        ("Max URLs", str(max_urls) if detected_type == "sitemap" else "N/A"),
        ("Neoreader", cli_config.neoreader_host),
        ("OpenSearch", f"{cli_config.opensearch_host}:{cli_config.opensearch_port}"),
    ]

    if account_id:
        config_rows.append(("Account ID", account_id))
    if collection_id:
        config_rows.append(("Collection ID", collection_id))
    if force:
        config_rows.append(("Force Reload", "Yes"))
    if dry_run:
        config_rows.append(("Dry Run", "Yes (no indexing)"))

    display.table("Configuration", config_rows)
    display.newline()

    # Create fetcher
    neoreader_config = NeoreaderConfig(host=cli_config.neoreader_host)
    fetcher = NeoreaderContentFetcher(neoreader_config)

    # Check Neoreader health
    display.info("Checking Neoreader connection...")
    if await fetcher.health_check():
        display.success("Neoreader connected")
    else:
        display.warning(f"Cannot connect to Neoreader at {cli_config.neoreader_host}")
        display.info("Continuing with fallback HTTP fetcher...")

    # Create loader
    chunker = SentenceChunker()
    loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)

    try:
        loader = loader_factory.create(detected_type)
    except ValueError as e:
        display.format_error_with_suggestion(
            error=f"Invalid source: {e}",
            suggestion="Check the URL format or specify --type explicitly.",
            command="gnosisllm-knowledge load <url> --type sitemap",
        )
        sys.exit(1)

    # Configure sitemap loader if applicable
    if detected_type == "sitemap":
        loader.max_urls = max_urls

    display.newline()

    # Discover URLs
    display.info("Discovering URLs...")
    with display.loading_spinner("Discovering..."):
        validation = await loader.validate_source(source)

    if not validation.valid:
        display.format_error_with_suggestion(
            error=f"Source validation failed: {validation.message}",
            suggestion="Check that the URL is accessible.",
        )
        sys.exit(1)

    # Load documents
    documents: list[Document] = []
    url_count = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=display.console,
    ) as progress:
        load_task = progress.add_task("Loading content...", total=None)

        async for doc in loader.load_streaming(source):
            documents.append(doc)
            url_count += 1
            progress.update(load_task, advance=1, description=f"Loading... ({url_count} docs)")

            if url_count >= max_urls and detected_type == "sitemap":
                break

        progress.update(load_task, completed=url_count)

    display.success(f"Loaded {len(documents)} documents")

    if not documents:
        display.warning("No documents found. Check the source URL.")
        sys.exit(0)

    # Dry run - stop here
    if dry_run:
        display.newline()
        display.panel(
            f"Documents found: {len(documents)}\n\n"
            "Sample URLs:\n"
            + "\n".join(f"  • {d.url}" for d in documents[:5])
            + (f"\n  ... and {len(documents) - 5} more" if len(documents) > 5 else ""),
            title="Dry Run Complete",
            style="info",
        )
        return

    # Chunk documents
    display.newline()
    display.info("Chunking documents for optimal embedding...")

    chunker = SentenceChunker()
    chunked_documents: list[Document] = []

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=display.console,
    ) as progress:
        chunk_task = progress.add_task("Chunking...", total=len(documents))

        for doc in documents:
            chunks = chunker.chunk(doc.content)

            if len(chunks) == 1:
                # Single chunk - use original document
                chunked_doc = Document(
                    content=doc.content,
                    url=doc.url,
                    title=doc.title,
                    source=final_source_id,
                    account_id=account_id,
                    collection_id=collection_id,
                    source_id=final_source_id,
                    metadata=doc.metadata,
                    status=DocumentStatus.PENDING,
                )
                chunked_documents.append(chunked_doc)
            else:
                # Multiple chunks - create chunk documents
                for i, chunk in enumerate(chunks):
                    chunk_doc = Document(
                        content=chunk.content,
                        url=doc.url,
                        title=doc.title,
                        source=final_source_id,
                        account_id=account_id,
                        collection_id=collection_id,
                        source_id=final_source_id,
                        chunk_index=i,
                        total_chunks=len(chunks),
                        parent_doc_id=doc.doc_id,
                        metadata={**(doc.metadata or {}), "chunk_start": chunk.start_position},
                        status=DocumentStatus.PENDING,
                    )
                    chunked_documents.append(chunk_doc)

            progress.update(chunk_task, advance=1)

    display.success(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")

    # Create OpenSearch client
    http_auth = None
    if cli_config.opensearch_username and cli_config.opensearch_password:
        http_auth = (cli_config.opensearch_username, cli_config.opensearch_password)

    client = AsyncOpenSearch(
        hosts=[{"host": cli_config.opensearch_host, "port": cli_config.opensearch_port}],
        http_auth=http_auth,
        use_ssl=cli_config.opensearch_use_ssl,
        verify_certs=cli_config.opensearch_verify_certs,
        ssl_show_warn=False,
    )

    try:
        # Create indexer config
        opensearch_config = OpenSearchConfig(
            host=cli_config.opensearch_host,
            port=cli_config.opensearch_port,
            username=cli_config.opensearch_username,
            password=cli_config.opensearch_password,
            use_ssl=cli_config.opensearch_use_ssl,
            verify_certs=cli_config.opensearch_verify_certs,
            model_id=cli_config.opensearch_model_id,
            ingest_pipeline_name=cli_config.opensearch_pipeline_name,
        )

        indexer = OpenSearchIndexer(client, opensearch_config)

        # Ensure index exists
        display.newline()
        display.info(f"Ensuring index '{index_name}' exists...")

        try:
            created = await indexer.ensure_index(index_name)
            if created:
                display.success(f"Created index: {index_name}")
            else:
                display.info(f"Index already exists: {index_name}")
        except Exception as e:
            display.format_error_with_suggestion(
                error=f"Failed to ensure index: {e}",
                suggestion="Run 'gnosisllm-knowledge setup' first to configure OpenSearch.",
            )
            sys.exit(1)

        # Force delete existing if requested
        if force:
            display.info(f"Deleting existing documents from source: {final_source_id}")
            deleted = await indexer.delete_by_query(
                {"query": {"term": {"source_id": final_source_id}}},
                index_name,
            )
            if deleted > 0:
                display.info(f"Deleted {deleted} existing documents")

        # Index documents
        display.newline()
        display.info("Indexing documents...")

        indexed_count = 0
        failed_count = 0

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeElapsedColumn(),
            console=display.console,
        ) as progress:
            index_task = progress.add_task("Indexing...", total=len(chunked_documents))

            # Index in batches
            for i in range(0, len(chunked_documents), batch_size):
                batch = chunked_documents[i : i + batch_size]

                result = await indexer.bulk_index(batch, index_name, batch_size=batch_size)
                indexed_count += result.indexed_count
                failed_count += result.failed_count

                progress.update(index_task, advance=len(batch))

        # Refresh index to make documents searchable
        await indexer.refresh_index(index_name)

        display.newline()

        # Display results
        if failed_count == 0:
            display.panel(
                f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
                f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
                f"Documents Indexed: [green]{indexed_count}[/green]\n"
                f"Index: [cyan]{index_name}[/cyan]\n\n"
                f"Verify with:\n"
                f'  [dim]gnosisllm-knowledge search "your query" --index {index_name}[/dim]',
                title="Loading Complete",
                style="success",
            )
        else:
            display.panel(
                f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
                f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
                f"Documents Indexed: [green]{indexed_count}[/green]\n"
                f"Documents Failed: [red]{failed_count}[/red]\n"
                f"Index: [cyan]{index_name}[/cyan]",
                title="Loading Complete (with errors)",
                style="warning",
            )
            sys.exit(1)

    finally:
        await client.close()