gnosisllm_knowledge-0.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
gnosisllm_knowledge/cli/commands/load.py
@@ -0,0 +1,369 @@
+ """Load command for indexing content from URLs or sitemaps.
+
+ Fetches content, chunks it for optimal embedding, and indexes
+ into OpenSearch with automatic embedding generation via ingest pipeline.
+ """
+
+ from __future__ import annotations
+
+ import sys
+ from typing import TYPE_CHECKING
+
+ from opensearchpy import AsyncOpenSearch
+ from rich.progress import (
+     BarColumn,
+     MofNCompleteColumn,
+     Progress,
+     SpinnerColumn,
+     TaskProgressColumn,
+     TextColumn,
+     TimeElapsedColumn,
+ )
+
+ from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
+ from gnosisllm_knowledge.backends.opensearch.indexer import OpenSearchIndexer
+ from gnosisllm_knowledge.chunking.sentence import SentenceChunker
+ from gnosisllm_knowledge.cli.display.service import RichDisplayService
+ from gnosisllm_knowledge.cli.utils.config import CliConfig
+ from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus
+ from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+ from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+ from gnosisllm_knowledge.loaders.factory import LoaderFactory
+
+ if TYPE_CHECKING:
+     pass
+
+
+ async def load_command(
+     display: RichDisplayService,
+     source: str,
+     source_type: str | None = None,
+     index_name: str = "knowledge",
+     account_id: str | None = None,
+     collection_id: str | None = None,
+     source_id: str | None = None,
+     batch_size: int = 100,
+     max_urls: int = 1000,
+     force: bool = False,
+     dry_run: bool = False,
+     verbose: bool = False,
+ ) -> None:
+     """Execute the load command.
+
+     Args:
+         display: Display service for output.
+         source: URL or sitemap to load content from.
+         source_type: Source type (website, sitemap) or auto-detect.
+         index_name: Target index name.
+         account_id: Multi-tenant account ID.
+         collection_id: Collection grouping ID.
+         source_id: Source identifier (defaults to URL).
+         batch_size: Documents per indexing batch.
+         max_urls: Maximum URLs to process from sitemap.
+         force: Delete existing source documents first.
+         dry_run: Preview without indexing.
+         verbose: Show per-document progress.
+     """
+     # Load configuration
+     cli_config = CliConfig.from_env()
+
+     # Auto-detect source type
+     detected_type = source_type
+     if not detected_type:
+         if "sitemap" in source.lower() or source.endswith(".xml"):
+             detected_type = "sitemap"
+         else:
+             detected_type = "website"
+
+     # Default source_id to URL
+     final_source_id = source_id or source
+
+     # Display header
+     display.header(
+         "GnosisLLM Knowledge Loader",
+         f"Loading from: {source[:60]}{'...' if len(source) > 60 else ''}",
+     )
+
+     # Show configuration
+     config_rows = [
+         ("Source", source[:50] + "..." if len(source) > 50 else source),
+         ("Type", f"{detected_type} {'(auto-detected)' if not source_type else ''}"),
+         ("Target Index", index_name),
+         ("Batch Size", str(batch_size)),
+         ("Max URLs", str(max_urls) if detected_type == "sitemap" else "N/A"),
+         ("Neoreader", cli_config.neoreader_host),
+         ("OpenSearch", f"{cli_config.opensearch_host}:{cli_config.opensearch_port}"),
+     ]
+
+     if account_id:
+         config_rows.append(("Account ID", account_id))
+     if collection_id:
+         config_rows.append(("Collection ID", collection_id))
+     if force:
+         config_rows.append(("Force Reload", "Yes"))
+     if dry_run:
+         config_rows.append(("Dry Run", "Yes (no indexing)"))
+
+     display.table("Configuration", config_rows)
+     display.newline()
+
+     # Create fetcher
+     neoreader_config = NeoreaderConfig(host=cli_config.neoreader_host)
+     fetcher = NeoreaderContentFetcher(neoreader_config)
+
+     # Check Neoreader health
+     display.info("Checking Neoreader connection...")
+     if await fetcher.health_check():
+         display.success("Neoreader connected")
+     else:
+         display.warning(f"Cannot connect to Neoreader at {cli_config.neoreader_host}")
+         display.info("Continuing with fallback HTTP fetcher...")
+
+     # Create loader
+     chunker = SentenceChunker()
+     loader_factory = LoaderFactory(fetcher=fetcher, chunker=chunker)
+
+     try:
+         loader = loader_factory.create(detected_type)
+     except ValueError as e:
+         display.format_error_with_suggestion(
+             error=f"Invalid source: {e}",
+             suggestion="Check the URL format or specify --type explicitly.",
+             command="gnosisllm-knowledge load <url> --type sitemap",
+         )
+         sys.exit(1)
+
+     # Configure sitemap loader if applicable
+     if detected_type == "sitemap":
+         loader.max_urls = max_urls
+
+     display.newline()
+
+     # Discover URLs
+     display.info("Discovering URLs...")
+     with display.loading_spinner("Discovering..."):
+         validation = await loader.validate_source(source)
+
+     if not validation.valid:
+         display.format_error_with_suggestion(
+             error=f"Source validation failed: {validation.message}",
+             suggestion="Check that the URL is accessible.",
+         )
+         sys.exit(1)
+
+     # Load documents
+     documents: list[Document] = []
+     url_count = 0
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         BarColumn(),
+         TaskProgressColumn(),
+         MofNCompleteColumn(),
+         TimeElapsedColumn(),
+         console=display.console,
+     ) as progress:
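+         # total=None renders an indeterminate bar; the real count is set once streaming ends.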
+         load_task = progress.add_task("Loading content...", total=None)
+
+         async for doc in loader.load_streaming(source):
+             documents.append(doc)
+             url_count += 1
+             progress.update(load_task, advance=1, description=f"Loading... ({url_count} docs)")
+
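+             # Cap sitemap loads here as well, even though loader.max_urls was set above.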
+             if url_count >= max_urls and detected_type == "sitemap":
+                 break
+
+         progress.update(load_task, completed=url_count)
+
+     display.success(f"Loaded {len(documents)} documents")
+
+     if not documents:
+         display.warning("No documents found. Check the source URL.")
+         sys.exit(0)
+
+     # Dry run - stop here
+     if dry_run:
+         display.newline()
+         display.panel(
+             f"Documents found: {len(documents)}\n\n"
+             "Sample URLs:\n"
+             + "\n".join(f" • {d.url}" for d in documents[:5])
+             + (f"\n ... and {len(documents) - 5} more" if len(documents) > 5 else ""),
+             title="Dry Run Complete",
+             style="info",
+         )
+         return
+
+     # Chunk documents
+     display.newline()
+     display.info("Chunking documents for optimal embedding...")
+
+     chunker = SentenceChunker()
+     chunked_documents: list[Document] = []
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         BarColumn(),
+         TaskProgressColumn(),
+         MofNCompleteColumn(),
+         TimeElapsedColumn(),
+         console=display.console,
+     ) as progress:
+         chunk_task = progress.add_task("Chunking...", total=len(documents))
+
+         for doc in documents:
+             chunks = chunker.chunk(doc.content)
+
+             if len(chunks) == 1:
+                 # Single chunk - use original document
+                 chunked_doc = Document(
+                     content=doc.content,
+                     url=doc.url,
+                     title=doc.title,
+                     source=final_source_id,
+                     account_id=account_id,
+                     collection_id=collection_id,
+                     source_id=final_source_id,
+                     metadata=doc.metadata,
+                     status=DocumentStatus.PENDING,
+                 )
+                 chunked_documents.append(chunked_doc)
+             else:
+                 # Multiple chunks - create chunk documents
+                 for i, chunk in enumerate(chunks):
+                     chunk_doc = Document(
+                         content=chunk.content,
+                         url=doc.url,
+                         title=doc.title,
+                         source=final_source_id,
+                         account_id=account_id,
+                         collection_id=collection_id,
+                         source_id=final_source_id,
+                         chunk_index=i,
+                         total_chunks=len(chunks),
+                         parent_doc_id=doc.doc_id,
+                         metadata={**(doc.metadata or {}), "chunk_start": chunk.start_position},
+                         status=DocumentStatus.PENDING,
+                     )
+                     chunked_documents.append(chunk_doc)
+
+             progress.update(chunk_task, advance=1)
+
+     display.success(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")
+
+     # Create OpenSearch client
+     http_auth = None
+     if cli_config.opensearch_username and cli_config.opensearch_password:
+         http_auth = (cli_config.opensearch_username, cli_config.opensearch_password)
+
+     client = AsyncOpenSearch(
+         hosts=[{"host": cli_config.opensearch_host, "port": cli_config.opensearch_port}],
+         http_auth=http_auth,
+         use_ssl=cli_config.opensearch_use_ssl,
+         verify_certs=cli_config.opensearch_verify_certs,
+         ssl_show_warn=False,
+     )
+
+     try:
+         # Create indexer config
+         opensearch_config = OpenSearchConfig(
+             host=cli_config.opensearch_host,
+             port=cli_config.opensearch_port,
+             username=cli_config.opensearch_username,
+             password=cli_config.opensearch_password,
+             use_ssl=cli_config.opensearch_use_ssl,
+             verify_certs=cli_config.opensearch_verify_certs,
+             model_id=cli_config.opensearch_model_id,
+             ingest_pipeline_name=cli_config.opensearch_pipeline_name,
+         )
+
+         indexer = OpenSearchIndexer(client, opensearch_config)
+
+         # Ensure index exists
+         display.newline()
+         display.info(f"Ensuring index '{index_name}' exists...")
+
+         try:
+             created = await indexer.ensure_index(index_name)
+             if created:
+                 display.success(f"Created index: {index_name}")
+             else:
+                 display.info(f"Index already exists: {index_name}")
+         except Exception as e:
+             display.format_error_with_suggestion(
+                 error=f"Failed to ensure index: {e}",
+                 suggestion="Run 'gnosisllm-knowledge setup' first to configure OpenSearch.",
+             )
+             sys.exit(1)
+
+         # Force delete existing if requested
+         if force:
+             display.info(f"Deleting existing documents from source: {final_source_id}")
+             deleted = await indexer.delete_by_query(
+                 {"query": {"term": {"source_id": final_source_id}}},
+                 index_name,
+             )
+             if deleted > 0:
+                 display.info(f"Deleted {deleted} existing documents")
+
+         # Index documents
+         display.newline()
+         display.info("Indexing documents...")
+
+         indexed_count = 0
+         failed_count = 0
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             MofNCompleteColumn(),
+             TimeElapsedColumn(),
+             console=display.console,
+         ) as progress:
+             index_task = progress.add_task("Indexing...", total=len(chunked_documents))
+
+             # Index in batches
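+             # Slicing here advances the progress bar per batch; batch_size is also
+             # passed through to bulk_index for its own batching.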
+             for i in range(0, len(chunked_documents), batch_size):
+                 batch = chunked_documents[i : i + batch_size]
+
+                 result = await indexer.bulk_index(batch, index_name, batch_size=batch_size)
+                 indexed_count += result.indexed_count
+                 failed_count += result.failed_count
+
+                 progress.update(index_task, advance=len(batch))
+
+         # Refresh index to make documents searchable
+         await indexer.refresh_index(index_name)
+
+         display.newline()
+
+         # Display results
+         if failed_count == 0:
+             display.panel(
+                 f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
+                 f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
+                 f"Documents Indexed: [green]{indexed_count}[/green]\n"
+                 f"Index: [cyan]{index_name}[/cyan]\n\n"
+                 f"Verify with:\n"
+                 f' [dim]gnosisllm-knowledge search "your query" --index {index_name}[/dim]',
+                 title="Loading Complete",
+                 style="success",
+             )
+         else:
+             display.panel(
+                 f"Documents Loaded: [cyan]{len(documents)}[/cyan]\n"
+                 f"Chunks Created: [cyan]{len(chunked_documents)}[/cyan]\n"
+                 f"Documents Indexed: [green]{indexed_count}[/green]\n"
+                 f"Documents Failed: [red]{failed_count}[/red]\n"
+                 f"Index: [cyan]{index_name}[/cyan]",
+                 title="Loading Complete (with errors)",
+                 style="warning",
+             )
+             sys.exit(1)
+
+     finally:
+         await client.close()
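
For reference, the load pipeline above can also be driven programmatically. The following is a minimal sketch based only on the signature shown in this diff; it assumes RichDisplayService can be constructed with no arguments, which this diff does not show:

    import asyncio

    from gnosisllm_knowledge.cli.commands.load import load_command
    from gnosisllm_knowledge.cli.display.service import RichDisplayService


    async def main() -> None:
        # dry_run=True previews the discovered URLs without indexing anything.
        await load_command(
            display=RichDisplayService(),  # assumption: no-arg constructor
            source="https://example.com/sitemap.xml",
            source_type="sitemap",
            index_name="knowledge",
            max_urls=100,
            dry_run=True,
        )


    asyncio.run(main())

Note that load_command reads its connection settings from the environment via CliConfig.from_env(); a dry run still probes Neoreader (falling back to plain HTTP if unreachable) but returns before any OpenSearch client is created.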