gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,258 @@
+"""Streaming sitemap discovery with bounded memory.
+
+This module provides streaming URL discovery from sitemaps, yielding
+batches of URLs as they're discovered rather than collecting all URLs
+first. This enables immediate processing and keeps memory bounded.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import re
+from collections.abc import AsyncIterator
+from typing import Any
+from xml.etree import ElementTree
+
+import httpx
+
+from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+# XML namespace for sitemaps
+SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+
+class StreamingSitemapDiscoverer:
+    """Discovers sitemap URLs in a streaming fashion.
+
+    Instead of collecting all URLs before processing, this yields
+    batches of URLs as they're discovered, enabling immediate processing.
+
+    Key differences from SitemapLoader._get_urls():
+    - Yields batches instead of returning complete list
+    - Uses bounded queue for backpressure
+    - Memory usage is O(batch_size) not O(total_urls)
+
+    Example:
+        ```python
+        discoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url="https://example.com/sitemap.xml",
+            batch_size=50,
+            max_urls=1000,
+        ):
+            # Process batch immediately
+            for url in url_batch:
+                await fetch_and_process(url)
+        ```
+    """
+
+    def __init__(
+        self,
+        config: PipelineConfig | None = None,
+    ) -> None:
+        """Initialize the streaming sitemap discoverer.
+
+        Args:
+            config: Pipeline configuration with batch sizes and concurrency.
+        """
+        self._config = config or PipelineConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def discover_urls_streaming(
+        self,
+        sitemap_url: str,
+        batch_size: int | None = None,
+        max_urls: int = 10000,
+        max_depth: int = 3,
+        allowed_patterns: list[str] | None = None,
+        blocked_patterns: list[str] | None = None,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of URLs as they're discovered.
+
+        Args:
+            sitemap_url: Root sitemap URL.
+            batch_size: URLs per batch (default from config).
+            max_urls: Maximum total URLs to discover.
+            max_depth: Maximum sitemap recursion depth.
+            allowed_patterns: URL patterns to include.
+            blocked_patterns: URL patterns to exclude.
+            **options: Additional options (unused, for compatibility).
+
+        Yields:
+            Lists of discovered URLs, batch_size at a time.
+        """
+        batch_size = batch_size or self._config.url_batch_size
+        allowed_patterns = allowed_patterns or []
+        blocked_patterns = blocked_patterns or []
+
+        # Use bounded queue for discovered URLs
+        # Queue size is 2x batch_size to allow producer to stay ahead
+        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+        # Tracking state
+        discovered_count = 0
+        seen_urls: set[str] = set()
+
+        async def discover_recursive(url: str, depth: int) -> None:
+            """Recursively discover URLs, pushing to queue."""
+            nonlocal discovered_count
+
+            if depth > max_depth or discovered_count >= max_urls:
+                return
+
+            content = await self._fetch_sitemap(url)
+            if not content:
+                return
+
+            try:
+                root = ElementTree.fromstring(content)
+            except ElementTree.ParseError as e:
+                self._logger.error(f"Failed to parse sitemap {url}: {e}")
+                return
+
+            # Check for sitemap index
+            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+            if sitemap_refs:
+                self._logger.info(
+                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                )
+                # Process nested sitemaps (limited parallelism to avoid overwhelming)
+                tasks = []
+                for ref in sitemap_refs[:10]:  # Limit parallel sitemap fetches
+                    if ref.text and discovered_count < max_urls:
+                        tasks.append(discover_recursive(ref.text.strip(), depth + 1))
+                await asyncio.gather(*tasks, return_exceptions=True)
+                return
+
+            # Process URL entries
+            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+            for url_elem in url_elements:
+                if url_elem.text and discovered_count < max_urls:
+                    page_url = url_elem.text.strip()
+
+                    if page_url in seen_urls:
+                        continue
+
+                    if not self._should_include_url(
+                        page_url, allowed_patterns, blocked_patterns
+                    ):
+                        continue
+
+                    seen_urls.add(page_url)
+                    await url_queue.put(page_url)  # Backpressure if queue full
+                    discovered_count += 1
+
+                    if discovered_count % 100 == 0:
+                        self._logger.debug(f"Discovered {discovered_count} URLs so far")
+
+        async def discover_and_close() -> None:
+            """Run discovery and close queue when done."""
+            try:
+                await discover_recursive(sitemap_url, depth=0)
+            except Exception as e:
+                self._logger.error(f"Discovery error: {e}")
+            finally:
+                url_queue.close()
+
+        # Start discovery in background task
+        discovery_task = asyncio.create_task(discover_and_close())
+
+        # Yield batches from queue
+        batch: list[str] = []
+        try:
+            async for url in url_queue:
+                batch.append(url)
+                if len(batch) >= batch_size:
+                    yield batch
+                    batch = []
+
+            # Yield remaining URLs
+            if batch:
+                yield batch
+
+        finally:
+            # Ensure discovery task is complete or cancelled
+            if not discovery_task.done():
+                discovery_task.cancel()
+                try:
+                    await discovery_task
+                except asyncio.CancelledError:
+                    pass
+
+    async def _fetch_sitemap(self, url: str) -> str | None:
+        """Fetch sitemap XML content.
+
+        Args:
+            url: The sitemap URL to fetch.
+
+        Returns:
+            Sitemap XML content or None if fetch failed.
+        """
+        try:
+            async with httpx.AsyncClient(
+                timeout=self._config.fetch_timeout_seconds
+            ) as client:
+                response = await client.get(
+                    url,
+                    headers={"Accept": "application/xml, text/xml, */*"},
+                    follow_redirects=True,
+                )
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            self._logger.error(f"Failed to fetch sitemap {url}: {e}")
+            return None
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
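
The discoverer above leans on `BoundedQueue` from `gnosisllm_knowledge/core/streaming/pipeline.py` (added in this release but not shown in this diff) for backpressure: `put()` blocks once the queue is full, and the consumer iterates with `async for` until the producer calls `close()`. A minimal sketch of that contract, assuming an asyncio-based queue with a closed flag — an illustration only, not the package's actual implementation:

```python
import asyncio
from typing import Generic, TypeVar

T = TypeVar("T")


class BoundedQueueSketch(Generic[T]):
    """Illustrative stand-in for core.streaming.pipeline.BoundedQueue (assumed semantics)."""

    def __init__(self, maxsize: int) -> None:
        self._items: asyncio.Queue[T] = asyncio.Queue(maxsize=maxsize)
        self._closed = asyncio.Event()

    async def put(self, item: T) -> None:
        # Blocks while the queue is full -- this is the backpressure that
        # keeps discovery from racing ahead of the consumer.
        await self._items.put(item)

    def close(self) -> None:
        # Signal that no more items will arrive.
        self._closed.set()

    def __aiter__(self) -> "BoundedQueueSketch[T]":
        return self

    async def __anext__(self) -> T:
        # Drain remaining items, then stop once the producer has closed.
        while True:
            try:
                return self._items.get_nowait()
            except asyncio.QueueEmpty:
                if self._closed.is_set():
                    raise StopAsyncIteration
                await asyncio.sleep(0.01)  # crude wait; the real queue presumably blocks properly
```

Raising `StopAsyncIteration` after `close()` is what lets the `async for url in url_queue` loop in `discover_urls_streaming` terminate cleanly once the background discovery task finishes.
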
@@ -88,7 +88,10 @@ class KnowledgeIndexingService:
         batch_size: int = 100,
         **options: Any,
     ) -> IndexResult:
-        """Load content from source and index it.
+        """Load content from source and index it with streaming.
+
+        Uses streaming to process and index documents as they're fetched,
+        avoiding memory issues with large sitemaps.
 
         Args:
             source: Source URL or path.
@@ -96,98 +99,89 @@ class KnowledgeIndexingService:
             account_id: Account ID for multi-tenancy.
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
-            batch_size: Documents per batch.
+            batch_size: Documents per batch for indexing.
             **options: Additional loader/indexer options.
 
         Returns:
             Index result with counts.
         """
         source_id = source_id or str(uuid.uuid4())
+        document_defaults = options.pop("document_defaults", {})
 
         # Emit batch started event
         await self._events.emit_async(
-            EventType.BATCH_STARTED,
             BatchStartedEvent(
-                source=source,
-                source_id=source_id,
+                batch_index=0,
+                batch_size=batch_size,
+                total_batches=0,  # Unknown for streaming
             ),
         )
 
-        try:
-            # Load documents
-            load_result = await self._loader.load(source, **options)
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+        batch_index = 0
 
-            if not load_result.success:
-                raise LoadError(
-                    message=f"Failed to load from {source}",
-                    details={"errors": load_result.errors},
+        try:
+            # Stream documents and index in batches as they arrive
+            # Note: Loader already chunks content, so we don't re-chunk here
+            async for doc in self._loader.load_streaming(source, **options):
+                # Enrich document with tenant info
+                enriched_doc = Document(
+                    content=doc.content,
+                    source=source,
+                    doc_id=doc.doc_id,
+                    url=doc.url,
+                    title=doc.title,
+                    account_id=account_id,
+                    collection_id=collection_id,
+                    source_id=source_id,
+                    chunk_index=doc.chunk_index,
+                    total_chunks=doc.total_chunks,
+                    parent_doc_id=doc.parent_doc_id,
+                    status=DocumentStatus.INDEXED,
+                    metadata=doc.metadata,
+                    **document_defaults,
                 )
 
-            # Process and index documents
-            total_indexed = 0
-            total_failed = 0
-            errors: list[str] = []
-
-            batch: list[Document] = []
-
-            for doc in load_result.documents:
-                # Chunk the document
-                chunks = self._chunker.chunk(doc.content)
-
-                for i, chunk in enumerate(chunks):
-                    # Create chunk document
-                    chunk_doc = Document(
-                        id=f"{doc.id}-chunk-{i}",
-                        content=chunk.content,
-                        url=doc.url,
-                        title=doc.title,
-                        source=source,
-                        account_id=account_id,
-                        collection_id=collection_id,
-                        source_id=source_id,
-                        chunk_index=i,
-                        total_chunks=len(chunks),
-                        parent_doc_id=doc.id,
-                        status=DocumentStatus.INDEXED,
-                        metadata=doc.metadata,
-                    )
+                batch.append(enriched_doc)
 
-                    batch.append(chunk_doc)
-
-                    # Index batch when full
-                    if len(batch) >= batch_size:
-                        result = await self._index_batch(batch, index_name)
-                        total_indexed += result.documents_indexed
-                        total_failed += result.documents_failed
-                        if result.errors:
-                            errors.extend(result.errors)
-                        batch = []
+                # Index batch when full
+                if len(batch) >= batch_size:
+                    result = await self._index_batch(batch, index_name)
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
+                    if result.errors:
+                        errors.extend(result.errors)
+                    batch = []
+                    batch_index += 1
+                    logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
 
             # Index remaining documents
             if batch:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.documents_indexed
-                total_failed += result.documents_failed
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
 
             # Emit batch completed event
             await self._events.emit_async(
-                EventType.BATCH_COMPLETED,
                 BatchCompletedEvent(
-                    source=source,
-                    source_id=source_id,
-                    documents_indexed=total_indexed,
-                    documents_failed=total_failed,
-                    success=total_failed == 0,
+                    batch_index=batch_index,
+                    success_count=total_indexed,
+                    failure_count=total_failed,
                 ),
             )
 
+            logger.info(f"Completed indexing from {source}: {total_indexed} documents")
+
             return IndexResult(
                 success=total_failed == 0,
-                documents_indexed=total_indexed,
-                documents_failed=total_failed,
-                errors=errors if errors else None,
+                indexed_count=total_indexed,
+                failed_count=total_failed,
+                errors=errors if errors else [],
            )
 
        except Exception as e:
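
The new loop above is the standard "batch an async stream" pattern: append to a list, flush at `batch_size`, and flush the remainder once the stream ends. Shown in isolation with a hypothetical helper name (`abatch` is not part of the package), the same pattern looks like this:

```python
from collections.abc import AsyncIterable, AsyncIterator
from typing import TypeVar

T = TypeVar("T")


async def abatch(items: AsyncIterable[T], size: int) -> AsyncIterator[list[T]]:
    """Group an async stream into lists of at most `size` items."""
    batch: list[T] = []
    async for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch  # flush a full batch without waiting for the stream to end
            batch = []
    if batch:
        yield batch  # flush the final partial batch
```

Memory stays bounded by `size` no matter how many documents the loader streams, which is the point of replacing the old load-everything-then-chunk path.
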
@@ -231,17 +225,17 @@ class KnowledgeIndexingService:
 
             for i, chunk_obj in enumerate(chunks):
                 chunk_doc = Document(
-                    id=f"{doc.id}-chunk-{i}",
                     content=chunk_obj.content,
+                    source=doc.source,
+                    doc_id=f"{doc.doc_id}-chunk-{i}",
                     url=doc.url,
                     title=doc.title,
-                    source=doc.source,
                     account_id=doc.account_id,
                     collection_id=doc.collection_id,
                     source_id=doc.source_id,
                     chunk_index=i,
                     total_chunks=len(chunks),
-                    parent_doc_id=doc.id,
+                    parent_doc_id=doc.doc_id,
                     status=DocumentStatus.INDEXED,
                     metadata=doc.metadata,
                 )
@@ -252,8 +246,8 @@ class KnowledgeIndexingService:
                 # Index batch when full
                 if len(batch) >= batch_size:
                     result = await self._index_batch(batch, index_name)
-                    total_indexed += result.documents_indexed
-                    total_failed += result.documents_failed
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
                     if result.errors:
                         errors.extend(result.errors)
                     batch = []
@@ -261,16 +255,16 @@ class KnowledgeIndexingService:
         # Index remaining
         if batch:
             result = await self._index_batch(batch, index_name)
-            total_indexed += result.documents_indexed
-            total_failed += result.documents_failed
+            total_indexed += result.indexed_count
+            total_failed += result.failed_count
             if result.errors:
                 errors.extend(result.errors)
 
         return IndexResult(
             success=total_failed == 0,
-            documents_indexed=total_indexed,
-            documents_failed=total_failed,
-            errors=errors if errors else None,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=errors if errors else [],
         )
 
     async def delete_source(
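
Both hunks above apply the same rename on `IndexResult`: `documents_indexed`/`documents_failed` become `indexed_count`/`failed_count`, and `errors` now defaults to an empty list rather than `None`. The updated model lives in `core/domain/result.py` (+11 -3 in the file list) and is not shown here; a minimal sketch consistent with how the service constructs and reads the result:

```python
from dataclasses import dataclass, field


@dataclass
class IndexResultSketch:
    """Field names taken from the calls in this diff; the real IndexResult
    in core/domain/result.py may carry additional fields."""

    success: bool
    indexed_count: int = 0
    failed_count: int = 0
    errors: list[str] = field(default_factory=list)
```
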
@@ -375,12 +369,10 @@ class KnowledgeIndexingService:
         for doc in documents:
             if result.success:
                 await self._events.emit_async(
-                    EventType.DOCUMENT_INDEXED,
                     DocumentIndexedEvent(
-                        document_id=doc.id,
+                        doc_id=doc.doc_id,
                         index_name=index_name,
-                        chunk_index=doc.chunk_index,
-                        total_chunks=doc.total_chunks,
+                        success=True,
                     ),
                 )
 
@@ -114,17 +114,8 @@ class KnowledgeSearchService:
         try:
             result = await self._searcher.search(search_query, index, **options)
 
-            # Emit search event
-            await self._events.emit_async(
-                EventType.SEARCH_COMPLETED,
-                {
-                    "query": query,
-                    "mode": mode.value,
-                    "results_count": len(result.items),
-                    "total_hits": result.total_hits,
-                    "duration_ms": result.duration_ms,
-                },
-            )
+            # TODO: Emit search event when SearchCompletedEvent is defined
+            # await self._events.emit_async(SearchCompletedEvent(...))
 
             return result
 
@@ -347,3 +338,48 @@ class KnowledgeSearchService:
         # Use a simple match_all to get total count
         result = await self._searcher.search(query, index)
         return result.total_hits
+
+    async def get_collections(
+        self,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Get all collections with document counts.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        index = index_name or self._default_index
+        if not index:
+            logger.warning("No index specified for get_collections")
+            return []
+
+        try:
+            return await self._searcher.get_collections(index)
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    async def get_stats(
+        self,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            Dictionary with document_count, index_name, and other stats.
+        """
+        index = index_name or self._default_index
+        if not index:
+            return {"document_count": 0, "index_name": "", "exists": False}
+
+        try:
+            return await self._searcher.get_stats(index)
+        except Exception as e:
+            logger.error(f"Failed to get stats: {e}")
+            return {"document_count": 0, "index_name": index, "error": str(e)}