gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +287 -7
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +378 -12
- gnosisllm_knowledge/cli/commands/agentic.py +11 -0
- gnosisllm_knowledge/cli/commands/memory.py +723 -0
- gnosisllm_knowledge/cli/commands/setup.py +24 -22
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +58 -0
- gnosisllm_knowledge/core/domain/__init__.py +41 -0
- gnosisllm_knowledge/core/domain/document.py +5 -0
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +2 -0
- gnosisllm_knowledge/core/events/types.py +76 -0
- gnosisllm_knowledge/core/exceptions.py +134 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +67 -75
- gnosisllm_knowledge/services/search.py +47 -11
- gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0

gnosisllm_knowledge/loaders/sitemap_streaming.py (new file)

@@ -0,0 +1,258 @@
+"""Streaming sitemap discovery with bounded memory.
+
+This module provides streaming URL discovery from sitemaps, yielding
+batches of URLs as they're discovered rather than collecting all URLs
+first. This enables immediate processing and keeps memory bounded.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import re
+from collections.abc import AsyncIterator
+from typing import Any
+from xml.etree import ElementTree
+
+import httpx
+
+from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+# XML namespace for sitemaps
+SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+
+class StreamingSitemapDiscoverer:
+    """Discovers sitemap URLs in a streaming fashion.
+
+    Instead of collecting all URLs before processing, this yields
+    batches of URLs as they're discovered, enabling immediate processing.
+
+    Key differences from SitemapLoader._get_urls():
+    - Yields batches instead of returning complete list
+    - Uses bounded queue for backpressure
+    - Memory usage is O(batch_size) not O(total_urls)
+
+    Example:
+        ```python
+        discoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url="https://example.com/sitemap.xml",
+            batch_size=50,
+            max_urls=1000,
+        ):
+            # Process batch immediately
+            for url in url_batch:
+                await fetch_and_process(url)
+        ```
+    """
+
+    def __init__(
+        self,
+        config: PipelineConfig | None = None,
+    ) -> None:
+        """Initialize the streaming sitemap discoverer.
+
+        Args:
+            config: Pipeline configuration with batch sizes and concurrency.
+        """
+        self._config = config or PipelineConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def discover_urls_streaming(
+        self,
+        sitemap_url: str,
+        batch_size: int | None = None,
+        max_urls: int = 10000,
+        max_depth: int = 3,
+        allowed_patterns: list[str] | None = None,
+        blocked_patterns: list[str] | None = None,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of URLs as they're discovered.
+
+        Args:
+            sitemap_url: Root sitemap URL.
+            batch_size: URLs per batch (default from config).
+            max_urls: Maximum total URLs to discover.
+            max_depth: Maximum sitemap recursion depth.
+            allowed_patterns: URL patterns to include.
+            blocked_patterns: URL patterns to exclude.
+            **options: Additional options (unused, for compatibility).
+
+        Yields:
+            Lists of discovered URLs, batch_size at a time.
+        """
+        batch_size = batch_size or self._config.url_batch_size
+        allowed_patterns = allowed_patterns or []
+        blocked_patterns = blocked_patterns or []
+
+        # Use bounded queue for discovered URLs
+        # Queue size is 2x batch_size to allow producer to stay ahead
+        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+        # Tracking state
+        discovered_count = 0
+        seen_urls: set[str] = set()
+
+        async def discover_recursive(url: str, depth: int) -> None:
+            """Recursively discover URLs, pushing to queue."""
+            nonlocal discovered_count
+
+            if depth > max_depth or discovered_count >= max_urls:
+                return
+
+            content = await self._fetch_sitemap(url)
+            if not content:
+                return
+
+            try:
+                root = ElementTree.fromstring(content)
+            except ElementTree.ParseError as e:
+                self._logger.error(f"Failed to parse sitemap {url}: {e}")
+                return
+
+            # Check for sitemap index
+            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+            if sitemap_refs:
+                self._logger.info(
+                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                )
+                # Process nested sitemaps (limited parallelism to avoid overwhelming)
+                tasks = []
+                for ref in sitemap_refs[:10]:  # Limit parallel sitemap fetches
+                    if ref.text and discovered_count < max_urls:
+                        tasks.append(discover_recursive(ref.text.strip(), depth + 1))
+                await asyncio.gather(*tasks, return_exceptions=True)
+                return
+
+            # Process URL entries
+            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+            for url_elem in url_elements:
+                if url_elem.text and discovered_count < max_urls:
+                    page_url = url_elem.text.strip()
+
+                    if page_url in seen_urls:
+                        continue
+
+                    if not self._should_include_url(
+                        page_url, allowed_patterns, blocked_patterns
+                    ):
+                        continue
+
+                    seen_urls.add(page_url)
+                    await url_queue.put(page_url)  # Backpressure if queue full
+                    discovered_count += 1
+
+                    if discovered_count % 100 == 0:
+                        self._logger.debug(f"Discovered {discovered_count} URLs so far")
+
+        async def discover_and_close() -> None:
+            """Run discovery and close queue when done."""
+            try:
+                await discover_recursive(sitemap_url, depth=0)
+            except Exception as e:
+                self._logger.error(f"Discovery error: {e}")
+            finally:
+                url_queue.close()
+
+        # Start discovery in background task
+        discovery_task = asyncio.create_task(discover_and_close())
+
+        # Yield batches from queue
+        batch: list[str] = []
+        try:
+            async for url in url_queue:
+                batch.append(url)
+                if len(batch) >= batch_size:
+                    yield batch
+                    batch = []
+
+            # Yield remaining URLs
+            if batch:
+                yield batch
+
+        finally:
+            # Ensure discovery task is complete or cancelled
+            if not discovery_task.done():
+                discovery_task.cancel()
+                try:
+                    await discovery_task
+                except asyncio.CancelledError:
+                    pass
+
+    async def _fetch_sitemap(self, url: str) -> str | None:
+        """Fetch sitemap XML content.
+
+        Args:
+            url: The sitemap URL to fetch.
+
+        Returns:
+            Sitemap XML content or None if fetch failed.
+        """
+        try:
+            async with httpx.AsyncClient(
+                timeout=self._config.fetch_timeout_seconds
+            ) as client:
+                response = await client.get(
+                    url,
+                    headers={"Accept": "application/xml, text/xml, */*"},
+                    follow_redirects=True,
+                )
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            self._logger.error(f"Failed to fetch sitemap {url}: {e}")
+            return None
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
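
The discoverer relies on `BoundedQueue` from `gnosisllm_knowledge.core.streaming.pipeline` (new in this release in `core/streaming/pipeline.py`, not shown in this section) for three behaviours visible above: `await put(item)` blocks once `maxsize` items are in flight, `close()` marks the end of the stream, and async iteration drains remaining items before stopping. A minimal sketch of that contract, written purely as an illustration of the backpressure mechanism and not as the package's actual implementation:

```python
import asyncio
from typing import Any, Generic, TypeVar

T = TypeVar("T")


class BoundedQueue(Generic[T]):
    """Sketch of a bounded async queue: put() applies backpressure, iteration ends after close()."""

    _CLOSED = object()  # sentinel marking the end of the stream

    def __init__(self, maxsize: int) -> None:
        self._items: asyncio.Queue[Any] = asyncio.Queue()  # unbounded; bound enforced via semaphore
        self._slots = asyncio.Semaphore(maxsize)            # one slot per in-flight item
        self._closed = False

    async def put(self, item: T) -> None:
        if self._closed:
            raise RuntimeError("queue is closed")
        await self._slots.acquire()      # suspends the producer while maxsize items are unconsumed
        self._items.put_nowait(item)

    def close(self) -> None:
        """Called by the producer once it has finished putting items."""
        self._closed = True
        self._items.put_nowait(self._CLOSED)  # never blocks: the inner queue is unbounded

    def __aiter__(self) -> "BoundedQueue[T]":
        return self

    async def __anext__(self) -> T:
        item = await self._items.get()
        if item is self._CLOSED:
            raise StopAsyncIteration
        self._slots.release()            # frees a slot so the producer can continue
        return item
```

Backpressure is what keeps memory at O(batch_size): when the consumer falls behind, `put()` suspends the discovery task until a slot is freed, so discovery never runs ahead of processing by more than `maxsize` URLs.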

gnosisllm_knowledge/services/indexing.py

@@ -88,7 +88,10 @@ class KnowledgeIndexingService:
         batch_size: int = 100,
         **options: Any,
     ) -> IndexResult:
-        """Load content from source and index it.
+        """Load content from source and index it with streaming.
+
+        Uses streaming to process and index documents as they're fetched,
+        avoiding memory issues with large sitemaps.
 
         Args:
             source: Source URL or path.
@@ -96,98 +99,89 @@ class KnowledgeIndexingService:
             account_id: Account ID for multi-tenancy.
             collection_id: Collection ID.
             source_id: Source ID (auto-generated if not provided).
-            batch_size: Documents per batch.
+            batch_size: Documents per batch for indexing.
             **options: Additional loader/indexer options.
 
         Returns:
             Index result with counts.
         """
         source_id = source_id or str(uuid.uuid4())
+        document_defaults = options.pop("document_defaults", {})
 
         # Emit batch started event
         await self._events.emit_async(
-            EventType.BATCH_STARTED,
             BatchStartedEvent(
-
-
+                batch_index=0,
+                batch_size=batch_size,
+                total_batches=0,  # Unknown for streaming
             ),
         )
 
-
-
-
+        total_indexed = 0
+        total_failed = 0
+        errors: list[str] = []
+        batch: list[Document] = []
+        batch_index = 0
 
-
-
-
-
+        try:
+            # Stream documents and index in batches as they arrive
+            # Note: Loader already chunks content, so we don't re-chunk here
+            async for doc in self._loader.load_streaming(source, **options):
+                # Enrich document with tenant info
+                enriched_doc = Document(
+                    content=doc.content,
+                    source=source,
+                    doc_id=doc.doc_id,
+                    url=doc.url,
+                    title=doc.title,
+                    account_id=account_id,
+                    collection_id=collection_id,
+                    source_id=source_id,
+                    chunk_index=doc.chunk_index,
+                    total_chunks=doc.total_chunks,
+                    parent_doc_id=doc.parent_doc_id,
+                    status=DocumentStatus.INDEXED,
+                    metadata=doc.metadata,
+                    **document_defaults,
                 )
 
-
-        total_indexed = 0
-        total_failed = 0
-        errors: list[str] = []
-
-        batch: list[Document] = []
-
-        for doc in load_result.documents:
-            # Chunk the document
-            chunks = self._chunker.chunk(doc.content)
-
-            for i, chunk in enumerate(chunks):
-                # Create chunk document
-                chunk_doc = Document(
-                    id=f"{doc.id}-chunk-{i}",
-                    content=chunk.content,
-                    url=doc.url,
-                    title=doc.title,
-                    source=source,
-                    account_id=account_id,
-                    collection_id=collection_id,
-                    source_id=source_id,
-                    chunk_index=i,
-                    total_chunks=len(chunks),
-                    parent_doc_id=doc.id,
-                    status=DocumentStatus.INDEXED,
-                    metadata=doc.metadata,
-                )
+                batch.append(enriched_doc)
 
-
-
-
-
-
-
-
-
-
-
+                # Index batch when full
+                if len(batch) >= batch_size:
+                    result = await self._index_batch(batch, index_name)
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
+                    if result.errors:
+                        errors.extend(result.errors)
+                    batch = []
+                    batch_index += 1
+                    logger.info(f"Indexed batch {batch_index}: {total_indexed} total documents")
 
             # Index remaining documents
             if batch:
                 result = await self._index_batch(batch, index_name)
-                total_indexed += result.
-                total_failed += result.
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
                 if result.errors:
                     errors.extend(result.errors)
 
            # Emit batch completed event
            await self._events.emit_async(
-                EventType.BATCH_COMPLETED,
                BatchCompletedEvent(
-
-
-
-                    documents_failed=total_failed,
-                    success=total_failed == 0,
+                    batch_index=batch_index,
+                    success_count=total_indexed,
+                    failure_count=total_failed,
                ),
            )
 
+            logger.info(f"Completed indexing from {source}: {total_indexed} documents")
+
            return IndexResult(
                success=total_failed == 0,
-
-
-                errors=errors if errors else
+                indexed_count=total_indexed,
+                failed_count=total_failed,
+                errors=errors if errors else [],
            )
 
        except Exception as e:
@@ -231,17 +225,17 @@ class KnowledgeIndexingService:
 
            for i, chunk_obj in enumerate(chunks):
                chunk_doc = Document(
-                    id=f"{doc.id}-chunk-{i}",
                    content=chunk_obj.content,
+                    source=doc.source,
+                    doc_id=f"{doc.doc_id}-chunk-{i}",
                    url=doc.url,
                    title=doc.title,
-                    source=doc.source,
                    account_id=doc.account_id,
                    collection_id=doc.collection_id,
                    source_id=doc.source_id,
                    chunk_index=i,
                    total_chunks=len(chunks),
-                    parent_doc_id=doc.
+                    parent_doc_id=doc.doc_id,
                    status=DocumentStatus.INDEXED,
                    metadata=doc.metadata,
                )
@@ -252,8 +246,8 @@ class KnowledgeIndexingService:
                # Index batch when full
                if len(batch) >= batch_size:
                    result = await self._index_batch(batch, index_name)
-                    total_indexed += result.
-                    total_failed += result.
+                    total_indexed += result.indexed_count
+                    total_failed += result.failed_count
                    if result.errors:
                        errors.extend(result.errors)
                    batch = []
@@ -261,16 +255,16 @@ class KnowledgeIndexingService:
        # Index remaining
        if batch:
            result = await self._index_batch(batch, index_name)
-            total_indexed += result.
-            total_failed += result.
+            total_indexed += result.indexed_count
+            total_failed += result.failed_count
            if result.errors:
                errors.extend(result.errors)
 
        return IndexResult(
            success=total_failed == 0,
-
-
-            errors=errors if errors else
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=errors if errors else [],
        )
 
    async def delete_source(
@@ -375,12 +369,10 @@ class KnowledgeIndexingService:
        for doc in documents:
            if result.success:
                await self._events.emit_async(
-                    EventType.DOCUMENT_INDEXED,
                    DocumentIndexedEvent(
-
+                        doc_id=doc.doc_id,
                        index_name=index_name,
-
-                        total_chunks=doc.total_chunks,
+                        success=True,
                    ),
                )
 
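
The streaming load path above reduces to one accumulate-and-flush loop: append each incoming document to a batch, flush through `_index_batch` once the batch reaches `batch_size`, then flush whatever remains after the stream ends. The same shape recurs in the chunk-based path further down. A standalone sketch of that pattern (the helper name and the demo sink are illustrative, not part of the package API):

```python
import asyncio
from collections.abc import AsyncIterator, Awaitable, Callable
from typing import TypeVar

T = TypeVar("T")


async def flush_in_batches(
    stream: AsyncIterator[T],
    flush: Callable[[list[T]], Awaitable[None]],
    batch_size: int = 100,
) -> int:
    """Accumulate items from an async stream and flush them batch_size at a time.

    Memory stays O(batch_size) no matter how many items the stream yields.
    Returns the total number of items flushed.
    """
    batch: list[T] = []
    total = 0
    async for item in stream:
        batch.append(item)
        if len(batch) >= batch_size:
            await flush(batch)
            total += len(batch)
            batch = []
    if batch:  # flush the final partial batch
        await flush(batch)
        total += len(batch)
    return total


async def _demo() -> None:
    async def numbers() -> AsyncIterator[int]:
        for i in range(10):
            yield i

    async def sink(items: list[int]) -> None:
        print("flushing", items)

    print("total:", await flush_in_batches(numbers(), sink, batch_size=4))


if __name__ == "__main__":
    asyncio.run(_demo())
```

The trailing `if batch:` flush is the easy part to forget: without it, a final partial batch of up to `batch_size - 1` documents would be dropped silently.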

gnosisllm_knowledge/services/search.py

@@ -114,17 +114,8 @@ class KnowledgeSearchService:
         try:
             result = await self._searcher.search(search_query, index, **options)
 
-            # Emit search event
-            await self._events.emit_async(
-                EventType.SEARCH_COMPLETED,
-                {
-                    "query": query,
-                    "mode": mode.value,
-                    "results_count": len(result.items),
-                    "total_hits": result.total_hits,
-                    "duration_ms": result.duration_ms,
-                },
-            )
+            # TODO: Emit search event when SearchCompletedEvent is defined
+            # await self._events.emit_async(SearchCompletedEvent(...))
 
             return result
 
@@ -347,3 +338,48 @@ class KnowledgeSearchService:
         # Use a simple match_all to get total count
         result = await self._searcher.search(query, index)
         return result.total_hits
+
+    async def get_collections(
+        self,
+        index_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Get all collections with document counts.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        index = index_name or self._default_index
+        if not index:
+            logger.warning("No index specified for get_collections")
+            return []
+
+        try:
+            return await self._searcher.get_collections(index)
+        except Exception as e:
+            logger.error(f"Failed to get collections: {e}")
+            return []
+
+    async def get_stats(
+        self,
+        index_name: str | None = None,
+    ) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query (uses default if not provided).
+
+        Returns:
+            Dictionary with document_count, index_name, and other stats.
+        """
+        index = index_name or self._default_index
+        if not index:
+            return {"document_count": 0, "index_name": "", "exists": False}
+
+        try:
+            return await self._searcher.get_stats(index)
+        except Exception as e:
+            logger.error(f"Failed to get stats: {e}")
+            return {"document_count": 0, "index_name": index, "error": str(e)}
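
The new `get_collections` and `get_stats` delegate to the backend searcher (`backends/opensearch/searcher.py`, +235 lines, not shown in this section). As an illustration of what "collections with document counts" typically means against an OpenSearch index, here is a hedged sketch using a terms aggregation over the REST `_search` endpoint; the `collection_id` field name, its keyword mapping, and the bucket size are assumptions about the index, not the package's actual searcher code:

```python
from typing import Any

import httpx


async def collections_with_counts(
    base_url: str,
    index: str,
    field: str = "collection_id",  # assumed keyword field; adjust to the real mapping
) -> list[dict[str, Any]]:
    """Group documents by collection and return per-collection document counts."""
    body = {
        "size": 0,  # no hits needed, aggregation buckets only
        "aggs": {"collections": {"terms": {"field": field, "size": 1000}}},
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(f"{base_url}/{index}/_search", json=body)
        resp.raise_for_status()
        buckets = resp.json()["aggregations"]["collections"]["buckets"]
    return [{"id": b["key"], "document_count": b["doc_count"]} for b in buckets]
```

`get_stats` follows the same defensive shape as `get_collections`: resolve the index name, guard against a missing default, and return a safe fallback dictionary instead of raising when the backend call fails.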