gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. gnosisllm_knowledge/__init__.py +91 -39
  2. gnosisllm_knowledge/api/__init__.py +3 -2
  3. gnosisllm_knowledge/api/knowledge.py +287 -7
  4. gnosisllm_knowledge/api/memory.py +966 -0
  5. gnosisllm_knowledge/backends/__init__.py +14 -5
  6. gnosisllm_knowledge/backends/opensearch/agentic.py +341 -39
  7. gnosisllm_knowledge/backends/opensearch/config.py +49 -28
  8. gnosisllm_knowledge/backends/opensearch/indexer.py +1 -0
  9. gnosisllm_knowledge/backends/opensearch/mappings.py +2 -1
  10. gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
  11. gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
  12. gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
  13. gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +235 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
  16. gnosisllm_knowledge/cli/app.py +378 -12
  17. gnosisllm_knowledge/cli/commands/agentic.py +11 -0
  18. gnosisllm_knowledge/cli/commands/memory.py +723 -0
  19. gnosisllm_knowledge/cli/commands/setup.py +24 -22
  20. gnosisllm_knowledge/cli/display/service.py +43 -0
  21. gnosisllm_knowledge/cli/utils/config.py +58 -0
  22. gnosisllm_knowledge/core/domain/__init__.py +41 -0
  23. gnosisllm_knowledge/core/domain/document.py +5 -0
  24. gnosisllm_knowledge/core/domain/memory.py +440 -0
  25. gnosisllm_knowledge/core/domain/result.py +11 -3
  26. gnosisllm_knowledge/core/domain/search.py +2 -0
  27. gnosisllm_knowledge/core/events/types.py +76 -0
  28. gnosisllm_knowledge/core/exceptions.py +134 -0
  29. gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
  30. gnosisllm_knowledge/core/interfaces/memory.py +524 -0
  31. gnosisllm_knowledge/core/interfaces/streaming.py +127 -0
  32. gnosisllm_knowledge/core/streaming/__init__.py +36 -0
  33. gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
  34. gnosisllm_knowledge/loaders/base.py +3 -4
  35. gnosisllm_knowledge/loaders/sitemap.py +129 -1
  36. gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
  37. gnosisllm_knowledge/services/indexing.py +67 -75
  38. gnosisllm_knowledge/services/search.py +47 -11
  39. gnosisllm_knowledge/services/streaming_pipeline.py +302 -0
  40. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/METADATA +44 -1
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +77 -0
  42. gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
  43. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/WHEEL +0 -0
  44. {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.3.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/backends/opensearch/memory/config.py
@@ -0,0 +1,127 @@
+"""Memory-specific configuration."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+from gnosisllm_knowledge.core.domain.memory import MemoryStrategy
+
+
+@dataclass(frozen=True)
+class MemoryConfig:
+    """Configuration for Agentic Memory.
+
+    Example:
+        ```python
+        # From environment
+        config = MemoryConfig.from_env()
+
+        # Explicit configuration
+        config = MemoryConfig(
+            host="localhost",
+            port=9200,
+            llm_model_id="model-123",
+            embedding_model_id="model-456",
+        )
+        ```
+    """
+
+    # === OpenSearch Connection ===
+    host: str = "localhost"
+    port: int = 9200
+    username: str | None = None
+    password: str | None = None
+    use_ssl: bool = False
+    verify_certs: bool = True
+
+    # === Model IDs (Required for inference) ===
+    llm_model_id: str | None = None
+    embedding_model_id: str | None = None
+
+    # === LLM Response Parsing ===
+    # OpenAI: $.choices[0].message.content
+    # Bedrock Claude: $.output.message.content[0].text
+    llm_result_path: str = "$.choices[0].message.content"
+
+    # === Connector Configuration ===
+    # For setup: OpenAI API key
+    openai_api_key: str | None = None
+    llm_model: str = "gpt-4o"
+    embedding_model: str = "text-embedding-3-small"
+    embedding_dimension: int = 1536
+
+    # === Timeouts ===
+    connect_timeout: float = 5.0
+    inference_timeout: float = 60.0
+
+    # === Default Strategies ===
+    default_strategies: tuple[MemoryStrategy, ...] = (
+        MemoryStrategy.SEMANTIC,
+        MemoryStrategy.USER_PREFERENCE,
+    )
+
+    @property
+    def url(self) -> str:
+        """Get the full OpenSearch URL."""
+        scheme = "https" if self.use_ssl else "http"
+        return f"{scheme}://{self.host}:{self.port}"
+
+    @property
+    def auth(self) -> tuple[str, str] | None:
+        """Get auth tuple if credentials are configured."""
+        if self.username and self.password:
+            return (self.username, self.password)
+        return None
+
+    @property
+    def is_configured(self) -> bool:
+        """Check if memory is properly configured for inference."""
+        return bool(self.llm_model_id and self.embedding_model_id)
+
+    @classmethod
+    def from_env(cls) -> MemoryConfig:
+        """Create config from environment variables.
+
+        Environment Variables:
+            OPENSEARCH_HOST: OpenSearch host (default: localhost)
+            OPENSEARCH_PORT: OpenSearch port (default: 9200)
+            OPENSEARCH_USERNAME: Username
+            OPENSEARCH_PASSWORD: Password
+            OPENSEARCH_USE_SSL: Use SSL (default: false)
+            OPENSEARCH_VERIFY_CERTS: Verify certs (default: true)
+            OPENSEARCH_LLM_MODEL_ID: LLM model ID for inference
+            OPENSEARCH_EMBEDDING_MODEL_ID: Embedding model ID
+            OPENSEARCH_LLM_RESULT_PATH: JSONPath for LLM response
+            OPENAI_API_KEY: OpenAI API key (for setup)
+            MEMORY_LLM_MODEL: LLM model name (default: gpt-4o)
+            MEMORY_EMBEDDING_MODEL: Embedding model (default: text-embedding-3-small)
+            MEMORY_EMBEDDING_DIMENSION: Embedding dimension (default: 1536)
+            MEMORY_INFERENCE_TIMEOUT: Inference timeout (default: 60)
+            OPENSEARCH_CONNECT_TIMEOUT: Connect timeout (default: 5)
+        """
+        return cls(
+            # Connection
+            host=os.getenv("OPENSEARCH_HOST", "localhost"),
+            port=int(os.getenv("OPENSEARCH_PORT", "9200")),
+            username=os.getenv("OPENSEARCH_USERNAME"),
+            password=os.getenv("OPENSEARCH_PASSWORD"),
+            use_ssl=os.getenv("OPENSEARCH_USE_SSL", "false").lower() == "true",
+            verify_certs=os.getenv("OPENSEARCH_VERIFY_CERTS", "true").lower() == "true",
+            # Model IDs
+            llm_model_id=os.getenv("OPENSEARCH_LLM_MODEL_ID"),
+            embedding_model_id=os.getenv("OPENSEARCH_EMBEDDING_MODEL_ID"),
+            # LLM parsing
+            llm_result_path=os.getenv(
+                "OPENSEARCH_LLM_RESULT_PATH",
+                "$.choices[0].message.content",
+            ),
+            # Connector setup
+            openai_api_key=os.getenv("OPENAI_API_KEY"),
+            llm_model=os.getenv("MEMORY_LLM_MODEL", "gpt-4o"),
+            embedding_model=os.getenv("MEMORY_EMBEDDING_MODEL", "text-embedding-3-small"),
+            embedding_dimension=int(os.getenv("MEMORY_EMBEDDING_DIMENSION", "1536")),
+            # Timeouts
+            connect_timeout=float(os.getenv("OPENSEARCH_CONNECT_TIMEOUT", "5.0")),
+            inference_timeout=float(os.getenv("MEMORY_INFERENCE_TIMEOUT", "60.0")),
+        )
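For orientation, here is a minimal usage sketch of the configuration above (not part of the diff). It assumes a local, non-TLS OpenSearch node and uses placeholder model IDs; real IDs come from the setup flow in the next file.

```python
import os

from gnosisllm_knowledge.backends.opensearch.memory.config import MemoryConfig

# Placeholder values for illustration only.
os.environ["OPENSEARCH_HOST"] = "localhost"
os.environ["OPENSEARCH_PORT"] = "9200"
os.environ["OPENSEARCH_LLM_MODEL_ID"] = "model-123"
os.environ["OPENSEARCH_EMBEDDING_MODEL_ID"] = "model-456"

config = MemoryConfig.from_env()
assert config.url == "http://localhost:9200"  # use_ssl defaults to False
assert config.auth is None                    # no username/password set
assert config.is_configured                   # both model IDs present
```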
gnosisllm_knowledge/backends/opensearch/memory/setup.py
@@ -0,0 +1,322 @@
+"""Memory setup operations - Connector and Model creation.
+
+CRITICAL: The LLM connector MUST use both system_prompt AND user_prompt.
+If only system_prompt is used, zero facts will be extracted.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import httpx
+
+from gnosisllm_knowledge.core.exceptions import MemoryConfigurationError
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.backends.opensearch.memory.config import MemoryConfig
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SetupStatus:
+    """Result of setup verification."""
+
+    is_ready: bool
+    checks: dict[str, bool]
+
+
+class MemorySetup:
+    """Setup operations for Agentic Memory.
+
+    Creates the required OpenSearch connectors and models for memory to work.
+
+    Example:
+        ```python
+        setup = MemorySetup(config)
+
+        # Create connectors and models
+        llm_model_id = await setup.setup_llm_model()
+        embedding_model_id = await setup.setup_embedding_model()
+
+        # Verify setup
+        status = await setup.verify_setup()
+        if status.is_ready:
+            print("Memory is ready!")
+        ```
+    """
+
+    def __init__(self, config: MemoryConfig) -> None:
+        """Initialize setup.
+
+        Args:
+            config: Memory configuration.
+        """
+        self._config = config
+        self._base_url = config.url
+        self._auth = config.auth
+
+    async def setup_llm_model(self) -> str:
+        """Create OpenAI LLM connector and model for fact extraction.
+
+        CRITICAL: The connector uses BOTH system_prompt AND user_prompt.
+
+        Returns:
+            The deployed LLM model ID.
+
+        Raises:
+            MemoryConfigurationError: If OpenAI API key is not configured.
+        """
+        if not self._config.openai_api_key:
+            raise MemoryConfigurationError(
+                "OpenAI API key required for LLM setup",
+                missing_config=["openai_api_key"],
+            )
+
+        connector_id = await self._create_llm_connector()
+        model_id = await self._register_model(
+            name="OpenAI LLM for Agentic Memory",
+            connector_id=connector_id,
+            function_name="remote",
+        )
+        await self._deploy_model(model_id)
+
+        logger.info(f"LLM model deployed: {model_id}")
+        return model_id
+
+    async def _create_llm_connector(self) -> str:
+        """Create OpenAI chat connector.
+
+        CRITICAL: Uses BOTH system_prompt AND user_prompt parameters.
+        This is required for Agentic Memory fact extraction to work.
+
+        Returns:
+            The connector ID.
+        """
+        # CRITICAL: Both system_prompt AND user_prompt are required
+        request_body = (
+            '{"model": "${parameters.model}", '
+            '"messages": ['
+            '{"role": "system", "content": "${parameters.system_prompt}"}, '
+            '{"role": "user", "content": "${parameters.user_prompt}"}'
+            "]}"
+        )
+
+        connector_body: dict[str, Any] = {
+            "name": "OpenAI Chat Connector for Agentic Memory",
+            "description": "Connector for OpenAI with system_prompt AND user_prompt support",
+            "version": "1",
+            "protocol": "http",
+            "parameters": {
+                "model": self._config.llm_model,
+            },
+            "credential": {
+                "openAI_key": self._config.openai_api_key,
+            },
+            "actions": [
+                {
+                    "action_type": "predict",
+                    "method": "POST",
+                    "url": "https://api.openai.com/v1/chat/completions",
+                    "headers": {
+                        "Authorization": "Bearer ${credential.openAI_key}",
+                        "Content-Type": "application/json",
+                    },
+                    "request_body": request_body,
+                }
+            ],
+        }
+
+        async with httpx.AsyncClient(
+            verify=self._config.verify_certs,
+            timeout=self._config.connect_timeout,
+        ) as client:
+            response = await client.post(
+                f"{self._base_url}/_plugins/_ml/connectors/_create",
+                json=connector_body,
+                auth=self._auth,
+            )
+            response.raise_for_status()
+            result = response.json()
+
+        connector_id = result.get("connector_id")
+        logger.info(f"LLM connector created: {connector_id}")
+        return connector_id
+
+    async def setup_embedding_model(self) -> str:
+        """Create OpenAI embedding connector and model.
+
+        Returns:
+            The deployed embedding model ID.
+
+        Raises:
+            MemoryConfigurationError: If OpenAI API key is not configured.
+        """
+        if not self._config.openai_api_key:
+            raise MemoryConfigurationError(
+                "OpenAI API key required for embedding setup",
+                missing_config=["openai_api_key"],
+            )
+
+        connector_id = await self._create_embedding_connector()
+        model_id = await self._register_model(
+            name="OpenAI Embedding for Agentic Memory",
+            connector_id=connector_id,
+            function_name="remote",
+        )
+        await self._deploy_model(model_id)
+
+        logger.info(f"Embedding model deployed: {model_id}")
+        return model_id
+
+    async def _create_embedding_connector(self) -> str:
+        """Create OpenAI embedding connector.
+
+        Returns:
+            The connector ID.
+        """
+        connector_body: dict[str, Any] = {
+            "name": "OpenAI Embedding Connector",
+            "description": "Connector for OpenAI text-embedding models",
+            "version": "1",
+            "protocol": "http",
+            "parameters": {
+                "model": self._config.embedding_model,
+            },
+            "credential": {
+                "openAI_key": self._config.openai_api_key,
+            },
+            "actions": [
+                {
+                    "action_type": "predict",
+                    "method": "POST",
+                    "url": "https://api.openai.com/v1/embeddings",
+                    "headers": {
+                        "Authorization": "Bearer ${credential.openAI_key}",
+                        "Content-Type": "application/json",
+                    },
+                    "request_body": '{"model": "${parameters.model}", "input": ${parameters.input}}',
+                    "post_process_function": "connector.post_process.openai.embedding",
+                }
+            ],
+        }
+
+        async with httpx.AsyncClient(
+            verify=self._config.verify_certs,
+            timeout=self._config.connect_timeout,
+        ) as client:
+            response = await client.post(
+                f"{self._base_url}/_plugins/_ml/connectors/_create",
+                json=connector_body,
+                auth=self._auth,
+            )
+            response.raise_for_status()
+            result = response.json()
+
+        connector_id = result.get("connector_id")
+        logger.info(f"Embedding connector created: {connector_id}")
+        return connector_id
+
+    async def _register_model(
+        self,
+        name: str,
+        connector_id: str,
+        function_name: str = "remote",
+    ) -> str:
+        """Register a model with OpenSearch.
+
+        Args:
+            name: Model name.
+            connector_id: Connector ID to use.
+            function_name: Model function name (default: remote).
+
+        Returns:
+            The registered model ID.
+        """
+        model_body: dict[str, Any] = {
+            "name": name,
+            "function_name": function_name,
+            "connector_id": connector_id,
+        }
+
+        async with httpx.AsyncClient(
+            verify=self._config.verify_certs,
+            timeout=self._config.connect_timeout,
+        ) as client:
+            response = await client.post(
+                f"{self._base_url}/_plugins/_ml/models/_register",
+                json=model_body,
+                auth=self._auth,
+            )
+            response.raise_for_status()
+            result = response.json()
+
+        return result.get("model_id")
+
+    async def _deploy_model(self, model_id: str) -> None:
+        """Deploy a model.
+
+        Args:
+            model_id: Model ID to deploy.
+        """
+        async with httpx.AsyncClient(
+            verify=self._config.verify_certs,
+            timeout=60.0,  # Deployment can be slow
+        ) as client:
+            response = await client.post(
+                f"{self._base_url}/_plugins/_ml/models/{model_id}/_deploy",
+                auth=self._auth,
+            )
+            response.raise_for_status()
+
+    async def verify_setup(self) -> SetupStatus:
+        """Verify that memory is properly configured.
+
+        Returns:
+            SetupStatus with verification results.
+        """
+        checks: dict[str, bool] = {}
+
+        # Check LLM model
+        if self._config.llm_model_id:
+            llm_ok = await self._check_model(self._config.llm_model_id)
+            checks["llm_model"] = llm_ok
+        else:
+            checks["llm_model"] = False
+
+        # Check embedding model
+        if self._config.embedding_model_id:
+            embed_ok = await self._check_model(self._config.embedding_model_id)
+            checks["embedding_model"] = embed_ok
+        else:
+            checks["embedding_model"] = False
+
+        is_ready = all(checks.values())
+        return SetupStatus(is_ready=is_ready, checks=checks)
+
+    async def _check_model(self, model_id: str) -> bool:
+        """Check if a model is deployed and responding.
+
+        Args:
+            model_id: Model ID to check.
+
+        Returns:
+            True if model is deployed and ready.
+        """
+        try:
+            async with httpx.AsyncClient(
+                verify=self._config.verify_certs,
+                timeout=self._config.connect_timeout,
+            ) as client:
+                response = await client.get(
+                    f"{self._base_url}/_plugins/_ml/models/{model_id}",
+                    auth=self._auth,
+                )
+                if response.status_code == 200:
+                    data = response.json()
+                    return data.get("model_state") == "DEPLOYED"
+        except Exception:
+            pass
+        return False
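The setup flow above composes into a one-shot bootstrap. A sketch (not part of the diff), assuming OPENAI_API_KEY is exported; since MemoryConfig is a frozen dataclass, the returned model IDs are threaded back in via dataclasses.replace before verification:

```python
import asyncio
from dataclasses import replace

from gnosisllm_knowledge.backends.opensearch.memory.config import MemoryConfig
from gnosisllm_knowledge.backends.opensearch.memory.setup import MemorySetup

async def bootstrap() -> None:
    config = MemoryConfig.from_env()  # reads OPENAI_API_KEY for connector setup
    setup = MemorySetup(config)

    # Each call creates a connector, registers a remote model, and deploys it.
    llm_id = await setup.setup_llm_model()
    embed_id = await setup.setup_embedding_model()

    # Carry the new IDs in a copy, then confirm both models report
    # model_state == "DEPLOYED" via verify_setup().
    ready = replace(config, llm_model_id=llm_id, embedding_model_id=embed_id)
    status = await MemorySetup(ready).verify_setup()
    print(status.is_ready, status.checks)

asyncio.run(bootstrap())
```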
gnosisllm_knowledge/backends/opensearch/searcher.py
@@ -296,12 +296,25 @@ class OpenSearchKnowledgeSearcher:
             Parsed search results.
         """
         try:
+            logger.debug(f"OpenSearch query: {os_query}")
+            logger.debug(f"Query include_highlights: {query.include_highlights}")
+
             response = await self._client.search(
                 index=index_name,
                 body=os_query,
                 **params,
             )

+            # Debug: Log first hit to see if highlights are present
+            hits = response.get("hits", {}).get("hits", [])
+            if hits:
+                first_hit = hits[0]
+                logger.debug(f"First hit keys: {first_hit.keys()}")
+                if "highlight" in first_hit:
+                    logger.debug(f"Highlight data: {first_hit['highlight']}")
+                else:
+                    logger.debug("No 'highlight' key in response hit")
+
             duration_ms = (time.perf_counter() - start_time) * 1000
             return self._parse_response(query, response, duration_ms)

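The added logger.debug calls are silent by default; to see them while investigating missing highlights, raise the module's log level. A sketch, assuming the searcher module creates its logger with logging.getLogger(__name__) as the memory modules above do:

```python
import logging

logging.basicConfig(level=logging.INFO)
# Assumed logger name, derived from the module path in the file list above.
logging.getLogger("gnosisllm_knowledge.backends.opensearch.searcher").setLevel(logging.DEBUG)
```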
gnosisllm_knowledge/backends/opensearch/searcher.py
@@ -381,3 +394,225 @@ class OpenSearchKnowledgeSearcher:
             search_after_token=search_after_token,
             has_more=len(items) == query.limit and total_hits > query.offset + len(items),
         )
+
+    async def get_collections(self, index_name: str) -> list[dict[str, Any]]:
+        """Get unique collections with document counts via aggregation.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            List of collections with id, name, and document_count.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist")
+                return []
+
+            # Aggregation query for unique collection_ids with counts
+            # Also aggregate collection_name for display
+            query = {
+                "size": 0,
+                "aggs": {
+                    "collections": {
+                        "terms": {
+                            "field": "collection_id",
+                            "size": 1000,  # Max collections to return
+                        },
+                        "aggs": {
+                            "collection_name": {
+                                "terms": {
+                                    "field": "collection_name",
+                                    "size": 1,
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            response = await self._client.search(index=index_name, body=query)
+
+            collections = []
+            buckets = response.get("aggregations", {}).get("collections", {}).get("buckets", [])
+
+            for bucket in buckets:
+                collection_id = bucket.get("key")
+                if not collection_id:
+                    continue
+
+                doc_count = bucket.get("doc_count", 0)
+
+                # Get collection name from nested agg or use ID as fallback
+                name_buckets = bucket.get("collection_name", {}).get("buckets", [])
+                collection_name = name_buckets[0].get("key") if name_buckets else collection_id
+
+                collections.append({
+                    "id": collection_id,
+                    "name": collection_name or collection_id,
+                    "document_count": doc_count,
+                })
+
+            logger.debug(f"Found {len(collections)} collections in {index_name}")
+            return collections
+
+        except Exception as e:
+            logger.error(f"Failed to get collections from {index_name}: {e}")
+            return []
+
+    async def get_stats(self, index_name: str) -> dict[str, Any]:
+        """Get index statistics.
+
+        Args:
+            index_name: Index to query.
+
+        Returns:
+            Dictionary with document_count and index info.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                return {
+                    "document_count": 0,
+                    "index_name": index_name,
+                    "exists": False,
+                }
+
+            # Get index stats
+            stats = await self._client.indices.stats(index=index_name)
+            index_stats = stats.get("indices", {}).get(index_name, {})
+            primaries = index_stats.get("primaries", {})
+            docs = primaries.get("docs", {})
+
+            return {
+                "document_count": docs.get("count", 0),
+                "index_name": index_name,
+                "exists": True,
+                "size_bytes": primaries.get("store", {}).get("size_in_bytes", 0),
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to get stats for {index_name}: {e}")
+            return {
+                "document_count": 0,
+                "index_name": index_name,
+                "error": str(e),
+            }
+
+    async def list_documents(
+        self,
+        index_name: str,
+        *,
+        account_id: str | None = None,
+        source_id: str | None = None,
+        collection_id: str | None = None,
+        limit: int = 50,
+        offset: int = 0,
+    ) -> dict[str, Any]:
+        """List documents with optional filters.
+
+        Args:
+            index_name: Index to query.
+            account_id: Optional account ID filter.
+            source_id: Optional source ID filter.
+            collection_id: Optional collection ID filter.
+            limit: Maximum documents to return.
+            offset: Number of documents to skip.
+
+        Returns:
+            Dictionary with documents, total, limit, offset.
+        """
+        try:
+            # Check if index exists
+            exists = await self._client.indices.exists(index=index_name)
+            if not exists:
+                logger.debug(f"Index {index_name} does not exist")
+                return {
+                    "documents": [],
+                    "total": 0,
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+            # Build filter clauses
+            filters: list[dict[str, Any]] = []
+
+            if account_id:
+                filters.append({"term": {"account_id": account_id}})
+
+            if source_id:
+                filters.append({"term": {"source_id": source_id}})
+
+            if collection_id:
+                filters.append({"term": {"collection_id": collection_id}})
+
+            # Build query with match_all and filters
+            query: dict[str, Any] = {
+                "size": limit,
+                "from": offset,
+                "sort": [
+                    {"created_at": {"order": "desc", "unmapped_type": "date"}},
+                    {"_id": {"order": "asc"}},
+                ],
+                "_source": {
+                    "excludes": ["content_embedding"],
+                },
+            }
+
+            if filters:
+                query["query"] = {
+                    "bool": {
+                        "must": [{"match_all": {}}],
+                        "filter": filters,
+                    }
+                }
+            else:
+                query["query"] = {"match_all": {}}
+
+            response = await self._client.search(index=index_name, body=query)
+
+            hits = response.get("hits", {})
+            total = hits.get("total", {})
+            total_hits = total.get("value", 0) if isinstance(total, dict) else total
+
+            documents = []
+            for hit in hits.get("hits", []):
+                source = hit.get("_source", {})
+                doc = {
+                    "id": hit.get("_id", ""),
+                    "title": source.get("title"),
+                    "url": source.get("url"),
+                    "content_preview": (source.get("content", ""))[:200],
+                    "content": source.get("content"),
+                    "chunk_index": source.get("chunk_index"),
+                    "total_chunks": source.get("total_chunks"),
+                    "source_id": source.get("source_id"),
+                    "collection_id": source.get("collection_id"),
+                    "created_at": source.get("created_at"),
+                    "metadata": source.get("metadata"),
+                }
+                documents.append(doc)
+
+            logger.debug(
+                f"Listed {len(documents)} documents from {index_name} "
+                f"(total: {total_hits}, source_id: {source_id})"
+            )
+
+            return {
+                "documents": documents,
+                "total": total_hits,
+                "limit": limit,
+                "offset": offset,
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to list documents from {index_name}: {e}")
+            return {
+                "documents": [],
+                "total": 0,
+                "limit": limit,
+                "offset": offset,
+            }
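Taken together, the new searcher methods support a simple index-inspection flow. A sketch (not part of the diff), assuming `searcher` is an already-constructed OpenSearchKnowledgeSearcher and "knowledge" is an existing index name:

```python
async def inspect_index(searcher, index_name: str = "knowledge") -> None:
    # Index-level stats: document count plus primary-store size.
    stats = await searcher.get_stats(index_name)
    print(f"{stats['document_count']} docs, {stats.get('size_bytes', 0)} bytes")

    # Collections discovered via the terms aggregation on collection_id.
    for coll in await searcher.get_collections(index_name):
        print(f"{coll['id']}: {coll['name']} ({coll['document_count']} docs)")

    # First page of documents, newest first, embeddings excluded from _source.
    page = await searcher.list_documents(index_name, limit=10)
    for doc in page["documents"]:
        print(doc["id"], doc["title"], doc["content_preview"])
```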