gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,255 @@
1
+ """OpenSearch index mappings for knowledge documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
8
+
9
+
10
def get_knowledge_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
    """Build the index settings block for knowledge documents.

    Enables k-NN on the index and applies shard/replica/refresh values
    from the configuration.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index settings dictionary.
    """
    index_settings: dict[str, Any] = {
        "number_of_shards": config.number_of_shards,
        "number_of_replicas": config.number_of_replicas,
        "refresh_interval": config.refresh_interval,
        "knn": True,
        "knn.algo_param.ef_search": config.knn_algo_param_ef_search,
    }

    # Route writes through the ingest pipeline when one is configured.
    if config.ingest_pipeline_name:
        index_settings["default_pipeline"] = config.ingest_pipeline_name

    return {"index": index_settings}
35
+
36
+
37
def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
    """Build the index mappings for knowledge documents with k-NN vectors.

    The embedding field name and HNSW parameters come from the
    configuration; every other field is fixed.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index mappings dictionary.
    """
    properties: dict[str, Any] = {
        # Document identity
        "id": {"type": "keyword"},
        "url": {"type": "keyword"},
        "title": {
            "type": "text",
            "analyzer": "standard",
            "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
        },
        "source": {"type": "keyword"},
        # Multi-tenant scoping
        "account_id": {"type": "keyword"},
        "collection_id": {"type": "keyword"},
        "source_id": {"type": "keyword"},
        # Content; term vectors speed up highlighting
        "content": {
            "type": "text",
            "analyzer": "standard",
            "term_vector": "with_positions_offsets",
        },
        # Chunking information
        "chunk_index": {"type": "integer"},
        "total_chunks": {"type": "integer"},
        "parent_doc_id": {"type": "keyword"},
        "start_position": {"type": "integer"},
        "end_position": {"type": "integer"},
        # Quality & validation
        "quality_score": {"type": "float"},
        "language": {"type": "keyword"},
        "content_hash": {"type": "keyword"},
        "word_count": {"type": "integer"},
        # Lifecycle status
        "status": {"type": "keyword"},
        # PII handling flags
        "pii_detected": {"type": "boolean"},
        "pii_redacted": {"type": "boolean"},
        # Free-form metadata
        "metadata": {"type": "object", "enabled": True, "dynamic": True},
        # Timestamps
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "indexed_at": {"type": "date"},
    }

    # Embedding vector. The field name matches config.embedding_field
    # (default: content_embedding) so the ingest pipeline can target it.
    properties[config.embedding_field] = {
        "type": "knn_vector",
        "dimension": config.embedding_dimension,
        "method": {
            "name": "hnsw",
            "space_type": config.knn_space_type,
            "engine": config.knn_engine,
            "parameters": {
                "ef_construction": config.knn_algo_param_ef_construction,
                "m": config.knn_algo_param_m,
            },
        },
    }

    return {"properties": properties}
108
+
109
+
110
def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
    """Build index settings for conversation memory.

    Memory indices are expected to stay small, so a single shard is
    used; the replica count still follows the shared configuration.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index settings dictionary.
    """
    return {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": config.number_of_replicas,
            "refresh_interval": "1s",
        }
    }
126
+
127
+
128
def get_memory_index_mappings() -> dict[str, Any]:
    """Build index mappings for conversation memory messages.

    Returns:
        Index mappings dictionary.
    """
    # Exact-match identifiers; "role" holds user/assistant/system.
    keyword_fields = ("conversation_id", "account_id", "user_id", "role")

    properties: dict[str, Any] = {name: {"type": "keyword"} for name in keyword_fields}
    properties["message_index"] = {"type": "integer"}
    properties["content"] = {"type": "text"}
    properties["metadata"] = {"type": "object", "enabled": True, "dynamic": True}
    for name in ("created_at", "expires_at"):
        properties[name] = {"type": "date"}

    return {"properties": properties}
147
+
148
+
149
def get_index_template(
    config: OpenSearchConfig,
    index_patterns: list[str] | None = None,
) -> dict[str, Any]:
    """Build a composable index template for knowledge indices.

    Bundles the knowledge settings and mappings under the given
    patterns so new indices pick them up automatically.

    Args:
        config: OpenSearch configuration.
        index_patterns: Patterns the template applies to
            (default: ``["{prefix}-*"]`` from ``config.index_prefix``).

    Returns:
        Index template dictionary.
    """
    patterns = (
        index_patterns
        if index_patterns is not None
        else [f"{config.index_prefix}-*"]
    )

    return {
        "index_patterns": patterns,
        "template": {
            "settings": get_knowledge_index_settings(config),
            "mappings": get_knowledge_index_mappings(config),
        },
        "priority": 100,
        "version": 1,
    }
174
+
175
+
176
def get_ingest_pipeline(config: OpenSearchConfig) -> dict[str, Any]:
    """Build the ingest pipeline for document processing.

    The pipeline:
    1. Generates embeddings with the deployed ML model (only when
       ``config.model_id`` is set).
    2. Stamps ``indexed_at`` with the ingest timestamp.
    3. Computes ``word_count`` from ``content``.

    Args:
        config: OpenSearch configuration.

    Returns:
        Ingest pipeline dictionary.
    """
    processors: list[dict[str, Any]] = []

    # Server-side embedding generation requires a deployed model.
    if config.model_id:
        processors.append({
            "text_embedding": {
                "model_id": config.model_id,
                # Source field -> destination vector field.
                "field_map": {"content": config.embedding_field},
            }
        })

    # Record when the document was ingested.
    processors.append({
        "set": {
            "field": "indexed_at",
            "value": "{{_ingest.timestamp}}",
        }
    })

    # Word count via a Painless script. A single-line source avoids
    # baking Python indentation into the stored pipeline definition.
    # The doubled backslash survives JSON encoding so Painless sees the
    # regex \s+.
    # NOTE(review): regex use in Painless requires the cluster setting
    # `script.painless.regex.enabled` to be true — confirm for the
    # target cluster.
    word_count_script = (
        "if (ctx.content != null) "
        '{ ctx.word_count = ctx.content.split("\\\\s+").length; }'
    )
    processors.append({
        "script": {
            "description": "Calculate word count",
            "source": word_count_script,
            "ignore_failure": True,
        }
    })

    return {
        "description": "GnosisLLM knowledge document ingest pipeline",
        "processors": processors,
    }
228
+
229
+
230
def get_search_pipeline(
    config: OpenSearchConfig,
    semantic_weight: float = 0.7,
    keyword_weight: float = 0.3,
) -> dict[str, Any]:
    """Build the search pipeline for hybrid search score normalization.

    Applies min_max normalization and an arithmetic_mean combination of
    the neural and keyword sub-query scores. The weights were previously
    hard-coded; they are now parameters (defaults unchanged) so callers
    can keep them in sync with ``QueryBuilder.build_hybrid_query``.

    Args:
        config: OpenSearch configuration.
        semantic_weight: Weight for the neural (semantic) score
            (default: 0.7).
        keyword_weight: Weight for the BM25 (keyword) score
            (default: 0.3).

    Returns:
        Search pipeline dictionary.
    """
    return {
        "description": "GnosisLLM search pipeline for hybrid search",
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {"technique": "min_max"},
                    "combination": {
                        "technique": "arithmetic_mean",
                        # Order matches the hybrid query's sub-queries:
                        # [semantic, keyword].
                        "parameters": {"weights": [semantic_weight, keyword_weight]},
                    },
                }
            }
        ],
    }
@@ -0,0 +1,445 @@
1
+ """OpenSearch query builders for knowledge search.
2
+
3
+ Uses OpenSearch neural search - embeddings are generated automatically
4
+ via the deployed model. No Python-side embedding generation needed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from gnosisllm_knowledge.core.domain.search import SearchQuery
12
+
13
+
14
class QueryBuilder:
    """Builds OpenSearch request bodies for knowledge search.

    Semantic queries rely on OpenSearch neural search: the deployed ML
    model (identified by ``model_id``) embeds the query text server-side
    via ingest and search pipelines, so no embeddings are generated in
    Python.

    Example:
        ```python
        query = SearchQuery(text="how to configure", account_id="acc123")
        builder = QueryBuilder(query, model_id="abc123")
        os_query = builder.build_hybrid_query()
        ```
    """

    def __init__(
        self,
        query: SearchQuery,
        model_id: str | None = None,
        embedding_field: str = "content_embedding",
    ) -> None:
        """Initialize the builder.

        Args:
            query: Search query parameters.
            model_id: OpenSearch ML model ID for neural search.
            embedding_field: Name of the embedding vector field.
        """
        self._query = query
        self._model_id = model_id
        self._embedding_field = embedding_field

    def _neural_clause(self, k: int) -> dict[str, Any]:
        """Return a neural-search clause against the embedding field."""
        return {
            "neural": {
                self._embedding_field: {
                    "query_text": self._query.text,
                    "model_id": self._model_id,
                    "k": k,
                }
            }
        }

    def _match_clauses(self) -> list[dict[str, Any]]:
        """Return boosted match clauses, one per boosted field."""
        boosts = self._query.field_boosts or {"title": 2.0, "content": 1.0}
        return [
            {"match": {field: {"query": self._query.text, "boost": boost}}}
            for field, boost in boosts.items()
        ]

    def build_semantic_query(self, k: int | None = None) -> dict[str, Any]:
        """Build a neural (semantic) search query.

        OpenSearch embeds the query text automatically with the deployed
        model identified by ``model_id``.

        Args:
            k: Number of results for k-NN (default: query.limit).

        Raises:
            ValueError: If no model_id was supplied.

        Returns:
            OpenSearch query dictionary.
        """
        if not self._model_id:
            raise ValueError("model_id required for semantic search")

        body: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "must": [self._neural_clause(k or self._query.limit)],
                    "filter": self._build_filters(),
                }
            },
        }
        if self._query.min_score:
            body["min_score"] = self._query.min_score

        self._add_highlighting(body)
        self._add_source_filtering(body)
        self._add_explain(body)
        self._add_search_after(body)
        return body

    def build_keyword_query(self) -> dict[str, Any]:
        """Build a keyword (BM25) search query with field boosting.

        Returns:
            OpenSearch query dictionary.
        """
        body: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "should": self._match_clauses(),
                    "minimum_should_match": 1,
                    "filter": self._build_filters(),
                }
            },
        }
        if self._query.min_score:
            body["min_score"] = self._query.min_score

        self._add_highlighting(body)
        self._add_source_filtering(body)
        self._add_explain(body)
        self._add_search_after(body)
        return body

    def build_hybrid_query(
        self,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        k: int | None = None,
    ) -> dict[str, Any]:
        """Build a hybrid query combining neural and keyword search.

        Score normalization/combination (and the weight parameters) are
        handled by the search pipeline, not embedded in this body.

        Args:
            semantic_weight: Weight for the semantic score (default: 0.7).
            keyword_weight: Weight for the keyword score (default: 0.3).
            k: Number of results for k-NN (default: query.limit * 2).

        Returns:
            OpenSearch query dictionary.
        """
        if not self._model_id:
            # No deployed model — degrade gracefully to BM25 only.
            return self.build_keyword_query()

        neural = self._neural_clause(k or (self._query.limit * 2))
        keyword = {
            "bool": {
                "should": self._match_clauses(),
                "minimum_should_match": 1,
            }
        }

        body: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {"hybrid": {"queries": [neural, keyword]}},
        }

        # Apply filters by wrapping the hybrid query.
        # NOTE(review): OpenSearch documents `hybrid` as a top-level
        # query that should not be nested inside `bool` — verify filters
        # behave as intended on the target cluster version.
        clauses = self._build_filters()
        if clauses:
            body["query"] = {
                "bool": {
                    "must": [body["query"]],
                    "filter": clauses,
                }
            }

        if self._query.min_score:
            body["min_score"] = self._query.min_score

        self._add_highlighting(body)
        self._add_source_filtering(body)
        self._add_explain(body)
        # No sort: the hybrid normalization processor orders results.
        return body

    def build_more_like_this_query(
        self,
        doc_id: str,
        fields: list[str] | None = None,
    ) -> dict[str, Any]:
        """Build a more-like-this query for similar documents.

        Args:
            doc_id: Document ID to find similar documents for.
            fields: Fields used for similarity (default: content, title).

        Returns:
            OpenSearch query dictionary.
        """
        body: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "must": [
                        {
                            "more_like_this": {
                                "fields": fields or ["content", "title"],
                                "like": [{"_id": doc_id}],
                                "min_term_freq": 1,
                                "max_query_terms": 25,
                                "min_doc_freq": 1,
                            }
                        }
                    ],
                    "filter": self._build_filters(),
                    # Exclude the seed document itself.
                    "must_not": [{"ids": {"values": [doc_id]}}],
                }
            },
        }

        self._add_highlighting(body)
        self._add_source_filtering(body)
        return body

    def _build_filters(self) -> list[dict[str, Any]]:
        """Translate query parameters into bool filter clauses.

        Returns:
            List of filter clauses.
        """
        q = self._query
        clauses: list[dict[str, Any]] = []

        # Tenant isolation — always constrain by account when provided.
        if q.account_id:
            clauses.append({"term": {"account_id": q.account_id}})
        if q.collection_ids:
            clauses.append({"terms": {"collection_id": q.collection_ids}})
        if q.source_ids:
            clauses.append({"terms": {"source_id": q.source_ids}})

        # Arbitrary metadata constraints: lists -> terms, scalars -> term.
        for field, value in (q.metadata_filters or {}).items():
            if isinstance(value, list):
                clauses.append({"terms": {field: value}})
            else:
                clauses.append({"term": {field: value}})

        return clauses

    def _add_highlighting(self, body: dict[str, Any]) -> None:
        """Attach highlight config when the query asks for highlights.

        Args:
            body: Query dictionary to modify in place.
        """
        if not self._query.include_highlights:
            return

        body["highlight"] = {
            "fields": {
                "content": {
                    "fragment_size": 150,
                    "number_of_fragments": 3,
                },
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1,
                },
            },
            "pre_tags": ["<em>"],
            "post_tags": ["</em>"],
        }

    def _add_source_filtering(self, body: dict[str, Any]) -> None:
        """Restrict returned _source fields per include/exclude lists.

        Args:
            body: Query dictionary to modify in place.
        """
        includes = self._query.include_fields
        excludes = self._query.exclude_fields
        if not includes and not excludes:
            return

        source: dict[str, Any] = {}
        if includes:
            source["includes"] = includes
        if excludes:
            source["excludes"] = excludes
        body["_source"] = source

    def _add_explain(self, body: dict[str, Any]) -> None:
        """Enable per-hit score explanations when requested.

        Args:
            body: Query dictionary to modify in place.
        """
        if self._query.explain:
            body["explain"] = True

    def _add_search_after(self, body: dict[str, Any]) -> None:
        """Add a deterministic sort so pagination stays stable.

        Args:
            body: Query dictionary to modify in place.
        """
        body["sort"] = [
            {"_score": "desc"},
            {"_id": "asc"},  # Tiebreaker
        ]
358
+
359
+
360
def build_delete_by_source_query(
    source_id: str,
    account_id: str | None = None,
) -> dict[str, Any]:
    """Build a delete-by-query body targeting one source.

    Args:
        source_id: Source whose documents should be removed.
        account_id: Optional tenant scope for multi-tenant safety.

    Returns:
        Delete-by-query dictionary.
    """
    clauses: list[dict[str, Any]] = [{"term": {"source_id": source_id}}]
    if account_id:
        clauses.append({"term": {"account_id": account_id}})

    return {"query": {"bool": {"filter": clauses}}}
384
+
385
+
386
def build_delete_by_collection_query(
    collection_id: str,
    account_id: str | None = None,
) -> dict[str, Any]:
    """Build a delete-by-query body targeting one collection.

    Args:
        collection_id: Collection whose documents should be removed.
        account_id: Optional tenant scope for multi-tenant safety.

    Returns:
        Delete-by-query dictionary.
    """
    clauses: list[dict[str, Any]] = [{"term": {"collection_id": collection_id}}]
    if account_id:
        clauses.append({"term": {"account_id": account_id}})

    return {"query": {"bool": {"filter": clauses}}}
410
+
411
+
412
+ def build_count_query(
413
+ account_id: str | None = None,
414
+ collection_id: str | None = None,
415
+ source_id: str | None = None,
416
+ ) -> dict[str, Any]:
417
+ """Build query to count documents.
418
+
419
+ Args:
420
+ account_id: Optional account filter.
421
+ collection_id: Optional collection filter.
422
+ source_id: Optional source filter.
423
+
424
+ Returns:
425
+ Count query dictionary.
426
+ """
427
+ filters: list[dict[str, Any]] = []
428
+
429
+ if account_id:
430
+ filters.append({"term": {"account_id": account_id}})
431
+ if collection_id:
432
+ filters.append({"term": {"collection_id": collection_id}})
433
+ if source_id:
434
+ filters.append({"term": {"source_id": source_id}})
435
+
436
+ if not filters:
437
+ return {"query": {"match_all": {}}}
438
+
439
+ return {
440
+ "query": {
441
+ "bool": {
442
+ "filter": filters,
443
+ }
444
+ }
445
+ }