gnosisllm-knowledge 0.2.0 (py3-none-any.whl)
This diff shows the contents of package versions as publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
- gnosisllm_knowledge/__init__.py +152 -0
- gnosisllm_knowledge/api/__init__.py +5 -0
- gnosisllm_knowledge/api/knowledge.py +548 -0
- gnosisllm_knowledge/backends/__init__.py +26 -0
- gnosisllm_knowledge/backends/memory/__init__.py +9 -0
- gnosisllm_knowledge/backends/memory/indexer.py +384 -0
- gnosisllm_knowledge/backends/memory/searcher.py +516 -0
- gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
- gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
- gnosisllm_knowledge/backends/opensearch/config.py +195 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
- gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
- gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
- gnosisllm_knowledge/chunking/__init__.py +9 -0
- gnosisllm_knowledge/chunking/fixed.py +138 -0
- gnosisllm_knowledge/chunking/sentence.py +239 -0
- gnosisllm_knowledge/cli/__init__.py +18 -0
- gnosisllm_knowledge/cli/app.py +509 -0
- gnosisllm_knowledge/cli/commands/__init__.py +7 -0
- gnosisllm_knowledge/cli/commands/agentic.py +529 -0
- gnosisllm_knowledge/cli/commands/load.py +369 -0
- gnosisllm_knowledge/cli/commands/search.py +440 -0
- gnosisllm_knowledge/cli/commands/setup.py +228 -0
- gnosisllm_knowledge/cli/display/__init__.py +5 -0
- gnosisllm_knowledge/cli/display/service.py +555 -0
- gnosisllm_knowledge/cli/utils/__init__.py +5 -0
- gnosisllm_knowledge/cli/utils/config.py +207 -0
- gnosisllm_knowledge/core/__init__.py +87 -0
- gnosisllm_knowledge/core/domain/__init__.py +43 -0
- gnosisllm_knowledge/core/domain/document.py +240 -0
- gnosisllm_knowledge/core/domain/result.py +176 -0
- gnosisllm_knowledge/core/domain/search.py +327 -0
- gnosisllm_knowledge/core/domain/source.py +139 -0
- gnosisllm_knowledge/core/events/__init__.py +23 -0
- gnosisllm_knowledge/core/events/emitter.py +216 -0
- gnosisllm_knowledge/core/events/types.py +226 -0
- gnosisllm_knowledge/core/exceptions.py +407 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
- gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
- gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
- gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
- gnosisllm_knowledge/core/interfaces/loader.py +102 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
- gnosisllm_knowledge/core/interfaces/setup.py +164 -0
- gnosisllm_knowledge/fetchers/__init__.py +12 -0
- gnosisllm_knowledge/fetchers/config.py +77 -0
- gnosisllm_knowledge/fetchers/http.py +167 -0
- gnosisllm_knowledge/fetchers/neoreader.py +204 -0
- gnosisllm_knowledge/loaders/__init__.py +13 -0
- gnosisllm_knowledge/loaders/base.py +399 -0
- gnosisllm_knowledge/loaders/factory.py +202 -0
- gnosisllm_knowledge/loaders/sitemap.py +285 -0
- gnosisllm_knowledge/loaders/website.py +57 -0
- gnosisllm_knowledge/py.typed +0 -0
- gnosisllm_knowledge/services/__init__.py +9 -0
- gnosisllm_knowledge/services/indexing.py +387 -0
- gnosisllm_knowledge/services/search.py +349 -0
- gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
- gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
- gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0

gnosisllm_knowledge/backends/opensearch/mappings.py
@@ -0,0 +1,255 @@
"""OpenSearch index mappings for knowledge documents."""

from __future__ import annotations

from typing import Any

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig


def get_knowledge_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
    """Get index settings for knowledge documents.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index settings dictionary.
    """
    settings: dict[str, Any] = {
        "index": {
            "number_of_shards": config.number_of_shards,
            "number_of_replicas": config.number_of_replicas,
            "refresh_interval": config.refresh_interval,
            "knn": True,
            "knn.algo_param.ef_search": config.knn_algo_param_ef_search,
        }
    }

    # Set default ingest pipeline if configured
    pipeline_name = config.ingest_pipeline_name
    if pipeline_name:
        settings["index"]["default_pipeline"] = pipeline_name

    return settings


def get_knowledge_index_mappings(config: OpenSearchConfig) -> dict[str, Any]:
    """Get index mappings for knowledge documents with k-NN vectors.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index mappings dictionary.
    """
    embedding_field = config.embedding_field  # Default: content_embedding

    return {
        "properties": {
            # === Document Identity ===
            "id": {"type": "keyword"},
            "url": {"type": "keyword"},
            "title": {
                "type": "text",
                "analyzer": "standard",
                "fields": {"keyword": {"type": "keyword", "ignore_above": 512}},
            },
            "source": {"type": "keyword"},
            # === Multi-tenant Fields ===
            "account_id": {"type": "keyword"},
            "collection_id": {"type": "keyword"},
            "source_id": {"type": "keyword"},
            # === Content ===
            "content": {
                "type": "text",
                "analyzer": "standard",
                "term_vector": "with_positions_offsets",  # For highlighting
            },
            # === Embedding Vector ===
            # Field name matches config.embedding_field (default: content_embedding)
            embedding_field: {
                "type": "knn_vector",
                "dimension": config.embedding_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": config.knn_space_type,
                    "engine": config.knn_engine,
                    "parameters": {
                        "ef_construction": config.knn_algo_param_ef_construction,
                        "m": config.knn_algo_param_m,
                    },
                },
            },
            # === Chunking Information ===
            "chunk_index": {"type": "integer"},
            "total_chunks": {"type": "integer"},
            "parent_doc_id": {"type": "keyword"},
            "start_position": {"type": "integer"},
            "end_position": {"type": "integer"},
            # === Quality & Validation ===
            "quality_score": {"type": "float"},
            "language": {"type": "keyword"},
            "content_hash": {"type": "keyword"},
            "word_count": {"type": "integer"},
            # === Status ===
            "status": {"type": "keyword"},
            # === PII Handling ===
            "pii_detected": {"type": "boolean"},
            "pii_redacted": {"type": "boolean"},
            # === Metadata ===
            "metadata": {"type": "object", "enabled": True, "dynamic": True},
            # === Timestamps ===
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
            "indexed_at": {"type": "date"},
        }
    }
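
For orientation, here is a minimal sketch of creating an index from the two builders above using the opensearch-py client. The client endpoint, index name, and OpenSearchConfig construction are illustrative assumptions, not the package's documented setup path (that lives in setup.py in this diff).

```python
# Minimal sketch, assuming opensearch-py and an illustrative index name;
# the OpenSearchConfig construction is a guess at default behavior.
from opensearchpy import OpenSearch

from gnosisllm_knowledge.backends.opensearch.config import OpenSearchConfig
from gnosisllm_knowledge.backends.opensearch.mappings import (
    get_knowledge_index_mappings,
    get_knowledge_index_settings,
)

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
config = OpenSearchConfig()  # assumed default construction; see config.py

client.indices.create(
    index="gnosisllm-knowledge",  # hypothetical name
    body={
        "settings": get_knowledge_index_settings(config),
        "mappings": get_knowledge_index_mappings(config),
    },
)
```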


def get_memory_index_settings(config: OpenSearchConfig) -> dict[str, Any]:
    """Get index settings for conversation memory.

    Args:
        config: OpenSearch configuration.

    Returns:
        Index settings dictionary.
    """
    return {
        "index": {
            "number_of_shards": 1,  # Memory is typically smaller
            "number_of_replicas": config.number_of_replicas,
            "refresh_interval": "1s",
        }
    }


def get_memory_index_mappings() -> dict[str, Any]:
    """Get index mappings for conversation memory.

    Returns:
        Index mappings dictionary.
    """
    return {
        "properties": {
            "conversation_id": {"type": "keyword"},
            "account_id": {"type": "keyword"},
            "user_id": {"type": "keyword"},
            "message_index": {"type": "integer"},
            "role": {"type": "keyword"},  # user, assistant, system
            "content": {"type": "text"},
            "metadata": {"type": "object", "enabled": True, "dynamic": True},
            "created_at": {"type": "date"},
            "expires_at": {"type": "date"},
        }
    }


def get_index_template(
    config: OpenSearchConfig,
    index_patterns: list[str] | None = None,
) -> dict[str, Any]:
    """Get index template for knowledge indices.

    Args:
        config: OpenSearch configuration.
        index_patterns: Index patterns to match (default: ["{prefix}-*"]).

    Returns:
        Index template dictionary.
    """
    if index_patterns is None:
        index_patterns = [f"{config.index_prefix}-*"]

    return {
        "index_patterns": index_patterns,
        "template": {
            "settings": get_knowledge_index_settings(config),
            "mappings": get_knowledge_index_mappings(config),
        },
        "priority": 100,
        "version": 1,
    }


def get_ingest_pipeline(config: OpenSearchConfig) -> dict[str, Any]:
    """Get ingest pipeline for document processing.

    Creates a pipeline that:
    1. Generates embeddings using the deployed ML model
    2. Sets indexed_at timestamp
    3. Calculates word count

    Args:
        config: OpenSearch configuration.

    Returns:
        Ingest pipeline dictionary.
    """
    processors: list[dict[str, Any]] = []

    # Text embedding processor (requires model_id)
    if config.model_id:
        processors.append({
            "text_embedding": {
                "model_id": config.model_id,
                "field_map": {
                    "content": config.embedding_field,  # content -> content_embedding
                },
            }
        })

    # Set indexed_at timestamp
    processors.append({
        "set": {
            "field": "indexed_at",
            "value": "{{_ingest.timestamp}}",
        }
    })

    # Calculate word count
    processors.append({
        "script": {
            "description": "Calculate word count",
            "source": """
                if (ctx.content != null) {
                    ctx.word_count = ctx.content.split("\\\\s+").length;
                }
            """,
            "ignore_failure": True,
        }
    })

    return {
        "description": "GnosisLLM knowledge document ingest pipeline",
        "processors": processors,
    }
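
Continuing the sketch, registering this pipeline makes the three processors run on every indexed document. The pipeline id below is hypothetical; it would need to equal config.ingest_pipeline_name for get_knowledge_index_settings() to wire it in as the index default_pipeline.

```python
# Continuing the sketch above (client/config as before); id is hypothetical.
from gnosisllm_knowledge.backends.opensearch.mappings import get_ingest_pipeline

client.ingest.put_pipeline(id="gnosisllm-ingest", body=get_ingest_pipeline(config))
```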


def get_search_pipeline(config: OpenSearchConfig) -> dict[str, Any]:
    """Get search pipeline for hybrid search score normalization.

    Uses min_max normalization and arithmetic_mean combination
    for hybrid neural + keyword search.

    Args:
        config: OpenSearch configuration.

    Returns:
        Search pipeline dictionary.
    """
    return {
        "description": "GnosisLLM search pipeline for hybrid search",
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {"technique": "min_max"},
                    "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": {"weights": [0.7, 0.3]},  # semantic, keyword
                    },
                }
            }
        ],
    }
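
Note that the hybrid score weights live here in the search pipeline, not in the query DSL. A sketch of registering the pipeline and making it the index default; the generic transport call is used because dedicated search-pipeline helpers vary across opensearch-py versions, and both names are hypothetical.

```python
# Sketch: register the search pipeline and set it as the index default.
# perform_request is used because search-pipeline helpers vary across
# opensearch-py versions; pipeline and index names are hypothetical.
from gnosisllm_knowledge.backends.opensearch.mappings import get_search_pipeline

client.transport.perform_request(
    "PUT",
    "/_search/pipeline/gnosisllm-hybrid",
    body=get_search_pipeline(config),
)
client.indices.put_settings(
    index="gnosisllm-knowledge",
    body={"index.search.default_pipeline": "gnosisllm-hybrid"},
)
```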

gnosisllm_knowledge/backends/opensearch/queries.py
@@ -0,0 +1,445 @@
"""OpenSearch query builders for knowledge search.

Uses OpenSearch neural search - embeddings are generated automatically
via the deployed model. No Python-side embedding generation needed.
"""

from __future__ import annotations

from typing import Any

from gnosisllm_knowledge.core.domain.search import SearchQuery


class QueryBuilder:
    """Builder for OpenSearch queries.

    Uses OpenSearch neural search for semantic queries. The deployed
    model handles embedding generation automatically via ingest and
    search pipelines.

    Example:
        ```python
        query = SearchQuery(text="how to configure", account_id="acc123")
        builder = QueryBuilder(query, model_id="abc123")
        os_query = builder.build_hybrid_query()
        ```
    """

    def __init__(
        self,
        query: SearchQuery,
        model_id: str | None = None,
        embedding_field: str = "content_embedding",
    ) -> None:
        """Initialize query builder.

        Args:
            query: Search query parameters.
            model_id: OpenSearch ML model ID for neural search.
            embedding_field: Field name for the embedding vector.
        """
        self._query = query
        self._model_id = model_id
        self._embedding_field = embedding_field

    def build_semantic_query(self, k: int | None = None) -> dict[str, Any]:
        """Build neural (semantic) search query.

        OpenSearch automatically embeds the query text using the
        deployed model specified by model_id.

        Args:
            k: Number of results for k-NN (default: query.limit).

        Returns:
            OpenSearch query dictionary.
        """
        if not self._model_id:
            raise ValueError("model_id required for semantic search")

        k = k or self._query.limit

        query: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "must": [
                        {
                            "neural": {
                                self._embedding_field: {
                                    "query_text": self._query.text,
                                    "model_id": self._model_id,
                                    "k": k,
                                }
                            }
                        }
                    ],
                    "filter": self._build_filters(),
                }
            },
        }

        if self._query.min_score:
            query["min_score"] = self._query.min_score

        self._add_highlighting(query)
        self._add_source_filtering(query)
        self._add_explain(query)
        self._add_search_after(query)

        return query

    def build_keyword_query(self) -> dict[str, Any]:
        """Build keyword (BM25) search query.

        Returns:
            OpenSearch query dictionary.
        """
        # Build match query with optional field boosting
        field_boosts = self._query.field_boosts or {"title": 2.0, "content": 1.0}

        should_clauses = []
        for field, boost in field_boosts.items():
            should_clauses.append(
                {
                    "match": {
                        field: {
                            "query": self._query.text,
                            "boost": boost,
                        }
                    }
                }
            )

        query: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "should": should_clauses,
                    "minimum_should_match": 1,
                    "filter": self._build_filters(),
                }
            },
        }

        if self._query.min_score:
            query["min_score"] = self._query.min_score

        self._add_highlighting(query)
        self._add_source_filtering(query)
        self._add_explain(query)
        self._add_search_after(query)

        return query

    def build_hybrid_query(
        self,
        semantic_weight: float = 0.7,
        keyword_weight: float = 0.3,
        k: int | None = None,
    ) -> dict[str, Any]:
        """Build hybrid search query combining neural and keyword.

        Uses OpenSearch hybrid search with search pipeline for score
        normalization and combination.

        Args:
            semantic_weight: Weight for semantic score (default: 0.7).
            keyword_weight: Weight for keyword score (default: 0.3).
            k: Number of results for k-NN (default: query.limit * 2).

        Returns:
            OpenSearch query dictionary.
        """
        if not self._model_id:
            # Fall back to keyword-only if no model_id
            return self.build_keyword_query()

        k = k or (self._query.limit * 2)
        field_boosts = self._query.field_boosts or {"title": 2.0, "content": 1.0}

        # Build keyword should clauses
        keyword_should = []
        for field, boost in field_boosts.items():
            keyword_should.append(
                {
                    "match": {
                        field: {
                            "query": self._query.text,
                            "boost": boost,
                        }
                    }
                }
            )

        # OpenSearch hybrid query format
        query: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "hybrid": {
                    "queries": [
                        # Neural (semantic) component
                        {
                            "neural": {
                                self._embedding_field: {
                                    "query_text": self._query.text,
                                    "model_id": self._model_id,
                                    "k": k,
                                }
                            }
                        },
                        # Keyword (BM25) component
                        {
                            "bool": {
                                "should": keyword_should,
                                "minimum_should_match": 1,
                            }
                        },
                    ]
                }
            },
        }

        # Apply filters at top level for hybrid
        filters = self._build_filters()
        if filters:
            query["query"] = {
                "bool": {
                    "must": [query["query"]],
                    "filter": filters,
                }
            }

        if self._query.min_score:
            query["min_score"] = self._query.min_score

        self._add_highlighting(query)
        self._add_source_filtering(query)
        self._add_explain(query)
        # Note: Don't add sort for hybrid queries - normalization processor handles it

        return query

    def build_more_like_this_query(
        self,
        doc_id: str,
        fields: list[str] | None = None,
    ) -> dict[str, Any]:
        """Build more-like-this query for similar documents.

        Args:
            doc_id: Document ID to find similar documents for.
            fields: Fields to use for similarity (default: content, title).

        Returns:
            OpenSearch query dictionary.
        """
        fields = fields or ["content", "title"]

        query: dict[str, Any] = {
            "size": self._query.limit,
            "from": self._query.offset,
            "query": {
                "bool": {
                    "must": [
                        {
                            "more_like_this": {
                                "fields": fields,
                                "like": [{"_id": doc_id}],
                                "min_term_freq": 1,
                                "max_query_terms": 25,
                                "min_doc_freq": 1,
                            }
                        }
                    ],
                    "filter": self._build_filters(),
                    "must_not": [{"ids": {"values": [doc_id]}}],
                }
            },
        }

        self._add_highlighting(query)
        self._add_source_filtering(query)

        return query

    def _build_filters(self) -> list[dict[str, Any]]:
        """Build filter clauses from query parameters.

        Returns:
            List of filter clauses.
        """
        filters: list[dict[str, Any]] = []

        # Multi-tenant filter (required for security)
        if self._query.account_id:
            filters.append({"term": {"account_id": self._query.account_id}})

        # Collection filter
        if self._query.collection_ids:
            filters.append({"terms": {"collection_id": self._query.collection_ids}})

        # Source filter
        if self._query.source_ids:
            filters.append({"terms": {"source_id": self._query.source_ids}})

        # Custom metadata filters
        if self._query.metadata_filters:
            for field, value in self._query.metadata_filters.items():
                if isinstance(value, list):
                    filters.append({"terms": {field: value}})
                else:
                    filters.append({"term": {field: value}})

        return filters

    def _add_highlighting(self, query: dict[str, Any]) -> None:
        """Add highlighting configuration to query.

        Args:
            query: Query dictionary to modify.
        """
        if not self._query.include_highlights:
            return

        query["highlight"] = {
            "fields": {
                "content": {
                    "fragment_size": 150,
                    "number_of_fragments": 3,
                },
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1,
                },
            },
            "pre_tags": ["<em>"],
            "post_tags": ["</em>"],
        }

    def _add_source_filtering(self, query: dict[str, Any]) -> None:
        """Add source field filtering to query.

        Args:
            query: Query dictionary to modify.
        """
        if self._query.include_fields or self._query.exclude_fields:
            source: dict[str, Any] = {}
            if self._query.include_fields:
                source["includes"] = self._query.include_fields
            if self._query.exclude_fields:
                source["excludes"] = self._query.exclude_fields
            query["_source"] = source

    def _add_explain(self, query: dict[str, Any]) -> None:
        """Add explain flag to query for debugging.

        Args:
            query: Query dictionary to modify.
        """
        if self._query.explain:
            query["explain"] = True

    def _add_search_after(self, query: dict[str, Any]) -> None:
        """Add cursor-based pagination to query.

        Args:
            query: Query dictionary to modify.
        """
        # Add sort for consistent pagination
        query["sort"] = [
            {"_score": "desc"},
            {"_id": "asc"},  # Tiebreaker
        ]
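
One wrinkle worth flagging: build_hybrid_query accepts semantic_weight and keyword_weight but never reads them. With OpenSearch hybrid search, the 0.7/0.3 weighting is applied by the normalization-processor in get_search_pipeline above, not by the query DSL. For reference, this is roughly the request body the method emits, with illustrative values:

```python
# Approximate shape of build_hybrid_query() output (illustrative values);
# highlight/_source/explain blocks are appended only when enabled.
emitted = {
    "size": 10,
    "from": 0,
    "query": {
        "bool": {  # wrapper added only when _build_filters() is non-empty
            "must": [
                {
                    "hybrid": {
                        "queries": [
                            {
                                "neural": {
                                    "content_embedding": {
                                        "query_text": "how to configure",
                                        "model_id": "abc123",
                                        "k": 20,  # limit * 2
                                    }
                                }
                            },
                            {
                                "bool": {
                                    "should": [
                                        {"match": {"title": {"query": "how to configure", "boost": 2.0}}},
                                        {"match": {"content": {"query": "how to configure", "boost": 1.0}}},
                                    ],
                                    "minimum_should_match": 1,
                                }
                            },
                        ]
                    }
                }
            ],
            "filter": [{"term": {"account_id": "acc123"}}],
        }
    },
}
```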


def build_delete_by_source_query(
    source_id: str,
    account_id: str | None = None,
) -> dict[str, Any]:
    """Build query to delete documents by source.

    Args:
        source_id: Source ID to delete.
        account_id: Optional account filter for multi-tenancy.

    Returns:
        Delete-by-query dictionary.
    """
    filters = [{"term": {"source_id": source_id}}]
    if account_id:
        filters.append({"term": {"account_id": account_id}})

    return {
        "query": {
            "bool": {
                "filter": filters,
            }
        }
    }


def build_delete_by_collection_query(
    collection_id: str,
    account_id: str | None = None,
) -> dict[str, Any]:
    """Build query to delete documents by collection.

    Args:
        collection_id: Collection ID to delete.
        account_id: Optional account filter for multi-tenancy.

    Returns:
        Delete-by-query dictionary.
    """
    filters = [{"term": {"collection_id": collection_id}}]
    if account_id:
        filters.append({"term": {"account_id": account_id}})

    return {
        "query": {
            "bool": {
                "filter": filters,
            }
        }
    }


def build_count_query(
    account_id: str | None = None,
    collection_id: str | None = None,
    source_id: str | None = None,
) -> dict[str, Any]:
    """Build query to count documents.

    Args:
        account_id: Optional account filter.
        collection_id: Optional collection filter.
        source_id: Optional source filter.

    Returns:
        Count query dictionary.
    """
    filters: list[dict[str, Any]] = []

    if account_id:
        filters.append({"term": {"account_id": account_id}})
    if collection_id:
        filters.append({"term": {"collection_id": collection_id}})
    if source_id:
        filters.append({"term": {"source_id": source_id}})

    if not filters:
        return {"query": {"match_all": {}}}

    return {
        "query": {
            "bool": {
                "filter": filters,
            }
        }
    }