pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
from pulse_engine.storage.schemas import (
|
|
4
|
+
ConnectorHealth,
|
|
5
|
+
Document,
|
|
6
|
+
SearchQuery,
|
|
7
|
+
SearchResult,
|
|
8
|
+
StoreResult,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseStorageConnector(ABC):
    """Abstract contract shared by every storage connector.

    All operations are coroutines and every one of them is abstract, so a
    concrete connector has to implement the full set before it can be
    instantiated.
    """

    @abstractmethod
    async def initialize(self) -> None:
        """Prepare the connector for use."""

    @abstractmethod
    async def store(self, tenant_id: str, documents: list[Document]) -> StoreResult:
        """Persist ``documents`` on behalf of ``tenant_id`` and report the outcome."""

    @abstractmethod
    async def retrieve(self, tenant_id: str, doc_id: str) -> Document | None:
        """Look up the document ``doc_id`` of ``tenant_id``; ``None`` is a valid result."""

    @abstractmethod
    async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
        """Run ``query`` against the documents of ``tenant_id``."""

    @abstractmethod
    async def delete(self, tenant_id: str, doc_id: str) -> bool:
        """Delete the document ``doc_id`` of ``tenant_id`` and return a success flag."""

    @abstractmethod
    async def health_check(self) -> ConnectorHealth:
        """Report the current health of the backing store."""

    @abstractmethod
    async def teardown(self) -> None:
        """Release whatever the connector holds."""
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import structlog
|
|
5
|
+
from opensearchpy import ConnectionError as OSConnectionError
|
|
6
|
+
from opensearchpy import NotFoundError as OSNotFoundError
|
|
7
|
+
|
|
8
|
+
from pulse_engine.core.exceptions import ServiceUnavailableError
|
|
9
|
+
from pulse_engine.services.opensearch import OpenSearchService
|
|
10
|
+
from pulse_engine.storage.connectors.base import BaseStorageConnector
|
|
11
|
+
from pulse_engine.storage.schemas import (
|
|
12
|
+
ConnectorHealth,
|
|
13
|
+
Document,
|
|
14
|
+
DocumentSource,
|
|
15
|
+
SearchHit,
|
|
16
|
+
SearchQuery,
|
|
17
|
+
SearchQueryType,
|
|
18
|
+
SearchResult,
|
|
19
|
+
StoreResult,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Module-level structlog logger; the connector uses it for its lifecycle events.
logger = structlog.get_logger()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OpenSearchConnector(BaseStorageConnector):
|
|
26
|
+
def __init__(
    self,
    opensearch: OpenSearchService,
    embedding_dimension: int = 768,
    index_prefix: str = "pulse_kb",
) -> None:
    """Bind the connector to an OpenSearch service and record index settings.

    Args:
        opensearch: Service object through which every OpenSearch call is made.
        embedding_dimension: Dimension declared for each ``knn_vector`` field
            of the index mapping.
        index_prefix: Prefix of the per-tenant index name.
    """
    self._opensearch = opensearch
    self._index_prefix = index_prefix
    self._embedding_dimension = embedding_dimension
    # Index names already confirmed to exist; lets later calls skip the
    # existence check.
    self._existing_indices: set[str] = set()
|
|
36
|
+
|
|
37
|
+
def _index_name(self, tenant_id: str) -> str:
    """Return the name of the index holding ``tenant_id``'s documents.

    The name is the configured prefix and the tenant id joined by ``_``.
    """
    return "{}_{}".format(self._index_prefix, tenant_id)
|
|
39
|
+
|
|
40
|
+
def _get_mapping(self) -> dict[str, Any]:
    """Build the index-creation body (settings plus mappings) for a tenant index.

    k-NN is switched on for the index, and the four embedding fields are all
    declared as ``knn_vector`` fields of ``self._embedding_dimension`` using
    the same HNSW / L2 / faiss method.
    """
    vector_fields = (
        "embedding",
        "content_embeddings",
        "content_summary_embeddings",
        "content_title_embeddings",
    )
    properties: dict[str, Any] = {
        "doc_id": {"type": "keyword"},
        "source": {"type": "keyword"},
        "chunk_id": {"type": "keyword"},
        "content": {"type": "text"},
        "metadata": {"type": "object", "enabled": True},
        "metadata_chunk": {"type": "object", "enabled": True},
        "content_summary": {"type": "text"},
        "content_title": {"type": "text"},
    }
    # Build a fresh definition per field so no two properties share one
    # mutable dict.
    for field_name in vector_fields:
        properties[field_name] = {
            "type": "knn_vector",
            "dimension": self._embedding_dimension,
            "method": {
                "name": "hnsw",
                "space_type": "l2",
                "engine": "faiss",
            },
        }
    return {
        "settings": {"index": {"knn": True}},
        "mappings": {"properties": properties},
    }
|
|
94
|
+
|
|
95
|
+
async def _ensure_index(self, tenant_id: str) -> None:
    """Create the tenant's index when it does not exist yet.

    Index names confirmed to exist are remembered in
    ``self._existing_indices``, so repeated calls for the same tenant return
    without contacting OpenSearch.

    Raises:
        ServiceUnavailableError: If OpenSearch raises a connection error.
    """
    index_name = self._index_name(tenant_id)
    if index_name not in self._existing_indices:
        try:
            already_there = await self._opensearch.index_exists(index_name)
            if not already_there:
                await self._opensearch.create_index(index_name, self._get_mapping())
            self._existing_indices.add(index_name)
        except OSConnectionError as exc:
            raise ServiceUnavailableError("OpenSearch unavailable") from exc
|
|
106
|
+
|
|
107
|
+
def _doc_to_body(self, doc: Document) -> dict[str, Any]:
    """Serialize ``doc`` into the body that is written to OpenSearch.

    The identifying fields, content and both metadata dicts are always
    present; each optional field is included only when its value is not
    ``None``.
    """
    optional_fields = (
        "chunk_id",
        "content_summary",
        "content_title",
        "embedding",
        "content_embeddings",
        "content_summary_embeddings",
        "content_title_embeddings",
    )
    payload: dict[str, Any] = {
        "doc_id": doc.doc_id,
        "source": doc.source.value,
        "content": doc.content,
        "metadata": doc.metadata,
        "metadata_chunk": doc.metadata_chunk,
    }
    for field_name in optional_fields:
        value = getattr(doc, field_name)
        if value is not None:
            payload[field_name] = value
    return payload
|
|
130
|
+
|
|
131
|
+
def _body_to_doc(self, source: dict[str, Any]) -> Document:
    """Rebuild a ``Document`` from a stored ``_source`` body.

    ``doc_id``, ``source`` and ``content`` are required keys; the metadata
    dicts fall back to ``{}`` and every other field falls back to ``None``
    when absent.
    """
    optional_keys = (
        "chunk_id",
        "content_summary",
        "content_title",
        "embedding",
        "content_embeddings",
        "content_summary_embeddings",
        "content_title_embeddings",
    )
    optional_values = {key: source.get(key) for key in optional_keys}
    return Document(
        doc_id=source["doc_id"],
        source=DocumentSource(source["source"]),
        content=source["content"],
        metadata=source.get("metadata", {}),
        metadata_chunk=source.get("metadata_chunk", {}),
        **optional_values,
    )
|
|
146
|
+
|
|
147
|
+
async def initialize(self) -> None:
    """Log that the connector was initialized; nothing else happens here."""
    logger.info("opensearch_connector_initialized")
|
|
149
|
+
|
|
150
|
+
async def store(self, tenant_id: str, documents: list[Document]) -> StoreResult:
    """Index ``documents`` into the tenant's index.

    One document is written with a single index call; several documents go
    out as one bulk request whose per-item statuses are tallied. An empty
    list is a no-op.

    Args:
        tenant_id: Tenant whose index receives the documents.
        documents: Documents to write; each is stored under its ``doc_id``.

    Returns:
        A ``StoreResult`` with the stored/failed counts and one error message
        per failed document.

    Raises:
        ServiceUnavailableError: If OpenSearch raises a connection error,
            on either the single-document or the bulk path.
    """
    if not documents:
        # Nothing to write: do not create an index or send an empty bulk
        # request for no work.
        return StoreResult(stored_count=0, failed_count=0, errors=[])

    await self._ensure_index(tenant_id)
    index = self._index_name(tenant_id)
    stored = 0
    failed = 0
    errors: list[str] = []

    if len(documents) == 1:
        doc = documents[0]
        try:
            await self._opensearch.index_document(
                index=index, doc_id=doc.doc_id, body=self._doc_to_body(doc)
            )
            stored = 1
        except OSConnectionError as e:
            # An outage is not a per-document failure; report it the same way
            # the bulk path and _ensure_index do.
            raise ServiceUnavailableError("OpenSearch unavailable") from e
        except Exception as e:
            # Any other rejection is reported to the caller as a failed document.
            failed = 1
            errors.append(str(e))
    else:
        # Bulk format: an action line followed by the document body, per document.
        bulk_body: list[dict[str, Any]] = []
        for doc in documents:
            bulk_body.append({"index": {"_index": index, "_id": doc.doc_id}})
            bulk_body.append(self._doc_to_body(doc))

        try:
            result = await self._opensearch.bulk(bulk_body)
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e

        for item in result.get("items", []):
            action = item.get("index", {})
            # An item without a status is counted as failed.
            if action.get("status", 500) < 300:
                stored += 1
            else:
                failed += 1
                errors.append(action.get("error", {}).get("reason", "Unknown error"))

    return StoreResult(stored_count=stored, failed_count=failed, errors=errors)
|
|
188
|
+
|
|
189
|
+
async def retrieve(self, tenant_id: str, doc_id: str) -> Document | None:
    """Fetch one document by id from the tenant's index.

    Returns:
        The stored document, or ``None`` when OpenSearch reports it as not
        found.

    Raises:
        ServiceUnavailableError: If OpenSearch raises a connection error.
    """
    target_index = self._index_name(tenant_id)
    try:
        response = await self._opensearch.get_document(index=target_index, doc_id=doc_id)
    except OSNotFoundError:
        return None
    except OSConnectionError as exc:
        raise ServiceUnavailableError("OpenSearch unavailable") from exc
    return self._body_to_doc(response["_source"])
|
|
198
|
+
|
|
199
|
+
def _build_text_query(self, query: SearchQuery) -> dict[str, Any]:
    """Build the search body for a full-text query.

    ``query.query_text`` is matched against the content, title and summary
    fields, highlighting is requested for ``content``, and ``query.filters``
    are attached through ``_apply_filters``.
    """
    searched_fields = ["content", "content_title", "content_summary"]
    text_match: dict[str, Any] = {
        "multi_match": {"query": query.query_text, "fields": searched_fields}
    }
    request: dict[str, Any] = {
        "query": {"bool": {"must": [text_match]}},
        "highlight": {"fields": {"content": {}}},
        "size": query.size,
        "from": query.from_,
    }
    self._apply_filters(request, query.filters)
    return request
|
|
214
|
+
|
|
215
|
+
def _build_semantic_query(self, query: SearchQuery) -> dict[str, Any]:
    """Build the search body for a pure vector (k-NN) query on ``embedding``.

    The k-NN clause yields only its ``k`` nearest neighbours, and ``from``
    is applied to that candidate list afterwards. ``k`` therefore has to
    cover the skipped offset as well as the requested page; with
    ``k == size`` every page after the first would come back short or empty.

    Args:
        query: Search request; ``vector``, ``size``, ``from_`` and
            ``filters`` are read.

    Returns:
        The request body, with ``query.filters`` attached through
        ``_apply_filters``.
    """
    # Cap on ``k`` per the OpenSearch k-NN documentation.
    max_neighbours = 10_000
    neighbours = min(query.size + query.from_, max_neighbours)
    body: dict[str, Any] = {
        "size": query.size,
        "from": query.from_,
        "query": {
            "knn": {
                "embedding": {
                    "vector": query.vector,
                    "k": neighbours,
                }
            }
        },
    }
    self._apply_filters(body, query.filters)
    return body
|
|
230
|
+
|
|
231
|
+
def _build_hybrid_query(self, query: SearchQuery) -> dict[str, Any]:
    """Build the search body that combines full-text and k-NN matching.

    The text clause and the k-NN clause are alternatives inside one ``bool``
    query. Two details keep the result set correct:

    * ``minimum_should_match`` is pinned to 1. Once ``_apply_filters`` adds a
      ``filter`` clause the default drops to 0, and every document passing
      the filter would match even if it satisfies neither clause.
    * ``k`` covers ``from_ + size``, because the k-NN clause yields only its
      ``k`` nearest neighbours and the offset is applied afterwards.

    Args:
        query: Search request; ``query_text``, ``vector``, ``size``,
            ``from_`` and ``filters`` are read.

    Returns:
        The request body, with ``query.filters`` attached through
        ``_apply_filters``.
    """
    # Cap on ``k`` per the OpenSearch k-NN documentation.
    max_neighbours = 10_000
    neighbours = min(query.size + query.from_, max_neighbours)
    text_clause: dict[str, Any] = {
        "multi_match": {
            "query": query.query_text,
            "fields": ["content", "content_title", "content_summary"],
        }
    }
    knn_clause: dict[str, Any] = {
        "knn": {
            "embedding": {
                "vector": query.vector,
                "k": neighbours,
            }
        }
    }
    body: dict[str, Any] = {
        "size": query.size,
        "from": query.from_,
        "query": {
            "bool": {
                "should": [text_clause, knn_clause],
                "minimum_should_match": 1,
            }
        },
        "highlight": {"fields": {"content": {}}},
    }
    self._apply_filters(body, query.filters)
    return body
|
|
258
|
+
|
|
259
|
+
@staticmethod
def _apply_filters(body: dict[str, Any], filters: dict[str, Any]) -> None:
    """Attach exact-match filter clauses for ``filters`` to ``body`` in place.

    A scalar value becomes a ``term`` clause. A list, tuple or set becomes a
    ``terms`` clause (match any of the values), because ``term`` does not
    accept an array. If the body's query is already a ``bool`` query, the
    clauses are set as its ``filter``; any other query is wrapped in a new
    ``bool`` query as the single ``must`` clause.

    Args:
        body: Search request body; modified in place.
        filters: Mapping of field name to required value(s). Empty or falsy
            means no filtering and leaves ``body`` untouched.
    """
    if not filters:
        return
    filter_clauses: list[dict[str, Any]] = [
        {"terms": {field: list(value)}}
        if isinstance(value, (list, tuple, set, frozenset))
        else {"term": {field: value}}
        for field, value in filters.items()
    ]
    query = body.get("query", {})
    if "bool" in query:
        query["bool"]["filter"] = filter_clauses
    else:
        body["query"] = {"bool": {"must": [query], "filter": filter_clauses}}
|
|
269
|
+
|
|
270
|
+
async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
    """Run ``query`` against the tenant's index and map the raw hits.

    The request body is built by the text, semantic or hybrid builder
    according to ``query.query_type``; any type other than text or semantic
    is treated as hybrid.

    Args:
        tenant_id: Tenant whose index is searched.
        query: The search request.

    Returns:
        A ``SearchResult`` with the mapped hits, the reported total and the
        elapsed wall-clock time in milliseconds measured around the call.
        When OpenSearch answers "not found" (for example because nothing was
        stored for the tenant yet and the index does not exist) the result
        is empty, matching how ``retrieve`` and ``delete`` treat that error.

    Raises:
        ServiceUnavailableError: If OpenSearch raises a connection error.
    """
    index = self._index_name(tenant_id)

    if query.query_type == SearchQueryType.text:
        body = self._build_text_query(query)
    elif query.query_type == SearchQueryType.semantic:
        body = self._build_semantic_query(query)
    else:
        body = self._build_hybrid_query(query)

    start = time.monotonic()
    try:
        raw = await self._opensearch.search(index=index, query=body)
    except OSNotFoundError:
        return SearchResult(
            hits=[], total=0, took_ms=int((time.monotonic() - start) * 1000)
        )
    except OSConnectionError as e:
        raise ServiceUnavailableError("OpenSearch unavailable") from e
    took_ms = int((time.monotonic() - start) * 1000)

    hits: list[SearchHit] = []
    for h in raw.get("hits", {}).get("hits", []):
        src = h["_source"]
        # Flatten the per-field highlight fragments into one list.
        highlights = [
            fragment
            for fragments in h.get("highlight", {}).values()
            for fragment in fragments
        ]
        hits.append(
            SearchHit(
                doc_id=src["doc_id"],
                # ``_score`` may be present but null; fall back to 0.0 so the
                # float field still validates.
                score=h.get("_score") or 0.0,
                source=DocumentSource(src["source"]),
                content=src["content"],
                highlights=highlights,
                metadata=src.get("metadata", {}),
            )
        )

    # ``total`` arrives either as {"value": N, ...} or as a bare number.
    reported_total = raw.get("hits", {}).get("total")
    if isinstance(reported_total, dict):
        total = reported_total.get("value", 0)
    else:
        total = reported_total or 0

    return SearchResult(hits=hits, total=total, took_ms=took_ms)
|
|
313
|
+
|
|
314
|
+
async def delete(self, tenant_id: str, doc_id: str) -> bool:
    """Delete one document from the tenant's index.

    Returns:
        ``True`` when the delete call succeeded, ``False`` when OpenSearch
        reported the target as not found.

    Raises:
        ServiceUnavailableError: If OpenSearch raises a connection error.
    """
    target_index = self._index_name(tenant_id)
    try:
        await self._opensearch.delete_document(index=target_index, doc_id=doc_id)
    except OSNotFoundError:
        return False
    except OSConnectionError as exc:
        raise ServiceUnavailableError("OpenSearch unavailable") from exc
    return True
|
|
323
|
+
|
|
324
|
+
async def health_check(self) -> ConnectorHealth:
    """Ping OpenSearch and report the connector's health.

    The probe never raises. A failed or falsy ping is reported as ``"down"``,
    and the failure reason is logged so it is not lost.

    Returns:
        A ``ConnectorHealth`` for the ``"opensearch"`` connector with status
        ``"up"`` or ``"down"`` and the elapsed time of the ping in
        milliseconds.
    """
    start = time.monotonic()
    try:
        ok = await self._opensearch.ping()
    except Exception as e:
        # Deliberately broad: a health probe must answer "down", not raise.
        logger.warning("opensearch_health_check_failed", error=str(e))
        ok = False
    latency = (time.monotonic() - start) * 1000
    return ConnectorHealth(
        connector="opensearch",
        status="up" if ok else "down",
        latency_ms=latency,
    )
|
|
341
|
+
|
|
342
|
+
async def teardown(self) -> None:
|
|
343
|
+
self._existing_indices.clear()
|
|
344
|
+
logger.info("opensearch_connector_teardown")
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import structlog
|
|
4
|
+
|
|
5
|
+
from pulse_engine.core.exceptions import NotFoundError, ServiceUnavailableError
|
|
6
|
+
from pulse_engine.storage.connectors.athena import AthenaConnector
|
|
7
|
+
from pulse_engine.storage.connectors.opensearch import OpenSearchConnector
|
|
8
|
+
from pulse_engine.storage.schemas import (
|
|
9
|
+
Document,
|
|
10
|
+
KBStatsResponse,
|
|
11
|
+
QueryResult,
|
|
12
|
+
SearchQuery,
|
|
13
|
+
SearchResult,
|
|
14
|
+
StoreResult,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Module-level structlog logger; used below to record failed stats lookups.
logger = structlog.get_logger()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class KnowledgeBaseService:
    """Facade over the storage connectors used by the knowledge-base API.

    Document storage, retrieval, search and deletion go to the OpenSearch
    connector; SQL queries go to the optional Athena connector.
    """

    def __init__(
        self,
        opensearch_connector: OpenSearchConnector,
        athena_connector: AthenaConnector | None = None,
    ) -> None:
        """Keep references to the connectors; Athena may be left out."""
        self._athena = athena_connector
        self._opensearch = opensearch_connector

    async def store_documents(
        self, tenant_id: str, documents: list[Document]
    ) -> StoreResult:
        """Store ``documents`` for ``tenant_id`` through the OpenSearch connector."""
        outcome = await self._opensearch.store(tenant_id, documents)
        return outcome

    async def retrieve_document(self, tenant_id: str, doc_id: str) -> Document:
        """Return the document ``doc_id`` of ``tenant_id``.

        Raises:
            NotFoundError: If the connector returns ``None`` for the id.
        """
        document = await self._opensearch.retrieve(tenant_id, doc_id)
        if document is not None:
            return document
        raise NotFoundError(f"Document {doc_id} not found")

    async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
        """Run ``query`` for ``tenant_id`` through the OpenSearch connector."""
        found = await self._opensearch.search(tenant_id, query)
        return found

    async def delete_document(self, tenant_id: str, doc_id: str) -> None:
        """Delete the document ``doc_id`` of ``tenant_id``.

        Raises:
            NotFoundError: If the connector reports that nothing was deleted.
        """
        if not await self._opensearch.delete(tenant_id, doc_id):
            raise NotFoundError(f"Document {doc_id} not found")

    async def get_stats(self, tenant_id: str) -> KBStatsResponse:
        """Return document count and index size for the tenant's index.

        Best effort: any failure while fetching or reading the statistics is
        logged as a warning and reported as zero documents and zero bytes.
        """
        # NOTE(review): reaches into the connector's private members
        # (_index_name, _opensearch); a connector without them ends up in the
        # fallback below.
        index = self._opensearch._index_name(tenant_id)
        try:
            raw_stats = await self._opensearch._opensearch.index_stats(index)
            primaries = raw_stats.get("_all", {}).get("primaries", {})
            return KBStatsResponse(
                document_count=primaries.get("docs", {}).get("count", 0),
                index_size_bytes=primaries.get("store", {}).get("size_in_bytes", 0),
            )
        except Exception as e:
            logger.warning("stats_fetch_failed", error=str(e))
            return KBStatsResponse(document_count=0, index_size_bytes=0)

    async def execute_query(
        self, tenant_id: str, sql: str, parameters: dict[str, Any] | None = None
    ) -> QueryResult:
        """Run ``sql`` for ``tenant_id`` through the Athena connector.

        Raises:
            ServiceUnavailableError: If no Athena connector was configured.
        """
        athena = self._athena
        if athena is None:
            raise ServiceUnavailableError("Athena connector is not configured")
        return await athena.execute_query(tenant_id, sql, parameters)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from fastapi import APIRouter, Depends
|
|
2
|
+
from starlette.responses import JSONResponse
|
|
3
|
+
|
|
4
|
+
from pulse_engine.core.scope import require_scope
|
|
5
|
+
from pulse_engine.dependencies import get_knowledge_base
|
|
6
|
+
from pulse_engine.middleware.tenant import get_tenant_id
|
|
7
|
+
from pulse_engine.storage.knowledge_base import KnowledgeBaseService
|
|
8
|
+
from pulse_engine.storage.schemas import (
|
|
9
|
+
Document,
|
|
10
|
+
KBStatsResponse,
|
|
11
|
+
QueryRequest,
|
|
12
|
+
QueryResult,
|
|
13
|
+
SearchQuery,
|
|
14
|
+
SearchResult,
|
|
15
|
+
StoreRequest,
|
|
16
|
+
StoreResult,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Router for the knowledge-base endpoints; routes registered on it are
# prefixed with /kb and tagged "Knowledge Base".
router = APIRouter(prefix="/kb", tags=["Knowledge Base"])
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# POST /kb/documents: store the posted documents for the calling tenant.
# The route requires the "kb:write" scope and answers 201 with the StoreResult.
# (Plain comments, not a docstring, so the generated API description is unchanged.)
@router.post(
    "/documents",
    status_code=201,
    response_model=StoreResult,
    dependencies=[require_scope("kb:write")],
)
async def store_documents(
    body: StoreRequest,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> StoreResult:
    outcome = await kb.store_documents(tenant_id, body.documents)
    return outcome
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# GET /kb/documents/{document_id}: return one document of the calling tenant.
# The service raises NotFoundError when the id is unknown.
@router.get("/documents/{document_id}", response_model=Document)
async def retrieve_document(
    document_id: str,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> Document:
    document = await kb.retrieve_document(tenant_id, document_id)
    return document
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# POST /kb/search: run the posted SearchQuery against the calling tenant's
# documents and return the SearchResult.
@router.post("/search", response_model=SearchResult)
async def search_documents(
    body: SearchQuery,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> SearchResult:
    found = await kb.search(tenant_id, body)
    return found
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# DELETE /kb/documents/{document_id}: remove one document of the calling
# tenant and answer 204 No Content.
# NOTE(review): unlike POST /documents, this mutating route declares no
# require_scope("kb:write") dependency; confirm whether that is intentional.
@router.delete("/documents/{document_id}", status_code=204)
async def delete_document(
    document_id: str,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> None:
    # The service raises NotFoundError when nothing was deleted.
    await kb.delete_document(tenant_id, document_id)
    # Return nothing: a 204 response must not carry a body, and
    # JSONResponse(content=None) would serialize to the 4-byte payload "null".
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# GET /kb/stats: return the knowledge-base statistics of the calling tenant.
@router.get("/stats", response_model=KBStatsResponse)
async def get_stats(
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> KBStatsResponse:
    stats = await kb.get_stats(tenant_id)
    return stats
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# POST /kb/query: hand the posted SQL and parameters to the service's
# execute_query for the calling tenant and return the QueryResult.
@router.post("/query", response_model=QueryResult)
async def run_query(
    body: QueryRequest,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> QueryResult:
    result = await kb.execute_query(tenant_id, body.sql, body.parameters)
    return result
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DocumentSource(str, Enum):
    # Origin of a stored document. Members are also ``str`` instances, so
    # each one compares equal to its plain string value.
    speech = "speech"
    tweet = "tweet"
    youtube = "youtube"
    processed = "processed"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Document(BaseModel):
    # One knowledge-base document (or chunk of one) as exchanged with the
    # storage layer.

    # Required identity and payload.
    doc_id: str
    source: DocumentSource
    chunk_id: str | None = None
    content: str

    # Free-form metadata; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
    metadata_chunk: dict[str, Any] = Field(default_factory=dict)

    # Optional derived text.
    content_summary: str | None = None
    content_title: str | None = None

    # Optional vectors. ``embedding`` is the field the OpenSearch connector
    # queries for semantic and hybrid search.
    embedding: list[float] | None = None
    content_embeddings: list[float] | None = None
    content_summary_embeddings: list[float] | None = None
    content_title_embeddings: list[float] | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SearchQueryType(str, Enum):
    # Search strategies a SearchQuery can ask for; members are also ``str``
    # instances equal to their values.
    text = "text"
    semantic = "semantic"
    hybrid = "hybrid"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SearchQuery(BaseModel):
    # Search request accepted by the knowledge-base search endpoint.

    # Strategy to use; a plain text search unless stated otherwise.
    query_type: SearchQueryType = SearchQueryType.text
    # NOTE(review): nothing here enforces that ``query_text`` is set for
    # text/hybrid queries or ``vector`` for semantic/hybrid ones; verify that
    # callers guarantee it.
    query_text: str | None = None
    vector: list[float] | None = None
    # Field name -> required value, applied as exact-match filters.
    filters: dict[str, Any] = Field(default_factory=dict)
    # Page size, limited to 1..100.
    size: int = Field(default=10, ge=1, le=100)
    # Offset of the first hit. Exposed as "from" on the wire, because ``from``
    # is a Python keyword and cannot be an attribute name.
    from_: int = Field(default=0, ge=0, alias="from")

    # Accept the field name (``from_``) as well as its alias (``from``).
    model_config = {"populate_by_name": True}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class StoreRequest(BaseModel):
    # Request body of the store endpoint: the documents to persist.
    documents: list[Document]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class StoreResult(BaseModel):
    # Outcome of a store call: how many documents were written, how many
    # failed, and the collected error messages.
    stored_count: int
    failed_count: int
    errors: list[str] = Field(default_factory=list)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class SearchHit(BaseModel):
    # One search match: the identifying fields and content of the document,
    # its relevance score, highlighted fragments and metadata.
    doc_id: str
    score: float
    source: DocumentSource
    content: str
    highlights: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class SearchResult(BaseModel):
    # Response of a search: the returned hits, the total number of matches
    # and the elapsed time in milliseconds.
    hits: list[SearchHit]
    total: int
    took_ms: int
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ConnectorHealth(BaseModel):
    # Health report for one storage connector.
    connector: str
    # Restricted to exactly "up", "down" or "degraded".
    status: str = Field(pattern=r"^(up|down|degraded)$")
    # Probe latency in milliseconds, when measured.
    latency_ms: float | None = None
    details: dict[str, Any] = Field(default_factory=dict)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class QueryRequest(BaseModel):
    # Request body of the query endpoint: a SQL statement and its parameters.
    sql: str
    parameters: dict[str, Any] = Field(default_factory=dict)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class QueryResult(BaseModel):
    # Tabular result of a SQL query: column names, rows, row count and the
    # execution time in milliseconds.
    columns: list[str]
    rows: list[list[Any]]
    row_count: int
    execution_time_ms: int
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class KBStatsResponse(BaseModel):
    # Knowledge-base statistics for one tenant.
    document_count: int
    index_size_bytes: int
    # Integer counts keyed by string; empty by default.
    # NOTE(review): presumably document counts per source; confirm with the producer.
    sources: dict[str, int] = Field(default_factory=dict)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Pulse Engine testing utilities — import fixtures and mocks for product tests."""
|
|
2
|
+
|
|
3
|
+
from pulse_engine.testing.mocks import (
|
|
4
|
+
MockExtractor,
|
|
5
|
+
MockOrchestratorAdapter,
|
|
6
|
+
MockStorageConnector,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Names re-exported as the public surface of ``pulse_engine.testing``.
__all__ = ["MockExtractor", "MockOrchestratorAdapter", "MockStorageConnector"]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Reusable pytest fixtures for products that depend on pulse-engine.
|
|
2
|
+
|
|
3
|
+
Products import these in their ``conftest.py``::
|
|
4
|
+
|
|
5
|
+
from pulse_engine.testing.fixtures import * # noqa: F401, F403
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
from pulse_engine.processor.pipeline import ProcessingPipeline
|
|
13
|
+
from pulse_engine.storage.knowledge_base import KnowledgeBaseService
|
|
14
|
+
from pulse_engine.testing.mocks import (
|
|
15
|
+
MockExtractor,
|
|
16
|
+
MockOrchestratorAdapter,
|
|
17
|
+
MockStorageConnector,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture()
def mock_storage_connector() -> MockStorageConnector:
    """Provide a new ``MockStorageConnector`` for each test."""
    connector = MockStorageConnector()
    return connector
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture()
def mock_orchestrator() -> MockOrchestratorAdapter:
    """Provide a new ``MockOrchestratorAdapter`` for each test."""
    orchestrator = MockOrchestratorAdapter()
    return orchestrator
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture()
def mock_extractor() -> MockExtractor:
    """Provide a new ``MockExtractor`` for each test."""
    extractor = MockExtractor()
    return extractor
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pytest.fixture()
def kb_service(mock_storage_connector: MockStorageConnector) -> KnowledgeBaseService:
    """KnowledgeBaseService backed by an in-memory mock connector."""
    # The mock stands in for the OpenSearch connector; no Athena connector is
    # configured.
    service = KnowledgeBaseService(
        opensearch_connector=mock_storage_connector,  # type: ignore[arg-type]
        athena_connector=None,
    )
    return service
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture()
def processing_pipeline(
    kb_service: KnowledgeBaseService,
) -> ProcessingPipeline:
    """Pipeline using all default stages and the mock KB service."""
    pipeline = ProcessingPipeline(kb_service=kb_service)
    return pipeline
|