pulse-engine 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,32 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from pulse_engine.storage.schemas import (
4
+ ConnectorHealth,
5
+ Document,
6
+ SearchQuery,
7
+ SearchResult,
8
+ StoreResult,
9
+ )
10
+
11
+
12
class BaseStorageConnector(ABC):
    """Abstract contract implemented by knowledge-base storage backends.

    Document operations take a ``tenant_id``; lifecycle and health operations
    apply to the connector as a whole. Every method is a coroutine and must be
    overridden by concrete connectors.
    """

    @abstractmethod
    async def initialize(self) -> None:
        """Lifecycle hook paired with :meth:`teardown`."""

    @abstractmethod
    async def store(self, tenant_id: str, documents: list[Document]) -> StoreResult:
        """Persist ``documents`` for ``tenant_id`` and report the outcome."""

    @abstractmethod
    async def retrieve(self, tenant_id: str, doc_id: str) -> Document | None:
        """Return the tenant's document with ``doc_id``, or ``None``."""

    @abstractmethod
    async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
        """Run ``query`` against the tenant's documents."""

    @abstractmethod
    async def delete(self, tenant_id: str, doc_id: str) -> bool:
        """Delete the tenant's document ``doc_id``; the bool reports the outcome."""

    @abstractmethod
    async def health_check(self) -> ConnectorHealth:
        """Report the connector's current health."""

    @abstractmethod
    async def teardown(self) -> None:
        """Lifecycle hook paired with :meth:`initialize`."""
@@ -0,0 +1,344 @@
1
+ import time
2
+ from typing import Any
3
+
4
+ import structlog
5
+ from opensearchpy import ConnectionError as OSConnectionError
6
+ from opensearchpy import NotFoundError as OSNotFoundError
7
+
8
+ from pulse_engine.core.exceptions import ServiceUnavailableError
9
+ from pulse_engine.services.opensearch import OpenSearchService
10
+ from pulse_engine.storage.connectors.base import BaseStorageConnector
11
+ from pulse_engine.storage.schemas import (
12
+ ConnectorHealth,
13
+ Document,
14
+ DocumentSource,
15
+ SearchHit,
16
+ SearchQuery,
17
+ SearchQueryType,
18
+ SearchResult,
19
+ StoreResult,
20
+ )
21
+
22
# Module-level structlog logger used by the OpenSearch connector.
logger = structlog.get_logger()
23
+
24
+
25
class OpenSearchConnector(BaseStorageConnector):
    """Store, fetch and search tenant documents in OpenSearch.

    Each tenant gets its own index named ``<index_prefix>_<tenant_id>``. The
    index is created lazily, with k-NN enabled, the first time documents are
    stored for that tenant. Connection failures are surfaced as
    ``ServiceUnavailableError``.
    """

    # Full-text fields targeted by text and hybrid searches.
    _TEXT_FIELDS = ("content", "content_title", "content_summary")
    # Vector fields; all of them share one k-NN mapping definition.
    _VECTOR_FIELDS = (
        "embedding",
        "content_embeddings",
        "content_summary_embeddings",
        "content_title_embeddings",
    )
    # Optional ``Document`` attributes written to the index only when set.
    _OPTIONAL_DOC_FIELDS = (
        "chunk_id",
        "content_summary",
        "content_title",
        "embedding",
        "content_embeddings",
        "content_summary_embeddings",
        "content_title_embeddings",
    )
    # Upper bound OpenSearch accepts for the k-NN ``k`` parameter.
    _MAX_KNN_K = 10_000

    def __init__(
        self,
        opensearch: OpenSearchService,
        embedding_dimension: int = 768,
        index_prefix: str = "pulse_kb",
    ) -> None:
        """Wrap ``opensearch`` and remember the mapping/index settings.

        Args:
            opensearch: Service used for every OpenSearch call.
            embedding_dimension: Dimension declared for all vector fields.
            index_prefix: Prefix of every per-tenant index name.
        """
        self._opensearch = opensearch
        self._embedding_dimension = embedding_dimension
        self._index_prefix = index_prefix
        # Indices already verified or created by this instance, so that
        # ``_ensure_index`` does not hit OpenSearch on every store call.
        self._existing_indices: set[str] = set()

    def _index_name(self, tenant_id: str) -> str:
        """Return the index name used for ``tenant_id``."""
        return f"{self._index_prefix}_{tenant_id}"

    def _get_mapping(self) -> dict[str, Any]:
        """Build the index settings and field mappings for a tenant index."""
        properties: dict[str, Any] = {
            "doc_id": {"type": "keyword"},
            "source": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "content": {"type": "text"},
            "metadata": {"type": "object", "enabled": True},
            "metadata_chunk": {"type": "object", "enabled": True},
            "content_summary": {"type": "text"},
            "content_title": {"type": "text"},
        }
        for field in self._VECTOR_FIELDS:
            properties[field] = {
                "type": "knn_vector",
                "dimension": self._embedding_dimension,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "faiss",
                },
            }
        return {
            "settings": {"index": {"knn": True}},
            "mappings": {"properties": properties},
        }

    async def _ensure_index(self, tenant_id: str) -> None:
        """Create the tenant's index if it does not exist yet.

        Raises:
            ServiceUnavailableError: OpenSearch could not be reached.
        """
        index = self._index_name(tenant_id)
        if index in self._existing_indices:
            return
        try:
            if not await self._opensearch.index_exists(index):
                await self._opensearch.create_index(index, self._get_mapping())
            self._existing_indices.add(index)
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e

    def _doc_to_body(self, doc: Document) -> dict[str, Any]:
        """Convert ``doc`` to an index body, omitting optional fields left unset."""
        body: dict[str, Any] = {
            "doc_id": doc.doc_id,
            "source": doc.source.value,
            "content": doc.content,
            "metadata": doc.metadata,
            "metadata_chunk": doc.metadata_chunk,
        }
        for field in self._OPTIONAL_DOC_FIELDS:
            value = getattr(doc, field)
            if value is not None:
                body[field] = value
        return body

    def _body_to_doc(self, source: dict[str, Any]) -> Document:
        """Rebuild a ``Document`` from a stored ``_source`` mapping."""
        return Document(
            doc_id=source["doc_id"],
            source=DocumentSource(source["source"]),
            chunk_id=source.get("chunk_id"),
            content=source["content"],
            metadata=source.get("metadata", {}),
            metadata_chunk=source.get("metadata_chunk", {}),
            content_summary=source.get("content_summary"),
            content_title=source.get("content_title"),
            embedding=source.get("embedding"),
            content_embeddings=source.get("content_embeddings"),
            content_summary_embeddings=source.get("content_summary_embeddings"),
            content_title_embeddings=source.get("content_title_embeddings"),
        )

    async def initialize(self) -> None:
        """Log that the connector is ready; indices are created lazily."""
        logger.info("opensearch_connector_initialized")

    async def store(self, tenant_id: str, documents: list[Document]) -> StoreResult:
        """Index ``documents`` into the tenant's index.

        A single document is indexed directly; several go through one bulk
        request. An empty list is a no-op and does not contact OpenSearch.

        Returns:
            Counts of stored and failed documents plus failure reasons.

        Raises:
            ServiceUnavailableError: OpenSearch could not be reached.
        """
        if not documents:
            # An empty bulk body is rejected by OpenSearch; nothing to do.
            return StoreResult(stored_count=0, failed_count=0, errors=[])

        await self._ensure_index(tenant_id)
        index = self._index_name(tenant_id)

        if len(documents) == 1:
            return await self._store_one(index, documents[0])
        return await self._store_bulk(index, documents)

    async def _store_one(self, index: str, doc: Document) -> StoreResult:
        """Index one document, reporting non-connection errors in the result."""
        try:
            await self._opensearch.index_document(
                index=index, doc_id=doc.doc_id, body=self._doc_to_body(doc)
            )
        except OSConnectionError as e:
            # Same contract as the bulk path: an outage is not a document failure.
            raise ServiceUnavailableError("OpenSearch unavailable") from e
        except Exception as e:
            return StoreResult(stored_count=0, failed_count=1, errors=[str(e)])
        return StoreResult(stored_count=1, failed_count=0, errors=[])

    async def _store_bulk(self, index: str, documents: list[Document]) -> StoreResult:
        """Index several documents with one bulk request and tally the item results."""
        bulk_body: list[dict[str, Any]] = []
        for doc in documents:
            bulk_body.append({"index": {"_index": index, "_id": doc.doc_id}})
            bulk_body.append(self._doc_to_body(doc))

        try:
            result = await self._opensearch.bulk(bulk_body)
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e

        stored = 0
        failed = 0
        errors: list[str] = []
        for item in result.get("items", []):
            action = item.get("index", {})
            # An item without a status is treated as a failure.
            if action.get("status", 500) < 300:
                stored += 1
            else:
                failed += 1
                errors.append(action.get("error", {}).get("reason", "Unknown error"))
        return StoreResult(stored_count=stored, failed_count=failed, errors=errors)

    async def retrieve(self, tenant_id: str, doc_id: str) -> Document | None:
        """Fetch one document by id, or ``None`` when OpenSearch reports not found.

        Raises:
            ServiceUnavailableError: OpenSearch could not be reached.
        """
        index = self._index_name(tenant_id)
        try:
            result = await self._opensearch.get_document(index=index, doc_id=doc_id)
            return self._body_to_doc(result["_source"])
        except OSNotFoundError:
            return None
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e

    def _text_clause(self, query: SearchQuery) -> dict[str, Any]:
        """Return the ``multi_match`` clause over the full-text fields."""
        return {
            "multi_match": {
                "query": query.query_text,
                "fields": list(self._TEXT_FIELDS),
            }
        }

    def _knn_clause(self, query: SearchQuery) -> dict[str, Any]:
        """Return the k-NN clause on ``embedding``.

        ``k`` covers ``from + size`` neighbours so that pages after the first
        still have candidates to return; it is capped at the OpenSearch limit.
        """
        k = min(query.size + query.from_, self._MAX_KNN_K)
        return {"knn": {"embedding": {"vector": query.vector, "k": k}}}

    def _build_text_query(self, query: SearchQuery) -> dict[str, Any]:
        """Build a full-text search body with content highlighting."""
        body: dict[str, Any] = {
            "query": {"bool": {"must": [self._text_clause(query)]}},
            "highlight": {"fields": {"content": {}}},
            "size": query.size,
            "from": query.from_,
        }
        self._apply_filters(body, query.filters)
        return body

    def _build_semantic_query(self, query: SearchQuery) -> dict[str, Any]:
        """Build a pure k-NN search body."""
        body: dict[str, Any] = {
            "size": query.size,
            "from": query.from_,
            "query": self._knn_clause(query),
        }
        self._apply_filters(body, query.filters)
        return body

    def _build_hybrid_query(self, query: SearchQuery) -> dict[str, Any]:
        """Build a body that matches on either the text or the k-NN clause."""
        body: dict[str, Any] = {
            "size": query.size,
            "from": query.from_,
            "query": {
                "bool": {
                    "should": [self._text_clause(query), self._knn_clause(query)],
                    # Explicit because the implicit default drops to 0 once a
                    # ``filter`` is added, which would match every filtered doc.
                    "minimum_should_match": 1,
                }
            },
            "highlight": {"fields": {"content": {}}},
        }
        self._apply_filters(body, query.filters)
        return body

    @staticmethod
    def _apply_filters(body: dict[str, Any], filters: dict[str, Any]) -> None:
        """Attach exact-match ``filters`` to ``body["query"]`` in place.

        Scalar values become ``term`` clauses and list/tuple values become
        ``terms`` clauses. A non-``bool`` query is wrapped in a ``bool`` query.
        """
        if not filters:
            return
        clauses = [
            {"terms": {field: list(value)}}
            if isinstance(value, (list, tuple))
            else {"term": {field: value}}
            for field, value in filters.items()
        ]
        query = body.get("query", {})
        if "bool" in query:
            query["bool"]["filter"] = clauses
        else:
            body["query"] = {"bool": {"must": [query], "filter": clauses}}

    async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
        """Run a text, semantic or hybrid search against the tenant's index.

        A tenant whose index does not exist yet gets an empty result.

        Raises:
            ServiceUnavailableError: OpenSearch could not be reached.
        """
        index = self._index_name(tenant_id)

        if query.query_type == SearchQueryType.text:
            body = self._build_text_query(query)
        elif query.query_type == SearchQueryType.semantic:
            body = self._build_semantic_query(query)
        else:
            body = self._build_hybrid_query(query)

        start = time.monotonic()
        try:
            raw = await self._opensearch.search(index=index, query=body)
        except OSNotFoundError:
            # Nothing has been stored for this tenant, so nothing can match.
            return SearchResult(hits=[], total=0, took_ms=0)
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e
        took_ms = int((time.monotonic() - start) * 1000)

        hits: list[SearchHit] = []
        for h in raw.get("hits", {}).get("hits", []):
            src = h["_source"]
            highlights: list[str] = []
            for field_highlights in h.get("highlight", {}).values():
                highlights.extend(field_highlights)
            hits.append(
                SearchHit(
                    doc_id=src["doc_id"],
                    # ``_score`` may be present but null.
                    score=h.get("_score") or 0.0,
                    source=DocumentSource(src["source"]),
                    content=src["content"],
                    highlights=highlights,
                    metadata=src.get("metadata", {}),
                )
            )

        # ``total`` arrives either as ``{"value": n, ...}`` or as a bare number.
        total_value = raw.get("hits", {}).get("total", {})
        if isinstance(total_value, dict):
            total = total_value.get("value", 0)
        else:
            total = total_value

        return SearchResult(hits=hits, total=total, took_ms=took_ms)

    async def delete(self, tenant_id: str, doc_id: str) -> bool:
        """Delete one document; return ``False`` when OpenSearch reports not found.

        Raises:
            ServiceUnavailableError: OpenSearch could not be reached.
        """
        index = self._index_name(tenant_id)
        try:
            await self._opensearch.delete_document(index=index, doc_id=doc_id)
            return True
        except OSNotFoundError:
            return False
        except OSConnectionError as e:
            raise ServiceUnavailableError("OpenSearch unavailable") from e

    async def health_check(self) -> ConnectorHealth:
        """Ping OpenSearch and report ``up``/``down`` with the measured latency.

        Never raises: any failure is logged and reported as ``down``.
        """
        start = time.monotonic()
        try:
            ok = await self._opensearch.ping()
        except Exception as e:
            logger.warning("opensearch_health_check_failed", error=str(e))
            ok = False
        latency = (time.monotonic() - start) * 1000
        return ConnectorHealth(
            connector="opensearch",
            status="up" if ok else "down",
            latency_ms=latency,
        )

    async def teardown(self) -> None:
        """Forget which indices were verified and log the teardown."""
        self._existing_indices.clear()
        logger.info("opensearch_connector_teardown")
@@ -0,0 +1,68 @@
1
+ from typing import Any
2
+
3
+ import structlog
4
+
5
+ from pulse_engine.core.exceptions import NotFoundError, ServiceUnavailableError
6
+ from pulse_engine.storage.connectors.athena import AthenaConnector
7
+ from pulse_engine.storage.connectors.opensearch import OpenSearchConnector
8
+ from pulse_engine.storage.schemas import (
9
+ Document,
10
+ KBStatsResponse,
11
+ QueryResult,
12
+ SearchQuery,
13
+ SearchResult,
14
+ StoreResult,
15
+ )
16
+
17
# Module-level structlog logger used by the knowledge-base service.
logger = structlog.get_logger()
18
+
19
+
20
class KnowledgeBaseService:
    """Facade over the knowledge-base storage connectors.

    Document operations are delegated to the OpenSearch connector; SQL queries
    are delegated to the Athena connector when one is configured.
    """

    def __init__(
        self,
        opensearch_connector: OpenSearchConnector,
        athena_connector: AthenaConnector | None = None,
    ) -> None:
        """Keep references to the connectors; Athena is optional."""
        self._athena = athena_connector
        self._opensearch = opensearch_connector

    async def store_documents(
        self, tenant_id: str, documents: list[Document]
    ) -> StoreResult:
        """Store ``documents`` for the tenant and return the connector's result."""
        outcome = await self._opensearch.store(tenant_id, documents)
        return outcome

    async def retrieve_document(self, tenant_id: str, doc_id: str) -> Document:
        """Return the requested document or raise ``NotFoundError``."""
        document = await self._opensearch.retrieve(tenant_id, doc_id)
        if document is not None:
            return document
        raise NotFoundError(f"Document {doc_id} not found")

    async def search(self, tenant_id: str, query: SearchQuery) -> SearchResult:
        """Run ``query`` against the tenant's documents."""
        found = await self._opensearch.search(tenant_id, query)
        return found

    async def delete_document(self, tenant_id: str, doc_id: str) -> None:
        """Delete the document, raising ``NotFoundError`` if nothing was deleted."""
        if await self._opensearch.delete(tenant_id, doc_id):
            return
        raise NotFoundError(f"Document {doc_id} not found")

    async def get_stats(self, tenant_id: str) -> KBStatsResponse:
        """Return document count and index size for the tenant's index.

        Any failure while fetching or reading the statistics is logged and
        reported as zero counts instead of being raised.
        """
        # Uses the connector's private helpers directly; the index name lookup
        # sits outside the try block, so a failure there propagates.
        index = self._opensearch._index_name(tenant_id)
        try:
            stats = await self._opensearch._opensearch.index_stats(index)
            primaries = stats.get("_all", {}).get("primaries", {})
            return KBStatsResponse(
                document_count=primaries.get("docs", {}).get("count", 0),
                index_size_bytes=primaries.get("store", {}).get("size_in_bytes", 0),
            )
        except Exception as exc:
            logger.warning("stats_fetch_failed", error=str(exc))
            return KBStatsResponse(document_count=0, index_size_bytes=0)

    async def execute_query(
        self, tenant_id: str, sql: str, parameters: dict[str, Any] | None = None
    ) -> QueryResult:
        """Run ``sql`` through Athena.

        Raises:
            ServiceUnavailableError: No Athena connector was configured.
        """
        athena = self._athena
        if athena is None:
            raise ServiceUnavailableError("Athena connector is not configured")
        rows = await athena.execute_query(tenant_id, sql, parameters)
        return rows
@@ -0,0 +1,78 @@
1
from fastapi import APIRouter, Depends
from starlette.responses import JSONResponse, Response

from pulse_engine.core.scope import require_scope
from pulse_engine.dependencies import get_knowledge_base
from pulse_engine.middleware.tenant import get_tenant_id
from pulse_engine.storage.knowledge_base import KnowledgeBaseService
from pulse_engine.storage.schemas import (
    Document,
    KBStatsResponse,
    QueryRequest,
    QueryResult,
    SearchQuery,
    SearchResult,
    StoreRequest,
    StoreResult,
)
18
+
19
# Router for the knowledge-base endpoints; every route lives under /kb.
router = APIRouter(tags=["Knowledge Base"], prefix="/kb")
20
+
21
+
22
@router.post(
    "/documents",
    status_code=201,
    response_model=StoreResult,
    dependencies=[require_scope("kb:write")],
)
async def store_documents(
    body: StoreRequest,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> StoreResult:
    # POST /kb/documents: store the submitted documents for the calling tenant.
    # Requires the "kb:write" scope.
    outcome = await kb.store_documents(tenant_id, body.documents)
    return outcome
34
+
35
+
36
@router.get("/documents/{document_id}", response_model=Document)
async def retrieve_document(
    document_id: str,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> Document:
    # GET /kb/documents/{document_id}: fetch one of the tenant's documents.
    # The service raises NotFoundError when the document does not exist.
    document = await kb.retrieve_document(tenant_id, document_id)
    return document
43
+
44
+
45
@router.post("/search", response_model=SearchResult)
async def search_documents(
    body: SearchQuery,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> SearchResult:
    # POST /kb/search: run the submitted query against the tenant's documents.
    found = await kb.search(tenant_id, body)
    return found
52
+
53
+
54
@router.delete(
    "/documents/{document_id}",
    status_code=204,
    # Deleting is a write, so it carries the same scope as storing documents.
    dependencies=[require_scope("kb:write")],
)
async def delete_document(
    document_id: str,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> Response:
    # DELETE /kb/documents/{document_id}: remove one of the tenant's documents.
    # The service raises NotFoundError when the document does not exist.
    await kb.delete_document(tenant_id, document_id)
    # A 204 must not carry a body. JSONResponse(content=None) would serialise
    # the JSON literal "null", so a bare Response is returned instead.
    return Response(status_code=204)
62
+
63
+
64
@router.get("/stats", response_model=KBStatsResponse)
async def get_stats(
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> KBStatsResponse:
    # GET /kb/stats: document count and index size for the calling tenant.
    stats = await kb.get_stats(tenant_id)
    return stats
70
+
71
+
72
@router.post("/query", response_model=QueryResult)
async def run_query(
    body: QueryRequest,
    tenant_id: str = Depends(get_tenant_id),
    kb: KnowledgeBaseService = Depends(get_knowledge_base),
) -> QueryResult:
    # POST /kb/query: hand the SQL text and its parameters to the service.
    # The service raises ServiceUnavailableError when Athena is not configured.
    outcome = await kb.execute_query(tenant_id, body.sql, body.parameters)
    return outcome
@@ -0,0 +1,93 @@
1
+ from enum import Enum
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
class DocumentSource(str, Enum):
    # Allowed values of a document's ``source`` field. Members are also plain
    # strings because the enum mixes in ``str``.
    speech = "speech"
    tweet = "tweet"
    youtube = "youtube"
    processed = "processed"
12
+
13
+
14
class Document(BaseModel):
    # A knowledge-base document (or one chunk of it) with optional
    # summary/title text and optional embedding vectors.

    # Required identity and payload.
    doc_id: str
    source: DocumentSource
    chunk_id: str | None = None
    content: str

    # Free-form metadata; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)
    metadata_chunk: dict[str, Any] = Field(default_factory=dict)

    # Optional derived text.
    content_summary: str | None = None
    content_title: str | None = None

    # Optional embedding vectors.
    embedding: list[float] | None = None
    content_embeddings: list[float] | None = None
    content_summary_embeddings: list[float] | None = None
    content_title_embeddings: list[float] | None = None
27
+
28
+
29
class SearchQueryType(str, Enum):
    # Kinds of search a SearchQuery can request.
    text = "text"
    semantic = "semantic"
    hybrid = "hybrid"
33
+
34
+
35
class SearchQuery(BaseModel):
    # Search request. ``from_`` is exposed under the alias "from", and
    # ``populate_by_name`` lets callers use either spelling.
    model_config = {"populate_by_name": True}

    query_type: SearchQueryType = SearchQueryType.text
    query_text: str | None = None
    vector: list[float] | None = None
    # Field name -> value pairs; empty by default.
    filters: dict[str, Any] = Field(default_factory=dict)
    # Page size, constrained to 1..100.
    size: int = Field(10, ge=1, le=100)
    # Offset of the first hit; must not be negative.
    from_: int = Field(0, ge=0, alias="from")
44
+
45
+
46
class StoreRequest(BaseModel):
    # Request body carrying the documents to store.
    documents: list[Document]
48
+
49
+
50
class StoreResult(BaseModel):
    # Outcome of a store operation: counts plus failure messages.
    stored_count: int
    failed_count: int
    errors: list[str] = Field(default_factory=list)
54
+
55
+
56
class SearchHit(BaseModel):
    # One search hit: the matched document's core fields, its score, and any
    # highlight fragments.
    doc_id: str
    score: float
    source: DocumentSource
    content: str
    highlights: list[str] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
63
+
64
+
65
class SearchResult(BaseModel):
    # Search response: the returned hits, the total match count, and the
    # elapsed time in milliseconds.
    hits: list[SearchHit]
    total: int
    took_ms: int
69
+
70
+
71
class ConnectorHealth(BaseModel):
    # Health report for one connector.
    connector: str
    # Must be exactly "up", "down" or "degraded".
    status: str = Field(pattern=r"^(up|down|degraded)$")
    latency_ms: float | None = None
    details: dict[str, Any] = Field(default_factory=dict)
76
+
77
+
78
class QueryRequest(BaseModel):
    # Request body for a SQL query: the statement plus optional parameters.
    sql: str
    parameters: dict[str, Any] = Field(default_factory=dict)
81
+
82
+
83
class QueryResult(BaseModel):
    # Tabular query result: column names, row values, and basic run metrics.
    columns: list[str]
    rows: list[list[Any]]
    row_count: int
    execution_time_ms: int
88
+
89
+
90
class KBStatsResponse(BaseModel):
    # Knowledge-base statistics for one tenant.
    document_count: int
    index_size_bytes: int
    # Integer counts keyed by string; empty unless the producer fills it in.
    sources: dict[str, int] = Field(default_factory=dict)
@@ -0,0 +1,13 @@
1
+ """Pulse Engine testing utilities — import fixtures and mocks for product tests."""
2
+
3
+ from pulse_engine.testing.mocks import (
4
+ MockExtractor,
5
+ MockOrchestratorAdapter,
6
+ MockStorageConnector,
7
+ )
8
+
9
# Public names re-exported from ``pulse_engine.testing``.
__all__ = ["MockExtractor", "MockOrchestratorAdapter", "MockStorageConnector"]
@@ -0,0 +1,50 @@
1
+ """Reusable pytest fixtures for products that depend on pulse-engine.
2
+
3
+ Products import these in their ``conftest.py``::
4
+
5
+ from pulse_engine.testing.fixtures import * # noqa: F401, F403
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import pytest
11
+
12
+ from pulse_engine.processor.pipeline import ProcessingPipeline
13
+ from pulse_engine.storage.knowledge_base import KnowledgeBaseService
14
+ from pulse_engine.testing.mocks import (
15
+ MockExtractor,
16
+ MockOrchestratorAdapter,
17
+ MockStorageConnector,
18
+ )
19
+
20
+
21
@pytest.fixture()
def mock_storage_connector() -> MockStorageConnector:
    """Provide a newly constructed ``MockStorageConnector``."""
    connector = MockStorageConnector()
    return connector
24
+
25
+
26
@pytest.fixture()
def mock_orchestrator() -> MockOrchestratorAdapter:
    """Provide a newly constructed ``MockOrchestratorAdapter``."""
    adapter = MockOrchestratorAdapter()
    return adapter
29
+
30
+
31
@pytest.fixture()
def mock_extractor() -> MockExtractor:
    """Provide a newly constructed ``MockExtractor``."""
    extractor = MockExtractor()
    return extractor
34
+
35
+
36
@pytest.fixture()
def kb_service(mock_storage_connector: MockStorageConnector) -> KnowledgeBaseService:
    """Build a ``KnowledgeBaseService`` on the mock connector, without Athena."""
    service = KnowledgeBaseService(
        athena_connector=None,
        # The mock stands in for the real OpenSearch connector type.
        opensearch_connector=mock_storage_connector,  # type: ignore[arg-type]
    )
    return service
43
+
44
+
45
@pytest.fixture()
def processing_pipeline(
    kb_service: KnowledgeBaseService,
) -> ProcessingPipeline:
    """Build a ``ProcessingPipeline`` that is given only the mock-backed KB service."""
    pipeline = ProcessingPipeline(kb_service=kb_service)
    return pipeline