pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""MCP tools for Pipeline operations."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pulse_engine.core.exceptions import NotFoundError, ServiceUnavailableError
|
|
6
|
+
from pulse_engine.mcp.server import get_container, handle_mcp_errors, mcp
|
|
7
|
+
from pulse_engine.pipeline.config_parser import (
|
|
8
|
+
PipelineConfigError,
|
|
9
|
+
parse_pipeline_config,
|
|
10
|
+
)
|
|
11
|
+
from pulse_engine.pipeline.repositories import (
|
|
12
|
+
ModuleRegistryRepository,
|
|
13
|
+
PipelineRunRepository,
|
|
14
|
+
)
|
|
15
|
+
from pulse_engine.pipeline.schemas import PipelineRunStatus, PipelineStatusResponse
|
|
16
|
+
from pulse_engine.pipeline.service import PipelineService
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _build_pipeline_service(
    session: Any,
    container: Any,
) -> PipelineService:
    """Assemble a ``PipelineService`` bound to a single database session.

    Args:
        session: Database session handed to both repositories.
        container: Application container supplying the translators, status
            providers, settings and token issuer.

    Returns:
        A ``PipelineService`` wired with repositories built over ``session``.
    """
    # Both repositories share the caller's session.
    module_repo = ModuleRegistryRepository(session)
    run_repo = PipelineRunRepository(session)
    return PipelineService(
        module_repo=module_repo,
        run_repo=run_repo,
        translators=container.pipeline_translators,
        status_providers=container.pipeline_status_providers,
        settings=container.settings,
        token_issuer=container.token_issuer,
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@mcp.tool()
@handle_mcp_errors
async def pipelines_trigger(
    tenant_id: str,
    product: str,
    orchestrator: str,
    pipeline_yaml: str,
    global_config: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Trigger a pipeline run.

    Args:
        tenant_id: Tenant identifier.
        product: Product name (must have modules registered).
        orchestrator: Orchestrator backend ("prefect").
        pipeline_yaml: Pipeline definition as a YAML string.
        global_config: Config passed to all steps (overrides yaml config block).
            When omitted or empty, the yaml config block is used instead.

    Returns:
        {"run_id": "<uuid>"} on success. If the YAML cannot be parsed, an
        error payload with "error", "error_type" and "errors" keys.
    """
    container = get_container()
    session_factory = container.db_session_factory
    if session_factory is None:
        raise ServiceUnavailableError("Database not configured")

    try:
        parsed = parse_pipeline_config(pipeline_yaml)
    except PipelineConfigError as exc:
        # Config problems are returned as a payload rather than raised, so the
        # caller receives the full list of validation errors.
        return {
            "error": True,
            "error_type": "PipelineConfigError",
            "errors": exc.errors,
        }

    async with session_factory() as session:
        service = _build_pipeline_service(session, container)
        # Any falsy global_config (None or {}) falls back to the yaml block.
        effective_config = global_config if global_config else parsed.config
        run_id = await service.trigger(
            tenant_id=tenant_id,
            product=product,
            orchestrator=orchestrator,
            config=parsed,
            global_config=effective_config,
        )
        return {"run_id": run_id}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@mcp.tool()
@handle_mcp_errors
async def pipelines_status(
    tenant_id: str,
    run_id: str,
) -> dict[str, Any]:
    """Get the status and step details of a pipeline run.

    Args:
        tenant_id: Tenant identifier.
        run_id: Pipeline run ID (from pipelines_trigger).

    Returns:
        Pipeline status with run_id, product, orchestrator, status, and steps.
    """
    container = get_container()
    session_factory = container.db_session_factory
    if session_factory is None:
        raise ServiceUnavailableError("Database not configured")

    async with session_factory() as session:
        service = _build_pipeline_service(session, container)

        run = await service.get_run(tenant_id, run_id)
        if run is None:
            raise NotFoundError("Pipeline run not found", run_id=run_id)

        live = await service.get_status(tenant_id, run_id)
        if live:
            status_value = live.status
            started_at = live.started_at
            steps = live.steps
        else:
            # No status available from the service: fall back to the stored
            # run record and report no step details.
            status_value = run.status
            started_at = None
            steps = []

        resp = PipelineStatusResponse(
            run_id=run.id,
            product=run.product,
            orchestrator=run.orchestrator,
            status=PipelineRunStatus(status_value),
            started_at=started_at,  # type: ignore[arg-type]
            steps=steps,
        )
        return resp.model_dump(mode="json")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@mcp.tool()
@handle_mcp_errors
async def pipelines_list(
    tenant_id: str,
    product: str | None = None,
    status: str | None = None,
    limit: int = 20,
    offset: int = 0,
) -> dict[str, Any]:
    """List pipeline runs for a tenant.

    Args:
        tenant_id: Tenant identifier.
        product: Filter by product name.
        status: Filter by status (pending, running, completed, failed).
        limit: Max results to return.
        offset: Pagination offset.

    Returns:
        {"items": [...], "total": N, "limit": N, "offset": N}
    """
    container = get_container()
    session_factory = container.db_session_factory
    if session_factory is None:
        raise ServiceUnavailableError("Database not configured")

    def _summarize(run: Any) -> dict[str, Any]:
        # Flatten one run record into a JSON-friendly summary row.
        return {
            "run_id": run.id,
            "product": run.product,
            "orchestrator": run.orchestrator,
            "status": run.status,
            "created_at": run.created_at.isoformat() if run.created_at else None,
        }

    async with session_factory() as session:
        service = _build_pipeline_service(session, container)
        runs, total = await service.list_runs(
            tenant_id, product=product, status=status, limit=limit, offset=offset
        )
        rows = [_summarize(run) for run in runs]
        # limit/offset are echoed back exactly as they were received.
        return {"items": rows, "total": total, "limit": limit, "offset": offset}
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@mcp.tool()
@handle_mcp_errors
async def pipelines_cancel(
    tenant_id: str,
    run_id: str,
) -> dict[str, Any]:
    """Cancel a running pipeline.

    Args:
        tenant_id: Tenant identifier.
        run_id: Pipeline run ID to cancel.

    Returns:
        {"run_id": "...", "status": "cancelled"} on success.
    """
    container = get_container()
    session_factory = container.db_session_factory
    if session_factory is None:
        raise ServiceUnavailableError("Database not configured")

    async with session_factory() as session:
        service = _build_pipeline_service(session, container)
        cancelled = await service.cancel(tenant_id, run_id)
        if cancelled:
            return {"run_id": run_id, "status": "cancelled"}
        # A falsy result does not say whether the run is missing or simply
        # not cancellable, so the message covers both.
        raise NotFoundError(
            "Pipeline run not found or cannot be cancelled", run_id=run_id
        )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@mcp.tool()
@handle_mcp_errors
async def pipelines_steps(
    tenant_id: str,
    run_id: str,
) -> dict[str, Any]:
    """Get detailed step execution info for a pipeline run.

    Args:
        tenant_id: Tenant identifier.
        run_id: Pipeline run ID.

    Returns:
        {"run_id": "...", "steps": [{step, module, status, started_at}, ...]}
    """
    container = get_container()
    session_factory = container.db_session_factory
    if session_factory is None:
        raise ServiceUnavailableError("Database not configured")

    async with session_factory() as session:
        service = _build_pipeline_service(session, container)
        snapshot = await service.get_status(tenant_id, run_id)
        if snapshot is None:
            raise NotFoundError("Pipeline run not found", run_id=run_id)

        # Each step model is dumped in JSON mode so the payload is serializable.
        serialized_steps = [step.model_dump(mode="json") for step in snapshot.steps]
        return {"run_id": run_id, "steps": serialized_steps}
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""MCP tools for Processor operations."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pulse_engine.mcp.server import get_container, handle_mcp_errors, mcp
|
|
7
|
+
from pulse_engine.processor.core.analysis import (
|
|
8
|
+
classify_sentiment,
|
|
9
|
+
extract_entities,
|
|
10
|
+
extract_topics,
|
|
11
|
+
summarize,
|
|
12
|
+
)
|
|
13
|
+
from pulse_engine.processor.core.chunking import chunk_content
|
|
14
|
+
from pulse_engine.processor.postprocessor.tasks import run_postprocessing
|
|
15
|
+
from pulse_engine.processor.preprocessor.tasks import (
|
|
16
|
+
clean_html,
|
|
17
|
+
detect_language,
|
|
18
|
+
normalize_text,
|
|
19
|
+
validate_content,
|
|
20
|
+
)
|
|
21
|
+
from pulse_engine.processor.schemas import (
|
|
22
|
+
ContentChunk,
|
|
23
|
+
ProcessingContext,
|
|
24
|
+
ProcessingOptions,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@mcp.tool()
@handle_mcp_errors
async def process_pipeline(
    tenant_id: str,
    content: str,
    source_id: str,
    source_type: str = "text",
    product: str = "",
    job_id: str | None = None,
    options: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run the full processing pipeline (preprocess → analyze → postprocess).

    Args:
        tenant_id: Tenant identifier stored on the processing context.
        content: Raw content to process.
        source_id: Identifier of the content source.
        source_type: Source type label (default "text").
        product: Product name stored on the processing context.
        job_id: Optional job identifier stored on the processing context.
        options: Keyword arguments for ``ProcessingOptions``; defaults apply
            when omitted.

    Returns:
        Summary dict with source_id, chunks_produced, chunks_stored,
        chunks_deduplicated, processing_time_ms, summary, languages_detected,
        entities_found and stages_completed.
    """
    # Imported at call time, as in the original module.
    from pulse_engine.processor.pipeline import ProcessingPipeline

    container = get_container()
    pipeline = ProcessingPipeline(kb_service=container.kb_service)

    opts = ProcessingOptions(**(options or {}))
    ctx = ProcessingContext(
        source_id=source_id,
        source_type=source_type,
        product=product,
        tenant_id=tenant_id,
        job_id=job_id,
        raw_content=content,
        options=opts,
    )

    start = time.monotonic()
    ctx = await pipeline.run(ctx)
    elapsed_ms = int((time.monotonic() - start) * 1000)

    doc_count = len(ctx.documents)
    chunk_count = len(ctx.chunks)
    # The previous "total_chunks" expression always evaluated to doc_count, so
    # this value is unchanged: documents in excess of chunks, floored at 0.
    # NOTE(review): counting documents beyond chunks looks inverted for a
    # deduplication metric - confirm the intended definition.
    deduped = max(0, doc_count - chunk_count)
    stored = doc_count if ctx.options.store_results else 0
    languages = [ctx.language] if ctx.language and ctx.language != "unknown" else []

    return {
        "source_id": ctx.source_id,
        "chunks_produced": chunk_count,
        "chunks_stored": stored,
        "chunks_deduplicated": deduped,
        "processing_time_ms": elapsed_ms,
        "summary": ctx.summary,
        "languages_detected": languages,
        "entities_found": len(ctx.entities),
        "stages_completed": ctx.stages_completed,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@mcp.tool()
@handle_mcp_errors
async def process_preprocess(
    tenant_id: str,
    content: str,
    source_type: str = "text",
) -> dict[str, Any]:
    """Clean, normalize, and validate content.

    Args:
        tenant_id: Tenant identifier (not read by this function).
        content: Raw content to clean.
        source_type: Source type label (not read by this function).

    Returns:
        {"cleaned_content": str, "language": detected language or None when
        the content is invalid, "is_valid": validation result}
    """
    cleaned = normalize_text(clean_html(content))
    # Validate once and reuse the verdict; the text was previously validated
    # twice with identical input.
    is_valid = validate_content(cleaned)
    lang = detect_language(cleaned) if is_valid else None

    return {
        "cleaned_content": cleaned,
        "language": lang,
        "is_valid": is_valid,
    }
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@mcp.tool()
@handle_mcp_errors
async def process_analyze(
    tenant_id: str,
    content: str,
    options: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Chunk content and run NER, sentiment, topic extraction, and summarization.

    Args:
        tenant_id: Tenant identifier (not read by this function).
        content: Text to analyze.
        options: Keyword arguments for ``ProcessingOptions``; defaults apply
            when omitted.

    Returns:
        {"chunks": [...], "entities": [...], "sentiment": ..., "topics": [...],
        "summary": ...}
    """
    opts = ProcessingOptions(**(options or {}))

    pieces = chunk_content(
        text=content,
        source_id="analyze-temp",
        strategy=opts.chunk_strategy,
        chunk_size=opts.chunk_size,
        overlap=opts.chunk_overlap,
    )

    # Per-chunk annotations, each gated by its option flag.
    for piece in pieces:
        if opts.enable_ner:
            piece.entities = extract_entities(piece.content)
        if opts.enable_sentiment:
            piece.sentiment = classify_sentiment(piece.content)
        if opts.enable_topics:
            piece.topics = extract_topics(piece.content)

    # Document-level analysis over the whole text, under the same flags.
    if opts.enable_ner:
        entities = extract_entities(content)
    else:
        entities = []
    if opts.enable_sentiment:
        sentiment = classify_sentiment(content)
    else:
        sentiment = None
    if opts.enable_topics:
        topics = extract_topics(content)
    else:
        topics = []
    # Summarization has no option flag here; it always runs.
    summary_text = summarize(content)

    return {
        "chunks": [piece.model_dump(mode="json") for piece in pieces],
        "entities": [entity.model_dump(mode="json") for entity in entities],
        "sentiment": sentiment,
        "topics": topics,
        "summary": summary_text,
    }
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@mcp.tool()
@handle_mcp_errors
async def process_postprocess(
    tenant_id: str,
    content: str,
    chunks: list[dict[str, Any]],
    source_id: str,
    source_type: str = "text",
    product: str = "",
    options: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run postprocessing: embeddings, quality scoring, dedup, document formatting.

    Args:
        tenant_id: Tenant identifier stored on the processing context.
        content: Content used as both the raw and the cleaned content.
        chunks: Chunk dicts, each expanded into a ``ContentChunk``.
        source_id: Identifier of the content source.
        source_type: Source type label (default "text").
        product: Product name stored on the processing context.
        options: Keyword arguments for ``ProcessingOptions``; defaults apply
            when omitted.

    Returns:
        {"documents_created": N, "embeddings_generated": N, "quality_scores": ...}
    """
    opts = ProcessingOptions(**(options or {}))
    chunk_models = [ContentChunk(**raw_chunk) for raw_chunk in chunks]

    # No cleaning step runs in this tool, so the same text fills both the
    # raw and the cleaned content fields.
    context = ProcessingContext(
        source_id=source_id,
        source_type=source_type,
        product=product,
        tenant_id=tenant_id,
        raw_content=content,
        cleaned_content=content,
        chunks=chunk_models,
        options=opts,
    )
    context = run_postprocessing(context)

    return {
        "documents_created": len(context.documents),
        "embeddings_generated": len(context.embeddings),
        "quality_scores": context.quality_scores,
    }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@mcp.tool()
@handle_mcp_errors
async def process_chunk(
    tenant_id: str,
    content: str,
    strategy: str = "token_count",
    chunk_size: int = 512,
    overlap: int = 50,
) -> dict[str, Any]:
    """Split content into chunks using the specified strategy.

    Args:
        tenant_id: Tenant identifier (not read by this function).
        content: Text to split.
        strategy: Chunking strategy name passed to ``chunk_content``.
        chunk_size: Chunk size passed to ``chunk_content``.
        overlap: Overlap passed to ``chunk_content``.

    Returns:
        {"chunks": [{"index", "content", "token_count"}, ...],
        "total_chunks": N, "strategy": "..."}
    """
    produced = chunk_content(
        text=content,
        source_id="chunk-temp",
        strategy=strategy,
        chunk_size=chunk_size,
        overlap=overlap,
    )

    # Only index, content and token_count are exposed for each chunk.
    rows: list[dict[str, Any]] = []
    for piece in produced:
        rows.append(
            {
                "index": piece.chunk_index,
                "content": piece.content,
                "token_count": piece.token_count,
            }
        )

    return {"chunks": rows, "total_chunks": len(rows), "strategy": strategy}
|
|
File without changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""In-memory sliding-window rate limiter.
|
|
2
|
+
|
|
3
|
+
Provides per-key rate limiting without external dependencies. Suitable for
|
|
4
|
+
single-instance deployments; for multi-instance, swap the storage backend
|
|
5
|
+
to Redis.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from threading import Lock
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
16
|
+
from starlette.requests import Request
|
|
17
|
+
from starlette.responses import JSONResponse, Response
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class _SlidingWindowCounter:
|
|
21
|
+
"""Thread-safe sliding-window counter."""
|
|
22
|
+
|
|
23
|
+
def __init__(self) -> None:
|
|
24
|
+
self._windows: dict[str, list[float]] = defaultdict(list)
|
|
25
|
+
self._lock = Lock()
|
|
26
|
+
|
|
27
|
+
def hit(self, key: str, max_requests: int, window_seconds: int) -> bool:
|
|
28
|
+
"""Record a hit and return True if under the limit, False if exceeded."""
|
|
29
|
+
now = time.monotonic()
|
|
30
|
+
cutoff = now - window_seconds
|
|
31
|
+
|
|
32
|
+
with self._lock:
|
|
33
|
+
timestamps = self._windows[key]
|
|
34
|
+
# Prune expired entries
|
|
35
|
+
self._windows[key] = timestamps = [t for t in timestamps if t > cutoff]
|
|
36
|
+
if len(timestamps) >= max_requests:
|
|
37
|
+
return False
|
|
38
|
+
timestamps.append(now)
|
|
39
|
+
return True
|
|
40
|
+
|
|
41
|
+
def remaining(self, key: str, max_requests: int, window_seconds: int) -> int:
|
|
42
|
+
"""Return remaining requests in the current window."""
|
|
43
|
+
now = time.monotonic()
|
|
44
|
+
cutoff = now - window_seconds
|
|
45
|
+
|
|
46
|
+
with self._lock:
|
|
47
|
+
timestamps = self._windows[key]
|
|
48
|
+
active = [t for t in timestamps if t > cutoff]
|
|
49
|
+
return max(0, max_requests - len(active))
|
|
50
|
+
|
|
51
|
+
def reset(self) -> None:
|
|
52
|
+
"""Clear all counters — intended for use in tests only."""
|
|
53
|
+
with self._lock:
|
|
54
|
+
self._windows.clear()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Process-wide counter instance. RateLimitMiddleware falls back to it when no
# counter is injected, and check_rate_limit() always uses it.
_counter = _SlidingWindowCounter()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _client_ip(request: Request) -> str:
|
|
62
|
+
"""Extract the client IP, respecting X-Forwarded-For behind a proxy."""
|
|
63
|
+
forwarded = request.headers.get("x-forwarded-for")
|
|
64
|
+
if forwarded:
|
|
65
|
+
return forwarded.split(",")[0].strip()
|
|
66
|
+
client = request.client
|
|
67
|
+
return client.host if client else "unknown"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class RateLimitMiddleware(BaseHTTPMiddleware):
    """Global per-IP rate limiting middleware.

    Every request is counted under the key ``global:<client ip>``. Requests
    over the limit receive a 429 JSON response; all others are passed on and
    their responses gain ``X-RateLimit-*`` headers.

    Args:
        app: ASGI application.
        max_requests: Maximum requests per window (default 100).
        window_seconds: Window duration in seconds (default 60).
        counter: Optional counter instance; defaults to the shared module-level
            singleton. Pass a fresh ``_SlidingWindowCounter()`` in tests to
            keep state isolated between test cases.
    """

    def __init__(
        self,
        app: Any,
        max_requests: int = 100,
        window_seconds: int = 60,
        counter: _SlidingWindowCounter | None = None,
    ) -> None:
        super().__init__(app)
        self._max = max_requests
        self._window = window_seconds
        self._counter = _counter if counter is None else counter

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        """Count the request and either forward it or reject it with 429."""
        bucket = f"global:{_client_ip(request)}"
        within_limit = self._counter.hit(bucket, self._max, self._window)
        # Read after the hit, so the value already accounts for this request.
        left = self._counter.remaining(bucket, self._max, self._window)

        if within_limit:
            response = await call_next(request)
            response.headers["X-RateLimit-Limit"] = str(self._max)
            response.headers["X-RateLimit-Remaining"] = str(left)
            return response

        # Over the limit: short-circuit without calling the application.
        # Retry-After is the full window length, not the exact time left.
        rejection_headers = {
            "Retry-After": str(self._window),
            "X-RateLimit-Limit": str(self._max),
            "X-RateLimit-Remaining": "0",
        }
        rejection_body = {
            "success": False,
            "error": "Rate limit exceeded. Try again later.",
        }
        return JSONResponse(
            status_code=429,
            content=rejection_body,
            headers=rejection_headers,
        )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def check_rate_limit(
    request: Request,
    *,
    scope: str,
    max_requests: int,
    window_seconds: int,
) -> None:
    """Enforce a per-scope, per-client rate limit using the shared counter.

    The hit is counted under the key ``<scope>:<client ip>``.

    Args:
        request: Incoming request; used only to derive the client IP.
        scope: Name of the limit bucket (for example ``"login"``).
        max_requests: Maximum accepted hits per window.
        window_seconds: Window duration in seconds.

    Raises:
        TooManyRequestsError: When the limit for this scope and client has
            been reached.

    Usage in endpoints::

        @router.post("/login")
        async def login(request: Request, body: LoginRequest):
            check_rate_limit(request, scope="login", max_requests=5, window_seconds=60)
            ...
    """
    bucket = f"{scope}:{_client_ip(request)}"
    if _counter.hit(bucket, max_requests, window_seconds):
        return

    # Imported only on the failure path, as in the original
    # (presumably to avoid an import cycle - not verifiable from this file).
    from pulse_engine.core.exceptions import TooManyRequestsError

    raise TooManyRequestsError(
        f"Too many requests. Try again in {window_seconds} seconds."
    )
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
|
|
3
|
+
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
4
|
+
from starlette.requests import Request
|
|
5
|
+
from starlette.responses import Response
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RequestIDMiddleware(BaseHTTPMiddleware):
    """Attach a request ID to the request state and to the response.

    A non-empty inbound ``x-request-id`` header is reused; otherwise a random
    UUID4 string is generated. The ID is stored on
    ``request.state.request_id`` and returned in the ``x-request-id``
    response header.
    """

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        inbound_id = request.headers.get("x-request-id")
        # The client-supplied value is used as-is when present and non-empty.
        request_id = inbound_id if inbound_id else str(uuid.uuid4())
        request.state.request_id = request_id

        response = await call_next(request)
        response.headers["x-request-id"] = request_id
        return response
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Middleware that adds defensive HTTP security headers to every response."""
|
|
2
|
+
|
|
3
|
+
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
4
|
+
from starlette.requests import Request
|
|
5
|
+
from starlette.responses import Response
|
|
6
|
+
|
|
7
|
+
# Response headers stamped onto every response by SecurityHeadersMiddleware.
SECURITY_HEADERS: dict[str, str] = {
    # Disable MIME-type sniffing.
    "X-Content-Type-Options": "nosniff",
    # Forbid rendering responses inside frames.
    "X-Frame-Options": "DENY",
    # NOTE(review): legacy header; current browser guidance tends to favour
    # "0" or omitting it - confirm before changing.
    "X-XSS-Protection": "1; mode=block",
    # HSTS for one year, including subdomains.
    "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
    "Referrer-Policy": "strict-origin-when-cross-origin",
    # Deny geolocation, microphone and camera access.
    "Permissions-Policy": "geolocation=(), microphone=(), camera=()",
    # Restrict resource loading to the same origin by default.
    "Content-Security-Policy": "default-src 'self'",
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Set every ``SECURITY_HEADERS`` entry on each outgoing response."""

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        response = await call_next(request)
        outgoing = response.headers
        # Plain assignment: a header of the same name set by the application
        # is overwritten.
        for name, value in SECURITY_HEADERS.items():
            outgoing[name] = value
        return response
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import structlog
|
|
2
|
+
from fastapi import Request
|
|
3
|
+
from jose import jwt as jose_jwt
|
|
4
|
+
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
5
|
+
from starlette.responses import JSONResponse, Response
|
|
6
|
+
from starlette.types import ASGIApp
|
|
7
|
+
|
|
8
|
+
from pulse_engine.core.exceptions import UnauthorizedError
|
|
9
|
+
from pulse_engine.core.job_token import JobClaims
|
|
10
|
+
from pulse_engine.core.security import (
|
|
11
|
+
CognitoClaims,
|
|
12
|
+
TokenVerifier,
|
|
13
|
+
extract_bearer_token,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Module-level structlog logger.
logger = structlog.get_logger()

# Exact request paths that TenantMiddleware lets through without a bearer
# token. Matching is by equality on the URL path, not by prefix.
PUBLIC_PATHS = {
    "/api/v1/health",
    "/api/v1/auth/login",
    # OpenAPI/Swagger docs — only reachable in non-production environments
    # because create_app() sets docs_url/redoc_url/openapi_url to None in production.
    "/docs",
    "/docs/oauth2-redirect",
    "/openapi.json",
    "/redoc",
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class TenantMiddleware(BaseHTTPMiddleware):
    """Authenticate requests and expose the tenant on ``request.state``.

    Requests whose path is in ``PUBLIC_PATHS`` pass through untouched. For
    all others the bearer token is verified, and ``request.state.tenant_id``
    and ``request.state.user_claims`` are set from the resulting claims. A
    verification failure produces a 401 JSON response.

    Args:
        app: ASGI application.
        verifier: Verifier used for every token not handled as a job token.
        job_token_secret: Secret for job tokens; an empty string disables
            job-token handling.
    """

    def __init__(
        self,
        app: ASGIApp,
        verifier: TokenVerifier,
        job_token_secret: str = "",
    ) -> None:
        super().__init__(app)
        self._verifier = verifier
        self._job_token_secret = job_token_secret

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        """Verify the bearer token, then forward the request or answer 401."""
        if request.url.path in PUBLIC_PATHS:
            return await call_next(request)

        try:
            token = extract_bearer_token(request.headers.get("authorization"))
            claims = await self._resolve_token(token)
            request.state.tenant_id = claims.tenant_id
            request.state.user_claims = claims
        except UnauthorizedError as auth_error:
            # request_id is only present if an earlier middleware stored it.
            request_id = getattr(request.state, "request_id", "unknown")
            payload = {
                "success": False,
                "error": auth_error.message,
                "request_id": request_id,
            }
            return JSONResponse(status_code=401, content=payload)
        else:
            return await call_next(request)

    async def _resolve_token(self, token: str) -> CognitoClaims | JobClaims:
        """Dispatch to job-token or Cognito verifier based on sub prefix."""
        if not self._job_token_secret:
            return await self._verifier.verify(token)

        try:
            # Unverified peek at the claims, used only to choose a verifier;
            # the chosen verifier then validates the token.
            subject = jose_jwt.get_unverified_claims(token).get("sub", "")
            if subject.startswith("job:"):
                from pulse_engine.core.job_token import JobTokenVerifier

                job_verifier = JobTokenVerifier(secret=self._job_token_secret)
                return await job_verifier.verify(token)
        except UnauthorizedError:
            # A rejected job token is final; do not retry with Cognito.
            raise
        except Exception as exc:
            # Any other failure while inspecting the token falls through to
            # the default verifier below.
            logger.debug(
                "job_token_parse_failed_falling_back_to_cognito",
                error=str(exc),
            )
        return await self._verifier.verify(token)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def get_tenant_id(request: Request) -> str:
    """Return the tenant ID stored on ``request.state``.

    Args:
        request: Request whose state may carry a ``tenant_id`` attribute.

    Returns:
        The non-empty tenant ID.

    Raises:
        UnauthorizedError: When ``tenant_id`` is missing or falsy.
    """
    tenant_id: str | None = getattr(request.state, "tenant_id", None)
    if tenant_id:
        return tenant_id
    raise UnauthorizedError("Tenant context not available")
|
|
File without changes
|