pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Depends
|
|
4
|
+
|
|
5
|
+
from pulse_engine.dependencies import get_processing_pipeline
|
|
6
|
+
from pulse_engine.middleware.tenant import get_tenant_id
|
|
7
|
+
from pulse_engine.processor.core.analysis import (
|
|
8
|
+
classify_sentiment,
|
|
9
|
+
extract_entities,
|
|
10
|
+
extract_topics,
|
|
11
|
+
summarize,
|
|
12
|
+
)
|
|
13
|
+
from pulse_engine.processor.core.chunking import chunk_content
|
|
14
|
+
from pulse_engine.processor.pipeline import ProcessingPipeline
|
|
15
|
+
from pulse_engine.processor.postprocessor.tasks import (
|
|
16
|
+
run_postprocessing,
|
|
17
|
+
)
|
|
18
|
+
from pulse_engine.processor.preprocessor.tasks import (
|
|
19
|
+
clean_html,
|
|
20
|
+
detect_language,
|
|
21
|
+
normalize_text,
|
|
22
|
+
validate_content,
|
|
23
|
+
)
|
|
24
|
+
from pulse_engine.processor.schemas import (
|
|
25
|
+
AnalyzeRequest,
|
|
26
|
+
AnalyzeResponse,
|
|
27
|
+
ChunkItem,
|
|
28
|
+
ChunkRequest,
|
|
29
|
+
ChunkResponse,
|
|
30
|
+
PipelineRequest,
|
|
31
|
+
PipelineResponse,
|
|
32
|
+
PostprocessRequest,
|
|
33
|
+
PostprocessResponse,
|
|
34
|
+
PreprocessRequest,
|
|
35
|
+
PreprocessResponse,
|
|
36
|
+
ProcessingContext,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
router = APIRouter(prefix="/process", tags=["Processor"])
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@router.post("/pipeline", response_model=PipelineResponse)
async def run_pipeline(
    body: PipelineRequest,
    tenant_id: str = Depends(get_tenant_id),
    pipeline: ProcessingPipeline = Depends(get_processing_pipeline),
) -> PipelineResponse:
    """Run the injected processing pipeline on one request and summarise the result.

    A ``ProcessingContext`` is built from the request body plus the tenant
    resolved by ``get_tenant_id``, handed to ``pipeline.run``, and the returned
    context is condensed into counters for the response. The elapsed time is
    measured with a monotonic clock around context construction and the run.
    """
    started_at = time.monotonic()

    initial_ctx = ProcessingContext(
        source_id=body.source_id,
        source_type=body.source_type,
        product=body.product,
        tenant_id=tenant_id,
        job_id=body.job_id,
        raw_content=body.content,
        options=body.options,
    )
    ctx = await pipeline.run(initial_ctx)

    elapsed_ms = int((time.monotonic() - started_at) * 1000)

    produced = len(ctx.chunks)
    document_total = len(ctx.documents)

    # Documents in excess of chunks, floored at zero.
    # NOTE(review): if deduplication yields fewer documents than chunks this
    # is always 0 -- confirm the intended direction of the subtraction.
    deduped = max(0, document_total - produced)
    stored = document_total if ctx.options.store_results else 0

    detected = ctx.language
    languages = [detected] if detected and detected != "unknown" else []

    return PipelineResponse(
        source_id=ctx.source_id,
        chunks_produced=produced,
        chunks_stored=stored,
        chunks_deduplicated=deduped,
        processing_time_ms=elapsed_ms,
        summary=ctx.summary,
        languages_detected=languages,
        entities_found=len(ctx.entities),
        stages_completed=ctx.stages_completed,
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@router.post("/preprocess", response_model=PreprocessResponse)
async def preprocess(
    body: PreprocessRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> PreprocessResponse:
    """Clean and normalise raw content, then report its validity and language.

    The content goes through ``clean_html`` and then ``normalize_text``.
    Language detection runs only when ``validate_content`` accepts the cleaned
    text; otherwise ``language`` is ``None``.

    ``tenant_id`` is not used in the body; declaring it keeps the
    ``get_tenant_id`` dependency in the request path.
    """
    cleaned = normalize_text(clean_html(body.content))

    # Validate once and reuse the verdict for both the language gate and the
    # response (previously the same check was evaluated twice).
    is_valid = validate_content(cleaned)
    lang = detect_language(cleaned) if is_valid else None

    return PreprocessResponse(
        cleaned_content=cleaned,
        language=lang,
        is_valid=is_valid,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@router.post("/analyze", response_model=AnalyzeResponse)
async def analyze(
    body: AnalyzeRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> AnalyzeResponse:
    """Chunk the content and run the enabled analyses per chunk and on the whole text.

    Entity extraction, sentiment classification and topic extraction are each
    gated by their ``enable_*`` option; a summary of the full content is always
    produced. Chunks are created under the placeholder source id
    ``"analyze-temp"``.
    """
    opts = body.options
    text = body.content

    pieces = chunk_content(
        text=text,
        source_id="analyze-temp",
        strategy=opts.chunk_strategy,
        chunk_size=opts.chunk_size,
        overlap=opts.chunk_overlap,
    )

    # Annotate each chunk in place with whichever analyses are enabled.
    for piece in pieces:
        if opts.enable_ner:
            piece.entities = extract_entities(piece.content)
        if opts.enable_sentiment:
            piece.sentiment = classify_sentiment(piece.content)
        if opts.enable_topics:
            piece.topics = extract_topics(piece.content)

    # Document-level results are computed over the full, unchunked text.
    doc_entities = extract_entities(text) if opts.enable_ner else []
    doc_sentiment = classify_sentiment(text) if opts.enable_sentiment else None
    doc_topics = extract_topics(text) if opts.enable_topics else []

    return AnalyzeResponse(
        chunks=pieces,
        entities=doc_entities,
        sentiment=doc_sentiment,
        topics=doc_topics,
        summary=summarize(text),
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@router.post("/postprocess", response_model=PostprocessResponse)
async def postprocess(
    body: PostprocessRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> PostprocessResponse:
    """Run post-processing over caller-supplied chunks and report what was produced.

    The request content is used as both the raw and the cleaned content of the
    ``ProcessingContext`` handed to ``run_postprocessing``. The response
    carries the number of documents and embeddings on the returned context
    together with its quality scores.

    The tenant resolved by ``get_tenant_id`` takes precedence; ``body.tenant_id``
    is used only as a fallback when no tenant was resolved.
    """
    # Security: a value in the request body must not override the tenant
    # resolved for this request, otherwise a caller could post-process data
    # under another tenant's id. This matches /pipeline, which only ever uses
    # the resolved tenant.
    effective_tenant = tenant_id or body.tenant_id

    ctx = ProcessingContext(
        source_id=body.source_id,
        source_type=body.source_type,
        product=body.product,
        tenant_id=effective_tenant,
        raw_content=body.content,
        cleaned_content=body.content,
        chunks=body.chunks,
        options=body.options,
    )

    ctx = run_postprocessing(ctx)

    return PostprocessResponse(
        documents_created=len(ctx.documents),
        embeddings_generated=len(ctx.embeddings),
        quality_scores=ctx.quality_scores,
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@router.post("/chunk", response_model=ChunkResponse)
async def chunk(
    body: ChunkRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> ChunkResponse:
    """Split the content with the requested strategy and return the pieces.

    Chunks are produced under the placeholder source id ``"chunk-temp"`` and
    projected onto ``ChunkItem`` (index, content, token count). The response
    echoes the requested strategy.
    """
    pieces = chunk_content(
        text=body.content,
        source_id="chunk-temp",
        strategy=body.strategy,
        chunk_size=body.chunk_size,
        overlap=body.overlap,
    )

    # Keep only the fields the API exposes for each chunk.
    items = [
        ChunkItem(
            index=piece.chunk_index,
            content=piece.content,
            token_count=piece.token_count,
        )
        for piece in pieces
    ]

    return ChunkResponse(chunks=items, total_chunks=len(items), strategy=body.strategy)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ProcessingOptions(BaseModel):
    """Per-request options for chunking, analysis and result handling."""

    # Chunking parameters, forwarded to chunk_content by the /analyze route.
    chunk_strategy: str = "token_count"
    chunk_size: int = 512
    chunk_overlap: int = 50
    # Analysis toggles; /analyze skips the matching step when one is False.
    enable_ner: bool = True
    enable_sentiment: bool = True
    enable_topics: bool = True
    # NOTE(review): presumably consumed by the post-processing stage -- confirm.
    enable_embedding: bool = True
    enable_dedup: bool = True
    # When False, /pipeline reports chunks_stored as 0.
    store_results: bool = True
    embedding_model: str = "text-embedding-3-small"
    embedding_provider: str | None = None  # None = use config default
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Entity(BaseModel):
    """A named entity found in text."""

    name: str
    type: str  # PERSON, ORGANIZATION, LOCATION, DATE, etc.
    # NOTE(review): presumably character offsets into the analysed text;
    # both default to 0 -- confirm against the extractor.
    start: int = 0
    end: int = 0
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ContentChunk(BaseModel):
    """One chunk of a source's content plus optional per-chunk analysis results."""

    chunk_index: int
    content: str
    token_count: int
    parent_source_id: str
    # The fields below start empty; the /analyze route fills them in per chunk
    # when the matching option is enabled.
    entities: list[Entity] = Field(default_factory=list)
    sentiment: str | None = None
    topics: list[str] = Field(default_factory=list)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ProcessingError(BaseModel):
    """An error entry stored in ``ProcessingContext.errors``."""

    # NOTE(review): presumably the pipeline stage and task that failed,
    # with a human-readable message -- confirm against the pipeline code.
    stage: str
    task: str
    message: str
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ProcessingContext(BaseModel):
    """State object passed through processing: request inputs plus stage outputs.

    The API routes build it from a request, pass it to the pipeline or to
    post-processing, and read the result fields back from the returned context.
    """

    # Inputs supplied when the context is created.
    source_id: str
    source_type: str
    product: str
    tenant_id: str
    job_id: str | None = None
    raw_content: str
    # Result fields; all start empty/None.
    cleaned_content: str | None = None
    language: str | None = None
    chunks: list[ContentChunk] = Field(default_factory=list)
    entities: list[Entity] = Field(default_factory=list)
    sentiment: str | None = None
    topics: list[str] = Field(default_factory=list)
    summary: str | None = None
    embeddings: list[list[float]] = Field(default_factory=list)
    quality_scores: list[float] = Field(default_factory=list)
    documents: list[Any] = Field(default_factory=list)  # Final Document objects
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
    stages_completed: list[str] = Field(default_factory=list)
    errors: list[ProcessingError] = Field(default_factory=list)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# API Request/Response schemas
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class PipelineRequest(BaseModel):
    """Request body for the /pipeline route."""

    content: str  # becomes ProcessingContext.raw_content
    source_id: str
    source_type: str = "text"
    product: str = ""
    job_id: str | None = None
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class PipelineResponse(BaseModel):
    """Summary counters returned by the /pipeline route."""

    source_id: str
    chunks_produced: int
    chunks_stored: int
    chunks_deduplicated: int
    processing_time_ms: int  # elapsed time of the run, in milliseconds
    summary: str | None = None
    languages_detected: list[str] = Field(default_factory=list)
    entities_found: int
    stages_completed: list[str] = Field(default_factory=list)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class ChunkRequest(BaseModel):
    """Request body for the /chunk route."""

    content: str
    # Forwarded unchanged to chunk_content; no range validation happens here.
    strategy: str = "token_count"
    chunk_size: int = 512
    overlap: int = 50
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class ChunkItem(BaseModel):
    """One chunk as exposed by the /chunk route."""

    index: int  # taken from ContentChunk.chunk_index
    content: str
    token_count: int
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ChunkResponse(BaseModel):
    """Response body for the /chunk route."""

    chunks: list[ChunkItem]
    total_chunks: int  # number of entries in ``chunks``
    strategy: str  # echoes the strategy from the request
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class PreprocessRequest(BaseModel):
    """Request body for the /preprocess route."""

    content: str
    source_type: str = "text"  # accepted, but not read by the /preprocess route
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class PreprocessResponse(BaseModel):
    """Response body for the /preprocess route."""

    cleaned_content: str
    language: str | None = None  # None when the cleaned content is not valid
    is_valid: bool
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class AnalyzeRequest(BaseModel):
    """Request body for the /analyze route."""

    content: str
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class AnalyzeResponse(BaseModel):
    """Response body for the /analyze route: annotated chunks plus whole-text results."""

    chunks: list[ContentChunk]
    # Whole-text results; empty/None when the matching option is disabled.
    entities: list[Entity]
    sentiment: str | None = None
    topics: list[str]
    summary: str | None = None
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class PostprocessRequest(BaseModel):
    """Request body for the /postprocess route."""

    content: str  # used as both raw and cleaned content by the route
    chunks: list[ContentChunk]
    source_id: str
    source_type: str = "text"
    product: str = ""
    # Caller-supplied tenant; the route combines it with the tenant resolved
    # by get_tenant_id. Empty string means "not provided".
    tenant_id: str = ""
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class PostprocessResponse(BaseModel):
    """Response body for the /postprocess route."""

    documents_created: int  # number of entries in the context's documents
    embeddings_generated: int  # number of entries in the context's embeddings
    quality_scores: list[float]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class TopicShift(BaseModel):
    """A single detected change of topic."""

    # NOTE(review): presumably the position at which the shift occurs;
    # the unit is not visible here -- confirm against the topic splitter.
    index: int
    reason: str  # "Topic A to Topic B"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class TopicSplitResult(BaseModel):
    """Container for a list of ``TopicShift`` entries (empty by default)."""

    topic_shifts: list[TopicShift] = Field(default_factory=list)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class OCRInput(BaseModel):
    """Input for an OCR extraction request."""

    # NOTE(review): both sources are optional here; presumably the caller must
    # supply one of them -- confirm where this model is consumed.
    file_path: str | None = None
    file_bytes: bytes | None = None
    mime_type: str = "application/pdf"
    prompt: str = ""
    temperature: float = 0.1
    model: str = "gemini-2.5-flash"
    # NOTE(review): held as a plain str, so it appears in repr and dumps of
    # this model.
    api_key: str = ""
    max_output_tokens: int = 65536
|
pulse_engine/registry.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Product manifest declaration, validation, and discovery."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib.metadata
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from types import EllipsisType
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pulse_engine.extractor.base import BaseExtractor
|
|
11
|
+
from pulse_engine.processor.base import (
|
|
12
|
+
BaseCoreProcessor,
|
|
13
|
+
BasePostprocessor,
|
|
14
|
+
BasePreprocessor,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ProductManifest:
    """Declarative manifest that a product uses to register with the engine.

    For processor stages, the convention is:
    - ``...`` (Ellipsis) → use the engine's default implementation
    - ``None`` → skip the stage entirely
    - an instance → use the provided custom implementation
    """

    # Both are required: validate_manifest reports an error when either is empty.
    name: str
    version: str
    # Extractor classes (not instances). validate_manifest instantiates each
    # one to read get_config().job_type and reports duplicate job types.
    extractors: list[type[BaseExtractor]] = field(default_factory=list)
    # Processor stages; see the Ellipsis / None / instance convention above.
    preprocessor: BasePreprocessor | None | EllipsisType = ...
    core_processor: BaseCoreProcessor | None | EllipsisType = ...
    postprocessor: BasePostprocessor | None | EllipsisType = ...
    # Optional integration points; validate_manifest does not check these.
    settings_class: type[Any] | None = None
    routers: list[Any] = field(default_factory=list)
    mcp_tool_modules: list[str] = field(default_factory=list)
    celery_task_modules: list[str] = field(default_factory=list)
    athena_database: str = ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ManifestValidationError(Exception):
    """Signals that a product manifest did not pass validation.

    The individual problems stay available on ``errors``; the exception
    message is those problems joined with ``"; "``.
    """

    def __init__(self, errors: list[str]) -> None:
        joined = "; ".join(errors)
        super().__init__(f"Manifest validation failed: {joined}")
        self.errors = errors
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def validate_manifest(manifest: ProductManifest) -> list[str]:
    """Check a ProductManifest and collect every problem found.

    Returns a list of human-readable error strings; an empty list means the
    manifest is valid. Nothing is raised for invalid manifests -- failures
    while instantiating an extractor or reading its config are recorded as
    errors as well.
    """
    problems: list[str] = []

    if not manifest.name:
        problems.append("manifest.name is required")
    if not manifest.version:
        problems.append("manifest.version is required")

    # Every extractor must be a BaseExtractor subclass with a unique job_type.
    known_job_types: set[str] = set()
    for position, extractor_cls in enumerate(manifest.extractors):
        if not isinstance(extractor_cls, type) or not issubclass(
            extractor_cls, BaseExtractor
        ):
            problems.append(
                f"extractors[{position}] ({extractor_cls!r}) must be a subclass of BaseExtractor"
            )
            continue

        # Instantiating the extractor runs product code, so any failure is
        # reported as a validation error instead of propagating.
        try:
            job_type = extractor_cls().get_config().job_type
            if job_type in known_job_types:
                problems.append(
                    f"Duplicate job_type '{job_type}' "
                    f"from extractor '{extractor_cls.__name__}'"
                )
            known_job_types.add(job_type)
        except Exception as exc:
            problems.append(
                f"extractors[{position}] ({extractor_cls.__name__}).get_config() raised: {exc}"
            )

    # Each processor stage must be an instance of its ABC, None, or Ellipsis.
    stage_checks = (
        (manifest.preprocessor, BasePreprocessor, "preprocessor"),
        (manifest.core_processor, BaseCoreProcessor, "core_processor"),
        (manifest.postprocessor, BasePostprocessor, "postprocessor"),
    )
    for stage_value, stage_base, stage_name in stage_checks:
        _validate_stage(stage_value, stage_base, stage_name, problems)

    return problems
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _validate_stage(
|
|
92
|
+
value: Any,
|
|
93
|
+
abc: type[Any],
|
|
94
|
+
name: str,
|
|
95
|
+
errors: list[str],
|
|
96
|
+
) -> None:
|
|
97
|
+
if value is ... or value is None:
|
|
98
|
+
return
|
|
99
|
+
if not isinstance(value, abc):
|
|
100
|
+
errors.append(
|
|
101
|
+
f"{name} must be an instance of {abc.__name__}, None, or ... (Ellipsis); "
|
|
102
|
+
f"got {type(value).__name__}"
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def discover_products() -> list[ProductManifest]:
|
|
107
|
+
"""Discover installed products via ``pulse_engine.products`` entry-point group."""
|
|
108
|
+
manifests: list[ProductManifest] = []
|
|
109
|
+
eps = importlib.metadata.entry_points()
|
|
110
|
+
group = eps.select(group="pulse_engine.products")
|
|
111
|
+
for ep in group:
|
|
112
|
+
obj = ep.load()
|
|
113
|
+
if isinstance(obj, ProductManifest):
|
|
114
|
+
manifests.append(obj)
|
|
115
|
+
elif callable(obj):
|
|
116
|
+
manifests.append(obj())
|
|
117
|
+
return manifests
|
|
File without changes
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Lambda handler wrapper for product containers.
|
|
2
|
+
|
|
3
|
+
This is the Lambda handler pointed to by Terraform for native Lambda compute.
|
|
4
|
+
Product containers must NOT add any orchestration logic — they expose a plain
|
|
5
|
+
`entrypoint:run` function and this runner unpacks the event and calls it.
|
|
6
|
+
|
|
7
|
+
Terraform Lambda resource handler:
|
|
8
|
+
pulse_engine.runners.lambda_runner.handler
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def handler(event: dict[str, Any], context: object) -> None:
    """Lambda entry point owned by the engine; forwards the event to the product.

    The product's ``entrypoint.run`` is imported when the handler is invoked
    and called with the five required event fields as keyword arguments. A
    missing field raises ``KeyError``. ``context`` is not used and nothing is
    returned.
    """
    from entrypoint import run  # imported at call time from the product container

    required_fields = (
        "job_id",
        "chain",
        "config",
        "pulse_api_token",
        "pulse_engine_url",
    )
    run(**{field_name: event[field_name] for field_name in required_fields})
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Generic runner for pipeline module containers.
|
|
2
|
+
|
|
3
|
+
This module provides the entrypoint shim that the engine's runner package
|
|
4
|
+
(pulse-engine-runner) uses to call the product's entrypoint.run() function
|
|
5
|
+
with the new pipeline contract.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RunnerError(Exception):
    """Raised by the runner when a module entrypoint breaks the pipeline contract."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def execute_step(
    entrypoint_fn: Callable[..., dict[str, Any]],
    job_id: str,
    args: dict[str, Any],
    inputs: Any,
    pipeline_run_id: str,
    pulse_engine_url: str,
    pulse_api_token: str,
) -> dict[str, Any]:
    """Invoke a module entrypoint under the pipeline contract and check its output.

    All arguments after ``entrypoint_fn`` are passed to it by keyword. The
    entrypoint's return value is handed back unchanged when it is a dict.

    Raises:
        RunnerError: if the entrypoint returns anything other than a dict.
    """
    call_kwargs = {
        "job_id": job_id,
        "args": args,
        "inputs": inputs,
        "pipeline_run_id": pipeline_run_id,
        "pulse_engine_url": pulse_engine_url,
        "pulse_api_token": pulse_api_token,
    }
    output = entrypoint_fn(**call_kwargs)

    if isinstance(output, dict):
        return output

    raise RunnerError(
        f"Module entrypoint must return a dict (output reference), "
        f"got {type(output).__name__}"
    )
|