pulse-engine 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,192 @@
1
+ import time
2
+
3
+ from fastapi import APIRouter, Depends
4
+
5
+ from pulse_engine.dependencies import get_processing_pipeline
6
+ from pulse_engine.middleware.tenant import get_tenant_id
7
+ from pulse_engine.processor.core.analysis import (
8
+ classify_sentiment,
9
+ extract_entities,
10
+ extract_topics,
11
+ summarize,
12
+ )
13
+ from pulse_engine.processor.core.chunking import chunk_content
14
+ from pulse_engine.processor.pipeline import ProcessingPipeline
15
+ from pulse_engine.processor.postprocessor.tasks import (
16
+ run_postprocessing,
17
+ )
18
+ from pulse_engine.processor.preprocessor.tasks import (
19
+ clean_html,
20
+ detect_language,
21
+ normalize_text,
22
+ validate_content,
23
+ )
24
+ from pulse_engine.processor.schemas import (
25
+ AnalyzeRequest,
26
+ AnalyzeResponse,
27
+ ChunkItem,
28
+ ChunkRequest,
29
+ ChunkResponse,
30
+ PipelineRequest,
31
+ PipelineResponse,
32
+ PostprocessRequest,
33
+ PostprocessResponse,
34
+ PreprocessRequest,
35
+ PreprocessResponse,
36
+ ProcessingContext,
37
+ )
38
+
39
+ router = APIRouter(prefix="/process", tags=["Processor"])
40
+
41
+
42
@router.post("/pipeline", response_model=PipelineResponse)
async def run_pipeline(
    body: PipelineRequest,
    tenant_id: str = Depends(get_tenant_id),
    pipeline: ProcessingPipeline = Depends(get_processing_pipeline),
) -> PipelineResponse:
    """Run the injected processing pipeline over the submitted content.

    A ``ProcessingContext`` is built from the request body plus the tenant
    resolved by ``get_tenant_id``, handed to ``pipeline.run``, and the
    returned context is condensed into a ``PipelineResponse``.
    """
    started_at = time.monotonic()

    result = await pipeline.run(
        ProcessingContext(
            source_id=body.source_id,
            source_type=body.source_type,
            product=body.product,
            tenant_id=tenant_id,
            job_id=body.job_id,
            raw_content=body.content,
            options=body.options,
        )
    )

    # Duration of the pipeline call, truncated to whole milliseconds.
    duration_ms = int((time.monotonic() - started_at) * 1000)

    n_documents = len(result.documents)
    n_chunks = len(result.chunks)

    # Documents are only reported as stored when the options ask for storage.
    stored_count = n_documents if result.options.store_results else 0

    # The previous "total_chunks" intermediate always evaluated to the
    # document count, so this is the same value written directly.
    # NOTE(review): this is non-zero only when there are more documents than
    # chunks — confirm that matches the intended meaning of "deduplicated".
    dedup_count = max(0, n_documents - n_chunks)

    # A missing/empty language or the literal "unknown" yields no languages.
    detected = result.language
    if detected and detected != "unknown":
        languages = [detected]
    else:
        languages = []

    return PipelineResponse(
        source_id=result.source_id,
        chunks_produced=n_chunks,
        chunks_stored=stored_count,
        chunks_deduplicated=dedup_count,
        processing_time_ms=duration_ms,
        summary=result.summary,
        languages_detected=languages,
        entities_found=len(result.entities),
        stages_completed=result.stages_completed,
    )
85
+
86
+
87
@router.post("/preprocess", response_model=PreprocessResponse)
async def preprocess(
    body: PreprocessRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> PreprocessResponse:
    """Clean and normalise the submitted content, then validate it.

    The content goes through ``clean_html`` and ``normalize_text``. Language
    detection only runs when the cleaned text passes ``validate_content``;
    otherwise ``language`` is ``None``.

    ``tenant_id`` is resolved through ``get_tenant_id`` but not read here.
    """
    cleaned = normalize_text(clean_html(body.content))

    # Validate once and reuse the verdict for both the language gate and the
    # response, instead of running the same check twice on the same text.
    is_valid = validate_content(cleaned)
    lang = detect_language(cleaned) if is_valid else None

    return PreprocessResponse(
        cleaned_content=cleaned,
        language=lang,
        is_valid=is_valid,
    )
102
+
103
+
104
@router.post("/analyze", response_model=AnalyzeResponse)
async def analyze(
    body: AnalyzeRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> AnalyzeResponse:
    """Chunk the content and run the enabled analyses on it.

    Entities, sentiment and topics are computed per chunk and again for the
    whole content, each gated by its ``enable_*`` option. The summary is
    always produced from the whole content.

    ``tenant_id`` is resolved through ``get_tenant_id`` but not read here.
    """
    opts = body.options
    text = body.content

    pieces = chunk_content(
        text=text,
        source_id="analyze-temp",
        strategy=opts.chunk_strategy,
        chunk_size=opts.chunk_size,
        overlap=opts.chunk_overlap,
    )

    # Per-chunk annotations are written onto the chunk objects in place.
    for piece in pieces:
        if opts.enable_ner:
            piece.entities = extract_entities(piece.content)
        if opts.enable_sentiment:
            piece.sentiment = classify_sentiment(piece.content)
        if opts.enable_topics:
            piece.topics = extract_topics(piece.content)

    # Whole-document results; disabled analyses fall back to empty/None.
    doc_entities = extract_entities(text) if opts.enable_ner else []
    doc_sentiment = classify_sentiment(text) if opts.enable_sentiment else None
    doc_topics = extract_topics(text) if opts.enable_topics else []
    doc_summary = summarize(text)

    return AnalyzeResponse(
        chunks=pieces,
        entities=doc_entities,
        sentiment=doc_sentiment,
        topics=doc_topics,
        summary=doc_summary,
    )
139
+
140
+
141
@router.post("/postprocess", response_model=PostprocessResponse)
async def postprocess(
    body: PostprocessRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> PostprocessResponse:
    """Run post-processing over chunks supplied by the caller.

    The request content is used as both the raw and the cleaned content of
    the context. The response reports how many documents and embeddings
    ``run_postprocessing`` left on the context, plus its quality scores.
    """
    # A non-empty tenant_id in the body takes precedence over the one
    # resolved by get_tenant_id.
    # NOTE(review): this lets the request body choose the tenant — confirm
    # that is intended for this endpoint.
    effective_tenant = body.tenant_id or tenant_id

    initial_ctx = ProcessingContext(
        source_id=body.source_id,
        source_type=body.source_type,
        product=body.product,
        tenant_id=effective_tenant,
        raw_content=body.content,
        cleaned_content=body.content,
        chunks=body.chunks,
        options=body.options,
    )

    result = run_postprocessing(initial_ctx)

    return PostprocessResponse(
        documents_created=len(result.documents),
        embeddings_generated=len(result.embeddings),
        quality_scores=result.quality_scores,
    )
164
+
165
+
166
@router.post("/chunk", response_model=ChunkResponse)
async def chunk(
    body: ChunkRequest,
    tenant_id: str = Depends(get_tenant_id),
) -> ChunkResponse:
    """Split the content into chunks and return them without analysis.

    ``tenant_id`` is resolved through ``get_tenant_id`` but not read here.
    """
    pieces = chunk_content(
        text=body.content,
        source_id="chunk-temp",
        strategy=body.strategy,
        chunk_size=body.chunk_size,
        overlap=body.overlap,
    )

    # Project each chunk down to the three fields exposed by the API.
    items = []
    for piece in pieces:
        items.append(
            ChunkItem(
                index=piece.chunk_index,
                content=piece.content,
                token_count=piece.token_count,
            )
        )

    return ChunkResponse(
        chunks=items,
        total_chunks=len(items),
        strategy=body.strategy,
    )
@@ -0,0 +1,167 @@
1
+ from typing import Any
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ProcessingOptions(BaseModel):
    # Processing settings. Every field has a default, so an options object
    # built with no arguments is valid.

    # Chunking parameters.
    chunk_strategy: str = "token_count"
    chunk_size: int = 512
    chunk_overlap: int = 50

    # Feature toggles; all are on by default.
    enable_ner: bool = True
    enable_sentiment: bool = True
    enable_topics: bool = True
    enable_embedding: bool = True
    enable_dedup: bool = True
    store_results: bool = True

    # Embedding configuration.
    embedding_model: str = "text-embedding-3-small"
    embedding_provider: str | None = None  # None = use config default
18
+
19
+
20
class Entity(BaseModel):
    # A named entity: its text and a type label, plus a start/end pair that
    # defaults to 0.
    name: str
    type: str  # PERSON, ORGANIZATION, LOCATION, DATE, etc.
    # Presumably positions of the entity in the analysed text — TODO confirm
    # the unit (characters vs. tokens).
    start: int = 0
    end: int = 0
25
+
26
+
27
class ContentChunk(BaseModel):
    # One chunk of a source's content together with its optional
    # per-chunk analysis results.
    chunk_index: int
    content: str
    token_count: int
    parent_source_id: str

    # Analysis results; empty / None until something fills them in.
    entities: list[Entity] = Field(default_factory=list)
    sentiment: str | None = None
    topics: list[str] = Field(default_factory=list)
35
+
36
+
37
class ProcessingError(BaseModel):
    # An error record: the stage and task it belongs to, and a message.
    stage: str
    task: str
    message: str
41
+
42
+
43
class ProcessingContext(BaseModel):
    # State object for one processing run: the input, the options, and
    # slots for everything produced along the way.

    # Identity of the content being processed.
    source_id: str
    source_type: str
    product: str
    tenant_id: str
    job_id: str | None = None

    # Input text and its cleaned form.
    raw_content: str
    cleaned_content: str | None = None
    language: str | None = None

    # Analysis results.
    chunks: list[ContentChunk] = Field(default_factory=list)
    entities: list[Entity] = Field(default_factory=list)
    sentiment: str | None = None
    topics: list[str] = Field(default_factory=list)
    summary: str | None = None

    # Post-processing results.
    embeddings: list[list[float]] = Field(default_factory=list)
    quality_scores: list[float] = Field(default_factory=list)
    documents: list[Any] = Field(default_factory=list)  # Final Document objects

    # Settings and bookkeeping.
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
    stages_completed: list[str] = Field(default_factory=list)
    errors: list[ProcessingError] = Field(default_factory=list)
63
+
64
+
65
+ # API Request/Response schemas
66
+
67
+
68
class PipelineRequest(BaseModel):
    # Request body for a full pipeline run. Only content and source_id are
    # required; the remaining fields have defaults.
    content: str
    source_id: str
    source_type: str = "text"
    product: str = ""
    job_id: str | None = None
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
75
+
76
+
77
class PipelineResponse(BaseModel):
    # Summary of a pipeline run: chunk counters, timing, and a few
    # headline results.
    source_id: str

    # Counters.
    chunks_produced: int
    chunks_stored: int
    chunks_deduplicated: int
    processing_time_ms: int

    # Headline results.
    summary: str | None = None
    languages_detected: list[str] = Field(default_factory=list)
    entities_found: int
    stages_completed: list[str] = Field(default_factory=list)
87
+
88
+
89
class ChunkRequest(BaseModel):
    # Request body for chunking only. The defaults match the chunking
    # defaults on ProcessingOptions.
    content: str
    strategy: str = "token_count"
    chunk_size: int = 512
    overlap: int = 50
94
+
95
+
96
class ChunkItem(BaseModel):
    # A single chunk as returned by the chunking endpoint: position, text,
    # and token count.
    index: int
    content: str
    token_count: int
100
+
101
+
102
class ChunkResponse(BaseModel):
    # Response of the chunking endpoint: the chunks, how many there are, and
    # the strategy that was requested.
    chunks: list[ChunkItem]
    total_chunks: int
    strategy: str
106
+
107
+
108
class PreprocessRequest(BaseModel):
    # Request body for preprocessing only.
    content: str
    source_type: str = "text"
111
+
112
+
113
class PreprocessResponse(BaseModel):
    # Result of preprocessing: the cleaned text, an optional language, and
    # the validation verdict.
    cleaned_content: str
    language: str | None = None
    is_valid: bool
117
+
118
+
119
class AnalyzeRequest(BaseModel):
    # Request body for analysis only; options default to ProcessingOptions().
    content: str
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
122
+
123
+
124
class AnalyzeResponse(BaseModel):
    # Result of analysis: the chunks plus top-level entities, sentiment,
    # topics and summary.
    chunks: list[ContentChunk]
    entities: list[Entity]
    sentiment: str | None = None
    topics: list[str]
    summary: str | None = None
130
+
131
+
132
class PostprocessRequest(BaseModel):
    # Request body for post-processing chunks supplied by the caller.
    content: str
    chunks: list[ContentChunk]
    source_id: str
    source_type: str = "text"
    product: str = ""
    # Defaults to the empty string rather than None.
    tenant_id: str = ""
    options: ProcessingOptions = Field(default_factory=ProcessingOptions)
140
+
141
+
142
class PostprocessResponse(BaseModel):
    # Result of post-processing: two counters and a list of quality scores.
    documents_created: int
    embeddings_generated: int
    quality_scores: list[float]
146
+
147
+
148
class TopicShift(BaseModel):
    # One detected topic shift: an index and a textual reason.
    # NOTE(review): what the index refers to (sentence, paragraph, ...) is
    # not visible here — confirm against the topic splitter.
    index: int
    reason: str  # "Topic A to Topic B"
151
+
152
+
153
class TopicSplitResult(BaseModel):
    # Container for a list of topic shifts; empty by default.
    topic_shifts: list[TopicShift] = Field(default_factory=list)
155
+
156
+
157
class OCRInput(BaseModel):
    """Input for an OCR extraction request."""

    # Document to process. Both fields are optional at the schema level.
    # Presumably exactly one of them is supplied — verify against the caller.
    file_path: str | None = None
    file_bytes: bytes | None = None
    mime_type: str = "application/pdf"

    # Model invocation settings.
    prompt: str = ""
    temperature: float = 0.1
    model: str = "gemini-2.5-flash"
    api_key: str = ""
    max_output_tokens: int = 65536
@@ -0,0 +1,117 @@
1
+ """Product manifest declaration, validation, and discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib.metadata
6
+ from dataclasses import dataclass, field
7
+ from types import EllipsisType
8
+ from typing import Any
9
+
10
+ from pulse_engine.extractor.base import BaseExtractor
11
+ from pulse_engine.processor.base import (
12
+ BaseCoreProcessor,
13
+ BasePostprocessor,
14
+ BasePreprocessor,
15
+ )
16
+
17
+
18
+ @dataclass
19
+ class ProductManifest:
20
+ """Declarative manifest that a product uses to register with the engine.
21
+
22
+ For processor stages, the convention is:
23
+ - ``...`` (Ellipsis) → use the engine's default implementation
24
+ - ``None`` → skip the stage entirely
25
+ - an instance → use the provided custom implementation
26
+ """
27
+
28
+ name: str
29
+ version: str
30
+ extractors: list[type[BaseExtractor]] = field(default_factory=list)
31
+ preprocessor: BasePreprocessor | None | EllipsisType = ...
32
+ core_processor: BaseCoreProcessor | None | EllipsisType = ...
33
+ postprocessor: BasePostprocessor | None | EllipsisType = ...
34
+ settings_class: type[Any] | None = None
35
+ routers: list[Any] = field(default_factory=list)
36
+ mcp_tool_modules: list[str] = field(default_factory=list)
37
+ celery_task_modules: list[str] = field(default_factory=list)
38
+ athena_database: str = ""
39
+
40
+
41
class ManifestValidationError(Exception):
    """Error carrying the list of problems found in a product manifest.

    The individual messages stay available on ``errors``; the exception
    message joins them with ``"; "``.
    """

    def __init__(self, errors: list[str]) -> None:
        self.errors = errors
        joined = "; ".join(errors)
        super().__init__(f"Manifest validation failed: {joined}")
47
+
48
+
49
def validate_manifest(manifest: ProductManifest) -> list[str]:
    """Check a ProductManifest and collect every problem found.

    Returns a list of human-readable error strings; an empty list means the
    manifest is valid. Nothing is raised for an invalid manifest.
    """
    problems: list[str] = []

    if not manifest.name:
        problems.append("manifest.name is required")
    if not manifest.version:
        problems.append("manifest.version is required")

    # Each extractor must be a BaseExtractor subclass, and no two extractors
    # may report the same job_type.
    job_types_seen: set[str] = set()
    for idx, candidate in enumerate(manifest.extractors):
        is_extractor_class = isinstance(candidate, type) and issubclass(
            candidate, BaseExtractor
        )
        if not is_extractor_class:
            problems.append(
                f"extractors[{idx}] ({candidate!r}) must be a subclass of BaseExtractor"
            )
            continue

        # The extractor is instantiated to read its config. Any failure while
        # doing so is reported as a validation error rather than propagated.
        try:
            job_type = candidate().get_config().job_type
            if job_type in job_types_seen:
                problems.append(
                    f"Duplicate job_type '{job_type}' "
                    f"from extractor '{candidate.__name__}'"
                )
            job_types_seen.add(job_type)
        except Exception as exc:
            problems.append(
                f"extractors[{idx}] ({candidate.__name__}).get_config() raised: {exc}"
            )

    # Processor stages must be an instance of the matching base class, None,
    # or Ellipsis.
    stage_checks = (
        (manifest.preprocessor, BasePreprocessor, "preprocessor"),
        (manifest.core_processor, BaseCoreProcessor, "core_processor"),
        (manifest.postprocessor, BasePostprocessor, "postprocessor"),
    )
    for stage_value, stage_base, stage_name in stage_checks:
        _validate_stage(stage_value, stage_base, stage_name, problems)

    return problems
89
+
90
+
91
+ def _validate_stage(
92
+ value: Any,
93
+ abc: type[Any],
94
+ name: str,
95
+ errors: list[str],
96
+ ) -> None:
97
+ if value is ... or value is None:
98
+ return
99
+ if not isinstance(value, abc):
100
+ errors.append(
101
+ f"{name} must be an instance of {abc.__name__}, None, or ... (Ellipsis); "
102
+ f"got {type(value).__name__}"
103
+ )
104
+
105
+
106
+ def discover_products() -> list[ProductManifest]:
107
+ """Discover installed products via ``pulse_engine.products`` entry-point group."""
108
+ manifests: list[ProductManifest] = []
109
+ eps = importlib.metadata.entry_points()
110
+ group = eps.select(group="pulse_engine.products")
111
+ for ep in group:
112
+ obj = ep.load()
113
+ if isinstance(obj, ProductManifest):
114
+ manifests.append(obj)
115
+ elif callable(obj):
116
+ manifests.append(obj())
117
+ return manifests
File without changes
@@ -0,0 +1,26 @@
1
+ """Lambda handler wrapper for product containers.
2
+
3
+ This is the Lambda handler pointed to by Terraform for native Lambda compute.
4
+ Product containers must NOT add any orchestration logic — they expose a plain
5
+ `entrypoint:run` function and this runner unpacks the event and calls it.
6
+
7
+ Terraform Lambda resource handler:
8
+ pulse_engine.runners.lambda_runner.handler
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+
16
def handler(event: dict[str, Any], context: object) -> None:
    """Engine-owned Lambda handler that forwards the event to the product.

    The product's ``entrypoint.run`` is imported when the handler is called,
    and five required event fields are passed to it as keyword arguments. A
    missing field raises ``KeyError``. ``context`` is unused and nothing is
    returned.
    """
    from entrypoint import run  # imported at call time from the product container

    required_fields = (
        "job_id",
        "chain",
        "config",
        "pulse_api_token",
        "pulse_engine_url",
    )
    run(**{field_name: event[field_name] for field_name in required_fields})
@@ -0,0 +1,43 @@
1
+ """Generic runner for pipeline module containers.
2
+
3
+ This module provides the entrypoint shim that the engine's runner package
4
+ (pulse-engine-runner) uses to call the product's entrypoint.run() function
5
+ with the new pipeline contract.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Callable
11
+ from typing import Any
12
+
13
+
14
class RunnerError(Exception):
    """Error raised by the pipeline runner."""
16
+
17
+
18
def execute_step(
    entrypoint_fn: Callable[..., dict[str, Any]],
    job_id: str,
    args: dict[str, Any],
    inputs: Any,
    pipeline_run_id: str,
    pulse_engine_url: str,
    pulse_api_token: str,
) -> dict[str, Any]:
    """Invoke a module entrypoint with the pipeline contract.

    All arguments after ``entrypoint_fn`` are forwarded to it by keyword.

    Returns:
        The dict returned by the entrypoint, unchanged.

    Raises:
        RunnerError: if the entrypoint returns anything other than a dict.
    """
    contract = {
        "job_id": job_id,
        "args": args,
        "inputs": inputs,
        "pipeline_run_id": pipeline_run_id,
        "pulse_engine_url": pulse_engine_url,
        "pulse_api_token": pulse_api_token,
    }
    outcome = entrypoint_fn(**contract)

    if isinstance(outcome, dict):
        return outcome

    raise RunnerError(
        f"Module entrypoint must return a dict (output reference), "
        f"got {type(outcome).__name__}"
    )