pulse-engine 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,261 @@
1
+ """YouTube Data API v3 channel metadata adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from datetime import UTC, datetime, timedelta
7
+ from typing import Any
8
+
9
+ import httpx
10
+ import structlog
11
+
12
+ from .models import VideoMetadata
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
+ _YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
17
+ _MAX_PAGE_SIZE = 50
18
+
19
+ _ISO8601_DURATION_RE = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?")
20
+
21
+
22
+ def _parse_duration(duration: str) -> int:
23
+ """Parse an ISO 8601 duration string (e.g. ``PT15M33S``) to seconds."""
24
+ m = _ISO8601_DURATION_RE.search(duration)
25
+ if not m:
26
+ return 0
27
+ hours = int(m.group(1) or 0)
28
+ minutes = int(m.group(2) or 0)
29
+ seconds = int(m.group(3) or 0)
30
+ return hours * 3600 + minutes * 60 + seconds
31
+
32
+
33
+ def _is_live_or_upcoming(video: dict[str, Any]) -> bool:
34
+ lbc = (video.get("snippet") or {}).get("liveBroadcastContent") or "none"
35
+ return lbc in ("live", "upcoming")
36
+
37
+
38
+ class YouTubeMetadataAdapter:
39
+ """Discovers video metadata from a YouTube channel via the Data API v3.
40
+
41
+ Mirrors the ``DigitalNewsMetadataAdapter`` interface: construct with
42
+ connection config, call ``fetch()`` to get a list of ``VideoMetadata``.
43
+
44
+ Usage::
45
+
46
+ adapter = YouTubeMetadataAdapter(
47
+ channel_name="@mkbhd",
48
+ api_key="YOUR_YT_API_KEY",
49
+ )
50
+ videos = await adapter.fetch(since=timedelta(days=7), max_videos=50)
51
+ """
52
+
53
+ def __init__(self, channel_name: str, api_key: str) -> None:
54
+ self._channel_name = channel_name
55
+ self._api_key = api_key
56
+
57
+ async def fetch(
58
+ self,
59
+ since: timedelta | None = None,
60
+ max_videos: int = 50,
61
+ min_duration_seconds: int = 0,
62
+ skip_live_streams: bool = True,
63
+ ) -> list[VideoMetadata]:
64
+ """Fetch video metadata from the channel.
65
+
66
+ Args:
67
+ since: Only return videos published within this window.
68
+ max_videos: Upper bound on videos returned.
69
+ min_duration_seconds: Skip videos shorter than this threshold.
70
+ skip_live_streams: Drop live and scheduled broadcasts.
71
+
72
+ Returns:
73
+ List of ``VideoMetadata`` ordered newest-first from the API.
74
+ """
75
+ published_after: str | None = None
76
+ if since is not None:
77
+ cutoff = datetime.now(UTC) - since
78
+ published_after = cutoff.isoformat()
79
+
80
+ logger.info(
81
+ "youtube_metadata_fetch_started",
82
+ channel=self._channel_name,
83
+ since=str(since),
84
+ max_videos=max_videos,
85
+ )
86
+
87
+ async with httpx.AsyncClient(timeout=30.0) as client:
88
+ channel_id = await self._resolve_channel_id(client)
89
+ playlist_id = await self._get_uploads_playlist_id(client, channel_id)
90
+ video_ids = await self._list_playlist_video_ids(
91
+ client,
92
+ playlist_id,
93
+ max_videos=max_videos,
94
+ published_after=published_after,
95
+ )
96
+ raw_videos = await self._get_video_details(client, video_ids)
97
+
98
+ videos = self._apply_filters(
99
+ raw_videos,
100
+ min_duration_seconds=min_duration_seconds,
101
+ skip_live_streams=skip_live_streams,
102
+ )
103
+
104
+ result = [self._to_model(v) for v in videos]
105
+ logger.info(
106
+ "youtube_metadata_fetch_completed",
107
+ channel=self._channel_name,
108
+ returned=len(result),
109
+ )
110
+ return result
111
+
112
+ # ------------------------------------------------------------------
113
+ # Private helpers
114
+ # ------------------------------------------------------------------
115
+
116
+ async def _resolve_channel_id(self, client: httpx.AsyncClient) -> str:
117
+ """Resolve a channel handle or display name to a channel ID."""
118
+ handle = self._channel_name.lstrip("@")
119
+ resp = await client.get(
120
+ f"{_YOUTUBE_API_BASE}/channels",
121
+ params={"part": "id", "forHandle": handle, "key": self._api_key},
122
+ )
123
+ resp.raise_for_status()
124
+ items = resp.json().get("items", [])
125
+ if items:
126
+ return str(items[0]["id"])
127
+
128
+ # Fall back to search by display name.
129
+ resp = await client.get(
130
+ f"{_YOUTUBE_API_BASE}/search",
131
+ params={
132
+ "part": "snippet",
133
+ "type": "channel",
134
+ "q": self._channel_name,
135
+ "maxResults": 1,
136
+ "key": self._api_key,
137
+ },
138
+ )
139
+ resp.raise_for_status()
140
+ items = resp.json().get("items", [])
141
+ if not items:
142
+ raise ValueError(f"YouTube channel not found: '{self._channel_name}'")
143
+ return str(items[0]["snippet"]["channelId"])
144
+
145
+ async def _get_uploads_playlist_id(
146
+ self, client: httpx.AsyncClient, channel_id: str
147
+ ) -> str:
148
+ resp = await client.get(
149
+ f"{_YOUTUBE_API_BASE}/channels",
150
+ params={"part": "contentDetails", "id": channel_id, "key": self._api_key},
151
+ )
152
+ resp.raise_for_status()
153
+ items = resp.json().get("items", [])
154
+ if not items:
155
+ raise ValueError(f"Channel not found: {channel_id}")
156
+ return str(items[0]["contentDetails"]["relatedPlaylists"]["uploads"])
157
+
158
+ async def _list_playlist_video_ids(
159
+ self,
160
+ client: httpx.AsyncClient,
161
+ playlist_id: str,
162
+ max_videos: int,
163
+ published_after: str | None,
164
+ ) -> list[str]:
165
+ video_ids: list[str] = []
166
+ page_token: str | None = None
167
+
168
+ while len(video_ids) < max_videos:
169
+ page_size = min(_MAX_PAGE_SIZE, max_videos - len(video_ids))
170
+ params: dict[str, Any] = {
171
+ "part": "contentDetails,snippet",
172
+ "playlistId": playlist_id,
173
+ "maxResults": page_size,
174
+ "key": self._api_key,
175
+ }
176
+ if page_token:
177
+ params["pageToken"] = page_token
178
+
179
+ resp = await client.get(f"{_YOUTUBE_API_BASE}/playlistItems", params=params)
180
+ resp.raise_for_status()
181
+ data = resp.json()
182
+
183
+ for item in data.get("items", []):
184
+ vid_id = item["contentDetails"]["videoId"]
185
+ if published_after:
186
+ published = item["snippet"].get("publishedAt", "")
187
+ if published < published_after:
188
+ # Playlist is newest-first; older video means we're done.
189
+ return video_ids
190
+ video_ids.append(vid_id)
191
+
192
+ page_token = data.get("nextPageToken")
193
+ if not page_token:
194
+ break
195
+
196
+ return video_ids
197
+
198
+ async def _get_video_details(
199
+ self, client: httpx.AsyncClient, video_ids: list[str]
200
+ ) -> list[dict[str, Any]]:
201
+ if not video_ids:
202
+ return []
203
+ details: list[dict[str, Any]] = []
204
+ for i in range(0, len(video_ids), _MAX_PAGE_SIZE):
205
+ batch = video_ids[i : i + _MAX_PAGE_SIZE]
206
+ resp = await client.get(
207
+ f"{_YOUTUBE_API_BASE}/videos",
208
+ params={
209
+ "part": "snippet,contentDetails,statistics",
210
+ "id": ",".join(batch),
211
+ "key": self._api_key,
212
+ },
213
+ )
214
+ resp.raise_for_status()
215
+ details.extend(resp.json().get("items", []))
216
+ return details
217
+
218
+ def _apply_filters(
219
+ self,
220
+ videos: list[dict[str, Any]],
221
+ min_duration_seconds: int,
222
+ skip_live_streams: bool,
223
+ ) -> list[dict[str, Any]]:
224
+ result: list[dict[str, Any]] = []
225
+ for v in videos:
226
+ if skip_live_streams and _is_live_or_upcoming(v):
227
+ continue
228
+ if min_duration_seconds > 0:
229
+ details = v.get("contentDetails", {})
230
+ dur = _parse_duration(details.get("duration", "PT0S"))
231
+ if dur < min_duration_seconds:
232
+ continue
233
+ result.append(v)
234
+ return result
235
+
236
+ @staticmethod
237
+ def _to_model(v: dict[str, Any]) -> VideoMetadata:
238
+ snippet = v["snippet"]
239
+ pub_at = datetime.fromisoformat(snippet["publishedAt"].replace("Z", "+00:00"))
240
+ thumbnails = snippet.get("thumbnails", {})
241
+ thumbnail_url = (
242
+ (thumbnails.get("maxres") or thumbnails.get("high") or thumbnails.get("medium") or {})
243
+ .get("url")
244
+ )
245
+ return VideoMetadata(
246
+ video_id=v["id"],
247
+ url=f"https://www.youtube.com/watch?v={v['id']}",
248
+ title=snippet["title"],
249
+ channel_id=snippet["channelId"],
250
+ channel_name=snippet["channelTitle"],
251
+ published_at=pub_at,
252
+ description=snippet.get("description", ""),
253
+ tags=snippet.get("tags", []),
254
+ duration_seconds=_parse_duration(
255
+ v.get("contentDetails", {}).get("duration", "PT0S")
256
+ ),
257
+ view_count=v.get("statistics", {}).get("viewCount"),
258
+ like_count=v.get("statistics", {}).get("likeCount"),
259
+ comment_count=v.get("statistics", {}).get("commentCount"),
260
+ thumbnail_url=thumbnail_url,
261
+ )
File without changes
File without changes
@@ -0,0 +1,91 @@
1
+ """Authentication endpoint — exchanges email/password for Cognito tokens."""
2
+
3
+ import base64
4
+ import hashlib
5
+ import hmac
6
+
7
+ import boto3
8
+ import structlog
9
+ from botocore.exceptions import ClientError
10
+ from fastapi import APIRouter, HTTPException, Request
11
+ from jose import jwt as jose_jwt
12
+ from pydantic import BaseModel, EmailStr, Field
13
+
14
+ from pulse_engine.config import get_settings
15
+ from pulse_engine.middleware.rate_limit import check_rate_limit
16
+
17
+ logger = structlog.get_logger()
18
+
19
+ router = APIRouter(prefix="/auth", tags=["Auth"])
20
+
21
+
22
class LoginRequest(BaseModel):
    """Request body for ``POST /auth/login``."""

    # Validated as an e-mail address by pydantic.
    email: EmailStr
    # Length bounds are enforced at request-validation time.
    password: str = Field(min_length=8, max_length=128)
25
+
26
+
27
class LoginResponse(BaseModel):
    """Response body for ``POST /auth/login``: the tokens plus identity claims."""

    # Tokens and lifetime copied from the Cognito authentication result.
    id_token: str
    access_token: str
    refresh_token: str
    expires_in: int
    token_type: str = "Bearer"
    # Values read from the id-token claims by the login handler.
    tenant_id: str
    email: str
35
+
36
+
37
+ def _compute_secret_hash(email: str, client_id: str, client_secret: str) -> str:
38
+ message = email + client_id
39
+ digest = hmac.new(
40
+ client_secret.encode("utf-8"),
41
+ message.encode("utf-8"),
42
+ hashlib.sha256,
43
+ ).digest()
44
+ return base64.b64encode(digest).decode("utf-8")
45
+
46
+
47
@router.post("/login", response_model=LoginResponse)
async def login(request: Request, body: LoginRequest) -> LoginResponse:
    """Authenticate with email and password, returns JWT tokens.

    Args:
        request: Incoming request, used for per-client rate limiting.
        body: Validated credentials.

    Returns:
        The Cognito tokens together with the tenant ID and e-mail taken from
        the id-token claims.

    Raises:
        HTTPException: 401 for bad credentials or an unknown user, 403 when
            Cognito answers with a challenge instead of tokens, 500 for any
            other Cognito client error.
    """
    # Local import so this handler does not depend on a module-level change.
    import asyncio

    # Strict per-IP rate limit: 5 attempts per 60 seconds
    check_rate_limit(request, scope="login", max_requests=5, window_seconds=60)
    settings = get_settings()

    client_id = settings.cognito_app_client_id
    client_secret = settings.cognito_app_client_secret

    auth_params: dict[str, str] = {
        "USERNAME": body.email,
        "PASSWORD": body.password,
    }

    # App clients configured with a secret require SECRET_HASH on every call.
    if client_secret:
        auth_params["SECRET_HASH"] = _compute_secret_hash(
            body.email, client_id, client_secret
        )

    def _initiate_auth() -> dict:
        # boto3 is synchronous; client creation and the network call both run
        # in a worker thread so the event loop is not blocked.
        cognito = boto3.client("cognito-idp", region_name=settings.aws_region)
        return cognito.initiate_auth(
            ClientId=client_id,
            AuthFlow="USER_PASSWORD_AUTH",
            AuthParameters=auth_params,
        )

    try:
        result = await asyncio.to_thread(_initiate_auth)
    except ClientError as e:
        code = e.response["Error"]["Code"]
        if code in ("NotAuthorizedException", "UserNotFoundException"):
            # Same message for both cases so callers cannot probe for accounts.
            raise HTTPException(
                status_code=401, detail="Invalid email or password"
            ) from e
        logger.error("cognito_auth_error", error=str(e), code=code)
        raise HTTPException(
            status_code=500, detail="Authentication service error"
        ) from e

    auth = result.get("AuthenticationResult")
    if not auth:
        # Cognito returned a challenge (e.g. forced password change or MFA)
        # instead of tokens; this endpoint cannot complete such flows.
        challenge = result.get("ChallengeName", "unknown")
        logger.warning("cognito_auth_challenge_unsupported", challenge=challenge)
        raise HTTPException(
            status_code=403,
            detail=f"Additional authentication step required: {challenge}",
        )

    # Decode id_token (without verification) to extract tenant_id
    claims = jose_jwt.get_unverified_claims(auth["IdToken"])
    return LoginResponse(
        id_token=auth["IdToken"],
        access_token=auth["AccessToken"],
        refresh_token=auth["RefreshToken"],
        expires_in=auth["ExpiresIn"],
        tenant_id=claims.get("custom:tenant_id", ""),
        email=claims.get("email", body.email),
    )
@@ -0,0 +1,62 @@
1
+ import httpx
2
+ import structlog
3
+ from fastapi import APIRouter, Depends, Request
4
+ from pydantic import BaseModel
5
+
6
+ from pulse_engine.config import Settings, get_settings
7
+ from pulse_engine.services.opensearch import OpenSearchService
8
+
9
+ router = APIRouter()
10
+ logger = structlog.get_logger()
11
+
12
+
13
class ServiceHealth(BaseModel):
    """Per-dependency status strings reported by the health endpoint."""

    # Each field carries "up" or "down" as produced by the check helpers.
    opensearch: str
    prefect: str
16
+
17
+
18
class HealthResponse(BaseModel):
    """Response body of ``GET /health``."""

    # "ok" when every dependency is up, otherwise "degraded".
    status: str
    # Application version and environment taken from settings.
    version: str
    environment: str
    # Individual dependency results.
    services: ServiceHealth
23
+
24
+
25
async def _check_opensearch(opensearch: OpenSearchService) -> str:
    """Return ``"up"`` when the OpenSearch ping is truthy, else ``"down"``."""
    reachable = await opensearch.ping()
    return "up" if reachable else "down"
29
+
30
+
31
async def _check_prefect(url: str) -> str:
    """Probe the Prefect API health endpoint.

    Args:
        url: Base URL of the Prefect API; ``/health`` is appended to it.

    Returns:
        ``"up"`` when the endpoint answers with a success status, ``"down"``
        on any failure (connection error, timeout, error status).
    """
    try:
        # Tolerate a trailing slash on the configured URL ("…/api/").
        health_url = f"{url.rstrip('/')}/health"
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(health_url)
            resp.raise_for_status()
    except Exception as exc:
        # Deliberately broad: a health probe must report, never raise. The
        # cause is logged so a "down" status can be diagnosed.
        logger.warning("prefect_health_check_failed", url=url, error=str(exc))
        return "down"
    return "up"
40
+
41
+
42
@router.get("/health", response_model=HealthResponse)
async def health_check(
    request: Request,
    settings: Settings = Depends(get_settings),
) -> HealthResponse:
    """Report overall and per-dependency health.

    OpenSearch is checked first, then Prefect; the overall status is
    ``"ok"`` only when both report ``"up"``, otherwise ``"degraded"``.
    """
    search_service: OpenSearchService = request.app.state.opensearch
    search_state = await _check_opensearch(search_service)
    prefect_state = await _check_prefect(settings.prefect_api_url)

    services = ServiceHealth(opensearch=search_state, prefect=prefect_state)

    if search_state == "up" and prefect_state == "up":
        overall = "ok"
    else:
        overall = "degraded"

    return HealthResponse(
        status=overall,
        version=settings.app_version,
        environment=settings.app_env,
        services=services,
    )
@@ -0,0 +1,16 @@
1
+ from fastapi import APIRouter
2
+
3
+ from pulse_engine.api.v1.auth import router as auth_router
4
+ from pulse_engine.api.v1.health import router as health_router
5
+ from pulse_engine.deployment.router import router as deployments_router
6
+ from pulse_engine.extractor.router import router as jobs_router
7
+ from pulse_engine.processor.router import router as process_router
8
+ from pulse_engine.storage.router import router as kb_router
9
+
10
# Aggregate router for every v1 endpoint, mounted under /api/v1.
v1_router = APIRouter(prefix="/api/v1")

# Sub-routers are registered in this fixed order.
for _sub_router in (
    auth_router,
    health_router,
    kb_router,
    jobs_router,
    process_router,
    deployments_router,
):
    v1_router.include_router(_sub_router)
del _sub_router  # keep the module namespace free of the loop variable
@@ -0,0 +1,131 @@
1
+ """Background task that monitors Prefect for stalled or failed chained flows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import UTC, datetime, timedelta
6
+ from typing import TYPE_CHECKING
7
+
8
+ import structlog
9
+
10
+ if TYPE_CHECKING:
11
+ from pulse_engine.config import Settings
12
+ from pulse_engine.deployment.job_launcher import JobLauncher
13
+ from pulse_engine.extractor.orchestrator.base import BaseOrchestratorAdapter
14
+ from pulse_engine.extractor.repository import JobRepository
15
+ from pulse_engine.extractor.stage_repository import StageRepository
16
+
17
+ logger = structlog.get_logger(__name__)
18
+
19
# Maps a completed stage name to (stage key passed to the job launcher,
# stage name recorded in the stage repository) for the stage that follows it.
_NEXT_STAGE = {
    "extraction": ("processor", "processing"),
    "processing": ("storage", "storage"),
}


class ChainRecoveryTask:
    """Polls Prefect for stalled chained flows and auto-recovers."""

    def __init__(
        self,
        stage_repo: StageRepository,
        job_repo: JobRepository,
        job_launcher: JobLauncher,
        orchestrator: BaseOrchestratorAdapter,
        settings: Settings,
    ) -> None:
        """Store collaborators and read the chain grace period from settings."""
        self._stage_repo = stage_repo
        self._job_repo = job_repo
        self._job_launcher = job_launcher
        self._orchestrator = orchestrator
        self._grace_seconds = settings.pulse_chain_grace_period_seconds

    async def check_once(self) -> None:
        """Check running stages against Prefect for failures.

        For every running stage that has a flow-run ID, the orchestrator is
        asked for the run status. A ``failed`` or ``cancelled`` run is copied
        onto the stage and the owning job (if found) is marked ``failed``.
        Errors are logged per stage and never abort the sweep.
        """
        stages = await self._stage_repo.get_running_stages()
        for stage in stages:
            if not stage.prefect_flow_run_id:
                continue
            try:
                run_status = await self._orchestrator.get_run_status(
                    stage.prefect_flow_run_id
                )
                if run_status.status in ("failed", "cancelled"):
                    await self._stage_repo.update_status(
                        stage.id,
                        run_status.status,
                    )
                    job = await self._job_repo.get_by_id(stage.job_id)
                    if job:
                        await self._job_repo.update_status(
                            stage.job_id, job.tenant_id, "failed"
                        )
                    logger.warning(
                        "chain_recovery_stage_failed",
                        job_id=stage.job_id,
                        stage=stage.stage,
                        flow_run_status=run_status.status,
                    )
            except Exception:
                logger.warning(
                    "chain_recovery_check_error",
                    stage_id=stage.id,
                    exc_info=True,
                )

    async def check_stalled_chains(self) -> None:
        """Auto-trigger next stage for completed stages where chain stalled.

        A stage qualifies when it completed before the grace-period cutoff,
        has a successor in ``_NEXT_STAGE``, and its job's parameters have
        ``chain`` set. The successor is launched and recorded as a new stage.
        All per-stage work runs inside the ``try`` so that one failing stage
        (lookup, comparison or launch error) is logged and skipped instead of
        aborting recovery for the remaining stages.

        NOTE(review): nothing here marks the source stage as chained;
        presumably ``get_completed_unchained_stages()`` excludes stages whose
        successor already exists — confirm, otherwise the next poll would
        launch the successor again.
        """
        cutoff = datetime.now(UTC) - timedelta(
            seconds=self._grace_seconds,
        )
        stages = await self._stage_repo.get_completed_unchained_stages()

        for stage in stages:
            try:
                # Still inside the grace period (or never completed): skip.
                if not (stage.completed_at and stage.completed_at < cutoff):
                    continue

                next_info = _NEXT_STAGE.get(stage.stage)
                if not next_info:
                    continue
                next_stage_key, next_stage_name = next_info

                job = await self._job_repo.get_by_id(stage.job_id)
                if not job:
                    continue

                params = getattr(job, "parameters", {}) or {}
                if not params.get("chain", False):
                    continue

                orchestrator = params.get("orchestrator", "prefect")
                compute = params.get("compute", "ecs")

                result = await self._job_launcher.launch(
                    job_id=job.job_id,
                    tenant_id=job.tenant_id,
                    product=job.product,
                    stage=next_stage_key,
                    orchestrator=orchestrator,
                    compute=compute,
                    chain=True,
                    config=params.get("config", {}),
                )

                await self._stage_repo.create(
                    job_id=job.job_id,
                    stage=next_stage_name,
                    prefect_flow_run_id=result.flow_run_id,
                )

                logger.info(
                    "chain_recovery_triggered_next_stage",
                    job_id=job.job_id,
                    from_stage=stage.stage,
                    to_stage=next_stage_name,
                    flow_run_id=result.flow_run_id,
                )
            except Exception:
                logger.warning(
                    "chain_recovery_trigger_error",
                    job_id=stage.job_id,
                    stage=stage.stage,
                    exc_info=True,
                )
File without changes