pulse-engine 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,423 @@
1
+ """Twitter/X timeline adapter using twikit."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import os
8
+ import re
9
+ import tempfile
10
+ from datetime import UTC, datetime, timedelta
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import structlog
15
+
16
+ from .models import TweetMetadata, TwitterUserResult
17
+
18
+ logger = structlog.get_logger(__name__)
19
+
20
+
21
+ def _patch_twikit_user() -> None:
22
+ """Override twikit.user.User.__init__ with safe .get() access.
23
+
24
+ twikit 2.3.3 uses hard bracket notation for optional API fields that
25
+ Twitter omits for some accounts (e.g. description.urls,
26
+ withheld_in_countries). This replaces the constructor in-process without
27
+ touching site-packages. Incorporates fixes from PR #418
28
+ (github.com/d60/twikit/pull/418).
29
+ """
30
+ try:
31
+ import twikit.user as _twikit_user
32
+ except ImportError:
33
+ return
34
+
35
+ def _safe_init(self: object, client: object, data: dict[str, Any]) -> None:
36
+ self._client = client # type: ignore[attr-defined]
37
+ legacy = data.get("legacy", {})
38
+
39
+ self.id = data.get("rest_id", "") # type: ignore[attr-defined]
40
+ self.created_at = legacy.get("created_at", "") # type: ignore[attr-defined]
41
+ self.name = legacy.get("name", "") # type: ignore[attr-defined]
42
+ self.screen_name = legacy.get("screen_name", "") # type: ignore[attr-defined]
43
+ self.profile_image_url = legacy.get("profile_image_url_https", "") # type: ignore[attr-defined]
44
+ self.profile_banner_url = legacy.get("profile_banner_url") # type: ignore[attr-defined]
45
+ self.url = legacy.get("url") # type: ignore[attr-defined]
46
+ self.location = legacy.get("location", "") # type: ignore[attr-defined]
47
+ self.description = legacy.get("description", "") # type: ignore[attr-defined]
48
+ self.description_urls = ( # type: ignore[attr-defined]
49
+ legacy.get("entities", {}).get("description", {}).get("urls", [])
50
+ )
51
+ self.urls = legacy.get("entities", {}).get("url", {}).get("urls") # type: ignore[attr-defined]
52
+ self.pinned_tweet_ids = legacy.get("pinned_tweet_ids_str", []) # type: ignore[attr-defined]
53
+ self.is_blue_verified = data.get("is_blue_verified", False) # type: ignore[attr-defined]
54
+ self.verified = legacy.get("verified", False) # type: ignore[attr-defined]
55
+ self.possibly_sensitive = legacy.get("possibly_sensitive", False) # type: ignore[attr-defined]
56
+ self.can_dm = legacy.get("can_dm", False) # type: ignore[attr-defined]
57
+ self.can_media_tag = legacy.get("can_media_tag", False) # type: ignore[attr-defined]
58
+ self.want_retweets = legacy.get("want_retweets", False) # type: ignore[attr-defined]
59
+ self.default_profile = legacy.get("default_profile", False) # type: ignore[attr-defined]
60
+ self.default_profile_image = legacy.get("default_profile_image", False) # type: ignore[attr-defined]
61
+ self.has_custom_timelines = legacy.get("has_custom_timelines", False) # type: ignore[attr-defined]
62
+ self.followers_count = legacy.get("followers_count", 0) # type: ignore[attr-defined]
63
+ self.fast_followers_count = legacy.get("fast_followers_count", 0) # type: ignore[attr-defined]
64
+ self.normal_followers_count = legacy.get("normal_followers_count", 0) # type: ignore[attr-defined]
65
+ self.following_count = legacy.get("friends_count", 0) # type: ignore[attr-defined]
66
+ self.favourites_count = legacy.get("favourites_count", 0) # type: ignore[attr-defined]
67
+ self.listed_count = legacy.get("listed_count", 0) # type: ignore[attr-defined]
68
+ self.media_count = legacy.get("media_count", 0) # type: ignore[attr-defined]
69
+ self.statuses_count = legacy.get("statuses_count", 0) # type: ignore[attr-defined]
70
+ self.is_translator = legacy.get("is_translator", False) # type: ignore[attr-defined]
71
+ self.translator_type = legacy.get("translator_type", "") # type: ignore[attr-defined]
72
+ self.withheld_in_countries = legacy.get("withheld_in_countries", []) # type: ignore[attr-defined]
73
+ self.protected = legacy.get("protected", False) # type: ignore[attr-defined]
74
+
75
+ _twikit_user.User.__init__ = _safe_init
76
+ logger.debug("twikit_user_patched")
77
+
78
+
79
+ _patch_twikit_user()
80
+
81
+ _DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")
82
+ _HASHTAG_RE = re.compile(r"#\w+")
83
+ _URL_RE = re.compile(r"https?://[^\s]+")
84
+
85
+ _TWEET_DATETIME_FORMATS = (
86
+ "%a %b %d %H:%M:%S +0000 %Y",
87
+ "%Y-%m-%dT%H:%M:%S+00:00",
88
+ )
89
+
90
+
91
+ def _detect_language(text: str) -> str:
92
+ deva = len(_DEVANAGARI_RE.findall(text))
93
+ total = len(text.replace(" ", ""))
94
+ if not total:
95
+ return "en"
96
+ ratio = deva / total
97
+ if ratio > 0.4:
98
+ return "hi"
99
+ if ratio > 0.1:
100
+ return "mixed"
101
+ return "en"
102
+
103
+
104
+ def _parse_datetime(value: object) -> datetime:
105
+ if isinstance(value, datetime):
106
+ return value.replace(tzinfo=UTC) if value.tzinfo is None else value
107
+ s = str(value)
108
+ for fmt in _TWEET_DATETIME_FORMATS:
109
+ try:
110
+ return datetime.strptime(s, fmt).replace(tzinfo=UTC)
111
+ except ValueError:
112
+ continue
113
+ try:
114
+ dt = datetime.fromisoformat(s)
115
+ return dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt
116
+ except ValueError:
117
+ return datetime.now(UTC)
118
+
119
+
120
def _to_model(tweet: object, screen_name: str) -> TweetMetadata | None:
    """Build a ``TweetMetadata`` from a twikit tweet, or ``None`` to skip it.

    A tweet is skipped when it carries a ``retweeted_tweet`` or
    ``in_reply_to`` value, or when its text starts with ``"RT @"``.  All
    fields are read with ``getattr`` defaults, so missing attributes degrade
    to empty or zero values rather than raising.

    Args:
        tweet: Tweet object as returned by twikit.
        screen_name: Handle used for the ``screen_name`` field and the URL.

    Returns:
        The populated model, or ``None`` when the tweet should be ignored.
    """
    for marker in ("retweeted_tweet", "in_reply_to"):
        if getattr(tweet, marker, None) is not None:
            return None

    body: str = getattr(tweet, "text", "") or ""
    long_form = getattr(tweet, "note_tweet", None)
    if long_form:
        # Prefer the note's text when it is present and non-empty.
        body = getattr(long_form, "text", None) or body

    if body.startswith("RT @"):
        return None

    # Media kind is inferred from the class name of each media object.
    media_kinds = [
        type(entry).__name__.lower()
        for entry in (getattr(tweet, "media", None) or [])
    ]
    has_images = any("photo" in kind or "image" in kind for kind in media_kinds)
    has_videos = any("video" in kind for kind in media_kinds)

    # The view count may contain thousands separators; unparseable values
    # count as zero.
    view_total = 0
    reported_views = getattr(tweet, "view_count", None)
    if reported_views:
        try:
            view_total = int(str(reported_views).replace(",", ""))
        except (ValueError, TypeError):
            view_total = 0

    status_id = str(getattr(tweet, "id", ""))
    posted_at = _parse_datetime(getattr(tweet, "created_at", datetime.now(UTC)))

    return TweetMetadata(
        tweet_id=status_id,
        url=f"https://x.com/{screen_name}/status/{status_id}",
        text=body,
        screen_name=screen_name,
        created_at=posted_at,
        language=_detect_language(body),
        hashtags=_HASHTAG_RE.findall(body),
        embedded_urls=_URL_RE.findall(body),
        views=view_total,
        retweet_count=getattr(tweet, "retweet_count", 0) or 0,
        like_count=getattr(tweet, "favorite_count", 0) or 0,
        reply_count=getattr(tweet, "reply_count", 0) or 0,
        has_images=has_images,
        has_videos=has_videos,
    )
168
+
169
+
170
+ class TwitterAdapter:
171
+ """Fetches tweets from one or more Twitter/X user timelines via twikit.
172
+
173
+ Mirrors the ``YouTubeMetadataAdapter`` interface: construct once with
174
+ credentials, call ``fetch()`` for a single user or ``fetch_many()`` for
175
+ a batch of users.
176
+
177
+ Cookies are loaded from AWS Secrets Manager (JSON string) or a local file:
178
+
179
+ Environment variables:
180
+ - ``TWITTER_COOKIES_SECRET_ID`` — Secrets Manager secret ID (JSON cookies)
181
+ - ``TWITTER_COOKIES_PATH`` — path to a local cookies.json file
182
+ - ``AWS_REGION`` — region for Secrets Manager (default: us-east-1)
183
+
184
+ Secrets Manager takes precedence over the local file.
185
+
186
+ Usage::
187
+
188
+ adapter = TwitterAdapter()
189
+ tweets = await adapter.fetch("srivatsayb", since=timedelta(days=1))
190
+
191
+ results = await adapter.fetch_many(
192
+ ["srivatsayb", "rahulgandhiinc"],
193
+ since=timedelta(days=7),
194
+ )
195
+ """
196
+
197
+ def __init__(self) -> None:
198
+ self._cookies_secret_id: str = os.environ.get(
199
+ "TWITTER_COOKIES_SECRET_ID", ""
200
+ ).strip()
201
+ self._cookies_path: str = os.environ.get("TWITTER_COOKIES_PATH", "").strip()
202
+ self._aws_region: str = os.environ.get("AWS_REGION", "us-east-1").strip()
203
+ self._cookies_cache: str | None = None # lazily populated
204
+
205
+ async def fetch(
206
+ self,
207
+ screen_name: str,
208
+ since: timedelta | None = None,
209
+ max_tweets: int = 200,
210
+ ) -> list[TweetMetadata]:
211
+ """Fetch tweets from a single user's timeline.
212
+
213
+ Args:
214
+ screen_name: Twitter/X handle without @.
215
+ since: Only return tweets published within this window.
216
+ max_tweets: Upper bound on tweets returned (default: 200).
217
+
218
+ Returns:
219
+ List of ``TweetMetadata``, newest first.
220
+ """
221
+ cutoff = (datetime.now(UTC) - since) if since is not None else None
222
+ logger.info(
223
+ "twitter_fetch_started",
224
+ screen_name=screen_name,
225
+ since=str(since),
226
+ max_tweets=max_tweets,
227
+ )
228
+
229
+ client = await self._make_client()
230
+ tweets = await self._paginate(client, screen_name, cutoff, max_tweets)
231
+
232
+ logger.info(
233
+ "twitter_fetch_completed",
234
+ screen_name=screen_name,
235
+ count=len(tweets),
236
+ )
237
+ return tweets
238
+
239
+ async def fetch_many(
240
+ self,
241
+ screen_names: list[str],
242
+ since: timedelta | None = None,
243
+ max_tweets: int = 200,
244
+ ) -> list[TwitterUserResult]:
245
+ """Fetch tweets for multiple users sequentially (Twitter rate limits
246
+ make concurrency impractical here).
247
+
248
+ Returns one ``TwitterUserResult`` per user, preserving input order.
249
+ """
250
+ results: list[TwitterUserResult] = []
251
+ client = await self._make_client()
252
+
253
+ for i, screen_name in enumerate(screen_names):
254
+ cutoff = (datetime.now(UTC) - since) if since is not None else None
255
+ try:
256
+ tweets = await self._paginate(client, screen_name, cutoff, max_tweets)
257
+ results.append(
258
+ TwitterUserResult(
259
+ screen_name=screen_name, tweets=tweets, error=None
260
+ )
261
+ )
262
+ except Exception as exc:
263
+ logger.warning(
264
+ "twitter_fetch_failed",
265
+ screen_name=screen_name,
266
+ error=str(exc),
267
+ )
268
+ results.append(
269
+ TwitterUserResult(
270
+ screen_name=screen_name, tweets=[], error=str(exc)
271
+ )
272
+ )
273
+ # Pause between users to avoid hammering the API.
274
+ if i < len(screen_names) - 1:
275
+ await asyncio.sleep(5)
276
+
277
+ return results
278
+
279
+ # ------------------------------------------------------------------
280
+ # Private helpers
281
+ # ------------------------------------------------------------------
282
+
283
+ async def _resolve_cookies(self) -> str:
284
+ """Return cookies JSON string from Secrets Manager or local file."""
285
+ if self._cookies_cache is not None:
286
+ return self._cookies_cache
287
+
288
+ if self._cookies_secret_id:
289
+ from pulse_engine.secrets import fetch_secret
290
+
291
+ logger.debug(
292
+ "twitter_cookies_fetching_secret",
293
+ secret_id=self._cookies_secret_id,
294
+ )
295
+ raw = await fetch_secret(self._cookies_secret_id, self._aws_region)
296
+ # Secret may be stored as {"TWITTER_COOKIES_SECRET_ID": "<cookies json>"}
297
+ try:
298
+ parsed = json.loads(raw)
299
+ if isinstance(parsed, dict):
300
+ # Extract the first value that looks like a cookies JSON string
301
+ for v in parsed.values():
302
+ if isinstance(v, str):
303
+ raw = v
304
+ break
305
+ except ValueError:
306
+ pass
307
+ cookies = raw
308
+ self._cookies_cache = cookies
309
+ logger.info(
310
+ "twitter_cookies_loaded_from_secrets_manager",
311
+ secret_id=self._cookies_secret_id,
312
+ )
313
+ return cookies
314
+
315
+ if self._cookies_path:
316
+ cookies = Path(self._cookies_path).read_text(encoding="utf-8")
317
+ self._cookies_cache = cookies
318
+ return cookies
319
+
320
+ raise RuntimeError(
321
+ "Twitter cookies not configured. Set TWITTER_COOKIES_SECRET_ID "
322
+ "or TWITTER_COOKIES_PATH."
323
+ )
324
+
325
+ async def _make_client(self) -> object:
326
+ """Build and return an authenticated twikit Client."""
327
+ try:
328
+ from twikit import Client
329
+ except ImportError:
330
+ raise RuntimeError(
331
+ "twikit is required for Twitter scraping: pip install twikit"
332
+ )
333
+
334
+ cookies_json = await self._resolve_cookies()
335
+
336
+ # twikit loads cookies from a file path, so materialise to a temp file.
337
+ with tempfile.NamedTemporaryFile(
338
+ mode="w", suffix=".json", delete=False, encoding="utf-8"
339
+ ) as f:
340
+ f.write(cookies_json)
341
+ tmp_path = f.name
342
+
343
+ client = Client()
344
+ client.load_cookies(tmp_path)
345
+ Path(tmp_path).unlink(missing_ok=True)
346
+ return client
347
+
348
+ async def _paginate(
349
+ self,
350
+ client: object,
351
+ screen_name: str,
352
+ cutoff: datetime | None,
353
+ max_tweets: int,
354
+ ) -> list[TweetMetadata]:
355
+ try:
356
+ from twikit import TooManyRequests
357
+ except ImportError:
358
+ raise RuntimeError("twikit is required: pip install twikit")
359
+
360
+ user = await client.get_user_by_screen_name(screen_name) # type: ignore[attr-defined]
361
+ user_id = user.id
362
+
363
+ tweets: list[TweetMetadata] = []
364
+ cursor = None
365
+
366
+ while len(tweets) < max_tweets:
367
+ try:
368
+ page = await client.get_user_tweets( # type: ignore[attr-defined]
369
+ user_id, "Tweets", count=40, cursor=cursor
370
+ )
371
+ except TooManyRequests as exc:
372
+ reset = getattr(exc, "rate_limit_reset", None)
373
+ wait = (
374
+ max(
375
+ (
376
+ datetime.fromtimestamp(reset) - datetime.now()
377
+ ).total_seconds(),
378
+ 0,
379
+ )
380
+ if reset
381
+ else 60
382
+ )
383
+ logger.warning("twitter_rate_limited", wait_seconds=wait)
384
+ await asyncio.sleep(wait)
385
+ continue
386
+ except Exception:
387
+ logger.warning(
388
+ "twitter_page_fetch_failed",
389
+ screen_name=screen_name,
390
+ exc_info=True,
391
+ )
392
+ break
393
+
394
+ if not page:
395
+ break
396
+
397
+ stop = False
398
+ for tweet in page:
399
+ meta = _to_model(tweet, screen_name)
400
+ if meta is None:
401
+ continue
402
+ if cutoff and meta.created_at < cutoff:
403
+ stop = True
404
+ break
405
+ tweets.append(meta)
406
+ if len(tweets) >= max_tweets:
407
+ stop = True
408
+ break
409
+
410
+ if stop:
411
+ break
412
+
413
+ try:
414
+ cursor = page.next_cursor
415
+ except AttributeError:
416
+ cursor = None
417
+
418
+ if not cursor:
419
+ break
420
+
421
+ await asyncio.sleep(3)
422
+
423
+ return tweets
@@ -0,0 +1,186 @@
1
+ """YouTube audio downloader adapter (yt-dlp only)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import structlog
11
+
12
+ from pulse_engine.secrets import fetch_secret
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
+ # Default yt-dlp player clients. "tv_embedded" returns audio-only streams for
17
+ # age-restricted videos when cookies are provided. Override via YT_DLP_PLAYER_CLIENTS.
18
+ _DEFAULT_PLAYER_CLIENTS = ["tv_embedded", "web"]
19
+
20
+
21
+ def _build_yt_dlp_opts(
22
+ output_template: str,
23
+ extra: dict[str, Any] | None = None,
24
+ ) -> dict[str, Any]:
25
+ """Build yt-dlp option dict, merging env defaults with caller overrides."""
26
+ raw_clients = os.environ.get("YT_DLP_PLAYER_CLIENTS", "").strip()
27
+ clients: list[str] = (
28
+ [c.strip() for c in raw_clients.split(",") if c.strip()]
29
+ if raw_clients
30
+ else _DEFAULT_PLAYER_CLIENTS
31
+ )
32
+
33
+ opts: dict[str, Any] = {
34
+ "format": "bestaudio/best",
35
+ "outtmpl": output_template,
36
+ "postprocessors": [
37
+ {
38
+ "key": "FFmpegExtractAudio",
39
+ "preferredcodec": "mp3",
40
+ "preferredquality": "64",
41
+ }
42
+ ],
43
+ "quiet": True,
44
+ "no_warnings": True,
45
+ # Fetch updated EJS challenge solver from GitHub when bundled version
46
+ # can't handle the current YouTube player's n-challenge.
47
+ # Must be a set — passing a string causes it to be iterated as chars.
48
+ "remote_components": {"ejs:github"},
49
+ "extractor_args": {
50
+ "youtube": {
51
+ "player_client": clients,
52
+ }
53
+ },
54
+ }
55
+
56
+ if not extra:
57
+ return opts
58
+
59
+ override = dict(extra)
60
+ # cookies_content is handled separately — materialised to a temp file.
61
+ override.pop("cookies_content", None)
62
+ ea = override.pop("extractor_args", None)
63
+ opts.update(override)
64
+ if isinstance(ea, dict):
65
+ existing = opts.get("extractor_args") or {}
66
+ merged: dict[str, Any] = {**existing}
67
+ for k, v in ea.items():
68
+ if isinstance(v, dict) and isinstance(merged.get(k), dict):
69
+ merged[k] = {**merged[k], **v}
70
+ else:
71
+ merged[k] = v
72
+ opts["extractor_args"] = merged
73
+
74
+ return opts
75
+
76
+
77
+ class YouTubeDownloader:
78
+ """Downloads audio from YouTube via yt-dlp, returning a Path to an mp3 file.
79
+
80
+ Environment variables:
81
+ - ``YT_DLP_PLAYER_CLIENTS`` — comma-separated yt-dlp client list
82
+ - ``YT_DLP_COOKIES_SECRET_ID`` — Secrets Manager secret name/ARN for cookies.txt
83
+ - ``AWS_REGION`` — region for Secrets Manager (default: us-east-1)
84
+ """
85
+
86
+ def __init__(self, yt_dlp_opts: dict[str, Any] | None = None) -> None:
87
+ self._yt_dlp_opts = yt_dlp_opts or {}
88
+ self._cookies_secret_id: str = os.environ.get(
89
+ "YT_DLP_COOKIES_SECRET_ID", ""
90
+ ).strip()
91
+ self._aws_region: str = os.environ.get("AWS_REGION", "us-east-1").strip()
92
+ self._cookies_cache: str | None = None # lazily populated
93
+
94
+ async def download(self, video_id: str, output_dir: str) -> Path | None:
95
+ """Download audio for *video_id* into *output_dir*, returning path to mp3.
96
+
97
+ Returns ``None`` if the download fails for any reason.
98
+ """
99
+ url = f"https://www.youtube.com/watch?v={video_id}"
100
+ logger.info("youtube_download_started", video_id=video_id)
101
+
102
+ result = await self._download_audio(url, output_dir)
103
+ if result is None:
104
+ logger.warning("youtube_download_failed", video_id=video_id)
105
+ else:
106
+ logger.info("youtube_download_completed", video_id=video_id, path=str(result))
107
+ return result
108
+
109
+ # ------------------------------------------------------------------
110
+ # Private helpers
111
+ # ------------------------------------------------------------------
112
+
113
+ async def _resolve_cookies(self) -> str:
114
+ """Return cookies.txt content, fetching from Secrets Manager if configured.
115
+
116
+ Result is cached after the first successful fetch so subsequent videos
117
+ in the same adapter instance don't make redundant API calls.
118
+ """
119
+ if self._cookies_cache is not None:
120
+ return self._cookies_cache
121
+
122
+ if not self._cookies_secret_id:
123
+ self._cookies_cache = ""
124
+ return ""
125
+
126
+ logger.debug(
127
+ "youtube_cookies_fetching_secret",
128
+ secret_id=self._cookies_secret_id,
129
+ region=self._aws_region,
130
+ )
131
+ import json as _json
132
+
133
+ raw = await fetch_secret(self._cookies_secret_id, self._aws_region)
134
+ # Secret may be stored as {"YT_DLP_COOKIES_SECRET_ID": "<netscape txt>"}
135
+ try:
136
+ parsed = _json.loads(raw)
137
+ if isinstance(parsed, dict):
138
+ for v in parsed.values():
139
+ if isinstance(v, str):
140
+ raw = v
141
+ break
142
+ except ValueError:
143
+ pass
144
+ cookies = raw
145
+ self._cookies_cache = cookies
146
+ logger.info(
147
+ "youtube_cookies_loaded_from_secrets_manager",
148
+ secret_id=self._cookies_secret_id,
149
+ chars=len(cookies),
150
+ )
151
+ return cookies
152
+
153
+ async def _download_audio(self, url: str, output_dir: str) -> Path | None:
154
+ """Download audio from a YouTube URL to *output_dir* via yt-dlp."""
155
+ try:
156
+ import yt_dlp
157
+ except ImportError:
158
+ logger.warning("yt_dlp_not_installed", url=url)
159
+ return None
160
+
161
+ output_template = str(Path(output_dir) / "%(id)s.%(ext)s")
162
+ ydl_opts = _build_yt_dlp_opts(output_template, extra=self._yt_dlp_opts)
163
+
164
+ # Resolve cookies (priority: Secrets Manager > env var).
165
+ cookies_content = await self._resolve_cookies()
166
+ if cookies_content and "cookiefile" not in ydl_opts:
167
+ cookies_path = Path(output_dir) / "cookies.txt"
168
+ cookies_path.write_text(cookies_content, encoding="utf-8")
169
+ try:
170
+ import os as _os
171
+ _os.chmod(cookies_path, 0o600)
172
+ except OSError:
173
+ pass
174
+ ydl_opts["cookiefile"] = str(cookies_path)
175
+
176
+ def _download() -> Path | None:
177
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
178
+ info = ydl.extract_info(url, download=True)
179
+ candidate = Path(output_dir) / f"{info.get('id', '')}.mp3"
180
+ return candidate if candidate.exists() else None
181
+
182
+ try:
183
+ return await asyncio.to_thread(_download)
184
+ except Exception:
185
+ logger.warning("youtube_audio_download_failed", url=url, exc_info=True)
186
+ return None