pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
"""Twitter/X timeline adapter using twikit."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import tempfile
|
|
10
|
+
from datetime import UTC, datetime, timedelta
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import structlog
|
|
15
|
+
|
|
16
|
+
from .models import TweetMetadata, TwitterUserResult
|
|
17
|
+
|
|
18
|
+
logger = structlog.get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _patch_twikit_user() -> None:
    """Swap ``twikit.user.User.__init__`` for a constructor that tolerates gaps.

    twikit 2.3.3 reads optional API fields with bracket notation, and Twitter
    omits some of them for certain accounts (e.g. description.urls,
    withheld_in_countries). The replacement reads every field through
    ``.get()`` with a default. It is installed in-process, so site-packages
    is never modified. Incorporates fixes from PR #418
    (github.com/d60/twikit/pull/418).

    Returns without doing anything when twikit cannot be imported.
    """
    try:
        import twikit.user as _twikit_user
    except ImportError:
        return

    # Attribute name -> key inside the "legacy" payload; default is "".
    text_fields = {
        "created_at": "created_at",
        "name": "name",
        "screen_name": "screen_name",
        "profile_image_url": "profile_image_url_https",
        "location": "location",
        "description": "description",
        "translator_type": "translator_type",
    }
    # Boolean attributes stored under the same key in "legacy"; default False.
    flag_fields = (
        "verified",
        "possibly_sensitive",
        "can_dm",
        "can_media_tag",
        "want_retweets",
        "default_profile",
        "default_profile_image",
        "has_custom_timelines",
        "is_translator",
        "protected",
    )
    # Attribute name -> key inside the "legacy" payload; default is 0.
    counter_fields = {
        "followers_count": "followers_count",
        "fast_followers_count": "fast_followers_count",
        "normal_followers_count": "normal_followers_count",
        "following_count": "friends_count",
        "favourites_count": "favourites_count",
        "listed_count": "listed_count",
        "media_count": "media_count",
        "statuses_count": "statuses_count",
    }

    def _safe_init(self: object, client: object, data: dict[str, Any]) -> None:
        legacy = data.get("legacy", {})
        entities = legacy.get("entities", {})

        # Fields that do not fit one of the uniform tables above. The list
        # defaults are literals here so every instance gets its own list.
        attrs: dict[str, Any] = {
            "_client": client,
            "id": data.get("rest_id", ""),
            "is_blue_verified": data.get("is_blue_verified", False),
            "profile_banner_url": legacy.get("profile_banner_url"),
            "url": legacy.get("url"),
            "description_urls": entities.get("description", {}).get("urls", []),
            "urls": entities.get("url", {}).get("urls"),
            "pinned_tweet_ids": legacy.get("pinned_tweet_ids_str", []),
            "withheld_in_countries": legacy.get("withheld_in_countries", []),
        }
        attrs.update((attr, legacy.get(key, "")) for attr, key in text_fields.items())
        attrs.update((flag, legacy.get(flag, False)) for flag in flag_fields)
        attrs.update((attr, legacy.get(key, 0)) for attr, key in counter_fields.items())

        for attr, value in attrs.items():
            setattr(self, attr, value)

    _twikit_user.User.__init__ = _safe_init
    logger.debug("twikit_user_patched")


# Apply the patch once, at import time.
_patch_twikit_user()
|
|
80
|
+
|
|
81
|
+
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")
|
|
82
|
+
_HASHTAG_RE = re.compile(r"#\w+")
|
|
83
|
+
_URL_RE = re.compile(r"https?://[^\s]+")
|
|
84
|
+
|
|
85
|
+
_TWEET_DATETIME_FORMATS = (
|
|
86
|
+
"%a %b %d %H:%M:%S +0000 %Y",
|
|
87
|
+
"%Y-%m-%dT%H:%M:%S+00:00",
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _detect_language(text: str) -> str:
|
|
92
|
+
deva = len(_DEVANAGARI_RE.findall(text))
|
|
93
|
+
total = len(text.replace(" ", ""))
|
|
94
|
+
if not total:
|
|
95
|
+
return "en"
|
|
96
|
+
ratio = deva / total
|
|
97
|
+
if ratio > 0.4:
|
|
98
|
+
return "hi"
|
|
99
|
+
if ratio > 0.1:
|
|
100
|
+
return "mixed"
|
|
101
|
+
return "en"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _parse_datetime(value: object) -> datetime:
|
|
105
|
+
if isinstance(value, datetime):
|
|
106
|
+
return value.replace(tzinfo=UTC) if value.tzinfo is None else value
|
|
107
|
+
s = str(value)
|
|
108
|
+
for fmt in _TWEET_DATETIME_FORMATS:
|
|
109
|
+
try:
|
|
110
|
+
return datetime.strptime(s, fmt).replace(tzinfo=UTC)
|
|
111
|
+
except ValueError:
|
|
112
|
+
continue
|
|
113
|
+
try:
|
|
114
|
+
dt = datetime.fromisoformat(s)
|
|
115
|
+
return dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt
|
|
116
|
+
except ValueError:
|
|
117
|
+
return datetime.now(UTC)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _to_model(tweet: object, screen_name: str) -> TweetMetadata | None:
    """Build a ``TweetMetadata`` from a twikit tweet object.

    Every field is read with ``getattr`` and a default, so missing attributes
    do not raise. Returns ``None`` (skip this tweet) when the tweet has a
    ``retweeted_tweet``, has an ``in_reply_to``, or its text starts with
    ``"RT @"``.
    """
    if (
        getattr(tweet, "retweeted_tweet", None) is not None
        or getattr(tweet, "in_reply_to", None) is not None
    ):
        return None

    # Prefer the note_tweet text when one is present and non-empty.
    body: str = getattr(tweet, "text", "") or ""
    long_form = getattr(tweet, "note_tweet", None)
    if long_form:
        body = getattr(long_form, "text", None) or body

    if body.startswith("RT @"):
        return None

    # Media kind is inferred from the lower-cased class name of each item.
    media_kinds = [
        type(item).__name__.lower() for item in (getattr(tweet, "media", None) or [])
    ]
    has_images = any("photo" in kind or "image" in kind for kind in media_kinds)
    has_videos = any("video" in kind for kind in media_kinds)

    # view_count is stringified and stripped of commas before conversion;
    # anything unparseable counts as zero views.
    raw_views = getattr(tweet, "view_count", None)
    try:
        views = int(str(raw_views).replace(",", "")) if raw_views else 0
    except (ValueError, TypeError):
        views = 0

    tweet_id = str(getattr(tweet, "id", ""))
    created_at = _parse_datetime(getattr(tweet, "created_at", datetime.now(UTC)))

    fields = {
        "tweet_id": tweet_id,
        "url": f"https://x.com/{screen_name}/status/{tweet_id}",
        "text": body,
        "screen_name": screen_name,
        "created_at": created_at,
        "language": _detect_language(body),
        "hashtags": _HASHTAG_RE.findall(body),
        "embedded_urls": _URL_RE.findall(body),
        "views": views,
        "retweet_count": getattr(tweet, "retweet_count", 0) or 0,
        "like_count": getattr(tweet, "favorite_count", 0) or 0,
        "reply_count": getattr(tweet, "reply_count", 0) or 0,
        "has_images": has_images,
        "has_videos": has_videos,
    }
    return TweetMetadata(**fields)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class TwitterAdapter:
|
|
171
|
+
"""Fetches tweets from one or more Twitter/X user timelines via twikit.
|
|
172
|
+
|
|
173
|
+
Mirrors the ``YouTubeMetadataAdapter`` interface: construct once with
|
|
174
|
+
credentials, call ``fetch()`` for a single user or ``fetch_many()`` for
|
|
175
|
+
a batch of users.
|
|
176
|
+
|
|
177
|
+
Cookies are loaded from AWS Secrets Manager (JSON string) or a local file:
|
|
178
|
+
|
|
179
|
+
Environment variables:
|
|
180
|
+
- ``TWITTER_COOKIES_SECRET_ID`` — Secrets Manager secret ID (JSON cookies)
|
|
181
|
+
- ``TWITTER_COOKIES_PATH`` — path to a local cookies.json file
|
|
182
|
+
- ``AWS_REGION`` — region for Secrets Manager (default: us-east-1)
|
|
183
|
+
|
|
184
|
+
Secrets Manager takes precedence over the local file.
|
|
185
|
+
|
|
186
|
+
Usage::
|
|
187
|
+
|
|
188
|
+
adapter = TwitterAdapter()
|
|
189
|
+
tweets = await adapter.fetch("srivatsayb", since=timedelta(days=1))
|
|
190
|
+
|
|
191
|
+
results = await adapter.fetch_many(
|
|
192
|
+
["srivatsayb", "rahulgandhiinc"],
|
|
193
|
+
since=timedelta(days=7),
|
|
194
|
+
)
|
|
195
|
+
"""
|
|
196
|
+
|
|
197
|
+
def __init__(self) -> None:
|
|
198
|
+
self._cookies_secret_id: str = os.environ.get(
|
|
199
|
+
"TWITTER_COOKIES_SECRET_ID", ""
|
|
200
|
+
).strip()
|
|
201
|
+
self._cookies_path: str = os.environ.get("TWITTER_COOKIES_PATH", "").strip()
|
|
202
|
+
self._aws_region: str = os.environ.get("AWS_REGION", "us-east-1").strip()
|
|
203
|
+
self._cookies_cache: str | None = None # lazily populated
|
|
204
|
+
|
|
205
|
+
async def fetch(
|
|
206
|
+
self,
|
|
207
|
+
screen_name: str,
|
|
208
|
+
since: timedelta | None = None,
|
|
209
|
+
max_tweets: int = 200,
|
|
210
|
+
) -> list[TweetMetadata]:
|
|
211
|
+
"""Fetch tweets from a single user's timeline.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
screen_name: Twitter/X handle without @.
|
|
215
|
+
since: Only return tweets published within this window.
|
|
216
|
+
max_tweets: Upper bound on tweets returned (default: 200).
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
List of ``TweetMetadata``, newest first.
|
|
220
|
+
"""
|
|
221
|
+
cutoff = (datetime.now(UTC) - since) if since is not None else None
|
|
222
|
+
logger.info(
|
|
223
|
+
"twitter_fetch_started",
|
|
224
|
+
screen_name=screen_name,
|
|
225
|
+
since=str(since),
|
|
226
|
+
max_tweets=max_tweets,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
client = await self._make_client()
|
|
230
|
+
tweets = await self._paginate(client, screen_name, cutoff, max_tweets)
|
|
231
|
+
|
|
232
|
+
logger.info(
|
|
233
|
+
"twitter_fetch_completed",
|
|
234
|
+
screen_name=screen_name,
|
|
235
|
+
count=len(tweets),
|
|
236
|
+
)
|
|
237
|
+
return tweets
|
|
238
|
+
|
|
239
|
+
async def fetch_many(
|
|
240
|
+
self,
|
|
241
|
+
screen_names: list[str],
|
|
242
|
+
since: timedelta | None = None,
|
|
243
|
+
max_tweets: int = 200,
|
|
244
|
+
) -> list[TwitterUserResult]:
|
|
245
|
+
"""Fetch tweets for multiple users sequentially (Twitter rate limits
|
|
246
|
+
make concurrency impractical here).
|
|
247
|
+
|
|
248
|
+
Returns one ``TwitterUserResult`` per user, preserving input order.
|
|
249
|
+
"""
|
|
250
|
+
results: list[TwitterUserResult] = []
|
|
251
|
+
client = await self._make_client()
|
|
252
|
+
|
|
253
|
+
for i, screen_name in enumerate(screen_names):
|
|
254
|
+
cutoff = (datetime.now(UTC) - since) if since is not None else None
|
|
255
|
+
try:
|
|
256
|
+
tweets = await self._paginate(client, screen_name, cutoff, max_tweets)
|
|
257
|
+
results.append(
|
|
258
|
+
TwitterUserResult(
|
|
259
|
+
screen_name=screen_name, tweets=tweets, error=None
|
|
260
|
+
)
|
|
261
|
+
)
|
|
262
|
+
except Exception as exc:
|
|
263
|
+
logger.warning(
|
|
264
|
+
"twitter_fetch_failed",
|
|
265
|
+
screen_name=screen_name,
|
|
266
|
+
error=str(exc),
|
|
267
|
+
)
|
|
268
|
+
results.append(
|
|
269
|
+
TwitterUserResult(
|
|
270
|
+
screen_name=screen_name, tweets=[], error=str(exc)
|
|
271
|
+
)
|
|
272
|
+
)
|
|
273
|
+
# Pause between users to avoid hammering the API.
|
|
274
|
+
if i < len(screen_names) - 1:
|
|
275
|
+
await asyncio.sleep(5)
|
|
276
|
+
|
|
277
|
+
return results
|
|
278
|
+
|
|
279
|
+
# ------------------------------------------------------------------
|
|
280
|
+
# Private helpers
|
|
281
|
+
# ------------------------------------------------------------------
|
|
282
|
+
|
|
283
|
+
async def _resolve_cookies(self) -> str:
|
|
284
|
+
"""Return cookies JSON string from Secrets Manager or local file."""
|
|
285
|
+
if self._cookies_cache is not None:
|
|
286
|
+
return self._cookies_cache
|
|
287
|
+
|
|
288
|
+
if self._cookies_secret_id:
|
|
289
|
+
from pulse_engine.secrets import fetch_secret
|
|
290
|
+
|
|
291
|
+
logger.debug(
|
|
292
|
+
"twitter_cookies_fetching_secret",
|
|
293
|
+
secret_id=self._cookies_secret_id,
|
|
294
|
+
)
|
|
295
|
+
raw = await fetch_secret(self._cookies_secret_id, self._aws_region)
|
|
296
|
+
# Secret may be stored as {"TWITTER_COOKIES_SECRET_ID": "<cookies json>"}
|
|
297
|
+
try:
|
|
298
|
+
parsed = json.loads(raw)
|
|
299
|
+
if isinstance(parsed, dict):
|
|
300
|
+
# Extract the first value that looks like a cookies JSON string
|
|
301
|
+
for v in parsed.values():
|
|
302
|
+
if isinstance(v, str):
|
|
303
|
+
raw = v
|
|
304
|
+
break
|
|
305
|
+
except ValueError:
|
|
306
|
+
pass
|
|
307
|
+
cookies = raw
|
|
308
|
+
self._cookies_cache = cookies
|
|
309
|
+
logger.info(
|
|
310
|
+
"twitter_cookies_loaded_from_secrets_manager",
|
|
311
|
+
secret_id=self._cookies_secret_id,
|
|
312
|
+
)
|
|
313
|
+
return cookies
|
|
314
|
+
|
|
315
|
+
if self._cookies_path:
|
|
316
|
+
cookies = Path(self._cookies_path).read_text(encoding="utf-8")
|
|
317
|
+
self._cookies_cache = cookies
|
|
318
|
+
return cookies
|
|
319
|
+
|
|
320
|
+
raise RuntimeError(
|
|
321
|
+
"Twitter cookies not configured. Set TWITTER_COOKIES_SECRET_ID "
|
|
322
|
+
"or TWITTER_COOKIES_PATH."
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
async def _make_client(self) -> object:
|
|
326
|
+
"""Build and return an authenticated twikit Client."""
|
|
327
|
+
try:
|
|
328
|
+
from twikit import Client
|
|
329
|
+
except ImportError:
|
|
330
|
+
raise RuntimeError(
|
|
331
|
+
"twikit is required for Twitter scraping: pip install twikit"
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
cookies_json = await self._resolve_cookies()
|
|
335
|
+
|
|
336
|
+
# twikit loads cookies from a file path, so materialise to a temp file.
|
|
337
|
+
with tempfile.NamedTemporaryFile(
|
|
338
|
+
mode="w", suffix=".json", delete=False, encoding="utf-8"
|
|
339
|
+
) as f:
|
|
340
|
+
f.write(cookies_json)
|
|
341
|
+
tmp_path = f.name
|
|
342
|
+
|
|
343
|
+
client = Client()
|
|
344
|
+
client.load_cookies(tmp_path)
|
|
345
|
+
Path(tmp_path).unlink(missing_ok=True)
|
|
346
|
+
return client
|
|
347
|
+
|
|
348
|
+
async def _paginate(
|
|
349
|
+
self,
|
|
350
|
+
client: object,
|
|
351
|
+
screen_name: str,
|
|
352
|
+
cutoff: datetime | None,
|
|
353
|
+
max_tweets: int,
|
|
354
|
+
) -> list[TweetMetadata]:
|
|
355
|
+
try:
|
|
356
|
+
from twikit import TooManyRequests
|
|
357
|
+
except ImportError:
|
|
358
|
+
raise RuntimeError("twikit is required: pip install twikit")
|
|
359
|
+
|
|
360
|
+
user = await client.get_user_by_screen_name(screen_name) # type: ignore[attr-defined]
|
|
361
|
+
user_id = user.id
|
|
362
|
+
|
|
363
|
+
tweets: list[TweetMetadata] = []
|
|
364
|
+
cursor = None
|
|
365
|
+
|
|
366
|
+
while len(tweets) < max_tweets:
|
|
367
|
+
try:
|
|
368
|
+
page = await client.get_user_tweets( # type: ignore[attr-defined]
|
|
369
|
+
user_id, "Tweets", count=40, cursor=cursor
|
|
370
|
+
)
|
|
371
|
+
except TooManyRequests as exc:
|
|
372
|
+
reset = getattr(exc, "rate_limit_reset", None)
|
|
373
|
+
wait = (
|
|
374
|
+
max(
|
|
375
|
+
(
|
|
376
|
+
datetime.fromtimestamp(reset) - datetime.now()
|
|
377
|
+
).total_seconds(),
|
|
378
|
+
0,
|
|
379
|
+
)
|
|
380
|
+
if reset
|
|
381
|
+
else 60
|
|
382
|
+
)
|
|
383
|
+
logger.warning("twitter_rate_limited", wait_seconds=wait)
|
|
384
|
+
await asyncio.sleep(wait)
|
|
385
|
+
continue
|
|
386
|
+
except Exception:
|
|
387
|
+
logger.warning(
|
|
388
|
+
"twitter_page_fetch_failed",
|
|
389
|
+
screen_name=screen_name,
|
|
390
|
+
exc_info=True,
|
|
391
|
+
)
|
|
392
|
+
break
|
|
393
|
+
|
|
394
|
+
if not page:
|
|
395
|
+
break
|
|
396
|
+
|
|
397
|
+
stop = False
|
|
398
|
+
for tweet in page:
|
|
399
|
+
meta = _to_model(tweet, screen_name)
|
|
400
|
+
if meta is None:
|
|
401
|
+
continue
|
|
402
|
+
if cutoff and meta.created_at < cutoff:
|
|
403
|
+
stop = True
|
|
404
|
+
break
|
|
405
|
+
tweets.append(meta)
|
|
406
|
+
if len(tweets) >= max_tweets:
|
|
407
|
+
stop = True
|
|
408
|
+
break
|
|
409
|
+
|
|
410
|
+
if stop:
|
|
411
|
+
break
|
|
412
|
+
|
|
413
|
+
try:
|
|
414
|
+
cursor = page.next_cursor
|
|
415
|
+
except AttributeError:
|
|
416
|
+
cursor = None
|
|
417
|
+
|
|
418
|
+
if not cursor:
|
|
419
|
+
break
|
|
420
|
+
|
|
421
|
+
await asyncio.sleep(3)
|
|
422
|
+
|
|
423
|
+
return tweets
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""YouTube audio downloader adapter (yt-dlp only)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import structlog
|
|
11
|
+
|
|
12
|
+
from pulse_engine.secrets import fetch_secret
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
# Default yt-dlp player clients. "tv_embedded" returns audio-only streams for
|
|
17
|
+
# age-restricted videos when cookies are provided. Override via YT_DLP_PLAYER_CLIENTS.
|
|
18
|
+
_DEFAULT_PLAYER_CLIENTS = ["tv_embedded", "web"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _build_yt_dlp_opts(
|
|
22
|
+
output_template: str,
|
|
23
|
+
extra: dict[str, Any] | None = None,
|
|
24
|
+
) -> dict[str, Any]:
|
|
25
|
+
"""Build yt-dlp option dict, merging env defaults with caller overrides."""
|
|
26
|
+
raw_clients = os.environ.get("YT_DLP_PLAYER_CLIENTS", "").strip()
|
|
27
|
+
clients: list[str] = (
|
|
28
|
+
[c.strip() for c in raw_clients.split(",") if c.strip()]
|
|
29
|
+
if raw_clients
|
|
30
|
+
else _DEFAULT_PLAYER_CLIENTS
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
opts: dict[str, Any] = {
|
|
34
|
+
"format": "bestaudio/best",
|
|
35
|
+
"outtmpl": output_template,
|
|
36
|
+
"postprocessors": [
|
|
37
|
+
{
|
|
38
|
+
"key": "FFmpegExtractAudio",
|
|
39
|
+
"preferredcodec": "mp3",
|
|
40
|
+
"preferredquality": "64",
|
|
41
|
+
}
|
|
42
|
+
],
|
|
43
|
+
"quiet": True,
|
|
44
|
+
"no_warnings": True,
|
|
45
|
+
# Fetch updated EJS challenge solver from GitHub when bundled version
|
|
46
|
+
# can't handle the current YouTube player's n-challenge.
|
|
47
|
+
# Must be a set — passing a string causes it to be iterated as chars.
|
|
48
|
+
"remote_components": {"ejs:github"},
|
|
49
|
+
"extractor_args": {
|
|
50
|
+
"youtube": {
|
|
51
|
+
"player_client": clients,
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
if not extra:
|
|
57
|
+
return opts
|
|
58
|
+
|
|
59
|
+
override = dict(extra)
|
|
60
|
+
# cookies_content is handled separately — materialised to a temp file.
|
|
61
|
+
override.pop("cookies_content", None)
|
|
62
|
+
ea = override.pop("extractor_args", None)
|
|
63
|
+
opts.update(override)
|
|
64
|
+
if isinstance(ea, dict):
|
|
65
|
+
existing = opts.get("extractor_args") or {}
|
|
66
|
+
merged: dict[str, Any] = {**existing}
|
|
67
|
+
for k, v in ea.items():
|
|
68
|
+
if isinstance(v, dict) and isinstance(merged.get(k), dict):
|
|
69
|
+
merged[k] = {**merged[k], **v}
|
|
70
|
+
else:
|
|
71
|
+
merged[k] = v
|
|
72
|
+
opts["extractor_args"] = merged
|
|
73
|
+
|
|
74
|
+
return opts
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class YouTubeDownloader:
|
|
78
|
+
"""Downloads audio from YouTube via yt-dlp, returning a Path to an mp3 file.
|
|
79
|
+
|
|
80
|
+
Environment variables:
|
|
81
|
+
- ``YT_DLP_PLAYER_CLIENTS`` — comma-separated yt-dlp client list
|
|
82
|
+
- ``YT_DLP_COOKIES_SECRET_ID`` — Secrets Manager secret name/ARN for cookies.txt
|
|
83
|
+
- ``AWS_REGION`` — region for Secrets Manager (default: us-east-1)
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
def __init__(self, yt_dlp_opts: dict[str, Any] | None = None) -> None:
|
|
87
|
+
self._yt_dlp_opts = yt_dlp_opts or {}
|
|
88
|
+
self._cookies_secret_id: str = os.environ.get(
|
|
89
|
+
"YT_DLP_COOKIES_SECRET_ID", ""
|
|
90
|
+
).strip()
|
|
91
|
+
self._aws_region: str = os.environ.get("AWS_REGION", "us-east-1").strip()
|
|
92
|
+
self._cookies_cache: str | None = None # lazily populated
|
|
93
|
+
|
|
94
|
+
async def download(self, video_id: str, output_dir: str) -> Path | None:
|
|
95
|
+
"""Download audio for *video_id* into *output_dir*, returning path to mp3.
|
|
96
|
+
|
|
97
|
+
Returns ``None`` if the download fails for any reason.
|
|
98
|
+
"""
|
|
99
|
+
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
100
|
+
logger.info("youtube_download_started", video_id=video_id)
|
|
101
|
+
|
|
102
|
+
result = await self._download_audio(url, output_dir)
|
|
103
|
+
if result is None:
|
|
104
|
+
logger.warning("youtube_download_failed", video_id=video_id)
|
|
105
|
+
else:
|
|
106
|
+
logger.info("youtube_download_completed", video_id=video_id, path=str(result))
|
|
107
|
+
return result
|
|
108
|
+
|
|
109
|
+
# ------------------------------------------------------------------
|
|
110
|
+
# Private helpers
|
|
111
|
+
# ------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
async def _resolve_cookies(self) -> str:
|
|
114
|
+
"""Return cookies.txt content, fetching from Secrets Manager if configured.
|
|
115
|
+
|
|
116
|
+
Result is cached after the first successful fetch so subsequent videos
|
|
117
|
+
in the same adapter instance don't make redundant API calls.
|
|
118
|
+
"""
|
|
119
|
+
if self._cookies_cache is not None:
|
|
120
|
+
return self._cookies_cache
|
|
121
|
+
|
|
122
|
+
if not self._cookies_secret_id:
|
|
123
|
+
self._cookies_cache = ""
|
|
124
|
+
return ""
|
|
125
|
+
|
|
126
|
+
logger.debug(
|
|
127
|
+
"youtube_cookies_fetching_secret",
|
|
128
|
+
secret_id=self._cookies_secret_id,
|
|
129
|
+
region=self._aws_region,
|
|
130
|
+
)
|
|
131
|
+
import json as _json
|
|
132
|
+
|
|
133
|
+
raw = await fetch_secret(self._cookies_secret_id, self._aws_region)
|
|
134
|
+
# Secret may be stored as {"YT_DLP_COOKIES_SECRET_ID": "<netscape txt>"}
|
|
135
|
+
try:
|
|
136
|
+
parsed = _json.loads(raw)
|
|
137
|
+
if isinstance(parsed, dict):
|
|
138
|
+
for v in parsed.values():
|
|
139
|
+
if isinstance(v, str):
|
|
140
|
+
raw = v
|
|
141
|
+
break
|
|
142
|
+
except ValueError:
|
|
143
|
+
pass
|
|
144
|
+
cookies = raw
|
|
145
|
+
self._cookies_cache = cookies
|
|
146
|
+
logger.info(
|
|
147
|
+
"youtube_cookies_loaded_from_secrets_manager",
|
|
148
|
+
secret_id=self._cookies_secret_id,
|
|
149
|
+
chars=len(cookies),
|
|
150
|
+
)
|
|
151
|
+
return cookies
|
|
152
|
+
|
|
153
|
+
async def _download_audio(self, url: str, output_dir: str) -> Path | None:
|
|
154
|
+
"""Download audio from a YouTube URL to *output_dir* via yt-dlp."""
|
|
155
|
+
try:
|
|
156
|
+
import yt_dlp
|
|
157
|
+
except ImportError:
|
|
158
|
+
logger.warning("yt_dlp_not_installed", url=url)
|
|
159
|
+
return None
|
|
160
|
+
|
|
161
|
+
output_template = str(Path(output_dir) / "%(id)s.%(ext)s")
|
|
162
|
+
ydl_opts = _build_yt_dlp_opts(output_template, extra=self._yt_dlp_opts)
|
|
163
|
+
|
|
164
|
+
# Resolve cookies (priority: Secrets Manager > env var).
|
|
165
|
+
cookies_content = await self._resolve_cookies()
|
|
166
|
+
if cookies_content and "cookiefile" not in ydl_opts:
|
|
167
|
+
cookies_path = Path(output_dir) / "cookies.txt"
|
|
168
|
+
cookies_path.write_text(cookies_content, encoding="utf-8")
|
|
169
|
+
try:
|
|
170
|
+
import os as _os
|
|
171
|
+
_os.chmod(cookies_path, 0o600)
|
|
172
|
+
except OSError:
|
|
173
|
+
pass
|
|
174
|
+
ydl_opts["cookiefile"] = str(cookies_path)
|
|
175
|
+
|
|
176
|
+
def _download() -> Path | None:
|
|
177
|
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
178
|
+
info = ydl.extract_info(url, download=True)
|
|
179
|
+
candidate = Path(output_dir) / f"{info.get('id', '')}.mp3"
|
|
180
|
+
return candidate if candidate.exists() else None
|
|
181
|
+
|
|
182
|
+
try:
|
|
183
|
+
return await asyncio.to_thread(_download)
|
|
184
|
+
except Exception:
|
|
185
|
+
logger.warning("youtube_audio_download_failed", url=url, exc_info=True)
|
|
186
|
+
return None
|