pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
pulse_engine/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Pulse Engine adapters — digital news, YouTube, speech, and Twitter."""
|
|
2
|
+
|
|
3
|
+
from .audio_transcription import AudioTranscriptionAdapter
|
|
4
|
+
from .batcher import BatcherAdapter
|
|
5
|
+
from .digital_news import DigitalNewsAdapter
|
|
6
|
+
from .opensearch_storage import OpenSearchStorageAdapter
|
|
7
|
+
from .digital_news_metadata import DigitalNewsMetadataAdapter
|
|
8
|
+
from .exceptions import SitemapDiscoveryError
|
|
9
|
+
from .models import (
|
|
10
|
+
ArticleContent,
|
|
11
|
+
ArticleResult,
|
|
12
|
+
NewsArticleMetadata,
|
|
13
|
+
SpeechContent,
|
|
14
|
+
SpeechMetadata,
|
|
15
|
+
SpeechResult,
|
|
16
|
+
TranscriptSegment,
|
|
17
|
+
TweetMetadata,
|
|
18
|
+
TwitterUserResult,
|
|
19
|
+
VideoAudio,
|
|
20
|
+
VideoMetadata,
|
|
21
|
+
VideoResult,
|
|
22
|
+
)
|
|
23
|
+
from .speech_content import SpeechContentAdapter
|
|
24
|
+
from .speech_metadata import SpeechMetadataAdapter
|
|
25
|
+
from .twitter import TwitterAdapter
|
|
26
|
+
from .youtube_downloader import YouTubeDownloader
|
|
27
|
+
from .youtube_metadata import YouTubeMetadataAdapter
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
# Utilities
|
|
31
|
+
"BatcherAdapter",
|
|
32
|
+
"OpenSearchStorageAdapter",
|
|
33
|
+
# Digital news
|
|
34
|
+
"DigitalNewsMetadataAdapter",
|
|
35
|
+
"DigitalNewsAdapter",
|
|
36
|
+
"NewsArticleMetadata",
|
|
37
|
+
"ArticleContent",
|
|
38
|
+
"ArticleResult",
|
|
39
|
+
"SitemapDiscoveryError",
|
|
40
|
+
# YouTube
|
|
41
|
+
"YouTubeMetadataAdapter",
|
|
42
|
+
"YouTubeDownloader",
|
|
43
|
+
"AudioTranscriptionAdapter",
|
|
44
|
+
"VideoMetadata",
|
|
45
|
+
"VideoAudio",
|
|
46
|
+
"VideoResult",
|
|
47
|
+
"TranscriptSegment",
|
|
48
|
+
# Speeches
|
|
49
|
+
"SpeechMetadataAdapter",
|
|
50
|
+
"SpeechContentAdapter",
|
|
51
|
+
"SpeechMetadata",
|
|
52
|
+
"SpeechContent",
|
|
53
|
+
"SpeechResult",
|
|
54
|
+
# Twitter
|
|
55
|
+
"TwitterAdapter",
|
|
56
|
+
"TweetMetadata",
|
|
57
|
+
"TwitterUserResult",
|
|
58
|
+
]
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Audio transcription adapter using OpenAI Whisper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import glob
|
|
7
|
+
import os
|
|
8
|
+
import subprocess
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import structlog
|
|
14
|
+
|
|
15
|
+
from .models import TranscriptSegment, VideoAudio
|
|
16
|
+
|
|
17
|
+
logger = structlog.get_logger(__name__)

# OpenAI rejects Whisper uploads above 25 MB; 24 MB leaves a safety margin.
_WHISPER_MAX_BYTES = 24 * 1024 * 1024  # stay under Whisper's 25 MB hard limit
# Chunk duration used when a file must be split with ffmpeg before upload.
_WHISPER_CHUNK_SECS = 10 * 60  # split large files into 10-min chunks
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AudioTranscriptionAdapter:
    """Transcribes an audio file using OpenAI Whisper, returning a VideoAudio.

    Large files (>24 MB) are automatically split into 10-minute chunks via
    ffmpeg and stitched back together with adjusted timestamps.

    Usage::

        adapter = AudioTranscriptionAdapter(openai_api_key="sk-...")
        audio = await adapter.transcribe(Path("/tmp/abc123.mp3"), language="en")
    """

    def __init__(self, openai_api_key: str) -> None:
        # Stored as-is; passed straight to the OpenAI client on first use.
        self._openai_api_key = openai_api_key

    async def transcribe(self, audio_path: Path, language: str = "en") -> VideoAudio:
        """Transcribe *audio_path* and return a :class:`VideoAudio`.

        Args:
            audio_path: Path to an mp3 (or other audio) file.
            language: BCP-47 language tag, or ``"auto"`` for Whisper auto-detect.
        """
        logger.info(
            "audio_transcription_started",
            audio_path=str(audio_path),
            language=language,
        )
        transcript, segments = await self._transcribe(audio_path, language=language)
        logger.info(
            "audio_transcription_completed",
            audio_path=str(audio_path),
            segments=len(segments),
            chars=len(transcript),
        )
        return VideoAudio(transcript=transcript, language=language, segments=segments)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_segments(resp: Any, offset: float) -> list[TranscriptSegment]:
        """Convert a Whisper ``verbose_json`` response into TranscriptSegments.

        Depending on SDK version the response segments arrive as dicts or as
        attribute objects, so both access styles are supported. *offset*
        shifts timestamps so chunked transcriptions line up on the original
        file's timeline.
        """
        raw = getattr(resp, "segments", None) or []
        parsed: list[TranscriptSegment] = []
        for s in raw:
            if isinstance(s, dict):
                start, end, text = s.get("start", 0), s.get("end", 0), s.get("text", "")
            else:
                start, end, text = s.start, s.end, s.text
            parsed.append(
                TranscriptSegment(
                    start=float(start) + offset,
                    end=float(end) + offset,
                    text=text.strip(),
                )
            )
        return parsed

    async def _transcribe(
        self, audio_path: Path, language: str
    ) -> tuple[str, list[TranscriptSegment]]:
        """Send audio to Whisper, returning ``(full_text, segments)``.

        Large files (>24 MB) are split into 10-minute chunks via ffmpeg and
        stitched back together with adjusted timestamps. Any failure is
        logged and degraded to ``("", [])`` — transcription is best-effort
        and must not fail the surrounding pipeline.
        """
        # Empty language string signals Whisper auto-detect.
        whisper_language = "" if language.lower() == "auto" else language

        try:
            from openai import AsyncOpenAI

            client = AsyncOpenAI(api_key=self._openai_api_key)

            async def _transcribe_file(path: Path, offset: float) -> list[TranscriptSegment]:
                # Await the async client on the caller's event loop. The
                # previous implementation created a private event loop inside
                # a worker thread per call and drove this shared AsyncOpenAI
                # client from it — httpx connection pools are bound to one
                # loop, so cross-loop use is unsafe (and the extra loops were
                # pure overhead).
                with open(path, "rb") as fobj:
                    kwargs: dict[str, Any] = {
                        "model": "whisper-1",
                        "file": fobj,
                        "response_format": "verbose_json",
                        "timestamp_granularities": ["segment"],
                    }
                    if whisper_language:
                        kwargs["language"] = whisper_language
                    resp = await client.audio.transcriptions.create(**kwargs)
                return self._parse_segments(resp, offset)

            file_size = audio_path.stat().st_size

            if file_size <= _WHISPER_MAX_BYTES:
                segments = await _transcribe_file(audio_path, 0.0)
            else:
                logger.info(
                    "audio_whisper_splitting_large_file",
                    size_mb=round(file_size / (1024 * 1024), 1),
                )
                with tempfile.TemporaryDirectory(prefix="whisper_chunks_") as td:
                    chunk_pattern = os.path.join(td, "chunk_%04d.mp3")
                    # ffmpeg segment muxer: stream-copy into fixed-length
                    # chunks, resetting each chunk's timestamps to zero.
                    # Only this blocking subprocess call is sent to a thread.
                    await asyncio.to_thread(
                        subprocess.run,
                        [
                            "ffmpeg",
                            "-i",
                            str(audio_path),
                            "-f",
                            "segment",
                            "-segment_time",
                            str(_WHISPER_CHUNK_SECS),
                            "-c",
                            "copy",
                            "-reset_timestamps",
                            "1",
                            chunk_pattern,
                        ],
                        check=True,
                        capture_output=True,
                    )
                    chunk_files = sorted(glob.glob(os.path.join(td, "chunk_*.mp3")))
                    segments = []
                    offset = 0.0
                    for chunk_path in chunk_files:
                        segs = await _transcribe_file(Path(chunk_path), offset)
                        if segs:
                            # NOTE(review): a chunk that yields no segments
                            # does not advance the offset, so later chunks
                            # would drift early — preserved from the original
                            # implementation.
                            offset = segs[-1].end
                        segments.extend(segs)

            full_text = " ".join(s.text for s in segments)
            return full_text, segments

        except Exception:
            # Deliberate best-effort catch-all: log with traceback and
            # report an empty transcript instead of raising.
            logger.warning(
                "audio_whisper_transcription_failed",
                audio_path=str(audio_path),
                exc_info=True,
            )
            return "", []
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""BatcherAdapter — splits a list of items into fixed-size batches."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BatcherAdapter:
    """Group a flat sequence of items into fixed-size consecutive batches.

    Usage::

        adapter = BatcherAdapter(batch_size=50)
        result = adapter.batch(items)
        # result == {"batches": [[...], [...]]}
    """

    def __init__(self, batch_size: int = 50) -> None:
        # Reject non-positive sizes up front so batch() can assume a valid step.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.batch_size = batch_size

    def batch(self, items: list[Any]) -> dict[str, list[list[Any]]]:
        """Split *items* into consecutive slices of at most ``batch_size``.

        Args:
            items: Flat list of items to batch.

        Returns:
            ``{"batches": [[item, ...], ...]}`` — the last batch may be
            shorter; an empty input yields an empty batch list.
        """
        step = self.batch_size
        grouped: list[list[Any]] = []
        cursor = 0
        total = len(items)
        while cursor < total:
            grouped.append(items[cursor : cursor + step])
            cursor += step
        return {"batches": grouped}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Fetches article content via Cloudflare Browser Rendering and LLM extraction."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
import structlog
|
|
9
|
+
from langchain_core.messages import HumanMessage, SystemMessage
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from .models import ArticleContent, ArticleResult
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)

# Cloudflare Browser Rendering "markdown" endpoint; {account_id} is filled
# in per request from the CLOUDFLARE_ACCOUNT_ID environment variable.
_CF_MARKDOWN_URL = (
    "https://api.cloudflare.com/client/v4/accounts"
    "/{account_id}/browser-rendering/markdown"
)

# System prompt steering the LLM toward clean title/body extraction.
_EXTRACTION_PROMPT = (
    "Extract the article title and the main body content verbatim. "
    "Ignore ads, navigation, comments, and unrelated sections. "
    "Return clean readable text."
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _ArticleSchema(BaseModel):
    """Schema for structured LLM output.

    The ``Field`` descriptions are included in the JSON schema sent to the
    model, so they double as extraction instructions.
    """

    title: str = Field(description="The main headline of the article")
    content: str = Field(description="The full main article text content")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DigitalNewsAdapter:
    """Turn a news URL into clean article text.

    The page is rendered to markdown by Cloudflare Browser Rendering, then a
    LangChain chat model extracts the title and body via structured output.
    Any LangChain ``BaseChatModel`` works — OpenAI, Anthropic, Ollama,
    Gemini, or any other provider LangChain supports.

    Requires the ``CLOUDFLARE_ACCOUNT_ID`` and ``CLOUDFLARE_BEARER_TOKEN``
    environment variables.

    Usage::

        # With an explicit LLM
        from langchain_openai import ChatOpenAI
        adapter = DigitalNewsAdapter(llm=ChatOpenAI(model="gpt-5.4-nano"))

        # With the project default
        from pulse_engine.processor.llm.provider import get_llm
        adapter = DigitalNewsAdapter(llm=get_llm())

        # Ollama
        from langchain_ollama import ChatOllama
        adapter = DigitalNewsAdapter(llm=ChatOllama(model="llama3"))
    """

    def __init__(
        self,
        llm: Any,
        cloudflare_timeout: float = 60.0,
    ) -> None:
        self._llm = llm
        self._cloudflare_timeout = cloudflare_timeout

        # Fail fast at construction if Cloudflare credentials are missing.
        account_id = os.environ.get("CLOUDFLARE_ACCOUNT_ID")
        if not account_id:
            raise OSError("CLOUDFLARE_ACCOUNT_ID environment variable is required")
        self._account_id = account_id

        bearer_token = os.environ.get("CLOUDFLARE_BEARER_TOKEN")
        if not bearer_token:
            raise OSError("CLOUDFLARE_BEARER_TOKEN environment variable is required")
        self._bearer_token = bearer_token

    async def fetch(self, url: str) -> ArticleContent:
        """Render *url* to markdown and extract its title and body."""
        rendered = await self._fetch_markdown(url)
        return await self._extract_content(rendered)

    async def fetch_many(
        self,
        urls: list[str],
        concurrency: int = 5,
    ) -> list[ArticleResult]:
        """Fetch every URL with at most *concurrency* requests in flight.

        Results are returned in input order; a per-URL failure is captured
        in ``ArticleResult.error`` rather than raised.
        """
        if not urls:
            return []

        gate = asyncio.Semaphore(concurrency)

        async def _fetch_one(target: str) -> ArticleResult:
            async with gate:
                try:
                    article = await self.fetch(target)
                except Exception as exc:
                    logger.warning("fetch_failed", url=target, error=str(exc))
                    return ArticleResult(url=target, content=None, error=str(exc))
                return ArticleResult(url=target, content=article, error=None)

        return list(await asyncio.gather(*(_fetch_one(u) for u in urls)))

    async def _fetch_markdown(self, url: str) -> str:
        """POST *url* to the Cloudflare markdown endpoint; return the markdown."""
        endpoint = _CF_MARKDOWN_URL.format(account_id=self._account_id)
        headers = {
            "Authorization": f"Bearer {self._bearer_token}",
            "Content-Type": "application/json",
        }
        async with httpx.AsyncClient(
            timeout=self._cloudflare_timeout, follow_redirects=True
        ) as client:
            resp = await client.post(endpoint, headers=headers, json={"url": url})
            resp.raise_for_status()

            markdown: str = resp.json().get("result", "")
            if not markdown or not markdown.strip():
                raise RuntimeError(f"Empty markdown returned by Cloudflare for {url}")
            return markdown

    async def _extract_content(self, markdown: str) -> ArticleContent:
        """Ask the LLM for a structured title/body extraction of *markdown*."""
        extractor = self._llm.with_structured_output(_ArticleSchema)
        parsed: _ArticleSchema = await extractor.ainvoke(
            [
                SystemMessage(content=_EXTRACTION_PROMPT),
                HumanMessage(content=markdown),
            ]
        )
        return ArticleContent(title=parsed.title, content=parsed.content)
|