pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""OpenSearchStorageAdapter — upsert documents into an OpenSearch index.
|
|
2
|
+
|
|
3
|
+
Schema (index name, field mappings, id field) is defined at call time so
|
|
4
|
+
products can configure it entirely from pipeline.yaml args.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Maps pipeline.yaml type names → OpenSearch mapping types.
# Each value is the mapping fragment stored under
# ``mappings.properties.<field>`` when an index is created. Type names that
# are not listed here are mapped to ``keyword`` by ``_build_properties``.
_TYPE_MAP: dict[str, dict[str, Any]] = {
    "text": {"type": "text"},
    "keyword": {"type": "keyword"},
    "date": {"type": "date"},
    "integer": {"type": "integer"},
    "long": {"type": "long"},
    "float": {"type": "float"},
    "boolean": {"type": "boolean"},
    "object": {"type": "object", "enabled": False},  # stored but not indexed
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class OpenSearchStorageAdapter:
    """Upsert documents into an OpenSearch index.

    The adapter only stores connection settings. Every operation opens its
    own ``AsyncOpenSearch`` client and closes it when the operation ends.

    Args:
        url: OpenSearch endpoint URL. A trailing slash is stripped.
        username: HTTP basic auth username (optional).
        password: HTTP basic auth password (optional). Credentials are only
            sent when both username and password are non-empty.
        use_ssl: Whether to use SSL. Default True.
        verify_certs: Whether to verify SSL certificates. Default True.
        index_prefix: Prefix prepended to index name. Default "".

    Usage::

        adapter = OpenSearchStorageAdapter(url="https://...")
        await adapter.ensure_index(
            index="pulse_videos",
            id_field="video_id",
            mappings={"video_id": "keyword", "title": "text", ...},
        )
        result = await adapter.upsert_batch(
            index="pulse_videos",
            id_field="video_id",
            documents=[{"video_id": "abc", "title": "...", ...}],
        )
    """

    def __init__(
        self,
        url: str,
        username: str = "",
        password: str = "",
        use_ssl: bool = True,
        verify_certs: bool = True,
        index_prefix: str = "",
    ) -> None:
        self._url = url.rstrip("/")
        self._username = username
        self._password = password
        self._use_ssl = use_ssl
        self._verify_certs = verify_certs
        self._index_prefix = index_prefix

    def _client(self) -> Any:
        """Build a new ``AsyncOpenSearch`` client from the stored settings."""
        # Imported lazily so the module can be imported without opensearchpy.
        from opensearchpy import AsyncOpenSearch

        kwargs: dict[str, Any] = {
            "hosts": [self._url],
            "use_ssl": self._use_ssl,
            "verify_certs": self._verify_certs,
        }
        if self._username and self._password:
            kwargs["http_auth"] = (self._username, self._password)
        return AsyncOpenSearch(**kwargs)

    def _index_name(self, index: str) -> str:
        """Return *index* with the configured prefix (if any) prepended."""
        return f"{self._index_prefix}{index}" if self._index_prefix else index

    @staticmethod
    def _build_properties(mappings: dict[str, str]) -> dict[str, Any]:
        """Translate ``{field: type_name}`` into OpenSearch ``properties``.

        Unknown type names fall back to ``keyword`` (and are logged). Each
        returned fragment is a fresh dict so callers can never mutate the
        shared ``_TYPE_MAP`` entries.
        """
        properties: dict[str, Any] = {}
        for field, type_name in mappings.items():
            spec = _TYPE_MAP.get(type_name)
            if spec is None:
                logger.warning(
                    "Unknown mapping type '%s' for field '%s'; using keyword",
                    type_name,
                    field,
                )
                spec = {"type": "keyword"}
            properties[field] = dict(spec)
        return properties

    @staticmethod
    def _build_actions(
        name: str,
        id_field: str,
        documents: list[dict[str, Any]],
    ) -> tuple[list[dict[str, Any]], int]:
        """Build bulk ``index`` actions for *documents*.

        A document is skipped only when its *id_field* is absent, ``None`` or
        an empty string. Falsy-but-valid ids such as ``0`` are kept.

        Returns:
            ``(actions, skipped)`` where *skipped* is the number of documents
            that had no usable id.
        """
        actions: list[dict[str, Any]] = []
        for doc in documents:
            doc_id = doc.get(id_field)
            if doc_id is None or doc_id == "":
                continue
            actions.append(
                {
                    "_op_type": "index",
                    "_index": name,
                    "_id": str(doc_id),
                    "_source": doc,
                }
            )
        return actions, len(documents) - len(actions)

    async def ensure_index(
        self,
        index: str,
        id_field: str,
        mappings: dict[str, str],
    ) -> None:
        """Create index with mappings if it doesn't exist.

        Args:
            index: Index name without prefix.
            id_field: Not used here; accepted so the signature mirrors
                ``upsert_batch`` and ``store``.
            mappings: ``{field: type_name}`` using the keys of ``_TYPE_MAP``.
        """
        name = self._index_name(index)
        body = {
            "settings": {"number_of_shards": 1, "number_of_replicas": 1},
            "mappings": {"properties": self._build_properties(mappings)},
        }
        async with self._client() as client:
            exists = await client.indices.exists(index=name)
            if not exists:
                await client.indices.create(index=name, body=body)
                logger.info("Created OpenSearch index '%s'", name)
            else:
                logger.debug("OpenSearch index '%s' already exists", name)

    async def upsert_batch(
        self,
        index: str,
        id_field: str,
        documents: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Bulk upsert documents. Uses id_field value as document _id.

        Documents without a usable id are skipped and counted in a warning.
        When nothing is left to send, no connection is opened.

        Returns:
            {"indexed": N, "errors": [...]}
        """
        from opensearchpy.helpers import async_bulk

        name = self._index_name(index)
        actions, skipped = self._build_actions(name, id_field, documents)
        if skipped:
            logger.warning("Skipped %d documents missing id_field '%s'", skipped, id_field)

        success = 0
        errors: list[Any] = []
        if actions:
            async with self._client() as client:
                # raise_on_error=False: per-document failures are returned
                # to the caller instead of aborting the whole batch.
                success, failed = await async_bulk(
                    client, actions, raise_on_error=False, stats_only=False
                )
            if failed:
                errors = failed
                logger.warning("OpenSearch bulk errors: %d failed", len(failed))

        logger.info("Upserted %d documents to index '%s'", success, name)
        return {"indexed": success, "errors": errors}

    async def store(
        self,
        index: str,
        id_field: str,
        mappings: dict[str, str],
        documents: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Ensure index exists then upsert all documents.

        Returns:
            The result dict of ``upsert_batch``.
        """
        await self.ensure_index(index=index, id_field=id_field, mappings=mappings)
        return await self.upsert_batch(index=index, id_field=id_field, documents=documents)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Fetches and extracts text content from speech PDFs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
import structlog
|
|
10
|
+
|
|
11
|
+
from .models import SpeechContent, SpeechMetadata, SpeechResult
|
|
12
|
+
|
|
13
|
+
logger = structlog.get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
_DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)"}
|
|
16
|
+
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _detect_language(text: str) -> str:
|
|
20
|
+
deva = len(_DEVANAGARI_RE.findall(text))
|
|
21
|
+
total = len(text.replace(" ", ""))
|
|
22
|
+
if not total:
|
|
23
|
+
return "en"
|
|
24
|
+
ratio = deva / total
|
|
25
|
+
if ratio > 0.4:
|
|
26
|
+
return "hi"
|
|
27
|
+
if ratio > 0.1:
|
|
28
|
+
return "mixed"
|
|
29
|
+
return "en"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SpeechContentAdapter:
    """Fetches PDF text for speeches discovered by ``SpeechMetadataAdapter``.

    Mirrors the ``DigitalNewsAdapter`` interface: call ``fetch()`` for a single
    speech or ``fetch_many()`` for a batch.

    Requires ``pymupdf`` (``fitz``) to be installed for PDF extraction.

    Args:
        concurrency: Default number of simultaneous fetches in ``fetch_many``.
        timeout: Timeout in seconds passed to the HTTP client.

    Usage::

        adapter = SpeechContentAdapter()
        content = await adapter.fetch(metadata)

        results = await adapter.fetch_many(metadata_list, concurrency=4)
    """

    def __init__(self, concurrency: int = 4, timeout: float = 60.0) -> None:
        self._default_concurrency = concurrency
        self._timeout = timeout

    async def fetch(self, speech: SpeechMetadata) -> SpeechContent:
        """Download and extract text from a single speech PDF.

        Raises:
            ValueError: If *speech* has no PDF URL.
            RuntimeError: If pymupdf is missing or the PDF yields no text.

        Errors from the download or the PDF parser propagate unchanged.
        """
        if not speech.pdf_url:
            raise ValueError(f"No PDF URL for speech: {speech.url}")

        logger.info("speech_content_fetch_started", url=speech.url, pdf=speech.pdf_url)
        text = await self._extract_pdf(speech.pdf_url)
        if not text:
            raise RuntimeError(
                f"PDF extraction returned empty text for {speech.pdf_url}"
            )

        language = _detect_language(text)
        logger.info(
            "speech_content_fetch_completed",
            url=speech.url,
            chars=len(text),
            language=language,
        )
        return SpeechContent(text=text, language=language, pdf_url=speech.pdf_url)

    async def fetch_many(
        self,
        speeches: list[SpeechMetadata],
        concurrency: int | None = None,
    ) -> list[SpeechResult]:
        """Extract content from multiple speeches concurrently.

        Failures never abort the batch: each speech yields a ``SpeechResult``
        carrying either its content or the error message. Speeches without a
        PDF URL therefore come back as errors. Preserves input order.

        Args:
            speeches: Speeches to fetch.
            concurrency: Maximum simultaneous fetches; ``None`` (or 0) uses
                the adapter default.
        """
        if not speeches:
            return []

        sem = asyncio.Semaphore(concurrency or self._default_concurrency)

        async def _guarded(speech: SpeechMetadata) -> SpeechResult:
            async with sem:
                try:
                    content = await self.fetch(speech)
                    return SpeechResult(url=speech.url, content=content, error=None)
                except Exception as exc:
                    # Deliberate best-effort boundary: report, don't raise.
                    logger.warning(
                        "speech_content_fetch_failed",
                        url=speech.url,
                        error=str(exc),
                    )
                    return SpeechResult(url=speech.url, content=None, error=str(exc))

        tasks = [_guarded(s) for s in speeches]
        # gather() returns results in the order the awaitables were given.
        return list(await asyncio.gather(*tasks))

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _join_page_texts(pages: Any) -> str:
        """Join the non-blank page texts of *pages* with blank lines.

        *pages* is any iterable of objects exposing ``get_text()``. Each
        page is extracted exactly once.
        """
        texts = [text for page in pages if (text := page.get_text()).strip()]
        return "\n\n".join(texts).strip()

    async def _extract_pdf(self, pdf_url: str) -> str:
        """Download PDF bytes and extract plain text via pymupdf.

        Raises:
            RuntimeError: If pymupdf is not installed.

        HTTP errors (including non-2xx responses) propagate from httpx.
        """
        try:
            import fitz  # pymupdf
        except ImportError as exc:
            raise RuntimeError(
                "pymupdf is required for PDF extraction: pip install pymupdf"
            ) from exc

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=self._timeout
        ) as client:
            resp = await client.get(pdf_url)
            resp.raise_for_status()
            pdf_bytes = resp.content

        def _parse() -> str:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                return self._join_page_texts(doc)
            finally:
                # Always release the document, even if a page fails to parse.
                doc.close()

        # Parsing runs in a worker thread so it does not block the event loop.
        return await asyncio.to_thread(_parse)
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
"""Discovers speech metadata from inc.in listing pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from datetime import UTC, datetime, timedelta
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
import structlog
|
|
11
|
+
|
|
12
|
+
from .models import SpeechMetadata
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
# Default listing URL used by SpeechMetadataAdapter when none is supplied.
_INC_BASE = "https://www.inc.in/media/speeches"
# Site-relative path prefix that identifies links to speech pages.
_MEDIA_PATH = "/media/speeches/"
# Upper bound on the number of listing pages requested during discovery.
_MAX_PAGES = 200

# "<day> <month name> <year>", e.g. "5 March 2024"; the month is 3-9 letters.
_DATE_RE = re.compile(r"\b(\d{1,2})\s+([A-Za-z]{3,9})\s+(\d{4})\b")
# YouTube watch / embed / shorts / youtu.be links; group 1 is the
# 11-character video id.
_YT_RE = re.compile(
    r"(?:https?://)?(?:www\.)?(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
    r"([\w\-]{11})",
    re.I,
)
# Absolute http(s) URL ending in ".pdf", optionally followed by a query string.
_PDF_RE = re.compile(r"https?://[^\s'\"]+?\.pdf(?:\?[^'\"\s]*)?", re.I)
# strptime formats tried, in this order, by _parse_date.
_DATE_FORMATS = [
    "%d %B %Y",
    "%d %b %Y",
    "%B %d, %Y",
    "%b %d, %Y",
    "%Y-%m-%d",
    "%d/%m/%Y",
]

# Headers sent with the HTTP requests made by this module.
_DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _parse_date(text: str | None) -> datetime | None:
    """Parse a human-readable or ISO-8601 date into an aware UTC datetime.

    If *text* contains a ``<day> <month> <year>`` fragment, only that fragment
    is parsed. Otherwise the whole stripped string is tried against
    ``_DATE_FORMATS`` and finally against ``datetime.fromisoformat`` (which
    covers full timestamps such as ``2024-01-15T10:30:00Z``).

    Returns:
        A timezone-aware datetime in UTC, or ``None`` when *text* is empty,
        not a string, or not parseable. Naive results are assumed to be UTC.
    """
    # The value comes from untyped JSON; anything but a non-empty string
    # (numbers, dicts, None) is treated as "no date" instead of raising.
    if not isinstance(text, str) or not text:
        return None

    candidate = text.strip()
    m = _DATE_RE.search(candidate)
    if m:
        candidate = f"{m.group(1)} {m.group(2)} {m.group(3)}"

    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(candidate, fmt).replace(tzinfo=UTC)
        except ValueError:
            continue

    try:
        parsed = datetime.fromisoformat(candidate)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=UTC)
    return parsed.astimezone(UTC)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _scan_for_media(obj: object) -> tuple[str | None, str | None]:
    """Depth-first search of a decoded JSON value for media links.

    Returns:
        ``(pdf_url, youtube_url)``: the first PDF URL and the first YouTube
        link (rewritten to the canonical ``watch?v=`` form) encountered, each
        ``None`` when nothing was found. The walk stops once both are known.
    """
    found: dict[str, str | None] = {"pdf": None, "yt": None}
    youtube_keys = (
        "youtube_url",
        "youtube",
        "video_url",
        "video_link",
        "embed_url",
    )

    def canonical_youtube(candidate: str) -> str | None:
        hit = _YT_RE.search(candidate)
        if hit is None:
            return None
        return f"https://www.youtube.com/watch?v={hit.group(1)}"

    def visit(node: object) -> None:
        if found["pdf"] and found["yt"]:
            return

        if isinstance(node, str):
            if not found["pdf"] and ".pdf" in node.lower():
                hit = _PDF_RE.search(node)
                if hit:
                    found["pdf"] = hit.group(0)
            if not found["yt"]:
                link = canonical_youtube(node)
                if link:
                    found["yt"] = link
            return

        if isinstance(node, dict):
            if not found["pdf"]:
                # A "url" entry is taken verbatim when it looks like a PDF.
                direct = node.get("url", "")
                if (
                    isinstance(direct, str)
                    and ".pdf" in direct.lower()
                    and direct.startswith("http")
                ):
                    found["pdf"] = direct
            if not found["yt"]:
                # Well-known keys are checked before descending into values.
                for key in youtube_keys:
                    value = node.get(key)
                    if not isinstance(value, str):
                        continue
                    link = canonical_youtube(value)
                    if link:
                        found["yt"] = link
                        break
            for child in node.values():
                visit(child)
            return

        if isinstance(node, (list, tuple)):
            for child in node:
                visit(child)

    visit(obj)
    return found["pdf"], found["yt"]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class SpeechMetadataAdapter:
    """Discovers speech metadata from the inc.in speeches listing.

    Mirrors the ``DigitalNewsMetadataAdapter`` interface: construct with
    optional filters, call ``fetch()`` to get a list of ``SpeechMetadata``.

    Each entry contains the speech URL, title, speaker, date, PDF URL, and
    YouTube URL (when present). Content extraction (PDF text) is handled
    separately by ``SpeechContentAdapter``.

    NOTE: discovered links are always resolved against ``https://www.inc.in``,
    whatever ``base_url`` is passed.

    Usage::

        adapter = SpeechMetadataAdapter()
        speeches = await adapter.fetch(since=timedelta(days=7), speaker="rahul")
    """

    # Maximum number of speech pages fetched at the same time. Without a cap
    # every request is started at once; the surplus waits for a free
    # connection and is dropped when the client's pool timeout expires.
    _parse_concurrency: int = 16

    class _OrderedUrlSet(set):
        """A ``set`` that also records the first-insertion order of members."""

        def __init__(self) -> None:
            super().__init__()
            self.ordered: list[str] = []

        def add(self, url: str) -> None:
            if url not in self:
                self.ordered.append(url)
            super().add(url)

    def __init__(self, base_url: str = _INC_BASE) -> None:
        self._base_url = base_url.rstrip("/")

    async def fetch(
        self,
        since: timedelta | None = None,
        speaker: str | None = None,
    ) -> list[SpeechMetadata]:
        """Discover and return filtered speech metadata.

        Args:
            since: Only return speeches published within this window.
                Speeches whose date could not be determined are kept.
            speaker: Case-insensitive substring matched against the speaker
                name or the title.

        Returns:
            List of ``SpeechMetadata``, ordered as discovered from listing.
        """
        logger.info(
            "speech_metadata_fetch_started",
            since=str(since),
            speaker=speaker,
        )
        cutoff = (datetime.now(UTC) - since) if since is not None else None
        urls = await self._discover_urls()
        logger.info("speech_urls_discovered", count=len(urls))

        entries = await self._parse_pages(urls)
        logger.info("speech_pages_parsed", total=len(entries))

        filtered = self._apply_filters(entries, cutoff=cutoff, speaker=speaker)
        logger.info(
            "speech_metadata_fetch_completed",
            before=len(entries),
            after=len(filtered),
        )
        return filtered

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    async def _discover_urls(self) -> list[str]:
        """Collect the unique speech URLs, in the order they were discovered."""
        # A plain set would lose discovery order and yield a different
        # ordering on every run.
        seen = self._OrderedUrlSet()

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=30
        ) as client:
            # Page 1 — also extract buildId for JSON API route.
            build_id = await self._scrape_html_page(client, self._base_url, seen)

            if isinstance(build_id, str):
                # JSON API is complete — no need for HTML pagination.
                await self._scrape_json_pages(client, build_id, seen)
            else:
                # JSON unavailable — fall back to HTML pagination.
                # Break as soon as a page returns no speech links at all.
                for page in range(2, _MAX_PAGES + 1):
                    url = f"{self._base_url}/page/{page}"
                    had_links = await self._scrape_html_page(
                        client, url, seen, return_had_links=True
                    )
                    if not had_links:
                        break

        return list(seen.ordered)

    @staticmethod
    def _speech_url_from_path(path: str) -> str | None:
        """Return the absolute speech URL for a site-relative *path*.

        Returns ``None`` unless *path* starts with ``_MEDIA_PATH`` and names
        something below it. The bare listing root and ``page/<n>`` pagination
        links are listing pages, not speeches, so they are rejected too.
        """
        path = path.strip()
        if not path.startswith(_MEDIA_PATH):
            return None
        remainder = path[len(_MEDIA_PATH):]
        slug = remainder.split("?", 1)[0].split("#", 1)[0].strip("/")
        if not slug or re.fullmatch(r"page/\d+", slug):
            return None
        return f"https://www.inc.in{path}"

    async def _scrape_html_page(
        self,
        client: httpx.AsyncClient,
        url: str,
        seen: set[str],
        return_had_links: bool = False,
    ) -> str | None | bool:
        """Scrape one HTML listing page, add speech URLs to *seen*.

        When *return_had_links* is True, returns a bool indicating whether any
        speech links were found on the page (used to detect end of pagination).
        Otherwise returns the Next.js buildId if found, else None.

        A failed request or non-200 response yields ``False`` / ``None``.
        """
        try:
            resp = await client.get(url)
            if resp.status_code != 200:
                return False if return_had_links else None
        except httpx.HTTPError:
            return False if return_had_links else None

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        had_links = False
        for a in soup.find_all("a", href=True):
            speech_url = self._speech_url_from_path(str(a["href"]))
            if speech_url is not None:
                seen.add(speech_url)
                had_links = True

        if return_had_links:
            return had_links

        build_id: str | None = None
        script = soup.find("script", id="__NEXT_DATA__", type="application/json")
        if script:
            try:
                data = json.loads(script.string or script.get_text())
                build_id = data.get("buildId")
                self._collect_urls_from_obj(data, seen)
            except (json.JSONDecodeError, AttributeError) as exc:
                # Unusable payload: the caller falls back to HTML pagination.
                logger.debug("next_data_unreadable", url=url, error=str(exc))
        return build_id

    async def _scrape_json_pages(
        self,
        client: httpx.AsyncClient,
        build_id: str,
        seen: set[str],
    ) -> None:
        """Page through the Next.js data route, adding speech URLs to *seen*.

        Stops at the first non-200 response or error, after two consecutive
        pages that add nothing new, or after ``_MAX_PAGES`` pages.
        """
        base = f"https://www.inc.in/_next/data/{build_id}/media/speeches.json"
        no_progress = 0
        for page in range(1, _MAX_PAGES + 1):
            try:
                resp = await client.get(f"{base}?page={page}")
                if resp.status_code != 200:
                    break
                before = len(seen)
                self._collect_urls_from_obj(resp.json(), seen)
                added = len(seen) - before
                logger.debug("json_page", page=page, added=added, total=len(seen))
                no_progress = 0 if added else no_progress + 1
                if no_progress >= 2:
                    break
            except Exception:
                # Best-effort: keep what was collected so far.
                logger.debug("json_page_failed", page=page, exc_info=True)
                break

    @staticmethod
    def _collect_urls_from_obj(obj: object, seen: set[str]) -> None:
        """Recursively add speech URLs found in a decoded JSON value to *seen*.

        A dict contributes its ``uid`` when that is a single path segment;
        a string contributes when it is a site-relative speech path.
        """
        if isinstance(obj, dict):
            uid = obj.get("uid")
            if isinstance(uid, str) and uid and "/" not in uid.strip("/"):
                seen.add(f"https://www.inc.in{_MEDIA_PATH}{uid.strip('/')}")
            for v in obj.values():
                SpeechMetadataAdapter._collect_urls_from_obj(v, seen)
        elif isinstance(obj, list | tuple):
            for v in obj:
                SpeechMetadataAdapter._collect_urls_from_obj(v, seen)
        elif isinstance(obj, str):
            speech_url = SpeechMetadataAdapter._speech_url_from_path(obj)
            if speech_url is not None:
                seen.add(speech_url)

    # ------------------------------------------------------------------
    # Page parsing
    # ------------------------------------------------------------------

    async def _parse_pages(self, urls: list[str]) -> list[SpeechMetadata]:
        """Fetch and parse every URL, preserving the order of *urls*.

        At most ``_parse_concurrency`` pages are in flight at a time. Pages
        that fail or cannot be fetched are skipped.
        """
        import asyncio

        if not urls:
            return []

        gate = asyncio.Semaphore(self._parse_concurrency)

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=30
        ) as client:

            async def _bounded(url: str) -> SpeechMetadata | None:
                async with gate:
                    return await self._parse_one(client, url)

            results = await asyncio.gather(
                *(_bounded(url) for url in urls), return_exceptions=True
            )

        entries: list[SpeechMetadata] = []
        for url, result in zip(urls, results, strict=True):
            if isinstance(result, BaseException):
                logger.warning("speech_page_parse_failed", url=url, error=str(result))
            elif result is not None:
                entries.append(result)
        return entries

    async def _parse_one(
        self, client: httpx.AsyncClient, url: str
    ) -> SpeechMetadata | None:
        """Fetch one speech page and extract its metadata.

        Fields are read from the embedded ``__NEXT_DATA__`` JSON when present;
        title and date fall back to the rendered HTML.

        Returns:
            The parsed metadata, or ``None`` when the page could not be
            fetched or did not answer with HTTP 200.
        """
        try:
            resp = await client.get(url)
            if resp.status_code != 200:
                return None
        except httpx.HTTPError as exc:
            logger.debug("speech_page_http_error", url=url, error=str(exc))
            return None

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        title = speaker = date_raw = pdf_url = yt_url = None

        script = soup.find("script", id="__NEXT_DATA__", type="application/json")
        if script:
            try:
                data = json.loads(script.string or script.get_text())
                pp = (data.get("props") or {}).get("pageProps") or {}
                obj = (
                    pp.get("objspeeches")
                    or pp.get("objSpeeches")
                    or pp.get("speech")
                    or pp
                )
                if isinstance(obj, dict):
                    # NOTE(review): the payload's "title" is read as the
                    # speaker and "expert" as the title, presumably matching
                    # the site's schema. Confirm against a live payload.
                    speaker = obj.get("title") or obj.get("speaker")
                    title = obj.get("expert") or obj.get("title") or obj.get("heading")
                    date_raw = (
                        obj.get("publishdate")
                        or obj.get("publishedate")
                        or obj.get("date")
                    )
                    speech_doc = obj.get("SpeechDocument")
                    if isinstance(speech_doc, dict):
                        pdf_url = speech_doc.get("url")
                    yt_url = obj.get("video_link")
                # Explicit fields win; otherwise use any link found anywhere
                # in the payload.
                _pdf, _yt = _scan_for_media(data)
                pdf_url = pdf_url or _pdf
                yt_url = yt_url or _yt
            except (json.JSONDecodeError, AttributeError) as exc:
                # Unusable payload: rely on the HTML fallbacks below.
                logger.debug("speech_page_next_data_unreadable", url=url, error=str(exc))

        if not title:
            h = soup.find(["h1", "h2"])
            title = h.get_text(strip=True) if h else None
        if not date_raw:
            m = _DATE_RE.search(soup.get_text(" ", strip=True))
            if m:
                date_raw = f"{m.group(1)} {m.group(2)} {m.group(3)}"

        return SpeechMetadata(
            url=url,
            title=title,
            speaker=speaker,
            speech_date=_parse_date(date_raw),
            pdf_url=pdf_url,
            youtube_url=yt_url,
            language=None,  # detected after content extraction
        )

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    @staticmethod
    def _apply_filters(
        entries: list[SpeechMetadata],
        cutoff: datetime | None,
        speaker: str | None,
    ) -> list[SpeechMetadata]:
        """Apply the optional date and speaker filters, keeping input order.

        Args:
            entries: Parsed speeches.
            cutoff: Drop speeches dated before this instant; undated speeches
                are kept.
            speaker: Case-insensitive substring that must occur in the
                speaker name or the title.
        """
        result = entries

        if cutoff is not None:
            result = [
                e for e in result if e.speech_date is None or e.speech_date >= cutoff
            ]

        if speaker is not None:
            kw = speaker.lower()
            result = [
                e
                for e in result
                if kw in (e.speaker or "").lower() or kw in (e.title or "").lower()
            ]

        return result
|