pulse-engine 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,160 @@
1
+ """OpenSearchStorageAdapter — upsert documents into an OpenSearch index.
2
+
3
+ Schema (index name, field mappings, id field) is defined at call time so
4
+ products can configure it entirely from pipeline.yaml args.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Any
11
+
12
logger = logging.getLogger(__name__)

# Translation table from the type names accepted in pipeline.yaml to the
# OpenSearch field-mapping snippet emitted for them. Every scalar type maps
# to a mapping of the same name; "object" is special-cased below.
_TYPE_MAP: dict[str, dict[str, Any]] = {
    **{
        scalar: {"type": scalar}
        for scalar in (
            "text",
            "keyword",
            "date",
            "integer",
            "long",
            "float",
            "boolean",
        )
    },
    # stored but not indexed
    "object": {"type": "object", "enabled": False},
}
25
+
26
+
27
class OpenSearchStorageAdapter:
    """Upsert documents into an OpenSearch index.

    The adapter only stores connection settings; each operation builds its
    own ``AsyncOpenSearch`` client and uses it as an async context manager
    for the duration of that operation.

    Args:
        url: OpenSearch endpoint URL (trailing slashes are stripped).
        username: HTTP basic auth username (optional).
        password: HTTP basic auth password (optional).
        use_ssl: Whether to use SSL. Default True.
        verify_certs: Whether to verify SSL certificates. Default True.
        index_prefix: Prefix prepended to index name. Default "".

    Usage::

        adapter = OpenSearchStorageAdapter(url="https://...")
        await adapter.ensure_index(
            index="pulse_videos",
            id_field="video_id",
            mappings={"video_id": "keyword", "title": "text", ...},
        )
        result = await adapter.upsert_batch(
            index="pulse_videos",
            id_field="video_id",
            documents=[{"video_id": "abc", "title": "...", ...}],
        )
    """

    def __init__(
        self,
        url: str,
        username: str = "",
        password: str = "",
        use_ssl: bool = True,
        verify_certs: bool = True,
        index_prefix: str = "",
    ) -> None:
        self._url = url.rstrip("/")
        self._username = username
        self._password = password
        self._use_ssl = use_ssl
        self._verify_certs = verify_certs
        self._index_prefix = index_prefix

    def _client(self) -> Any:
        """Build a fresh ``AsyncOpenSearch`` client from the stored settings.

        Basic auth is attached only when both username and password are set.
        """
        # Imported lazily so the module can be imported without opensearchpy.
        from opensearchpy import AsyncOpenSearch

        kwargs: dict[str, Any] = {
            "hosts": [self._url],
            "use_ssl": self._use_ssl,
            "verify_certs": self._verify_certs,
        }
        if self._username and self._password:
            kwargs["http_auth"] = (self._username, self._password)
        return AsyncOpenSearch(**kwargs)

    def _index_name(self, index: str) -> str:
        """Return *index* with the configured prefix (possibly empty) applied."""
        return f"{self._index_prefix}{index}"

    @staticmethod
    def _build_properties(mappings: dict[str, str]) -> dict[str, Any]:
        """Translate ``{field: type_name}`` into OpenSearch ``properties``.

        Unknown type names fall back to ``keyword`` (with a warning). Each
        field gets its own dict so callers may mutate the result without
        touching the shared ``_TYPE_MAP`` table.
        """
        properties: dict[str, Any] = {}
        for field, type_name in mappings.items():
            spec = _TYPE_MAP.get(type_name)
            if spec is None:
                logger.warning(
                    "Unknown mapping type '%s' for field '%s'; falling back to 'keyword'",
                    type_name,
                    field,
                )
                spec = {"type": "keyword"}
            properties[field] = dict(spec)
        return properties

    @staticmethod
    def _build_actions(
        name: str,
        id_field: str,
        documents: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Build bulk ``index`` actions for every document that carries an id.

        A document is skipped only when its id is absent, ``None`` or the
        empty string; falsy-but-valid ids such as ``0`` are kept.
        """
        actions: list[dict[str, Any]] = []
        for doc in documents:
            doc_id = doc.get(id_field)
            if doc_id is None or doc_id == "":
                continue
            actions.append(
                {
                    "_op_type": "index",
                    "_index": name,
                    "_id": str(doc_id),
                    "_source": doc,
                }
            )
        return actions

    async def ensure_index(
        self,
        index: str,
        id_field: str,
        mappings: dict[str, str],
    ) -> None:
        """Create index with mappings if it doesn't exist.

        Args:
            index: Index name without prefix.
            id_field: Not used by this method; accepted so the signature
                lines up with ``upsert_batch`` / ``store``.
            mappings: ``{field: type_name}`` using the pipeline.yaml type names.
        """
        name = self._index_name(index)
        body = {
            "settings": {"number_of_shards": 1, "number_of_replicas": 1},
            "mappings": {"properties": self._build_properties(mappings)},
        }
        async with self._client() as client:
            exists = await client.indices.exists(index=name)
            if not exists:
                await client.indices.create(index=name, body=body)
                logger.info("Created OpenSearch index '%s'", name)
            else:
                logger.debug("OpenSearch index '%s' already exists", name)

    async def upsert_batch(
        self,
        index: str,
        id_field: str,
        documents: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Bulk upsert documents. Uses id_field value as document _id.

        Documents whose id is missing, ``None`` or empty are skipped and
        counted in a warning. When nothing remains to send, no client is
        opened at all.

        Returns:
            {"indexed": N, "errors": [...]}
        """
        name = self._index_name(index)
        actions = self._build_actions(name, id_field, documents)

        skipped = len(documents) - len(actions)
        if skipped:
            logger.warning("Skipped %d documents missing id_field '%s'", skipped, id_field)

        if not actions:
            # Nothing to send: avoid a pointless connection round-trip.
            return {"indexed": 0, "errors": []}

        from opensearchpy.helpers import async_bulk

        errors: list[Any] = []
        async with self._client() as client:
            # raise_on_error=False: per-document failures are returned, not raised.
            success, failed = await async_bulk(
                client, actions, raise_on_error=False, stats_only=False
            )
        if failed:
            errors = failed
            logger.warning("OpenSearch bulk errors: %d failed", len(failed))

        logger.info("Upserted %d documents to index '%s'", success, name)
        return {"indexed": success, "errors": errors}

    async def store(
        self,
        index: str,
        id_field: str,
        mappings: dict[str, str],
        documents: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Ensure index exists then upsert all documents."""
        await self.ensure_index(index=index, id_field=id_field, mappings=mappings)
        return await self.upsert_batch(index=index, id_field=id_field, documents=documents)
@@ -0,0 +1,130 @@
1
+ """Fetches and extracts text content from speech PDFs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import re
7
+
8
+ import httpx
9
+ import structlog
10
+
11
+ from .models import SpeechContent, SpeechMetadata, SpeechResult
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
_DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)"}
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097F]")


def _detect_language(text: str) -> str:
    """Classify *text* as ``"hi"``, ``"mixed"`` or ``"en"``.

    The decision is based on the share of Devanagari code points among all
    non-whitespace characters: above 0.4 is ``"hi"``, above 0.1 is
    ``"mixed"``, anything else (including empty or whitespace-only text) is
    ``"en"``.
    """
    # Ignore every kind of whitespace, not just the space character: text
    # with many newlines or tabs would otherwise inflate the denominator
    # and push mostly-Devanagari text below the thresholds.
    total = sum(1 for ch in text if not ch.isspace())
    if not total:
        return "en"

    ratio = len(_DEVANAGARI_RE.findall(text)) / total
    if ratio > 0.4:
        return "hi"
    if ratio > 0.1:
        return "mixed"
    return "en"
30
+
31
+
32
class SpeechContentAdapter:
    """Fetches PDF text for speeches discovered by ``SpeechMetadataAdapter``.

    Mirrors the ``DigitalNewsAdapter`` interface: call ``fetch()`` for a single
    speech or ``fetch_many()`` for a batch.

    Requires ``pymupdf`` (``fitz``) to be installed for PDF extraction.

    Args:
        concurrency: Default number of speeches processed at once by
            ``fetch_many()``.
        timeout: Timeout in seconds passed to the HTTP client used for the
            PDF download.

    Usage::

        adapter = SpeechContentAdapter()
        content = await adapter.fetch(metadata)

        results = await adapter.fetch_many(metadata_list, concurrency=4)
    """

    def __init__(self, concurrency: int = 4, timeout: float = 60.0) -> None:
        self._default_concurrency = concurrency
        self._timeout = timeout

    async def fetch(self, speech: SpeechMetadata) -> SpeechContent:
        """Download and extract text from a single speech PDF.

        Raises:
            ValueError: The speech has no ``pdf_url``.
            RuntimeError: Extraction produced no text, or pymupdf is missing.
        """
        if not speech.pdf_url:
            raise ValueError(f"No PDF URL for speech: {speech.url}")

        logger.info("speech_content_fetch_started", url=speech.url, pdf=speech.pdf_url)
        text = await self._extract_pdf(speech.pdf_url)
        if not text:
            raise RuntimeError(
                f"PDF extraction returned empty text for {speech.pdf_url}"
            )

        language = _detect_language(text)
        logger.info(
            "speech_content_fetch_completed",
            url=speech.url,
            chars=len(text),
            language=language,
        )
        return SpeechContent(text=text, language=language, pdf_url=speech.pdf_url)

    async def fetch_many(
        self,
        speeches: list[SpeechMetadata],
        concurrency: int | None = None,
    ) -> list[SpeechResult]:
        """Extract content from multiple speeches concurrently.

        Failures never propagate: any exception raised by ``fetch()``
        (including a missing PDF URL) is logged and reported through the
        ``error`` field of that speech's ``SpeechResult``.
        Preserves input order.

        Args:
            speeches: Speeches to process.
            concurrency: Maximum simultaneous fetches; falls back to the
                constructor default when ``None`` (or 0).
        """
        if not speeches:
            return []

        sem = asyncio.Semaphore(concurrency or self._default_concurrency)

        async def _guarded(speech: SpeechMetadata) -> SpeechResult:
            async with sem:
                try:
                    content = await self.fetch(speech)
                except Exception as exc:
                    # Best-effort batch: one bad PDF must not sink the rest.
                    logger.warning(
                        "speech_content_fetch_failed",
                        url=speech.url,
                        error=str(exc),
                    )
                    return SpeechResult(url=speech.url, content=None, error=str(exc))
                return SpeechResult(url=speech.url, content=content, error=None)

        # gather() returns results in the order the awaitables were passed.
        return list(await asyncio.gather(*(_guarded(s) for s in speeches)))

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _extract_pdf(self, pdf_url: str) -> str:
        """Download PDF bytes and extract plain text via pymupdf.

        Pages whose text is empty or whitespace-only are dropped; the
        remaining pages are joined with a blank line between them.

        Raises:
            RuntimeError: pymupdf is not installed.
            httpx.HTTPStatusError: The download returned an error status.
        """
        try:
            import fitz  # pymupdf
        except ImportError as exc:
            raise RuntimeError(
                "pymupdf is required for PDF extraction: pip install pymupdf"
            ) from exc

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=self._timeout
        ) as client:
            resp = await client.get(pdf_url)
            resp.raise_for_status()
            pdf_bytes = resp.content

        def _parse() -> str:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                # Extract each page once, then discard blank pages.
                texts = [page.get_text() for page in doc]
            finally:
                # Release the document even when extraction fails.
                doc.close()
            return "\n\n".join(t for t in texts if t.strip()).strip()

        # Parsing is synchronous, so run it off the event loop.
        return await asyncio.to_thread(_parse)
@@ -0,0 +1,374 @@
1
+ """Discovers speech metadata from inc.in listing pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from datetime import UTC, datetime, timedelta
8
+
9
+ import httpx
10
+ import structlog
11
+
12
+ from .models import SpeechMetadata
13
+
14
+ logger = structlog.get_logger(__name__)
15
+
16
+ _INC_BASE = "https://www.inc.in/media/speeches"
17
+ _MEDIA_PATH = "/media/speeches/"
18
+ _MAX_PAGES = 200
19
+
20
+ _DATE_RE = re.compile(r"\b(\d{1,2})\s+([A-Za-z]{3,9})\s+(\d{4})\b")
21
+ _YT_RE = re.compile(
22
+ r"(?:https?://)?(?:www\.)?(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
23
+ r"([\w\-]{11})",
24
+ re.I,
25
+ )
26
+ _PDF_RE = re.compile(r"https?://[^\s'\"]+?\.pdf(?:\?[^'\"\s]*)?", re.I)
27
+ _DATE_FORMATS = [
28
+ "%d %B %Y",
29
+ "%d %b %Y",
30
+ "%B %d, %Y",
31
+ "%b %d, %Y",
32
+ "%Y-%m-%d",
33
+ "%d/%m/%Y",
34
+ ]
35
+
36
+ _DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)"}
37
+
38
+
39
+ def _parse_date(text: str | None) -> datetime | None:
40
+ if not text:
41
+ return None
42
+ m = _DATE_RE.search(text)
43
+ if m:
44
+ text = f"{m.group(1)} {m.group(2)} {m.group(3)}"
45
+ for fmt in _DATE_FORMATS:
46
+ try:
47
+ return datetime.strptime(text.strip(), fmt).replace(tzinfo=UTC)
48
+ except ValueError:
49
+ continue
50
+ return None
51
+
52
+
53
+ def _scan_for_media(obj: object) -> tuple[str | None, str | None]:
54
+ """Recursively scan a JSON object for PDF and YouTube URLs."""
55
+ pdf: str | None = None
56
+ yt: str | None = None
57
+
58
+ def walk(node: object) -> None:
59
+ nonlocal pdf, yt
60
+ if pdf and yt:
61
+ return
62
+ if isinstance(node, str):
63
+ if not pdf and ".pdf" in node.lower():
64
+ m = _PDF_RE.search(node)
65
+ if m:
66
+ pdf = m.group(0)
67
+ if not yt:
68
+ m = _YT_RE.search(node)
69
+ if m:
70
+ yt = f"https://www.youtube.com/watch?v={m.group(1)}"
71
+ elif isinstance(node, dict):
72
+ if not pdf:
73
+ u = node.get("url", "")
74
+ if isinstance(u, str) and ".pdf" in u.lower() and u.startswith("http"):
75
+ pdf = u
76
+ if not yt:
77
+ _yt_keys = (
78
+ "youtube_url",
79
+ "youtube",
80
+ "video_url",
81
+ "video_link",
82
+ "embed_url",
83
+ )
84
+ for k in _yt_keys:
85
+ v = node.get(k)
86
+ if isinstance(v, str):
87
+ m = _YT_RE.search(v)
88
+ if m:
89
+ yt = f"https://www.youtube.com/watch?v={m.group(1)}"
90
+ break
91
+ for v in node.values():
92
+ walk(v)
93
+ elif isinstance(node, list | tuple):
94
+ for v in node:
95
+ walk(v)
96
+
97
+ walk(obj)
98
+ return pdf, yt
99
+
100
+
101
class SpeechMetadataAdapter:
    """Discovers speech metadata from the inc.in speeches listing.

    Mirrors the ``DigitalNewsMetadataAdapter`` interface: construct with
    optional settings, call ``fetch()`` to get a list of ``SpeechMetadata``.

    Each entry contains the speech URL, title, speaker, date, PDF URL, and
    YouTube URL (when present). Content extraction (PDF text) is handled
    separately by ``SpeechContentAdapter``.

    Args:
        base_url: Listing root; trailing slashes are stripped.
        concurrency: Maximum number of speech detail pages requested at the
            same time (values below 1 are treated as 1).

    Usage::

        adapter = SpeechMetadataAdapter()
        speeches = await adapter.fetch(since=timedelta(days=7), speaker="rahul")
    """

    def __init__(self, base_url: str = _INC_BASE, concurrency: int = 8) -> None:
        self._base_url = base_url.rstrip("/")
        self._concurrency = max(1, concurrency)

    async def fetch(
        self,
        since: timedelta | None = None,
        speaker: str | None = None,
    ) -> list[SpeechMetadata]:
        """Discover and return filtered speech metadata.

        Args:
            since: Only return speeches published within this window.
                Speeches whose date could not be determined are kept.
            speaker: Case-insensitive substring matched against the speaker
                name and the title.

        Returns:
            List of ``SpeechMetadata`` in a deterministic order (sorted by
            speech URL). Discovery de-duplicates through a set, so the
            order of the listing itself is not retained.
        """
        logger.info(
            "speech_metadata_fetch_started",
            since=str(since),
            speaker=speaker,
        )
        cutoff = (datetime.now(UTC) - since) if since is not None else None
        urls = await self._discover_urls()
        logger.info("speech_urls_discovered", count=len(urls))

        entries = await self._parse_pages(urls)
        logger.info("speech_pages_parsed", total=len(entries))

        filtered = self._apply_filters(entries, cutoff=cutoff, speaker=speaker)
        logger.info(
            "speech_metadata_fetch_completed",
            before=len(entries),
            after=len(filtered),
        )
        return filtered

    # ------------------------------------------------------------------
    # Discovery
    # ------------------------------------------------------------------

    async def _discover_urls(self) -> list[str]:
        """Collect the URLs of all speech pages reachable from the listing.

        Returns the de-duplicated URLs sorted, so repeated runs over the
        same listing produce the same order regardless of hash seeding.
        """
        seen: set[str] = set()

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=30
        ) as client:
            # Page 1 — also extract buildId for JSON API route.
            build_id = await self._scrape_html_page(client, self._base_url, seen)

            if isinstance(build_id, str):
                # JSON API is complete — no need for HTML pagination.
                await self._scrape_json_pages(client, build_id, seen)
            else:
                # JSON unavailable — fall back to HTML pagination.
                # Break as soon as a page returns no speech links at all.
                for page in range(2, _MAX_PAGES + 1):
                    url = f"{self._base_url}/page/{page}"
                    had_links = await self._scrape_html_page(
                        client, url, seen, return_had_links=True
                    )
                    if not had_links:
                        break

        return sorted(seen)

    async def _scrape_html_page(
        self,
        client: httpx.AsyncClient,
        url: str,
        seen: set[str],
        return_had_links: bool = False,
    ) -> str | None | bool:
        """Scrape one HTML listing page, add speech URLs to *seen*.

        When *return_had_links* is True, returns a bool indicating whether any
        speech links were found on the page (used to detect end of pagination).
        Otherwise returns the Next.js buildId if found, else None.

        A non-200 response or an ``httpx.HTTPError`` is treated as "nothing
        found" (``False`` / ``None`` respectively) rather than raised.
        """
        failure: bool | None = False if return_had_links else None
        try:
            resp = await client.get(url)
        except httpx.HTTPError:
            return failure
        if resp.status_code != 200:
            return failure

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        had_links = False
        for a in soup.find_all("a", href=True):
            href = str(a["href"])
            # Require something after the "/media/speeches/" prefix's slashes.
            if href.startswith(_MEDIA_PATH) and href.count("/") >= 3:
                seen.add(f"https://www.inc.in{href}")
                had_links = True

        if return_had_links:
            return had_links

        build_id: str | None = None
        script = soup.find("script", id="__NEXT_DATA__", type="application/json")
        if script:
            try:
                data = json.loads(script.string or script.get_text())
                build_id = data.get("buildId")
                self._collect_urls_from_obj(data, seen)
            except (json.JSONDecodeError, AttributeError):
                # Best effort: links gathered from the anchors above still count.
                logger.debug("speech_listing_next_data_invalid", url=url, exc_info=True)
        return build_id

    async def _scrape_json_pages(
        self,
        client: httpx.AsyncClient,
        build_id: str,
        seen: set[str],
    ) -> None:
        """Page through the Next.js data route, adding speech URLs to *seen*.

        Stops at the first non-200 response, on any error, after two
        consecutive pages that add nothing new, or after ``_MAX_PAGES``.
        """
        base = f"https://www.inc.in/_next/data/{build_id}/media/speeches.json"
        no_progress = 0
        for page in range(1, _MAX_PAGES + 1):
            try:
                resp = await client.get(f"{base}?page={page}")
                if resp.status_code != 200:
                    break
                before = len(seen)
                self._collect_urls_from_obj(resp.json(), seen)
                added = len(seen) - before
                logger.debug("json_page", page=page, added=added, total=len(seen))
                no_progress = 0 if added else no_progress + 1
                if no_progress >= 2:
                    break
            except Exception:
                # Deliberately broad: discovery is best-effort, so any failure
                # simply ends pagination with what has been collected so far.
                logger.debug("json_page_failed", page=page, exc_info=True)
                break

    @staticmethod
    def _collect_urls_from_obj(obj: object, seen: set[str]) -> None:
        """Recursively harvest speech URLs from a decoded JSON value.

        Two shapes are recognised: dicts with a single-segment string
        ``uid`` (turned into a speech URL), and strings that already start
        with the speeches path.
        """
        if isinstance(obj, dict):
            uid = obj.get("uid")
            if isinstance(uid, str) and uid and "/" not in uid.strip("/"):
                seen.add(f"https://www.inc.in{_MEDIA_PATH}{uid.strip('/')}")
            for v in obj.values():
                SpeechMetadataAdapter._collect_urls_from_obj(v, seen)
        elif isinstance(obj, list | tuple):
            for v in obj:
                SpeechMetadataAdapter._collect_urls_from_obj(v, seen)
        elif isinstance(obj, str) and obj.strip().startswith(_MEDIA_PATH):
            seen.add(f"https://www.inc.in{obj.strip()}")

    # ------------------------------------------------------------------
    # Page parsing
    # ------------------------------------------------------------------

    async def _parse_pages(self, urls: list[str]) -> list[SpeechMetadata]:
        """Fetch and parse every speech page, keeping the order of *urls*.

        Pages that fail or yield nothing are omitted from the result. At
        most ``self._concurrency`` pages are in flight at once, so a large
        listing does not queue requests behind the HTTP connection pool.
        """
        import asyncio

        gate = asyncio.Semaphore(self._concurrency)

        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=30
        ) as client:

            async def _bounded(url: str) -> SpeechMetadata | None:
                async with gate:
                    return await self._parse_one(client, url)

            results = await asyncio.gather(
                *(_bounded(url) for url in urls), return_exceptions=True
            )

        entries: list[SpeechMetadata] = []
        for url, result in zip(urls, results):
            if isinstance(result, BaseException):
                logger.warning("speech_page_parse_failed", url=url, error=str(result))
            elif result is not None:
                entries.append(result)
        return entries

    async def _parse_one(
        self, client: httpx.AsyncClient, url: str
    ) -> SpeechMetadata | None:
        """Fetch one speech page and extract its metadata.

        Returns ``None`` on a non-200 response or an ``httpx.HTTPError``.
        Fields come from the page's ``__NEXT_DATA__`` JSON when available,
        with the first heading and the first date found in the page text
        as fallbacks for title and date.
        """
        try:
            resp = await client.get(url)
        except httpx.HTTPError as exc:
            logger.debug("speech_page_http_error", url=url, error=str(exc))
            return None
        if resp.status_code != 200:
            return None

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(resp.text, "html.parser")
        title = speaker = date_raw = pdf_url = yt_url = None

        script = soup.find("script", id="__NEXT_DATA__", type="application/json")
        if script:
            try:
                data = json.loads(script.string or script.get_text())
                pp = (data.get("props") or {}).get("pageProps") or {}
                obj = (
                    pp.get("objspeeches")
                    or pp.get("objSpeeches")
                    or pp.get("speech")
                    or pp
                )
                if isinstance(obj, dict):
                    # NOTE(review): "title" feeding speaker and "expert" feeding
                    # title presumably mirrors the site's own schema — confirm.
                    speaker = obj.get("title") or obj.get("speaker")
                    title = obj.get("expert") or obj.get("title") or obj.get("heading")
                    date_raw = (
                        obj.get("publishdate")
                        or obj.get("publishedate")
                        or obj.get("date")
                    )
                    speech_doc = obj.get("SpeechDocument")
                    if isinstance(speech_doc, dict):
                        pdf_url = speech_doc.get("url")
                    yt_url = obj.get("video_link")
                # Explicit fields win; a scan of the whole payload fills gaps.
                _pdf, _yt = _scan_for_media(data)
                pdf_url = pdf_url or _pdf
                yt_url = yt_url or _yt
            except (json.JSONDecodeError, AttributeError):
                # Best effort: fall through to the HTML fallbacks below.
                logger.debug("speech_page_next_data_invalid", url=url, exc_info=True)

        if not title:
            h = soup.find(["h1", "h2"])
            title = h.get_text(strip=True) if h else None
        if not date_raw:
            m = _DATE_RE.search(soup.get_text(" ", strip=True))
            if m:
                date_raw = f"{m.group(1)} {m.group(2)} {m.group(3)}"

        return SpeechMetadata(
            url=url,
            title=title,
            speaker=speaker,
            speech_date=_parse_date(date_raw),
            pdf_url=pdf_url,
            youtube_url=yt_url,
            language=None,  # detected after content extraction
        )

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    @staticmethod
    def _apply_filters(
        entries: list[SpeechMetadata],
        cutoff: datetime | None,
        speaker: str | None,
    ) -> list[SpeechMetadata]:
        """Apply the optional date and speaker filters, preserving order.

        Entries without a ``speech_date`` pass the date filter. The speaker
        keyword is matched case-insensitively against both the speaker and
        the title.
        """
        result = entries

        if cutoff is not None:
            result = [
                e for e in result if e.speech_date is None or e.speech_date >= cutoff
            ]

        if speaker is not None:
            kw = speaker.lower()
            result = [
                e
                for e in result
                if kw in (e.speaker or "").lower() or kw in (e.title or "").lower()
            ]

        return result