pulse_engine-0.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. pulse_engine/__init__.py +0 -0
  2. pulse_engine/adapters/__init__.py +58 -0
  3. pulse_engine/adapters/audio_transcription.py +167 -0
  4. pulse_engine/adapters/batcher.py +36 -0
  5. pulse_engine/adapters/digital_news.py +128 -0
  6. pulse_engine/adapters/digital_news_metadata.py +536 -0
  7. pulse_engine/adapters/exceptions.py +10 -0
  8. pulse_engine/adapters/models.py +134 -0
  9. pulse_engine/adapters/opensearch_storage.py +160 -0
  10. pulse_engine/adapters/speech_content.py +130 -0
  11. pulse_engine/adapters/speech_metadata.py +374 -0
  12. pulse_engine/adapters/twitter.py +423 -0
  13. pulse_engine/adapters/youtube_downloader.py +186 -0
  14. pulse_engine/adapters/youtube_metadata.py +261 -0
  15. pulse_engine/api/__init__.py +0 -0
  16. pulse_engine/api/v1/__init__.py +0 -0
  17. pulse_engine/api/v1/auth.py +91 -0
  18. pulse_engine/api/v1/health.py +62 -0
  19. pulse_engine/api/v1/router.py +16 -0
  20. pulse_engine/chain_recovery.py +131 -0
  21. pulse_engine/cli/__init__.py +0 -0
  22. pulse_engine/cli/main.py +169 -0
  23. pulse_engine/cli/templates/cookiecutter.json +4 -0
  24. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
  25. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
  26. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
  27. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
  28. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
  29. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
  30. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
  31. pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
  32. pulse_engine/client.py +95 -0
  33. pulse_engine/config.py +157 -0
  34. pulse_engine/core/__init__.py +0 -0
  35. pulse_engine/core/error_handlers.py +64 -0
  36. pulse_engine/core/exceptions.py +67 -0
  37. pulse_engine/core/job_token.py +109 -0
  38. pulse_engine/core/logging.py +45 -0
  39. pulse_engine/core/scope.py +23 -0
  40. pulse_engine/core/security.py +130 -0
  41. pulse_engine/database.py +30 -0
  42. pulse_engine/dependencies.py +166 -0
  43. pulse_engine/deployment/__init__.py +0 -0
  44. pulse_engine/deployment/backend_deployment_repository.py +83 -0
  45. pulse_engine/deployment/backends/__init__.py +0 -0
  46. pulse_engine/deployment/backends/base.py +50 -0
  47. pulse_engine/deployment/backends/exceptions.py +20 -0
  48. pulse_engine/deployment/backends/native_lambda.py +125 -0
  49. pulse_engine/deployment/backends/prefect_ecs.py +116 -0
  50. pulse_engine/deployment/backends/prefect_k8s.py +131 -0
  51. pulse_engine/deployment/backends/registry.py +50 -0
  52. pulse_engine/deployment/infra_provisioner.py +285 -0
  53. pulse_engine/deployment/job_launcher.py +178 -0
  54. pulse_engine/deployment/models.py +48 -0
  55. pulse_engine/deployment/repository.py +54 -0
  56. pulse_engine/deployment/router.py +22 -0
  57. pulse_engine/deployment/schemas.py +18 -0
  58. pulse_engine/deployment/service.py +65 -0
  59. pulse_engine/extractor/__init__.py +0 -0
  60. pulse_engine/extractor/adapters/__init__.py +0 -0
  61. pulse_engine/extractor/base.py +48 -0
  62. pulse_engine/extractor/models.py +50 -0
  63. pulse_engine/extractor/orchestrator/__init__.py +15 -0
  64. pulse_engine/extractor/orchestrator/base.py +34 -0
  65. pulse_engine/extractor/orchestrator/noop.py +37 -0
  66. pulse_engine/extractor/orchestrator/prefect.py +163 -0
  67. pulse_engine/extractor/repository.py +163 -0
  68. pulse_engine/extractor/router.py +102 -0
  69. pulse_engine/extractor/schemas.py +93 -0
  70. pulse_engine/extractor/service.py +431 -0
  71. pulse_engine/extractor/stage_models.py +36 -0
  72. pulse_engine/extractor/stage_repository.py +109 -0
  73. pulse_engine/main.py +195 -0
  74. pulse_engine/mcp/__init__.py +0 -0
  75. pulse_engine/mcp/__main__.py +5 -0
  76. pulse_engine/mcp/server.py +108 -0
  77. pulse_engine/mcp/tools_jobs.py +159 -0
  78. pulse_engine/mcp/tools_kb.py +88 -0
  79. pulse_engine/mcp/tools_modules.py +115 -0
  80. pulse_engine/mcp/tools_pipelines.py +215 -0
  81. pulse_engine/mcp/tools_processor.py +208 -0
  82. pulse_engine/middleware/__init__.py +0 -0
  83. pulse_engine/middleware/rate_limit.py +144 -0
  84. pulse_engine/middleware/request_id.py +16 -0
  85. pulse_engine/middleware/security_headers.py +25 -0
  86. pulse_engine/middleware/tenant.py +90 -0
  87. pulse_engine/pipeline/__init__.py +0 -0
  88. pulse_engine/pipeline/config_parser.py +148 -0
  89. pulse_engine/pipeline/expression.py +268 -0
  90. pulse_engine/pipeline/models.py +98 -0
  91. pulse_engine/pipeline/repositories.py +224 -0
  92. pulse_engine/pipeline/router_modules.py +66 -0
  93. pulse_engine/pipeline/router_pipelines.py +198 -0
  94. pulse_engine/pipeline/schemas.py +200 -0
  95. pulse_engine/pipeline/service.py +250 -0
  96. pulse_engine/pipeline/translators/__init__.py +44 -0
  97. pulse_engine/pipeline/translators/airflow_status.py +11 -0
  98. pulse_engine/pipeline/translators/airflow_translator.py +22 -0
  99. pulse_engine/pipeline/translators/base.py +42 -0
  100. pulse_engine/pipeline/translators/prefect_status.py +93 -0
  101. pulse_engine/pipeline/translators/prefect_translator.py +195 -0
  102. pulse_engine/processor/__init__.py +0 -0
  103. pulse_engine/processor/base.py +36 -0
  104. pulse_engine/processor/core/__init__.py +0 -0
  105. pulse_engine/processor/core/analysis.py +148 -0
  106. pulse_engine/processor/core/chunking.py +158 -0
  107. pulse_engine/processor/core/prompts.py +340 -0
  108. pulse_engine/processor/core/topic_splitter.py +105 -0
  109. pulse_engine/processor/defaults/__init__.py +11 -0
  110. pulse_engine/processor/defaults/core_processor.py +12 -0
  111. pulse_engine/processor/defaults/postprocessor.py +12 -0
  112. pulse_engine/processor/defaults/preprocessor.py +12 -0
  113. pulse_engine/processor/llm/__init__.py +0 -0
  114. pulse_engine/processor/llm/provider.py +58 -0
  115. pulse_engine/processor/ocr/gemini.py +52 -0
  116. pulse_engine/processor/pipeline.py +107 -0
  117. pulse_engine/processor/postprocessor/__init__.py +0 -0
  118. pulse_engine/processor/postprocessor/embeddings.py +34 -0
  119. pulse_engine/processor/postprocessor/tasks.py +180 -0
  120. pulse_engine/processor/preprocessor/__init__.py +0 -0
  121. pulse_engine/processor/preprocessor/tasks.py +71 -0
  122. pulse_engine/processor/router.py +192 -0
  123. pulse_engine/processor/schemas.py +167 -0
  124. pulse_engine/registry.py +117 -0
  125. pulse_engine/runners/__init__.py +0 -0
  126. pulse_engine/runners/lambda_runner.py +26 -0
  127. pulse_engine/runners/pipeline_runner.py +43 -0
  128. pulse_engine/runners/prefect_pipeline_flow.py +904 -0
  129. pulse_engine/runners/prefect_runner.py +33 -0
  130. pulse_engine/s3.py +72 -0
  131. pulse_engine/secrets.py +46 -0
  132. pulse_engine/services/__init__.py +0 -0
  133. pulse_engine/services/bootstrap.py +211 -0
  134. pulse_engine/services/opensearch.py +84 -0
  135. pulse_engine/storage/__init__.py +0 -0
  136. pulse_engine/storage/connectors/__init__.py +0 -0
  137. pulse_engine/storage/connectors/athena.py +226 -0
  138. pulse_engine/storage/connectors/base.py +32 -0
  139. pulse_engine/storage/connectors/opensearch.py +344 -0
  140. pulse_engine/storage/knowledge_base.py +68 -0
  141. pulse_engine/storage/router.py +78 -0
  142. pulse_engine/storage/schemas.py +93 -0
  143. pulse_engine/testing/__init__.py +13 -0
  144. pulse_engine/testing/fixtures.py +50 -0
  145. pulse_engine/testing/mocks.py +104 -0
  146. pulse_engine/worker.py +53 -0
  147. pulse_engine-0.2.0.dist-info/METADATA +654 -0
  148. pulse_engine-0.2.0.dist-info/RECORD +150 -0
  149. pulse_engine-0.2.0.dist-info/WHEEL +4 -0
  150. pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
pulse_engine/adapters/digital_news_metadata.py
@@ -0,0 +1,536 @@
+ """Discovers news article URLs and metadata from Google News sitemaps."""
+
+ import re
+ import xml.etree.ElementTree as ET
+ from datetime import UTC, datetime, timedelta
+
+ import httpx
+ import structlog
+
+ from .exceptions import SitemapDiscoveryError
+ from .models import NewsArticleMetadata
+
+ logger = structlog.get_logger(__name__)
+
+ # Regex to extract a year-month from date-encoded sitemap URL filenames.
+ # Matches patterns like: 2026-April-1.xml, 2026-03-1.xml, 2025-December-2.xml
+ _SITEMAP_DATE_RE = re.compile(
+     r"(\d{4})[-_]"
+     r"(January|February|March|April|May|June|July|August|September|October|November|December|"
+     r"01|02|03|04|05|06|07|08|09|10|11|12)"
+     r"(?:[-_]\d+)?\.xml",
+     re.IGNORECASE,
+ )
+
+ _MONTH_NAME_TO_NUM = {
+     "january": 1,
+     "february": 2,
+     "march": 3,
+     "april": 4,
+     "may": 5,
+     "june": 6,
+     "july": 7,
+     "august": 8,
+     "september": 9,
+     "october": 10,
+     "november": 11,
+     "december": 12,
+ }
+
+ _FALLBACK_PATHS = ["/news-sitemap.xml", "/sitemap_news.xml", "/sitemap-news.xml"]
+
+ _DEFAULT_HEADERS = {
+     "User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)",
+ }
+
+ # HTTP status codes that indicate bot-blocking; trigger curl_cffi fallback
+ _BLOCKED_STATUS_CODES = frozenset([403, 429, 503])
+
+
+ class DigitalNewsMetadataAdapter:
+     """Discovers news article URLs and metadata from a domain's Google News sitemaps.
+
+     Two usage modes:
+
+     1. Domain-based discovery (auto-discovers sitemaps via robots.txt / probes):
+        ``DigitalNewsMetadataAdapter("hindustantimes.com")``
+
+     2. Direct sitemap URLs (skips discovery entirely):
+        ``DigitalNewsMetadataAdapter(sitemap_urls=["https://example.com/news-sitemap.xml"])``
+        ``DigitalNewsMetadataAdapter("example.com", sitemap_urls=["https://..."])``
+
+     When ``sitemap_urls`` is provided, ``domain`` is optional and used only for logging.
+
+     Parent sitemap index URLs are fully supported in both modes. If a provided (or
+     discovered) URL points to a ``<sitemapindex>``, all child sitemaps are fetched and
+     parsed recursively. Example:
+
+     .. code-block:: python
+
+         # ET index → resolves sitemap-today.xml, sitemap-yesterday.xml, … automatically
+         adapter = DigitalNewsMetadataAdapter(
+             sitemap_urls=[
+                 "https://economictimes.indiatimes.com/etstatic/sitemaps/et/news/sitemap-index.xml"
+             ]
+         )
+         articles = await adapter.fetch(since=timedelta(days=7))
+     """
+
+     def __init__(
+         self,
+         domain: str | None = None,
+         *,
+         sitemap_urls: list[str] | None = None,
+     ) -> None:
+         if domain is None and not sitemap_urls:
+             raise ValueError("Provide domain, sitemap_urls, or both.")
+         self._sitemap_urls = sitemap_urls
+         # Infer domain from first sitemap URL when not explicitly given
+         if domain is None:
+             from urllib.parse import urlparse
+
+             self.domain = urlparse(sitemap_urls[0]).netloc  # type: ignore[index]
+         else:
+             self.domain = domain
+         self._base_url = f"https://{self.domain}"
+
+     async def fetch(
+         self,
+         since: timedelta | None = None,
+         keywords: list[str] | None = None,
+     ) -> list[NewsArticleMetadata]:
+         """Discover and return filtered news article metadata."""
+         logger.info(
+             "fetch_started", domain=self.domain, since=str(since), keywords=keywords
+         )
+         if self._sitemap_urls is not None:
+             sitemap_urls = self._sitemap_urls
+             logger.info(
+                 "sitemaps_provided",
+                 domain=self.domain,
+                 count=len(sitemap_urls),
+                 urls=sitemap_urls,
+             )
+         else:
+             sitemap_urls = await self._discover_sitemap_urls()
+             logger.info(
+                 "sitemaps_discovered",
+                 domain=self.domain,
+                 count=len(sitemap_urls),
+                 urls=sitemap_urls,
+             )
+         cutoff = (datetime.now(UTC) - since) if since is not None else None
+         entries = await self._resolve_and_parse(sitemap_urls, cutoff=cutoff)
+         logger.info("entries_parsed", domain=self.domain, total_entries=len(entries))
+         filtered = self._apply_filters(entries, since=since, keywords=keywords)
+         logger.info(
+             "entries_filtered",
+             domain=self.domain,
+             before=len(entries),
+             after=len(filtered),
+         )
+         return filtered
+
+     # --- Discovery ---
+
+     async def _discover_sitemap_urls(self) -> list[str]:
+         all_urls: list[str] = []
+         robots_url = f"{self._base_url}/robots.txt"
+         logger.debug("fetching_robots_txt", url=robots_url)
+         try:
+             status, text = await _fetch_url(robots_url)
+             logger.debug("robots_txt_response", status=status)
+             if 200 <= status < 300:
+                 for line in text.splitlines():
+                     stripped = line.strip()
+                     if stripped.lower().startswith("sitemap:"):
+                         url = stripped.split(":", 1)[1].strip()
+                         all_urls.append(url)
+             if all_urls:
+                 news_urls = _filter_news_sitemaps(all_urls)
+                 news_urls = _prioritize_news_sitemaps(news_urls)
+                 logger.info(
+                     "sitemap_directives_found",
+                     domain=self.domain,
+                     total=len(all_urls),
+                     news=len(news_urls),
+                     urls=news_urls,
+                 )
+                 if news_urls:
+                     return news_urls
+             logger.info("no_news_sitemap_directives", domain=self.domain)
+         except Exception as exc:
+             logger.warning(
+                 "robots_txt_fetch_failed", domain=self.domain, error=str(exc)
+             )
+
+         # Fallback probes — only news-related paths
+         logger.info("trying_fallback_probes", domain=self.domain, paths=_FALLBACK_PATHS)
+         found: list[str] = []
+         for path in _FALLBACK_PATHS:
+             probe_url = f"{self._base_url}{path}"
+             try:
+                 status, _ = await _fetch_url(probe_url)
+                 if 200 <= status < 300:
+                     logger.info("fallback_probe_success", url=probe_url)
+                     found.append(probe_url)
+                 else:
+                     logger.debug("fallback_probe_miss", url=probe_url, status=status)
+             except Exception:
+                 logger.debug("fallback_probe_failed", url=probe_url)
+
+         if not found:
+             raise SitemapDiscoveryError(
+                 self.domain,
+                 "No news sitemaps found in robots.txt and all fallback probes failed",
+             )
+         return found
+
+     # --- XML Parsing ---
+
+     def _parse_sitemap_index(self, xml_text: str) -> tuple[bool, list[str]]:
+         try:
+             root = ET.fromstring(xml_text)  # noqa: S314
+         except ET.ParseError:
+             return False, []
+
+         tag = root.tag
+         if "}" in tag:
+             tag = tag.split("}", 1)[1]
+
+         if tag != "sitemapindex":
+             return False, []
+
+         ns = _detect_sm_ns(root)
+         urls: list[str] = []
+         for sitemap_el in root.findall(f"{ns}sitemap"):
+             loc = sitemap_el.find(f"{ns}loc")
+             if loc is not None and loc.text:
+                 urls.append(loc.text.strip())
+         return True, urls
+
+     def _parse_plain_entries(self, xml_text: str) -> list[NewsArticleMetadata]:
+         """Fallback parser for plain sitemaps without <news:news> elements.
+
+         Handles the format used by TOI, ET, and similar publishers:
+         ``<url><loc>URL</loc><lastmod>DATE</lastmod></url>``
+         Title is unavailable at this stage; keyword filtering falls back to URL slug
+         matching.
+         """
+         try:
+             root = ET.fromstring(xml_text)  # noqa: S314
+         except ET.ParseError:
+             return []
+
+         sm_ns = _detect_sm_ns(root)
+         entries: list[NewsArticleMetadata] = []
+         for url_el in root.findall(f"{sm_ns}url"):
+             loc_el = url_el.find(f"{sm_ns}loc")
+             if loc_el is None or not loc_el.text:
+                 continue
+             lastmod_el = url_el.find(f"{sm_ns}lastmod")
+             pub_date = (
+                 _parse_date(lastmod_el.text.strip())
+                 if lastmod_el is not None and lastmod_el.text
+                 else None
+             )
+             entries.append(
+                 NewsArticleMetadata(
+                     url=loc_el.text.strip(),
+                     title=None,
+                     publication_name=None,
+                     language=None,
+                     publication_date=pub_date,
+                     keywords=[],
+                 )
+             )
+         return entries
+
+     def _parse_news_entries(self, xml_text: str) -> list[NewsArticleMetadata]:
+         try:
+             root = ET.fromstring(xml_text)  # noqa: S314
+         except ET.ParseError:
+             return []
+
+         sm_ns = _detect_sm_ns(root)
+         news_ns = "{http://www.google.com/schemas/sitemap-news/0.9}"
+
+         entries: list[NewsArticleMetadata] = []
+         for url_el in root.findall(f"{sm_ns}url"):
+             loc_el = url_el.find(f"{sm_ns}loc")
+             if loc_el is None or not loc_el.text:
+                 continue
+
+             news_el = url_el.find(f"{news_ns}news")
+             if news_el is None:
+                 continue
+
+             url = loc_el.text.strip()
+             title = _text(news_el.find(f"{news_ns}title"))
+
+             pub_el = news_el.find(f"{news_ns}publication")
+             pub_name = (
+                 _text(pub_el.find(f"{news_ns}name")) if pub_el is not None else None
+             )
+             language = (
+                 _text(pub_el.find(f"{news_ns}language")) if pub_el is not None else None
+             )
+
+             date_text = _text(news_el.find(f"{news_ns}publication_date"))
+             pub_date = _parse_date(date_text) if date_text else None
+
+             kw_text = _text(news_el.find(f"{news_ns}keywords"))
+             keywords = (
+                 [k.strip() for k in kw_text.split(",") if k.strip()] if kw_text else []
+             )
+
+             entries.append(
+                 NewsArticleMetadata(
+                     url=url,
+                     title=title,
+                     publication_name=pub_name,
+                     language=language,
+                     publication_date=pub_date,
+                     keywords=keywords,
+                 )
+             )
+         return entries
+
+     async def _resolve_and_parse(
+         self,
+         sitemap_urls: list[str],
+         cutoff: datetime | None = None,
+     ) -> list[NewsArticleMetadata]:
+         all_entries: list[NewsArticleMetadata] = []
+         for url in sitemap_urls:
+             # Early termination: if URL encodes a year-month older than cutoff, skip.
+             if cutoff is not None and _sitemap_url_older_than(url, cutoff):
+                 logger.debug("sitemap_skipped_too_old", url=url)
+                 continue
+
+             logger.debug("fetching_sitemap", url=url)
+             try:
+                 status, xml_text = await _fetch_url(url)
+                 if not (200 <= status < 300):
+                     logger.warning("sitemap_fetch_failed", url=url, status=status)
+                     continue
+             except Exception as exc:
+                 logger.warning("sitemap_fetch_failed", url=url, error=str(exc))
+                 continue
+
+             is_index, nested = self._parse_sitemap_index(xml_text)
+             if is_index:
+                 logger.info("sitemap_index_found", url=url, nested_count=len(nested))
+                 nested_entries = await self._resolve_and_parse(nested, cutoff=cutoff)
+                 all_entries.extend(nested_entries)
+             else:
+                 try:
+                     entries = self._parse_news_entries(xml_text)
+                     if not entries:
+                         # No <news:news> elements — try plain <loc>+<lastmod> format
+                         entries = self._parse_plain_entries(xml_text)
+                         if entries:
+                             logger.info(
+                                 "plain_sitemap_parsed", url=url, entries=len(entries)
+                             )
+                         else:
+                             logger.info("sitemap_parsed_empty", url=url)
+                     else:
+                         logger.info(
+                             "sitemap_parsed", url=url, news_entries=len(entries)
+                         )
+                     all_entries.extend(entries)
+                 except Exception as exc:
+                     logger.warning("sitemap_parse_failed", url=url, error=str(exc))
+         return all_entries
+
+     # --- Filtering ---
+
+     def _apply_filters(
+         self,
+         entries: list[NewsArticleMetadata],
+         since: timedelta | None = None,
+         keywords: list[str] | None = None,
+         now: datetime | None = None,
+     ) -> list[NewsArticleMetadata]:
+         if now is None:
+             now = datetime.now(UTC)
+
+         result = entries
+
+         if since is not None:
+             cutoff = now - since
+             result = [
+                 e
+                 for e in result
+                 if e.publication_date is None or e.publication_date >= cutoff
+             ]
+
+         if keywords is not None:
+             kw_lower = [k.lower() for k in keywords]
+             result = [e for e in result if _matches_keywords(e, kw_lower)]
+
+         return result
+
+
+ def _detect_sm_ns(root: ET.Element) -> str:
+     """Return the sitemap namespace prefix (with braces) detected from the root tag.
+
+     Handles both the standard http:// and non-standard https:// variants used by
+     some publishers (e.g. Jagran uses https://www.sitemaps.org/...).
+     Falls back to the standard http:// prefix if undetectable.
+     """
+     tag = root.tag
+     if tag.startswith("{"):
+         return tag[: tag.index("}") + 1]
+     return "{http://www.sitemaps.org/schemas/sitemap/0.9}"
+
+
+ def _sitemap_url_older_than(url: str, cutoff: datetime) -> bool:
+     """Return True if the URL encodes a year-month that is entirely before *cutoff*.
+
+     Matches filename patterns like:
+         2026-April-1.xml  2026-April-2.xml  2025-December-5.xml
+         2026-03-1.xml     2024-09.xml
+     Only skips when the *entire month* is before the cutoff — i.e., when the last
+     day of that month is still before cutoff. This avoids skipping the current
+     month even when we are only a few days in.
+     """
+     m = _SITEMAP_DATE_RE.search(url)
+     if not m:
+         return False
+     year = int(m.group(1))
+     month_raw = m.group(2).lower()
+     month_num = _MONTH_NAME_TO_NUM.get(month_raw) or int(month_raw)
+
+     # Last day of the detected month
+     if month_num == 12:
+         last_day = datetime(year + 1, 1, 1, tzinfo=UTC) - timedelta(seconds=1)
+     else:
+         last_day = datetime(year, month_num + 1, 1, tzinfo=UTC) - timedelta(seconds=1)
+
+     return last_day < cutoff
+
+
+ _LANGUAGE_PATH_SEGMENTS = frozenset(
+     [
+         "telugu",
+         "bangla",
+         "hindi",
+         "marathi",
+         "urdu",
+         "tamil",
+         "kannada",
+         "malayalam",
+         "gujarati",
+         "punjabi",
+         "odia",
+     ]
+ )
+
+
+ async def _fetch_url(url: str) -> tuple[int, str]:
+     """Fetch URL with httpx; retry via curl_cffi Chrome impersonation on bot-block."""
+     try:
+         async with httpx.AsyncClient(
+             follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=15
+         ) as client:
+             resp = await client.get(url)
+             if resp.status_code not in _BLOCKED_STATUS_CODES:
+                 return resp.status_code, resp.text
+             logger.debug(
+                 "httpx_blocked_retrying_curl_cffi",
+                 url=url,
+                 status=resp.status_code,
+             )
+     except httpx.HTTPError as exc:
+         logger.debug("httpx_error_retrying_curl_cffi", url=url, error=str(exc))
+
+     from curl_cffi.requests import AsyncSession
+
+     # Do NOT pass _DEFAULT_HEADERS here — curl_cffi sets Chrome-native headers
+     # as part of impersonation; overriding User-Agent breaks the fingerprint.
+     async with AsyncSession(impersonate="chrome120") as session:
+         resp = await session.get(url)
+         logger.debug("curl_cffi_response", url=url, status=resp.status_code)
+         return resp.status_code, resp.text
+
+
+ def _filter_news_sitemaps(urls: list[str]) -> list[str]:
+     """Keep only sitemap URLs with 'news' in the host or path."""
+     return [u for u in urls if "news" in u.lower().split("//", 1)[-1]]
+
+
+ def _url_filename(url: str) -> str:
+     return url.rstrip("/").split("/")[-1].lower()
+
+
+ def _prioritize_news_sitemaps(urls: list[str]) -> list[str]:
+     """Apply priority rules when multiple news sitemaps are discovered.
+
+     Rules (applied in order, short-circuit on match):
+     1. Strip language-specific paths (e.g. /telugu/, /bangla/).
+     2. If a sitemap-index URL exists, keep only index URLs — they recursively
+        subsume child sitemaps like sitemap-today.xml / sitemap-yesterday.xml.
+     3. If an "all-" prefixed sitemap exists, prefer it over plain variants.
+     """
+     if len(urls) <= 1:
+         return urls
+
+     # Rule 1: language filter
+     lang_filtered = [
+         u
+         for u in urls
+         if not any(
+             seg in _LANGUAGE_PATH_SEGMENTS
+             for seg in u.lower().split("//", 1)[-1].split("/")
+         )
+     ]
+     working = lang_filtered if lang_filtered else urls
+
+     if len(working) <= 1:
+         return working
+
+     # Rule 2: prefer index over individual child sitemaps
+     index_urls = [u for u in working if "index" in _url_filename(u)]
+     if index_urls:
+         return index_urls
+
+     # Rule 3: prefer "all-" prefixed sitemap
+     all_urls = [u for u in working if _url_filename(u).startswith("all-")]
+     if all_urls:
+         return all_urls
+
+     return working
+
+
+ def _text(el: ET.Element | None) -> str | None:
+     if el is None or not el.text:
+         return None
+     return el.text.strip()
+
+
+ def _parse_date(text: str) -> datetime | None:
+     try:
+         cleaned = text.replace("Z", "+00:00")
+         return datetime.fromisoformat(cleaned)
+     except (ValueError, TypeError):
+         return None
+
+
+ def _matches_keywords(entry: NewsArticleMetadata, kw_lower: list[str]) -> bool:
+     title_lower = (entry.title or "").lower()
+     entry_kw_lower = [k.lower() for k in entry.keywords]
+     url_lower = entry.url.lower()
+     for kw in kw_lower:
+         if kw in title_lower:
+             return True
+         if any(kw in ek for ek in entry_kw_lower):
+             return True
+         # Plain sitemaps have no title/keywords — match via URL slug (ASCII only).
+         # e.g. "rahul gandhi" → "rahul-gandhi" in URL path
+         if kw.isascii():
+             slug = kw.replace(" ", "-")
+             if slug in url_lower:
+                 return True
+     return False
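The class docstring above covers the direct-sitemap mode; a minimal sketch of the domain-based discovery mode follows. It is illustrative rather than package code: the module path is taken from the file list above, and the domain and keyword values are placeholders.

    import asyncio
    from datetime import timedelta

    from pulse_engine.adapters.digital_news_metadata import DigitalNewsMetadataAdapter


    async def main() -> None:
        # Mode 1: sitemaps are auto-discovered from robots.txt, falling back to
        # the probe paths in _FALLBACK_PATHS; SitemapDiscoveryError is raised if
        # neither route yields a news sitemap.
        adapter = DigitalNewsMetadataAdapter("hindustantimes.com")
        articles = await adapter.fetch(
            since=timedelta(days=2),          # keep entries from the last 48 hours
            keywords=["election", "budget"],  # matched on title, keywords, or URL slug
        )
        for article in articles[:5]:
            print(article.publication_date, article.url)


    asyncio.run(main())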
pulse_engine/adapters/exceptions.py
@@ -0,0 +1,10 @@
+ """Custom exceptions for digital news adapters."""
+
+
+ class SitemapDiscoveryError(Exception):
+     """Raised when sitemap discovery fails for a domain."""
+
+     def __init__(self, domain: str, reason: str) -> None:
+         self.domain = domain
+         self.reason = reason
+         super().__init__(f"Sitemap discovery failed for {domain}: {reason}")
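Because the exception keeps domain and reason as attributes, callers can report failures in a structured way rather than parsing the message. A brief sketch; the fetch_or_skip helper is hypothetical, not part of the package:

    from pulse_engine.adapters.digital_news_metadata import DigitalNewsMetadataAdapter
    from pulse_engine.adapters.exceptions import SitemapDiscoveryError
    from pulse_engine.adapters.models import NewsArticleMetadata


    async def fetch_or_skip(domain: str) -> list[NewsArticleMetadata]:
        """Hypothetical helper: treat discovery failure as an empty result."""
        try:
            return await DigitalNewsMetadataAdapter(domain).fetch()
        except SitemapDiscoveryError as exc:
            # exc.domain and exc.reason are available separately from str(exc)
            print(f"skipping {exc.domain}: {exc.reason}")
            return []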
pulse_engine/adapters/models.py
@@ -0,0 +1,134 @@
+ """Data models for digital news, speech, YouTube, and Twitter adapters."""
+
+ from dataclasses import dataclass, field
+ from datetime import datetime
+
+ # ---------------------------------------------------------------------------
+ # Digital news models
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class NewsArticleMetadata:
+     url: str
+     title: str | None
+     publication_name: str | None
+     language: str | None
+     publication_date: datetime | None
+     keywords: list[str]
+
+
+ @dataclass
+ class ArticleContent:
+     title: str
+     content: str
+
+
+ @dataclass
+ class ArticleResult:
+     url: str
+     content: ArticleContent | None
+     error: str | None
+
+
+ # ---------------------------------------------------------------------------
+ # Speech models
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class SpeechMetadata:
+     url: str
+     title: str | None
+     speaker: str | None
+     speech_date: datetime | None
+     pdf_url: str | None
+     youtube_url: str | None
+     language: str | None
+
+
+ @dataclass
+ class SpeechContent:
+     text: str
+     language: str
+     pdf_url: str | None
+
+
+ @dataclass
+ class SpeechResult:
+     url: str
+     content: SpeechContent | None
+     error: str | None
+
+
+ # ---------------------------------------------------------------------------
+ # Twitter models
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class TweetMetadata:
+     tweet_id: str
+     url: str
+     text: str
+     screen_name: str
+     created_at: datetime
+     language: str
+     hashtags: list[str]
+     embedded_urls: list[str]
+     views: int
+     retweet_count: int
+     like_count: int
+     reply_count: int
+     has_images: bool
+     has_videos: bool
+
+
+ @dataclass
+ class TwitterUserResult:
+     screen_name: str
+     tweets: list[TweetMetadata]
+     error: str | None
+
+
+ # ---------------------------------------------------------------------------
+ # YouTube models
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class VideoMetadata:
+     video_id: str
+     url: str
+     title: str
+     channel_id: str
+     channel_name: str
+     published_at: datetime
+     description: str
+     tags: list[str]
+     duration_seconds: int
+     view_count: str | None
+     like_count: str | None
+     comment_count: str | None
+     thumbnail_url: str | None = None
+
+
+ @dataclass
+ class TranscriptSegment:
+     start: float
+     end: float
+     text: str
+
+
+ @dataclass
+ class VideoAudio:
+     transcript: str
+     language: str
+     segments: list[TranscriptSegment] = field(default_factory=list)
+
+
+ @dataclass
+ class VideoResult:
+     video_id: str
+     audio: VideoAudio | None
+     error: str | None
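For reference, this is the shape _parse_plain_entries produces for a bare <loc>/<lastmod> sitemap entry: title, publication_name, and language stay None, which is why keyword filtering falls back to the URL slug. The values below are invented for illustration:

    from datetime import UTC, datetime

    from pulse_engine.adapters.models import NewsArticleMetadata

    entry = NewsArticleMetadata(
        url="https://example.com/news/rahul-gandhi-rally-speech.html",  # placeholder URL
        title=None,                # plain sitemaps carry no <news:title>
        publication_name=None,
        language=None,
        publication_date=datetime(2026, 4, 1, tzinfo=UTC),  # parsed from <lastmod>
        keywords=[],               # empty, so _matches_keywords falls back to the URL slug
    )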