pulse_engine-0.2.0-py3-none-any.whl
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
pulse_engine/adapters/digital_news_metadata.py
@@ -0,0 +1,536 @@
"""Discovers news article URLs and metadata from Google News sitemaps."""

import re
import xml.etree.ElementTree as ET
from datetime import UTC, datetime, timedelta

import httpx
import structlog

from .exceptions import SitemapDiscoveryError
from .models import NewsArticleMetadata

logger = structlog.get_logger(__name__)

# Regex to extract a year-month from date-encoded sitemap URL filenames.
# Matches patterns like: 2026-April-1.xml, 2026-03-1.xml, 2025-December-2.xml
_SITEMAP_DATE_RE = re.compile(
    r"(\d{4})[-_]"
    r"(January|February|March|April|May|June|July|August|September|October|November|December|"
    r"01|02|03|04|05|06|07|08|09|10|11|12)"
    r"(?:[-_]\d+)?\.xml",
    re.IGNORECASE,
)
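
# Illustrative only (editorial note, not in the packaged file): the pattern
# captures year and month from such filenames, e.g.
#   _SITEMAP_DATE_RE.search("2026-April-1.xml").groups()      -> ("2026", "April")
#   _SITEMAP_DATE_RE.search("sitemap-2026-03-1.xml").groups() -> ("2026", "03")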

_MONTH_NAME_TO_NUM = {
    "january": 1,
    "february": 2,
    "march": 3,
    "april": 4,
    "may": 5,
    "june": 6,
    "july": 7,
    "august": 8,
    "september": 9,
    "october": 10,
    "november": 11,
    "december": 12,
}

_FALLBACK_PATHS = ["/news-sitemap.xml", "/sitemap_news.xml", "/sitemap-news.xml"]

_DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; PulseBot/1.0)",
}

# HTTP status codes that indicate bot-blocking; trigger curl_cffi fallback
_BLOCKED_STATUS_CODES = frozenset([403, 429, 503])


class DigitalNewsMetadataAdapter:
    """Discovers news article URLs and metadata from a domain's Google News sitemaps.

    Two usage modes:

    1. Domain-based discovery (auto-discovers sitemaps via robots.txt / probes):
       ``DigitalNewsMetadataAdapter("hindustantimes.com")``

    2. Direct sitemap URLs (skips discovery entirely):
       ``DigitalNewsMetadataAdapter(sitemap_urls=["https://example.com/news-sitemap.xml"])``
       ``DigitalNewsMetadataAdapter("example.com", sitemap_urls=["https://..."])``

    When ``sitemap_urls`` is provided, ``domain`` is optional and used only for logging.

    Parent sitemap index URLs are fully supported in both modes. If a provided (or
    discovered) URL points to a ``<sitemapindex>``, all child sitemaps are fetched and
    parsed recursively. Example:

    .. code-block:: python

        # ET index → resolves sitemap-today.xml, sitemap-yesterday.xml, … automatically
        adapter = DigitalNewsMetadataAdapter(
            sitemap_urls=[
                "https://economictimes.indiatimes.com/etstatic/sitemaps/et/news/sitemap-index.xml"
            ]
        )
        articles = await adapter.fetch(since=timedelta(days=7))
    """

    def __init__(
        self,
        domain: str | None = None,
        *,
        sitemap_urls: list[str] | None = None,
    ) -> None:
        if domain is None and not sitemap_urls:
            raise ValueError("Provide domain, sitemap_urls, or both.")
        self._sitemap_urls = sitemap_urls
        # Infer domain from first sitemap URL when not explicitly given
        if domain is None:
            from urllib.parse import urlparse

            self.domain = urlparse(sitemap_urls[0]).netloc  # type: ignore[index]
        else:
            self.domain = domain
        self._base_url = f"https://{self.domain}"

    async def fetch(
        self,
        since: timedelta | None = None,
        keywords: list[str] | None = None,
    ) -> list[NewsArticleMetadata]:
        """Discover and return filtered news article metadata."""
        logger.info(
            "fetch_started", domain=self.domain, since=str(since), keywords=keywords
        )
        if self._sitemap_urls is not None:
            sitemap_urls = self._sitemap_urls
            logger.info(
                "sitemaps_provided",
                domain=self.domain,
                count=len(sitemap_urls),
                urls=sitemap_urls,
            )
        else:
            sitemap_urls = await self._discover_sitemap_urls()
            logger.info(
                "sitemaps_discovered",
                domain=self.domain,
                count=len(sitemap_urls),
                urls=sitemap_urls,
            )
        cutoff = (datetime.now(UTC) - since) if since is not None else None
        entries = await self._resolve_and_parse(sitemap_urls, cutoff=cutoff)
        logger.info("entries_parsed", domain=self.domain, total_entries=len(entries))
        filtered = self._apply_filters(entries, since=since, keywords=keywords)
        logger.info(
            "entries_filtered",
            domain=self.domain,
            before=len(entries),
            after=len(filtered),
        )
        return filtered

    # --- Discovery ---

    async def _discover_sitemap_urls(self) -> list[str]:
        all_urls: list[str] = []
        robots_url = f"{self._base_url}/robots.txt"
        logger.debug("fetching_robots_txt", url=robots_url)
        try:
            status, text = await _fetch_url(robots_url)
            logger.debug("robots_txt_response", status=status)
            if 200 <= status < 300:
                for line in text.splitlines():
                    stripped = line.strip()
                    if stripped.lower().startswith("sitemap:"):
                        url = stripped.split(":", 1)[1].strip()
                        all_urls.append(url)
            if all_urls:
                news_urls = _filter_news_sitemaps(all_urls)
                news_urls = _prioritize_news_sitemaps(news_urls)
                logger.info(
                    "sitemap_directives_found",
                    domain=self.domain,
                    total=len(all_urls),
                    news=len(news_urls),
                    urls=news_urls,
                )
                if news_urls:
                    return news_urls
                logger.info("no_news_sitemap_directives", domain=self.domain)
        except Exception as exc:
            logger.warning(
                "robots_txt_fetch_failed", domain=self.domain, error=str(exc)
            )

        # Fallback probes — only news-related paths
        logger.info("trying_fallback_probes", domain=self.domain, paths=_FALLBACK_PATHS)
        found: list[str] = []
        for path in _FALLBACK_PATHS:
            probe_url = f"{self._base_url}{path}"
            try:
                status, _ = await _fetch_url(probe_url)
                if 200 <= status < 300:
                    logger.info("fallback_probe_success", url=probe_url)
                    found.append(probe_url)
                else:
                    logger.debug("fallback_probe_miss", url=probe_url, status=status)
            except Exception:
                logger.debug("fallback_probe_failed", url=probe_url)

        if not found:
            raise SitemapDiscoveryError(
                self.domain,
                "No news sitemaps found in robots.txt and all fallback probes failed",
            )
        return found

    # --- XML Parsing ---

    def _parse_sitemap_index(self, xml_text: str) -> tuple[bool, list[str]]:
        try:
            root = ET.fromstring(xml_text)  # noqa: S314
        except ET.ParseError:
            return False, []

        tag = root.tag
        if "}" in tag:
            tag = tag.split("}", 1)[1]

        if tag != "sitemapindex":
            return False, []

        ns = _detect_sm_ns(root)
        urls: list[str] = []
        for sitemap_el in root.findall(f"{ns}sitemap"):
            loc = sitemap_el.find(f"{ns}loc")
            if loc is not None and loc.text:
                urls.append(loc.text.strip())
        return True, urls

    def _parse_plain_entries(self, xml_text: str) -> list[NewsArticleMetadata]:
        """Fallback parser for plain sitemaps without <news:news> elements.

        Handles the format used by TOI, ET, and similar publishers:
        ``<url><loc>URL</loc><lastmod>DATE</lastmod></url>``
        Title is unavailable at this stage; keyword filtering falls back to URL slug
        matching.
        """
        try:
            root = ET.fromstring(xml_text)  # noqa: S314
        except ET.ParseError:
            return []

        sm_ns = _detect_sm_ns(root)
        entries: list[NewsArticleMetadata] = []
        for url_el in root.findall(f"{sm_ns}url"):
            loc_el = url_el.find(f"{sm_ns}loc")
            if loc_el is None or not loc_el.text:
                continue
            lastmod_el = url_el.find(f"{sm_ns}lastmod")
            pub_date = (
                _parse_date(lastmod_el.text.strip())
                if lastmod_el is not None and lastmod_el.text
                else None
            )
            entries.append(
                NewsArticleMetadata(
                    url=loc_el.text.strip(),
                    title=None,
                    publication_name=None,
                    language=None,
                    publication_date=pub_date,
                    keywords=[],
                )
            )
        return entries

    def _parse_news_entries(self, xml_text: str) -> list[NewsArticleMetadata]:
        try:
            root = ET.fromstring(xml_text)  # noqa: S314
        except ET.ParseError:
            return []

        sm_ns = _detect_sm_ns(root)
        news_ns = "{http://www.google.com/schemas/sitemap-news/0.9}"

        entries: list[NewsArticleMetadata] = []
        for url_el in root.findall(f"{sm_ns}url"):
            loc_el = url_el.find(f"{sm_ns}loc")
            if loc_el is None or not loc_el.text:
                continue

            news_el = url_el.find(f"{news_ns}news")
            if news_el is None:
                continue

            url = loc_el.text.strip()
            title = _text(news_el.find(f"{news_ns}title"))

            pub_el = news_el.find(f"{news_ns}publication")
            pub_name = (
                _text(pub_el.find(f"{news_ns}name")) if pub_el is not None else None
            )
            language = (
                _text(pub_el.find(f"{news_ns}language")) if pub_el is not None else None
            )

            date_text = _text(news_el.find(f"{news_ns}publication_date"))
            pub_date = _parse_date(date_text) if date_text else None

            kw_text = _text(news_el.find(f"{news_ns}keywords"))
            keywords = (
                [k.strip() for k in kw_text.split(",") if k.strip()] if kw_text else []
            )

            entries.append(
                NewsArticleMetadata(
                    url=url,
                    title=title,
                    publication_name=pub_name,
                    language=language,
                    publication_date=pub_date,
                    keywords=keywords,
                )
            )
        return entries

    async def _resolve_and_parse(
        self,
        sitemap_urls: list[str],
        cutoff: datetime | None = None,
    ) -> list[NewsArticleMetadata]:
        all_entries: list[NewsArticleMetadata] = []
        for url in sitemap_urls:
            # Early termination: if URL encodes a year-month older than cutoff, skip.
            if cutoff is not None and _sitemap_url_older_than(url, cutoff):
                logger.debug("sitemap_skipped_too_old", url=url)
                continue

            logger.debug("fetching_sitemap", url=url)
            try:
                status, xml_text = await _fetch_url(url)
                if not (200 <= status < 300):
                    logger.warning("sitemap_fetch_failed", url=url, status=status)
                    continue
            except Exception as exc:
                logger.warning("sitemap_fetch_failed", url=url, error=str(exc))
                continue

            is_index, nested = self._parse_sitemap_index(xml_text)
            if is_index:
                logger.info("sitemap_index_found", url=url, nested_count=len(nested))
                nested_entries = await self._resolve_and_parse(nested, cutoff=cutoff)
                all_entries.extend(nested_entries)
            else:
                try:
                    entries = self._parse_news_entries(xml_text)
                    if not entries:
                        # No <news:news> elements — try plain <loc>+<lastmod> format
                        entries = self._parse_plain_entries(xml_text)
                        if entries:
                            logger.info(
                                "plain_sitemap_parsed", url=url, entries=len(entries)
                            )
                        else:
                            logger.info("sitemap_parsed_empty", url=url)
                    else:
                        logger.info(
                            "sitemap_parsed", url=url, news_entries=len(entries)
                        )
                    all_entries.extend(entries)
                except Exception as exc:
                    logger.warning("sitemap_parse_failed", url=url, error=str(exc))
        return all_entries

    # --- Filtering ---

    def _apply_filters(
        self,
        entries: list[NewsArticleMetadata],
        since: timedelta | None = None,
        keywords: list[str] | None = None,
        now: datetime | None = None,
    ) -> list[NewsArticleMetadata]:
        if now is None:
            now = datetime.now(UTC)

        result = entries

        if since is not None:
            cutoff = now - since
            result = [
                e
                for e in result
                if e.publication_date is None or e.publication_date >= cutoff
            ]

        if keywords is not None:
            kw_lower = [k.lower() for k in keywords]
            result = [e for e in result if _matches_keywords(e, kw_lower)]

        return result


def _detect_sm_ns(root: ET.Element) -> str:
    """Return the sitemap namespace prefix (with braces) detected from the root tag.

    Handles both the standard http:// and non-standard https:// variants used by
    some publishers (e.g. Jagran uses https://www.sitemaps.org/...).
    Falls back to the standard http:// prefix if undetectable.
    """
    tag = root.tag
    if tag.startswith("{"):
        return tag[: tag.index("}") + 1]
    return "{http://www.sitemaps.org/schemas/sitemap/0.9}"


def _sitemap_url_older_than(url: str, cutoff: datetime) -> bool:
    """Return True if the URL encodes a year-month that is entirely before *cutoff*.

    Matches filename patterns like:
        2026-April-1.xml  2026-April-2.xml  2025-December-5.xml
        2026-03-1.xml     2024-09.xml
    Only skips when the *entire month* is before the cutoff — i.e., when the last
    day of that month is still before cutoff. This avoids skipping the current
    month even when we are only a few days in.
    """
    m = _SITEMAP_DATE_RE.search(url)
    if not m:
        return False
    year = int(m.group(1))
    month_raw = m.group(2).lower()
    month_num = _MONTH_NAME_TO_NUM.get(month_raw) or int(month_raw)

    # Last day of the detected month
    if month_num == 12:
        last_day = datetime(year + 1, 1, 1, tzinfo=UTC) - timedelta(seconds=1)
    else:
        last_day = datetime(year, month_num + 1, 1, tzinfo=UTC) - timedelta(seconds=1)

    return last_day < cutoff
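
# Illustrative only (editorial note, not in the packaged file): with
# cutoff = 2026-04-10 UTC,
#   _sitemap_url_older_than("https://x.com/2026-March-2.xml", cutoff)  -> True
#       (March ends 2026-03-31, before the cutoff, so the sitemap is skipped)
#   _sitemap_url_older_than("https://x.com/2026-April-1.xml", cutoff)  -> False
#       (April runs to 2026-04-30, past the cutoff, so it is still fetched)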


_LANGUAGE_PATH_SEGMENTS = frozenset(
    [
        "telugu",
        "bangla",
        "hindi",
        "marathi",
        "urdu",
        "tamil",
        "kannada",
        "malayalam",
        "gujarati",
        "punjabi",
        "odia",
    ]
)


async def _fetch_url(url: str) -> tuple[int, str]:
    """Fetch URL with httpx; retry via curl_cffi Chrome impersonation on bot-block."""
    try:
        async with httpx.AsyncClient(
            follow_redirects=True, headers=_DEFAULT_HEADERS, timeout=15
        ) as client:
            resp = await client.get(url)
            if resp.status_code not in _BLOCKED_STATUS_CODES:
                return resp.status_code, resp.text
            logger.debug(
                "httpx_blocked_retrying_curl_cffi",
                url=url,
                status=resp.status_code,
            )
    except httpx.HTTPError as exc:
        logger.debug("httpx_error_retrying_curl_cffi", url=url, error=str(exc))

    from curl_cffi.requests import AsyncSession

    # Do NOT pass _DEFAULT_HEADERS here — curl_cffi sets Chrome-native headers
    # as part of impersonation; overriding User-Agent breaks the fingerprint.
    async with AsyncSession(impersonate="chrome120") as session:
        resp = await session.get(url)
        logger.debug("curl_cffi_response", url=url, status=resp.status_code)
        return resp.status_code, resp.text


def _filter_news_sitemaps(urls: list[str]) -> list[str]:
    """Keep only sitemap URLs with 'news' in the host or path (scheme excluded)."""
    return [u for u in urls if "news" in u.lower().split("//", 1)[-1]]


def _url_filename(url: str) -> str:
    return url.rstrip("/").split("/")[-1].lower()


def _prioritize_news_sitemaps(urls: list[str]) -> list[str]:
    """Apply priority rules when multiple news sitemaps are discovered.

    Rules (applied in order, short-circuit on match):
    1. Strip language-specific paths (e.g. /telugu/, /bangla/).
    2. If a sitemap-index URL exists, keep only index URLs — they recursively
       subsume child sitemaps like sitemap-today.xml / sitemap-yesterday.xml.
    3. If an "all-" prefixed sitemap exists, prefer it over plain variants.
    """
    if len(urls) <= 1:
        return urls

    # Rule 1: language filter
    lang_filtered = [
        u
        for u in urls
        if not any(
            seg in _LANGUAGE_PATH_SEGMENTS
            for seg in u.lower().split("//", 1)[-1].split("/")
        )
    ]
    working = lang_filtered if lang_filtered else urls

    if len(working) <= 1:
        return working

    # Rule 2: prefer index over individual child sitemaps
    index_urls = [u for u in working if "index" in _url_filename(u)]
    if index_urls:
        return index_urls

    # Rule 3: prefer "all-" prefixed sitemap
    all_urls = [u for u in working if _url_filename(u).startswith("all-")]
    if all_urls:
        return all_urls

    return working
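
# Illustrative only (editorial note, not in the packaged file): applying the
# rules in order,
#   _prioritize_news_sitemaps([
#       "https://x.com/telugu/news-sitemap.xml",  # dropped by rule 1
#       "https://x.com/news-sitemap-index.xml",   # kept by rule 2
#       "https://x.com/news-sitemap-today.xml",   # subsumed by the index
#   ])
#   -> ["https://x.com/news-sitemap-index.xml"]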


def _text(el: ET.Element | None) -> str | None:
    if el is None or not el.text:
        return None
    return el.text.strip()


def _parse_date(text: str) -> datetime | None:
    try:
        cleaned = text.replace("Z", "+00:00")
        return datetime.fromisoformat(cleaned)
    except (ValueError, TypeError):
        return None


def _matches_keywords(entry: NewsArticleMetadata, kw_lower: list[str]) -> bool:
    title_lower = (entry.title or "").lower()
    entry_kw_lower = [k.lower() for k in entry.keywords]
    url_lower = entry.url.lower()
    for kw in kw_lower:
        if kw in title_lower:
            return True
        if any(kw in ek for ek in entry_kw_lower):
            return True
        # Plain sitemaps have no title/keywords — match via URL slug (ASCII only).
        # e.g. "rahul gandhi" → "rahul-gandhi" in URL path
        if kw.isascii():
            slug = kw.replace(" ", "-")
            if slug in url_lower:
                return True
    return False
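A minimal usage sketch for the adapter above (editorial; the module path follows the file listing, and SitemapDiscoveryError comes from the exceptions module shown next):

    import asyncio
    from datetime import timedelta

    from pulse_engine.adapters.digital_news_metadata import DigitalNewsMetadataAdapter
    from pulse_engine.adapters.exceptions import SitemapDiscoveryError


    async def main() -> None:
        # Mode 1: domain-based discovery (robots.txt, then fallback probes).
        adapter = DigitalNewsMetadataAdapter("hindustantimes.com")
        try:
            articles = await adapter.fetch(since=timedelta(days=2), keywords=["election"])
        except SitemapDiscoveryError as exc:
            print(f"discovery failed for {exc.domain}: {exc.reason}")
            return
        for article in articles[:5]:
            print(article.publication_date, article.title or article.url)


    asyncio.run(main())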
pulse_engine/adapters/exceptions.py
@@ -0,0 +1,10 @@
"""Custom exceptions for digital news adapters."""


class SitemapDiscoveryError(Exception):
    """Raised when sitemap discovery fails for a domain."""

    def __init__(self, domain: str, reason: str) -> None:
        self.domain = domain
        self.reason = reason
        super().__init__(f"Sitemap discovery failed for {domain}: {reason}")
pulse_engine/adapters/models.py
@@ -0,0 +1,134 @@
"""Data models for digital news, speech, YouTube, and Twitter adapters."""

from dataclasses import dataclass, field
from datetime import datetime

# ---------------------------------------------------------------------------
# Digital news models
# ---------------------------------------------------------------------------


@dataclass
class NewsArticleMetadata:
    url: str
    title: str | None
    publication_name: str | None
    language: str | None
    publication_date: datetime | None
    keywords: list[str]


@dataclass
class ArticleContent:
    title: str
    content: str


@dataclass
class ArticleResult:
    url: str
    content: ArticleContent | None
    error: str | None


# ---------------------------------------------------------------------------
# Speech models
# ---------------------------------------------------------------------------


@dataclass
class SpeechMetadata:
    url: str
    title: str | None
    speaker: str | None
    speech_date: datetime | None
    pdf_url: str | None
    youtube_url: str | None
    language: str | None


@dataclass
class SpeechContent:
    text: str
    language: str
    pdf_url: str | None


@dataclass
class SpeechResult:
    url: str
    content: SpeechContent | None
    error: str | None


# ---------------------------------------------------------------------------
# Twitter models
# ---------------------------------------------------------------------------


@dataclass
class TweetMetadata:
    tweet_id: str
    url: str
    text: str
    screen_name: str
    created_at: datetime
    language: str
    hashtags: list[str]
    embedded_urls: list[str]
    views: int
    retweet_count: int
    like_count: int
    reply_count: int
    has_images: bool
    has_videos: bool


@dataclass
class TwitterUserResult:
    screen_name: str
    tweets: list[TweetMetadata]
    error: str | None


# ---------------------------------------------------------------------------
# YouTube models
# ---------------------------------------------------------------------------


@dataclass
class VideoMetadata:
    video_id: str
    url: str
    title: str
    channel_id: str
    channel_name: str
    published_at: datetime
    description: str
    tags: list[str]
    duration_seconds: int
    view_count: str | None
    like_count: str | None
    comment_count: str | None
    thumbnail_url: str | None = None


@dataclass
class TranscriptSegment:
    start: float
    end: float
    text: str


@dataclass
class VideoAudio:
    transcript: str
    language: str
    segments: list[TranscriptSegment] = field(default_factory=list)


@dataclass
class VideoResult:
    video_id: str
    audio: VideoAudio | None
    error: str | None
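
Since these are plain dataclasses, adapter results serialize directly with the stdlib; a quick sketch (editorial, not from the package):

    from dataclasses import asdict
    from datetime import UTC, datetime

    from pulse_engine.adapters.models import NewsArticleMetadata

    # A plain-sitemap entry: no title/keywords, date taken from <lastmod>.
    meta = NewsArticleMetadata(
        url="https://example.com/news/rahul-gandhi-rally",
        title=None,
        publication_name=None,
        language=None,
        publication_date=datetime(2026, 4, 1, tzinfo=UTC),
        keywords=[],
    )
    print(asdict(meta))  # {'url': 'https://example.com/news/rahul-gandhi-rally', 'title': None, ...}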