oq-data 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oq_data/__init__.py ADDED
@@ -0,0 +1,94 @@
1
+ """oq-data — NSE/BSE data pipeline for OpenQuant India.
2
+
3
+ Top-level convenience imports mirror the most-used public API:
4
+
5
+ >>> from oq_data import prices, universe, wide_prices
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from oq_data.announcements import (
11
+ download_announcements,
12
+ parse_announcements_blob,
13
+ read_announcements,
14
+ write_announcements,
15
+ )
16
+ from oq_data.api import list_symbols, prices, resolve_symbol, universe, wide_prices
17
+ from oq_data.bhavcopy import build_url, download_bhavcopy, parse_bhavcopy_blob, sync_range
18
+ from oq_data.config import DataPaths, default_root, get_paths
19
+ from oq_data.corporate_actions import CorporateAction, add_actions, adjust_prices, load_actions
20
+ from oq_data.delivery import (
21
+ download_delivery,
22
+ parse_delivery_blob,
23
+ read_delivery,
24
+ write_delivery,
25
+ )
26
+ from oq_data.flows import (
27
+ download_flows,
28
+ parse_flows_blob,
29
+ read_flows,
30
+ write_flows,
31
+ )
32
+ from oq_data.fno import (
33
+ download_fno,
34
+ parse_fno_blob,
35
+ )
36
+ from oq_data.storage import (
37
+ coverage,
38
+ query,
39
+ read_fno,
40
+ read_prices,
41
+ write_eod,
42
+ write_fno,
43
+ )
44
+ from oq_data.symbols import SymbolMaster, add_mapping, load_master
45
+ from oq_data.universes import UniverseEntry, add_entries, load_universes, members_as_of
46
+
47
+ __version__ = "0.1.0"
48
+
49
+ __all__ = [
50
+ "CorporateAction",
51
+ "DataPaths",
52
+ "SymbolMaster",
53
+ "UniverseEntry",
54
+ "__version__",
55
+ "add_actions",
56
+ "add_entries",
57
+ "add_mapping",
58
+ "adjust_prices",
59
+ "build_url",
60
+ "coverage",
61
+ "default_root",
62
+ "download_announcements",
63
+ "download_bhavcopy",
64
+ "download_delivery",
65
+ "download_flows",
66
+ "download_fno",
67
+ "get_paths",
68
+ "list_symbols",
69
+ "load_actions",
70
+ "load_master",
71
+ "load_universes",
72
+ "members_as_of",
73
+ "parse_announcements_blob",
74
+ "parse_bhavcopy_blob",
75
+ "parse_delivery_blob",
76
+ "parse_flows_blob",
77
+ "parse_fno_blob",
78
+ "prices",
79
+ "query",
80
+ "read_announcements",
81
+ "read_delivery",
82
+ "read_flows",
83
+ "read_fno",
84
+ "read_prices",
85
+ "resolve_symbol",
86
+ "sync_range",
87
+ "universe",
88
+ "wide_prices",
89
+ "write_announcements",
90
+ "write_delivery",
91
+ "write_eod",
92
+ "write_flows",
93
+ "write_fno",
94
+ ]
oq_data/announcements.py ADDED
@@ -0,0 +1,176 @@
1
+ """Corporate-announcements feed ingestion.
2
+
3
+ NSE publishes a rolling JSON feed of corporate announcements at
4
+ ``https://www.nseindia.com/api/corporate-announcements?index=equities``.
5
+ Each row carries the announcement timestamp, symbol, broad category, a
6
+ short subject line, and an attachment URL.
7
+
8
+ The canonical schema we persist is::
9
+
10
+ date, symbol, category, subject, attachment
11
+
12
+ ``date`` is the announcement business date (``date``-typed), suitable
13
+ for the same year-partitioned storage layout used by the EOD writers.
14
+
15
+ Network calls go through the same injectable ``Fetcher`` as the rest of
16
+ the pipeline so the suite stays offline.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import io
22
+ import json
23
+ import logging
24
+ from collections.abc import Iterable
25
+ from dataclasses import dataclass
26
+ from datetime import date, timedelta
27
+
28
+ import pandas as pd
29
+
30
+ from oq_data.bhavcopy import Fetcher, _default_fetcher
31
+ from oq_data.config import DataPaths, get_paths
32
+ from oq_data.storage import write_partitioned
33
+
34
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Column names, in order, of every frame this module returns: both the parsed
# feed and the empty placeholder frames.
_NORMALISED_COLUMNS = ["date", "symbol", "category", "subject", "attachment"]
37
+
38
+
39
@dataclass(frozen=True, slots=True)
class AnnouncementsSource:
    """Immutable description of where one day's announcements feed lives."""

    # Date the feed is requested for.
    when: date
    # Full request URL for that date.
    url: str
    # File name under which the downloaded payload is cached.
    filename: str
44
+
45
+
46
def build_url(when: date) -> AnnouncementsSource:
    """Describe where the announcements feed for ``when`` lives.

    The request is pinned to a single day by sending ``when`` as both
    ``from_date`` and ``to_date`` (formatted ``DD-MM-YYYY``).  The cache
    file name embeds the same date as ``YYYYMMDD``.
    """
    day_param = f"{when:%d-%m-%Y}"
    endpoint = "https://www.nseindia.com/api/corporate-announcements"
    return AnnouncementsSource(
        when=when,
        url=f"{endpoint}?index=equities&from_date={day_param}&to_date={day_param}",
        filename=f"announcements_{when:%Y%m%d}.json",
    )
53
+
54
+
55
def _pick(row: dict, *keys: str) -> object:
    """Return the first usable value found in ``row`` under any of ``keys``.

    Keys are tried in the order given.  A value counts as missing when it
    is ``None``, the empty string, or a float NaN.  NaN is how pandas
    represents a blank CSV cell; without this check such a cell would be
    returned and later stringified to the literal ``"nan"``.

    Returns ``""`` when no key yields a usable value.
    """
    for key in keys:
        value = row.get(key)
        if value is None or value == "":
            continue
        # NaN is the only float that compares unequal to itself.
        if isinstance(value, float) and value != value:
            continue
        return value
    return ""
60
+
61
+
62
def parse_announcements_blob(blob: bytes, when: date) -> pd.DataFrame:
    """Parse a raw announcements payload into the canonical five-column frame.

    The payload may be JSON (a list of row objects, or an object carrying
    the rows under ``"data"`` or ``"rows"``) or CSV.  Source field names
    vary, so each output column is filled from the first matching key
    that holds a non-empty value.  Rows without a symbol are dropped.

    ``when`` is stamped on every row as the ``date`` column.

    Returns an empty frame with the canonical columns when the payload is
    empty, whitespace-only, or carries no rows.
    """
    text = blob.decode("utf-8-sig", errors="ignore").strip()
    if not text:
        # Nothing to parse; pd.read_csv would raise EmptyDataError here.
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)

    if text.startswith(("[", "{")):
        data = json.loads(text)
        rows = data if isinstance(data, list) else data.get("data", data.get("rows", []))
    else:
        # Read every cell as text and keep blanks as "" so that empty cells
        # are treated as missing instead of turning into the string "nan".
        rows = pd.read_csv(io.BytesIO(blob), dtype=str, keep_default_na=False).to_dict("records")
    if not rows:
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)

    df = pd.DataFrame(
        {
            "date": pd.to_datetime(when),
            "symbol": [str(_pick(r, "symbol", "Symbol", "SYMBOL")).strip() for r in rows],
            "category": [
                str(_pick(r, "category", "Category", "broadcastsubject")).strip() for r in rows
            ],
            "subject": [
                str(_pick(r, "subject", "Subject", "desc", "Description")).strip() for r in rows
            ],
            "attachment": [
                str(_pick(r, "attchmntFile", "attachment", "attachmentUrl")).strip() for r in rows
            ],
        }
    )
    df = df[df["symbol"] != ""].reset_index(drop=True)
    return df[_NORMALISED_COLUMNS]
88
+
89
+
90
def _cache_dir(paths: DataPaths):
    """Return the raw-cache directory for announcement payloads, creating it if needed.

    The directory is ``paths.raw / "announcements"``.  Missing parents are
    created, and an already-existing directory is not an error.
    """
    target = paths.raw.joinpath("announcements")
    target.mkdir(exist_ok=True, parents=True)
    return target
94
+
95
+
96
def download_announcements(
    when: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    use_cache: bool = True,
) -> pd.DataFrame:
    """Fetch (or load from cache) and parse the announcements feed for ``when``.

    ``paths`` defaults to ``get_paths()`` and ``fetcher`` to the shared
    default HTTP fetcher.  With ``use_cache=True`` an existing cached
    payload is parsed and no network call is made.  Otherwise the payload
    is fetched and the cache file is (re)written.

    A freshly fetched payload is parsed *before* it is stored, and is
    stored via a temporary file plus rename.  That way a payload that
    cannot be parsed, or a write that is interrupted half-way, never
    becomes a cache entry that every later run would trust.

    Returns the parsed frame.  Exceptions from the fetcher or the parser
    propagate unchanged.
    """
    paths = paths or get_paths()
    paths.ensure()
    src = build_url(when)
    cache_path = _cache_dir(paths) / src.filename

    if use_cache and cache_path.exists():
        return parse_announcements_blob(cache_path.read_bytes(), when)

    fetch = fetcher or _default_fetcher
    blob = fetch(src.url)
    # Validate first: only payloads that parse are worth caching.
    frame = parse_announcements_blob(blob, when)

    # Write next to the target and rename so readers never see a partial file.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_bytes(blob)
    tmp_path.replace(cache_path)
    return frame
113
+
114
+
115
def write_announcements(df: pd.DataFrame, paths: DataPaths | None = None) -> int:
    """Persist announcement rows into the partitioned announcements dataset.

    Delegates to ``write_partitioned`` with ``date``, ``symbol`` and
    ``subject`` as the key columns, and returns whatever it returns.
    ``paths`` defaults to ``get_paths()``.
    """
    target = paths or get_paths()
    target.ensure()
    return write_partitioned(df, target.announcements, ["date", "symbol", "subject"])
120
+
121
+
122
def read_announcements(
    symbols: str | Iterable[str] | None = None,
    start: date | str | None = None,
    end: date | str | None = None,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Load stored announcements, optionally narrowed by symbol and date.

    Every ``year=*/data.parquet`` partition under ``paths.announcements``
    is read and concatenated.  ``symbols`` may be one ticker or an
    iterable of tickers; ``start`` and ``end`` are inclusive bounds on the
    ``date`` column.  The result is sorted by ``date`` then ``symbol``.

    Returns an empty frame with the canonical columns when no partition
    exists.
    """
    paths = paths or get_paths()
    partition_files = sorted(paths.announcements.glob("year=*/data.parquet"))
    if not partition_files:
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)

    frames = [pd.read_parquet(part) for part in partition_files]
    result = pd.concat(frames, ignore_index=True)

    if symbols is not None:
        # A lone string is one ticker, not an iterable of characters.
        wanted = {symbols} if isinstance(symbols, str) else set(symbols)
        result = result[result["symbol"].isin(wanted)]
    if start is not None:
        lower = pd.to_datetime(start)
        result = result[result["date"] >= lower]
    if end is not None:
        upper = pd.to_datetime(end)
        result = result[result["date"] <= upper]

    ordered = result.sort_values(["date", "symbol"])
    return ordered.reset_index(drop=True)
141
+
142
+
143
def sync_range(
    start: date,
    end: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    on_missing: str = "skip",
) -> Iterable[date]:
    """Download the announcements feed for each weekday in ``[start, end]``.

    This is a generator: nothing runs, not even argument validation, until
    it is iterated.  Saturdays and Sundays are skipped.  Each date whose
    download succeeds is yielded.  When a download raises, the error is
    re-raised if ``on_missing == "raise"``; for any other value it is
    logged at INFO level and the date is passed over.

    Raises ``ValueError`` if ``end`` precedes ``start``.
    """
    if start > end:
        raise ValueError("end must be >= start")
    paths = paths or get_paths()
    paths.ensure()
    abort_on_error = on_missing == "raise"
    for offset in range((end - start).days + 1):
        day = start + timedelta(days=offset)
        if day.weekday() >= 5:
            continue
        try:
            download_announcements(day, paths=paths, fetcher=fetcher)
            yield day
        except Exception as exc:
            if abort_on_error:
                raise
            logger.info("announcements feed unavailable for %s: %s", day, exc)
166
+
167
+
168
+ __all__ = [
169
+ "AnnouncementsSource",
170
+ "build_url",
171
+ "download_announcements",
172
+ "parse_announcements_blob",
173
+ "read_announcements",
174
+ "sync_range",
175
+ "write_announcements",
176
+ ]
oq_data/api.py ADDED
@@ -0,0 +1,103 @@
1
+ """High-level Python API for downstream packages.
2
+
3
+ This is the surface most users will touch: :func:`prices` for a clean
4
+ adjusted price series, :func:`universe` for a point-in-time membership
5
+ set, and :func:`wide_prices` for a date-indexed wide frame that drops
6
+ straight into :func:`oq_backtest.backtest`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Iterable
12
+ from datetime import date
13
+
14
+ import pandas as pd
15
+
16
+ from oq_data import corporate_actions, storage, symbols, universes
17
+ from oq_data.config import DataPaths, get_paths
18
+
19
+
20
def prices(
    symbol: str | Iterable[str],
    start: date | str | None = None,
    end: date | str | None = None,
    adjusted: bool = True,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Return long-form stored prices for one ticker or several.

    Rows come from ``storage.read_prices``.  When ``adjusted`` is true
    (the default) and corporate actions are on record, the frame is passed
    through ``corporate_actions.adjust_prices`` so that splits, bonuses
    and dividends are back-adjusted into a continuous series suitable for
    backtesting.  The stored rows are returned untouched when the read is
    empty, when ``adjusted`` is false, or when no actions are recorded.
    """
    paths = paths or get_paths()
    raw = storage.read_prices(symbols=symbol, start=start, end=end, paths=paths)
    if not adjusted or raw.empty:
        return raw
    actions = corporate_actions.load_actions(paths=paths)
    return raw if actions.empty else corporate_actions.adjust_prices(raw, actions)
41
+
42
+
43
def wide_prices(
    universe_symbols: Iterable[str],
    start: date | str | None = None,
    end: date | str | None = None,
    field: str = "close",
    adjusted: bool = True,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Return a date-indexed wide DataFrame ready for the backtester.

    Rows are dates, columns are symbols, and values are ``field``
    (default ``close``).  When a date/symbol pair occurs more than once,
    the last value wins.  ``start``, ``end``, ``adjusted`` and ``paths``
    are forwarded to :func:`prices`.

    ``universe_symbols`` is normally an iterable of tickers.  A single
    ticker passed as a plain string is treated as a one-symbol universe,
    in line with :func:`prices`, rather than being split into characters.

    Returns an empty frame when no price rows are found.

    Raises ``ValueError`` when no symbols are supplied, and ``KeyError``
    when ``field`` is not a column of the long-form price frame.
    """
    if isinstance(universe_symbols, str):
        # list("INFY") would yield ["I", "N", "F", "Y"]; keep the ticker whole.
        syms = [universe_symbols] if universe_symbols else []
    else:
        syms = list(universe_symbols)
    if not syms:
        raise ValueError("universe_symbols must be non-empty")

    long_df = prices(syms, start=start, end=end, adjusted=adjusted, paths=paths)
    if long_df.empty:
        return pd.DataFrame()
    if field not in long_df.columns:
        raise KeyError(f"field {field!r} not in {sorted(long_df.columns)}")

    wide = long_df.pivot_table(index="date", columns="symbol", values=field, aggfunc="last")
    wide = wide.sort_index()
    wide.index = pd.DatetimeIndex(wide.index)
    return wide
69
+
70
+
71
def universe(
    index_name: str,
    as_of: date | str,
    paths: DataPaths | None = None,
) -> list[str]:
    """Return the symbols recorded as members of ``index_name`` on ``as_of``.

    ``as_of`` may be a date or anything ``pandas.to_datetime`` accepts; it
    is reduced to a calendar date before the membership lookup.
    """
    resolved_paths = paths or get_paths()
    as_of_date = pd.to_datetime(as_of).date()
    membership = universes.members_as_of(index_name, as_of_date, paths=resolved_paths)
    return membership["symbol"].tolist()
81
+
82
+
83
def resolve_symbol(symbol: str, when: date | str, paths: DataPaths | None = None) -> str:
    """Map a current ticker to the symbol that was in use on ``when``.

    ``when`` may be a date or anything ``pandas.to_datetime`` accepts.
    The lookup is delegated to the symbol master's ``resolve_as_of``.
    """
    resolved_paths = paths or get_paths()
    symbol_master = symbols.load_master(paths=resolved_paths)
    as_of_date = pd.to_datetime(when).date()
    return symbol_master.resolve_as_of(symbol, as_of_date)
88
+
89
+
90
def list_symbols(paths: DataPaths | None = None) -> list[str]:
    """Return every distinct ``symbol`` in the EOD dataset, in query order.

    Runs a ``SELECT DISTINCT ... ORDER BY symbol`` query through
    ``storage.query`` and returns an empty list when it yields no rows.
    """
    paths = paths or get_paths()
    result = storage.query("SELECT DISTINCT symbol FROM eod ORDER BY symbol", paths=paths)
    if result.empty:
        return []
    return result["symbol"].tolist()
95
+
96
+
97
+ __all__ = [
98
+ "list_symbols",
99
+ "prices",
100
+ "resolve_symbol",
101
+ "universe",
102
+ "wide_prices",
103
+ ]
oq_data/bhavcopy.py ADDED
@@ -0,0 +1,272 @@
1
+ """NSE equity Bhavcopy ingestion.
2
+
3
+ NSE publishes one zipped CSV per trading day with all listed cash-market
4
+ instruments and their OHLCV. Two formats are in active circulation:
5
+
6
+ * **Legacy** (until ~July 2020): ``cm{DDMMMYYYY}bhav.csv.zip`` with the
7
+ classic ``SYMBOL,SERIES,OPEN,HIGH,LOW,CLOSE,LAST,PREVCLOSE,...`` schema.
8
+ * **UDiFF** (from ~July 2020 onward): ``BhavCopy_NSE_CM_0_0_0_{YYYYMMDD}
9
+ _F_0000.csv.zip`` with the longer ``TradDt,BizDt,Sgmt,Src,FinInstrmTp,
10
+ FinInstrmId,ISIN,TckrSymb,SctySrs,...`` schema.
11
+
12
+ This module exposes URL builders, format-detecting parsers, and a single
13
+ :func:`download_bhavcopy` entry point that retries, resumes from cache,
14
+ and returns a normalised :class:`pandas.DataFrame`.
15
+
16
+ Network calls are isolated behind an injectable ``fetcher`` callable so
17
+ tests run fully offline against the fixtures under ``tests/fixtures``.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import io
23
+ import logging
24
+ import time
25
+ import zipfile
26
+ from collections.abc import Callable, Iterable
27
+ from dataclasses import dataclass
28
+ from datetime import date, datetime, timedelta
29
+ from pathlib import Path
30
+
31
+ import httpx
32
+ import pandas as pd
33
+
34
+ from oq_data.config import DataPaths, get_paths
35
+
36
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# First date (inclusive) that is_udiff_date() treats as UDiFF-format.
# NOTE(review): the module docstring says "~July 2020" -- confirm the exact
# cutover date against NSE's published notice.
UDIFF_CUTOVER = date(2020, 7, 11)
# Browser-like User-Agent sent with every request made by the default fetcher.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
)
# Headers attached to the httpx client used by the default fetcher.
DEFAULT_HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
}

# A fetcher takes a URL and returns the response body as bytes.
Fetcher = Callable[[str], bytes]

# Column names, in order, of the frame returned by parse_bhavcopy_blob().
_NORMALISED_COLUMNS = [
    "date",
    "symbol",
    "isin",
    "series",
    "open",
    "high",
    "low",
    "close",
    "prev_close",
    "volume",
    "value",
    "trades",
]
65
+
66
+
67
@dataclass(frozen=True, slots=True)
class BhavcopySource:
    """The full address of a bhavcopy for a given date."""

    # Trading date the archive is for.
    when: date
    # Download URL of the archive.
    url: str
    # Archive file name; also used as the cache file name.
    filename: str
    # True when the URL and file name follow the UDiFF naming scheme.
    is_udiff: bool
75
+
76
+
77
def is_udiff_date(when: date) -> bool:
    """Tell whether ``when`` falls on or after ``UDIFF_CUTOVER`` (the UDiFF era)."""
    return UDIFF_CUTOVER <= when
80
+
81
+
82
def build_url(when: date) -> BhavcopySource:
    """Work out the NSE archive URL and file name of the bhavcopy for ``when``.

    Dates in the UDiFF era use the ``BhavCopy_NSE_CM_..._F_0000.csv.zip``
    naming under ``content/cm``.  Earlier dates use the legacy
    ``cm<DDMMMYYYY>bhav.csv.zip`` naming under
    ``content/historical/EQUITIES/<YYYY>/<MMM>``.
    """
    archive_root = "https://nsearchives.nseindia.com/content"
    if not is_udiff_date(when):
        # Legacy names carry an upper-cased three-letter month abbreviation.
        month = when.strftime("%b").upper()
        legacy_name = f"cm{when:%d}{month}{when:%Y}bhav.csv.zip"
        legacy_url = f"{archive_root}/historical/EQUITIES/{when:%Y}/{month}/{legacy_name}"
        return BhavcopySource(when=when, url=legacy_url, filename=legacy_name, is_udiff=False)

    udiff_name = f"BhavCopy_NSE_CM_0_0_0_{when:%Y%m%d}_F_0000.csv.zip"
    return BhavcopySource(
        when=when,
        url=f"{archive_root}/cm/{udiff_name}",
        filename=udiff_name,
        is_udiff=True,
    )
94
+
95
+
96
def _default_fetcher(url: str, timeout: float = 30.0, retries: int = 3) -> bytes:
    """HTTP GET ``url`` with retries and the module's default headers.

    Up to ``retries`` attempts are made, each with a fresh client that
    follows redirects and applies ``timeout``.  After a failed attempt the
    function backs off for ``min(2**attempt, 8)`` seconds before trying
    again.  There is no back-off after the last attempt, since nothing
    follows it.

    Returns the response body.  Raises ``RuntimeError`` (chained to the
    last ``httpx.HTTPError``) once every attempt has failed.
    """
    last_exc: Exception | None = None
    for attempt in range(1, retries + 1):
        try:
            with httpx.Client(
                headers=DEFAULT_HEADERS, follow_redirects=True, timeout=timeout
            ) as client:
                resp = client.get(url)
                resp.raise_for_status()
                return resp.content
        except httpx.HTTPError as exc:
            last_exc = exc
            logger.warning("bhavcopy fetch failed (attempt %d/%d): %s", attempt, retries, exc)
            # Only wait when another attempt will actually follow.
            if attempt < retries:
                time.sleep(min(2**attempt, 8))
    raise RuntimeError(f"failed to fetch {url} after {retries} attempts") from last_exc
112
+
113
+
114
def _read_csv_from_zip(blob: bytes) -> pd.DataFrame:
    """Load the first ``.csv`` member of an in-memory zip archive as a DataFrame.

    Member names are matched case-insensitively.  Raises ``ValueError``
    when the archive holds no ``.csv`` member.
    """
    with zipfile.ZipFile(io.BytesIO(blob)) as archive:
        first_csv = next(
            (member for member in archive.namelist() if member.lower().endswith(".csv")),
            None,
        )
        if first_csv is None:
            raise ValueError("zip archive contains no .csv member")
        with archive.open(first_csv) as handle:
            return pd.read_csv(handle)
121
+
122
+
123
def _normalise_legacy(raw: pd.DataFrame, when: date) -> pd.DataFrame:
    """Map a legacy-format bhavcopy frame onto the canonical columns.

    Column names are matched after stripping whitespace and upper-casing.
    ``ISIN`` and ``TOTALTRADES`` are optional: when either is absent the
    matching output column is entirely missing values (``isin`` as a
    string column, ``trades`` as nullable ``Int64``).  All other source
    columns are required, and a ``KeyError`` is raised if one is missing.

    ``when`` is stamped on every row as the ``date`` column.  Numeric
    fields that cannot be parsed become missing values.
    """
    raw = raw.rename(columns=lambda c: c.strip().upper())

    def optional(column: str) -> pd.Series:
        # Fall back to an all-NaN column that shares raw's index, so the
        # result stays row-aligned and keeps the vectorised dtype methods.
        if column in raw.columns:
            return raw[column]
        return pd.Series(float("nan"), index=raw.index, dtype="float64")

    df = pd.DataFrame(
        {
            "date": pd.to_datetime(when),
            "symbol": raw["SYMBOL"].astype(str).str.strip(),
            "isin": optional("ISIN").astype("string").str.strip(),
            "series": raw["SERIES"].astype(str).str.strip(),
            "open": pd.to_numeric(raw["OPEN"], errors="coerce"),
            "high": pd.to_numeric(raw["HIGH"], errors="coerce"),
            "low": pd.to_numeric(raw["LOW"], errors="coerce"),
            "close": pd.to_numeric(raw["CLOSE"], errors="coerce"),
            "prev_close": pd.to_numeric(raw["PREVCLOSE"], errors="coerce"),
            "volume": pd.to_numeric(raw["TOTTRDQTY"], errors="coerce").astype("Int64"),
            "value": pd.to_numeric(raw["TOTTRDVAL"], errors="coerce"),
            "trades": pd.to_numeric(optional("TOTALTRADES"), errors="coerce").astype("Int64"),
        }
    )
    return df
142
+
143
+
144
def _normalise_udiff(raw: pd.DataFrame, when: date) -> pd.DataFrame:
    """Map a UDiFF-format bhavcopy frame onto the canonical columns.

    Column names are matched after stripping surrounding whitespace.
    ``when`` is stamped on every row as the ``date`` column, and numeric
    fields that cannot be parsed become missing values.  When a
    ``FinInstrmTp`` column is present, only rows whose value is ``STK`` or
    ``EQ`` (case-insensitive) are kept and the index is renumbered.
    """
    raw = raw.rename(columns=lambda c: c.strip())

    def text(column: str) -> pd.Series:
        return raw[column].astype(str).str.strip()

    def number(column: str) -> pd.Series:
        return pd.to_numeric(raw[column], errors="coerce")

    normalised = pd.DataFrame(
        {
            "date": pd.to_datetime(when),
            "symbol": text("TckrSymb"),
            "isin": raw["ISIN"].astype("string").str.strip(),
            "series": text("SctySrs"),
            "open": number("OpnPric"),
            "high": number("HghPric"),
            "low": number("LwPric"),
            "close": number("ClsPric"),
            "prev_close": number("PrvsClsgPric"),
            "volume": number("TtlTradgVol").astype("Int64"),
            "value": number("TtlTrfVal"),
            "trades": number("TtlNbOfTxsExctd").astype("Int64"),
        }
    )

    instrument_type = raw.get("FinInstrmTp")
    if instrument_type is None:
        return normalised
    is_equity = instrument_type.astype(str).str.upper().isin({"STK", "EQ"})
    return normalised[is_equity].reset_index(drop=True)
166
+
167
+
168
def parse_bhavcopy_blob(blob: bytes, when: date) -> pd.DataFrame:
    """Turn a downloaded bhavcopy (zip archive or bare CSV) into the canonical frame.

    A payload starting with the zip signature ``PK`` is unpacked first;
    anything else is read directly as CSV.  The schema is then detected
    from the header: a ``TckrSymb`` column (case-insensitive) means UDiFF,
    otherwise the legacy layout is assumed.

    Whatever the input format, the returned columns are exactly
    ``date, symbol, isin, series, open, high, low, close, prev_close,
    volume, value, trades``.
    """
    if blob[:2] == b"PK":
        raw = _read_csv_from_zip(blob)
    else:
        raw = pd.read_csv(io.BytesIO(blob))

    headers = {name.strip().upper() for name in raw.columns}
    normalise = _normalise_udiff if "TCKRSYMB" in headers else _normalise_legacy
    return normalise(raw, when)[_NORMALISED_COLUMNS]
180
+
181
+
182
def download_bhavcopy(
    when: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    use_cache: bool = True,
) -> pd.DataFrame:
    """Download and parse the NSE equity bhavcopy for ``when``.

    The raw archive is cached under ``paths.bhavcopy``.  With
    ``use_cache=True`` an existing cached archive is parsed and no network
    call is made.  Otherwise the archive is fetched and the cache file is
    (re)written.  Pass ``fetcher`` to substitute a callable (used by
    tests) or to swap in your own retry/auth wrapper.  ``paths`` defaults
    to ``get_paths()``.

    A freshly fetched archive is parsed *before* it is stored, and is
    stored via a temporary file plus rename.  That way an archive that
    cannot be parsed, or a write that is interrupted half-way, never
    becomes a cache entry that every later run would trust.

    Returns the parsed frame.  Exceptions from the fetcher or the parser
    propagate unchanged.
    """
    paths = paths or get_paths()
    paths.ensure()
    src = build_url(when)
    cache_path = paths.bhavcopy / src.filename

    if use_cache and cache_path.exists():
        return parse_bhavcopy_blob(cache_path.read_bytes(), when)

    fetch = fetcher or _default_fetcher
    blob = fetch(src.url)
    # Validate first: only archives that parse are worth caching.
    frame = parse_bhavcopy_blob(blob, when)

    # Write next to the target and rename so readers never see a partial file.
    # The ".tmp" suffix also keeps the scratch file out of iter_cached().
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    tmp_path.write_bytes(blob)
    tmp_path.replace(cache_path)
    return frame
206
+
207
+
208
def sync_range(
    start: date,
    end: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    on_missing: str = "skip",
) -> Iterable[date]:
    """Download the bhavcopy for each weekday in ``[start, end]`` inclusive.

    This is a generator: nothing runs, not even argument validation, until
    it is iterated.  Saturdays and Sundays are always skipped.  Each date
    that was ingested successfully is yielded.  NSE holidays show up as
    failed downloads.  With ``on_missing='raise'`` the first failure
    aborts the run; with any other value (default ``'skip'``) the failure
    is logged at INFO level and the date is passed over.

    Raises ``ValueError`` if ``end`` precedes ``start``.
    """
    if start > end:
        raise ValueError("end must be >= start")
    paths = paths or get_paths()
    paths.ensure()
    abort_on_error = on_missing == "raise"
    for offset in range((end - start).days + 1):
        day = start + timedelta(days=offset)
        if day.weekday() >= 5:
            continue
        try:
            download_bhavcopy(day, paths=paths, fetcher=fetcher)
            yield day
        except Exception as exc:
            if abort_on_error:
                raise
            logger.info("bhavcopy unavailable for %s: %s", day, exc)
238
+
239
+
240
def parse_filename_date(filename: str) -> date:
    """Extract the trading date encoded in a cached bhavcopy file name.

    UDiFF names (``BhavCopy_NSE_CM_...``) carry the date as ``YYYYMMDD``
    in the seventh underscore-separated token.  Legacy names
    (``cm<DDMMMYYYY>bhav.csv.zip``) carry it between the ``cm`` prefix and
    the ``bhav.csv.zip`` suffix.

    Raises ``ValueError`` for a name matching neither scheme.
    """
    if filename.startswith("BhavCopy_NSE_CM_"):
        stamp = filename.split("_")[6]
        layout = "%Y%m%d"
    elif filename.startswith("cm") and filename.endswith("bhav.csv.zip"):
        stamp = filename.removeprefix("cm").removesuffix("bhav.csv.zip")
        layout = "%d%b%Y"
    else:
        raise ValueError(f"unrecognised bhavcopy filename: {filename}")
    return datetime.strptime(stamp, layout).date()
249
+
250
+
251
def iter_cached(paths: DataPaths) -> Iterable[Path]:
    """Yield each cached ``.zip`` file directly under ``paths.bhavcopy``, in sorted order.

    Only regular files whose suffix is exactly ``.zip`` are yielded.
    Nothing is yielded when the directory does not exist.
    """
    cache_root = paths.bhavcopy
    if cache_root.exists():
        yield from (
            candidate
            for candidate in sorted(cache_root.iterdir())
            if candidate.is_file() and candidate.suffix == ".zip"
        )
258
+
259
+
260
+ __all__ = [
261
+ "DEFAULT_HEADERS",
262
+ "UDIFF_CUTOVER",
263
+ "BhavcopySource",
264
+ "Fetcher",
265
+ "build_url",
266
+ "download_bhavcopy",
267
+ "is_udiff_date",
268
+ "iter_cached",
269
+ "parse_bhavcopy_blob",
270
+ "parse_filename_date",
271
+ "sync_range",
272
+ ]