oq-data 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oq_data/__init__.py +94 -0
- oq_data/announcements.py +176 -0
- oq_data/api.py +103 -0
- oq_data/bhavcopy.py +272 -0
- oq_data/cli.py +232 -0
- oq_data/config.py +105 -0
- oq_data/corporate_actions.py +172 -0
- oq_data/delivery.py +156 -0
- oq_data/flows.py +183 -0
- oq_data/fno.py +225 -0
- oq_data/storage.py +285 -0
- oq_data/symbols.py +146 -0
- oq_data/universes.py +130 -0
- oq_data-0.1.0.dist-info/METADATA +51 -0
- oq_data-0.1.0.dist-info/RECORD +17 -0
- oq_data-0.1.0.dist-info/WHEEL +4 -0
- oq_data-0.1.0.dist-info/entry_points.txt +2 -0
oq_data/__init__.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""oq-data — NSE/BSE data pipeline for OpenQuant India.
|
|
2
|
+
|
|
3
|
+
Top-level convenience imports mirror the most-used public API:
|
|
4
|
+
|
|
5
|
+
>>> from oq_data import prices, universe, wide_prices
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from oq_data.announcements import (
|
|
11
|
+
download_announcements,
|
|
12
|
+
parse_announcements_blob,
|
|
13
|
+
read_announcements,
|
|
14
|
+
write_announcements,
|
|
15
|
+
)
|
|
16
|
+
from oq_data.api import list_symbols, prices, resolve_symbol, universe, wide_prices
|
|
17
|
+
from oq_data.bhavcopy import build_url, download_bhavcopy, parse_bhavcopy_blob, sync_range
|
|
18
|
+
from oq_data.config import DataPaths, default_root, get_paths
|
|
19
|
+
from oq_data.corporate_actions import CorporateAction, add_actions, adjust_prices, load_actions
|
|
20
|
+
from oq_data.delivery import (
|
|
21
|
+
download_delivery,
|
|
22
|
+
parse_delivery_blob,
|
|
23
|
+
read_delivery,
|
|
24
|
+
write_delivery,
|
|
25
|
+
)
|
|
26
|
+
from oq_data.flows import (
|
|
27
|
+
download_flows,
|
|
28
|
+
parse_flows_blob,
|
|
29
|
+
read_flows,
|
|
30
|
+
write_flows,
|
|
31
|
+
)
|
|
32
|
+
from oq_data.fno import (
|
|
33
|
+
download_fno,
|
|
34
|
+
parse_fno_blob,
|
|
35
|
+
)
|
|
36
|
+
from oq_data.storage import (
|
|
37
|
+
coverage,
|
|
38
|
+
query,
|
|
39
|
+
read_fno,
|
|
40
|
+
read_prices,
|
|
41
|
+
write_eod,
|
|
42
|
+
write_fno,
|
|
43
|
+
)
|
|
44
|
+
from oq_data.symbols import SymbolMaster, add_mapping, load_master
|
|
45
|
+
from oq_data.universes import UniverseEntry, add_entries, load_universes, members_as_of
|
|
46
|
+
|
|
47
|
+
__version__ = "0.1.0"
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"CorporateAction",
|
|
51
|
+
"DataPaths",
|
|
52
|
+
"SymbolMaster",
|
|
53
|
+
"UniverseEntry",
|
|
54
|
+
"__version__",
|
|
55
|
+
"add_actions",
|
|
56
|
+
"add_entries",
|
|
57
|
+
"add_mapping",
|
|
58
|
+
"adjust_prices",
|
|
59
|
+
"build_url",
|
|
60
|
+
"coverage",
|
|
61
|
+
"default_root",
|
|
62
|
+
"download_announcements",
|
|
63
|
+
"download_bhavcopy",
|
|
64
|
+
"download_delivery",
|
|
65
|
+
"download_flows",
|
|
66
|
+
"download_fno",
|
|
67
|
+
"get_paths",
|
|
68
|
+
"list_symbols",
|
|
69
|
+
"load_actions",
|
|
70
|
+
"load_master",
|
|
71
|
+
"load_universes",
|
|
72
|
+
"members_as_of",
|
|
73
|
+
"parse_announcements_blob",
|
|
74
|
+
"parse_bhavcopy_blob",
|
|
75
|
+
"parse_delivery_blob",
|
|
76
|
+
"parse_flows_blob",
|
|
77
|
+
"parse_fno_blob",
|
|
78
|
+
"prices",
|
|
79
|
+
"query",
|
|
80
|
+
"read_announcements",
|
|
81
|
+
"read_delivery",
|
|
82
|
+
"read_flows",
|
|
83
|
+
"read_fno",
|
|
84
|
+
"read_prices",
|
|
85
|
+
"resolve_symbol",
|
|
86
|
+
"sync_range",
|
|
87
|
+
"universe",
|
|
88
|
+
"wide_prices",
|
|
89
|
+
"write_announcements",
|
|
90
|
+
"write_delivery",
|
|
91
|
+
"write_eod",
|
|
92
|
+
"write_flows",
|
|
93
|
+
"write_fno",
|
|
94
|
+
]
|
oq_data/announcements.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Corporate-announcements feed ingestion.
|
|
2
|
+
|
|
3
|
+
NSE publishes a rolling JSON feed of corporate announcements at
|
|
4
|
+
``https://www.nseindia.com/api/corporate-announcements?index=equities``.
|
|
5
|
+
Each row carries the announcement timestamp, symbol, broad category, a
|
|
6
|
+
short subject line, and an attachment URL.
|
|
7
|
+
|
|
8
|
+
The canonical schema we persist is::
|
|
9
|
+
|
|
10
|
+
date, symbol, category, subject, attachment
|
|
11
|
+
|
|
12
|
+
``date`` is the announcement business date (``date``-typed), suitable
|
|
13
|
+
for the same year-partitioned storage layout used by the EOD writers.
|
|
14
|
+
|
|
15
|
+
Network calls go through the same injectable ``Fetcher`` as the rest of
|
|
16
|
+
the pipeline so the suite stays offline.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import io
|
|
22
|
+
import json
|
|
23
|
+
import logging
|
|
24
|
+
from collections.abc import Iterable
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from datetime import date, timedelta
|
|
27
|
+
|
|
28
|
+
import pandas as pd
|
|
29
|
+
|
|
30
|
+
from oq_data.bhavcopy import Fetcher, _default_fetcher
|
|
31
|
+
from oq_data.config import DataPaths, get_paths
|
|
32
|
+
from oq_data.storage import write_partitioned
|
|
33
|
+
|
|
34
|
+
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
_NORMALISED_COLUMNS = ["date", "symbol", "category", "subject", "attachment"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True, slots=True)
|
|
40
|
+
class AnnouncementsSource:
|
|
41
|
+
when: date
|
|
42
|
+
url: str
|
|
43
|
+
filename: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_url(when: date) -> AnnouncementsSource:
    """Return the feed URL and cache filename for the announcements on ``when``.

    The same day is used for both ``from_date`` and ``to_date`` so the
    request covers exactly one calendar date.
    """
    day = f"{when:%d-%m-%Y}"
    query = f"?index=equities&from_date={day}&to_date={day}"
    return AnnouncementsSource(
        when=when,
        url="https://www.nseindia.com/api/corporate-announcements" + query,
        filename=f"announcements_{when:%Y%m%d}.json",
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _pick(row: dict, *keys: str) -> object:
|
|
56
|
+
for k in keys:
|
|
57
|
+
if k in row and row[k] not in (None, ""):
|
|
58
|
+
return row[k]
|
|
59
|
+
return ""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_announcements_blob(blob: bytes, when: date) -> pd.DataFrame:
    """Parse a raw announcements payload into the canonical schema.

    ``blob`` may be JSON (a list of rows, or an object holding the rows
    under ``"data"`` or ``"rows"``) or CSV. The result always has the
    columns ``date, symbol, category, subject, attachment``; ``date`` is
    set to ``when`` for every row and rows without a symbol are dropped.
    An empty or whitespace-only payload yields an empty frame.
    """
    text = blob.decode("utf-8-sig", errors="ignore").lstrip()
    if not text:
        # Nothing to parse; pd.read_csv would raise on an empty body.
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)
    if text.startswith(("[", "{")):
        data = json.loads(text)
        rows = data if isinstance(data, list) else data.get("data", data.get("rows", []))
    else:
        rows = pd.read_csv(io.BytesIO(blob)).to_dict("records")
    if not rows:
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)

    def column(*keys: str) -> list[str]:
        # First non-missing value per row, as stripped text.
        return [str(_pick(row, *keys)).strip() for row in rows]

    df = pd.DataFrame(
        {
            "date": pd.to_datetime(when),
            "symbol": column("symbol", "Symbol", "SYMBOL"),
            "category": column("category", "Category", "broadcastsubject"),
            "subject": column("subject", "Subject", "desc", "Description"),
            "attachment": column("attchmntFile", "attachment", "attachmentUrl"),
        }
    )
    df = df[df["symbol"] != ""].reset_index(drop=True)
    return df[_NORMALISED_COLUMNS]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _cache_dir(paths: DataPaths):
|
|
91
|
+
p = paths.raw / "announcements"
|
|
92
|
+
p.mkdir(parents=True, exist_ok=True)
|
|
93
|
+
return p
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def download_announcements(
    when: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    use_cache: bool = True,
) -> pd.DataFrame:
    """Fetch (or load from cache) and parse the announcements feed for ``when``.

    With ``use_cache=True`` an existing cached response is parsed
    instead of hitting the network. Otherwise the URL from
    :func:`build_url` is fetched with ``fetcher`` (defaulting to the
    shared ``_default_fetcher``). A freshly fetched payload is written to
    the cache only after it has parsed successfully, so an unparseable
    response is never persisted where it would shadow later fetches.
    """
    paths = paths or get_paths()
    paths.ensure()
    src = build_url(when)
    cache_path = _cache_dir(paths) / src.filename
    if use_cache and cache_path.exists():
        return parse_announcements_blob(cache_path.read_bytes(), when)
    fetch = fetcher or _default_fetcher
    blob = fetch(src.url)
    # Parse before caching: a parse failure propagates without leaving a
    # bad file behind.
    df = parse_announcements_blob(blob, when)
    cache_path.write_bytes(blob)
    return df
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def write_announcements(df: pd.DataFrame, paths: DataPaths | None = None) -> int:
    """Persist ``df`` under ``paths.announcements`` via ``write_partitioned``.

    The key columns passed along are ``date``, ``symbol`` and ``subject``
    (presumably used for de-duplication - confirm against
    ``storage.write_partitioned``). Returns whatever ``write_partitioned``
    returns.
    """
    target = paths or get_paths()
    target.ensure()
    return write_partitioned(df, target.announcements, ["date", "symbol", "subject"])
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def read_announcements(
    symbols: str | Iterable[str] | None = None,
    start: date | str | None = None,
    end: date | str | None = None,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Load stored announcements, optionally filtered by symbol and date.

    Every ``year=*/data.parquet`` file under ``paths.announcements`` is
    read and concatenated. ``symbols`` may be a single ticker or an
    iterable of tickers; ``start`` and ``end`` are inclusive bounds. The
    result is sorted by ``date`` then ``symbol``. When no partition
    exists an empty frame with the canonical columns is returned.
    """
    paths = paths or get_paths()
    partition_files = sorted(paths.announcements.glob("year=*/data.parquet"))
    if not partition_files:
        return pd.DataFrame(columns=_NORMALISED_COLUMNS)

    frames = [pd.read_parquet(part) for part in partition_files]
    result = pd.concat(frames, ignore_index=True)

    if symbols is not None:
        wanted = {symbols} if isinstance(symbols, str) else set(symbols)
        result = result[result["symbol"].isin(wanted)]
    if start is not None:
        lower = pd.to_datetime(start)
        result = result[result["date"] >= lower]
    if end is not None:
        upper = pd.to_datetime(end)
        result = result[result["date"] <= upper]

    ordered = result.sort_values(["date", "symbol"])
    return ordered.reset_index(drop=True)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def sync_range(
    start: date,
    end: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    on_missing: str = "skip",
) -> Iterable[date]:
    """Download the announcements feed for each weekday in ``[start, end]``.

    This is a generator: nothing runs (including the ``end < start``
    check) until it is iterated. Saturdays and Sundays are skipped. Any
    exception raised while downloading a day is re-raised when
    ``on_missing == "raise"``; for any other value it is logged at INFO
    level and the day is skipped. Yields each date that downloaded
    successfully.

    Raises:
        ValueError: if ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("end must be >= start")
    paths = paths or get_paths()
    paths.ensure()
    cur = start
    one_day = timedelta(days=1)
    while cur <= end:
        if cur.weekday() < 5:
            try:
                download_announcements(cur, paths=paths, fetcher=fetcher)
            except Exception as exc:
                if on_missing == "raise":
                    raise
                logger.info("announcements feed unavailable for %s: %s", cur, exc)
            else:
                # Yield outside the try so an exception thrown into the
                # generator by the consumer is not mistaken for a
                # download failure.
                yield cur
        cur += one_day
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
__all__ = [
|
|
169
|
+
"AnnouncementsSource",
|
|
170
|
+
"build_url",
|
|
171
|
+
"download_announcements",
|
|
172
|
+
"parse_announcements_blob",
|
|
173
|
+
"read_announcements",
|
|
174
|
+
"sync_range",
|
|
175
|
+
"write_announcements",
|
|
176
|
+
]
|
oq_data/api.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""High-level Python API for downstream packages.
|
|
2
|
+
|
|
3
|
+
This is the surface most users will touch: :func:`prices` for a clean
|
|
4
|
+
adjusted price series, :func:`universe` for a point-in-time membership
|
|
5
|
+
set, and :func:`wide_prices` for a date-indexed wide frame that drops
|
|
6
|
+
straight into :func:`oq_backtest.backtest`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Iterable
|
|
12
|
+
from datetime import date
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from oq_data import corporate_actions, storage, symbols, universes
|
|
17
|
+
from oq_data.config import DataPaths, get_paths
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def prices(
    symbol: str | Iterable[str],
    start: date | str | None = None,
    end: date | str | None = None,
    adjusted: bool = True,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Return long-form price rows for one symbol or several.

    Rows come from ``storage.read_prices``. When ``adjusted`` is true and
    both the price frame and the stored corporate actions are non-empty,
    the frame is passed through ``corporate_actions.adjust_prices``
    (intended to back-adjust for splits, bonuses and dividends so the
    series is usable for backtesting). In every other case the stored
    rows are returned as read.
    """
    paths = paths or get_paths()
    raw = storage.read_prices(symbols=symbol, start=start, end=end, paths=paths)
    if adjusted and not raw.empty:
        actions = corporate_actions.load_actions(paths=paths)
        if not actions.empty:
            return corporate_actions.adjust_prices(raw, actions)
    return raw
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def wide_prices(
    universe_symbols: Iterable[str],
    start: date | str | None = None,
    end: date | str | None = None,
    field: str = "close",
    adjusted: bool = True,
    paths: DataPaths | None = None,
) -> pd.DataFrame:
    """Return a date-indexed wide DataFrame ready for the backtester.

    Rows are dates, columns are symbols and values are ``field``
    (default ``close``) taken from :func:`prices`. If a (date, symbol)
    pair occurs more than once the last value wins. A single ticker
    passed as a plain string is treated as a one-symbol universe. An
    empty price result yields an empty ``DataFrame``.

    Raises:
        ValueError: if ``universe_symbols`` is empty.
        KeyError: if ``field`` is not a column of the price frame.
    """
    # A bare str is itself an iterable of characters; without this guard
    # "ABC" would be queried as the three symbols "A", "B" and "C".
    if isinstance(universe_symbols, str):
        syms = [universe_symbols]
    else:
        syms = list(universe_symbols)
    if not syms:
        raise ValueError("universe_symbols must be non-empty")
    long_df = prices(syms, start=start, end=end, adjusted=adjusted, paths=paths)
    if long_df.empty:
        return pd.DataFrame()
    if field not in long_df.columns:
        raise KeyError(f"field {field!r} not in {sorted(long_df.columns)}")
    wide = long_df.pivot_table(index="date", columns="symbol", values=field, aggfunc="last")
    wide = wide.sort_index()
    wide.index = pd.DatetimeIndex(wide.index)
    return wide
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def universe(
    index_name: str,
    as_of: date | str,
    paths: DataPaths | None = None,
) -> list[str]:
    """Return the ``symbol`` column of ``index_name``'s membership on ``as_of``.

    ``as_of`` is normalised to a ``date`` before being handed to
    ``universes.members_as_of``.
    """
    paths = paths or get_paths()
    as_of_date = pd.to_datetime(as_of).date()
    membership = universes.members_as_of(index_name, as_of_date, paths=paths)
    return membership["symbol"].tolist()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def resolve_symbol(symbol: str, when: date | str, paths: DataPaths | None = None) -> str:
    """Look up ``symbol`` as of ``when`` in the stored symbol master.

    ``when`` is normalised to a ``date``; the lookup itself is delegated
    to ``SymbolMaster.resolve_as_of``.
    """
    paths = paths or get_paths()
    as_of_date = pd.to_datetime(when).date()
    return symbols.load_master(paths=paths).resolve_as_of(symbol, as_of_date)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def list_symbols(paths: DataPaths | None = None) -> list[str]:
    """Return the distinct ``symbol`` values of the ``eod`` dataset, sorted by the query."""
    paths = paths or get_paths()
    found = storage.query("SELECT DISTINCT symbol FROM eod ORDER BY symbol", paths=paths)
    if found.empty:
        return []
    return found["symbol"].tolist()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
__all__ = [
|
|
98
|
+
"list_symbols",
|
|
99
|
+
"prices",
|
|
100
|
+
"resolve_symbol",
|
|
101
|
+
"universe",
|
|
102
|
+
"wide_prices",
|
|
103
|
+
]
|
oq_data/bhavcopy.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""NSE equity Bhavcopy ingestion.
|
|
2
|
+
|
|
3
|
+
NSE publishes one zipped CSV per trading day with all listed cash-market
|
|
4
|
+
instruments and their OHLCV. Two formats are in active circulation:
|
|
5
|
+
|
|
6
|
+
* **Legacy** (until ~July 2020): ``cm{DDMMMYYYY}bhav.csv.zip`` with the
|
|
7
|
+
classic ``SYMBOL,SERIES,OPEN,HIGH,LOW,CLOSE,LAST,PREVCLOSE,...`` schema.
|
|
8
|
+
* **UDiFF** (from ~July 2020 onward): ``BhavCopy_NSE_CM_0_0_0_{YYYYMMDD}
|
|
9
|
+
_F_0000.csv.zip`` with the longer ``TradDt,BizDt,Sgmt,Src,FinInstrmTp,
|
|
10
|
+
FinInstrmId,ISIN,TckrSymb,SctySrs,...`` schema.
|
|
11
|
+
|
|
12
|
+
This module exposes URL builders, format-detecting parsers, and a single
|
|
13
|
+
:func:`download_bhavcopy` entry point that retries, resumes from cache,
|
|
14
|
+
and returns a normalised :class:`pandas.DataFrame`.
|
|
15
|
+
|
|
16
|
+
Network calls are isolated behind an injectable ``fetcher`` callable so
|
|
17
|
+
tests run fully offline against the fixtures under ``tests/fixtures``.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import io
|
|
23
|
+
import logging
|
|
24
|
+
import time
|
|
25
|
+
import zipfile
|
|
26
|
+
from collections.abc import Callable, Iterable
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from datetime import date, datetime, timedelta
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
import httpx
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
from oq_data.config import DataPaths, get_paths
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
UDIFF_CUTOVER = date(2020, 7, 11)
|
|
39
|
+
USER_AGENT = (
|
|
40
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
|
|
41
|
+
"(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
|
|
42
|
+
)
|
|
43
|
+
DEFAULT_HEADERS = {
|
|
44
|
+
"User-Agent": USER_AGENT,
|
|
45
|
+
"Accept": "*/*",
|
|
46
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
Fetcher = Callable[[str], bytes]
|
|
50
|
+
|
|
51
|
+
_NORMALISED_COLUMNS = [
|
|
52
|
+
"date",
|
|
53
|
+
"symbol",
|
|
54
|
+
"isin",
|
|
55
|
+
"series",
|
|
56
|
+
"open",
|
|
57
|
+
"high",
|
|
58
|
+
"low",
|
|
59
|
+
"close",
|
|
60
|
+
"prev_close",
|
|
61
|
+
"volume",
|
|
62
|
+
"value",
|
|
63
|
+
"trades",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(frozen=True, slots=True)
|
|
68
|
+
class BhavcopySource:
|
|
69
|
+
"""The full address of a bhavcopy for a given date."""
|
|
70
|
+
|
|
71
|
+
when: date
|
|
72
|
+
url: str
|
|
73
|
+
filename: str
|
|
74
|
+
is_udiff: bool
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def is_udiff_date(when: date) -> bool:
    """Tell whether ``when`` falls on or after ``UDIFF_CUTOVER``."""
    return UDIFF_CUTOVER <= when
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def build_url(when: date) -> BhavcopySource:
    """Build the NSE archive URL and filename for the bhavcopy on ``when``.

    Dates for which :func:`is_udiff_date` is true get the UDiFF file
    name; earlier dates get the legacy ``cm{DD}{MON}{YYYY}bhav.csv.zip``
    name under a ``{YYYY}/{MON}/`` directory. The month abbreviation is
    always the upper-case English one, regardless of the process locale.
    """
    if is_udiff_date(when):
        fname = f"BhavCopy_NSE_CM_0_0_0_{when:%Y%m%d}_F_0000.csv.zip"
        url = f"https://nsearchives.nseindia.com/content/cm/{fname}"
        return BhavcopySource(when=when, url=url, filename=fname, is_udiff=True)

    # strftime("%b") follows LC_TIME, so under a non-English locale it
    # would produce a month token NSE does not use. Use a fixed table.
    months = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    )
    mon = months[when.month - 1]
    fname = f"cm{when:%d}{mon}{when:%Y}bhav.csv.zip"
    url = (
        "https://nsearchives.nseindia.com/content/historical/EQUITIES/"
        f"{when:%Y}/{mon}/{fname}"
    )
    return BhavcopySource(when=when, url=url, filename=fname, is_udiff=False)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _default_fetcher(url: str, timeout: float = 30.0, retries: int = 3) -> bytes:
    """HTTP GET ``url`` with retries and the module's default headers.

    Up to ``retries`` attempts are made, sleeping ``min(2**attempt, 8)``
    seconds between attempts (not after the last one). A 404 response is
    not retried: the file is simply not published for that URL, so
    backing off only delays the caller.

    Returns:
        The response body.

    Raises:
        RuntimeError: when no attempt succeeded; the last ``httpx`` error
            is attached as ``__cause__``.
    """
    last_exc: Exception | None = None
    attempts_made = 0
    for attempt in range(1, retries + 1):
        attempts_made = attempt
        try:
            with httpx.Client(
                headers=DEFAULT_HEADERS, follow_redirects=True, timeout=timeout
            ) as client:
                resp = client.get(url)
                resp.raise_for_status()
                return resp.content
        except httpx.HTTPError as exc:
            last_exc = exc
            logger.warning("bhavcopy fetch failed (attempt %d/%d): %s", attempt, retries, exc)
            if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404:
                # Missing file: retrying cannot help.
                break
            if attempt < retries:
                # Only back off when another attempt will follow.
                time.sleep(min(2**attempt, 8))
    raise RuntimeError(f"failed to fetch {url} after {attempts_made} attempts") from last_exc
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _read_csv_from_zip(blob: bytes) -> pd.DataFrame:
|
|
115
|
+
with zipfile.ZipFile(io.BytesIO(blob)) as zf:
|
|
116
|
+
names = [n for n in zf.namelist() if n.lower().endswith(".csv")]
|
|
117
|
+
if not names:
|
|
118
|
+
raise ValueError("zip archive contains no .csv member")
|
|
119
|
+
with zf.open(names[0]) as fh:
|
|
120
|
+
return pd.read_csv(fh)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _normalise_legacy(raw: pd.DataFrame, when: date) -> pd.DataFrame:
|
|
124
|
+
raw = raw.rename(columns=lambda c: c.strip().upper())
|
|
125
|
+
df = pd.DataFrame(
|
|
126
|
+
{
|
|
127
|
+
"date": pd.to_datetime(when),
|
|
128
|
+
"symbol": raw["SYMBOL"].astype(str).str.strip(),
|
|
129
|
+
"isin": raw.get("ISIN", pd.Series([pd.NA] * len(raw))).astype("string").str.strip(),
|
|
130
|
+
"series": raw["SERIES"].astype(str).str.strip(),
|
|
131
|
+
"open": pd.to_numeric(raw["OPEN"], errors="coerce"),
|
|
132
|
+
"high": pd.to_numeric(raw["HIGH"], errors="coerce"),
|
|
133
|
+
"low": pd.to_numeric(raw["LOW"], errors="coerce"),
|
|
134
|
+
"close": pd.to_numeric(raw["CLOSE"], errors="coerce"),
|
|
135
|
+
"prev_close": pd.to_numeric(raw["PREVCLOSE"], errors="coerce"),
|
|
136
|
+
"volume": pd.to_numeric(raw["TOTTRDQTY"], errors="coerce").astype("Int64"),
|
|
137
|
+
"value": pd.to_numeric(raw["TOTTRDVAL"], errors="coerce"),
|
|
138
|
+
"trades": pd.to_numeric(raw.get("TOTALTRADES", pd.NA), errors="coerce").astype("Int64"),
|
|
139
|
+
}
|
|
140
|
+
)
|
|
141
|
+
return df
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _normalise_udiff(raw: pd.DataFrame, when: date) -> pd.DataFrame:
|
|
145
|
+
raw = raw.rename(columns=lambda c: c.strip())
|
|
146
|
+
df = pd.DataFrame(
|
|
147
|
+
{
|
|
148
|
+
"date": pd.to_datetime(when),
|
|
149
|
+
"symbol": raw["TckrSymb"].astype(str).str.strip(),
|
|
150
|
+
"isin": raw["ISIN"].astype("string").str.strip(),
|
|
151
|
+
"series": raw["SctySrs"].astype(str).str.strip(),
|
|
152
|
+
"open": pd.to_numeric(raw["OpnPric"], errors="coerce"),
|
|
153
|
+
"high": pd.to_numeric(raw["HghPric"], errors="coerce"),
|
|
154
|
+
"low": pd.to_numeric(raw["LwPric"], errors="coerce"),
|
|
155
|
+
"close": pd.to_numeric(raw["ClsPric"], errors="coerce"),
|
|
156
|
+
"prev_close": pd.to_numeric(raw["PrvsClsgPric"], errors="coerce"),
|
|
157
|
+
"volume": pd.to_numeric(raw["TtlTradgVol"], errors="coerce").astype("Int64"),
|
|
158
|
+
"value": pd.to_numeric(raw["TtlTrfVal"], errors="coerce"),
|
|
159
|
+
"trades": pd.to_numeric(raw["TtlNbOfTxsExctd"], errors="coerce").astype("Int64"),
|
|
160
|
+
}
|
|
161
|
+
)
|
|
162
|
+
fininstrm = raw.get("FinInstrmTp")
|
|
163
|
+
if fininstrm is not None:
|
|
164
|
+
df = df[fininstrm.astype(str).str.upper().isin({"STK", "EQ"})].reset_index(drop=True)
|
|
165
|
+
return df
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def parse_bhavcopy_blob(blob: bytes, when: date) -> pd.DataFrame:
    """Turn a downloaded bhavcopy (zip archive or bare CSV) into the canonical frame.

    A payload starting with the zip magic ``PK`` is unpacked first;
    anything else is read directly as CSV. The UDiFF normaliser is used
    when a ``TckrSymb`` column is present (compared case-insensitively),
    the legacy one otherwise. The returned columns are exactly
    ``date, symbol, isin, series, open, high, low, close, prev_close,
    volume, value, trades``.
    """
    if blob[:2] == b"PK":
        raw = _read_csv_from_zip(blob)
    else:
        raw = pd.read_csv(io.BytesIO(blob))
    header_names = {name.strip().upper() for name in raw.columns}
    if "TCKRSYMB" in header_names:
        normalised = _normalise_udiff(raw, when)
    else:
        normalised = _normalise_legacy(raw, when)
    return normalised[_NORMALISED_COLUMNS]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def download_bhavcopy(
    when: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    use_cache: bool = True,
) -> pd.DataFrame:
    """Download and parse the NSE equity bhavcopy for ``when``.

    The raw archive is cached under ``paths.bhavcopy``; with
    ``use_cache=True`` an existing cached file is parsed instead of
    fetching again. Pass ``fetcher`` to substitute a callable (used by
    tests) or to swap in your own retry/auth wrapper. A freshly fetched
    payload is written to the cache only after it has parsed
    successfully, so a corrupt or non-bhavcopy response is never
    persisted where it would shadow later fetches.
    """
    paths = paths or get_paths()
    paths.ensure()
    src = build_url(when)
    cache_path = paths.bhavcopy / src.filename

    if use_cache and cache_path.exists():
        return parse_bhavcopy_blob(cache_path.read_bytes(), when)

    fetch = fetcher or _default_fetcher
    blob = fetch(src.url)
    # Parse before caching: a parse failure propagates without leaving a
    # bad file behind.
    df = parse_bhavcopy_blob(blob, when)
    cache_path.write_bytes(blob)
    return df
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def sync_range(
    start: date,
    end: date,
    paths: DataPaths | None = None,
    fetcher: Fetcher | None = None,
    on_missing: str = "skip",
) -> Iterable[date]:
    """Download every available bhavcopy in ``[start, end]`` inclusive.

    This is a generator: nothing runs (including the ``end < start``
    check) until it is iterated. Weekends are always skipped. Any
    exception raised while downloading a day (for example the failure
    produced by a holiday's missing file) is re-raised when
    ``on_missing == "raise"``; for any other value it is logged at INFO
    level and the day is skipped. Yields the dates that were
    successfully ingested.

    Raises:
        ValueError: if ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("end must be >= start")
    paths = paths or get_paths()
    paths.ensure()
    cur = start
    one_day = timedelta(days=1)
    while cur <= end:
        if cur.weekday() < 5:
            try:
                download_bhavcopy(cur, paths=paths, fetcher=fetcher)
            except Exception as exc:
                if on_missing == "raise":
                    raise
                logger.info("bhavcopy unavailable for %s: %s", cur, exc)
            else:
                # Yield outside the try so an exception thrown into the
                # generator by the consumer is not mistaken for a
                # download failure.
                yield cur
        cur += one_day
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def parse_filename_date(filename: str) -> date:
    """Recover the trading date from a cached bhavcopy filename.

    Two shapes are recognised: ``BhavCopy_NSE_CM_..._{YYYYMMDD}_...``
    (date is the seventh ``_``-separated token) and
    ``cm{DD}{MON}{YYYY}bhav.csv.zip``. The month abbreviation is matched
    against the English names case-insensitively and independently of
    the process locale.

    Raises:
        ValueError: if the name matches neither shape or the embedded
            date is not valid.
    """
    months = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    )
    try:
        if filename.startswith("BhavCopy_NSE_CM_"):
            token = filename.split("_")[6]
            return datetime.strptime(token, "%Y%m%d").date()
        if filename.startswith("cm") and filename.endswith("bhav.csv.zip"):
            token = filename[2:-12]
            # Layout is <day><3-letter month><4-digit year>; slice from
            # the right so a one-digit day is still accepted.
            day, mon, year = token[:-7], token[-7:-4].upper(), token[-4:]
            if not (day.isdigit() and year.isdigit()):
                raise ValueError(token)
            return date(int(year), months.index(mon) + 1, int(day))
    except (IndexError, ValueError) as exc:
        # Too few tokens, an unknown month, or an impossible date.
        raise ValueError(f"unrecognised bhavcopy filename: {filename}") from exc
    raise ValueError(f"unrecognised bhavcopy filename: {filename}")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def iter_cached(paths: DataPaths) -> Iterable[Path]:
    """Yield, in sorted order, each regular ``.zip`` file directly under ``paths.bhavcopy``.

    Yields nothing when the directory does not exist.
    """
    cache_dir = paths.bhavcopy
    if cache_dir.exists():
        yield from (
            candidate
            for candidate in sorted(cache_dir.iterdir())
            if candidate.is_file() and candidate.suffix == ".zip"
        )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
__all__ = [
|
|
261
|
+
"DEFAULT_HEADERS",
|
|
262
|
+
"UDIFF_CUTOVER",
|
|
263
|
+
"BhavcopySource",
|
|
264
|
+
"Fetcher",
|
|
265
|
+
"build_url",
|
|
266
|
+
"download_bhavcopy",
|
|
267
|
+
"is_udiff_date",
|
|
268
|
+
"iter_cached",
|
|
269
|
+
"parse_bhavcopy_blob",
|
|
270
|
+
"parse_filename_date",
|
|
271
|
+
"sync_range",
|
|
272
|
+
]
|