yfd 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yfd/__init__.py +1 -0
- yfd/cli.py +77 -0
- yfd/fetcher.py +62 -0
- yfd/fundamentals.py +471 -0
- yfd/r2.py +39 -0
- yfd/store.py +124 -0
- yfd/symbols.py +26 -0
- yfd/updater.py +189 -0
- yfd/yahoo.py +434 -0
- yfd-0.3.0.dist-info/METADATA +12 -0
- yfd-0.3.0.dist-info/RECORD +13 -0
- yfd-0.3.0.dist-info/WHEEL +4 -0
- yfd-0.3.0.dist-info/entry_points.txt +2 -0
yfd/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from yfd.store import DATA_TYPES, get, symbols, types
|
yfd/cli.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""CLI entry point: ``yfd <data_type> <symbol> [--start] [--end] [--json]``."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from yfd.store import DATA_TYPES, get, symbols, types
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _format_df(df: pd.DataFrame, as_json: bool) -> str:
|
|
13
|
+
if as_json:
|
|
14
|
+
return df.to_json(orient="split", date_format="iso", indent=2)
|
|
15
|
+
return df.to_string()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _format_value(val, as_json: bool) -> str:
|
|
19
|
+
if as_json:
|
|
20
|
+
return json.dumps(val, default=str, indent=2)
|
|
21
|
+
if isinstance(val, dict):
|
|
22
|
+
return "\n".join(f"{k}: {v}" for k, v in val.items())
|
|
23
|
+
if isinstance(val, list):
|
|
24
|
+
return json.dumps(val, default=str, indent=2)
|
|
25
|
+
return str(val)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def main():
    """Run the ``yfd`` command line.

    ``types`` and ``symbols`` are listing commands.  Any other command is
    treated as a data type (dashes are accepted in place of underscores) and
    requires a ticker symbol.  Errors are reported on stderr with exit
    status 1.
    """
    parser = argparse.ArgumentParser(
        prog="yfd",
        description="Yahoo Finance data — read from R2 storage",
    )
    parser.add_argument(
        "command",
        help="Data type (e.g. price, income-stmt, info) or: types, symbols",
    )
    parser.add_argument("symbol", nargs="?", help="Ticker symbol (e.g. AAPL)")
    parser.add_argument("--start", help="Start date filter (price only)")
    parser.add_argument("--end", help="End date filter (price only)")
    parser.add_argument("--json", action="store_true", dest="as_json", help="JSON output")
    args = parser.parse_args()
    command = args.command

    # Listing commands need no symbol.
    if command == "types":
        for name, desc in sorted(types().items()):
            print(f" {name:<25} {desc}")
        return

    if command == "symbols":
        syms = symbols()
        suffix = "..." if len(syms) > 20 else ""
        print(f"{len(syms)} symbols: {', '.join(syms[:20])}{suffix}")
        return

    # Everything else is a data-type lookup for one symbol.
    data_type = command.replace("-", "_")
    if data_type not in DATA_TYPES:
        print(f"Unknown: {command}. Run 'yfd types' for valid types.", file=sys.stderr)
        sys.exit(1)

    if not args.symbol:
        print(f"Usage: yfd {command} <SYMBOL>", file=sys.stderr)
        sys.exit(1)

    ticker = args.symbol.upper()
    result = get(ticker, data_type, start=args.start, end=args.end)
    if result is None:
        print(f"No data for {ticker}/{data_type}", file=sys.stderr)
        sys.exit(1)

    render = _format_df if isinstance(result, pd.DataFrame) else _format_value
    print(render(result, args.as_json))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
main()
|
yfd/fetcher.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Fetch full daily OHLCV history and save as per-symbol parquet files."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from yfd.yahoo import Yahoo
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def fetch_daily_history(
|
|
11
|
+
symbols: list[str],
|
|
12
|
+
output_dir: Path,
|
|
13
|
+
batch_size: int = 50,
|
|
14
|
+
pause: float = 1.0,
|
|
15
|
+
end: str | None = None,
|
|
16
|
+
workers: int = 10,
|
|
17
|
+
) -> list[str]:
|
|
18
|
+
"""Fetch full daily OHLCV history and save as per-symbol parquet files.
|
|
19
|
+
|
|
20
|
+
Returns list of symbols that failed.
|
|
21
|
+
"""
|
|
22
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
23
|
+
existing = {p.stem for p in output_dir.glob("*.parquet")}
|
|
24
|
+
remaining = [s for s in symbols if s not in existing]
|
|
25
|
+
if existing:
|
|
26
|
+
print(f"Skipping {len(existing)} already fetched, {len(remaining)} remaining")
|
|
27
|
+
|
|
28
|
+
yahoo = Yahoo()
|
|
29
|
+
failed: list[str] = []
|
|
30
|
+
total_batches = -(-len(remaining) // batch_size)
|
|
31
|
+
|
|
32
|
+
for i in range(0, len(remaining), batch_size):
|
|
33
|
+
batch = remaining[i : i + batch_size]
|
|
34
|
+
batch_num = i // batch_size + 1
|
|
35
|
+
print(f"[{batch_num}/{total_batches}] Fetching {len(batch)} symbols...")
|
|
36
|
+
|
|
37
|
+
saved = 0
|
|
38
|
+
|
|
39
|
+
def _fetch(sym):
|
|
40
|
+
df, _, _ = yahoo.chart(sym, end=end, period="max")
|
|
41
|
+
return sym, df
|
|
42
|
+
|
|
43
|
+
with ThreadPoolExecutor(max_workers=workers) as pool:
|
|
44
|
+
futures = {pool.submit(_fetch, s): s for s in batch}
|
|
45
|
+
for fut in as_completed(futures):
|
|
46
|
+
sym = futures[fut]
|
|
47
|
+
try:
|
|
48
|
+
_, df = fut.result()
|
|
49
|
+
if df is not None and not df.empty:
|
|
50
|
+
df.to_parquet(output_dir / f"{sym}.parquet")
|
|
51
|
+
saved += 1
|
|
52
|
+
else:
|
|
53
|
+
failed.append(sym)
|
|
54
|
+
except Exception:
|
|
55
|
+
failed.append(sym)
|
|
56
|
+
|
|
57
|
+
print(f" Saved {saved}/{len(batch)} symbols")
|
|
58
|
+
|
|
59
|
+
if batch_num < total_batches:
|
|
60
|
+
time.sleep(pause)
|
|
61
|
+
|
|
62
|
+
return failed
|
yfd/fundamentals.py
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
"""Fetch non-price data (financials, holders, analysis, etc.) and upload to R2.
|
|
2
|
+
|
|
3
|
+
Uses direct Yahoo Finance API calls instead of yfinance.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from yfd.r2 import R2
|
|
13
|
+
from yfd.symbols import fetch_symbols
|
|
14
|
+
from yfd.yahoo import (
|
|
15
|
+
BALANCE_SHEET_KEYS,
|
|
16
|
+
CASH_FLOW_KEYS,
|
|
17
|
+
INCOME_KEYS,
|
|
18
|
+
Yahoo,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# All quoteSummary modules we need, fetched in a single API call per symbol.
|
|
22
|
+
QS_MODULES = [
    # Holders and insider activity
    "majorHoldersBreakdown",
    "institutionOwnership",
    "fundOwnership",
    "insiderTransactions",
    "insiderHolders",
    "netSharePurchaseActivity",
    # Analyst recommendations and earnings
    "recommendationTrend",
    "upgradeDowngradeHistory",
    "earningsTrend",
    "earningsHistory",
    # Growth comparison series
    "industryTrend",
    "sectorTrend",
    "indexTrend",
    # ESG, financial data, calendar and filings
    "esgScores",
    "financialData",
    "calendarEvents",
    "secFilings",
    # Modules merged into the info dict
    "quoteType",
    "defaultKeyStatistics",
    "assetProfile",
    "summaryDetail",
    # Fund data
    "summaryProfile",
    "topHoldings",
    "fundProfile",
]
|
|
32
|
+
|
|
33
|
+
# (keys, timeseries prefix, R2 prefixes to upload to)
|
|
34
|
+
TIMESERIES_MAP = [
    # Income statement (each frame is stored under two R2 prefixes)
    (INCOME_KEYS, "annual", ["financials", "income_stmt"]),
    (INCOME_KEYS, "quarterly", ["financials_quarterly", "income_stmt_quarterly"]),
    (INCOME_KEYS, "trailing", ["ttm_financials", "ttm_income_stmt"]),
    # Balance sheet
    (BALANCE_SHEET_KEYS, "annual", ["balance_sheet"]),
    (BALANCE_SHEET_KEYS, "quarterly", ["balance_sheet_quarterly"]),
    # Cash flow
    (CASH_FLOW_KEYS, "annual", ["cashflow"]),
    (CASH_FLOW_KEYS, "quarterly", ["cashflow_quarterly"]),
    (CASH_FLOW_KEYS, "trailing", ["ttm_cashflow"]),
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Helpers
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
def _raw(val):
|
|
51
|
+
"""Unwrap Yahoo's ``{raw, fmt}`` value wrapper."""
|
|
52
|
+
if isinstance(val, dict) and "raw" in val:
|
|
53
|
+
return val["raw"]
|
|
54
|
+
return val
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _put_df(df: pd.DataFrame | pd.Series | None, r2: R2, key: str):
|
|
58
|
+
if df is None or (hasattr(df, "empty") and df.empty):
|
|
59
|
+
return
|
|
60
|
+
if isinstance(df, pd.Series):
|
|
61
|
+
df = df.to_frame()
|
|
62
|
+
buf = io.BytesIO()
|
|
63
|
+
df.to_parquet(buf)
|
|
64
|
+
r2.s3.put_object(Bucket=r2.bucket, Key=key, Body=buf.seek(0) or buf.read())
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _put_json(data, r2: R2, key: str):
    """Upload *data* to R2 under *key* as JSON bytes.

    Falsy data (``None``, empty containers, ...) is skipped.  Values that
    ``json`` cannot encode natively are stringified via ``default=str``.
    """
    if data:
        payload = json.dumps(data, default=str).encode()
        r2.s3.put_object(Bucket=r2.bucket, Key=key, Body=payload)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _safe(fn, errors: list, label: str):
|
|
78
|
+
"""Call *fn*; on error, append to *errors* and return ``None``."""
|
|
79
|
+
try:
|
|
80
|
+
return fn()
|
|
81
|
+
except Exception as e:
|
|
82
|
+
errors.append(f"{label}: {e}")
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Parsers — chart events
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def _parse_dividends(events):
|
|
91
|
+
divs = events.get("dividends", {})
|
|
92
|
+
if not divs:
|
|
93
|
+
return None
|
|
94
|
+
rows = [
|
|
95
|
+
{"Date": pd.Timestamp(int(ts), unit="s"), "Dividends": d["amount"]}
|
|
96
|
+
for ts, d in divs.items()
|
|
97
|
+
]
|
|
98
|
+
return pd.DataFrame(rows).set_index("Date").sort_index()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _parse_splits(events):
|
|
102
|
+
splits = events.get("splits", {})
|
|
103
|
+
if not splits:
|
|
104
|
+
return None
|
|
105
|
+
rows = [
|
|
106
|
+
{
|
|
107
|
+
"Date": pd.Timestamp(int(ts), unit="s"),
|
|
108
|
+
"Stock Splits": s["numerator"] / s["denominator"],
|
|
109
|
+
}
|
|
110
|
+
for ts, s in splits.items()
|
|
111
|
+
]
|
|
112
|
+
return pd.DataFrame(rows).set_index("Date").sort_index()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _parse_actions(events):
    """Combine dividends and splits into one date-indexed frame.

    Whichever of the two frames exist are joined column-wise on their date
    index and gaps are filled with 0.  Returns ``None`` when the events
    contain neither dividends nor splits.
    """
    frames = [
        frame
        for frame in (_parse_dividends(events), _parse_splits(events))
        if frame is not None
    ]
    if not frames:
        return None
    return pd.concat(frames, axis=1).fillna(0)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _parse_capital_gains(events):
|
|
125
|
+
cg = events.get("capitalGains", {})
|
|
126
|
+
if not cg:
|
|
127
|
+
return None
|
|
128
|
+
rows = [
|
|
129
|
+
{"Date": pd.Timestamp(int(ts), unit="s"), "Capital Gains": d["amount"]}
|
|
130
|
+
for ts, d in cg.items()
|
|
131
|
+
]
|
|
132
|
+
return pd.DataFrame(rows).set_index("Date").sort_index()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ---------------------------------------------------------------------------
|
|
136
|
+
# Parsers — quoteSummary → DataFrames
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
def _qs_list_df(qs, module, key):
|
|
140
|
+
"""Generic: extract a list from a quoteSummary module → DataFrame."""
|
|
141
|
+
items = qs.get(module, {}).get(key, [])
|
|
142
|
+
return pd.DataFrame(items) if items else None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _qs_flat_df(qs, module):
|
|
146
|
+
"""Generic: flat dict → single-column DataFrame."""
|
|
147
|
+
data = {k: _raw(v) for k, v in qs.get(module, {}).items() if k != "maxAge"}
|
|
148
|
+
if not data:
|
|
149
|
+
return None
|
|
150
|
+
return pd.DataFrame.from_dict(data, orient="index", columns=["Value"])
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _parse_upgrades_downgrades(qs):
|
|
154
|
+
items = qs.get("upgradeDowngradeHistory", {}).get("history", [])
|
|
155
|
+
if not items:
|
|
156
|
+
return None
|
|
157
|
+
df = pd.DataFrame(items)
|
|
158
|
+
if "epochGradeDate" in df.columns:
|
|
159
|
+
df["GradeDate"] = pd.to_datetime(df["epochGradeDate"], unit="s")
|
|
160
|
+
df = df.drop(columns=["epochGradeDate"]).set_index("GradeDate")
|
|
161
|
+
return df
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _parse_trend_sub(qs, sub_key):
|
|
165
|
+
"""Extract a sub-dict from each earningsTrend period."""
|
|
166
|
+
trend = qs.get("earningsTrend", {}).get("trend", [])
|
|
167
|
+
if not trend:
|
|
168
|
+
return None
|
|
169
|
+
rows = []
|
|
170
|
+
for item in trend[:4]:
|
|
171
|
+
sub = item.get(sub_key, {})
|
|
172
|
+
row = {"period": item.get("period")}
|
|
173
|
+
for k, v in sub.items():
|
|
174
|
+
if k != "maxAge":
|
|
175
|
+
row[k] = _raw(v)
|
|
176
|
+
rows.append(row)
|
|
177
|
+
df = pd.DataFrame(rows)
|
|
178
|
+
return df.set_index("period") if "period" in df.columns else df
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _parse_earnings_history(qs):
|
|
182
|
+
items = qs.get("earningsHistory", {}).get("history", [])
|
|
183
|
+
if not items:
|
|
184
|
+
return None
|
|
185
|
+
rows = []
|
|
186
|
+
for item in items:
|
|
187
|
+
row = {}
|
|
188
|
+
for k, v in item.items():
|
|
189
|
+
if k != "maxAge":
|
|
190
|
+
row[k] = _raw(v)
|
|
191
|
+
rows.append(row)
|
|
192
|
+
return pd.DataFrame(rows)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _parse_growth_estimates(qs):
|
|
196
|
+
trend = qs.get("earningsTrend", {}).get("trend", [])
|
|
197
|
+
if not trend:
|
|
198
|
+
return None
|
|
199
|
+
rows = []
|
|
200
|
+
for item in trend:
|
|
201
|
+
rows.append({
|
|
202
|
+
"period": item.get("period"),
|
|
203
|
+
"stockTrend": _raw(item.get("growth")),
|
|
204
|
+
})
|
|
205
|
+
df = pd.DataFrame(rows)
|
|
206
|
+
for module, col in [
|
|
207
|
+
("industryTrend", "industryTrend"),
|
|
208
|
+
("sectorTrend", "sectorTrend"),
|
|
209
|
+
("indexTrend", "indexTrend"),
|
|
210
|
+
]:
|
|
211
|
+
estimates = qs.get(module, {}).get("estimates", [])
|
|
212
|
+
gmap = {e.get("period"): _raw(e.get("growth")) for e in estimates}
|
|
213
|
+
df[col] = df["period"].map(gmap)
|
|
214
|
+
return df.set_index("period") if "period" in df.columns else df
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
# Parsers — quoteSummary → JSON
|
|
219
|
+
# ---------------------------------------------------------------------------
|
|
220
|
+
|
|
221
|
+
def _parse_analyst_price_targets(qs):
|
|
222
|
+
data = qs.get("financialData", {})
|
|
223
|
+
result = {}
|
|
224
|
+
for k, v in data.items():
|
|
225
|
+
if k.startswith("target") or k == "currentPrice":
|
|
226
|
+
clean = k.replace("target", "").lower() if k.startswith("target") else k
|
|
227
|
+
result[clean] = _raw(v)
|
|
228
|
+
return result or None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _parse_calendar(qs):
|
|
232
|
+
cal = qs.get("calendarEvents", {})
|
|
233
|
+
if not cal:
|
|
234
|
+
return None
|
|
235
|
+
result = {}
|
|
236
|
+
for k in ("dividendDate", "exDividendDate"):
|
|
237
|
+
v = cal.get(k)
|
|
238
|
+
if v is not None:
|
|
239
|
+
result[k] = _raw(v)
|
|
240
|
+
earnings = cal.get("earnings", {})
|
|
241
|
+
for k in (
|
|
242
|
+
"earningsDate", "earningsHigh", "earningsLow", "earningsAverage",
|
|
243
|
+
"revenueHigh", "revenueLow", "revenueAverage",
|
|
244
|
+
):
|
|
245
|
+
v = earnings.get(k)
|
|
246
|
+
if v is not None:
|
|
247
|
+
result[k] = [_raw(x) for x in v] if isinstance(v, list) else _raw(v)
|
|
248
|
+
return result or None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _parse_fast_info(qs, meta):
|
|
252
|
+
result = {}
|
|
253
|
+
for k in ("currency", "instrumentType", "exchangeName", "exchangeTimezoneName",
|
|
254
|
+
"regularMarketPrice"):
|
|
255
|
+
if k in meta:
|
|
256
|
+
result[k] = meta[k]
|
|
257
|
+
sd = qs.get("summaryDetail", {})
|
|
258
|
+
for k in ("previousClose", "fiftyTwoWeekHigh", "fiftyTwoWeekLow",
|
|
259
|
+
"fiftyDayAverage", "twoHundredDayAverage", "volume",
|
|
260
|
+
"averageVolume", "averageVolume10days", "marketCap"):
|
|
261
|
+
v = sd.get(k)
|
|
262
|
+
if v is not None:
|
|
263
|
+
result[k] = _raw(v)
|
|
264
|
+
return result or None
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _parse_funds_data(qs):
|
|
268
|
+
result = {}
|
|
269
|
+
profile = qs.get("summaryProfile", {})
|
|
270
|
+
if profile.get("longBusinessSummary"):
|
|
271
|
+
result["description"] = profile["longBusinessSummary"]
|
|
272
|
+
|
|
273
|
+
fp = qs.get("fundProfile", {})
|
|
274
|
+
if fp:
|
|
275
|
+
overview = {k: fp[k] for k in ("categoryName", "family", "legalType") if k in fp}
|
|
276
|
+
if overview:
|
|
277
|
+
result["fund_overview"] = overview
|
|
278
|
+
ops = fp.get("feesExpensesInvestment", {})
|
|
279
|
+
if ops:
|
|
280
|
+
result["fund_operations"] = ops
|
|
281
|
+
|
|
282
|
+
top = qs.get("topHoldings", {})
|
|
283
|
+
if top:
|
|
284
|
+
ac = {k: _raw(top[k]) for k in (
|
|
285
|
+
"cashPosition", "stockPosition", "bondPosition",
|
|
286
|
+
"preferredPosition", "convertiblePosition", "otherPosition",
|
|
287
|
+
) if k in top}
|
|
288
|
+
if ac:
|
|
289
|
+
result["asset_classes"] = ac
|
|
290
|
+
for k in ("holdings", "equityHoldings", "bondHoldings",
|
|
291
|
+
"bondRatings", "sectorWeightings"):
|
|
292
|
+
if k in top:
|
|
293
|
+
result[k] = top[k]
|
|
294
|
+
return result or None
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _build_info(qs, quote_data):
|
|
298
|
+
"""Assemble info dict from quoteSummary modules + v7 quote."""
|
|
299
|
+
info = {}
|
|
300
|
+
for module in ("financialData", "quoteType", "defaultKeyStatistics",
|
|
301
|
+
"assetProfile", "summaryDetail"):
|
|
302
|
+
for k, v in qs.get(module, {}).items():
|
|
303
|
+
if k == "maxAge":
|
|
304
|
+
continue
|
|
305
|
+
info[k] = _raw(v)
|
|
306
|
+
if quote_data:
|
|
307
|
+
for k, v in quote_data.items():
|
|
308
|
+
info[k] = _raw(v)
|
|
309
|
+
return info
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
# Per-symbol processing
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
def _process_symbol(yahoo: Yahoo, symbol: str, r2: R2) -> dict:
    """Fetch all data for one symbol and upload to R2.

    Every fetch, parse and upload step runs through ``_safe``, so a failing
    step is recorded and the remaining steps still run.

    Returns ``{"status": ..., "errors": [...]}``.  ``status`` is ``"failed"``
    only when the quoteSummary or chart request itself failed; any other
    recorded error leaves it ``"ok"``.
    """
    errors: list[str] = []

    def store_frame(name, build):
        # build() runs inside the guarded call so parser errors are recorded too.
        _safe(lambda: _put_df(build(), r2, f"{name}/{symbol}.parquet"), errors, name)

    def store_json(name, build, label=None):
        _safe(lambda: _put_json(build(), r2, f"{name}/{symbol}.json"), errors, label or name)

    # ---- Batch API calls ----
    qs = _safe(lambda: yahoo.quote_summary(symbol, QS_MODULES), errors, "quoteSummary") or {}
    chart_result = _safe(lambda: yahoo.chart(symbol, period="max"), errors, "chart")
    if chart_result:
        events, meta = chart_result[1], chart_result[2]
    else:
        events, meta = {}, {}

    # ---- Financial statements (timeseries) ----
    fetched: dict[tuple, pd.DataFrame | None] = {}
    for keys, prefix, r2_prefixes in TIMESERIES_MAP:
        cache_key = (id(keys), prefix)
        if cache_key not in fetched:
            fetched[cache_key] = _safe(
                lambda: yahoo.timeseries(symbol, keys, prefix),
                errors, f"timeseries-{prefix}",
            )
        statement = fetched[cache_key]
        # The same frame may be stored under several R2 prefixes.
        for name in r2_prefixes:
            store_frame(name, lambda: statement)

    # ---- Chart events, holders, analysis, earnings dates ----
    parquet_jobs = [
        ("dividends", lambda: _parse_dividends(events)),
        ("splits", lambda: _parse_splits(events)),
        ("actions", lambda: _parse_actions(events)),
        ("capital_gains", lambda: _parse_capital_gains(events)),
        ("major_holders", lambda: _qs_flat_df(qs, "majorHoldersBreakdown")),
        ("institutional_holders", lambda: _qs_list_df(qs, "institutionOwnership", "ownershipList")),
        ("mutualfund_holders", lambda: _qs_list_df(qs, "fundOwnership", "ownershipList")),
        ("insider_transactions", lambda: _qs_list_df(qs, "insiderTransactions", "transactions")),
        ("insider_purchases", lambda: _qs_flat_df(qs, "netSharePurchaseActivity")),
        ("insider_roster_holders", lambda: _qs_list_df(qs, "insiderHolders", "holders")),
        ("recommendations", lambda: _qs_list_df(qs, "recommendationTrend", "trend")),
        ("recommendations_summary", lambda: _qs_list_df(qs, "recommendationTrend", "trend")),
        ("upgrades_downgrades", lambda: _parse_upgrades_downgrades(qs)),
        ("earnings_estimate", lambda: _parse_trend_sub(qs, "earningsEstimate")),
        ("revenue_estimate", lambda: _parse_trend_sub(qs, "revenueEstimate")),
        ("earnings_history", lambda: _parse_earnings_history(qs)),
        ("eps_trend", lambda: _parse_trend_sub(qs, "epsTrend")),
        ("eps_revisions", lambda: _parse_trend_sub(qs, "epsRevisions")),
        ("growth_estimates", lambda: _parse_growth_estimates(qs)),
        ("sustainability", lambda: _qs_flat_df(qs, "esgScores")),
        # Earnings dates (separate HTML scrape)
        ("earnings", lambda: yahoo.earnings_dates(symbol)),
    ]
    for name, build in parquet_jobs:
        store_frame(name, build)

    # ---- JSON types ----
    json_jobs = [
        ("analyst_price_targets", lambda: _parse_analyst_price_targets(qs)),
        ("calendar", lambda: _parse_calendar(qs)),
        ("history_metadata", lambda: meta),
        ("news", lambda: yahoo.news(symbol)),
        ("sec_filings", lambda: qs.get("secFilings", {}).get("filings", []) or None),
    ]
    for name, build in json_jobs:
        store_json(name, build)

    # ---- Options ----
    store_frame("options", lambda: yahoo.options_chain(symbol))

    # ---- Special JSON ----
    store_json("fast_info", lambda: _parse_fast_info(qs, meta))
    isin_val = _safe(lambda: yahoo.isin(symbol), errors, "isin")
    if isin_val:
        store_json("isin", lambda: isin_val, label="isin_upload")
    store_json("funds_data", lambda: _parse_funds_data(qs))

    # ---- Info (last — serves as freshness marker) ----
    store_json("info", lambda: _build_info(qs, yahoo.quote(symbol)))

    fatal_prefixes = ("quoteSummary:", "chart:")
    status = "ok"
    for message in errors:
        if message.startswith(fatal_prefixes):
            status = "failed"
            break
    return {"status": status, "errors": errors}
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# ---------------------------------------------------------------------------
|
|
393
|
+
# Main loop
|
|
394
|
+
# ---------------------------------------------------------------------------
|
|
395
|
+
|
|
396
|
+
def fetch_fundamentals(
    symbols: list[str],
    r2: R2,
    workers: int = 20,
) -> dict[str, dict]:
    """Fetch all non-price data and upload to R2 concurrently.

    One ``_process_symbol`` task is submitted per symbol to a pool of
    *workers* threads, all sharing one ``Yahoo`` client.  Returns a mapping
    of symbol to its ``{"status", "errors"}`` result.  A task that raises is
    reported as failed with the exception text as its only error.
    """
    total = len(symbols)
    print(f"Processing {total} symbols with {workers} workers...")
    yahoo = Yahoo()
    report: dict[str, dict] = {}

    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = {}
        for sym in symbols:
            pending[pool.submit(_process_symbol, yahoo, sym, r2)] = sym

        for finished in as_completed(pending):
            sym = pending[finished]
            try:
                outcome = finished.result()
            except Exception as exc:
                outcome = {"status": "failed", "errors": [str(exc)]}
            report[sym] = outcome

            # Progress line every 500 completed symbols.
            if len(report) % 500 == 0:
                print(f" {len(report)}/{total} done...")

    return report
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def write_report(report: dict[str, dict]):
    """Write run report to reports/fundamentals/{date}.txt.

    *report* maps each symbol to ``{"status": ..., "errors": [...]}``.  The
    summary lists failed symbols with their first error and ``ok`` symbols
    that still recorded errors with all of them.  It is printed to stdout
    and written as UTF-8 relative to the current working directory.
    """
    from datetime import date
    from pathlib import Path

    # Evaluate the date once so header and file name cannot disagree when a
    # run crosses midnight.
    today = date.today()

    total = len(report)
    failed = {s: r for s, r in report.items() if r["status"] == "failed"}
    with_errors = {s: r for s, r in report.items() if r["errors"] and r["status"] == "ok"}

    lines = [
        f"FUNDAMENTALS REPORT — {today}",
        f"Symbols: {total}",
        f"OK: {total - len(failed)}, Failed: {len(failed)}, Partial errors: {len(with_errors)}",
    ]

    if failed:
        lines.append(f"\nFAILED ({len(failed)}):")
        for sym, r in sorted(failed.items()):
            # A failed entry should carry an error, but don't crash the
            # report if one does not.
            first_error = r["errors"][0] if r["errors"] else "unknown error"
            lines.append(f" {sym}: {first_error}")

    if with_errors:
        lines.append(f"\nPARTIAL ERRORS ({len(with_errors)}):")
        for sym, r in sorted(with_errors.items()):
            lines.append(f" {sym}: {', '.join(r['errors'])}")

    text = "\n".join(lines)
    print(text)

    out = Path("reports/fundamentals")
    out.mkdir(parents=True, exist_ok=True)
    # The header contains a non-ASCII dash; an explicit encoding keeps the
    # write independent of the platform's locale.
    (out / f"{today}.txt").write_text(text, encoding="utf-8")
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def main():
    """Run a full fundamentals refresh.

    Loads environment variables from ``.env``, fetches the symbol list,
    builds the R2 client from the environment, processes every symbol and
    writes the run report.
    """
    from dotenv import load_dotenv

    load_dotenv()

    universe = fetch_symbols()
    print(f"Found {len(universe)} symbols")

    write_report(fetch_fundamentals(universe, R2.from_env()))
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
if __name__ == "__main__":
|
|
471
|
+
main()
|
yfd/r2.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import boto3
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class R2:
    """Small wrapper around a boto3 S3 client bound to one bucket."""

    # Error codes / HTTP status treated as "object does not exist".
    _NOT_FOUND_CODES = ("404", "NoSuchKey", "NotFound")

    def __init__(self, endpoint: str, access_key: str, secret_key: str, bucket: str):
        """Create the S3 client for *endpoint* and remember *bucket*."""
        self.s3 = boto3.client(
            "s3",
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name="auto",
        )
        self.bucket = bucket

    @classmethod
    def from_env(cls) -> "R2":
        """Build an instance from the ``R2_*`` environment variables.

        Raises:
            KeyError: if any of ``R2_ENDPOINT``, ``R2_ACCESS_KEY_ID``,
                ``R2_SECRET_ACCESS_KEY`` or ``R2_BUCKET`` is not set.
        """
        return cls(
            endpoint=os.environ["R2_ENDPOINT"],
            access_key=os.environ["R2_ACCESS_KEY_ID"],
            secret_key=os.environ["R2_SECRET_ACCESS_KEY"],
            bucket=os.environ["R2_BUCKET"],
        )

    def download(self, key: str, local_path: Path):
        """Download object *key* to *local_path*, creating parent directories."""
        local_path.parent.mkdir(parents=True, exist_ok=True)
        self.s3.download_file(self.bucket, key, str(local_path))

    def upload(self, local_path: Path, key: str):
        """Upload the file at *local_path* as object *key*."""
        self.s3.upload_file(str(local_path), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return whether object *key* exists in the bucket.

        Only a "not found" response counts as ``False``.  Any other client
        error (permissions, throttling, server errors) is re-raised so it is
        not mistaken for a missing object.
        """
        try:
            self.s3.head_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as exc:
            response = getattr(exc, "response", None) or {}
            code = str(response.get("Error", {}).get("Code", ""))
            http_status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
            if code in self._NOT_FOUND_CODES or http_status == 404:
                return False
            raise
        return True
|