yfd 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yfd/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from yfd.store import DATA_TYPES, get, symbols, types
yfd/cli.py ADDED
@@ -0,0 +1,77 @@
1
+ """CLI entry point: ``yfd <data_type> <symbol> [--start] [--end] [--json]``."""
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+
7
+ import pandas as pd
8
+
9
+ from yfd.store import DATA_TYPES, get, symbols, types
10
+
11
+
12
+ def _format_df(df: pd.DataFrame, as_json: bool) -> str:
13
+ if as_json:
14
+ return df.to_json(orient="split", date_format="iso", indent=2)
15
+ return df.to_string()
16
+
17
+
18
+ def _format_value(val, as_json: bool) -> str:
19
+ if as_json:
20
+ return json.dumps(val, default=str, indent=2)
21
+ if isinstance(val, dict):
22
+ return "\n".join(f"{k}: {v}" for k, v in val.items())
23
+ if isinstance(val, list):
24
+ return json.dumps(val, default=str, indent=2)
25
+ return str(val)
26
+
27
+
28
def main():
    """Parse the command line and print the requested data.

    ``types`` and ``symbols`` are listing commands. Any other command is
    treated as a data type (dashes mapped to underscores) and needs a ticker
    symbol. The process exits with status 1 on an unknown data type, a
    missing symbol, or when ``get`` returns ``None``.
    """
    cli = argparse.ArgumentParser(
        prog="yfd",
        description="Yahoo Finance data — read from R2 storage",
    )
    cli.add_argument(
        "command",
        help="Data type (e.g. price, income-stmt, info) or: types, symbols",
    )
    cli.add_argument("symbol", nargs="?", help="Ticker symbol (e.g. AAPL)")
    cli.add_argument("--start", help="Start date filter (price only)")
    cli.add_argument("--end", help="End date filter (price only)")
    cli.add_argument("--json", action="store_true", dest="as_json", help="JSON output")
    opts = cli.parse_args()

    command = opts.command

    # Listing commands need no symbol and return early.
    if command == "types":
        for type_name, description in sorted(types().items()):
            print(f" {type_name:<25} {description}")
        return

    if command == "symbols":
        known = symbols()
        preview = ", ".join(known[:20])
        ellipsis = "..." if len(known) > 20 else ""
        print(f"{len(known)} symbols: {preview}{ellipsis}")
        return

    # CLI spelling uses dashes; DATA_TYPES is checked with underscores.
    data_type = command.replace("-", "_")
    if data_type not in DATA_TYPES:
        print(f"Unknown: {command}. Run 'yfd types' for valid types.", file=sys.stderr)
        sys.exit(1)

    if not opts.symbol:
        print(f"Usage: yfd {command} <SYMBOL>", file=sys.stderr)
        sys.exit(1)

    ticker = opts.symbol.upper()
    result = get(ticker, data_type, start=opts.start, end=opts.end)
    if result is None:
        print(f"No data for {ticker}/{data_type}", file=sys.stderr)
        sys.exit(1)

    render = _format_df if isinstance(result, pd.DataFrame) else _format_value
    print(render(result, opts.as_json))


if __name__ == "__main__":
    main()
yfd/fetcher.py ADDED
@@ -0,0 +1,62 @@
1
+ """Fetch full daily OHLCV history and save as per-symbol parquet files."""
2
+
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from pathlib import Path
6
+
7
+ from yfd.yahoo import Yahoo
8
+
9
+
10
+ def fetch_daily_history(
11
+ symbols: list[str],
12
+ output_dir: Path,
13
+ batch_size: int = 50,
14
+ pause: float = 1.0,
15
+ end: str | None = None,
16
+ workers: int = 10,
17
+ ) -> list[str]:
18
+ """Fetch full daily OHLCV history and save as per-symbol parquet files.
19
+
20
+ Returns list of symbols that failed.
21
+ """
22
+ output_dir.mkdir(parents=True, exist_ok=True)
23
+ existing = {p.stem for p in output_dir.glob("*.parquet")}
24
+ remaining = [s for s in symbols if s not in existing]
25
+ if existing:
26
+ print(f"Skipping {len(existing)} already fetched, {len(remaining)} remaining")
27
+
28
+ yahoo = Yahoo()
29
+ failed: list[str] = []
30
+ total_batches = -(-len(remaining) // batch_size)
31
+
32
+ for i in range(0, len(remaining), batch_size):
33
+ batch = remaining[i : i + batch_size]
34
+ batch_num = i // batch_size + 1
35
+ print(f"[{batch_num}/{total_batches}] Fetching {len(batch)} symbols...")
36
+
37
+ saved = 0
38
+
39
+ def _fetch(sym):
40
+ df, _, _ = yahoo.chart(sym, end=end, period="max")
41
+ return sym, df
42
+
43
+ with ThreadPoolExecutor(max_workers=workers) as pool:
44
+ futures = {pool.submit(_fetch, s): s for s in batch}
45
+ for fut in as_completed(futures):
46
+ sym = futures[fut]
47
+ try:
48
+ _, df = fut.result()
49
+ if df is not None and not df.empty:
50
+ df.to_parquet(output_dir / f"{sym}.parquet")
51
+ saved += 1
52
+ else:
53
+ failed.append(sym)
54
+ except Exception:
55
+ failed.append(sym)
56
+
57
+ print(f" Saved {saved}/{len(batch)} symbols")
58
+
59
+ if batch_num < total_batches:
60
+ time.sleep(pause)
61
+
62
+ return failed
yfd/fundamentals.py ADDED
@@ -0,0 +1,471 @@
1
+ """Fetch non-price data (financials, holders, analysis, etc.) and upload to R2.
2
+
3
+ Uses direct Yahoo Finance API calls instead of yfinance.
4
+ """
5
+
6
+ import io
7
+ import json
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ import pandas as pd
11
+
12
+ from yfd.r2 import R2
13
+ from yfd.symbols import fetch_symbols
14
+ from yfd.yahoo import (
15
+ BALANCE_SHEET_KEYS,
16
+ CASH_FLOW_KEYS,
17
+ INCOME_KEYS,
18
+ Yahoo,
19
+ )
20
+
21
# Every quoteSummary module requested for a symbol; the whole list is passed
# to ``Yahoo.quote_summary`` in one call.
QS_MODULES = [
    # Holders
    "majorHoldersBreakdown",
    "institutionOwnership",
    "fundOwnership",
    "insiderTransactions",
    "insiderHolders",
    "netSharePurchaseActivity",
    # Analyst recommendations
    "recommendationTrend",
    "upgradeDowngradeHistory",
    # Earnings
    "earningsTrend",
    "earningsHistory",
    # Growth comparisons
    "industryTrend",
    "sectorTrend",
    "indexTrend",
    # Misc
    "esgScores",
    "financialData",
    "calendarEvents",
    "secFilings",
    # Info
    "quoteType",
    "defaultKeyStatistics",
    "assetProfile",
    "summaryDetail",
    # Funds
    "summaryProfile",
    "topHoldings",
    "fundProfile",
]
32
+
33
# One entry per timeseries request:
#   (keys, timeseries prefix, R2 prefixes that each receive the same frame).
# Income-statement frames are uploaded under two R2 prefixes per period.
TIMESERIES_MAP = [
    # Income statement
    (INCOME_KEYS, "annual", ["financials", "income_stmt"]),
    (INCOME_KEYS, "quarterly", ["financials_quarterly", "income_stmt_quarterly"]),
    (INCOME_KEYS, "trailing", ["ttm_financials", "ttm_income_stmt"]),
    # Balance sheet
    (BALANCE_SHEET_KEYS, "annual", ["balance_sheet"]),
    (BALANCE_SHEET_KEYS, "quarterly", ["balance_sheet_quarterly"]),
    # Cash flow
    (CASH_FLOW_KEYS, "annual", ["cashflow"]),
    (CASH_FLOW_KEYS, "quarterly", ["cashflow_quarterly"]),
    (CASH_FLOW_KEYS, "trailing", ["ttm_cashflow"]),
]
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Helpers
48
+ # ---------------------------------------------------------------------------
49
+
50
+ def _raw(val):
51
+ """Unwrap Yahoo's ``{raw, fmt}`` value wrapper."""
52
+ if isinstance(val, dict) and "raw" in val:
53
+ return val["raw"]
54
+ return val
55
+
56
+
57
def _put_df(df: pd.DataFrame | pd.Series | None, r2: R2, key: str):
    """Serialize *df* to parquet in memory and upload it to R2 under *key*.

    ``None`` and empty frames/series are skipped (nothing is uploaded).
    A Series is converted to a one-column frame before serialization.
    """
    if df is None or getattr(df, "empty", False):
        return
    if isinstance(df, pd.Series):
        df = df.to_frame()
    buf = io.BytesIO()
    df.to_parquet(buf)
    # getvalue() returns the whole buffer regardless of the stream position,
    # replacing the previous ``buf.seek(0) or buf.read()`` trick that relied
    # on seek() returning a falsy 0.
    r2.s3.put_object(Bucket=r2.bucket, Key=key, Body=buf.getvalue())
65
+
66
+
67
def _put_json(data, r2: R2, key: str):
    """Upload *data* as a JSON document to R2 under *key*.

    Falsy *data* (``None``, empty dict/list, ...) is skipped. Values that
    ``json`` cannot serialize natively are converted with ``str``.
    """
    if data:
        r2.s3.put_object(
            Bucket=r2.bucket,
            Key=key,
            Body=json.dumps(data, default=str).encode(),
        )
75
+
76
+
77
+ def _safe(fn, errors: list, label: str):
78
+ """Call *fn*; on error, append to *errors* and return ``None``."""
79
+ try:
80
+ return fn()
81
+ except Exception as e:
82
+ errors.append(f"{label}: {e}")
83
+ return None
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Parsers — chart events
88
+ # ---------------------------------------------------------------------------
89
+
90
+ def _parse_dividends(events):
91
+ divs = events.get("dividends", {})
92
+ if not divs:
93
+ return None
94
+ rows = [
95
+ {"Date": pd.Timestamp(int(ts), unit="s"), "Dividends": d["amount"]}
96
+ for ts, d in divs.items()
97
+ ]
98
+ return pd.DataFrame(rows).set_index("Date").sort_index()
99
+
100
+
101
+ def _parse_splits(events):
102
+ splits = events.get("splits", {})
103
+ if not splits:
104
+ return None
105
+ rows = [
106
+ {
107
+ "Date": pd.Timestamp(int(ts), unit="s"),
108
+ "Stock Splits": s["numerator"] / s["denominator"],
109
+ }
110
+ for ts, s in splits.items()
111
+ ]
112
+ return pd.DataFrame(rows).set_index("Date").sort_index()
113
+
114
+
115
def _parse_actions(events):
    """Combine dividends and splits into one frame, side by side.

    Whichever of the two frames exist are concatenated column-wise and
    missing cells are filled with 0. Returns ``None`` when neither exists.
    """
    candidates = (_parse_dividends(events), _parse_splits(events))
    frames = [frame for frame in candidates if frame is not None]
    if not frames:
        return None
    return pd.concat(frames, axis=1).fillna(0)
122
+
123
+
124
+ def _parse_capital_gains(events):
125
+ cg = events.get("capitalGains", {})
126
+ if not cg:
127
+ return None
128
+ rows = [
129
+ {"Date": pd.Timestamp(int(ts), unit="s"), "Capital Gains": d["amount"]}
130
+ for ts, d in cg.items()
131
+ ]
132
+ return pd.DataFrame(rows).set_index("Date").sort_index()
133
+
134
+
135
+ # ---------------------------------------------------------------------------
136
+ # Parsers — quoteSummary → DataFrames
137
+ # ---------------------------------------------------------------------------
138
+
139
+ def _qs_list_df(qs, module, key):
140
+ """Generic: extract a list from a quoteSummary module → DataFrame."""
141
+ items = qs.get(module, {}).get(key, [])
142
+ return pd.DataFrame(items) if items else None
143
+
144
+
145
+ def _qs_flat_df(qs, module):
146
+ """Generic: flat dict → single-column DataFrame."""
147
+ data = {k: _raw(v) for k, v in qs.get(module, {}).items() if k != "maxAge"}
148
+ if not data:
149
+ return None
150
+ return pd.DataFrame.from_dict(data, orient="index", columns=["Value"])
151
+
152
+
153
+ def _parse_upgrades_downgrades(qs):
154
+ items = qs.get("upgradeDowngradeHistory", {}).get("history", [])
155
+ if not items:
156
+ return None
157
+ df = pd.DataFrame(items)
158
+ if "epochGradeDate" in df.columns:
159
+ df["GradeDate"] = pd.to_datetime(df["epochGradeDate"], unit="s")
160
+ df = df.drop(columns=["epochGradeDate"]).set_index("GradeDate")
161
+ return df
162
+
163
+
164
+ def _parse_trend_sub(qs, sub_key):
165
+ """Extract a sub-dict from each earningsTrend period."""
166
+ trend = qs.get("earningsTrend", {}).get("trend", [])
167
+ if not trend:
168
+ return None
169
+ rows = []
170
+ for item in trend[:4]:
171
+ sub = item.get(sub_key, {})
172
+ row = {"period": item.get("period")}
173
+ for k, v in sub.items():
174
+ if k != "maxAge":
175
+ row[k] = _raw(v)
176
+ rows.append(row)
177
+ df = pd.DataFrame(rows)
178
+ return df.set_index("period") if "period" in df.columns else df
179
+
180
+
181
+ def _parse_earnings_history(qs):
182
+ items = qs.get("earningsHistory", {}).get("history", [])
183
+ if not items:
184
+ return None
185
+ rows = []
186
+ for item in items:
187
+ row = {}
188
+ for k, v in item.items():
189
+ if k != "maxAge":
190
+ row[k] = _raw(v)
191
+ rows.append(row)
192
+ return pd.DataFrame(rows)
193
+
194
+
195
+ def _parse_growth_estimates(qs):
196
+ trend = qs.get("earningsTrend", {}).get("trend", [])
197
+ if not trend:
198
+ return None
199
+ rows = []
200
+ for item in trend:
201
+ rows.append({
202
+ "period": item.get("period"),
203
+ "stockTrend": _raw(item.get("growth")),
204
+ })
205
+ df = pd.DataFrame(rows)
206
+ for module, col in [
207
+ ("industryTrend", "industryTrend"),
208
+ ("sectorTrend", "sectorTrend"),
209
+ ("indexTrend", "indexTrend"),
210
+ ]:
211
+ estimates = qs.get(module, {}).get("estimates", [])
212
+ gmap = {e.get("period"): _raw(e.get("growth")) for e in estimates}
213
+ df[col] = df["period"].map(gmap)
214
+ return df.set_index("period") if "period" in df.columns else df
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Parsers — quoteSummary → JSON
219
+ # ---------------------------------------------------------------------------
220
+
221
+ def _parse_analyst_price_targets(qs):
222
+ data = qs.get("financialData", {})
223
+ result = {}
224
+ for k, v in data.items():
225
+ if k.startswith("target") or k == "currentPrice":
226
+ clean = k.replace("target", "").lower() if k.startswith("target") else k
227
+ result[clean] = _raw(v)
228
+ return result or None
229
+
230
+
231
+ def _parse_calendar(qs):
232
+ cal = qs.get("calendarEvents", {})
233
+ if not cal:
234
+ return None
235
+ result = {}
236
+ for k in ("dividendDate", "exDividendDate"):
237
+ v = cal.get(k)
238
+ if v is not None:
239
+ result[k] = _raw(v)
240
+ earnings = cal.get("earnings", {})
241
+ for k in (
242
+ "earningsDate", "earningsHigh", "earningsLow", "earningsAverage",
243
+ "revenueHigh", "revenueLow", "revenueAverage",
244
+ ):
245
+ v = earnings.get(k)
246
+ if v is not None:
247
+ result[k] = [_raw(x) for x in v] if isinstance(v, list) else _raw(v)
248
+ return result or None
249
+
250
+
251
+ def _parse_fast_info(qs, meta):
252
+ result = {}
253
+ for k in ("currency", "instrumentType", "exchangeName", "exchangeTimezoneName",
254
+ "regularMarketPrice"):
255
+ if k in meta:
256
+ result[k] = meta[k]
257
+ sd = qs.get("summaryDetail", {})
258
+ for k in ("previousClose", "fiftyTwoWeekHigh", "fiftyTwoWeekLow",
259
+ "fiftyDayAverage", "twoHundredDayAverage", "volume",
260
+ "averageVolume", "averageVolume10days", "marketCap"):
261
+ v = sd.get(k)
262
+ if v is not None:
263
+ result[k] = _raw(v)
264
+ return result or None
265
+
266
+
267
+ def _parse_funds_data(qs):
268
+ result = {}
269
+ profile = qs.get("summaryProfile", {})
270
+ if profile.get("longBusinessSummary"):
271
+ result["description"] = profile["longBusinessSummary"]
272
+
273
+ fp = qs.get("fundProfile", {})
274
+ if fp:
275
+ overview = {k: fp[k] for k in ("categoryName", "family", "legalType") if k in fp}
276
+ if overview:
277
+ result["fund_overview"] = overview
278
+ ops = fp.get("feesExpensesInvestment", {})
279
+ if ops:
280
+ result["fund_operations"] = ops
281
+
282
+ top = qs.get("topHoldings", {})
283
+ if top:
284
+ ac = {k: _raw(top[k]) for k in (
285
+ "cashPosition", "stockPosition", "bondPosition",
286
+ "preferredPosition", "convertiblePosition", "otherPosition",
287
+ ) if k in top}
288
+ if ac:
289
+ result["asset_classes"] = ac
290
+ for k in ("holdings", "equityHoldings", "bondHoldings",
291
+ "bondRatings", "sectorWeightings"):
292
+ if k in top:
293
+ result[k] = top[k]
294
+ return result or None
295
+
296
+
297
+ def _build_info(qs, quote_data):
298
+ """Assemble info dict from quoteSummary modules + v7 quote."""
299
+ info = {}
300
+ for module in ("financialData", "quoteType", "defaultKeyStatistics",
301
+ "assetProfile", "summaryDetail"):
302
+ for k, v in qs.get(module, {}).items():
303
+ if k == "maxAge":
304
+ continue
305
+ info[k] = _raw(v)
306
+ if quote_data:
307
+ for k, v in quote_data.items():
308
+ info[k] = _raw(v)
309
+ return info
310
+
311
+
312
+ # ---------------------------------------------------------------------------
313
+ # Per-symbol processing
314
+ # ---------------------------------------------------------------------------
315
+
316
def _process_symbol(yahoo: Yahoo, symbol: str, r2: R2) -> dict:
    """Fetch all data for one symbol and upload to R2.

    Every fetch/parse/upload step runs through ``_safe`` so one failing step
    never stops the others; failures are collected as ``"<label>: <error>"``
    strings.

    Returns:
        ``{"status": ..., "errors": [...]}`` where status is ``"failed"``
        only if the quoteSummary or chart call itself failed, else ``"ok"``.
    """
    errors: list[str] = []

    def frame(name, build):
        # Build a frame and upload it to "<name>/<symbol>.parquet"; any
        # failure (build or upload) is recorded under *name*.
        _safe(lambda: _put_df(build(), r2, f"{name}/{symbol}.parquet"), errors, name)

    def blob(name, build):
        # Same as frame(), for JSON documents at "<name>/<symbol>.json".
        _safe(lambda: _put_json(build(), r2, f"{name}/{symbol}.json"), errors, name)

    # ---- Batch API calls ----
    qs = _safe(lambda: yahoo.quote_summary(symbol, QS_MODULES), errors, "quoteSummary") or {}
    chart_result = _safe(lambda: yahoo.chart(symbol, period="max"), errors, "chart")
    if chart_result:
        events, meta = chart_result[1], chart_result[2]
    else:
        events, meta = {}, {}

    # ---- Financial statements (timeseries) ----
    ts_cache: dict[tuple, pd.DataFrame | None] = {}
    for keys, prefix, r2_prefixes in TIMESERIES_MAP:
        cache_key = (id(keys), prefix)
        if cache_key not in ts_cache:
            ts_cache[cache_key] = _safe(
                lambda: yahoo.timeseries(symbol, keys, prefix),
                errors, f"timeseries-{prefix}",
            )
        statement = ts_cache[cache_key]
        # The same frame is written under every listed R2 prefix.
        for target in r2_prefixes:
            frame(target, lambda: statement)

    # ---- Chart events ----
    frame("dividends", lambda: _parse_dividends(events))
    frame("splits", lambda: _parse_splits(events))
    frame("actions", lambda: _parse_actions(events))
    frame("capital_gains", lambda: _parse_capital_gains(events))

    # ---- Holders ----
    frame("major_holders", lambda: _qs_flat_df(qs, "majorHoldersBreakdown"))
    frame("institutional_holders", lambda: _qs_list_df(qs, "institutionOwnership", "ownershipList"))
    frame("mutualfund_holders", lambda: _qs_list_df(qs, "fundOwnership", "ownershipList"))
    frame("insider_transactions", lambda: _qs_list_df(qs, "insiderTransactions", "transactions"))
    frame("insider_purchases", lambda: _qs_flat_df(qs, "netSharePurchaseActivity"))
    frame("insider_roster_holders", lambda: _qs_list_df(qs, "insiderHolders", "holders"))

    # ---- Analysis ----
    # Both recommendation prefixes receive the same recommendationTrend data.
    frame("recommendations", lambda: _qs_list_df(qs, "recommendationTrend", "trend"))
    frame("recommendations_summary", lambda: _qs_list_df(qs, "recommendationTrend", "trend"))
    frame("upgrades_downgrades", lambda: _parse_upgrades_downgrades(qs))
    frame("earnings_estimate", lambda: _parse_trend_sub(qs, "earningsEstimate"))
    frame("revenue_estimate", lambda: _parse_trend_sub(qs, "revenueEstimate"))
    frame("earnings_history", lambda: _parse_earnings_history(qs))
    frame("eps_trend", lambda: _parse_trend_sub(qs, "epsTrend"))
    frame("eps_revisions", lambda: _parse_trend_sub(qs, "epsRevisions"))
    frame("growth_estimates", lambda: _parse_growth_estimates(qs))
    frame("sustainability", lambda: _qs_flat_df(qs, "esgScores"))

    # ---- Earnings dates (separate HTML scrape) ----
    frame("earnings", lambda: yahoo.earnings_dates(symbol))

    # ---- JSON types ----
    blob("analyst_price_targets", lambda: _parse_analyst_price_targets(qs))
    blob("calendar", lambda: _parse_calendar(qs))
    blob("history_metadata", lambda: meta)
    blob("news", lambda: yahoo.news(symbol))
    blob("sec_filings", lambda: qs.get("secFilings", {}).get("filings", []) or None)

    # ---- Options ----
    frame("options", lambda: yahoo.options_chain(symbol))

    # ---- Special JSON ----
    blob("fast_info", lambda: _parse_fast_info(qs, meta))
    # ISIN lookup and upload are recorded under separate labels.
    isin_val = _safe(lambda: yahoo.isin(symbol), errors, "isin")
    if isin_val:
        _safe(lambda: _put_json(isin_val, r2, f"isin/{symbol}.json"), errors, "isin_upload")
    blob("funds_data", lambda: _parse_funds_data(qs))

    # ---- Info (last — serves as freshness marker) ----
    blob("info", lambda: _build_info(qs, yahoo.quote(symbol)))

    core_failed = any(e.startswith(("quoteSummary:", "chart:")) for e in errors)
    return {"status": "failed" if core_failed else "ok", "errors": errors}
390
+
391
+
392
+ # ---------------------------------------------------------------------------
393
+ # Main loop
394
+ # ---------------------------------------------------------------------------
395
+
396
def fetch_fundamentals(
    symbols: list[str],
    r2: R2,
    workers: int = 20,
) -> dict[str, dict]:
    """Fetch all non-price data and upload to R2 concurrently.

    One ``_process_symbol`` task is submitted per symbol to a thread pool of
    *workers* threads, all sharing a single ``Yahoo`` instance.

    Returns:
        Mapping of symbol to its ``{"status", "errors"}`` result. A task that
        raises is reported as ``failed`` with the exception text.
    """
    print(f"Processing {len(symbols)} symbols with {workers} workers...")
    yahoo = Yahoo()
    report: dict[str, dict] = {}

    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = {}
        for sym in symbols:
            pending[pool.submit(_process_symbol, yahoo, sym, r2)] = sym

        for finished in as_completed(pending):
            sym = pending[finished]
            try:
                outcome = finished.result()
            except Exception as exc:
                outcome = {"status": "failed", "errors": [str(exc)]}
            report[sym] = outcome

            # Progress line every 500 completed symbols.
            completed = len(report)
            if completed % 500 == 0:
                print(f" {completed}/{len(symbols)} done...")

    return report
423
+
424
+
425
def write_report(report: dict[str, dict]):
    """Write run report to reports/fundamentals/{date}.txt.

    The report is also printed. It lists totals, then each failed symbol with
    its first error, then each symbol that succeeded with partial errors.

    Args:
        report: Mapping of symbol to ``{"status": ..., "errors": [...]}``.
    """
    from datetime import date
    from pathlib import Path

    # Capture the date once so the header and the file name always agree.
    today = date.today()

    total = len(report)
    failed = {s: r for s, r in report.items() if r["status"] == "failed"}
    with_errors = {s: r for s, r in report.items() if r["errors"] and r["status"] == "ok"}

    lines = [
        f"FUNDAMENTALS REPORT — {today}",
        f"Symbols: {total}",
        f"OK: {total - len(failed)}, Failed: {len(failed)}, Partial errors: {len(with_errors)}",
    ]

    if failed:
        lines.append(f"\nFAILED ({len(failed)}):")
        for sym, r in sorted(failed.items()):
            # Guard against a failed entry that carries no error text.
            first_error = r["errors"][0] if r["errors"] else "unknown error"
            lines.append(f" {sym}: {first_error}")

    if with_errors:
        lines.append(f"\nPARTIAL ERRORS ({len(with_errors)}):")
        for sym, r in sorted(with_errors.items()):
            lines.append(f" {sym}: {', '.join(r['errors'])}")

    text = "\n".join(lines)
    print(text)

    out = Path("reports/fundamentals")
    out.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII text (the em dash, arbitrary error
    # messages); pin UTF-8 instead of relying on the locale's encoding.
    (out / f"{today}.txt").write_text(text, encoding="utf-8")
455
+
456
+
457
def main():
    """Run the full fundamentals job.

    Loads environment variables via ``dotenv``, fetches the symbol list,
    processes every symbol against the R2 bucket configured in the
    environment, and writes the run report.
    """
    from dotenv import load_dotenv

    load_dotenv()

    universe = fetch_symbols()
    print(f"Found {len(universe)} symbols")

    storage = R2.from_env()
    write_report(fetch_fundamentals(universe, storage))


if __name__ == "__main__":
    main()
yfd/r2.py ADDED
@@ -0,0 +1,39 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import boto3
5
+
6
+
7
class R2:
    """Small wrapper around a boto3 S3 client bound to one bucket."""

    # Error codes that mean "object not found". HEAD responses carry no body,
    # so the code may be the bare HTTP status rather than "NoSuchKey".
    _MISSING_CODES = frozenset({"404", "NoSuchKey", "NotFound"})

    def __init__(self, endpoint: str, access_key: str, secret_key: str, bucket: str):
        """Create the S3 client for *endpoint* and remember *bucket*."""
        self.s3 = boto3.client(
            "s3",
            endpoint_url=endpoint,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name="auto",
        )
        self.bucket = bucket

    @classmethod
    def from_env(cls) -> "R2":
        """Build an instance from the ``R2_*`` environment variables.

        Raises:
            KeyError: If any of ``R2_ENDPOINT``, ``R2_ACCESS_KEY_ID``,
                ``R2_SECRET_ACCESS_KEY`` or ``R2_BUCKET`` is not set.
        """
        return cls(
            endpoint=os.environ["R2_ENDPOINT"],
            access_key=os.environ["R2_ACCESS_KEY_ID"],
            secret_key=os.environ["R2_SECRET_ACCESS_KEY"],
            bucket=os.environ["R2_BUCKET"],
        )

    def download(self, key: str, local_path: Path):
        """Download object *key* to *local_path*, creating parent directories."""
        local_path.parent.mkdir(parents=True, exist_ok=True)
        self.s3.download_file(self.bucket, key, str(local_path))

    def upload(self, local_path: Path, key: str):
        """Upload the file at *local_path* as object *key*."""
        self.s3.upload_file(str(local_path), self.bucket, key)

    def exists(self, key: str) -> bool:
        """Return whether object *key* exists in the bucket.

        Only a not-found response yields ``False``. Any other client error
        (access denied, throttling, server errors, ...) is re-raised so it is
        not mistaken for a missing object.
        """
        try:
            self.s3.head_object(Bucket=self.bucket, Key=key)
        except self.s3.exceptions.ClientError as exc:
            response = getattr(exc, "response", None) or {}
            code = str(response.get("Error", {}).get("Code", ""))
            http_status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
            if code in self._MISSING_CODES or http_status == 404:
                return False
            raise
        return True