sigma-terminal 2.0.1__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sigma/__init__.py +182 -6
- sigma/__main__.py +2 -2
- sigma/analytics/__init__.py +636 -0
- sigma/app.py +563 -898
- sigma/backtest.py +372 -0
- sigma/charts.py +407 -0
- sigma/cli.py +434 -0
- sigma/comparison.py +611 -0
- sigma/config.py +195 -0
- sigma/core/__init__.py +4 -17
- sigma/core/engine.py +493 -0
- sigma/core/intent.py +595 -0
- sigma/core/models.py +516 -125
- sigma/data/__init__.py +681 -0
- sigma/data/models.py +130 -0
- sigma/llm.py +401 -0
- sigma/monitoring.py +666 -0
- sigma/portfolio.py +697 -0
- sigma/reporting.py +658 -0
- sigma/robustness.py +675 -0
- sigma/setup.py +305 -402
- sigma/strategy.py +753 -0
- sigma/tools/backtest.py +23 -5
- sigma/tools.py +617 -0
- sigma/visualization.py +766 -0
- sigma_terminal-3.2.0.dist-info/METADATA +298 -0
- sigma_terminal-3.2.0.dist-info/RECORD +30 -0
- sigma_terminal-3.2.0.dist-info/entry_points.txt +6 -0
- sigma_terminal-3.2.0.dist-info/licenses/LICENSE +25 -0
- sigma/core/agent.py +0 -205
- sigma/core/config.py +0 -119
- sigma/core/llm.py +0 -794
- sigma/tools/__init__.py +0 -5
- sigma/tools/charts.py +0 -400
- sigma/tools/financial.py +0 -1457
- sigma/ui/__init__.py +0 -1
- sigma_terminal-2.0.1.dist-info/METADATA +0 -222
- sigma_terminal-2.0.1.dist-info/RECORD +0 -19
- sigma_terminal-2.0.1.dist-info/entry_points.txt +0 -2
- sigma_terminal-2.0.1.dist-info/licenses/LICENSE +0 -42
- {sigma_terminal-2.0.1.dist-info → sigma_terminal-3.2.0.dist-info}/WHEEL +0 -0
sigma/data/__init__.py
ADDED
@@ -0,0 +1,681 @@
"""Data acquisition layer with quality checks and lineage tracking."""

import asyncio
import hashlib
import json
import os
from datetime import datetime, date, timedelta
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import yfinance as yf
import requests

from .models import (
    DataSource,
    DataLineage,
    DataQualityReport,
    CorporateAction,
    PriceBar,
    Fundamental,
    AssetClass,
    detect_asset_class,
)

# ============================================================================
# DATA PROVIDER INTERFACE
# ============================================================================

class DataProvider:
    """Base class for data providers."""

    def __init__(self, name: str, api_key: Optional[str] = None):
        self.name = name
        self.api_key = api_key
        self.rate_limit = 5  # requests per second
        self.last_request = 0

    async def get_price_history(
        self,
        symbol: str,
        start: date,
        end: date,
        interval: str = "1d",
    ) -> pd.DataFrame:
        """Get price history for a symbol."""
        raise NotImplementedError

    async def get_fundamentals(self, symbol: str) -> Dict[str, Any]:
        """Get fundamental data for a symbol."""
        raise NotImplementedError

    async def get_corporate_actions(
        self,
        symbol: str,
        start: date,
        end: date,
    ) -> List[CorporateAction]:
        """Get corporate actions for a symbol."""
        raise NotImplementedError

class YFinanceProvider(DataProvider):
    """Yahoo Finance data provider."""

    def __init__(self):
        super().__init__("yfinance")

    async def get_price_history(
        self,
        symbol: str,
        start: date,
        end: date,
        interval: str = "1d",
    ) -> pd.DataFrame:
        """Get price history from Yahoo Finance."""
        ticker = yf.Ticker(symbol)

        # Run in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        df = await loop.run_in_executor(
            None,
            lambda: ticker.history(start=start, end=end, interval=interval)
        )

        return df

    async def get_fundamentals(self, symbol: str) -> Dict[str, Any]:
        """Get fundamental data from Yahoo Finance."""
        ticker = yf.Ticker(symbol)

        loop = asyncio.get_event_loop()
        info = await loop.run_in_executor(None, lambda: ticker.info)

        return info

    async def get_corporate_actions(
        self,
        symbol: str,
        start: date,
        end: date,
    ) -> List[CorporateAction]:
        """Get corporate actions from Yahoo Finance."""
        ticker = yf.Ticker(symbol)

        loop = asyncio.get_event_loop()

        # Get splits
        splits = await loop.run_in_executor(None, lambda: ticker.splits)

        # Get dividends
        dividends = await loop.run_in_executor(None, lambda: ticker.dividends)

        actions = []

        if not splits.empty:
            for dt, ratio in splits.items():
                try:
                    if hasattr(dt, 'date'):
                        action_date = dt.date()
                    elif hasattr(dt, 'to_pydatetime'):
                        action_date = dt.to_pydatetime().date()
                    else:
                        action_date = date.fromisoformat(str(dt)[:10])
                    if start <= action_date <= end:
                        actions.append(CorporateAction(
                            date=action_date,
                            symbol=symbol,
                            action_type="split",
                            ratio=float(ratio),
                            adjustment_factor=float(ratio),
                            details={"ratio": float(ratio)},
                        ))
                except (TypeError, AttributeError, ValueError):
                    continue

        if not dividends.empty:
            for dt, amount in dividends.items():
                try:
                    if hasattr(dt, 'date'):
                        action_date = dt.date()
                    elif hasattr(dt, 'to_pydatetime'):
                        action_date = dt.to_pydatetime().date()
                    else:
                        action_date = date.fromisoformat(str(dt)[:10])
                    if start <= action_date <= end:
                        actions.append(CorporateAction(
                            date=action_date,
                            symbol=symbol,
                            action_type="dividend",
                            amount=float(amount),
                            details={"amount": float(amount)},
                        ))
                except (TypeError, AttributeError, ValueError):
                    continue

        return actions

    async def get_financial_statements(self, symbol: str) -> Dict[str, pd.DataFrame]:
        """Get financial statements from Yahoo Finance."""
        ticker = yf.Ticker(symbol)

        loop = asyncio.get_event_loop()

        income = await loop.run_in_executor(None, lambda: ticker.income_stmt)
        balance = await loop.run_in_executor(None, lambda: ticker.balance_sheet)
        cashflow = await loop.run_in_executor(None, lambda: ticker.cashflow)

        return {
            "income_statement": income,
            "balance_sheet": balance,
            "cash_flow": cashflow,
        }

    async def get_options_data(self, symbol: str) -> Dict[str, Any]:
        """Get options data from Yahoo Finance."""
        ticker = yf.Ticker(symbol)

        loop = asyncio.get_event_loop()

        try:
            expirations = await loop.run_in_executor(None, lambda: ticker.options)

            if not expirations:
                return {"error": "No options data available"}

            # Get first expiration's chain
            chain = await loop.run_in_executor(
                None, lambda: ticker.option_chain(expirations[0])
            )

            return {
                "expirations": list(expirations),
                "calls": chain.calls.to_dict() if hasattr(chain, 'calls') else {},
                "puts": chain.puts.to_dict() if hasattr(chain, 'puts') else {},
            }
        except Exception as e:
            return {"error": str(e)}

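# ---------------------------------------------------------------------------
# Editor's sketch (not part of the released module): minimal use of the
# provider above. The ticker and dates are illustrative; run with
# `asyncio.run(_demo_yfinance_history())` from a script.
async def _demo_yfinance_history() -> None:
    provider = YFinanceProvider()
    df = await provider.get_price_history(
        "AAPL", start=date(2023, 1, 1), end=date(2023, 6, 30)
    )
    # yfinance returns an OHLCV frame indexed by timestamp.
    print(df[["Open", "Close"]].tail())
# ---------------------------------------------------------------------------
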
class FREDProvider(DataProvider):
    """FRED (Federal Reserve Economic Data) provider."""

    BASE_URL = "https://api.stlouisfed.org/fred"

    def __init__(self, api_key: Optional[str] = None):
        super().__init__("fred", api_key)

    async def get_series(
        self,
        series_id: str,
        start: date,
        end: date,
    ) -> pd.DataFrame:
        """Get FRED series data."""
        if not self.api_key:
            raise ValueError("FRED API key required")

        url = f"{self.BASE_URL}/series/observations"
        params = {
            "series_id": series_id,
            "api_key": self.api_key,
            "file_type": "json",
            "observation_start": start.isoformat(),
            "observation_end": end.isoformat(),
        }

        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(
            None, lambda: requests.get(url, params=params)
        )

        data = response.json()

        if "observations" not in data:
            return pd.DataFrame()

        df = pd.DataFrame(data["observations"])
        df["date"] = pd.to_datetime(df["date"])
        df["value"] = pd.to_numeric(df["value"], errors="coerce")
        df = df.set_index("date")

        return df[["value"]]

    # Common FRED series
    SERIES = {
        "treasury_10y": "DGS10",
        "treasury_2y": "DGS2",
        "fed_funds": "FEDFUNDS",
        "inflation_cpi": "CPIAUCSL",
        "unemployment": "UNRATE",
        "gdp": "GDP",
        "credit_spread": "BAMLC0A0CM",
        "vix": "VIXCLS",
    }

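# ---------------------------------------------------------------------------
# Editor's sketch (not part of the released module): fetching one macro series
# with the provider above. Assumes a FRED_API_KEY environment variable;
# without it, get_series raises ValueError.
async def _demo_fred_series() -> None:
    fred = FREDProvider(api_key=os.environ.get("FRED_API_KEY"))
    df = await fred.get_series(
        FREDProvider.SERIES["treasury_10y"], start=date(2022, 1, 1), end=date.today()
    )
    # The result has a single "value" column indexed by observation date.
    print(df.tail())
# ---------------------------------------------------------------------------
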
# ============================================================================
# DATA QUALITY ENGINE
# ============================================================================

class DataQualityEngine:
    """Perform data quality checks and cleaning."""

    @staticmethod
    def check_quality(df: pd.DataFrame, symbol: Optional[str] = None) -> DataQualityReport:
        """Run comprehensive quality checks on data."""
        total = len(df)

        # Missing data
        missing = df.isnull().sum().sum()
        missing_pct = (missing / (total * len(df.columns))) * 100 if total > 0 else 0

        # Stale ticks (close unchanged across consecutive bars)
        stale = 0
        if "Close" in df.columns and len(df) > 1:
            stale = (df["Close"].diff() == 0).sum()

        # Outliers (absolute daily return above 50%)
        outliers = 0
        if "Close" in df.columns and len(df) > 1:
            returns = df["Close"].pct_change().abs()
            outliers = (returns > 0.5).sum()

        # Date gaps
        gaps = []
        if isinstance(df.index, pd.DatetimeIndex):
            expected_dates = pd.date_range(df.index.min(), df.index.max(), freq="B")
            actual_dates = set(df.index.date)
            missing_dates = set(expected_dates.date) - actual_dates

            # Collapse missing business days into consecutive gap ranges
            if missing_dates:
                sorted_missing = sorted(missing_dates)
                gap_start = sorted_missing[0]
                gap_end = sorted_missing[0]

                for d in sorted_missing[1:]:
                    if (d - gap_end).days <= 3:  # Allow weekends
                        gap_end = d
                    else:
                        if gap_start != gap_end:
                            gaps.append((gap_start, gap_end))
                        gap_start = d
                        gap_end = d

                if gap_start != gap_end:
                    gaps.append((gap_start, gap_end))

        # Timezone issues
        tz_issues = 0
        if isinstance(df.index, pd.DatetimeIndex):
            if df.index.tz is None:
                tz_issues = 1  # Timezone-naive

        # Warnings
        warnings = []
        if missing_pct > 5:
            warnings.append(f"High missing data: {missing_pct:.1f}%")
        if stale > len(df) * 0.1:
            warnings.append(f"Many stale ticks: {stale}")
        if outliers > 0:
            warnings.append(f"Potential outliers detected: {outliers}")
        if len(gaps) > 0:
            warnings.append(f"Data gaps detected: {len(gaps)}")

        return DataQualityReport(
            total_records=total,
            missing_count=missing,
            missing_pct=missing_pct,
            stale_ticks=stale,
            outliers_detected=outliers,
            timezone_issues=tz_issues,
            date_range=(df.index.min(), df.index.max()) if len(df) > 0 else (None, None),
            gaps=gaps,
            warnings=warnings,
            passed=len(warnings) == 0,
        )

    @staticmethod
    def clean_price_data(df: pd.DataFrame) -> pd.DataFrame:
        """Clean and adjust price data."""
        df = df.copy()

        # Forward-fill missing OHLC values
        for col in ["Open", "High", "Low", "Close"]:
            if col in df.columns:
                df[col] = df[col].ffill()

        # Handle volume
        if "Volume" in df.columns:
            df["Volume"] = df["Volume"].fillna(0)

        # Remove obvious outliers (absolute daily return above 100%)
        if "Close" in df.columns and len(df) > 1:
            returns = df["Close"].pct_change().abs()
            outlier_mask = returns > 1.0
            if outlier_mask.any():
                df.loc[outlier_mask, "Close"] = np.nan
                df["Close"] = df["Close"].interpolate()

        return df

    @staticmethod
    def adjust_for_splits(df: pd.DataFrame, splits: List[CorporateAction]) -> pd.DataFrame:
        """Adjust historical prices for stock splits."""
        df = df.copy()

        for split in sorted(splits, key=lambda x: x.date, reverse=True):
            factor = split.adjustment_factor or split.ratio
            if factor:
                try:
                    # Scale prices down and volume up for all bars before the
                    # split date.
                    mask = pd.to_datetime(df.index).date < split.date
                    for col in ["Open", "High", "Low", "Close"]:
                        if col in df.columns:
                            df.loc[mask, col] = df.loc[mask, col] / factor
                    if "Volume" in df.columns:
                        df.loc[mask, "Volume"] = df.loc[mask, "Volume"] * factor
                except (TypeError, AttributeError):
                    continue

        return df

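# ---------------------------------------------------------------------------
# Editor's sketch (not part of the released module): the engine is stateless,
# so it can be exercised on any small OHLC frame. The data here is synthetic.
def _demo_quality_check() -> None:
    idx = pd.date_range("2024-01-01", periods=5, freq="B")
    frame = pd.DataFrame({"Close": [10.0, 10.0, 10.5, None, 11.0]}, index=idx)
    report = DataQualityEngine.check_quality(frame)
    # One missing value and one repeated close should both surface as warnings.
    print(report.warnings, report.passed)
# ---------------------------------------------------------------------------
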
# ============================================================================
# DATA MANAGER
# ============================================================================

class DataManager:
    """Central data manager with caching and lineage tracking."""

    def __init__(self, cache_dir: Optional[str] = None):
        self.cache_dir = cache_dir or os.path.expanduser("~/.sigma/data_cache")
        os.makedirs(self.cache_dir, exist_ok=True)

        self.providers = {
            DataSource.YFINANCE: YFinanceProvider(),
        }
        self.lineage_records = []
        self.quality_engine = DataQualityEngine()

    def add_provider(self, source: DataSource, provider: DataProvider):
        """Add a data provider."""
        self.providers[source] = provider

    async def get_price_data(
        self,
        symbols: Union[str, List[str]],
        start: Optional[date] = None,
        end: Optional[date] = None,
        period: str = "2y",
        source: DataSource = DataSource.YFINANCE,
        clean: bool = True,
        adjust_splits: bool = True,
    ) -> Dict[str, pd.DataFrame]:
        """Get price data for one or more symbols."""
        if isinstance(symbols, str):
            symbols = [symbols]

        # Fall back to the requested period only for the bounds that were not
        # provided explicitly, so a caller-supplied start is never overwritten.
        if end is None:
            end = date.today()
        if start is None:
            period_days = {
                "1d": 1, "5d": 5, "1mo": 30, "3mo": 90, "6mo": 180,
                "1y": 365, "2y": 730, "5y": 1825, "10y": 3650,
            }
            start = end - timedelta(days=period_days.get(period, 730))

        provider = self.providers.get(source)
        if not provider:
            raise ValueError(f"Provider not available: {source}")

        results = {}

        for symbol in symbols:
            # Track transformations per symbol so each lineage record only
            # reflects its own symbol's processing steps.
            transformations = []

            # Check cache
            cache_key = self._cache_key(symbol, start, end, source)
            cached = self._load_cache(cache_key)

            if cached is not None:
                df = cached
                transformations.append("loaded_from_cache")
            else:
                # Fetch from provider
                df = await provider.get_price_history(symbol, start, end)

                if df.empty:
                    continue

                # Cache the raw data
                self._save_cache(cache_key, df)
                transformations.append("fetched_fresh")

            # Clean data
            if clean:
                df = self.quality_engine.clean_price_data(df)
                transformations.append("cleaned")

            # Adjust for splits
            if adjust_splits and source == DataSource.YFINANCE:
                splits = await provider.get_corporate_actions(symbol, start, end)
                split_actions = [a for a in splits if a.action_type == "split"]
                if split_actions:
                    df = self.quality_engine.adjust_for_splits(df, split_actions)
                    transformations.append("split_adjusted")

            # Quality report
            quality = self.quality_engine.check_quality(df, symbol)

            # Record lineage
            lineage = DataLineage(
                source=source,
                fetch_timestamp=datetime.now(),
                symbols=[symbol],
                date_range=(start, end),
                transformations=transformations,
                quality_report=quality,
            )
            self.lineage_records.append(lineage)

            results[symbol] = df

        return results

    async def get_fundamentals(
        self,
        symbol: str,
        source: DataSource = DataSource.YFINANCE,
        point_in_time: Optional[date] = None,
    ) -> Dict[str, Any]:
        """Get fundamental data with point-in-time awareness."""
        provider = self.providers.get(source)
        if not provider:
            raise ValueError(f"Provider not available: {source}")

        data = await provider.get_fundamentals(symbol)

        # Add point-in-time metadata
        data["_as_of_date"] = point_in_time or date.today()
        data["_fetch_timestamp"] = datetime.now().isoformat()
        data["_source"] = source.value

        return data

    async def get_macro_data(
        self,
        series: List[str],
        start: Optional[date] = None,
        end: Optional[date] = None,
    ) -> Dict[str, pd.DataFrame]:
        """Get macroeconomic data."""
        fred = self.providers.get(DataSource.FRED)
        if not fred:
            # Return empty if FRED not configured
            return {}

        end = end or date.today()
        start = start or (end - timedelta(days=365 * 5))

        results = {}
        for series_name in series:
            series_id = FREDProvider.SERIES.get(series_name, series_name)
            try:
                df = await fred.get_series(series_id, start, end)
                results[series_name] = df
            except Exception:
                continue

        return results

    def get_lineage(self, symbol: Optional[str] = None) -> List[DataLineage]:
        """Get data lineage records."""
        if symbol:
            return [l for l in self.lineage_records if symbol in l.symbols]
        return self.lineage_records

    def _cache_key(self, symbol: str, start: date, end: date, source: DataSource) -> str:
        """Generate a cache key, prefixed with the symbol so that per-symbol
        cache clearing can match filenames."""
        digest = hashlib.md5(f"{symbol}_{start}_{end}_{source.value}".encode()).hexdigest()
        return f"{symbol}_{digest}"

    def _load_cache(self, key: str) -> Optional[pd.DataFrame]:
        """Load from cache."""
        cache_path = os.path.join(self.cache_dir, f"{key}.pkl")
        if os.path.exists(cache_path):
            # Check if cache is fresh (less than 1 day old)
            mtime = os.path.getmtime(cache_path)
            if datetime.now().timestamp() - mtime < 86400:  # 24 hours
                return pd.read_pickle(cache_path)
        return None

    def _save_cache(self, key: str, df: pd.DataFrame):
        """Save to cache."""
        cache_path = os.path.join(self.cache_dir, f"{key}.pkl")
        df.to_pickle(cache_path)

    def clear_cache(self, symbol: Optional[str] = None):
        """Clear cache."""
        if symbol:
            # Clear specific symbol (cache keys are prefixed with the symbol)
            for f in os.listdir(self.cache_dir):
                if f.startswith(f"{symbol}_"):
                    os.remove(os.path.join(self.cache_dir, f))
        else:
            # Clear all
            for f in os.listdir(self.cache_dir):
                os.remove(os.path.join(self.cache_dir, f))

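# ---------------------------------------------------------------------------
# Editor's sketch (not part of the released module): end-to-end use of the
# manager — cached, cleaned, split-adjusted prices plus lineage inspection.
# Symbols are illustrative; this performs live network calls.
async def _demo_data_manager() -> None:
    manager = DataManager()
    prices = await manager.get_price_data(["AAPL", "MSFT"], period="1y")
    for symbol, frame in prices.items():
        print(symbol, len(frame))
    print(manager.get_lineage("AAPL"))
# ---------------------------------------------------------------------------
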
# ============================================================================
# SURVIVORSHIP BIAS HANDLING
# ============================================================================

class DelistedTracker:
    """Track delisted securities for survivorship-bias-free analysis."""

    # Known delisted tickers (simplified - would need a database in production)
    KNOWN_DELISTED = {
        "LVLT": {"delisted": "2017-11-01", "reason": "acquired", "successor": "CTL"},
        "TWX": {"delisted": "2018-06-15", "reason": "acquired", "successor": "T"},
        "YHOO": {"delisted": "2017-06-13", "reason": "acquired", "successor": "VZ"},
        "MON": {"delisted": "2018-06-07", "reason": "acquired", "successor": "BAYER.DE"},
    }

    @classmethod
    def is_delisted(cls, symbol: str) -> bool:
        """Check if a symbol is delisted."""
        return symbol.upper() in cls.KNOWN_DELISTED

    @classmethod
    def get_delisting_info(cls, symbol: str) -> Optional[Dict[str, Any]]:
        """Get delisting information."""
        return cls.KNOWN_DELISTED.get(symbol.upper())

    @classmethod
    def warn_survivorship_bias(cls, symbols: List[str], start_date: date) -> List[str]:
        """Warn about potential survivorship bias."""
        warnings = []

        for symbol in symbols:
            info = cls.get_delisting_info(symbol)
            if info:
                delisted = date.fromisoformat(info["delisted"])
                if start_date < delisted:
                    warnings.append(
                        f"{symbol} was delisted on {info['delisted']} "
                        f"({info['reason']}). Consider including it in the analysis for the period before delisting."
                    )

        # General warning if analyzing only current constituents
        if len(symbols) > 10 and not any(cls.is_delisted(s) for s in symbols):
            warnings.append(
                "Warning: Analyzing only current constituents may introduce survivorship bias. "
                "Consider including delisted securities for historical analysis."
            )

        return warnings

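# ---------------------------------------------------------------------------
# Editor's sketch (not part of the released module): a backtest universe that
# starts before a known delisting triggers a per-symbol warning.
def _demo_survivorship_warnings() -> None:
    for note in DelistedTracker.warn_survivorship_bias(
        ["AAPL", "YHOO"], start_date=date(2015, 1, 1)
    ):
        print(note)
# ---------------------------------------------------------------------------
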
# ============================================================================
# CALENDAR DATA
# ============================================================================

class FinancialCalendar:
    """Financial calendar for events."""

    @staticmethod
    async def get_earnings_calendar(
        symbols: List[str],
        start: date,
        end: date,
    ) -> List[Dict[str, Any]]:
        """Get earnings calendar entries falling between start and end."""
        events = []

        for symbol in symbols:
            ticker = yf.Ticker(symbol)

            loop = asyncio.get_event_loop()
            calendar = await loop.run_in_executor(None, lambda: ticker.calendar)

            if calendar is not None and not calendar.empty:
                for col in calendar.columns:
                    event_date = calendar[col].get("Earnings Date")
                    if event_date is None:
                        continue
                    # Normalize to a date so the range filter works for both
                    # Timestamp and datetime values.
                    try:
                        event_day = event_date.date() if hasattr(event_date, "date") else event_date
                    except (TypeError, AttributeError):
                        continue
                    if start <= event_day <= end:
                        events.append({
                            "symbol": symbol,
                            "event_type": "earnings",
                            "date": event_date,
                            "details": calendar[col].to_dict(),
                        })

        return events

    @staticmethod
    async def get_dividend_calendar(
        symbols: List[str],
        lookback_days: int = 90,
    ) -> List[Dict[str, Any]]:
        """Get upcoming dividends."""
        events = []

        for symbol in symbols:
            ticker = yf.Ticker(symbol)

            loop = asyncio.get_event_loop()
            info = await loop.run_in_executor(None, lambda: ticker.info)

            if info.get("dividendDate"):
                events.append({
                    "symbol": symbol,
                    "event_type": "dividend",
                    "date": datetime.fromtimestamp(info["dividendDate"]).date(),
                    "amount": info.get("dividendRate"),
                    "yield": info.get("dividendYield"),
                })

        return events
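
The calendar helpers are coroutine-based like the rest of the module. A minimal sketch of pulling events for a small watchlist, assuming the wheel is installed so the module is importable as sigma.data (the symbols and the 90-day window are illustrative):

    import asyncio
    from datetime import date, timedelta

    from sigma.data import FinancialCalendar

    async def main() -> None:
        today = date.today()
        earnings = await FinancialCalendar.get_earnings_calendar(
            ["AAPL", "MSFT"], start=today, end=today + timedelta(days=90)
        )
        dividends = await FinancialCalendar.get_dividend_calendar(["AAPL", "MSFT"])
        for event in earnings + dividends:
            print(event["symbol"], event["event_type"], event["date"])

    asyncio.run(main())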