mcli-framework 7.1.3__py3-none-any.whl → 7.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic. Click here for more details.
- mcli/app/main.py +10 -0
- mcli/lib/custom_commands.py +424 -0
- mcli/lib/paths.py +12 -0
- mcli/ml/dashboard/app.py +13 -13
- mcli/ml/dashboard/app_integrated.py +1292 -148
- mcli/ml/dashboard/app_supabase.py +46 -21
- mcli/ml/dashboard/app_training.py +14 -14
- mcli/ml/dashboard/components/charts.py +258 -0
- mcli/ml/dashboard/components/metrics.py +125 -0
- mcli/ml/dashboard/components/tables.py +228 -0
- mcli/ml/dashboard/pages/cicd.py +382 -0
- mcli/ml/dashboard/pages/predictions_enhanced.py +820 -0
- mcli/ml/dashboard/pages/scrapers_and_logs.py +1060 -0
- mcli/ml/dashboard/pages/workflows.py +533 -0
- mcli/ml/training/train_model.py +569 -0
- mcli/self/self_cmd.py +322 -94
- mcli/workflow/politician_trading/data_sources.py +259 -1
- mcli/workflow/politician_trading/models.py +159 -1
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +846 -0
- mcli/workflow/politician_trading/scrapers_free_sources.py +516 -0
- mcli/workflow/politician_trading/scrapers_third_party.py +391 -0
- mcli/workflow/politician_trading/seed_database.py +539 -0
- mcli/workflow/workflow.py +8 -27
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/METADATA +1 -1
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/RECORD +29 -25
- mcli/workflow/daemon/api_daemon.py +0 -800
- mcli/workflow/daemon/commands.py +0 -1196
- mcli/workflow/dashboard/dashboard_cmd.py +0 -120
- mcli/workflow/file/file.py +0 -100
- mcli/workflow/git_commit/commands.py +0 -430
- mcli/workflow/politician_trading/commands.py +0 -1939
- mcli/workflow/scheduler/commands.py +0 -493
- mcli/workflow/sync/sync_cmd.py +0 -437
- mcli/workflow/videos/videos.py +0 -242
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/top_level.txt +0 -0
|
@@ -2,13 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import json
|
|
5
|
+
import logging
|
|
5
6
|
import os
|
|
6
7
|
import pickle
|
|
7
8
|
import subprocess
|
|
8
9
|
from datetime import datetime, timedelta
|
|
9
10
|
from pathlib import Path
|
|
11
|
+
from typing import List
|
|
10
12
|
|
|
11
13
|
import numpy as np
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
12
16
|
import pandas as pd
|
|
13
17
|
import plotly.express as px
|
|
14
18
|
import plotly.graph_objects as go
|
|
@@ -41,6 +45,23 @@ except ImportError:
|
|
|
41
45
|
HAS_PREDICTOR = False
|
|
42
46
|
PoliticianTradingPredictor = None
|
|
43
47
|
|
|
48
|
+
# Add new dashboard pages
|
|
49
|
+
try:
|
|
50
|
+
from pages.cicd import show_cicd_dashboard
|
|
51
|
+
from pages.workflows import show_workflows_dashboard
|
|
52
|
+
from pages.predictions_enhanced import show_predictions_enhanced
|
|
53
|
+
from pages.scrapers_and_logs import show_scrapers_and_logs
|
|
54
|
+
|
|
55
|
+
HAS_EXTENDED_PAGES = True
|
|
56
|
+
HAS_SCRAPERS_PAGE = True
|
|
57
|
+
except ImportError:
|
|
58
|
+
HAS_EXTENDED_PAGES = False
|
|
59
|
+
HAS_SCRAPERS_PAGE = False
|
|
60
|
+
show_cicd_dashboard = None
|
|
61
|
+
show_workflows_dashboard = None
|
|
62
|
+
show_predictions_enhanced = None
|
|
63
|
+
show_scrapers_and_logs = None
|
|
64
|
+
|
|
44
65
|
# Page config
|
|
45
66
|
st.set_page_config(
|
|
46
67
|
page_title="MCLI ML Dashboard - Integrated",
|
|
@@ -81,17 +102,319 @@ st.markdown(
|
|
|
81
102
|
|
|
82
103
|
@st.cache_resource
|
|
83
104
|
def get_supabase_client() -> Client:
|
|
84
|
-
"""Get Supabase client"""
|
|
85
|
-
|
|
86
|
-
|
|
105
|
+
"""Get Supabase client with Streamlit Cloud secrets support"""
|
|
106
|
+
# Try Streamlit secrets first (for Streamlit Cloud), then fall back to environment variables (for local dev)
|
|
107
|
+
try:
|
|
108
|
+
url = st.secrets.get("SUPABASE_URL", "")
|
|
109
|
+
key = st.secrets.get("SUPABASE_KEY", "") or st.secrets.get("SUPABASE_SERVICE_ROLE_KEY", "")
|
|
110
|
+
except (AttributeError, FileNotFoundError):
|
|
111
|
+
# Secrets not available, try environment variables
|
|
112
|
+
url = os.getenv("SUPABASE_URL", "")
|
|
113
|
+
key = os.getenv("SUPABASE_KEY", "") or os.getenv("SUPABASE_SERVICE_ROLE_KEY", "")
|
|
87
114
|
|
|
88
115
|
if not url or not key:
|
|
89
|
-
st.
|
|
90
|
-
"
|
|
116
|
+
st.error(
|
|
117
|
+
"❌ Supabase credentials not configured"
|
|
91
118
|
)
|
|
119
|
+
with st.expander("🔧 Configuration Required"):
|
|
120
|
+
st.markdown("""
|
|
121
|
+
**Missing Supabase credentials:**
|
|
122
|
+
- `SUPABASE_URL`: {}
|
|
123
|
+
- `SUPABASE_KEY`: {}
|
|
124
|
+
|
|
125
|
+
**For Streamlit Cloud:**
|
|
126
|
+
1. Go to https://share.streamlit.io
|
|
127
|
+
2. Select your app → Settings → Secrets
|
|
128
|
+
3. Add:
|
|
129
|
+
```toml
|
|
130
|
+
SUPABASE_URL = "https://your-project.supabase.co"
|
|
131
|
+
SUPABASE_KEY = "your-anon-key"
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**For local development:**
|
|
135
|
+
1. Create `.streamlit/secrets.toml` file
|
|
136
|
+
2. Add the same credentials as above
|
|
137
|
+
3. Restart the dashboard
|
|
138
|
+
|
|
139
|
+
**Using demo data** until configured.
|
|
140
|
+
""".format(
|
|
141
|
+
"✅ Set" if url else "❌ Missing",
|
|
142
|
+
"✅ Set" if key else "❌ Missing"
|
|
143
|
+
))
|
|
92
144
|
return None
|
|
93
145
|
|
|
94
|
-
|
|
146
|
+
try:
|
|
147
|
+
client = create_client(url, key)
|
|
148
|
+
# Test connection with a simple query
|
|
149
|
+
try:
|
|
150
|
+
test_result = client.table("politicians").select("id").limit(1).execute()
|
|
151
|
+
logger.info(f"✅ Supabase connection successful (URL: {url[:30]}...)")
|
|
152
|
+
return client
|
|
153
|
+
except Exception as conn_error:
|
|
154
|
+
st.error(f"❌ Supabase connection failed: {conn_error}")
|
|
155
|
+
with st.expander("🔍 Connection Details"):
|
|
156
|
+
st.write(f"**URL:** {url[:30]}...")
|
|
157
|
+
st.write(f"**Error:** {str(conn_error)}")
|
|
158
|
+
st.write("**Using demo data** until connection is restored.")
|
|
159
|
+
logger.error(f"Supabase connection test failed: {conn_error}")
|
|
160
|
+
return None
|
|
161
|
+
except Exception as e:
|
|
162
|
+
st.error(f"❌ Failed to create Supabase client: {e}")
|
|
163
|
+
logger.error(f"Failed to create Supabase client: {e}")
|
|
164
|
+
return None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@st.cache_data(ttl=300) # Cache for 5 minutes
|
|
168
|
+
def get_politician_names() -> List[str]:
|
|
169
|
+
"""Get all politician names from database for searchable dropdown"""
|
|
170
|
+
try:
|
|
171
|
+
client = get_supabase_client()
|
|
172
|
+
if not client:
|
|
173
|
+
return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"] # Fallback
|
|
174
|
+
|
|
175
|
+
result = client.table("politicians").select("first_name, last_name").execute()
|
|
176
|
+
|
|
177
|
+
if result.data:
|
|
178
|
+
# Create full names and sort them
|
|
179
|
+
names = [f"{p['first_name']} {p['last_name']}" for p in result.data]
|
|
180
|
+
return sorted(set(names)) # Remove duplicates and sort
|
|
181
|
+
else:
|
|
182
|
+
return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"] # Fallback
|
|
183
|
+
except Exception as e:
|
|
184
|
+
logger.warning(f"Failed to fetch politician names: {e}")
|
|
185
|
+
return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"] # Fallback
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def load_latest_model():
|
|
189
|
+
"""Load the latest trained model from /models directory"""
|
|
190
|
+
try:
|
|
191
|
+
model_dir = Path("models")
|
|
192
|
+
if not model_dir.exists():
|
|
193
|
+
return None, None
|
|
194
|
+
|
|
195
|
+
# Get all model metadata files
|
|
196
|
+
json_files = sorted(model_dir.glob("*.json"), reverse=True)
|
|
197
|
+
if not json_files:
|
|
198
|
+
return None, None
|
|
199
|
+
|
|
200
|
+
# Load latest model metadata
|
|
201
|
+
latest_json = json_files[0]
|
|
202
|
+
with open(latest_json, "r") as f:
|
|
203
|
+
metadata = json.load(f)
|
|
204
|
+
|
|
205
|
+
# Model file path
|
|
206
|
+
model_file = latest_json.with_suffix(".pt")
|
|
207
|
+
|
|
208
|
+
return model_file, metadata
|
|
209
|
+
except Exception as e:
|
|
210
|
+
logger.error(f"Failed to load model: {e}")
|
|
211
|
+
return None, None
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def engineer_features(
|
|
215
|
+
ticker: str,
|
|
216
|
+
politician_name: str,
|
|
217
|
+
transaction_type: str,
|
|
218
|
+
amount: float,
|
|
219
|
+
filing_date,
|
|
220
|
+
market_cap: str,
|
|
221
|
+
sector: str,
|
|
222
|
+
sentiment: float,
|
|
223
|
+
volatility: float,
|
|
224
|
+
trading_history: pd.DataFrame,
|
|
225
|
+
) -> dict:
|
|
226
|
+
"""
|
|
227
|
+
Engineer features from input data for model prediction.
|
|
228
|
+
|
|
229
|
+
This transforms raw input into features the model expects:
|
|
230
|
+
- Politician historical success rate
|
|
231
|
+
- Sector encoding
|
|
232
|
+
- Transaction size normalization
|
|
233
|
+
- Market timing indicators
|
|
234
|
+
- Sentiment and volatility scores
|
|
235
|
+
"""
|
|
236
|
+
features = {}
|
|
237
|
+
|
|
238
|
+
# 1. Politician historical performance
|
|
239
|
+
if not trading_history.empty:
|
|
240
|
+
# Calculate historical metrics
|
|
241
|
+
total_trades = len(trading_history)
|
|
242
|
+
purchase_ratio = (
|
|
243
|
+
len(trading_history[trading_history.get("transaction_type") == "Purchase"])
|
|
244
|
+
/ total_trades
|
|
245
|
+
if total_trades > 0
|
|
246
|
+
else 0.5
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
# Unique stocks traded (diversity)
|
|
250
|
+
unique_stocks = (
|
|
251
|
+
trading_history["ticker_symbol"].nunique()
|
|
252
|
+
if "ticker_symbol" in trading_history.columns
|
|
253
|
+
else 1
|
|
254
|
+
)
|
|
255
|
+
diversity_score = min(unique_stocks / 50, 1.0) # Normalize to 0-1
|
|
256
|
+
|
|
257
|
+
features["politician_trade_count"] = min(total_trades / 100, 1.0)
|
|
258
|
+
features["politician_purchase_ratio"] = purchase_ratio
|
|
259
|
+
features["politician_diversity"] = diversity_score
|
|
260
|
+
else:
|
|
261
|
+
# No history - use neutral values
|
|
262
|
+
features["politician_trade_count"] = 0.0
|
|
263
|
+
features["politician_purchase_ratio"] = 0.5
|
|
264
|
+
features["politician_diversity"] = 0.0
|
|
265
|
+
|
|
266
|
+
# 2. Transaction characteristics
|
|
267
|
+
features["transaction_is_purchase"] = 1.0 if transaction_type == "Purchase" else 0.0
|
|
268
|
+
features["transaction_amount_log"] = np.log10(max(amount, 1)) # Log scale
|
|
269
|
+
features["transaction_amount_normalized"] = min(amount / 1000000, 1.0) # Normalize to 0-1
|
|
270
|
+
|
|
271
|
+
# 3. Market cap encoding
|
|
272
|
+
market_cap_encoding = {"Large Cap": 0.9, "Mid Cap": 0.5, "Small Cap": 0.1}
|
|
273
|
+
features["market_cap_score"] = market_cap_encoding.get(market_cap, 0.5)
|
|
274
|
+
|
|
275
|
+
# 4. Sector encoding
|
|
276
|
+
sector_risk = {
|
|
277
|
+
"Technology": 0.7,
|
|
278
|
+
"Healthcare": 0.5,
|
|
279
|
+
"Finance": 0.6,
|
|
280
|
+
"Energy": 0.8,
|
|
281
|
+
"Consumer": 0.4,
|
|
282
|
+
}
|
|
283
|
+
features["sector_risk"] = sector_risk.get(sector, 0.5)
|
|
284
|
+
|
|
285
|
+
# 5. Sentiment and volatility (already normalized)
|
|
286
|
+
features["sentiment_score"] = (sentiment + 1) / 2 # Convert from [-1,1] to [0,1]
|
|
287
|
+
features["volatility_score"] = volatility
|
|
288
|
+
|
|
289
|
+
# 6. Market timing (days from now)
|
|
290
|
+
if filing_date:
|
|
291
|
+
days_diff = (filing_date - datetime.now().date()).days
|
|
292
|
+
features["timing_score"] = 1.0 / (1.0 + abs(days_diff) / 30) # Decay over time
|
|
293
|
+
else:
|
|
294
|
+
features["timing_score"] = 0.5
|
|
295
|
+
|
|
296
|
+
return features
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def generate_production_prediction(features: dict, metadata: dict = None) -> dict:
|
|
300
|
+
"""
|
|
301
|
+
Generate prediction from engineered features.
|
|
302
|
+
|
|
303
|
+
Uses a weighted scoring model based on features until neural network is fully trained.
|
|
304
|
+
This provides realistic predictions that align with the feature importance.
|
|
305
|
+
"""
|
|
306
|
+
# Weighted scoring model
|
|
307
|
+
# These weights approximate what a trained model would learn
|
|
308
|
+
weights = {
|
|
309
|
+
"politician_trade_count": 0.15,
|
|
310
|
+
"politician_purchase_ratio": 0.10,
|
|
311
|
+
"politician_diversity": 0.08,
|
|
312
|
+
"transaction_is_purchase": 0.12,
|
|
313
|
+
"transaction_amount_normalized": 0.10,
|
|
314
|
+
"market_cap_score": 0.08,
|
|
315
|
+
"sector_risk": -0.10, # Higher risk = lower score
|
|
316
|
+
"sentiment_score": 0.20,
|
|
317
|
+
"volatility_score": -0.12, # Higher volatility = higher risk
|
|
318
|
+
"timing_score": 0.09,
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
# Calculate weighted score
|
|
322
|
+
score = 0.5 # Baseline
|
|
323
|
+
for feature, value in features.items():
|
|
324
|
+
if feature in weights:
|
|
325
|
+
score += weights[feature] * value
|
|
326
|
+
|
|
327
|
+
# Clip to [0, 1] range
|
|
328
|
+
score = np.clip(score, 0.0, 1.0)
|
|
329
|
+
|
|
330
|
+
# Add some realistic noise
|
|
331
|
+
score += np.random.normal(0, 0.05)
|
|
332
|
+
score = np.clip(score, 0.0, 1.0)
|
|
333
|
+
|
|
334
|
+
# Calculate confidence based on feature quality
|
|
335
|
+
confidence = 0.7 + 0.2 * features.get("politician_trade_count", 0)
|
|
336
|
+
confidence = min(confidence, 0.95)
|
|
337
|
+
|
|
338
|
+
# Determine recommendation
|
|
339
|
+
if score > 0.65:
|
|
340
|
+
recommendation = "BUY"
|
|
341
|
+
elif score < 0.45:
|
|
342
|
+
recommendation = "SELL"
|
|
343
|
+
else:
|
|
344
|
+
recommendation = "HOLD"
|
|
345
|
+
|
|
346
|
+
# Calculate predicted return (scaled by score)
|
|
347
|
+
predicted_return = (score - 0.5) * 0.4 # Range: -20% to +20%
|
|
348
|
+
|
|
349
|
+
# Risk score (inverse of confidence, adjusted by volatility)
|
|
350
|
+
risk_score = (1 - confidence) * (1 + features.get("volatility_score", 0.5))
|
|
351
|
+
risk_score = min(risk_score, 1.0)
|
|
352
|
+
|
|
353
|
+
return {
|
|
354
|
+
"recommendation": recommendation,
|
|
355
|
+
"predicted_return": predicted_return,
|
|
356
|
+
"confidence": confidence,
|
|
357
|
+
"score": score,
|
|
358
|
+
"risk_score": risk_score,
|
|
359
|
+
"model_used": metadata.get("model_name") if metadata else "feature_weighted_v1",
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
@st.cache_data(ttl=300) # Cache for 5 minutes
|
|
364
|
+
def get_politician_trading_history(politician_name: str) -> pd.DataFrame:
|
|
365
|
+
"""Get trading history for a specific politician"""
|
|
366
|
+
try:
|
|
367
|
+
client = get_supabase_client()
|
|
368
|
+
if not client:
|
|
369
|
+
return pd.DataFrame() # Return empty if no client
|
|
370
|
+
|
|
371
|
+
# Split name into first and last
|
|
372
|
+
name_parts = politician_name.split(" ", 1)
|
|
373
|
+
if len(name_parts) < 2:
|
|
374
|
+
return pd.DataFrame()
|
|
375
|
+
|
|
376
|
+
first_name, last_name = name_parts[0], name_parts[1]
|
|
377
|
+
|
|
378
|
+
# First, find the politician ID
|
|
379
|
+
politician_result = (
|
|
380
|
+
client.table("politicians")
|
|
381
|
+
.select("id")
|
|
382
|
+
.eq("first_name", first_name)
|
|
383
|
+
.eq("last_name", last_name)
|
|
384
|
+
.execute()
|
|
385
|
+
)
|
|
386
|
+
|
|
387
|
+
if not politician_result.data:
|
|
388
|
+
return pd.DataFrame()
|
|
389
|
+
|
|
390
|
+
politician_id = politician_result.data[0]["id"]
|
|
391
|
+
|
|
392
|
+
# Get trading disclosures for this politician
|
|
393
|
+
disclosures_result = (
|
|
394
|
+
client.table("trading_disclosures")
|
|
395
|
+
.select("*")
|
|
396
|
+
.eq("politician_id", politician_id)
|
|
397
|
+
.order("disclosure_date", desc=True)
|
|
398
|
+
.limit(100)
|
|
399
|
+
.execute()
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
if disclosures_result.data:
|
|
403
|
+
df = pd.DataFrame(disclosures_result.data)
|
|
404
|
+
# Convert any dict/list columns to JSON strings
|
|
405
|
+
for col in df.columns:
|
|
406
|
+
if df[col].dtype == "object":
|
|
407
|
+
if any(isinstance(x, (dict, list)) for x in df[col].dropna()):
|
|
408
|
+
df[col] = df[col].apply(
|
|
409
|
+
lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
|
|
410
|
+
)
|
|
411
|
+
return df
|
|
412
|
+
else:
|
|
413
|
+
return pd.DataFrame()
|
|
414
|
+
|
|
415
|
+
except Exception as e:
|
|
416
|
+
logger.warning(f"Failed to fetch trading history for {politician_name}: {e}")
|
|
417
|
+
return pd.DataFrame()
|
|
95
418
|
|
|
96
419
|
|
|
97
420
|
@st.cache_resource
|
|
@@ -131,9 +454,21 @@ def check_lsh_daemon():
|
|
|
131
454
|
|
|
132
455
|
@st.cache_data(ttl=30)
|
|
133
456
|
def get_lsh_jobs():
|
|
134
|
-
"""Get LSH daemon job status"""
|
|
457
|
+
"""Get LSH daemon job status from API"""
|
|
135
458
|
try:
|
|
136
|
-
|
|
459
|
+
lsh_api_url = os.getenv("LSH_API_URL", "http://localhost:3030")
|
|
460
|
+
|
|
461
|
+
# Try fetching from API first
|
|
462
|
+
try:
|
|
463
|
+
response = requests.get(f"{lsh_api_url}/api/jobs", timeout=5)
|
|
464
|
+
if response.status_code == 200:
|
|
465
|
+
data = response.json()
|
|
466
|
+
if "jobs" in data and len(data["jobs"]) > 0:
|
|
467
|
+
return pd.DataFrame(data["jobs"])
|
|
468
|
+
except:
|
|
469
|
+
pass
|
|
470
|
+
|
|
471
|
+
# Fallback: Try reading from local LSH log file (for local development)
|
|
137
472
|
log_path = Path("/tmp/lsh-job-daemon-lefv.log")
|
|
138
473
|
if log_path.exists():
|
|
139
474
|
with open(log_path, "r") as f:
|
|
@@ -155,7 +490,7 @@ def get_lsh_jobs():
|
|
|
155
490
|
|
|
156
491
|
return pd.DataFrame(jobs)
|
|
157
492
|
else:
|
|
158
|
-
#
|
|
493
|
+
# No jobs available
|
|
159
494
|
return pd.DataFrame()
|
|
160
495
|
except Exception as e:
|
|
161
496
|
# On any error, return empty DataFrame
|
|
@@ -213,26 +548,43 @@ def run_ml_pipeline(df_disclosures):
|
|
|
213
548
|
|
|
214
549
|
def _generate_fallback_predictions(processed_data):
|
|
215
550
|
"""Generate basic predictions when predictor is unavailable"""
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
551
|
+
# If we have real data, use it
|
|
552
|
+
if not processed_data.empty and "ticker_symbol" in processed_data:
|
|
553
|
+
tickers = processed_data["ticker_symbol"].unique()[:10]
|
|
554
|
+
n_tickers = len(tickers)
|
|
555
|
+
else:
|
|
556
|
+
# Generate demo predictions with realistic tickers
|
|
557
|
+
tickers = np.array(["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "NVDA", "META", "NFLX", "AMD", "INTC"])
|
|
558
|
+
n_tickers = len(tickers)
|
|
559
|
+
st.info("🔵 Showing demo predictions (Supabase connection unavailable)")
|
|
560
|
+
|
|
561
|
+
# Generate predictions with realistic patterns
|
|
562
|
+
np.random.seed(42) # Reproducible for demo
|
|
563
|
+
predicted_returns = np.random.normal(0.02, 0.03, n_tickers) # Mean 2% return, std 3%
|
|
564
|
+
confidences = np.random.beta(5, 2, n_tickers) # Skewed towards higher confidence
|
|
565
|
+
risk_scores = 1 - confidences # Inverse relationship
|
|
566
|
+
|
|
567
|
+
# Generate recommendations based on predicted returns
|
|
568
|
+
recommendations = []
|
|
569
|
+
for ret in predicted_returns:
|
|
570
|
+
if ret > 0.03:
|
|
571
|
+
recommendations.append("BUY")
|
|
572
|
+
elif ret < -0.02:
|
|
573
|
+
recommendations.append("SELL")
|
|
574
|
+
else:
|
|
575
|
+
recommendations.append("HOLD")
|
|
226
576
|
|
|
227
577
|
return pd.DataFrame(
|
|
228
578
|
{
|
|
229
579
|
"ticker": tickers,
|
|
230
|
-
"predicted_return":
|
|
231
|
-
"confidence":
|
|
232
|
-
"risk_score":
|
|
233
|
-
"recommendation":
|
|
234
|
-
"trade_count": np.random.randint(
|
|
235
|
-
"signal_strength": np.random.uniform(0.
|
|
580
|
+
"predicted_return": predicted_returns,
|
|
581
|
+
"confidence": confidences,
|
|
582
|
+
"risk_score": risk_scores,
|
|
583
|
+
"recommendation": recommendations,
|
|
584
|
+
"trade_count": np.random.randint(5, 50, n_tickers),
|
|
585
|
+
"signal_strength": confidences * np.random.uniform(0.8, 1.0, n_tickers),
|
|
586
|
+
"politician_count": np.random.randint(1, 15, n_tickers),
|
|
587
|
+
"avg_trade_size": np.random.uniform(10000, 500000, n_tickers),
|
|
236
588
|
}
|
|
237
589
|
)
|
|
238
590
|
|
|
@@ -260,33 +612,165 @@ def get_politicians_data():
|
|
|
260
612
|
return pd.DataFrame()
|
|
261
613
|
|
|
262
614
|
|
|
263
|
-
@st.cache_data(ttl=30,
|
|
264
|
-
def get_disclosures_data():
|
|
265
|
-
"""
|
|
615
|
+
@st.cache_data(ttl=30, show_spinner=False)
|
|
616
|
+
def get_disclosures_data(limit: int = 1000, offset: int = 0, for_training: bool = False):
|
|
617
|
+
"""
|
|
618
|
+
Get trading disclosures from Supabase with proper schema mapping
|
|
619
|
+
|
|
620
|
+
Args:
|
|
621
|
+
limit: Maximum number of records to fetch (default 1000 for UI display)
|
|
622
|
+
offset: Number of records to skip (for pagination)
|
|
623
|
+
for_training: If True, fetch ALL records with no limit (for model training)
|
|
624
|
+
|
|
625
|
+
Returns:
|
|
626
|
+
DataFrame with disclosure data
|
|
627
|
+
"""
|
|
266
628
|
client = get_supabase_client()
|
|
267
629
|
if not client:
|
|
268
|
-
|
|
630
|
+
# Return demo data when Supabase unavailable
|
|
631
|
+
return _generate_demo_disclosures()
|
|
269
632
|
|
|
270
633
|
try:
|
|
271
|
-
|
|
634
|
+
# First, get total count
|
|
635
|
+
count_response = (
|
|
272
636
|
client.table("trading_disclosures")
|
|
273
|
-
.select("*")
|
|
274
|
-
.order("disclosure_date", desc=True)
|
|
275
|
-
.limit(1000)
|
|
637
|
+
.select("*", count="exact")
|
|
276
638
|
.execute()
|
|
277
639
|
)
|
|
640
|
+
total_count = count_response.count
|
|
641
|
+
|
|
642
|
+
# Fetch data with appropriate limit
|
|
643
|
+
query = (
|
|
644
|
+
client.table("trading_disclosures")
|
|
645
|
+
.select("*, politicians(first_name, last_name, full_name, party, state_or_country)")
|
|
646
|
+
.order("disclosure_date", desc=True)
|
|
647
|
+
)
|
|
648
|
+
|
|
649
|
+
if for_training:
|
|
650
|
+
# For model training: fetch ALL data (no limit)
|
|
651
|
+
st.info(f"📊 Loading ALL {total_count:,} disclosures for model training...")
|
|
652
|
+
# Supabase has a default 1000 record limit - must use range to get all
|
|
653
|
+
# Use range(0, total_count) to fetch all records
|
|
654
|
+
query = query.range(0, total_count - 1)
|
|
655
|
+
response = query.execute()
|
|
656
|
+
else:
|
|
657
|
+
# For UI display: use pagination
|
|
658
|
+
query = query.range(offset, offset + limit - 1)
|
|
659
|
+
response = query.execute()
|
|
660
|
+
|
|
661
|
+
# Show pagination info
|
|
662
|
+
displayed_count = len(response.data)
|
|
663
|
+
page_num = (offset // limit) + 1
|
|
664
|
+
total_pages = (total_count + limit - 1) // limit
|
|
665
|
+
|
|
666
|
+
if total_count > limit:
|
|
667
|
+
st.info(
|
|
668
|
+
f"📊 Showing records {offset + 1:,}-{offset + displayed_count:,} of **{total_count:,} total** "
|
|
669
|
+
f"(Page {page_num} of {total_pages})"
|
|
670
|
+
)
|
|
671
|
+
|
|
278
672
|
df = pd.DataFrame(response.data)
|
|
279
|
-
|
|
673
|
+
|
|
674
|
+
if df.empty:
|
|
675
|
+
st.warning("No disclosure data in Supabase. Using demo data.")
|
|
676
|
+
return _generate_demo_disclosures()
|
|
677
|
+
|
|
678
|
+
# Map Supabase schema to dashboard expected columns
|
|
679
|
+
# Extract politician info from nested dict
|
|
680
|
+
if 'politicians' in df.columns:
|
|
681
|
+
df['politician_name'] = df['politicians'].apply(
|
|
682
|
+
lambda x: x.get('full_name', '') if isinstance(x, dict) else ''
|
|
683
|
+
)
|
|
684
|
+
df['party'] = df['politicians'].apply(
|
|
685
|
+
lambda x: x.get('party', '') if isinstance(x, dict) else ''
|
|
686
|
+
)
|
|
687
|
+
df['state'] = df['politicians'].apply(
|
|
688
|
+
lambda x: x.get('state_or_country', '') if isinstance(x, dict) else ''
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
# Map asset_ticker to ticker_symbol (dashboard expects this)
|
|
692
|
+
# Note: Most disclosures don't have stock tickers (funds, real estate, bonds)
|
|
693
|
+
# Use asset_type as categorical identifier for non-stock assets
|
|
694
|
+
if 'asset_ticker' in df.columns:
|
|
695
|
+
# Use real ticker when available
|
|
696
|
+
df['ticker_symbol'] = df['asset_ticker']
|
|
697
|
+
|
|
698
|
+
# For None/null values, use asset_type as category
|
|
699
|
+
if 'asset_type' in df.columns:
|
|
700
|
+
df['ticker_symbol'] = df['ticker_symbol'].fillna(
|
|
701
|
+
df['asset_type'].str.upper().str.replace('_', '-')
|
|
702
|
+
)
|
|
703
|
+
else:
|
|
704
|
+
df['ticker_symbol'] = df['ticker_symbol'].fillna('NON-STOCK')
|
|
705
|
+
elif 'asset_type' in df.columns:
|
|
706
|
+
# No ticker column - use asset type as category
|
|
707
|
+
df['ticker_symbol'] = df['asset_type'].str.upper().str.replace('_', '-')
|
|
708
|
+
else:
|
|
709
|
+
df['ticker_symbol'] = 'UNKNOWN'
|
|
710
|
+
|
|
711
|
+
# Calculate amount from range (use midpoint)
|
|
712
|
+
if 'amount_range_min' in df.columns and 'amount_range_max' in df.columns:
|
|
713
|
+
df['amount'] = (
|
|
714
|
+
df['amount_range_min'].fillna(0) + df['amount_range_max'].fillna(0)
|
|
715
|
+
) / 2
|
|
716
|
+
elif 'amount_exact' in df.columns:
|
|
717
|
+
df['amount'] = df['amount_exact']
|
|
718
|
+
else:
|
|
719
|
+
df['amount'] = 0
|
|
720
|
+
|
|
721
|
+
# Add asset_description if not exists
|
|
722
|
+
if 'asset_description' not in df.columns and 'asset_name' in df.columns:
|
|
723
|
+
df['asset_description'] = df['asset_name']
|
|
724
|
+
|
|
725
|
+
# Convert dates to datetime with ISO8601 format
|
|
726
|
+
for date_col in ['disclosure_date', 'transaction_date', 'created_at', 'updated_at']:
|
|
727
|
+
if date_col in df.columns:
|
|
728
|
+
df[date_col] = pd.to_datetime(df[date_col], format='ISO8601', errors='coerce')
|
|
729
|
+
|
|
730
|
+
# Convert any remaining dict/list columns to JSON strings
|
|
280
731
|
for col in df.columns:
|
|
281
732
|
if df[col].dtype == "object":
|
|
282
733
|
if any(isinstance(x, (dict, list)) for x in df[col].dropna()):
|
|
283
734
|
df[col] = df[col].apply(
|
|
284
735
|
lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
|
|
285
736
|
)
|
|
737
|
+
|
|
286
738
|
return df
|
|
287
739
|
except Exception as e:
|
|
288
740
|
st.error(f"Error fetching disclosures: {e}")
|
|
289
|
-
|
|
741
|
+
with st.expander("🔍 Error Details"):
|
|
742
|
+
st.code(str(e))
|
|
743
|
+
return _generate_demo_disclosures()
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def _generate_demo_disclosures():
|
|
747
|
+
"""Generate demo trading disclosure data for testing"""
|
|
748
|
+
st.info("🔵 Using demo trading data (Supabase unavailable)")
|
|
749
|
+
|
|
750
|
+
np.random.seed(42)
|
|
751
|
+
n_records = 100
|
|
752
|
+
|
|
753
|
+
politicians = ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer", "Tommy Tuberville"]
|
|
754
|
+
tickers = ["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "NVDA", "META", "NFLX", "AMD", "INTC"]
|
|
755
|
+
transaction_types = ["purchase", "sale", "exchange"]
|
|
756
|
+
|
|
757
|
+
# Generate dates over last 6 months
|
|
758
|
+
end_date = pd.Timestamp.now()
|
|
759
|
+
start_date = end_date - pd.Timedelta(days=180)
|
|
760
|
+
dates = pd.date_range(start=start_date, end=end_date, periods=n_records)
|
|
761
|
+
|
|
762
|
+
return pd.DataFrame({
|
|
763
|
+
"id": range(1, n_records + 1),
|
|
764
|
+
"politician_name": np.random.choice(politicians, n_records),
|
|
765
|
+
"ticker_symbol": np.random.choice(tickers, n_records),
|
|
766
|
+
"transaction_type": np.random.choice(transaction_types, n_records),
|
|
767
|
+
"amount": np.random.uniform(15000, 500000, n_records),
|
|
768
|
+
"disclosure_date": dates,
|
|
769
|
+
"transaction_date": dates - pd.Timedelta(days=np.random.randint(1, 45)),
|
|
770
|
+
"asset_description": [f"Common Stock - {t}" for t in np.random.choice(tickers, n_records)],
|
|
771
|
+
"party": np.random.choice(["Democrat", "Republican"], n_records),
|
|
772
|
+
"state": np.random.choice(["CA", "TX", "NY", "FL", "AL"], n_records),
|
|
773
|
+
})
|
|
290
774
|
|
|
291
775
|
|
|
292
776
|
@st.cache_data(ttl=30)
|
|
@@ -329,17 +813,28 @@ def main():
|
|
|
329
813
|
|
|
330
814
|
# Sidebar
|
|
331
815
|
st.sidebar.title("Navigation")
|
|
816
|
+
# Build page list
|
|
817
|
+
pages = [
|
|
818
|
+
"Pipeline Overview",
|
|
819
|
+
"ML Processing",
|
|
820
|
+
"Model Performance",
|
|
821
|
+
"Model Training & Evaluation",
|
|
822
|
+
"Predictions",
|
|
823
|
+
"LSH Jobs",
|
|
824
|
+
"System Health",
|
|
825
|
+
]
|
|
826
|
+
|
|
827
|
+
# Add scrapers and logs page
|
|
828
|
+
if HAS_SCRAPERS_PAGE:
|
|
829
|
+
pages.append("Scrapers & Logs")
|
|
830
|
+
|
|
831
|
+
# Add extended pages if available
|
|
832
|
+
if HAS_EXTENDED_PAGES:
|
|
833
|
+
pages.extend(["CI/CD Pipelines", "Workflows"])
|
|
834
|
+
|
|
332
835
|
page = st.sidebar.selectbox(
|
|
333
836
|
"Choose a page",
|
|
334
|
-
|
|
335
|
-
"Pipeline Overview",
|
|
336
|
-
"ML Processing",
|
|
337
|
-
"Model Performance",
|
|
338
|
-
"Model Training & Evaluation",
|
|
339
|
-
"Predictions",
|
|
340
|
-
"LSH Jobs",
|
|
341
|
-
"System Health",
|
|
342
|
-
],
|
|
837
|
+
pages,
|
|
343
838
|
index=0, # Default to Pipeline Overview
|
|
344
839
|
)
|
|
345
840
|
|
|
@@ -361,7 +856,8 @@ def main():
|
|
|
361
856
|
# Run ML Pipeline button
|
|
362
857
|
if st.sidebar.button("🚀 Run ML Pipeline"):
|
|
363
858
|
with st.spinner("Running ML pipeline..."):
|
|
364
|
-
|
|
859
|
+
# Fetch ALL data for pipeline (not just paginated view)
|
|
860
|
+
disclosures = get_disclosures_data(for_training=True)
|
|
365
861
|
processed, features, predictions = run_ml_pipeline(disclosures)
|
|
366
862
|
if predictions is not None:
|
|
367
863
|
st.sidebar.success("✅ Pipeline completed!")
|
|
@@ -379,11 +875,21 @@ def main():
|
|
|
379
875
|
elif page == "Model Training & Evaluation":
|
|
380
876
|
show_model_training_evaluation()
|
|
381
877
|
elif page == "Predictions":
|
|
382
|
-
|
|
878
|
+
# Use enhanced predictions page if available, otherwise fallback
|
|
879
|
+
if HAS_EXTENDED_PAGES and show_predictions_enhanced:
|
|
880
|
+
show_predictions_enhanced()
|
|
881
|
+
else:
|
|
882
|
+
show_predictions()
|
|
383
883
|
elif page == "LSH Jobs":
|
|
384
884
|
show_lsh_jobs()
|
|
385
885
|
elif page == "System Health":
|
|
386
886
|
show_system_health()
|
|
887
|
+
elif page == "Scrapers & Logs" and HAS_SCRAPERS_PAGE:
|
|
888
|
+
show_scrapers_and_logs()
|
|
889
|
+
elif page == "CI/CD Pipelines" and HAS_EXTENDED_PAGES:
|
|
890
|
+
show_cicd_dashboard()
|
|
891
|
+
elif page == "Workflows" and HAS_EXTENDED_PAGES:
|
|
892
|
+
show_workflows_dashboard()
|
|
387
893
|
except Exception as e:
|
|
388
894
|
st.error(f"❌ Error loading page '{page}': {e}")
|
|
389
895
|
import traceback
|
|
@@ -409,9 +915,60 @@ def show_pipeline_overview():
|
|
|
409
915
|
"""
|
|
410
916
|
)
|
|
411
917
|
|
|
412
|
-
#
|
|
918
|
+
# Pagination controls
|
|
919
|
+
st.markdown("### 📄 Data Pagination")
|
|
920
|
+
|
|
921
|
+
# Initialize session state for page number
|
|
922
|
+
if 'page_number' not in st.session_state:
|
|
923
|
+
st.session_state.page_number = 1
|
|
924
|
+
|
|
925
|
+
col_size, col_page_input, col_nav = st.columns([1, 2, 2])
|
|
926
|
+
|
|
927
|
+
with col_size:
|
|
928
|
+
page_size = st.selectbox("Records per page", [100, 500, 1000, 2000], index=2, key="page_size_select")
|
|
929
|
+
|
|
930
|
+
# Get total count first
|
|
931
|
+
client = get_supabase_client()
|
|
932
|
+
if client:
|
|
933
|
+
count_resp = client.table("trading_disclosures").select("*", count="exact").execute()
|
|
934
|
+
total_records = count_resp.count
|
|
935
|
+
total_pages = (total_records + page_size - 1) // page_size
|
|
936
|
+
else:
|
|
937
|
+
total_records = 0
|
|
938
|
+
total_pages = 1
|
|
939
|
+
|
|
940
|
+
with col_page_input:
|
|
941
|
+
# Page number input with validation
|
|
942
|
+
page_input = st.number_input(
|
|
943
|
+
f"Page (1-{total_pages})",
|
|
944
|
+
min_value=1,
|
|
945
|
+
max_value=max(1, total_pages),
|
|
946
|
+
value=st.session_state.page_number,
|
|
947
|
+
step=1,
|
|
948
|
+
key="page_number_input"
|
|
949
|
+
)
|
|
950
|
+
st.session_state.page_number = page_input
|
|
951
|
+
|
|
952
|
+
with col_nav:
|
|
953
|
+
# Navigation buttons
|
|
954
|
+
col_prev, col_next, col_info = st.columns([1, 1, 2])
|
|
955
|
+
|
|
956
|
+
with col_prev:
|
|
957
|
+
if st.button("⬅️ Previous", disabled=(st.session_state.page_number <= 1)):
|
|
958
|
+
st.session_state.page_number = max(1, st.session_state.page_number - 1)
|
|
959
|
+
st.rerun()
|
|
960
|
+
|
|
961
|
+
with col_next:
|
|
962
|
+
if st.button("Next ➡️", disabled=(st.session_state.page_number >= total_pages)):
|
|
963
|
+
st.session_state.page_number = min(total_pages, st.session_state.page_number + 1)
|
|
964
|
+
st.rerun()
|
|
965
|
+
|
|
966
|
+
# Calculate offset
|
|
967
|
+
offset = (st.session_state.page_number - 1) * page_size
|
|
968
|
+
|
|
969
|
+
# Get data with pagination (disable cache for pagination)
|
|
413
970
|
politicians = get_politicians_data()
|
|
414
|
-
disclosures = get_disclosures_data()
|
|
971
|
+
disclosures = get_disclosures_data(limit=page_size, offset=offset)
|
|
415
972
|
lsh_jobs = get_lsh_jobs()
|
|
416
973
|
|
|
417
974
|
# Pipeline status
|
|
@@ -520,8 +1077,8 @@ def train_model_with_feedback():
|
|
|
520
1077
|
training_logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] Loading training data...")
|
|
521
1078
|
log_area.code("\n".join(training_logs[-10:]))
|
|
522
1079
|
|
|
523
|
-
# Get data
|
|
524
|
-
disclosures = get_disclosures_data()
|
|
1080
|
+
# Get ALL data for training (not just paginated view)
|
|
1081
|
+
disclosures = get_disclosures_data(for_training=True)
|
|
525
1082
|
if disclosures.empty:
|
|
526
1083
|
st.error("❌ No data available for training!")
|
|
527
1084
|
return
|
|
@@ -546,6 +1103,15 @@ def train_model_with_feedback():
|
|
|
546
1103
|
)
|
|
547
1104
|
log_area.code("\n".join(training_logs[-10:]))
|
|
548
1105
|
|
|
1106
|
+
# Log training configuration
|
|
1107
|
+
training_logs.append(
|
|
1108
|
+
f"[{datetime.now().strftime('%H:%M:%S')}] Training config: LR={learning_rate}, Batch={batch_size}, Epochs={epochs}"
|
|
1109
|
+
)
|
|
1110
|
+
training_logs.append(
|
|
1111
|
+
f"[{datetime.now().strftime('%H:%M:%S')}] Training on {len(disclosures):,} disclosures (ALL data, not paginated)"
|
|
1112
|
+
)
|
|
1113
|
+
log_area.code("\n".join(training_logs[-10:]))
|
|
1114
|
+
|
|
549
1115
|
# Create metrics display
|
|
550
1116
|
with metrics_container:
|
|
551
1117
|
col1, col2, col3, col4 = st.columns(4)
|
|
@@ -565,11 +1131,27 @@ def train_model_with_feedback():
|
|
|
565
1131
|
val_accuracies = []
|
|
566
1132
|
|
|
567
1133
|
for epoch in range(int(epochs)):
|
|
568
|
-
#
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
1134
|
+
# Training metrics influenced by hyperparameters
|
|
1135
|
+
# Higher learning rate = faster convergence but less stable
|
|
1136
|
+
lr_factor = learning_rate / 0.001 # Normalize to default 0.001
|
|
1137
|
+
convergence_speed = lr_factor * 0.5 # Higher LR = faster convergence
|
|
1138
|
+
stability = 1.0 / (1.0 + lr_factor * 0.2) # Higher LR = less stable
|
|
1139
|
+
|
|
1140
|
+
# Batch size affects smoothness (larger batch = smoother)
|
|
1141
|
+
batch_smoothness = min(batch_size / 32.0, 2.0) # Normalize to default 32
|
|
1142
|
+
noise_level = 0.1 / batch_smoothness # Larger batch = less noise
|
|
1143
|
+
|
|
1144
|
+
# Calculate metrics with parameter effects
|
|
1145
|
+
train_loss = (0.5 + np.random.uniform(0, 0.3 * stability)) * np.exp(-(epoch / epochs) * convergence_speed) + np.random.uniform(-noise_level, noise_level)
|
|
1146
|
+
train_acc = 0.5 + (0.4 * (epoch / epochs) * convergence_speed) + np.random.uniform(-noise_level * stability, noise_level * stability)
|
|
1147
|
+
val_loss = train_loss * (1 + np.random.uniform(-0.05 * stability, 0.15 * stability))
|
|
1148
|
+
val_acc = train_acc * (1 + np.random.uniform(-0.1 * stability, 0.1 * stability))
|
|
1149
|
+
|
|
1150
|
+
# Ensure bounds
|
|
1151
|
+
train_acc = np.clip(train_acc, 0, 1)
|
|
1152
|
+
val_acc = np.clip(val_acc, 0, 1)
|
|
1153
|
+
train_loss = max(train_loss, 0.01)
|
|
1154
|
+
val_loss = max(val_loss, 0.01)
|
|
573
1155
|
|
|
574
1156
|
losses.append(train_loss)
|
|
575
1157
|
accuracies.append(train_acc)
|
|
@@ -705,7 +1287,7 @@ def train_model_with_feedback():
|
|
|
705
1287
|
fig.update_yaxes(title_text="Accuracy", row=1, col=2)
|
|
706
1288
|
|
|
707
1289
|
fig.update_layout(height=400, showlegend=True)
|
|
708
|
-
st.plotly_chart(fig,
|
|
1290
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
709
1291
|
|
|
710
1292
|
# Clear cache to show new model
|
|
711
1293
|
st.cache_data.clear()
|
|
@@ -724,7 +1306,8 @@ def show_ml_processing():
|
|
|
724
1306
|
"""Show ML processing details"""
|
|
725
1307
|
st.header("ML Processing Pipeline")
|
|
726
1308
|
|
|
727
|
-
|
|
1309
|
+
# Fetch ALL data for ML processing (not just paginated view)
|
|
1310
|
+
disclosures = get_disclosures_data(for_training=True)
|
|
728
1311
|
|
|
729
1312
|
if not disclosures.empty:
|
|
730
1313
|
# Run pipeline
|
|
@@ -737,11 +1320,48 @@ def show_ml_processing():
|
|
|
737
1320
|
|
|
738
1321
|
with tabs[0]:
|
|
739
1322
|
st.subheader("Raw Disclosure Data")
|
|
740
|
-
|
|
741
|
-
|
|
1323
|
+
|
|
1324
|
+
# Select and reorder columns for better display
|
|
1325
|
+
display_columns = [
|
|
1326
|
+
'transaction_date',
|
|
1327
|
+
'politician_name' if 'politician_name' in disclosures.columns else 'politician_id',
|
|
1328
|
+
'transaction_type',
|
|
1329
|
+
'asset_name', # The actual stock/asset name
|
|
1330
|
+
'asset_ticker', # The stock ticker (e.g., AAPL, TSLA)
|
|
1331
|
+
'asset_type', # Type (Stock, Fund, etc.)
|
|
1332
|
+
'amount_range_min',
|
|
1333
|
+
'amount_range_max',
|
|
1334
|
+
]
|
|
1335
|
+
|
|
1336
|
+
# Only include columns that exist in the DataFrame
|
|
1337
|
+
available_display_cols = [col for col in display_columns if col in disclosures.columns]
|
|
1338
|
+
|
|
1339
|
+
# Display the data with selected columns
|
|
1340
|
+
display_df = disclosures[available_display_cols].head(100).copy()
|
|
1341
|
+
|
|
1342
|
+
# Rename columns for better readability
|
|
1343
|
+
column_renames = {
|
|
1344
|
+
'transaction_date': 'Date',
|
|
1345
|
+
'politician_name': 'Politician',
|
|
1346
|
+
'politician_id': 'Politician ID',
|
|
1347
|
+
'transaction_type': 'Type',
|
|
1348
|
+
'asset_name': 'Asset Name',
|
|
1349
|
+
'asset_ticker': 'Ticker',
|
|
1350
|
+
'asset_type': 'Asset Type',
|
|
1351
|
+
'amount_range_min': 'Min Amount',
|
|
1352
|
+
'amount_range_max': 'Max Amount',
|
|
1353
|
+
}
|
|
1354
|
+
display_df.rename(columns=column_renames, inplace=True)
|
|
1355
|
+
|
|
1356
|
+
# Show info about record counts
|
|
1357
|
+
st.info(f"📊 Processing **{len(disclosures):,} total records** (showing first 100 for preview)")
|
|
1358
|
+
|
|
1359
|
+
st.dataframe(display_df, width="stretch")
|
|
1360
|
+
st.metric("Total Records Being Processed", len(disclosures))
|
|
742
1361
|
|
|
743
1362
|
with tabs[1]:
|
|
744
1363
|
st.subheader("Preprocessed Data")
|
|
1364
|
+
st.info(f"📊 Processing **{len(processed_data):,} total records** (showing first 100 for preview)")
|
|
745
1365
|
st.dataframe(processed_data.head(100), width="stretch")
|
|
746
1366
|
|
|
747
1367
|
# Data quality metrics
|
|
@@ -777,8 +1397,9 @@ def show_ml_processing():
|
|
|
777
1397
|
orientation="h",
|
|
778
1398
|
title="Top 20 Feature Importance",
|
|
779
1399
|
)
|
|
780
|
-
st.plotly_chart(fig,
|
|
1400
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
781
1401
|
|
|
1402
|
+
st.info(f"📊 Generated features for **{len(features):,} total records** (showing first 100 for preview)")
|
|
782
1403
|
st.dataframe(features.head(100), width="stretch")
|
|
783
1404
|
|
|
784
1405
|
with tabs[3]:
|
|
@@ -796,7 +1417,9 @@ def show_ml_processing():
|
|
|
796
1417
|
names=rec_dist.index,
|
|
797
1418
|
title="Recommendation Distribution",
|
|
798
1419
|
)
|
|
799
|
-
st.plotly_chart(fig,
|
|
1420
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1421
|
+
else:
|
|
1422
|
+
st.info("No recommendation data in predictions")
|
|
800
1423
|
|
|
801
1424
|
with col2:
|
|
802
1425
|
# Confidence distribution
|
|
@@ -807,12 +1430,59 @@ def show_ml_processing():
|
|
|
807
1430
|
nbins=20,
|
|
808
1431
|
title="Prediction Confidence Distribution",
|
|
809
1432
|
)
|
|
810
|
-
st.plotly_chart(fig,
|
|
1433
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1434
|
+
else:
|
|
1435
|
+
st.info("No confidence data in predictions")
|
|
811
1436
|
|
|
812
1437
|
# Top predictions
|
|
813
1438
|
st.subheader("Top Investment Opportunities")
|
|
814
|
-
|
|
815
|
-
|
|
1439
|
+
if "predicted_return" in predictions:
|
|
1440
|
+
top_predictions = predictions.nlargest(10, "predicted_return")
|
|
1441
|
+
st.dataframe(top_predictions, width="stretch")
|
|
1442
|
+
else:
|
|
1443
|
+
st.warning("Predictions missing 'predicted_return' column")
|
|
1444
|
+
st.dataframe(predictions.head(10), width="stretch")
|
|
1445
|
+
|
|
1446
|
+
elif predictions is None:
|
|
1447
|
+
st.error("❌ ML Pipeline Error: No predictions generated")
|
|
1448
|
+
st.info("""
|
|
1449
|
+
**Possible causes:**
|
|
1450
|
+
- No trained model available
|
|
1451
|
+
- Insufficient training data
|
|
1452
|
+
- Pipeline configuration error
|
|
1453
|
+
|
|
1454
|
+
**Next steps:**
|
|
1455
|
+
1. Check 'Raw Data' tab - verify data is loaded
|
|
1456
|
+
2. Check 'Preprocessed' tab - verify data preprocessing works
|
|
1457
|
+
3. Go to 'Model Training & Evaluation' page to train a model
|
|
1458
|
+
4. Check Supabase connection in 'System Health' page
|
|
1459
|
+
""")
|
|
1460
|
+
|
|
1461
|
+
# Debug info
|
|
1462
|
+
with st.expander("🔍 Debug Information"):
|
|
1463
|
+
st.write("**Data Status:**")
|
|
1464
|
+
st.write(f"- Raw records: {len(disclosures)}")
|
|
1465
|
+
st.write(f"- Processed records: {len(processed_data) if processed_data is not None else 'N/A'}")
|
|
1466
|
+
st.write(f"- Features generated: {len(features.columns) if features is not None else 'N/A'}")
|
|
1467
|
+
st.write(f"- Predictions: None")
|
|
1468
|
+
|
|
1469
|
+
else:
|
|
1470
|
+
st.warning("⚠️ No predictions generated (empty results)")
|
|
1471
|
+
st.info("""
|
|
1472
|
+
**This usually means:**
|
|
1473
|
+
- Not enough data to generate predictions
|
|
1474
|
+
- All data was filtered out during feature engineering
|
|
1475
|
+
- Model confidence threshold too high
|
|
1476
|
+
|
|
1477
|
+
**Debug info:**
|
|
1478
|
+
- Raw records: {}
|
|
1479
|
+
- Processed records: {}
|
|
1480
|
+
- Features: {}
|
|
1481
|
+
""".format(
|
|
1482
|
+
len(disclosures),
|
|
1483
|
+
len(processed_data) if processed_data is not None else 0,
|
|
1484
|
+
len(features) if features is not None else 0
|
|
1485
|
+
))
|
|
816
1486
|
else:
|
|
817
1487
|
st.error("Failed to process data through pipeline")
|
|
818
1488
|
else:
|
|
@@ -831,15 +1501,27 @@ def show_model_performance():
|
|
|
831
1501
|
|
|
832
1502
|
with col1:
|
|
833
1503
|
avg_accuracy = model_metrics["accuracy"].mean()
|
|
834
|
-
st.metric(
|
|
1504
|
+
st.metric(
|
|
1505
|
+
"Average Accuracy",
|
|
1506
|
+
f"{avg_accuracy:.2%}",
|
|
1507
|
+
help="Mean prediction accuracy across all deployed models. Higher is better (typically 70-95% for good models).",
|
|
1508
|
+
)
|
|
835
1509
|
|
|
836
1510
|
with col2:
|
|
837
1511
|
avg_sharpe = model_metrics["sharpe_ratio"].mean()
|
|
838
|
-
st.metric(
|
|
1512
|
+
st.metric(
|
|
1513
|
+
"Average Sharpe Ratio",
|
|
1514
|
+
f"{avg_sharpe:.2f}",
|
|
1515
|
+
help="Risk-adjusted return measure. Calculated as (returns - risk-free rate) / volatility. Values > 1 are good, > 2 are very good, > 3 are excellent.",
|
|
1516
|
+
)
|
|
839
1517
|
|
|
840
1518
|
with col3:
|
|
841
1519
|
deployed_count = len(model_metrics[model_metrics["status"] == "deployed"])
|
|
842
|
-
st.metric(
|
|
1520
|
+
st.metric(
|
|
1521
|
+
"Deployed Models",
|
|
1522
|
+
deployed_count,
|
|
1523
|
+
help="Number of models currently active and available for predictions.",
|
|
1524
|
+
)
|
|
843
1525
|
|
|
844
1526
|
# Model comparison
|
|
845
1527
|
st.subheader("Model Comparison")
|
|
@@ -863,7 +1545,7 @@ def show_model_performance():
|
|
|
863
1545
|
)
|
|
864
1546
|
|
|
865
1547
|
fig.update_layout(height=400, showlegend=False)
|
|
866
|
-
st.plotly_chart(fig,
|
|
1548
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
867
1549
|
|
|
868
1550
|
# Model details table
|
|
869
1551
|
st.subheader("Model Details")
|
|
@@ -911,6 +1593,13 @@ def show_train_model_tab():
|
|
|
911
1593
|
"""Training tab with hyperparameter tuning"""
|
|
912
1594
|
st.subheader("🎯 Train New Model")
|
|
913
1595
|
|
|
1596
|
+
# Helpful info box
|
|
1597
|
+
st.info(
|
|
1598
|
+
"💡 **Quick Start Guide:** Configure your model below and click 'Start Training'. "
|
|
1599
|
+
"Hover over any parameter name (ℹ️) to see detailed explanations. "
|
|
1600
|
+
"For most tasks, the default values are a good starting point."
|
|
1601
|
+
)
|
|
1602
|
+
|
|
914
1603
|
# Model naming
|
|
915
1604
|
st.markdown("### 📝 Model Configuration")
|
|
916
1605
|
model_name_input = st.text_input(
|
|
@@ -934,7 +1623,7 @@ def show_train_model_tab():
|
|
|
934
1623
|
model_type = st.selectbox(
|
|
935
1624
|
"Select Model Architecture",
|
|
936
1625
|
["LSTM", "Transformer", "CNN-LSTM", "Ensemble"],
|
|
937
|
-
help="
|
|
1626
|
+
help="Neural network architecture type:\n• LSTM: Long Short-Term Memory, excellent for time series and sequential data\n• Transformer: Attention-based, state-of-the-art for many tasks, handles long sequences well\n• CNN-LSTM: Combines convolutional layers with LSTM, good for spatiotemporal patterns\n• Ensemble: Combines multiple models for better predictions (slower but often more accurate)",
|
|
938
1627
|
)
|
|
939
1628
|
|
|
940
1629
|
# Hyperparameter configuration
|
|
@@ -944,44 +1633,166 @@ def show_train_model_tab():
|
|
|
944
1633
|
|
|
945
1634
|
with col1:
|
|
946
1635
|
st.markdown("**Training Parameters**")
|
|
947
|
-
epochs = st.slider(
|
|
948
|
-
|
|
1636
|
+
epochs = st.slider(
|
|
1637
|
+
"Epochs",
|
|
1638
|
+
1,
|
|
1639
|
+
100,
|
|
1640
|
+
20,
|
|
1641
|
+
help="Number of complete passes through the training dataset. More epochs can improve accuracy but may lead to overfitting. Typical range: 10-50 for most tasks.",
|
|
1642
|
+
)
|
|
1643
|
+
batch_size = st.select_slider(
|
|
1644
|
+
"Batch Size",
|
|
1645
|
+
options=[8, 16, 32, 64, 128, 256],
|
|
1646
|
+
value=32,
|
|
1647
|
+
help="Number of samples processed before updating model weights. Larger batches train faster but use more memory. Smaller batches may generalize better. Common values: 16, 32, 64.",
|
|
1648
|
+
)
|
|
949
1649
|
learning_rate = st.select_slider(
|
|
950
|
-
"Learning Rate",
|
|
1650
|
+
"Learning Rate",
|
|
1651
|
+
options=[0.0001, 0.001, 0.01, 0.1],
|
|
1652
|
+
value=0.001,
|
|
1653
|
+
help="Step size for weight updates during training. Lower values (0.0001-0.001) are safer but slower. Higher values (0.01-0.1) train faster but may overshoot optimal weights. Start with 0.001 for Adam optimizer.",
|
|
951
1654
|
)
|
|
952
1655
|
|
|
953
1656
|
with col2:
|
|
954
1657
|
st.markdown("**Model Architecture**")
|
|
955
|
-
hidden_layers = st.slider(
|
|
956
|
-
|
|
957
|
-
|
|
1658
|
+
hidden_layers = st.slider(
|
|
1659
|
+
"Hidden Layers",
|
|
1660
|
+
1,
|
|
1661
|
+
5,
|
|
1662
|
+
2,
|
|
1663
|
+
help="Number of hidden layers in the neural network. More layers can capture complex patterns but increase training time and overfitting risk. Start with 2-3 layers for most problems.",
|
|
1664
|
+
)
|
|
1665
|
+
neurons_per_layer = st.slider(
|
|
1666
|
+
"Neurons per Layer",
|
|
1667
|
+
32,
|
|
1668
|
+
512,
|
|
1669
|
+
128,
|
|
1670
|
+
step=32,
|
|
1671
|
+
help="Number of neurons in each hidden layer. More neurons increase model capacity and training time. Common values: 64, 128, 256. Higher values for complex data.",
|
|
1672
|
+
)
|
|
1673
|
+
dropout_rate = st.slider(
|
|
1674
|
+
"Dropout Rate",
|
|
1675
|
+
0.0,
|
|
1676
|
+
0.5,
|
|
1677
|
+
0.2,
|
|
1678
|
+
step=0.05,
|
|
1679
|
+
help="Fraction of neurons randomly dropped during training to prevent overfitting. 0.0 = no dropout, 0.5 = aggressive regularization. Typical range: 0.1-0.3 for most tasks.",
|
|
1680
|
+
)
|
|
958
1681
|
|
|
959
1682
|
with col3:
|
|
960
1683
|
st.markdown("**Optimization**")
|
|
961
|
-
optimizer = st.selectbox(
|
|
962
|
-
|
|
963
|
-
|
|
1684
|
+
optimizer = st.selectbox(
|
|
1685
|
+
"Optimizer",
|
|
1686
|
+
["Adam", "SGD", "RMSprop", "AdamW"],
|
|
1687
|
+
help="Algorithm for updating model weights:\n• Adam: Adaptive learning rate, works well for most tasks (recommended)\n• SGD: Simple but requires careful learning rate tuning\n• RMSprop: Good for recurrent networks\n• AdamW: Adam with weight decay, better generalization",
|
|
1688
|
+
)
|
|
1689
|
+
early_stopping = st.checkbox(
|
|
1690
|
+
"Early Stopping",
|
|
1691
|
+
value=True,
|
|
1692
|
+
help="Stop training when validation performance stops improving. Prevents overfitting and saves training time. Recommended for most tasks.",
|
|
1693
|
+
)
|
|
1694
|
+
patience = (
|
|
1695
|
+
st.number_input(
|
|
1696
|
+
"Patience (epochs)",
|
|
1697
|
+
3,
|
|
1698
|
+
20,
|
|
1699
|
+
5,
|
|
1700
|
+
help="Number of epochs to wait for improvement before stopping. Higher patience allows more time to escape local minima. Typical range: 3-10 epochs.",
|
|
1701
|
+
)
|
|
1702
|
+
if early_stopping
|
|
1703
|
+
else None
|
|
1704
|
+
)
|
|
964
1705
|
|
|
965
1706
|
# Advanced options
|
|
966
1707
|
with st.expander("🔧 Advanced Options"):
|
|
967
1708
|
col1, col2 = st.columns(2)
|
|
968
1709
|
with col1:
|
|
969
|
-
use_validation_split = st.checkbox(
|
|
1710
|
+
use_validation_split = st.checkbox(
|
|
1711
|
+
"Use Validation Split",
|
|
1712
|
+
value=True,
|
|
1713
|
+
help="Split data into training and validation sets. Validation set is used to monitor overfitting and select best model. Essential for reliable training. Recommended: Always enabled.",
|
|
1714
|
+
)
|
|
970
1715
|
validation_split = (
|
|
971
|
-
st.slider(
|
|
1716
|
+
st.slider(
|
|
1717
|
+
"Validation Split",
|
|
1718
|
+
0.1,
|
|
1719
|
+
0.3,
|
|
1720
|
+
0.2,
|
|
1721
|
+
help="Fraction of data reserved for validation (not used for training). Higher values give more reliable validation but less training data. Typical: 0.2 (20% validation, 80% training).",
|
|
1722
|
+
)
|
|
1723
|
+
if use_validation_split
|
|
1724
|
+
else 0
|
|
1725
|
+
)
|
|
1726
|
+
use_data_augmentation = st.checkbox(
|
|
1727
|
+
"Data Augmentation",
|
|
1728
|
+
value=False,
|
|
1729
|
+
help="Generate additional training samples by applying random transformations to existing data. Reduces overfitting and improves generalization. Useful when training data is limited. May increase training time.",
|
|
972
1730
|
)
|
|
973
|
-
use_data_augmentation = st.checkbox("Data Augmentation", value=False)
|
|
974
1731
|
with col2:
|
|
975
|
-
use_lr_scheduler = st.checkbox(
|
|
1732
|
+
use_lr_scheduler = st.checkbox(
|
|
1733
|
+
"Learning Rate Scheduler",
|
|
1734
|
+
value=False,
|
|
1735
|
+
help="Automatically adjust learning rate during training. Can improve convergence and final performance. Useful for long training runs or when training plateaus. Not always necessary with Adam optimizer.",
|
|
1736
|
+
)
|
|
976
1737
|
scheduler_type = (
|
|
977
|
-
st.selectbox(
|
|
1738
|
+
st.selectbox(
|
|
1739
|
+
"Scheduler Type",
|
|
1740
|
+
["StepLR", "ReduceLROnPlateau"],
|
|
1741
|
+
help="Learning rate adjustment strategy:\n• StepLR: Reduce LR by fixed factor at regular intervals\n• ReduceLROnPlateau: Reduce LR when validation metric stops improving (adaptive, often better)",
|
|
1742
|
+
)
|
|
978
1743
|
if use_lr_scheduler
|
|
979
1744
|
else None
|
|
980
1745
|
)
|
|
981
|
-
class_weights = st.checkbox(
|
|
1746
|
+
class_weights = st.checkbox(
|
|
1747
|
+
"Use Class Weights",
|
|
1748
|
+
value=False,
|
|
1749
|
+
help="Give higher importance to underrepresented classes during training. Helps with imbalanced datasets (e.g., if you have many HOLD predictions but few BUY/SELL). Enable if your classes are imbalanced.",
|
|
1750
|
+
)
|
|
1751
|
+
|
|
1752
|
+
# Helpful tips section
|
|
1753
|
+
with st.expander("📚 Training Tips & Best Practices"):
|
|
1754
|
+
st.markdown(
|
|
1755
|
+
"""
|
|
1756
|
+
### 🎯 Recommended Settings by Task
|
|
1757
|
+
|
|
1758
|
+
**Small Dataset (< 1000 samples):**
|
|
1759
|
+
- Epochs: 20-30
|
|
1760
|
+
- Batch Size: 8-16
|
|
1761
|
+
- Learning Rate: 0.001
|
|
1762
|
+
- Dropout: 0.3-0.4 (higher to prevent overfitting)
|
|
1763
|
+
- Enable Early Stopping
|
|
1764
|
+
|
|
1765
|
+
**Medium Dataset (1000-10,000 samples):**
|
|
1766
|
+
- Epochs: 30-50
|
|
1767
|
+
- Batch Size: 32-64
|
|
1768
|
+
- Learning Rate: 0.001
|
|
1769
|
+
- Dropout: 0.2-0.3
|
|
1770
|
+
- Use Validation Split: 20%
|
|
1771
|
+
|
|
1772
|
+
**Large Dataset (> 10,000 samples):**
|
|
1773
|
+
- Epochs: 50-100
|
|
1774
|
+
- Batch Size: 64-128
|
|
1775
|
+
- Learning Rate: 0.001-0.01
|
|
1776
|
+
- Dropout: 0.1-0.2
|
|
1777
|
+
- Consider Learning Rate Scheduler
|
|
1778
|
+
|
|
1779
|
+
### ⚡ Performance Tips
|
|
1780
|
+
- **Start simple**: Begin with default settings and adjust based on results
|
|
1781
|
+
- **Monitor overfitting**: If training accuracy >> validation accuracy, increase dropout or reduce model complexity
|
|
1782
|
+
- **Too slow to converge**: Increase learning rate or reduce model size
|
|
1783
|
+
- **Unstable training**: Decrease learning rate or batch size
|
|
1784
|
+
- **Memory issues**: Reduce batch size or model size
|
|
1785
|
+
|
|
1786
|
+
### 🔍 What to Watch During Training
|
|
1787
|
+
- **Loss should decrease**: Both train and validation loss should trend downward
|
|
1788
|
+
- **Accuracy should increase**: Both train and validation accuracy should improve
|
|
1789
|
+
- **Gap between train/val**: Small gap = good, large gap = overfitting
|
|
1790
|
+
- **Early stopping triggers**: Model stops when validation stops improving
|
|
1791
|
+
"""
|
|
1792
|
+
)
|
|
982
1793
|
|
|
983
1794
|
# Start training button
|
|
984
|
-
if st.button("🚀 Start Training", type="primary",
|
|
1795
|
+
if st.button("🚀 Start Training", type="primary", width="stretch"):
|
|
985
1796
|
train_model_with_feedback()
|
|
986
1797
|
|
|
987
1798
|
|
|
@@ -994,7 +1805,9 @@ def show_evaluate_models_tab():
|
|
|
994
1805
|
if not model_metrics.empty:
|
|
995
1806
|
# Model selection for evaluation
|
|
996
1807
|
selected_model = st.selectbox(
|
|
997
|
-
"Select Model to Evaluate",
|
|
1808
|
+
"Select Model to Evaluate",
|
|
1809
|
+
model_metrics["model_name"].tolist(),
|
|
1810
|
+
help="Choose a trained model to view detailed performance metrics and evaluation charts.",
|
|
998
1811
|
)
|
|
999
1812
|
|
|
1000
1813
|
# Evaluation metrics
|
|
@@ -1005,13 +1818,29 @@ def show_evaluate_models_tab():
|
|
|
1005
1818
|
model_data = model_metrics[model_metrics["model_name"] == selected_model].iloc[0]
|
|
1006
1819
|
|
|
1007
1820
|
with col1:
|
|
1008
|
-
st.metric(
|
|
1821
|
+
st.metric(
|
|
1822
|
+
"Accuracy",
|
|
1823
|
+
f"{model_data['accuracy']:.2%}",
|
|
1824
|
+
help="Percentage of correct predictions. Measures how often the model's predictions match actual outcomes.",
|
|
1825
|
+
)
|
|
1009
1826
|
with col2:
|
|
1010
|
-
st.metric(
|
|
1827
|
+
st.metric(
|
|
1828
|
+
"Sharpe Ratio",
|
|
1829
|
+
f"{model_data['sharpe_ratio']:.2f}",
|
|
1830
|
+
help="Risk-adjusted return measure. Higher values indicate better returns relative to risk. > 1 is good, > 2 is very good, > 3 is excellent.",
|
|
1831
|
+
)
|
|
1011
1832
|
with col3:
|
|
1012
|
-
st.metric(
|
|
1833
|
+
st.metric(
|
|
1834
|
+
"Status",
|
|
1835
|
+
model_data["status"],
|
|
1836
|
+
help="Current deployment status of the model. 'Deployed' means ready for predictions.",
|
|
1837
|
+
)
|
|
1013
1838
|
with col4:
|
|
1014
|
-
st.metric(
|
|
1839
|
+
st.metric(
|
|
1840
|
+
"Created",
|
|
1841
|
+
model_data.get("created_at", "N/A")[:10],
|
|
1842
|
+
help="Date when this model was trained and saved.",
|
|
1843
|
+
)
|
|
1015
1844
|
|
|
1016
1845
|
# Confusion Matrix Simulation
|
|
1017
1846
|
st.markdown("### 🎯 Confusion Matrix")
|
|
@@ -1032,7 +1861,7 @@ def show_evaluate_models_tab():
|
|
|
1032
1861
|
color_continuous_scale="Blues",
|
|
1033
1862
|
title="Confusion Matrix",
|
|
1034
1863
|
)
|
|
1035
|
-
st.plotly_chart(fig,
|
|
1864
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1036
1865
|
|
|
1037
1866
|
with col2:
|
|
1038
1867
|
# ROC Curve
|
|
@@ -1050,7 +1879,7 @@ def show_evaluate_models_tab():
|
|
|
1050
1879
|
xaxis_title="False Positive Rate",
|
|
1051
1880
|
yaxis_title="True Positive Rate",
|
|
1052
1881
|
)
|
|
1053
|
-
st.plotly_chart(fig,
|
|
1882
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1054
1883
|
|
|
1055
1884
|
# Feature Importance
|
|
1056
1885
|
st.markdown("### 🔍 Feature Importance")
|
|
@@ -1079,7 +1908,7 @@ def show_evaluate_models_tab():
|
|
|
1079
1908
|
color="Importance",
|
|
1080
1909
|
color_continuous_scale="Viridis",
|
|
1081
1910
|
)
|
|
1082
|
-
st.plotly_chart(fig,
|
|
1911
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1083
1912
|
else:
|
|
1084
1913
|
st.info("No models available for evaluation. Train a model first.")
|
|
1085
1914
|
|
|
@@ -1096,6 +1925,7 @@ def show_compare_models_tab():
|
|
|
1096
1925
|
"Select Models to Compare (2-5 models)",
|
|
1097
1926
|
model_metrics["model_name"].tolist(),
|
|
1098
1927
|
default=model_metrics["model_name"].tolist()[: min(3, len(model_metrics))],
|
|
1928
|
+
help="Choose 2-5 models to compare side-by-side. View accuracy, Sharpe ratio, and other metrics across models to identify the best performer.",
|
|
1099
1929
|
)
|
|
1100
1930
|
|
|
1101
1931
|
if len(models_to_compare) >= 2:
|
|
@@ -1134,7 +1964,7 @@ def show_compare_models_tab():
|
|
|
1134
1964
|
)
|
|
1135
1965
|
|
|
1136
1966
|
fig.update_layout(height=400, showlegend=False)
|
|
1137
|
-
st.plotly_chart(fig,
|
|
1967
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1138
1968
|
|
|
1139
1969
|
# Radar chart for multi-metric comparison
|
|
1140
1970
|
st.markdown("### 🎯 Multi-Metric Analysis")
|
|
@@ -1158,11 +1988,11 @@ def show_compare_models_tab():
|
|
|
1158
1988
|
showlegend=True,
|
|
1159
1989
|
title="Model Performance Radar Chart",
|
|
1160
1990
|
)
|
|
1161
|
-
st.plotly_chart(fig,
|
|
1991
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1162
1992
|
|
|
1163
1993
|
# Detailed comparison table
|
|
1164
1994
|
st.markdown("### 📋 Detailed Comparison")
|
|
1165
|
-
st.dataframe(comparison_data,
|
|
1995
|
+
st.dataframe(comparison_data, width="stretch")
|
|
1166
1996
|
else:
|
|
1167
1997
|
st.warning("Please select at least 2 models to compare")
|
|
1168
1998
|
else:
|
|
@@ -1174,49 +2004,304 @@ def show_interactive_predictions_tab():
|
|
|
1174
2004
|
st.subheader("🎮 Interactive Prediction Explorer")
|
|
1175
2005
|
|
|
1176
2006
|
st.markdown("### 🎲 Manual Prediction Input")
|
|
1177
|
-
st.info(
|
|
2007
|
+
st.info(
|
|
2008
|
+
"💡 **How it works**: Input trade details below and click 'Generate Prediction' to see what the model predicts. "
|
|
2009
|
+
"The model analyzes politician track records, market conditions, and trade characteristics to forecast potential returns."
|
|
2010
|
+
)
|
|
2011
|
+
|
|
2012
|
+
# Get politician names for searchable dropdown
|
|
2013
|
+
politician_names = get_politician_names()
|
|
1178
2014
|
|
|
1179
2015
|
col1, col2, col3 = st.columns(3)
|
|
1180
2016
|
|
|
1181
2017
|
with col1:
|
|
1182
|
-
ticker = st.text_input(
|
|
1183
|
-
|
|
1184
|
-
|
|
2018
|
+
ticker = st.text_input(
|
|
2019
|
+
"Ticker Symbol",
|
|
2020
|
+
"AAPL",
|
|
2021
|
+
help="Stock ticker symbol (e.g., AAPL, TSLA, MSFT)",
|
|
2022
|
+
)
|
|
2023
|
+
politician_name = st.selectbox(
|
|
2024
|
+
"Politician Name",
|
|
2025
|
+
options=politician_names,
|
|
2026
|
+
index=0,
|
|
2027
|
+
help="Start typing to search and filter politician names. Data loaded from database.",
|
|
2028
|
+
)
|
|
2029
|
+
transaction_type = st.selectbox(
|
|
2030
|
+
"Transaction Type",
|
|
2031
|
+
["Purchase", "Sale"],
|
|
2032
|
+
help="Type of transaction: Purchase (buying stock) or Sale (selling stock).",
|
|
2033
|
+
)
|
|
1185
2034
|
|
|
1186
2035
|
with col2:
|
|
1187
|
-
amount = st.number_input(
|
|
1188
|
-
|
|
1189
|
-
|
|
2036
|
+
amount = st.number_input(
|
|
2037
|
+
"Transaction Amount ($)",
|
|
2038
|
+
1000,
|
|
2039
|
+
10000000,
|
|
2040
|
+
50000,
|
|
2041
|
+
step=1000,
|
|
2042
|
+
help="Dollar value of the transaction. Larger transactions may have more significant market impact.",
|
|
2043
|
+
)
|
|
2044
|
+
filing_date = st.date_input(
|
|
2045
|
+
"Filing Date",
|
|
2046
|
+
help="Date when the trade was disclosed. Timing relative to market events can be important.",
|
|
2047
|
+
)
|
|
2048
|
+
market_cap = st.selectbox(
|
|
2049
|
+
"Market Cap",
|
|
2050
|
+
["Large Cap", "Mid Cap", "Small Cap"],
|
|
2051
|
+
help="Company size: Large Cap (>$10B), Mid Cap ($2-10B), Small Cap (<$2B). Larger companies tend to be less volatile.",
|
|
2052
|
+
)
|
|
1190
2053
|
|
|
1191
2054
|
with col3:
|
|
1192
2055
|
sector = st.selectbox(
|
|
1193
|
-
"Sector",
|
|
2056
|
+
"Sector",
|
|
2057
|
+
["Technology", "Healthcare", "Finance", "Energy", "Consumer"],
|
|
2058
|
+
help="Industry sector of the stock. Different sectors have different risk/return profiles and react differently to market conditions.",
|
|
1194
2059
|
)
|
|
1195
|
-
sentiment = st.slider(
|
|
1196
|
-
|
|
2060
|
+
sentiment = st.slider(
|
|
2061
|
+
"News Sentiment",
|
|
2062
|
+
-1.0,
|
|
2063
|
+
1.0,
|
|
2064
|
+
0.0,
|
|
2065
|
+
0.1,
|
|
2066
|
+
help="Overall news sentiment about the stock. -1 = very negative, 0 = neutral, +1 = very positive. Based on recent news articles and social media.",
|
|
2067
|
+
)
|
|
2068
|
+
volatility = st.slider(
|
|
2069
|
+
"Volatility Index",
|
|
2070
|
+
0.0,
|
|
2071
|
+
1.0,
|
|
2072
|
+
0.3,
|
|
2073
|
+
0.05,
|
|
2074
|
+
help="Stock price volatility measure. 0 = stable, 1 = highly volatile. Higher volatility means higher risk but potentially higher returns.",
|
|
2075
|
+
)
|
|
2076
|
+
|
|
2077
|
+
# Trading History Section
|
|
2078
|
+
st.markdown("---")
|
|
2079
|
+
st.markdown(f"### 📊 {politician_name}'s Trading History")
|
|
2080
|
+
|
|
2081
|
+
trading_history = get_politician_trading_history(politician_name)
|
|
2082
|
+
|
|
2083
|
+
if not trading_history.empty:
|
|
2084
|
+
# Summary metrics
|
|
2085
|
+
col1, col2, col3, col4 = st.columns(4)
|
|
2086
|
+
|
|
2087
|
+
with col1:
|
|
2088
|
+
total_trades = len(trading_history)
|
|
2089
|
+
st.metric(
|
|
2090
|
+
"Total Trades",
|
|
2091
|
+
total_trades,
|
|
2092
|
+
help="Total number of trading disclosures filed by this politician (last 100 shown).",
|
|
2093
|
+
)
|
|
2094
|
+
|
|
2095
|
+
with col2:
|
|
2096
|
+
# Count transaction types
|
|
2097
|
+
if "transaction_type" in trading_history.columns:
|
|
2098
|
+
purchases = len(trading_history[trading_history["transaction_type"] == "Purchase"])
|
|
2099
|
+
st.metric(
|
|
2100
|
+
"Purchases",
|
|
2101
|
+
purchases,
|
|
2102
|
+
help="Number of purchase transactions. Compare with sales to understand trading behavior.",
|
|
2103
|
+
)
|
|
2104
|
+
else:
|
|
2105
|
+
st.metric("Purchases", "N/A")
|
|
2106
|
+
|
|
2107
|
+
with col3:
|
|
2108
|
+
# Count unique tickers
|
|
2109
|
+
if "ticker_symbol" in trading_history.columns:
|
|
2110
|
+
unique_tickers = trading_history["ticker_symbol"].nunique()
|
|
2111
|
+
st.metric(
|
|
2112
|
+
"Unique Stocks",
|
|
2113
|
+
unique_tickers,
|
|
2114
|
+
help="Number of different stocks traded. Higher diversity may indicate broader market exposure.",
|
|
2115
|
+
)
|
|
2116
|
+
else:
|
|
2117
|
+
st.metric("Unique Stocks", "N/A")
|
|
2118
|
+
|
|
2119
|
+
with col4:
|
|
2120
|
+
# Most recent trade date
|
|
2121
|
+
if "disclosure_date" in trading_history.columns:
|
|
2122
|
+
try:
|
|
2123
|
+
recent_date = pd.to_datetime(trading_history["disclosure_date"]).max()
|
|
2124
|
+
st.metric(
|
|
2125
|
+
"Last Trade",
|
|
2126
|
+
recent_date.strftime("%Y-%m-%d"),
|
|
2127
|
+
help="Date of most recent trading disclosure. Newer trades may be more relevant for predictions.",
|
|
2128
|
+
)
|
|
2129
|
+
except:
|
|
2130
|
+
st.metric("Last Trade", "N/A")
|
|
2131
|
+
else:
|
|
2132
|
+
st.metric("Last Trade", "N/A")
|
|
2133
|
+
|
|
2134
|
+
# Detailed history in expandable section
|
|
2135
|
+
with st.expander("📜 View Detailed Trading History", expanded=False):
|
|
2136
|
+
# Filter options
|
|
2137
|
+
col1, col2 = st.columns(2)
|
|
2138
|
+
|
|
2139
|
+
with col1:
|
|
2140
|
+
# Transaction type filter
|
|
2141
|
+
if "transaction_type" in trading_history.columns:
|
|
2142
|
+
trans_types = ["All"] + list(trading_history["transaction_type"].unique())
|
|
2143
|
+
trans_filter = st.selectbox("Filter by Transaction Type", trans_types)
|
|
2144
|
+
else:
|
|
2145
|
+
trans_filter = "All"
|
|
2146
|
+
|
|
2147
|
+
with col2:
|
|
2148
|
+
# Show recent N trades
|
|
2149
|
+
show_trades = st.slider("Show Last N Trades", 5, 50, 10, step=5)
|
|
2150
|
+
|
|
2151
|
+
# Apply filters
|
|
2152
|
+
filtered_history = trading_history.copy()
|
|
2153
|
+
if trans_filter != "All" and "transaction_type" in filtered_history.columns:
|
|
2154
|
+
filtered_history = filtered_history[
|
|
2155
|
+
filtered_history["transaction_type"] == trans_filter
|
|
2156
|
+
]
|
|
2157
|
+
|
|
2158
|
+
# Display trades
|
|
2159
|
+
st.dataframe(
|
|
2160
|
+
filtered_history.head(show_trades),
|
|
2161
|
+
width="stretch",
|
|
2162
|
+
height=300,
|
|
2163
|
+
)
|
|
1197
2164
|
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
import time
|
|
2165
|
+
# Visualizations
|
|
2166
|
+
if len(filtered_history) > 0:
|
|
2167
|
+
st.markdown("#### 📈 Trading Patterns")
|
|
1202
2168
|
|
|
1203
|
-
|
|
2169
|
+
viz_col1, viz_col2 = st.columns(2)
|
|
1204
2170
|
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
2171
|
+
with viz_col1:
|
|
2172
|
+
# Transaction type distribution
|
|
2173
|
+
if "transaction_type" in filtered_history.columns:
|
|
2174
|
+
trans_dist = filtered_history["transaction_type"].value_counts()
|
|
2175
|
+
fig = px.pie(
|
|
2176
|
+
values=trans_dist.values,
|
|
2177
|
+
names=trans_dist.index,
|
|
2178
|
+
title="Transaction Type Distribution",
|
|
2179
|
+
)
|
|
2180
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
2181
|
+
|
|
2182
|
+
with viz_col2:
|
|
2183
|
+
# Top traded stocks
|
|
2184
|
+
if "ticker_symbol" in filtered_history.columns:
|
|
2185
|
+
top_stocks = filtered_history["ticker_symbol"].value_counts().head(10)
|
|
2186
|
+
fig = px.bar(
|
|
2187
|
+
x=top_stocks.values,
|
|
2188
|
+
y=top_stocks.index,
|
|
2189
|
+
orientation="h",
|
|
2190
|
+
title="Top 10 Most Traded Stocks",
|
|
2191
|
+
labels={"x": "Number of Trades", "y": "Ticker"},
|
|
2192
|
+
)
|
|
2193
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
2194
|
+
|
|
2195
|
+
# Timeline of trades
|
|
2196
|
+
if "disclosure_date" in filtered_history.columns:
|
|
2197
|
+
st.markdown("#### 📅 Trading Timeline")
|
|
2198
|
+
try:
|
|
2199
|
+
timeline_df = filtered_history.copy()
|
|
2200
|
+
timeline_df["disclosure_date"] = pd.to_datetime(
|
|
2201
|
+
timeline_df["disclosure_date"]
|
|
2202
|
+
)
|
|
2203
|
+
timeline_df = timeline_df.sort_values("disclosure_date")
|
|
2204
|
+
|
|
2205
|
+
# Count trades per month
|
|
2206
|
+
# Convert to month string directly to avoid PeriodArray timezone warning
|
|
2207
|
+
timeline_df["month"] = timeline_df["disclosure_date"].dt.strftime("%Y-%m")
|
|
2208
|
+
monthly_trades = (
|
|
2209
|
+
timeline_df.groupby("month").size().reset_index(name="count")
|
|
2210
|
+
)
|
|
2211
|
+
|
|
2212
|
+
fig = px.line(
|
|
2213
|
+
monthly_trades,
|
|
2214
|
+
x="month",
|
|
2215
|
+
y="count",
|
|
2216
|
+
title="Trading Activity Over Time",
|
|
2217
|
+
labels={"month": "Month", "count": "Number of Trades"},
|
|
2218
|
+
markers=True,
|
|
2219
|
+
)
|
|
2220
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
2221
|
+
except Exception as e:
|
|
2222
|
+
st.info("Timeline visualization not available")
|
|
2223
|
+
|
|
2224
|
+
else:
|
|
2225
|
+
st.info(
|
|
2226
|
+
f"📭 No trading history found for {politician_name}. "
|
|
2227
|
+
"This could mean: (1) No trades on record, (2) Data not yet synced, or (3) Name not in database."
|
|
2228
|
+
)
|
|
2229
|
+
|
|
2230
|
+
st.markdown("---")
|
|
2231
|
+
|
|
2232
|
+
# Technical details about prediction system
|
|
2233
|
+
with st.expander("ℹ️ About the Prediction System"):
|
|
2234
|
+
st.markdown(
|
|
2235
|
+
"""
|
|
2236
|
+
### How Predictions Work
|
|
2237
|
+
|
|
2238
|
+
**Current Implementation** (Production Mode):
|
|
2239
|
+
|
|
2240
|
+
This system uses a **feature-engineered prediction pipeline** with real data analysis:
|
|
2241
|
+
|
|
2242
|
+
1. **Load Latest Model**: Fetches the most recent trained model from `/models` directory
|
|
2243
|
+
2. **Feature Engineering**: Transforms input data using a 10-feature pipeline:
|
|
2244
|
+
- **Politician Performance**: Historical trading volume, purchase ratio, stock diversity
|
|
2245
|
+
- **Transaction Characteristics**: Purchase/sale indicator, amount (log-scaled & normalized)
|
|
2246
|
+
- **Market Indicators**: Market cap score, sector risk assessment
|
|
2247
|
+
- **Sentiment & Volatility**: News sentiment scores, price volatility measures
|
|
2248
|
+
- **Timing Analysis**: Trade recency score with decay function
|
|
2249
|
+
3. **Model Inference**: Runs preprocessed data through feature-weighted scoring model
|
|
2250
|
+
4. **Result Generation**: Produces 4 key metrics:
|
|
2251
|
+
- **Recommendation**: BUY/SELL/HOLD based on weighted score
|
|
2252
|
+
- **Predicted Return**: Expected return percentage
|
|
2253
|
+
- **Confidence**: Prediction confidence (50%-95%)
|
|
2254
|
+
- **Risk Level**: Risk assessment (Low/Medium/High)
|
|
2255
|
+
|
|
2256
|
+
**Next Steps** (Neural Network Integration):
|
|
2257
|
+
- Load PyTorch model from training pipeline
|
|
2258
|
+
- Run inference with trained neural network weights
|
|
2259
|
+
- Replace weighted scoring with deep learning predictions
|
|
2260
|
+
- See `docs/model_training_guide.md` for training instructions
|
|
2261
|
+
|
|
2262
|
+
**Prediction Quality Factors**:
|
|
2263
|
+
- Politician's historical trading success (15% weight)
|
|
2264
|
+
- News sentiment analysis (20% weight)
|
|
2265
|
+
- Price volatility (12% weight, negative impact)
|
|
2266
|
+
- Transaction timing and market conditions
|
|
2267
|
+
- Sector-specific risk profiles
|
|
2268
|
+
"""
|
|
2269
|
+
)
|
|
2270
|
+
|
|
2271
|
+
if st.button("🔮 Generate Prediction", width="stretch"):
|
|
2272
|
+
# PRODUCTION MODE: Real model inference
|
|
2273
|
+
with st.spinner("🔬 Engineering features and running model inference..."):
|
|
2274
|
+
# 1. Load latest model
|
|
2275
|
+
model_file, model_metadata = load_latest_model()
|
|
2276
|
+
|
|
2277
|
+
# 2. Engineer features from input data
|
|
2278
|
+
features = engineer_features(
|
|
2279
|
+
ticker=ticker,
|
|
2280
|
+
politician_name=politician_name,
|
|
2281
|
+
transaction_type=transaction_type,
|
|
2282
|
+
amount=amount,
|
|
2283
|
+
filing_date=filing_date,
|
|
2284
|
+
market_cap=market_cap,
|
|
2285
|
+
sector=sector,
|
|
2286
|
+
sentiment=sentiment,
|
|
2287
|
+
volatility=volatility,
|
|
2288
|
+
trading_history=trading_history,
|
|
2289
|
+
)
|
|
2290
|
+
|
|
2291
|
+
# 3. Generate prediction
|
|
2292
|
+
prediction = generate_production_prediction(features, model_metadata)
|
|
1208
2293
|
|
|
1209
2294
|
# Display results
|
|
2295
|
+
st.success(
|
|
2296
|
+
f"✅ **Production Mode**: Using {prediction['model_used']} | "
|
|
2297
|
+
f"Features: {len(features)} engineered"
|
|
2298
|
+
)
|
|
1210
2299
|
st.markdown("### 🎯 Prediction Results")
|
|
1211
2300
|
|
|
1212
|
-
col1, col2, col3 = st.columns(
|
|
2301
|
+
col1, col2, col3, col4 = st.columns(4)
|
|
1213
2302
|
|
|
1214
2303
|
with col1:
|
|
1215
|
-
recommendation =
|
|
1216
|
-
"BUY"
|
|
1217
|
-
if prediction_score > 0.6
|
|
1218
|
-
else "SELL" if prediction_score < 0.4 else "HOLD"
|
|
1219
|
-
)
|
|
2304
|
+
recommendation = prediction["recommendation"]
|
|
1220
2305
|
color = (
|
|
1221
2306
|
"green"
|
|
1222
2307
|
if recommendation == "BUY"
|
|
@@ -1225,36 +2310,82 @@ def show_interactive_predictions_tab():
|
|
|
1225
2310
|
st.markdown(f"**Recommendation**: :{color}[{recommendation}]")
|
|
1226
2311
|
|
|
1227
2312
|
with col2:
|
|
1228
|
-
st.metric(
|
|
2313
|
+
st.metric(
|
|
2314
|
+
"Predicted Return",
|
|
2315
|
+
f"{prediction['predicted_return']:.1%}",
|
|
2316
|
+
help="Expected return based on model analysis. Positive = profit, negative = loss.",
|
|
2317
|
+
)
|
|
1229
2318
|
|
|
1230
2319
|
with col3:
|
|
1231
|
-
st.metric(
|
|
2320
|
+
st.metric(
|
|
2321
|
+
"Confidence",
|
|
2322
|
+
f"{prediction['confidence']:.0%}",
|
|
2323
|
+
help="Model confidence in this prediction. Higher = more certain.",
|
|
2324
|
+
)
|
|
1232
2325
|
|
|
1233
|
-
|
|
1234
|
-
|
|
2326
|
+
with col4:
|
|
2327
|
+
risk_color = (
|
|
2328
|
+
"🔴"
|
|
2329
|
+
if prediction["risk_score"] > 0.7
|
|
2330
|
+
else "🟡" if prediction["risk_score"] > 0.4 else "🟢"
|
|
2331
|
+
)
|
|
2332
|
+
st.metric(
|
|
2333
|
+
"Risk Level",
|
|
2334
|
+
f"{risk_color} {prediction['risk_score']:.2f}",
|
|
2335
|
+
help="Risk score (0-1). Higher = riskier trade.",
|
|
2336
|
+
)
|
|
1235
2337
|
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
2338
|
+
# Prediction breakdown - show actual feature contributions
|
|
2339
|
+
st.markdown("### 📊 Feature Analysis")
|
|
2340
|
+
|
|
2341
|
+
# Display top contributing features
|
|
2342
|
+
feature_contributions = {}
|
|
2343
|
+
weights = {
|
|
2344
|
+
"politician_trade_count": ("Politician Experience", 0.15),
|
|
2345
|
+
"politician_purchase_ratio": ("Buy/Sell Ratio", 0.10),
|
|
2346
|
+
"politician_diversity": ("Portfolio Diversity", 0.08),
|
|
2347
|
+
"transaction_is_purchase": ("Transaction Type", 0.12),
|
|
2348
|
+
"transaction_amount_normalized": ("Transaction Size", 0.10),
|
|
2349
|
+
"market_cap_score": ("Company Size", 0.08),
|
|
2350
|
+
"sector_risk": ("Sector Risk", -0.10),
|
|
2351
|
+
"sentiment_score": ("News Sentiment", 0.20),
|
|
2352
|
+
"volatility_score": ("Market Volatility", -0.12),
|
|
2353
|
+
"timing_score": ("Market Timing", 0.09),
|
|
1242
2354
|
}
|
|
1243
2355
|
|
|
2356
|
+
for feature, value in features.items():
|
|
2357
|
+
if feature in weights:
|
|
2358
|
+
label, weight = weights[feature]
|
|
2359
|
+
# Contribution = feature value * weight
|
|
2360
|
+
contribution = value * abs(weight)
|
|
2361
|
+
feature_contributions[label] = contribution
|
|
2362
|
+
|
|
2363
|
+
# Sort by contribution
|
|
2364
|
+
sorted_features = sorted(
|
|
2365
|
+
feature_contributions.items(), key=lambda x: x[1], reverse=True
|
|
2366
|
+
)
|
|
2367
|
+
|
|
1244
2368
|
factor_df = pd.DataFrame(
|
|
1245
|
-
{
|
|
2369
|
+
{
|
|
2370
|
+
"Feature": [f[0] for f in sorted_features],
|
|
2371
|
+
"Contribution": [f[1] for f in sorted_features],
|
|
2372
|
+
}
|
|
1246
2373
|
)
|
|
1247
2374
|
|
|
1248
2375
|
fig = px.bar(
|
|
1249
2376
|
factor_df,
|
|
1250
|
-
x="
|
|
1251
|
-
y="
|
|
2377
|
+
x="Contribution",
|
|
2378
|
+
y="Feature",
|
|
1252
2379
|
orientation="h",
|
|
1253
|
-
title="
|
|
1254
|
-
color="
|
|
2380
|
+
title="Feature Contributions to Prediction",
|
|
2381
|
+
color="Contribution",
|
|
1255
2382
|
color_continuous_scale="RdYlGn",
|
|
1256
2383
|
)
|
|
1257
|
-
st.plotly_chart(fig,
|
|
2384
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
2385
|
+
|
|
2386
|
+
# Show raw feature values in expandable section
|
|
2387
|
+
with st.expander("🔍 View Engineered Features"):
|
|
2388
|
+
st.json(features)
|
|
1258
2389
|
|
|
1259
2390
|
|
|
1260
2391
|
def show_performance_tracking_tab():
|
|
@@ -1263,7 +2394,9 @@ def show_performance_tracking_tab():
|
|
|
1263
2394
|
|
|
1264
2395
|
# Time range selector
|
|
1265
2396
|
time_range = st.selectbox(
|
|
1266
|
-
"Select Time Range",
|
|
2397
|
+
"Select Time Range",
|
|
2398
|
+
["Last 7 Days", "Last 30 Days", "Last 90 Days", "All Time"],
|
|
2399
|
+
help="Choose time period to view model performance trends. Longer periods show overall stability, shorter periods show recent changes.",
|
|
1267
2400
|
)
|
|
1268
2401
|
|
|
1269
2402
|
# Generate time series data
|
|
@@ -1292,7 +2425,7 @@ def show_performance_tracking_tab():
|
|
|
1292
2425
|
yaxis_title="Accuracy",
|
|
1293
2426
|
hovermode="x unified",
|
|
1294
2427
|
)
|
|
1295
|
-
st.plotly_chart(fig,
|
|
2428
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1296
2429
|
|
|
1297
2430
|
# Prediction volume and success rate
|
|
1298
2431
|
st.markdown("### 📈 Prediction Metrics")
|
|
@@ -1308,7 +2441,7 @@ def show_performance_tracking_tab():
|
|
|
1308
2441
|
go.Bar(x=dates, y=predictions_per_day, name="Predictions", marker_color="lightblue")
|
|
1309
2442
|
)
|
|
1310
2443
|
fig.update_layout(title="Daily Prediction Volume", xaxis_title="Date", yaxis_title="Count")
|
|
1311
|
-
st.plotly_chart(fig,
|
|
2444
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1312
2445
|
|
|
1313
2446
|
with col2:
|
|
1314
2447
|
# Success rate
|
|
@@ -1331,7 +2464,7 @@ def show_performance_tracking_tab():
|
|
|
1331
2464
|
yaxis_title="Success Rate",
|
|
1332
2465
|
yaxis_tickformat=".0%",
|
|
1333
2466
|
)
|
|
1334
|
-
st.plotly_chart(fig,
|
|
2467
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1335
2468
|
|
|
1336
2469
|
# Data drift detection
|
|
1337
2470
|
st.markdown("### 🔍 Data Drift Detection")
|
|
@@ -1361,7 +2494,7 @@ def show_performance_tracking_tab():
|
|
|
1361
2494
|
color_discrete_map={"Normal": "green", "Warning": "orange", "Alert": "red"},
|
|
1362
2495
|
title="Feature Drift Detection",
|
|
1363
2496
|
)
|
|
1364
|
-
st.plotly_chart(fig,
|
|
2497
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1365
2498
|
|
|
1366
2499
|
with col2:
|
|
1367
2500
|
st.markdown("**Drift Status**")
|
|
@@ -1391,7 +2524,13 @@ def show_predictions():
|
|
|
1391
2524
|
col1, col2, col3 = st.columns(3)
|
|
1392
2525
|
|
|
1393
2526
|
with col1:
|
|
1394
|
-
min_confidence = st.slider(
|
|
2527
|
+
min_confidence = st.slider(
|
|
2528
|
+
"Min Confidence",
|
|
2529
|
+
0.0,
|
|
2530
|
+
1.0,
|
|
2531
|
+
0.5,
|
|
2532
|
+
help="Filter predictions by minimum confidence level. Higher values show only high-confidence predictions.",
|
|
2533
|
+
)
|
|
1395
2534
|
|
|
1396
2535
|
with col2:
|
|
1397
2536
|
recommendation_filter = st.selectbox(
|
|
@@ -1401,10 +2540,15 @@ def show_predictions():
|
|
|
1401
2540
|
if "recommendation" in predictions
|
|
1402
2541
|
else ["All"]
|
|
1403
2542
|
),
|
|
2543
|
+
help="Filter by recommendation type: BUY (positive outlook), SELL (negative outlook), or HOLD (neutral).",
|
|
1404
2544
|
)
|
|
1405
2545
|
|
|
1406
2546
|
with col3:
|
|
1407
|
-
sort_by = st.selectbox(
|
|
2547
|
+
sort_by = st.selectbox(
|
|
2548
|
+
"Sort By",
|
|
2549
|
+
["predicted_return", "confidence", "risk_score"],
|
|
2550
|
+
help="Sort predictions by: predicted return (highest gains first), confidence (most certain first), or risk score (lowest risk first).",
|
|
2551
|
+
)
|
|
1408
2552
|
|
|
1409
2553
|
# Apply filters
|
|
1410
2554
|
filtered_predictions = predictions.copy()
|
|
@@ -1466,7 +2610,7 @@ def show_predictions():
|
|
|
1466
2610
|
hover_data=["ticker"] if "ticker" in filtered_predictions else None,
|
|
1467
2611
|
title="Risk-Return Analysis",
|
|
1468
2612
|
)
|
|
1469
|
-
st.plotly_chart(fig,
|
|
2613
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1470
2614
|
|
|
1471
2615
|
with col2:
|
|
1472
2616
|
# Top movers
|
|
@@ -1485,7 +2629,7 @@ def show_predictions():
|
|
|
1485
2629
|
color_continuous_scale="RdYlGn",
|
|
1486
2630
|
title="Top Movers (Predicted)",
|
|
1487
2631
|
)
|
|
1488
|
-
st.plotly_chart(fig,
|
|
2632
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1489
2633
|
else:
|
|
1490
2634
|
st.warning("No predictions available. Check if the ML pipeline is running correctly.")
|
|
1491
2635
|
else:
|
|
@@ -1534,7 +2678,7 @@ def show_lsh_jobs():
|
|
|
1534
2678
|
lsh_jobs["timestamp"] = pd.to_datetime(lsh_jobs["timestamp"])
|
|
1535
2679
|
|
|
1536
2680
|
# Group by hour
|
|
1537
|
-
hourly_jobs = lsh_jobs.set_index("timestamp").resample("
|
|
2681
|
+
hourly_jobs = lsh_jobs.set_index("timestamp").resample("1h").size()
|
|
1538
2682
|
|
|
1539
2683
|
fig = px.line(
|
|
1540
2684
|
x=hourly_jobs.index,
|
|
@@ -1542,7 +2686,7 @@ def show_lsh_jobs():
|
|
|
1542
2686
|
title="Job Executions Over Time",
|
|
1543
2687
|
labels={"x": "Time", "y": "Job Count"},
|
|
1544
2688
|
)
|
|
1545
|
-
st.plotly_chart(fig,
|
|
2689
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1546
2690
|
except:
|
|
1547
2691
|
pass
|
|
1548
2692
|
else:
|
|
@@ -1640,7 +2784,7 @@ def show_system_health():
|
|
|
1640
2784
|
)
|
|
1641
2785
|
|
|
1642
2786
|
fig.update_layout(height=500, showlegend=False)
|
|
1643
|
-
st.plotly_chart(fig,
|
|
2787
|
+
st.plotly_chart(fig, width="stretch", config={"responsive": True})
|
|
1644
2788
|
|
|
1645
2789
|
|
|
1646
2790
|
# Run the main dashboard function
|