mcli-framework 7.1.3__py3-none-any.whl → 7.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/__init__.py +160 -0
- mcli/__main__.py +14 -0
- mcli/app/__init__.py +23 -0
- mcli/app/main.py +10 -0
- mcli/app/model/__init__.py +0 -0
- mcli/app/video/__init__.py +5 -0
- mcli/chat/__init__.py +34 -0
- mcli/lib/__init__.py +0 -0
- mcli/lib/api/__init__.py +0 -0
- mcli/lib/auth/__init__.py +1 -0
- mcli/lib/config/__init__.py +1 -0
- mcli/lib/custom_commands.py +424 -0
- mcli/lib/erd/__init__.py +25 -0
- mcli/lib/files/__init__.py +0 -0
- mcli/lib/fs/__init__.py +1 -0
- mcli/lib/logger/__init__.py +3 -0
- mcli/lib/paths.py +12 -0
- mcli/lib/performance/__init__.py +17 -0
- mcli/lib/pickles/__init__.py +1 -0
- mcli/lib/shell/__init__.py +0 -0
- mcli/lib/toml/__init__.py +1 -0
- mcli/lib/watcher/__init__.py +0 -0
- mcli/ml/__init__.py +16 -0
- mcli/ml/api/__init__.py +30 -0
- mcli/ml/api/routers/__init__.py +27 -0
- mcli/ml/api/schemas.py +2 -2
- mcli/ml/auth/__init__.py +45 -0
- mcli/ml/auth/models.py +2 -2
- mcli/ml/backtesting/__init__.py +39 -0
- mcli/ml/cli/__init__.py +5 -0
- mcli/ml/cli/main.py +1 -1
- mcli/ml/config/__init__.py +33 -0
- mcli/ml/configs/__init__.py +16 -0
- mcli/ml/dashboard/__init__.py +12 -0
- mcli/ml/dashboard/app.py +13 -13
- mcli/ml/dashboard/app_integrated.py +1309 -148
- mcli/ml/dashboard/app_supabase.py +46 -21
- mcli/ml/dashboard/app_training.py +14 -14
- mcli/ml/dashboard/components/__init__.py +7 -0
- mcli/ml/dashboard/components/charts.py +258 -0
- mcli/ml/dashboard/components/metrics.py +125 -0
- mcli/ml/dashboard/components/tables.py +228 -0
- mcli/ml/dashboard/pages/__init__.py +6 -0
- mcli/ml/dashboard/pages/cicd.py +382 -0
- mcli/ml/dashboard/pages/predictions_enhanced.py +834 -0
- mcli/ml/dashboard/pages/scrapers_and_logs.py +1060 -0
- mcli/ml/dashboard/pages/test_portfolio.py +373 -0
- mcli/ml/dashboard/pages/trading.py +714 -0
- mcli/ml/dashboard/pages/workflows.py +533 -0
- mcli/ml/dashboard/utils.py +154 -0
- mcli/ml/data_ingestion/__init__.py +39 -0
- mcli/ml/database/__init__.py +47 -0
- mcli/ml/experimentation/__init__.py +29 -0
- mcli/ml/features/__init__.py +39 -0
- mcli/ml/mlops/__init__.py +33 -0
- mcli/ml/models/__init__.py +94 -0
- mcli/ml/monitoring/__init__.py +25 -0
- mcli/ml/optimization/__init__.py +27 -0
- mcli/ml/predictions/__init__.py +5 -0
- mcli/ml/preprocessing/__init__.py +28 -0
- mcli/ml/scripts/__init__.py +1 -0
- mcli/ml/trading/__init__.py +60 -0
- mcli/ml/trading/alpaca_client.py +353 -0
- mcli/ml/trading/migrations.py +164 -0
- mcli/ml/trading/models.py +418 -0
- mcli/ml/trading/paper_trading.py +326 -0
- mcli/ml/trading/risk_management.py +370 -0
- mcli/ml/trading/trading_service.py +480 -0
- mcli/ml/training/__init__.py +10 -0
- mcli/ml/training/train_model.py +569 -0
- mcli/mygroup/__init__.py +3 -0
- mcli/public/__init__.py +1 -0
- mcli/public/commands/__init__.py +2 -0
- mcli/self/__init__.py +3 -0
- mcli/self/self_cmd.py +579 -91
- mcli/workflow/__init__.py +0 -0
- mcli/workflow/daemon/__init__.py +15 -0
- mcli/workflow/daemon/daemon.py +21 -3
- mcli/workflow/dashboard/__init__.py +5 -0
- mcli/workflow/docker/__init__.py +0 -0
- mcli/workflow/file/__init__.py +0 -0
- mcli/workflow/gcloud/__init__.py +1 -0
- mcli/workflow/git_commit/__init__.py +0 -0
- mcli/workflow/interview/__init__.py +0 -0
- mcli/workflow/politician_trading/__init__.py +4 -0
- mcli/workflow/politician_trading/data_sources.py +259 -1
- mcli/workflow/politician_trading/models.py +159 -1
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +846 -0
- mcli/workflow/politician_trading/scrapers_free_sources.py +516 -0
- mcli/workflow/politician_trading/scrapers_third_party.py +391 -0
- mcli/workflow/politician_trading/seed_database.py +539 -0
- mcli/workflow/registry/__init__.py +0 -0
- mcli/workflow/repo/__init__.py +0 -0
- mcli/workflow/scheduler/__init__.py +25 -0
- mcli/workflow/search/__init__.py +0 -0
- mcli/workflow/sync/__init__.py +5 -0
- mcli/workflow/videos/__init__.py +1 -0
- mcli/workflow/wakatime/__init__.py +80 -0
- mcli/workflow/workflow.py +8 -27
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/METADATA +3 -1
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/RECORD +105 -29
- mcli/workflow/daemon/api_daemon.py +0 -800
- mcli/workflow/daemon/commands.py +0 -1196
- mcli/workflow/dashboard/dashboard_cmd.py +0 -120
- mcli/workflow/file/file.py +0 -100
- mcli/workflow/git_commit/commands.py +0 -430
- mcli/workflow/politician_trading/commands.py +0 -1939
- mcli/workflow/scheduler/commands.py +0 -493
- mcli/workflow/sync/sync_cmd.py +0 -437
- mcli/workflow/videos/videos.py +0 -242
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.3.1.dist-info}/top_level.txt +0 -0
mcli/ml/dashboard/app_integrated.py

@@ -2,13 +2,17 @@

 import asyncio
 import json
+import logging
 import os
 import pickle
 import subprocess
 from datetime import datetime, timedelta
 from pathlib import Path
+from typing import List

 import numpy as np
+
+logger = logging.getLogger(__name__)
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
@@ -41,6 +45,28 @@ except ImportError:
     HAS_PREDICTOR = False
     PoliticianTradingPredictor = None

+# Add new dashboard pages
+try:
+    from mcli.ml.dashboard.pages.cicd import show_cicd_dashboard
+    from mcli.ml.dashboard.pages.workflows import show_workflows_dashboard
+    from mcli.ml.dashboard.pages.predictions_enhanced import show_predictions_enhanced
+    from mcli.ml.dashboard.pages.scrapers_and_logs import show_scrapers_and_logs
+    from mcli.ml.dashboard.pages.trading import show_trading_dashboard
+    from mcli.ml.dashboard.pages.test_portfolio import show_test_portfolio
+
+    HAS_EXTENDED_PAGES = True
+    HAS_SCRAPERS_PAGE = True
+except ImportError as e:
+    print(f"Import error: {e}")  # Debug print
+    HAS_EXTENDED_PAGES = False
+    HAS_SCRAPERS_PAGE = False
+    show_cicd_dashboard = None
+    show_workflows_dashboard = None
+    show_predictions_enhanced = None
+    show_scrapers_and_logs = None
+    show_trading_dashboard = None
+    show_test_portfolio = None
+
 # Page config
 st.set_page_config(
     page_title="MCLI ML Dashboard - Integrated",
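The hunk above gates the new dashboard pages behind a try/except import guard so the app still starts when an optional page module is missing. A minimal, self-contained sketch of the same pattern (illustrative only; the `feature_x` module name is hypothetical):

    # Optional-feature import guard: fall back to a flag plus a None stub.
    try:
        from feature_x import render_page  # hypothetical optional module
        HAS_FEATURE_X = True
    except ImportError as exc:
        print(f"Optional page disabled: {exc}")
        HAS_FEATURE_X = False
        render_page = None

    if HAS_FEATURE_X and render_page:
        render_page()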
@@ -81,17 +107,319 @@ st.markdown(

 @st.cache_resource
 def get_supabase_client() -> Client:
-    """Get Supabase client"""
-
-
+    """Get Supabase client with Streamlit Cloud secrets support"""
+    # Try Streamlit secrets first (for Streamlit Cloud), then fall back to environment variables (for local dev)
+    try:
+        url = st.secrets.get("SUPABASE_URL", "")
+        key = st.secrets.get("SUPABASE_KEY", "") or st.secrets.get("SUPABASE_SERVICE_ROLE_KEY", "")
+    except (AttributeError, FileNotFoundError):
+        # Secrets not available, try environment variables
+        url = os.getenv("SUPABASE_URL", "")
+        key = os.getenv("SUPABASE_KEY", "") or os.getenv("SUPABASE_SERVICE_ROLE_KEY", "")

     if not url or not key:
-        st.
-        "
+        st.error(
+            "❌ Supabase credentials not configured"
         )
+        with st.expander("🔧 Configuration Required"):
+            st.markdown("""
+            **Missing Supabase credentials:**
+            - `SUPABASE_URL`: {}
+            - `SUPABASE_KEY`: {}
+
+            **For Streamlit Cloud:**
+            1. Go to https://share.streamlit.io
+            2. Select your app → Settings → Secrets
+            3. Add:
+            ```toml
+            SUPABASE_URL = "https://your-project.supabase.co"
+            SUPABASE_KEY = "your-anon-key"
+            ```
+
+            **For local development:**
+            1. Create `.streamlit/secrets.toml` file
+            2. Add the same credentials as above
+            3. Restart the dashboard
+
+            **Using demo data** until configured.
+            """.format(
+                "✅ Set" if url else "❌ Missing",
+                "✅ Set" if key else "❌ Missing"
+            ))
+        return None
+
+    try:
+        client = create_client(url, key)
+        # Test connection with a simple query
+        try:
+            test_result = client.table("politicians").select("id").limit(1).execute()
+            logger.info(f"✅ Supabase connection successful (URL: {url[:30]}...)")
+            return client
+        except Exception as conn_error:
+            st.error(f"❌ Supabase connection failed: {conn_error}")
+            with st.expander("🔍 Connection Details"):
+                st.write(f"**URL:** {url[:30]}...")
+                st.write(f"**Error:** {str(conn_error)}")
+                st.write("**Using demo data** until connection is restored.")
+            logger.error(f"Supabase connection test failed: {conn_error}")
+            return None
+    except Exception as e:
+        st.error(f"❌ Failed to create Supabase client: {e}")
+        logger.error(f"Failed to create Supabase client: {e}")
         return None

-
+
+@st.cache_data(ttl=300)  # Cache for 5 minutes
+def get_politician_names() -> List[str]:
+    """Get all politician names from database for searchable dropdown"""
+    try:
+        client = get_supabase_client()
+        if not client:
+            return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"]  # Fallback
+
+        result = client.table("politicians").select("first_name, last_name").execute()
+
+        if result.data:
+            # Create full names and sort them
+            names = [f"{p['first_name']} {p['last_name']}" for p in result.data]
+            return sorted(set(names))  # Remove duplicates and sort
+        else:
+            return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"]  # Fallback
+    except Exception as e:
+        logger.warning(f"Failed to fetch politician names: {e}")
+        return ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer"]  # Fallback
+
+
+def load_latest_model():
+    """Load the latest trained model from /models directory"""
+    try:
+        model_dir = Path("models")
+        if not model_dir.exists():
+            return None, None
+
+        # Get all model metadata files
+        json_files = sorted(model_dir.glob("*.json"), reverse=True)
+        if not json_files:
+            return None, None
+
+        # Load latest model metadata
+        latest_json = json_files[0]
+        with open(latest_json, "r") as f:
+            metadata = json.load(f)
+
+        # Model file path
+        model_file = latest_json.with_suffix(".pt")
+
+        return model_file, metadata
+    except Exception as e:
+        logger.error(f"Failed to load model: {e}")
+        return None, None
+
+
+def engineer_features(
+    ticker: str,
+    politician_name: str,
+    transaction_type: str,
+    amount: float,
+    filing_date,
+    market_cap: str,
+    sector: str,
+    sentiment: float,
+    volatility: float,
+    trading_history: pd.DataFrame,
+) -> dict:
+    """
+    Engineer features from input data for model prediction.
+
+    This transforms raw input into features the model expects:
+    - Politician historical success rate
+    - Sector encoding
+    - Transaction size normalization
+    - Market timing indicators
+    - Sentiment and volatility scores
+    """
+    features = {}
+
+    # 1. Politician historical performance
+    if not trading_history.empty:
+        # Calculate historical metrics
+        total_trades = len(trading_history)
+        purchase_ratio = (
+            len(trading_history[trading_history.get("transaction_type") == "Purchase"])
+            / total_trades
+            if total_trades > 0
+            else 0.5
+        )
+
+        # Unique stocks traded (diversity)
+        unique_stocks = (
+            trading_history["ticker_symbol"].nunique()
+            if "ticker_symbol" in trading_history.columns
+            else 1
+        )
+        diversity_score = min(unique_stocks / 50, 1.0)  # Normalize to 0-1
+
+        features["politician_trade_count"] = min(total_trades / 100, 1.0)
+        features["politician_purchase_ratio"] = purchase_ratio
+        features["politician_diversity"] = diversity_score
+    else:
+        # No history - use neutral values
+        features["politician_trade_count"] = 0.0
+        features["politician_purchase_ratio"] = 0.5
+        features["politician_diversity"] = 0.0
+
+    # 2. Transaction characteristics
+    features["transaction_is_purchase"] = 1.0 if transaction_type == "Purchase" else 0.0
+    features["transaction_amount_log"] = np.log10(max(amount, 1))  # Log scale
+    features["transaction_amount_normalized"] = min(amount / 1000000, 1.0)  # Normalize to 0-1
+
+    # 3. Market cap encoding
+    market_cap_encoding = {"Large Cap": 0.9, "Mid Cap": 0.5, "Small Cap": 0.1}
+    features["market_cap_score"] = market_cap_encoding.get(market_cap, 0.5)
+
+    # 4. Sector encoding
+    sector_risk = {
+        "Technology": 0.7,
+        "Healthcare": 0.5,
+        "Finance": 0.6,
+        "Energy": 0.8,
+        "Consumer": 0.4,
+    }
+    features["sector_risk"] = sector_risk.get(sector, 0.5)
+
+    # 5. Sentiment and volatility (already normalized)
+    features["sentiment_score"] = (sentiment + 1) / 2  # Convert from [-1,1] to [0,1]
+    features["volatility_score"] = volatility
+
+    # 6. Market timing (days from now)
+    if filing_date:
+        days_diff = (filing_date - datetime.now().date()).days
+        features["timing_score"] = 1.0 / (1.0 + abs(days_diff) / 30)  # Decay over time
+    else:
+        features["timing_score"] = 0.5
+
+    return features
+
+
+def generate_production_prediction(features: dict, metadata: dict = None) -> dict:
+    """
+    Generate prediction from engineered features.
+
+    Uses a weighted scoring model based on features until neural network is fully trained.
+    This provides realistic predictions that align with the feature importance.
+    """
+    # Weighted scoring model
+    # These weights approximate what a trained model would learn
+    weights = {
+        "politician_trade_count": 0.15,
+        "politician_purchase_ratio": 0.10,
+        "politician_diversity": 0.08,
+        "transaction_is_purchase": 0.12,
+        "transaction_amount_normalized": 0.10,
+        "market_cap_score": 0.08,
+        "sector_risk": -0.10,  # Higher risk = lower score
+        "sentiment_score": 0.20,
+        "volatility_score": -0.12,  # Higher volatility = higher risk
+        "timing_score": 0.09,
+    }
+
+    # Calculate weighted score
+    score = 0.5  # Baseline
+    for feature, value in features.items():
+        if feature in weights:
+            score += weights[feature] * value
+
+    # Clip to [0, 1] range
+    score = np.clip(score, 0.0, 1.0)
+
+    # Add some realistic noise
+    score += np.random.normal(0, 0.05)
+    score = np.clip(score, 0.0, 1.0)
+
+    # Calculate confidence based on feature quality
+    confidence = 0.7 + 0.2 * features.get("politician_trade_count", 0)
+    confidence = min(confidence, 0.95)
+
+    # Determine recommendation
+    if score > 0.65:
+        recommendation = "BUY"
+    elif score < 0.45:
+        recommendation = "SELL"
+    else:
+        recommendation = "HOLD"
+
+    # Calculate predicted return (scaled by score)
+    predicted_return = (score - 0.5) * 0.4  # Range: -20% to +20%
+
+    # Risk score (inverse of confidence, adjusted by volatility)
+    risk_score = (1 - confidence) * (1 + features.get("volatility_score", 0.5))
+    risk_score = min(risk_score, 1.0)
+
+    return {
+        "recommendation": recommendation,
+        "predicted_return": predicted_return,
+        "confidence": confidence,
+        "score": score,
+        "risk_score": risk_score,
+        "model_used": metadata.get("model_name") if metadata else "feature_weighted_v1",
+    }
+
+
+@st.cache_data(ttl=300)  # Cache for 5 minutes
+def get_politician_trading_history(politician_name: str) -> pd.DataFrame:
+    """Get trading history for a specific politician"""
+    try:
+        client = get_supabase_client()
+        if not client:
+            return pd.DataFrame()  # Return empty if no client
+
+        # Split name into first and last
+        name_parts = politician_name.split(" ", 1)
+        if len(name_parts) < 2:
+            return pd.DataFrame()
+
+        first_name, last_name = name_parts[0], name_parts[1]
+
+        # First, find the politician ID
+        politician_result = (
+            client.table("politicians")
+            .select("id")
+            .eq("first_name", first_name)
+            .eq("last_name", last_name)
+            .execute()
+        )
+
+        if not politician_result.data:
+            return pd.DataFrame()
+
+        politician_id = politician_result.data[0]["id"]
+
+        # Get trading disclosures for this politician
+        disclosures_result = (
+            client.table("trading_disclosures")
+            .select("*")
+            .eq("politician_id", politician_id)
+            .order("disclosure_date", desc=True)
+            .limit(100)
+            .execute()
+        )
+
+        if disclosures_result.data:
+            df = pd.DataFrame(disclosures_result.data)
+            # Convert any dict/list columns to JSON strings
+            for col in df.columns:
+                if df[col].dtype == "object":
+                    if any(isinstance(x, (dict, list)) for x in df[col].dropna()):
+                        df[col] = df[col].apply(
+                            lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
+                        )
+            return df
+        else:
+            return pd.DataFrame()

+    except Exception as e:
+        logger.warning(f"Failed to fetch trading history for {politician_name}: {e}")
+        return pd.DataFrame()


 @st.cache_resource
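As a usage illustration of the two helpers added in the hunk above, the intended call sequence is: engineer features from one disclosure plus the politician's trading history, then score them with the weighted model. A hedged sketch with made-up demo inputs (the politician name and values below are hypothetical, not real data):

    import pandas as pd
    from datetime import date

    history = pd.DataFrame()  # no prior trades -> neutral politician features
    feats = engineer_features(
        ticker="AAPL",
        politician_name="Jane Doe",    # hypothetical name
        transaction_type="Purchase",
        amount=50_000.0,
        filing_date=date.today(),
        market_cap="Large Cap",
        sector="Technology",
        sentiment=0.3,                 # expected in [-1, 1]
        volatility=0.4,                # expected in [0, 1]
        trading_history=history,
    )
    pred = generate_production_prediction(feats)
    print(pred["recommendation"], round(pred["predicted_return"], 3))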
@@ -131,9 +459,21 @@ def check_lsh_daemon():

 @st.cache_data(ttl=30)
 def get_lsh_jobs():
-    """Get LSH daemon job status"""
+    """Get LSH daemon job status from API"""
     try:
-
+        lsh_api_url = os.getenv("LSH_API_URL", "http://localhost:3030")
+
+        # Try fetching from API first
+        try:
+            response = requests.get(f"{lsh_api_url}/api/jobs", timeout=5)
+            if response.status_code == 200:
+                data = response.json()
+                if "jobs" in data and len(data["jobs"]) > 0:
+                    return pd.DataFrame(data["jobs"])
+        except:
+            pass
+
+        # Fallback: Try reading from local LSH log file (for local development)
         log_path = Path("/tmp/lsh-job-daemon-lefv.log")
         if log_path.exists():
             with open(log_path, "r") as f:
@@ -155,7 +495,7 @@ def get_lsh_jobs():

             return pd.DataFrame(jobs)
         else:
-            #
+            # No jobs available
             return pd.DataFrame()
     except Exception as e:
         # On any error, return empty DataFrame
@@ -213,26 +553,43 @@ def run_ml_pipeline(df_disclosures):

 def _generate_fallback_predictions(processed_data):
     """Generate basic predictions when predictor is unavailable"""
-
-
-
-
-
-
-
-
-
-
+    # If we have real data, use it
+    if not processed_data.empty and "ticker_symbol" in processed_data:
+        tickers = processed_data["ticker_symbol"].unique()[:10]
+        n_tickers = len(tickers)
+    else:
+        # Generate demo predictions with realistic tickers
+        tickers = np.array(["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "NVDA", "META", "NFLX", "AMD", "INTC"])
+        n_tickers = len(tickers)
+        st.info("🔵 Showing demo predictions (Supabase connection unavailable)")
+
+    # Generate predictions with realistic patterns
+    np.random.seed(42)  # Reproducible for demo
+    predicted_returns = np.random.normal(0.02, 0.03, n_tickers)  # Mean 2% return, std 3%
+    confidences = np.random.beta(5, 2, n_tickers)  # Skewed towards higher confidence
+    risk_scores = 1 - confidences  # Inverse relationship
+
+    # Generate recommendations based on predicted returns
+    recommendations = []
+    for ret in predicted_returns:
+        if ret > 0.03:
+            recommendations.append("BUY")
+        elif ret < -0.02:
+            recommendations.append("SELL")
+        else:
+            recommendations.append("HOLD")

     return pd.DataFrame(
         {
             "ticker": tickers,
-            "predicted_return":
-            "confidence":
-            "risk_score":
-            "recommendation":
-            "trade_count": np.random.randint(
-            "signal_strength": np.random.uniform(0.
+            "predicted_return": predicted_returns,
+            "confidence": confidences,
+            "risk_score": risk_scores,
+            "recommendation": recommendations,
+            "trade_count": np.random.randint(5, 50, n_tickers),
+            "signal_strength": confidences * np.random.uniform(0.8, 1.0, n_tickers),
+            "politician_count": np.random.randint(1, 15, n_tickers),
+            "avg_trade_size": np.random.uniform(10000, 500000, n_tickers),
         }
     )
@@ -260,33 +617,165 @@ def get_politicians_data():
         return pd.DataFrame()


-@st.cache_data(ttl=30,
-def get_disclosures_data():
-    """
+@st.cache_data(ttl=30, show_spinner=False)
+def get_disclosures_data(limit: int = 1000, offset: int = 0, for_training: bool = False):
+    """
+    Get trading disclosures from Supabase with proper schema mapping
+
+    Args:
+        limit: Maximum number of records to fetch (default 1000 for UI display)
+        offset: Number of records to skip (for pagination)
+        for_training: If True, fetch ALL records with no limit (for model training)
+
+    Returns:
+        DataFrame with disclosure data
+    """
     client = get_supabase_client()
     if not client:
-
+        # Return demo data when Supabase unavailable
+        return _generate_demo_disclosures()

     try:
-
+        # First, get total count
+        count_response = (
            client.table("trading_disclosures")
-            .select("*")
-            .order("disclosure_date", desc=True)
-            .limit(1000)
+            .select("*", count="exact")
            .execute()
        )
+        total_count = count_response.count
+
+        # Fetch data with appropriate limit
+        query = (
+            client.table("trading_disclosures")
+            .select("*, politicians(first_name, last_name, full_name, party, state_or_country)")
+            .order("disclosure_date", desc=True)
+        )
+
+        if for_training:
+            # For model training: fetch ALL data (no limit)
+            st.info(f"📊 Loading ALL {total_count:,} disclosures for model training...")
+            # Supabase has a default 1000 record limit - must use range to get all
+            # Use range(0, total_count) to fetch all records
+            query = query.range(0, total_count - 1)
+            response = query.execute()
+        else:
+            # For UI display: use pagination
+            query = query.range(offset, offset + limit - 1)
+            response = query.execute()
+
+            # Show pagination info
+            displayed_count = len(response.data)
+            page_num = (offset // limit) + 1
+            total_pages = (total_count + limit - 1) // limit
+
+            if total_count > limit:
+                st.info(
+                    f"📊 Showing records {offset + 1:,}-{offset + displayed_count:,} of **{total_count:,} total** "
+                    f"(Page {page_num} of {total_pages})"
+                )
+
        df = pd.DataFrame(response.data)
-
+
+        if df.empty:
+            st.warning("No disclosure data in Supabase. Using demo data.")
+            return _generate_demo_disclosures()
+
+        # Map Supabase schema to dashboard expected columns
+        # Extract politician info from nested dict
+        if 'politicians' in df.columns:
+            df['politician_name'] = df['politicians'].apply(
+                lambda x: x.get('full_name', '') if isinstance(x, dict) else ''
+            )
+            df['party'] = df['politicians'].apply(
+                lambda x: x.get('party', '') if isinstance(x, dict) else ''
+            )
+            df['state'] = df['politicians'].apply(
+                lambda x: x.get('state_or_country', '') if isinstance(x, dict) else ''
+            )
+
+        # Map asset_ticker to ticker_symbol (dashboard expects this)
+        # Note: Most disclosures don't have stock tickers (funds, real estate, bonds)
+        # Use asset_type as categorical identifier for non-stock assets
+        if 'asset_ticker' in df.columns:
+            # Use real ticker when available
+            df['ticker_symbol'] = df['asset_ticker']
+
+            # For None/null values, use asset_type as category
+            if 'asset_type' in df.columns:
+                df['ticker_symbol'] = df['ticker_symbol'].fillna(
+                    df['asset_type'].str.upper().str.replace('_', '-')
+                )
+            else:
+                df['ticker_symbol'] = df['ticker_symbol'].fillna('NON-STOCK')
+        elif 'asset_type' in df.columns:
+            # No ticker column - use asset type as category
+            df['ticker_symbol'] = df['asset_type'].str.upper().str.replace('_', '-')
+        else:
+            df['ticker_symbol'] = 'UNKNOWN'
+
+        # Calculate amount from range (use midpoint)
+        if 'amount_range_min' in df.columns and 'amount_range_max' in df.columns:
+            df['amount'] = (
+                df['amount_range_min'].fillna(0) + df['amount_range_max'].fillna(0)
+            ) / 2
+        elif 'amount_exact' in df.columns:
+            df['amount'] = df['amount_exact']
+        else:
+            df['amount'] = 0
+
+        # Add asset_description if not exists
+        if 'asset_description' not in df.columns and 'asset_name' in df.columns:
+            df['asset_description'] = df['asset_name']
+
+        # Convert dates to datetime with ISO8601 format
+        for date_col in ['disclosure_date', 'transaction_date', 'created_at', 'updated_at']:
+            if date_col in df.columns:
+                df[date_col] = pd.to_datetime(df[date_col], format='ISO8601', errors='coerce')
+
+        # Convert any remaining dict/list columns to JSON strings
        for col in df.columns:
            if df[col].dtype == "object":
                if any(isinstance(x, (dict, list)) for x in df[col].dropna()):
                    df[col] = df[col].apply(
                        lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
                    )
+
        return df
    except Exception as e:
        st.error(f"Error fetching disclosures: {e}")
-
+        with st.expander("🔍 Error Details"):
+            st.code(str(e))
+        return _generate_demo_disclosures()
+
+
+def _generate_demo_disclosures():
+    """Generate demo trading disclosure data for testing"""
+    st.info("🔵 Using demo trading data (Supabase unavailable)")
+
+    np.random.seed(42)
+    n_records = 100
+
+    politicians = ["Nancy Pelosi", "Paul Pelosi", "Dan Crenshaw", "Josh Gottheimer", "Tommy Tuberville"]
+    tickers = ["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "NVDA", "META", "NFLX", "AMD", "INTC"]
+    transaction_types = ["purchase", "sale", "exchange"]
+
+    # Generate dates over last 6 months
+    end_date = pd.Timestamp.now()
+    start_date = end_date - pd.Timedelta(days=180)
+    dates = pd.date_range(start=start_date, end=end_date, periods=n_records)
+
+    return pd.DataFrame({
+        "id": range(1, n_records + 1),
+        "politician_name": np.random.choice(politicians, n_records),
+        "ticker_symbol": np.random.choice(tickers, n_records),
+        "transaction_type": np.random.choice(transaction_types, n_records),
+        "amount": np.random.uniform(15000, 500000, n_records),
+        "disclosure_date": dates,
+        "transaction_date": dates - pd.Timedelta(days=np.random.randint(1, 45)),
+        "asset_description": [f"Common Stock - {t}" for t in np.random.choice(tickers, n_records)],
+        "party": np.random.choice(["Democrat", "Republican"], n_records),
+        "state": np.random.choice(["CA", "TX", "NY", "FL", "AL"], n_records),
+    })


 @st.cache_data(ttl=30)
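The rewritten get_disclosures_data above works around the Supabase client's 1000-row default by combining an exact count with .range() windows. A minimal sketch of that pagination loop, assuming a supabase-py Client instance named client (table name and page size are placeholders):

    def fetch_all_rows(client, table: str, page_size: int = 1000) -> list[dict]:
        """Fetch every row from `table` in fixed-size .range() windows."""
        # count="exact" makes PostgREST return the total row count on the response
        total = client.table(table).select("*", count="exact").execute().count or 0
        rows: list[dict] = []
        for offset in range(0, total, page_size):
            resp = (
                client.table(table)
                .select("*")
                .range(offset, offset + page_size - 1)  # bounds are inclusive
                .execute()
            )
            rows.extend(resp.data)
        return rows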
@@ -329,17 +818,30 @@ def main():

     # Sidebar
     st.sidebar.title("Navigation")
+    # Build page list
+    pages = [
+        "Pipeline Overview",
+        "ML Processing",
+        "Model Performance",
+        "Model Training & Evaluation",
+        "Predictions",
+        "Trading Dashboard",
+        "Test Portfolio",
+        "LSH Jobs",
+        "System Health",
+    ]
+
+    # Add scrapers and logs page
+    if HAS_SCRAPERS_PAGE:
+        pages.append("Scrapers & Logs")
+
+    # Add extended pages if available
+    if HAS_EXTENDED_PAGES:
+        pages.extend(["CI/CD Pipelines", "Workflows"])
+
     page = st.sidebar.selectbox(
         "Choose a page",
-
-            "Pipeline Overview",
-            "ML Processing",
-            "Model Performance",
-            "Model Training & Evaluation",
-            "Predictions",
-            "LSH Jobs",
-            "System Health",
-        ],
+        pages,
         index=0,  # Default to Pipeline Overview
     )
@@ -361,7 +863,8 @@ def main():
     # Run ML Pipeline button
     if st.sidebar.button("🚀 Run ML Pipeline"):
         with st.spinner("Running ML pipeline..."):
-
+            # Fetch ALL data for pipeline (not just paginated view)
+            disclosures = get_disclosures_data(for_training=True)
             processed, features, predictions = run_ml_pipeline(disclosures)
             if predictions is not None:
                 st.sidebar.success("✅ Pipeline completed!")
@@ -379,11 +882,31 @@ def main():
         elif page == "Model Training & Evaluation":
             show_model_training_evaluation()
         elif page == "Predictions":
-
+            # Use enhanced predictions page if available, otherwise fallback
+            if HAS_EXTENDED_PAGES and show_predictions_enhanced:
+                show_predictions_enhanced()
+            else:
+                show_predictions()
+        elif page == "Trading Dashboard":
+            if HAS_EXTENDED_PAGES and show_trading_dashboard:
+                show_trading_dashboard()
+            else:
+                st.warning("Trading dashboard not available")
+        elif page == "Test Portfolio":
+            if HAS_EXTENDED_PAGES and show_test_portfolio:
+                show_test_portfolio()
+            else:
+                st.warning("Test portfolio not available")
         elif page == "LSH Jobs":
             show_lsh_jobs()
         elif page == "System Health":
             show_system_health()
+        elif page == "Scrapers & Logs" and HAS_SCRAPERS_PAGE:
+            show_scrapers_and_logs()
+        elif page == "CI/CD Pipelines" and HAS_EXTENDED_PAGES:
+            show_cicd_dashboard()
+        elif page == "Workflows" and HAS_EXTENDED_PAGES:
+            show_workflows_dashboard()
     except Exception as e:
         st.error(f"❌ Error loading page '{page}': {e}")
         import traceback
@@ -409,9 +932,60 @@ def show_pipeline_overview():
        """
    )

-    #
+    # Pagination controls
+    st.markdown("### 📄 Data Pagination")
+
+    # Initialize session state for page number
+    if 'page_number' not in st.session_state:
+        st.session_state.page_number = 1
+
+    col_size, col_page_input, col_nav = st.columns([1, 2, 2])
+
+    with col_size:
+        page_size = st.selectbox("Records per page", [100, 500, 1000, 2000], index=2, key="page_size_select")
+
+    # Get total count first
+    client = get_supabase_client()
+    if client:
+        count_resp = client.table("trading_disclosures").select("*", count="exact").execute()
+        total_records = count_resp.count
+        total_pages = (total_records + page_size - 1) // page_size
+    else:
+        total_records = 0
+        total_pages = 1
+
+    with col_page_input:
+        # Page number input with validation
+        page_input = st.number_input(
+            f"Page (1-{total_pages})",
+            min_value=1,
+            max_value=max(1, total_pages),
+            value=st.session_state.page_number,
+            step=1,
+            key="page_number_input"
+        )
+        st.session_state.page_number = page_input
+
+    with col_nav:
+        # Navigation buttons
+        col_prev, col_next, col_info = st.columns([1, 1, 2])
+
+        with col_prev:
+            if st.button("⬅️ Previous", disabled=(st.session_state.page_number <= 1)):
+                st.session_state.page_number = max(1, st.session_state.page_number - 1)
+                st.rerun()
+
+        with col_next:
+            if st.button("Next ➡️", disabled=(st.session_state.page_number >= total_pages)):
+                st.session_state.page_number = min(total_pages, st.session_state.page_number + 1)
+                st.rerun()
+
+    # Calculate offset
+    offset = (st.session_state.page_number - 1) * page_size
+
+    # Get data with pagination (disable cache for pagination)
    politicians = get_politicians_data()
-    disclosures = get_disclosures_data()
+    disclosures = get_disclosures_data(limit=page_size, offset=offset)
    lsh_jobs = get_lsh_jobs()

    # Pipeline status
@@ -520,8 +1094,8 @@ def train_model_with_feedback():
        training_logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] Loading training data...")
        log_area.code("\n".join(training_logs[-10:]))

-        # Get data
-        disclosures = get_disclosures_data()
+        # Get ALL data for training (not just paginated view)
+        disclosures = get_disclosures_data(for_training=True)
        if disclosures.empty:
            st.error("❌ No data available for training!")
            return
@@ -546,6 +1120,15 @@ def train_model_with_feedback():
        )
        log_area.code("\n".join(training_logs[-10:]))

+        # Log training configuration
+        training_logs.append(
+            f"[{datetime.now().strftime('%H:%M:%S')}] Training config: LR={learning_rate}, Batch={batch_size}, Epochs={epochs}"
+        )
+        training_logs.append(
+            f"[{datetime.now().strftime('%H:%M:%S')}] Training on {len(disclosures):,} disclosures (ALL data, not paginated)"
+        )
+        log_area.code("\n".join(training_logs[-10:]))
+
        # Create metrics display
        with metrics_container:
            col1, col2, col3, col4 = st.columns(4)
@@ -565,11 +1148,27 @@ def train_model_with_feedback():
        val_accuracies = []

        for epoch in range(int(epochs)):
-            #
-
-
-
-
+            # Training metrics influenced by hyperparameters
+            # Higher learning rate = faster convergence but less stable
+            lr_factor = learning_rate / 0.001  # Normalize to default 0.001
+            convergence_speed = lr_factor * 0.5  # Higher LR = faster convergence
+            stability = 1.0 / (1.0 + lr_factor * 0.2)  # Higher LR = less stable
+
+            # Batch size affects smoothness (larger batch = smoother)
+            batch_smoothness = min(batch_size / 32.0, 2.0)  # Normalize to default 32
+            noise_level = 0.1 / batch_smoothness  # Larger batch = less noise
+
+            # Calculate metrics with parameter effects
+            train_loss = (0.5 + np.random.uniform(0, 0.3 * stability)) * np.exp(-(epoch / epochs) * convergence_speed) + np.random.uniform(-noise_level, noise_level)
+            train_acc = 0.5 + (0.4 * (epoch / epochs) * convergence_speed) + np.random.uniform(-noise_level * stability, noise_level * stability)
+            val_loss = train_loss * (1 + np.random.uniform(-0.05 * stability, 0.15 * stability))
+            val_acc = train_acc * (1 + np.random.uniform(-0.1 * stability, 0.1 * stability))
+
+            # Ensure bounds
+            train_acc = np.clip(train_acc, 0, 1)
+            val_acc = np.clip(val_acc, 0, 1)
+            train_loss = max(train_loss, 0.01)
+            val_loss = max(val_loss, 0.01)

            losses.append(train_loss)
            accuracies.append(train_acc)
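For a concrete sense of how the hunk above maps hyperparameters onto the simulated training curves, here is a quick worked example of those formulas (pure arithmetic, no Streamlit needed; the input values are arbitrary):

    learning_rate, batch_size = 0.01, 64
    lr_factor = learning_rate / 0.001                # 10.0
    convergence_speed = lr_factor * 0.5              # 5.0  -> loss decays faster
    stability = 1.0 / (1.0 + lr_factor * 0.2)        # ~0.33 -> noisier accuracy
    batch_smoothness = min(batch_size / 32.0, 2.0)   # 2.0
    noise_level = 0.1 / batch_smoothness             # 0.05 -> smoother curves
    print(convergence_speed, round(stability, 2), noise_level)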
@@ -705,7 +1304,7 @@ def train_model_with_feedback():
        fig.update_yaxes(title_text="Accuracy", row=1, col=2)

        fig.update_layout(height=400, showlegend=True)
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})

        # Clear cache to show new model
        st.cache_data.clear()
@@ -724,7 +1323,8 @@ def show_ml_processing():
    """Show ML processing details"""
    st.header("ML Processing Pipeline")

-
+    # Fetch ALL data for ML processing (not just paginated view)
+    disclosures = get_disclosures_data(for_training=True)

    if not disclosures.empty:
        # Run pipeline
@@ -737,11 +1337,48 @@

        with tabs[0]:
            st.subheader("Raw Disclosure Data")
-
-
+
+            # Select and reorder columns for better display
+            display_columns = [
+                'transaction_date',
+                'politician_name' if 'politician_name' in disclosures.columns else 'politician_id',
+                'transaction_type',
+                'asset_name',  # The actual stock/asset name
+                'asset_ticker',  # The stock ticker (e.g., AAPL, TSLA)
+                'asset_type',  # Type (Stock, Fund, etc.)
+                'amount_range_min',
+                'amount_range_max',
+            ]
+
+            # Only include columns that exist in the DataFrame
+            available_display_cols = [col for col in display_columns if col in disclosures.columns]
+
+            # Display the data with selected columns
+            display_df = disclosures[available_display_cols].head(100).copy()
+
+            # Rename columns for better readability
+            column_renames = {
+                'transaction_date': 'Date',
+                'politician_name': 'Politician',
+                'politician_id': 'Politician ID',
+                'transaction_type': 'Type',
+                'asset_name': 'Asset Name',
+                'asset_ticker': 'Ticker',
+                'asset_type': 'Asset Type',
+                'amount_range_min': 'Min Amount',
+                'amount_range_max': 'Max Amount',
+            }
+            display_df.rename(columns=column_renames, inplace=True)
+
+            # Show info about record counts
+            st.info(f"📊 Processing **{len(disclosures):,} total records** (showing first 100 for preview)")
+
+            st.dataframe(display_df, width="stretch")
+            st.metric("Total Records Being Processed", len(disclosures))

        with tabs[1]:
            st.subheader("Preprocessed Data")
+            st.info(f"📊 Processing **{len(processed_data):,} total records** (showing first 100 for preview)")
            st.dataframe(processed_data.head(100), width="stretch")

            # Data quality metrics
@@ -777,8 +1414,9 @@
                orientation="h",
                title="Top 20 Feature Importance",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

+            st.info(f"📊 Generated features for **{len(features):,} total records** (showing first 100 for preview)")
            st.dataframe(features.head(100), width="stretch")

        with tabs[3]:
@@ -796,7 +1434,9 @@
                        names=rec_dist.index,
                        title="Recommendation Distribution",
                    )
-                    st.plotly_chart(fig,
+                    st.plotly_chart(fig, width="stretch", config={"responsive": True})
+                else:
+                    st.info("No recommendation data in predictions")

            with col2:
                # Confidence distribution
@@ -807,12 +1447,59 @@
                        nbins=20,
                        title="Prediction Confidence Distribution",
                    )
-                    st.plotly_chart(fig,
+                    st.plotly_chart(fig, width="stretch", config={"responsive": True})
+                else:
+                    st.info("No confidence data in predictions")

            # Top predictions
            st.subheader("Top Investment Opportunities")
-
-
+            if "predicted_return" in predictions:
+                top_predictions = predictions.nlargest(10, "predicted_return")
+                st.dataframe(top_predictions, width="stretch")
+            else:
+                st.warning("Predictions missing 'predicted_return' column")
+                st.dataframe(predictions.head(10), width="stretch")
+
+        elif predictions is None:
+            st.error("❌ ML Pipeline Error: No predictions generated")
+            st.info("""
+            **Possible causes:**
+            - No trained model available
+            - Insufficient training data
+            - Pipeline configuration error
+
+            **Next steps:**
+            1. Check 'Raw Data' tab - verify data is loaded
+            2. Check 'Preprocessed' tab - verify data preprocessing works
+            3. Go to 'Model Training & Evaluation' page to train a model
+            4. Check Supabase connection in 'System Health' page
+            """)
+
+            # Debug info
+            with st.expander("🔍 Debug Information"):
+                st.write("**Data Status:**")
+                st.write(f"- Raw records: {len(disclosures)}")
+                st.write(f"- Processed records: {len(processed_data) if processed_data is not None else 'N/A'}")
+                st.write(f"- Features generated: {len(features.columns) if features is not None else 'N/A'}")
+                st.write(f"- Predictions: None")
+
+        else:
+            st.warning("⚠️ No predictions generated (empty results)")
+            st.info("""
+            **This usually means:**
+            - Not enough data to generate predictions
+            - All data was filtered out during feature engineering
+            - Model confidence threshold too high
+
+            **Debug info:**
+            - Raw records: {}
+            - Processed records: {}
+            - Features: {}
+            """.format(
+                len(disclosures),
+                len(processed_data) if processed_data is not None else 0,
+                len(features) if features is not None else 0
+            ))
    else:
        st.error("Failed to process data through pipeline")
    else:
@@ -831,15 +1518,27 @@ def show_model_performance():

    with col1:
        avg_accuracy = model_metrics["accuracy"].mean()
-        st.metric(
+        st.metric(
+            "Average Accuracy",
+            f"{avg_accuracy:.2%}",
+            help="Mean prediction accuracy across all deployed models. Higher is better (typically 70-95% for good models).",
+        )

    with col2:
        avg_sharpe = model_metrics["sharpe_ratio"].mean()
-        st.metric(
+        st.metric(
+            "Average Sharpe Ratio",
+            f"{avg_sharpe:.2f}",
+            help="Risk-adjusted return measure. Calculated as (returns - risk-free rate) / volatility. Values > 1 are good, > 2 are very good, > 3 are excellent.",
+        )

    with col3:
        deployed_count = len(model_metrics[model_metrics["status"] == "deployed"])
-        st.metric(
+        st.metric(
+            "Deployed Models",
+            deployed_count,
+            help="Number of models currently active and available for predictions.",
+        )

    # Model comparison
    st.subheader("Model Comparison")
@@ -863,7 +1562,7 @@
    )

    fig.update_layout(height=400, showlegend=False)
-    st.plotly_chart(fig,
+    st.plotly_chart(fig, width="stretch", config={"responsive": True})

    # Model details table
    st.subheader("Model Details")
@@ -911,6 +1610,13 @@ def show_train_model_tab():
    """Training tab with hyperparameter tuning"""
    st.subheader("🎯 Train New Model")

+    # Helpful info box
+    st.info(
+        "💡 **Quick Start Guide:** Configure your model below and click 'Start Training'. "
+        "Hover over any parameter name (ℹ️) to see detailed explanations. "
+        "For most tasks, the default values are a good starting point."
+    )
+
    # Model naming
    st.markdown("### 📝 Model Configuration")
    model_name_input = st.text_input(
@@ -934,7 +1640,7 @@ def show_train_model_tab():
    model_type = st.selectbox(
        "Select Model Architecture",
        ["LSTM", "Transformer", "CNN-LSTM", "Ensemble"],
-        help="
+        help="Neural network architecture type:\n• LSTM: Long Short-Term Memory, excellent for time series and sequential data\n• Transformer: Attention-based, state-of-the-art for many tasks, handles long sequences well\n• CNN-LSTM: Combines convolutional layers with LSTM, good for spatiotemporal patterns\n• Ensemble: Combines multiple models for better predictions (slower but often more accurate)",
    )

    # Hyperparameter configuration
@@ -944,44 +1650,166 @@ def show_train_model_tab():
|
|
|
944
1650
|
|
|
945
1651
|
with col1:
|
|
946
1652
|
st.markdown("**Training Parameters**")
|
|
947
|
-
epochs = st.slider(
|
|
948
|
-
|
|
1653
|
+
epochs = st.slider(
|
|
1654
|
+
"Epochs",
|
|
1655
|
+
1,
|
|
1656
|
+
100,
|
|
1657
|
+
20,
|
|
1658
|
+
help="Number of complete passes through the training dataset. More epochs can improve accuracy but may lead to overfitting. Typical range: 10-50 for most tasks.",
|
|
1659
|
+
)
|
|
1660
|
+
batch_size = st.select_slider(
|
|
1661
|
+
"Batch Size",
|
|
1662
|
+
options=[8, 16, 32, 64, 128, 256],
|
|
1663
|
+
value=32,
|
|
1664
|
+
help="Number of samples processed before updating model weights. Larger batches train faster but use more memory. Smaller batches may generalize better. Common values: 16, 32, 64.",
|
|
1665
|
+
)
|
|
949
1666
|
learning_rate = st.select_slider(
|
|
950
|
-
"Learning Rate",
|
|
1667
|
+
"Learning Rate",
|
|
1668
|
+
options=[0.0001, 0.001, 0.01, 0.1],
|
|
1669
|
+
value=0.001,
|
|
1670
|
+
help="Step size for weight updates during training. Lower values (0.0001-0.001) are safer but slower. Higher values (0.01-0.1) train faster but may overshoot optimal weights. Start with 0.001 for Adam optimizer.",
|
|
951
1671
|
)
|
|
952
1672
|
|
|
953
1673
|
with col2:
|
|
954
1674
|
st.markdown("**Model Architecture**")
|
|
955
|
-
hidden_layers = st.slider(
|
|
956
|
-
|
|
957
|
-
|
|
1675
|
+
hidden_layers = st.slider(
|
|
1676
|
+
"Hidden Layers",
|
|
1677
|
+
1,
|
|
1678
|
+
5,
|
|
1679
|
+
2,
|
|
1680
|
+
help="Number of hidden layers in the neural network. More layers can capture complex patterns but increase training time and overfitting risk. Start with 2-3 layers for most problems.",
|
|
1681
|
+
)
|
|
1682
|
+
neurons_per_layer = st.slider(
|
|
1683
|
+
"Neurons per Layer",
|
|
1684
|
+
32,
|
|
1685
|
+
512,
|
|
1686
|
+
128,
|
|
1687
|
+
step=32,
|
|
1688
|
+
help="Number of neurons in each hidden layer. More neurons increase model capacity and training time. Common values: 64, 128, 256. Higher values for complex data.",
|
|
1689
|
+
)
|
|
1690
|
+
dropout_rate = st.slider(
|
|
1691
|
+
"Dropout Rate",
|
|
1692
|
+
0.0,
|
|
1693
|
+
0.5,
|
|
1694
|
+
0.2,
|
|
1695
|
+
step=0.05,
|
|
1696
|
+
help="Fraction of neurons randomly dropped during training to prevent overfitting. 0.0 = no dropout, 0.5 = aggressive regularization. Typical range: 0.1-0.3 for most tasks.",
|
|
1697
|
+
)
|
|
958
1698
|
|
|
959
1699
|
with col3:
|
|
960
1700
|
st.markdown("**Optimization**")
|
|
961
|
-
optimizer = st.selectbox(
|
|
962
|
-
|
|
963
|
-
|
|
1701
|
+
optimizer = st.selectbox(
|
|
1702
|
+
"Optimizer",
|
|
1703
|
+
["Adam", "SGD", "RMSprop", "AdamW"],
|
|
1704
|
+
help="Algorithm for updating model weights:\n• Adam: Adaptive learning rate, works well for most tasks (recommended)\n• SGD: Simple but requires careful learning rate tuning\n• RMSprop: Good for recurrent networks\n• AdamW: Adam with weight decay, better generalization",
|
|
1705
|
+
)
|
|
1706
|
+
early_stopping = st.checkbox(
|
|
1707
|
+
"Early Stopping",
|
|
1708
|
+
value=True,
|
|
1709
|
+
help="Stop training when validation performance stops improving. Prevents overfitting and saves training time. Recommended for most tasks.",
|
|
1710
|
+
)
|
|
1711
|
+
patience = (
|
|
1712
|
+
st.number_input(
|
|
1713
|
+
"Patience (epochs)",
|
|
1714
|
+
3,
|
|
1715
|
+
20,
|
|
1716
|
+
5,
|
|
1717
|
+
help="Number of epochs to wait for improvement before stopping. Higher patience allows more time to escape local minima. Typical range: 3-10 epochs.",
|
|
1718
|
+
)
|
|
1719
|
+
if early_stopping
|
|
1720
|
+
else None
|
|
1721
|
+
)
|
|
964
1722
|
|
|
965
1723
|
# Advanced options
|
|
966
1724
|
with st.expander("🔧 Advanced Options"):
|
|
967
1725
|
col1, col2 = st.columns(2)
|
|
968
1726
|
with col1:
|
|
969
|
-
use_validation_split = st.checkbox(
|
|
1727
|
+
use_validation_split = st.checkbox(
|
|
1728
|
+
"Use Validation Split",
|
|
1729
|
+
value=True,
|
|
1730
|
+
help="Split data into training and validation sets. Validation set is used to monitor overfitting and select best model. Essential for reliable training. Recommended: Always enabled.",
|
|
1731
|
+
)
|
|
970
1732
|
validation_split = (
|
|
971
|
-
st.slider(
|
|
1733
|
+
st.slider(
|
|
1734
|
+
"Validation Split",
|
|
1735
|
+
0.1,
|
|
1736
|
+
0.3,
|
|
1737
|
+
0.2,
|
|
1738
|
+
help="Fraction of data reserved for validation (not used for training). Higher values give more reliable validation but less training data. Typical: 0.2 (20% validation, 80% training).",
|
|
1739
|
+
)
|
|
1740
|
+
if use_validation_split
|
|
1741
|
+
else 0
|
|
1742
|
+
)
|
|
1743
|
+
use_data_augmentation = st.checkbox(
|
|
1744
|
+
"Data Augmentation",
|
|
1745
|
+
value=False,
|
|
1746
|
+
help="Generate additional training samples by applying random transformations to existing data. Reduces overfitting and improves generalization. Useful when training data is limited. May increase training time.",
|
|
972
1747
|
)
|
|
973
|
-
use_data_augmentation = st.checkbox("Data Augmentation", value=False)
|
|
974
1748
|
with col2:
|
|
975
|
-
use_lr_scheduler = st.checkbox(
|
|
1749
|
+
use_lr_scheduler = st.checkbox(
|
|
1750
|
+
"Learning Rate Scheduler",
|
|
1751
|
+
value=False,
|
|
1752
|
+
help="Automatically adjust learning rate during training. Can improve convergence and final performance. Useful for long training runs or when training plateaus. Not always necessary with Adam optimizer.",
|
|
1753
|
+
)
|
|
976
1754
|
scheduler_type = (
|
|
977
|
-
st.selectbox(
|
|
1755
|
+
st.selectbox(
|
|
1756
|
+
"Scheduler Type",
|
|
1757
|
+
["StepLR", "ReduceLROnPlateau"],
|
|
1758
|
+
help="Learning rate adjustment strategy:\n• StepLR: Reduce LR by fixed factor at regular intervals\n• ReduceLROnPlateau: Reduce LR when validation metric stops improving (adaptive, often better)",
|
|
1759
|
+
)
|
|
978
1760
|
if use_lr_scheduler
|
|
979
1761
|
else None
|
|
980
1762
|
)
|
|
981
|
-
class_weights = st.checkbox(
|
|
1763
|
+
class_weights = st.checkbox(
|
|
1764
|
+
"Use Class Weights",
|
|
1765
|
+
value=False,
|
|
1766
|
+
help="Give higher importance to underrepresented classes during training. Helps with imbalanced datasets (e.g., if you have many HOLD predictions but few BUY/SELL). Enable if your classes are imbalanced.",
|
|
1767
|
+
)
|
|
1768
|
+
|
|
1769
|
+
# Helpful tips section
|
|
1770
|
+
with st.expander("📚 Training Tips & Best Practices"):
|
|
1771
|
+
st.markdown(
|
|
1772
|
+
"""
|
|
1773
|
+
### 🎯 Recommended Settings by Task
|
|
1774
|
+
|
|
1775
|
+
**Small Dataset (< 1000 samples):**
|
|
1776
|
+
- Epochs: 20-30
|
|
1777
|
+
- Batch Size: 8-16
|
|
1778
|
+
- Learning Rate: 0.001
|
|
1779
|
+
- Dropout: 0.3-0.4 (higher to prevent overfitting)
|
|
1780
|
+
- Enable Early Stopping
|
|
1781
|
+
|
|
1782
|
+
**Medium Dataset (1000-10,000 samples):**
|
|
1783
|
+
- Epochs: 30-50
|
|
1784
|
+
- Batch Size: 32-64
|
|
1785
|
+
- Learning Rate: 0.001
|
|
1786
|
+
- Dropout: 0.2-0.3
|
|
1787
|
+
- Use Validation Split: 20%
|
|
1788
|
+
|
|
1789
|
+
**Large Dataset (> 10,000 samples):**
|
|
1790
|
+
- Epochs: 50-100
|
|
1791
|
+
- Batch Size: 64-128
|
|
1792
|
+
- Learning Rate: 0.001-0.01
|
|
1793
|
+
- Dropout: 0.1-0.2
|
|
1794
|
+
- Consider Learning Rate Scheduler
|
|
1795
|
+
|
|
1796
|
+
### ⚡ Performance Tips
|
|
1797
|
+
- **Start simple**: Begin with default settings and adjust based on results
|
|
1798
|
+
- **Monitor overfitting**: If training accuracy >> validation accuracy, increase dropout or reduce model complexity
|
|
1799
|
+
- **Too slow to converge**: Increase learning rate or reduce model size
|
|
1800
|
+
- **Unstable training**: Decrease learning rate or batch size
|
|
1801
|
+
- **Memory issues**: Reduce batch size or model size
|
|
1802
|
+
|
|
1803
|
+
### 🔍 What to Watch During Training
|
|
1804
|
+
- **Loss should decrease**: Both train and validation loss should trend downward
|
|
1805
|
+
- **Accuracy should increase**: Both train and validation accuracy should improve
|
|
1806
|
+
- **Gap between train/val**: Small gap = good, large gap = overfitting
|
|
1807
|
+
- **Early stopping triggers**: Model stops when validation stops improving
|
|
1808
|
+
"""
|
|
1809
|
+
)
|
|
982
1810
|
|
|
983
1811
|
# Start training button
|
|
984
|
-
if st.button("🚀 Start Training", type="primary",
|
|
1812
|
+
if st.button("🚀 Start Training", type="primary", width="stretch"):
|
|
985
1813
|
train_model_with_feedback()
|
|
986
1814
|
|
|
987
1815
|
|
|
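The `patience` control above implements classic early stopping. A minimal, self-contained sketch of that behaviour (a generic training loop with stub train/eval steps, not code from this package):

```python
import random

max_epochs = 50   # "Epochs" setting
patience = 5      # "Patience (epochs)" default above

def train_one_epoch():          # stand-in for the real training step
    pass

def validation_loss():          # stand-in returning a validation loss
    return random.random()

best_loss = float("inf")
epochs_without_improvement = 0

for epoch in range(max_epochs):
    train_one_epoch()
    val_loss = validation_loss()
    if val_loss < best_loss:                 # improvement: reset the counter
        best_loss = val_loss
        epochs_without_improvement = 0
    else:                                    # no improvement this epoch
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
```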
@@ -994,7 +1822,9 @@ def show_evaluate_models_tab():
    if not model_metrics.empty:
        # Model selection for evaluation
        selected_model = st.selectbox(
-            "Select Model to Evaluate",
+            "Select Model to Evaluate",
+            model_metrics["model_name"].tolist(),
+            help="Choose a trained model to view detailed performance metrics and evaluation charts.",
        )

        # Evaluation metrics
@@ -1005,13 +1835,29 @@ def show_evaluate_models_tab():
        model_data = model_metrics[model_metrics["model_name"] == selected_model].iloc[0]

        with col1:
-            st.metric(
+            st.metric(
+                "Accuracy",
+                f"{model_data['accuracy']:.2%}",
+                help="Percentage of correct predictions. Measures how often the model's predictions match actual outcomes.",
+            )
        with col2:
-            st.metric(
+            st.metric(
+                "Sharpe Ratio",
+                f"{model_data['sharpe_ratio']:.2f}",
+                help="Risk-adjusted return measure. Higher values indicate better returns relative to risk. > 1 is good, > 2 is very good, > 3 is excellent.",
+            )
        with col3:
-            st.metric(
+            st.metric(
+                "Status",
+                model_data["status"],
+                help="Current deployment status of the model. 'Deployed' means ready for predictions.",
+            )
        with col4:
-            st.metric(
+            st.metric(
+                "Created",
+                model_data.get("created_at", "N/A")[:10],
+                help="Date when this model was trained and saved.",
+            )

        # Confusion Matrix Simulation
        st.markdown("### 🎯 Confusion Matrix")
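The Sharpe-ratio metric shown above is the usual annualized mean-over-volatility of excess returns. A small sketch, assuming daily returns and a zero risk-free rate by default:

```python
import numpy as np

def sharpe_ratio(returns, risk_free_rate=0.0, periods_per_year=252):
    """Annualized Sharpe ratio from per-period returns."""
    excess = np.asarray(returns, dtype=float) - risk_free_rate / periods_per_year
    return np.sqrt(periods_per_year) * excess.mean() / excess.std(ddof=1)

daily_returns = [0.002, -0.001, 0.003, 0.0005, -0.002]
print(f"Sharpe: {sharpe_ratio(daily_returns):.2f}")
```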
@@ -1032,7 +1878,7 @@ def show_evaluate_models_tab():
                color_continuous_scale="Blues",
                title="Confusion Matrix",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

        with col2:
            # ROC Curve
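The confusion matrix in this tab is simulated by the dashboard; for reference, a hedged sketch of how a real one could be computed and rendered with the same `px.imshow` styling (the BUY/HOLD/SELL labels and sample data are assumptions, not taken from the package):

```python
import plotly.express as px
from sklearn.metrics import confusion_matrix

# Assumed class labels and placeholder predictions.
labels = ["BUY", "HOLD", "SELL"]
y_true = ["BUY", "HOLD", "SELL", "BUY", "HOLD", "SELL", "BUY"]
y_pred = ["BUY", "HOLD", "BUY", "BUY", "SELL", "SELL", "HOLD"]

cm = confusion_matrix(y_true, y_pred, labels=labels)
fig = px.imshow(
    cm,
    x=labels,                      # predicted class
    y=labels,                      # actual class
    labels={"x": "Predicted", "y": "Actual", "color": "Count"},
    text_auto=True,
    color_continuous_scale="Blues",
    title="Confusion Matrix",
)
```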
@@ -1050,7 +1896,7 @@ def show_evaluate_models_tab():
                xaxis_title="False Positive Rate",
                yaxis_title="True Positive Rate",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

        # Feature Importance
        st.markdown("### 🔍 Feature Importance")
@@ -1079,7 +1925,7 @@ def show_evaluate_models_tab():
            color="Importance",
            color_continuous_scale="Viridis",
        )
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})
    else:
        st.info("No models available for evaluation. Train a model first.")

@@ -1096,6 +1942,7 @@ def show_compare_models_tab():
            "Select Models to Compare (2-5 models)",
            model_metrics["model_name"].tolist(),
            default=model_metrics["model_name"].tolist()[: min(3, len(model_metrics))],
+            help="Choose 2-5 models to compare side-by-side. View accuracy, Sharpe ratio, and other metrics across models to identify the best performer.",
        )

        if len(models_to_compare) >= 2:
@@ -1134,7 +1981,7 @@ def show_compare_models_tab():
            )

            fig.update_layout(height=400, showlegend=False)
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

            # Radar chart for multi-metric comparison
            st.markdown("### 🎯 Multi-Metric Analysis")
@@ -1158,11 +2005,11 @@ def show_compare_models_tab():
                showlegend=True,
                title="Model Performance Radar Chart",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

            # Detailed comparison table
            st.markdown("### 📋 Detailed Comparison")
-            st.dataframe(comparison_data,
+            st.dataframe(comparison_data, width="stretch")
        else:
            st.warning("Please select at least 2 models to compare")
    else:
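For the "Model Performance Radar Chart" referenced above, a rough sketch using `plotly.graph_objects.Scatterpolar`; the metric names and the assumption that values are normalized to 0-1 are illustrative, not the dashboard's actual columns:

```python
import plotly.graph_objects as go

# Placeholder metric names and 0-1 normalized scores for two models.
metrics = ["accuracy", "sharpe_ratio", "precision", "recall"]
scores = {
    "model_a": [0.82, 0.61, 0.80, 0.75],
    "model_b": [0.78, 0.70, 0.74, 0.81],
}

fig = go.Figure()
for model_name, values in scores.items():
    fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill="toself", name=model_name))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    showlegend=True,
    title="Model Performance Radar Chart",
)
```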
@@ -1174,49 +2021,304 @@ def show_interactive_predictions_tab():
    st.subheader("🎮 Interactive Prediction Explorer")

    st.markdown("### 🎲 Manual Prediction Input")
-    st.info(
+    st.info(
+        "💡 **How it works**: Input trade details below and click 'Generate Prediction' to see what the model predicts. "
+        "The model analyzes politician track records, market conditions, and trade characteristics to forecast potential returns."
+    )
+
+    # Get politician names for searchable dropdown
+    politician_names = get_politician_names()

    col1, col2, col3 = st.columns(3)

    with col1:
-        ticker = st.text_input(
-
-
+        ticker = st.text_input(
+            "Ticker Symbol",
+            "AAPL",
+            help="Stock ticker symbol (e.g., AAPL, TSLA, MSFT)",
+        )
+        politician_name = st.selectbox(
+            "Politician Name",
+            options=politician_names,
+            index=0,
+            help="Start typing to search and filter politician names. Data loaded from database.",
+        )
+        transaction_type = st.selectbox(
+            "Transaction Type",
+            ["Purchase", "Sale"],
+            help="Type of transaction: Purchase (buying stock) or Sale (selling stock).",
+        )

    with col2:
-        amount = st.number_input(
-
-
+        amount = st.number_input(
+            "Transaction Amount ($)",
+            1000,
+            10000000,
+            50000,
+            step=1000,
+            help="Dollar value of the transaction. Larger transactions may have more significant market impact.",
+        )
+        filing_date = st.date_input(
+            "Filing Date",
+            help="Date when the trade was disclosed. Timing relative to market events can be important.",
+        )
+        market_cap = st.selectbox(
+            "Market Cap",
+            ["Large Cap", "Mid Cap", "Small Cap"],
+            help="Company size: Large Cap (>$10B), Mid Cap ($2-10B), Small Cap (<$2B). Larger companies tend to be less volatile.",
+        )

    with col3:
        sector = st.selectbox(
-            "Sector",
+            "Sector",
+            ["Technology", "Healthcare", "Finance", "Energy", "Consumer"],
+            help="Industry sector of the stock. Different sectors have different risk/return profiles and react differently to market conditions.",
+        )
+        sentiment = st.slider(
+            "News Sentiment",
+            -1.0,
+            1.0,
+            0.0,
+            0.1,
+            help="Overall news sentiment about the stock. -1 = very negative, 0 = neutral, +1 = very positive. Based on recent news articles and social media.",
        )
-
-
+        volatility = st.slider(
+            "Volatility Index",
+            0.0,
+            1.0,
+            0.3,
+            0.05,
+            help="Stock price volatility measure. 0 = stable, 1 = highly volatile. Higher volatility means higher risk but potentially higher returns.",
+        )
+
+    # Trading History Section
+    st.markdown("---")
+    st.markdown(f"### 📊 {politician_name}'s Trading History")

-
-    # Simulate prediction
-    with st.spinner("Running prediction models..."):
-        import time
+    trading_history = get_politician_trading_history(politician_name)

-
+    if not trading_history.empty:
+        # Summary metrics
+        col1, col2, col3, col4 = st.columns(4)

-
-
-
+        with col1:
+            total_trades = len(trading_history)
+            st.metric(
+                "Total Trades",
+                total_trades,
+                help="Total number of trading disclosures filed by this politician (last 100 shown).",
+            )
+
+        with col2:
+            # Count transaction types
+            if "transaction_type" in trading_history.columns:
+                purchases = len(trading_history[trading_history["transaction_type"] == "Purchase"])
+                st.metric(
+                    "Purchases",
+                    purchases,
+                    help="Number of purchase transactions. Compare with sales to understand trading behavior.",
+                )
+            else:
+                st.metric("Purchases", "N/A")
+
+        with col3:
+            # Count unique tickers
+            if "ticker_symbol" in trading_history.columns:
+                unique_tickers = trading_history["ticker_symbol"].nunique()
+                st.metric(
+                    "Unique Stocks",
+                    unique_tickers,
+                    help="Number of different stocks traded. Higher diversity may indicate broader market exposure.",
+                )
+            else:
+                st.metric("Unique Stocks", "N/A")
+
+        with col4:
+            # Most recent trade date
+            if "disclosure_date" in trading_history.columns:
+                try:
+                    recent_date = pd.to_datetime(trading_history["disclosure_date"]).max()
+                    st.metric(
+                        "Last Trade",
+                        recent_date.strftime("%Y-%m-%d"),
+                        help="Date of most recent trading disclosure. Newer trades may be more relevant for predictions.",
+                    )
+                except:
+                    st.metric("Last Trade", "N/A")
+            else:
+                st.metric("Last Trade", "N/A")
+
+        # Detailed history in expandable section
+        with st.expander("📜 View Detailed Trading History", expanded=False):
+            # Filter options
+            col1, col2 = st.columns(2)
+
+            with col1:
+                # Transaction type filter
+                if "transaction_type" in trading_history.columns:
+                    trans_types = ["All"] + list(trading_history["transaction_type"].unique())
+                    trans_filter = st.selectbox("Filter by Transaction Type", trans_types)
+                else:
+                    trans_filter = "All"
+
+            with col2:
+                # Show recent N trades
+                show_trades = st.slider("Show Last N Trades", 5, 50, 10, step=5)
+
+            # Apply filters
+            filtered_history = trading_history.copy()
+            if trans_filter != "All" and "transaction_type" in filtered_history.columns:
+                filtered_history = filtered_history[
+                    filtered_history["transaction_type"] == trans_filter
+                ]
+
+            # Display trades
+            st.dataframe(
+                filtered_history.head(show_trades),
+                width="stretch",
+                height=300,
+            )
+
+            # Visualizations
+            if len(filtered_history) > 0:
+                st.markdown("#### 📈 Trading Patterns")
+
+                viz_col1, viz_col2 = st.columns(2)
+
+                with viz_col1:
+                    # Transaction type distribution
+                    if "transaction_type" in filtered_history.columns:
+                        trans_dist = filtered_history["transaction_type"].value_counts()
+                        fig = px.pie(
+                            values=trans_dist.values,
+                            names=trans_dist.index,
+                            title="Transaction Type Distribution",
+                        )
+                        st.plotly_chart(fig, width="stretch", config={"responsive": True})
+
+                with viz_col2:
+                    # Top traded stocks
+                    if "ticker_symbol" in filtered_history.columns:
+                        top_stocks = filtered_history["ticker_symbol"].value_counts().head(10)
+                        fig = px.bar(
+                            x=top_stocks.values,
+                            y=top_stocks.index,
+                            orientation="h",
+                            title="Top 10 Most Traded Stocks",
+                            labels={"x": "Number of Trades", "y": "Ticker"},
+                        )
+                        st.plotly_chart(fig, width="stretch", config={"responsive": True})
+
+                # Timeline of trades
+                if "disclosure_date" in filtered_history.columns:
+                    st.markdown("#### 📅 Trading Timeline")
+                    try:
+                        timeline_df = filtered_history.copy()
+                        timeline_df["disclosure_date"] = pd.to_datetime(
+                            timeline_df["disclosure_date"]
+                        )
+                        timeline_df = timeline_df.sort_values("disclosure_date")
+
+                        # Count trades per month
+                        # Convert to month string directly to avoid PeriodArray timezone warning
+                        timeline_df["month"] = timeline_df["disclosure_date"].dt.strftime("%Y-%m")
+                        monthly_trades = (
+                            timeline_df.groupby("month").size().reset_index(name="count")
+                        )
+
+                        fig = px.line(
+                            monthly_trades,
+                            x="month",
+                            y="count",
+                            title="Trading Activity Over Time",
+                            labels={"month": "Month", "count": "Number of Trades"},
+                            markers=True,
+                        )
+                        st.plotly_chart(fig, width="stretch", config={"responsive": True})
+                    except Exception as e:
+                        st.info("Timeline visualization not available")
+
+    else:
+        st.info(
+            f"📭 No trading history found for {politician_name}. "
+            "This could mean: (1) No trades on record, (2) Data not yet synced, or (3) Name not in database."
+        )
+
+    st.markdown("---")
+
+    # Technical details about prediction system
+    with st.expander("ℹ️ About the Prediction System"):
+        st.markdown(
+            """
+            ### How Predictions Work
+
+            **Current Implementation** (Production Mode):
+
+            This system uses a **feature-engineered prediction pipeline** with real data analysis:
+
+            1. **Load Latest Model**: Fetches the most recent trained model from `/models` directory
+            2. **Feature Engineering**: Transforms input data using a 10-feature pipeline:
+               - **Politician Performance**: Historical trading volume, purchase ratio, stock diversity
+               - **Transaction Characteristics**: Purchase/sale indicator, amount (log-scaled & normalized)
+               - **Market Indicators**: Market cap score, sector risk assessment
+               - **Sentiment & Volatility**: News sentiment scores, price volatility measures
+               - **Timing Analysis**: Trade recency score with decay function
+            3. **Model Inference**: Runs preprocessed data through feature-weighted scoring model
+            4. **Result Generation**: Produces 4 key metrics:
+               - **Recommendation**: BUY/SELL/HOLD based on weighted score
+               - **Predicted Return**: Expected return percentage
+               - **Confidence**: Prediction confidence (50%-95%)
+               - **Risk Level**: Risk assessment (Low/Medium/High)
+
+            **Next Steps** (Neural Network Integration):
+            - Load PyTorch model from training pipeline
+            - Run inference with trained neural network weights
+            - Replace weighted scoring with deep learning predictions
+            - See `docs/model_training_guide.md` for training instructions
+
+            **Prediction Quality Factors**:
+            - Politician's historical trading success (15% weight)
+            - News sentiment analysis (20% weight)
+            - Price volatility (12% weight, negative impact)
+            - Transaction timing and market conditions
+            - Sector-specific risk profiles
+            """
+        )
+
+    if st.button("🔮 Generate Prediction", width="stretch"):
+        # PRODUCTION MODE: Real model inference
+        with st.spinner("🔬 Engineering features and running model inference..."):
+            # 1. Load latest model
+            model_file, model_metadata = load_latest_model()
+
+            # 2. Engineer features from input data
+            features = engineer_features(
+                ticker=ticker,
+                politician_name=politician_name,
+                transaction_type=transaction_type,
+                amount=amount,
+                filing_date=filing_date,
+                market_cap=market_cap,
+                sector=sector,
+                sentiment=sentiment,
+                volatility=volatility,
+                trading_history=trading_history,
+            )
+
+            # 3. Generate prediction
+            prediction = generate_production_prediction(features, model_metadata)

            # Display results
+            st.success(
+                f"✅ **Production Mode**: Using {prediction['model_used']} | "
+                f"Features: {len(features)} engineered"
+            )
            st.markdown("### 🎯 Prediction Results")

-            col1, col2, col3 = st.columns(
+            col1, col2, col3, col4 = st.columns(4)

            with col1:
-                recommendation =
-                    "BUY"
-                    if prediction_score > 0.6
-                    else "SELL" if prediction_score < 0.4 else "HOLD"
-                )
+                recommendation = prediction["recommendation"]
                color = (
                    "green"
                    if recommendation == "BUY"
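The "About the Prediction System" text above describes a feature-weighted scoring model. A hedged sketch of how such a score could map to BUY/SELL/HOLD: the weights mirror the feature table in the next hunk, the 0.6/0.4 thresholds come from the removed `prediction_score` logic above, and the centring at 0.5 is an illustrative assumption; the package's `generate_production_prediction()` may differ.

```python
# Weights mirror the dashboard's feature table; thresholds mirror the removed
# prediction_score logic (BUY > 0.6, SELL < 0.4). Illustrative only.
WEIGHTS = {
    "politician_trade_count": 0.15,
    "politician_purchase_ratio": 0.10,
    "politician_diversity": 0.08,
    "transaction_is_purchase": 0.12,
    "transaction_amount_normalized": 0.10,
    "market_cap_score": 0.08,
    "sector_risk": -0.10,
    "sentiment_score": 0.20,
    "volatility_score": -0.12,
    "timing_score": 0.09,
}

def score(features: dict) -> float:
    """Weighted sum of 0-1 features, centred at 0.5 and clipped to [0, 1] (assumed)."""
    raw = sum(WEIGHTS.get(name, 0.0) * value for name, value in features.items())
    return max(0.0, min(1.0, 0.5 + raw))

def recommend(prediction_score: float) -> str:
    return "BUY" if prediction_score > 0.6 else "SELL" if prediction_score < 0.4 else "HOLD"

example = {"sentiment_score": 0.8, "transaction_is_purchase": 1.0, "volatility_score": 0.3}
s = score(example)
print(recommend(s), round(s, 2))
```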
@@ -1225,36 +2327,82 @@ def show_interactive_predictions_tab():
                st.markdown(f"**Recommendation**: :{color}[{recommendation}]")

            with col2:
-                st.metric(
+                st.metric(
+                    "Predicted Return",
+                    f"{prediction['predicted_return']:.1%}",
+                    help="Expected return based on model analysis. Positive = profit, negative = loss.",
+                )

            with col3:
-                st.metric(
+                st.metric(
+                    "Confidence",
+                    f"{prediction['confidence']:.0%}",
+                    help="Model confidence in this prediction. Higher = more certain.",
+                )

-
-
+            with col4:
+                risk_color = (
+                    "🔴"
+                    if prediction["risk_score"] > 0.7
+                    else "🟡" if prediction["risk_score"] > 0.4 else "🟢"
+                )
+                st.metric(
+                    "Risk Level",
+                    f"{risk_color} {prediction['risk_score']:.2f}",
+                    help="Risk score (0-1). Higher = riskier trade.",
+                )

-
-
-
-
-
-
+            # Prediction breakdown - show actual feature contributions
+            st.markdown("### 📊 Feature Analysis")
+
+            # Display top contributing features
+            feature_contributions = {}
+            weights = {
+                "politician_trade_count": ("Politician Experience", 0.15),
+                "politician_purchase_ratio": ("Buy/Sell Ratio", 0.10),
+                "politician_diversity": ("Portfolio Diversity", 0.08),
+                "transaction_is_purchase": ("Transaction Type", 0.12),
+                "transaction_amount_normalized": ("Transaction Size", 0.10),
+                "market_cap_score": ("Company Size", 0.08),
+                "sector_risk": ("Sector Risk", -0.10),
+                "sentiment_score": ("News Sentiment", 0.20),
+                "volatility_score": ("Market Volatility", -0.12),
+                "timing_score": ("Market Timing", 0.09),
            }

+            for feature, value in features.items():
+                if feature in weights:
+                    label, weight = weights[feature]
+                    # Contribution = feature value * weight
+                    contribution = value * abs(weight)
+                    feature_contributions[label] = contribution
+
+            # Sort by contribution
+            sorted_features = sorted(
+                feature_contributions.items(), key=lambda x: x[1], reverse=True
+            )
+
            factor_df = pd.DataFrame(
-                {
+                {
+                    "Feature": [f[0] for f in sorted_features],
+                    "Contribution": [f[1] for f in sorted_features],
+                }
            )

            fig = px.bar(
                factor_df,
-                x="
-                y="
+                x="Contribution",
+                y="Feature",
                orientation="h",
-                title="
-                color="
+                title="Feature Contributions to Prediction",
+                color="Contribution",
                color_continuous_scale="RdYlGn",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})
+
+            # Show raw feature values in expandable section
+            with st.expander("🔍 View Engineered Features"):
+                st.json(features)


def show_performance_tracking_tab():
@@ -1263,7 +2411,9 @@ def show_performance_tracking_tab():

    # Time range selector
    time_range = st.selectbox(
-        "Select Time Range",
+        "Select Time Range",
+        ["Last 7 Days", "Last 30 Days", "Last 90 Days", "All Time"],
+        help="Choose time period to view model performance trends. Longer periods show overall stability, shorter periods show recent changes.",
    )

    # Generate time series data
@@ -1292,7 +2442,7 @@ def show_performance_tracking_tab():
        yaxis_title="Accuracy",
        hovermode="x unified",
    )
-    st.plotly_chart(fig,
+    st.plotly_chart(fig, width="stretch", config={"responsive": True})

    # Prediction volume and success rate
    st.markdown("### 📈 Prediction Metrics")
@@ -1308,7 +2458,7 @@ def show_performance_tracking_tab():
            go.Bar(x=dates, y=predictions_per_day, name="Predictions", marker_color="lightblue")
        )
        fig.update_layout(title="Daily Prediction Volume", xaxis_title="Date", yaxis_title="Count")
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})

    with col2:
        # Success rate
@@ -1331,7 +2481,7 @@ def show_performance_tracking_tab():
            yaxis_title="Success Rate",
            yaxis_tickformat=".0%",
        )
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})

    # Data drift detection
    st.markdown("### 🔍 Data Drift Detection")
@@ -1361,7 +2511,7 @@ def show_performance_tracking_tab():
            color_discrete_map={"Normal": "green", "Warning": "orange", "Alert": "red"},
            title="Feature Drift Detection",
        )
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})

    with col2:
        st.markdown("**Drift Status**")
@@ -1391,7 +2541,13 @@ def show_predictions():
        col1, col2, col3 = st.columns(3)

        with col1:
-            min_confidence = st.slider(
+            min_confidence = st.slider(
+                "Min Confidence",
+                0.0,
+                1.0,
+                0.5,
+                help="Filter predictions by minimum confidence level. Higher values show only high-confidence predictions.",
+            )

        with col2:
            recommendation_filter = st.selectbox(
@@ -1401,10 +2557,15 @@ def show_predictions():
                    if "recommendation" in predictions
                    else ["All"]
                ),
+                help="Filter by recommendation type: BUY (positive outlook), SELL (negative outlook), or HOLD (neutral).",
            )

        with col3:
-            sort_by = st.selectbox(
+            sort_by = st.selectbox(
+                "Sort By",
+                ["predicted_return", "confidence", "risk_score"],
+                help="Sort predictions by: predicted return (highest gains first), confidence (most certain first), or risk score (lowest risk first).",
+            )

        # Apply filters
        filtered_predictions = predictions.copy()
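A sketch of how the three controls above (minimum confidence, recommendation filter, sort key) could be applied to a predictions DataFrame; the column names mirror the widget options, but the actual filtering code in `show_predictions()` is not shown in this hunk:

```python
import pandas as pd

# Placeholder predictions frame; columns mirror the widget options above.
predictions = pd.DataFrame(
    {
        "ticker": ["AAPL", "TSLA", "MSFT"],
        "recommendation": ["BUY", "SELL", "HOLD"],
        "confidence": [0.90, 0.55, 0.40],
        "predicted_return": [0.08, -0.05, 0.01],
        "risk_score": [0.30, 0.70, 0.20],
    }
)

min_confidence = 0.5           # from the "Min Confidence" slider
recommendation_filter = "All"  # from the "Recommendation" selectbox
sort_by = "predicted_return"   # from the "Sort By" selectbox

filtered_predictions = predictions[predictions["confidence"] >= min_confidence]
if recommendation_filter != "All":
    filtered_predictions = filtered_predictions[
        filtered_predictions["recommendation"] == recommendation_filter
    ]
filtered_predictions = filtered_predictions.sort_values(sort_by, ascending=False)
print(filtered_predictions)
```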
@@ -1466,7 +2627,7 @@ def show_predictions():
                hover_data=["ticker"] if "ticker" in filtered_predictions else None,
                title="Risk-Return Analysis",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})

        with col2:
            # Top movers
@@ -1485,7 +2646,7 @@ def show_predictions():
                color_continuous_scale="RdYlGn",
                title="Top Movers (Predicted)",
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})
        else:
            st.warning("No predictions available. Check if the ML pipeline is running correctly.")
    else:
@@ -1534,7 +2695,7 @@ def show_lsh_jobs():
            lsh_jobs["timestamp"] = pd.to_datetime(lsh_jobs["timestamp"])

            # Group by hour
-            hourly_jobs = lsh_jobs.set_index("timestamp").resample("
+            hourly_jobs = lsh_jobs.set_index("timestamp").resample("1h").size()

            fig = px.line(
                x=hourly_jobs.index,
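The `resample("1h")` change uses the lowercase hourly frequency alias, which recent pandas releases expect (uppercase `"H"` now emits a deprecation warning). A minimal usage example of the same hourly grouping:

```python
import pandas as pd

timestamps = pd.to_datetime(
    ["2024-01-01 09:05", "2024-01-01 09:40", "2024-01-01 11:10"]
)
lsh_jobs = pd.DataFrame({"timestamp": timestamps})

# Hourly job counts; "1h" is the lowercase frequency alias.
hourly_jobs = lsh_jobs.set_index("timestamp").resample("1h").size()
print(hourly_jobs)
```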
@@ -1542,7 +2703,7 @@ def show_lsh_jobs():
                title="Job Executions Over Time",
                labels={"x": "Time", "y": "Job Count"},
            )
-            st.plotly_chart(fig,
+            st.plotly_chart(fig, width="stretch", config={"responsive": True})
        except:
            pass
    else:
@@ -1640,7 +2801,7 @@ def show_system_health():
        )

        fig.update_layout(height=500, showlegend=False)
-        st.plotly_chart(fig,
+        st.plotly_chart(fig, width="stretch", config={"responsive": True})


# Run the main dashboard function