mcli-framework 7.1.3__py3-none-any.whl → 7.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/main.py +10 -0
- mcli/lib/custom_commands.py +424 -0
- mcli/lib/paths.py +12 -0
- mcli/ml/dashboard/app.py +13 -13
- mcli/ml/dashboard/app_integrated.py +1292 -148
- mcli/ml/dashboard/app_supabase.py +46 -21
- mcli/ml/dashboard/app_training.py +14 -14
- mcli/ml/dashboard/components/charts.py +258 -0
- mcli/ml/dashboard/components/metrics.py +125 -0
- mcli/ml/dashboard/components/tables.py +228 -0
- mcli/ml/dashboard/pages/cicd.py +382 -0
- mcli/ml/dashboard/pages/predictions_enhanced.py +820 -0
- mcli/ml/dashboard/pages/scrapers_and_logs.py +1060 -0
- mcli/ml/dashboard/pages/workflows.py +533 -0
- mcli/ml/training/train_model.py +569 -0
- mcli/self/self_cmd.py +322 -94
- mcli/workflow/politician_trading/data_sources.py +259 -1
- mcli/workflow/politician_trading/models.py +159 -1
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +846 -0
- mcli/workflow/politician_trading/scrapers_free_sources.py +516 -0
- mcli/workflow/politician_trading/scrapers_third_party.py +391 -0
- mcli/workflow/politician_trading/seed_database.py +539 -0
- mcli/workflow/workflow.py +8 -27
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/METADATA +1 -1
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/RECORD +29 -25
- mcli/workflow/daemon/api_daemon.py +0 -800
- mcli/workflow/daemon/commands.py +0 -1196
- mcli/workflow/dashboard/dashboard_cmd.py +0 -120
- mcli/workflow/file/file.py +0 -100
- mcli/workflow/git_commit/commands.py +0 -430
- mcli/workflow/politician_trading/commands.py +0 -1939
- mcli/workflow/scheduler/commands.py +0 -493
- mcli/workflow/sync/sync_cmd.py +0 -437
- mcli/workflow/videos/videos.py +0 -242
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.3.dist-info → mcli_framework-7.2.0.dist-info}/top_level.txt +0 -0
mcli/ml/dashboard/pages/scrapers_and_logs.py
@@ -0,0 +1,1060 @@
+"""
+Scrapers and Logs Dashboard Page
+
+This page provides:
+1. Manual scraping interface for corporate registry data
+2. Real-time scraper logs and job status
+3. System logs viewer
+4. Job history and statistics
+"""
+
+import logging
+import os
+import sys
+from datetime import datetime, timedelta
+from io import StringIO
+from pathlib import Path
+
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import streamlit as st
+from plotly.subplots import make_subplots
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+def show_scrapers_and_logs():
+    """Main function for scrapers and logs page"""
+    st.header("🔍 Data Scrapers & System Logs")
+
+    st.markdown("""
+    **Features:**
+    - 🚀 Manual data scraping from corporate registries
+    - 📊 Real-time scraper logs and job status
+    - 📝 System logs viewer
+    - 📈 Job history and statistics
+    """)
+
+    # Create tabs
+    tabs = st.tabs([
+        "🚀 Manual Scraping",
+        "📊 Scraper Logs",
+        "📝 System Logs",
+        "📈 Job History"
+    ])
+
+    with tabs[0]:
+        show_manual_scraping()
+
+    with tabs[1]:
+        show_scraper_logs()
+
+    with tabs[2]:
+        show_system_logs()
+
+    with tabs[3]:
+        show_job_history()
+
+
+def show_manual_scraping():
+    """Manual scraping interface"""
+    st.subheader("🚀 Manual Data Scraping")
+
+    st.markdown("""
+    Manually trigger data scraping jobs from various sources.
+    Select a source, configure parameters, and run the scraper.
+    """)
+
+    # Source selection
+    source_type = st.selectbox(
+        "Select Data Source",
+        [
+            "UK Companies House",
+            "Info-Financière (France)",
+            "OpenCorporates",
+            "XBRL Filings (EU/UK)",
+            "XBRL US",
+            "Senate Stock Watcher (GitHub)",
+        ],
+        help="Choose which data source to scrape"
+    )
+
+    # Source-specific configuration
+    if source_type == "UK Companies House":
+        show_uk_companies_house_scraper()
+    elif source_type == "Info-Financière (France)":
+        show_info_financiere_scraper()
+    elif source_type == "OpenCorporates":
+        show_opencorporates_scraper()
+    elif source_type == "XBRL Filings (EU/UK)":
+        show_xbrl_filings_scraper()
+    elif source_type == "XBRL US":
+        show_xbrl_us_scraper()
+    elif source_type == "Senate Stock Watcher (GitHub)":
+        show_senate_watcher_scraper()
+
+
+def show_uk_companies_house_scraper():
+    """UK Companies House scraper interface"""
+    st.markdown("### UK Companies House Configuration")
+
+    # Check API key
+    api_key = os.getenv("UK_COMPANIES_HOUSE_API_KEY") or st.secrets.get("UK_COMPANIES_HOUSE_API_KEY", "")
+
+    if not api_key:
+        st.error("❌ UK Companies House API key not configured")
+        st.info("""
+        To use this scraper, set `UK_COMPANIES_HOUSE_API_KEY` in:
+        - Streamlit Cloud: Settings → Secrets
+        - Local: .streamlit/secrets.toml or environment variable
+
+        Get free API key: https://developer.company-information.service.gov.uk/
+        """)
+        return
+
+    st.success("✅ API key configured")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        company_query = st.text_input(
+            "Company Name",
+            value="Tesco",
+            help="Company name to search for"
+        )
+        max_results = st.number_input(
+            "Max Results",
+            min_value=1,
+            max_value=100,
+            value=10,
+            help="Maximum number of companies to fetch"
+        )
+
+    with col2:
+        fetch_officers = st.checkbox("Fetch Officers", value=True)
+        fetch_psc = st.checkbox("Fetch PSC Data", value=True)
+        save_to_db = st.checkbox("Save to Database", value=False)
+
+    # Run scraper
+    if st.button("🚀 Run UK Companies House Scraper", type="primary"):
+        run_uk_companies_house_scraper(
+            company_query,
+            max_results,
+            fetch_officers,
+            fetch_psc,
+            save_to_db
+        )
+
+
+def run_uk_companies_house_scraper(
+    query: str,
+    max_results: int,
+    fetch_officers: bool,
+    fetch_psc: bool,
+    save_to_db: bool
+):
+    """Execute UK Companies House scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_corporate_registry import UKCompaniesHouseScraper
+
+        # Create log capture
+        log_stream = StringIO()
+        handler = logging.StreamHandler(log_stream)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+        handler.setFormatter(formatter)
+
+        scraper_logger = logging.getLogger("mcli.workflow.politician_trading.scrapers_corporate_registry")
+        scraper_logger.addHandler(handler)
+
+        # Create progress containers
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        log_container = st.empty()
+        results_container = st.empty()
+
+        # Initialize scraper
+        status_container.info("🔄 Initializing UK Companies House scraper...")
+        scraper = UKCompaniesHouseScraper()
+        progress_bar.progress(10)
+
+        # Search companies
+        status_container.info(f"🔍 Searching for '{query}'...")
+        companies = scraper.search_companies(query, items_per_page=max_results)
+        progress_bar.progress(30)
+
+        if not companies:
+            status_container.warning(f"⚠️ No companies found matching '{query}'")
+            return
+
+        status_container.success(f"✅ Found {len(companies)} companies")
+
+        # Fetch additional data
+        all_officers = []
+        all_psc = []
+
+        for i, company in enumerate(companies):
+            company_number = company.get("company_number")
+            company_name = company.get("title", "Unknown")
+
+            if fetch_officers:
+                status_container.info(f"👥 Fetching officers for {company_name}...")
+                officers = scraper.get_company_officers(company_number)
+                all_officers.extend(officers)
+
+            if fetch_psc:
+                status_container.info(f"🏢 Fetching PSC for {company_name}...")
+                psc = scraper.get_persons_with_significant_control(company_number)
+                all_psc.extend(psc)
+
+            progress_bar.progress(30 + int((i + 1) / len(companies) * 50))
+
+        # Display logs
+        log_container.text_area(
+            "Scraper Logs",
+            log_stream.getvalue(),
+            height=200
+        )
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            col1, col2, col3 = st.columns(3)
+            col1.metric("Companies", len(companies))
+            col2.metric("Officers", len(all_officers))
+            col3.metric("PSC", len(all_psc))
+
+            # Show companies
+            st.markdown("#### Companies Found")
+            companies_df = pd.DataFrame([{
+                "Number": c.get("company_number"),
+                "Name": c.get("title"),
+                "Status": c.get("company_status"),
+                "Type": c.get("company_type"),
+                "Address": c.get("address_snippet", "")[:50]
+            } for c in companies])
+            st.dataframe(companies_df, use_container_width=True)
+
+            # Show officers
+            if all_officers:
+                st.markdown("#### Officers Found")
+                officers_df = pd.DataFrame([{
+                    "Name": o.get("name"),
+                    "Role": o.get("officer_role"),
+                    "Appointed": o.get("appointed_on", ""),
+                    "Nationality": o.get("nationality", ""),
+                    "Occupation": o.get("occupation", "")
+                } for o in all_officers[:50]])  # Limit to 50 for display
+                st.dataframe(officers_df, use_container_width=True)
+
+            # Show PSC
+            if all_psc:
+                st.markdown("#### Persons with Significant Control")
+                psc_df = pd.DataFrame([{
+                    "Name": p.get("name"),
+                    "Kind": p.get("kind", "").replace("-", " ").title(),
+                    "Control": ", ".join(p.get("natures_of_control", [])),
+                    "Nationality": p.get("nationality", ""),
+                } for p in all_psc[:50]])
+                st.dataframe(psc_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(companies)} companies, {len(all_officers)} officers, {len(all_psc)} PSC")
+
+        # Save to database if requested
+        if save_to_db:
+            save_corporate_data_to_db(companies, all_officers, all_psc, "uk_companies_house")
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def show_info_financiere_scraper():
+    """Info-Financière scraper interface"""
+    st.markdown("### Info-Financière (France) Configuration")
+
+    st.success("✅ No API key required (FREE)")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        query = st.text_input(
+            "Search Query (optional)",
+            value="",
+            help="Company name, ISIN, or leave blank for all"
+        )
+        days_back = st.number_input(
+            "Days Back",
+            min_value=1,
+            max_value=365,
+            value=30,
+            help="How many days of history to fetch"
+        )
+
+    with col2:
+        max_results = st.number_input(
+            "Max Results",
+            min_value=1,
+            max_value=100,
+            value=20
+        )
+        save_to_db = st.checkbox("Save to Database", value=False)
+
+    # Run scraper
+    if st.button("🚀 Run Info-Financière Scraper", type="primary"):
+        run_info_financiere_scraper(query, days_back, max_results, save_to_db)
+
+
+def run_info_financiere_scraper(
+    query: str,
+    days_back: int,
+    max_results: int,
+    save_to_db: bool
+):
+    """Execute Info-Financière scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_corporate_registry import InfoFinanciereAPIScraper
+
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        results_container = st.empty()
+
+        # Initialize scraper
+        status_container.info("🔄 Initializing Info-Financière scraper...")
+        scraper = InfoFinanciereAPIScraper()
+        progress_bar.progress(20)
+
+        # Calculate date range
+        from_date = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d")
+        to_date = datetime.now().strftime("%Y-%m-%d")
+
+        # Search publications
+        status_container.info(f"🔍 Searching publications ({from_date} to {to_date})...")
+        publications = scraper.search_publications(
+            query=query or None,
+            from_date=from_date,
+            to_date=to_date,
+            per_page=max_results
+        )
+        progress_bar.progress(80)
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            if not publications:
+                st.warning(f"⚠️ No publications found for the given criteria")
+                return
+
+            st.metric("Publications Found", len(publications))
+
+            # Show publications
+            pubs_df = pd.DataFrame([{
+                "Date": p.get("publication_date", ""),
+                "Title": p.get("title", "")[:100],
+                "Type": p.get("publication_type", ""),
+                "Issuer": p.get("issuer_name", "")
+            } for p in publications])
+            st.dataframe(pubs_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(publications)} publications")
+
+        if save_to_db:
+            save_financial_publications_to_db(publications, "info_financiere")
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def show_opencorporates_scraper():
+    """OpenCorporates scraper interface"""
+    st.markdown("### OpenCorporates Configuration")
+
+    api_key = os.getenv("OPENCORPORATES_API_KEY") or st.secrets.get("OPENCORPORATES_API_KEY", "")
+
+    if api_key:
+        st.success("✅ API key configured")
+    else:
+        st.info("ℹ️ No API key (free tier with rate limits). Get API key for better performance: https://opencorporates.com/api_accounts/new")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        query = st.text_input(
+            "Company Name",
+            value="Apple",
+            help="Company name to search for"
+        )
+        jurisdiction = st.selectbox(
+            "Jurisdiction (optional)",
+            ["", "us_ca", "us_de", "us_ny", "gb", "de", "fr", "nl"],
+            help="Filter by jurisdiction code"
+        )
+
+    with col2:
+        max_results = st.number_input(
+            "Max Results",
+            min_value=1,
+            max_value=100,
+            value=10
+        )
+        save_to_db = st.checkbox("Save to Database", value=False)
+
+    # Run scraper
+    if st.button("🚀 Run OpenCorporates Scraper", type="primary"):
+        run_opencorporates_scraper(query, jurisdiction or None, max_results, save_to_db)
+
+
+def run_opencorporates_scraper(
+    query: str,
+    jurisdiction: str,
+    max_results: int,
+    save_to_db: bool
+):
+    """Execute OpenCorporates scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_corporate_registry import OpenCorporatesScraper
+
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        results_container = st.empty()
+
+        # Initialize scraper
+        status_container.info("🔄 Initializing OpenCorporates scraper...")
+        scraper = OpenCorporatesScraper()
+        progress_bar.progress(20)
+
+        # Search companies
+        status_container.info(f"🔍 Searching for '{query}'...")
+        companies = scraper.search_companies(
+            query,
+            jurisdiction_code=jurisdiction,
+            per_page=max_results
+        )
+        progress_bar.progress(80)
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            if not companies:
+                st.warning(f"⚠️ No companies found matching '{query}'")
+                return
+
+            st.metric("Companies Found", len(companies))
+
+            # Show companies
+            companies_df = pd.DataFrame([{
+                "Jurisdiction": c.get("company", {}).get("jurisdiction_code", ""),
+                "Number": c.get("company", {}).get("company_number", ""),
+                "Name": c.get("company", {}).get("name", ""),
+                "Status": c.get("company", {}).get("current_status", ""),
+                "Type": c.get("company", {}).get("company_type", "")
+            } for c in companies])
+            st.dataframe(companies_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(companies)} companies")
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def show_xbrl_filings_scraper():
+    """XBRL Filings scraper interface"""
+    st.markdown("### XBRL Filings (EU/UK) Configuration")
+
+    st.success("✅ No API key required (FREE)")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        country = st.selectbox(
+            "Country (optional)",
+            ["", "GB", "FR", "DE", "ES", "IT", "NL", "BE"],
+            help="Filter by country code"
+        )
+        days_back = st.number_input(
+            "Days Back",
+            min_value=1,
+            max_value=365,
+            value=30
+        )
+
+    with col2:
+        max_results = st.number_input(
+            "Max Results",
+            min_value=1,
+            max_value=500,
+            value=100
+        )
+        save_to_db = st.checkbox("Save to Database", value=False)
+
+    # Run scraper
+    if st.button("🚀 Run XBRL Filings Scraper", type="primary"):
+        run_xbrl_filings_scraper(country or None, days_back, max_results, save_to_db)
+
+
+def run_xbrl_filings_scraper(
+    country: str,
+    days_back: int,
+    max_results: int,
+    save_to_db: bool
+):
+    """Execute XBRL Filings scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_corporate_registry import XBRLFilingsScraper
+
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        results_container = st.empty()
+
+        # Initialize scraper
+        status_container.info("🔄 Initializing XBRL Filings scraper...")
+        scraper = XBRLFilingsScraper()
+        progress_bar.progress(20)
+
+        # Calculate date range
+        from_date = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d")
+
+        # Get filings
+        status_container.info(f"🔍 Fetching XBRL filings since {from_date}...")
+        filings = scraper.get_filings(
+            country=country,
+            from_date=from_date,
+            page_size=max_results
+        )
+        progress_bar.progress(80)
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            if not filings:
+                st.warning(f"⚠️ No filings found for the given criteria")
+                return
+
+            st.metric("Filings Found", len(filings))
+
+            # Show filings
+            filings_df = pd.DataFrame([{
+                "ID": f.get("id", ""),
+                "Country": f.get("attributes", {}).get("country", ""),
+                "Entity": f.get("attributes", {}).get("entity_name", "")[:50],
+                "Period": f.get("attributes", {}).get("period_end", ""),
+                "Date Added": f.get("attributes", {}).get("date_added", "")
+            } for f in filings])
+            st.dataframe(filings_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(filings)} filings")
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def show_xbrl_us_scraper():
+    """XBRL US scraper interface"""
+    st.markdown("### XBRL US Configuration")
+
+    api_key = os.getenv("XBRL_US_API_KEY") or st.secrets.get("XBRL_US_API_KEY", "")
+
+    if not api_key:
+        st.error("❌ XBRL US API key not configured")
+        st.info("""
+        To use this scraper, set `XBRL_US_API_KEY` in:
+        - Streamlit Cloud: Settings → Secrets
+        - Local: .streamlit/secrets.toml or environment variable
+
+        Get free API key: https://xbrl.us/home/use/xbrl-api/
+        """)
+        return
+
+    st.success("✅ API key configured")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        query = st.text_input(
+            "Company Name or Ticker",
+            value="Tesla",
+            help="Search by company name or stock ticker"
+        )
+
+    with col2:
+        max_results = st.number_input(
+            "Max Results",
+            min_value=1,
+            max_value=100,
+            value=10
+        )
+        save_to_db = st.checkbox("Save to Database", value=False)
+
+    # Run scraper
+    if st.button("🚀 Run XBRL US Scraper", type="primary"):
+        run_xbrl_us_scraper(query, max_results, save_to_db)
+
+
+def run_xbrl_us_scraper(
+    query: str,
+    max_results: int,
+    save_to_db: bool
+):
+    """Execute XBRL US scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_corporate_registry import XBRLUSScraper
+
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        results_container = st.empty()
+
+        # Initialize scraper
+        status_container.info("🔄 Initializing XBRL US scraper...")
+        scraper = XBRLUSScraper()
+        progress_bar.progress(20)
+
+        # Search companies
+        status_container.info(f"🔍 Searching for '{query}'...")
+        entities = scraper.search_companies(query, limit=max_results)
+        progress_bar.progress(80)
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            if not entities:
+                st.warning(f"⚠️ No entities found matching '{query}'")
+                return
+
+            st.metric("Entities Found", len(entities))
+
+            # Show entities
+            entities_df = pd.DataFrame([{
+                "ID": e.get("entity", {}).get("id", ""),
+                "Name": e.get("entity", {}).get("name", ""),
+                "CIK": e.get("entity", {}).get("cik", ""),
+                "Ticker": e.get("entity", {}).get("ticker", "")
+            } for e in entities])
+            st.dataframe(entities_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(entities)} entities")
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def show_senate_watcher_scraper():
+    """Senate Stock Watcher scraper interface"""
+    st.markdown("### Senate Stock Watcher (GitHub) Configuration")
+
+    st.success("✅ No API key required (FREE)")
+
+    # Configuration
+    col1, col2 = st.columns(2)
+
+    with col1:
+        recent_only = st.checkbox("Recent Only", value=True)
+        days_back = st.number_input(
+            "Days Back (if recent)",
+            min_value=1,
+            max_value=365,
+            value=90,
+            disabled=not recent_only
+        )
+
+    with col2:
+        save_to_db = st.checkbox("Save to Database", value=True)
+
+    # Run scraper
+    if st.button("🚀 Run Senate Stock Watcher Scraper", type="primary"):
+        run_senate_watcher_scraper(recent_only, days_back, save_to_db)
+
+
+def run_senate_watcher_scraper(
+    recent_only: bool,
+    days_back: int,
+    save_to_db: bool
+):
+    """Execute Senate Stock Watcher scraper"""
+    try:
+        from mcli.workflow.politician_trading.scrapers_free_sources import FreeDataFetcher
+
+        status_container = st.empty()
+        progress_bar = st.progress(0)
+        results_container = st.empty()
+
+        # Initialize fetcher
+        status_container.info("🔄 Initializing Senate Stock Watcher scraper...")
+        fetcher = FreeDataFetcher()
+        progress_bar.progress(20)
+
+        # Fetch data
+        status_container.info("🔍 Fetching Senate trading data from GitHub...")
+        data = fetcher.fetch_from_senate_watcher(recent_only=recent_only, days=days_back)
+        progress_bar.progress(80)
+
+        politicians = data.get("politicians", [])
+        disclosures = data.get("disclosures", [])
+
+        # Display results
+        with results_container:
+            st.markdown("### 📊 Scraping Results")
+
+            col1, col2 = st.columns(2)
+            col1.metric("Politicians", len(politicians))
+            col2.metric("Disclosures", len(disclosures))
+
+            # Show disclosures
+            if disclosures:
+                st.markdown("#### Recent Trading Disclosures")
+                disc_df = pd.DataFrame([{
+                    "Date": d.transaction_date.strftime("%Y-%m-%d") if hasattr(d.transaction_date, 'strftime') else str(d.transaction_date),
+                    "Politician": d.politician_bioguide_id,
+                    "Type": d.transaction_type,
+                    "Asset": d.asset_name[:50],
+                    "Ticker": d.asset_ticker or "",
+                    "Min": f"${d.amount_range_min:,.0f}" if d.amount_range_min else "",
+                    "Max": f"${d.amount_range_max:,.0f}" if d.amount_range_max else ""
+                } for d in disclosures[:100]])  # Limit to 100 for display
+                st.dataframe(disc_df, use_container_width=True)
+
+        progress_bar.progress(100)
+        status_container.success(f"✅ Scraping completed! Found {len(politicians)} politicians, {len(disclosures)} disclosures")
+
+        if save_to_db:
+            save_politician_trading_to_db(politicians, disclosures)
+
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+def save_corporate_data_to_db(companies, officers, psc, source):
+    """Save corporate data to Supabase"""
+    st.info("⚠️ Database saving not yet implemented. Data displayed above.")
+    # TODO: Implement Supabase upsert logic
+
+
+def save_financial_publications_to_db(publications, source):
+    """Save financial publications to Supabase"""
+    st.info("⚠️ Database saving not yet implemented. Data displayed above.")
+    # TODO: Implement Supabase upsert logic
+
+
+def save_politician_trading_to_db(politicians, disclosures):
+    """Save politician trading data to Supabase"""
+    st.info("⚠️ Using existing seed_database.py logic for this source")
+    # TODO: Call seed_database.py functions
+
+
+def show_scraper_logs():
+    """Display scraper logs"""
+    st.subheader("📊 Scraper Logs")
+
+    st.markdown("""
+    View real-time logs from scraping operations and data pull jobs.
+    """)
+
+    # Get logs from Supabase data_pull_jobs
+    try:
+        from mcli.ml.dashboard.app_integrated import get_supabase_client
+
+        client = get_supabase_client()
+
+        if client:
+            # Get recent jobs
+            jobs = client.table("data_pull_jobs").select("*").order("created_at", desc=True).limit(50).execute()
+
+            if jobs.data:
+                st.markdown("### Recent Data Pull Jobs")
+
+                jobs_df = pd.DataFrame(jobs.data)
+
+                # Format dates
+                for col in ['started_at', 'completed_at', 'created_at']:
+                    if col in jobs_df.columns:
+                        jobs_df[col] = pd.to_datetime(jobs_df[col], format='ISO8601', errors='coerce')
+
+                # Display jobs table
+                display_df = jobs_df[[
+                    'created_at', 'job_type', 'status', 'records_found',
+                    'records_new', 'records_updated', 'records_failed'
+                ]].copy()
+
+                display_df.columns = [
+                    'Timestamp', 'Job Type', 'Status', 'Found',
+                    'New', 'Updated', 'Failed'
+                ]
+
+                st.dataframe(display_df, use_container_width=True)
+
+                # Job details
+                st.markdown("### Job Details")
+
+                selected_job = st.selectbox(
+                    "Select Job",
+                    jobs_df['id'].tolist(),
+                    format_func=lambda x: f"{jobs_df[jobs_df['id']==x]['job_type'].values[0]} - {jobs_df[jobs_df['id']==x]['created_at'].values[0]}"
+                )
+
+                if selected_job:
+                    job = jobs_df[jobs_df['id'] == selected_job].iloc[0]
+
+                    col1, col2, col3, col4 = st.columns(4)
+                    col1.metric("Status", job['status'])
+                    col2.metric("Records Found", job['records_found'])
+                    col3.metric("New Records", job['records_new'])
+                    col4.metric("Failed", job['records_failed'])
+
+                    if job.get('error_message'):
+                        st.error(f"**Error:** {job['error_message']}")
+
+                    # Show config snapshot
+                    if job.get('config_snapshot'):
+                        with st.expander("Configuration Snapshot"):
+                            st.json(job['config_snapshot'])
+
+            else:
+                st.info("No jobs found in database")
+
+        else:
+            st.warning("Supabase not connected - logs unavailable")
+
+    except Exception as e:
+        st.error(f"Error loading scraper logs: {e}")
+
+
+def show_system_logs():
+    """Display system logs"""
+    st.subheader("📝 System Logs")
+
+    st.markdown("""
+    View application logs, errors, and system events.
+    """)
+
+    # Log file path
+    log_file = Path("/tmp/seed_database.log")
+
+    if log_file.exists():
+        try:
+            with open(log_file, 'r') as f:
+                logs = f.readlines()
+
+            # Filter options
+            col1, col2, col3 = st.columns(3)
+
+            with col1:
+                log_level = st.selectbox(
+                    "Log Level",
+                    ["ALL", "ERROR", "WARNING", "INFO", "DEBUG"]
+                )
+
+            with col2:
+                lines_to_show = st.number_input(
+                    "Lines to Show",
+                    min_value=10,
+                    max_value=1000,
+                    value=100
+                )
+
+            with col3:
+                search_term = st.text_input("Search", value="")
+
+            # Filter logs
+            filtered_logs = logs[-lines_to_show:]
+
+            if log_level != "ALL":
+                filtered_logs = [l for l in filtered_logs if log_level in l]
+
+            if search_term:
+                filtered_logs = [l for l in filtered_logs if search_term.lower() in l.lower()]
+
+            # Display logs
+            st.text_area(
+                "Log Output",
+                "".join(filtered_logs),
+                height=400
+            )
+
+            # Download button
+            st.download_button(
+                "Download Full Logs",
+                "".join(logs),
+                file_name=f"system_logs_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+                mime="text/plain"
+            )
+
+        except Exception as e:
+            st.error(f"Error reading log file: {e}")
+    else:
+        st.info(f"Log file not found at {log_file}. Logs will appear here after scraping jobs run.")
+
+        # Create example logs display
+        st.markdown("### Example Log Output")
+        st.code("""
+2025-10-07 12:00:00 - INFO - Starting data pull job: senate_watcher_seed
+2025-10-07 12:00:05 - INFO - Fetched 8350 Senate transactions
+2025-10-07 12:00:10 - INFO - Upserted 89 politicians (5 new, 84 updated)
+2025-10-07 12:01:30 - INFO - Upserted 8350 disclosures (6353 new, 1893 updated, 104 failed)
+2025-10-07 12:01:31 - INFO - Job completed successfully
+        """, language="log")
+
+
+def show_job_history():
+    """Display job history and statistics"""
+    st.subheader("📈 Job History & Statistics")
+
+    st.markdown("""
+    View historical data about scraping jobs, success rates, and trends.
+    """)
+
+    try:
+        from mcli.ml.dashboard.app_integrated import get_supabase_client
+
+        client = get_supabase_client()
+
+        if client:
+            # Get all jobs
+            jobs = client.table("data_pull_jobs").select("*").order("created_at", desc=True).limit(1000).execute()
+
+            if jobs.data and len(jobs.data) > 0:
+                jobs_df = pd.DataFrame(jobs.data)
+
+                # Format dates
+                for col in ['started_at', 'completed_at', 'created_at']:
+                    if col in jobs_df.columns:
+                        jobs_df[col] = pd.to_datetime(jobs_df[col], format='ISO8601', errors='coerce')
+
+                # Statistics
+                st.markdown("### Overall Statistics")
+
+                col1, col2, col3, col4 = st.columns(4)
+
+                total_jobs = len(jobs_df)
+                completed_jobs = len(jobs_df[jobs_df['status'] == 'completed'])
+                failed_jobs = len(jobs_df[jobs_df['status'] == 'failed'])
+                success_rate = (completed_jobs / total_jobs * 100) if total_jobs > 0 else 0
+
+                col1.metric("Total Jobs", total_jobs)
+                col2.metric("Completed", completed_jobs)
+                col3.metric("Failed", failed_jobs)
+                col4.metric("Success Rate", f"{success_rate:.1f}%")
+
+                # Job type breakdown
+                st.markdown("### Job Type Breakdown")
+
+                job_type_counts = jobs_df['job_type'].value_counts()
+
+                fig = px.pie(
+                    values=job_type_counts.values,
+                    names=job_type_counts.index,
+                    title="Jobs by Type"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+
+                # Status breakdown
+                st.markdown("### Status Breakdown")
+
+                status_counts = jobs_df['status'].value_counts()
+
+                fig = px.bar(
+                    x=status_counts.index,
+                    y=status_counts.values,
+                    labels={'x': 'Status', 'y': 'Count'},
+                    title="Jobs by Status"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+
+                # Timeline
+                st.markdown("### Job Timeline")
+
+                jobs_df['date'] = jobs_df['created_at'].dt.date
+
+                timeline_df = jobs_df.groupby(['date', 'status']).size().reset_index(name='count')
+
+                fig = px.line(
+                    timeline_df,
+                    x='date',
+                    y='count',
+                    color='status',
+                    title="Jobs Over Time"
+                )
+                st.plotly_chart(fig, use_container_width=True)
+
+                # Records processed
+                st.markdown("### Records Processed")
+
+                records_df = jobs_df[jobs_df['status'] == 'completed'][['created_at', 'records_found', 'records_new', 'records_updated', 'records_failed']].copy()
+
+                if not records_df.empty:
+                    fig = go.Figure()
+
+                    fig.add_trace(go.Scatter(
+                        x=records_df['created_at'],
+                        y=records_df['records_new'],
+                        name='New Records',
+                        mode='lines+markers'
+                    ))
+
+                    fig.add_trace(go.Scatter(
+                        x=records_df['created_at'],
+                        y=records_df['records_updated'],
+                        name='Updated Records',
+                        mode='lines+markers'
+                    ))
+
+                    fig.add_trace(go.Scatter(
+                        x=records_df['created_at'],
+                        y=records_df['records_failed'],
+                        name='Failed Records',
+                        mode='lines+markers'
+                    ))
+
+                    fig.update_layout(
+                        title="Records Processed Over Time",
+                        xaxis_title="Date",
+                        yaxis_title="Count",
+                        hovermode='x unified'
+                    )
+
+                    st.plotly_chart(fig, use_container_width=True)
+
+            else:
+                st.info("No job history available yet. Run some scraping jobs to see statistics here.")
+
+        else:
+            st.warning("Supabase not connected - job history unavailable")
+
+    except Exception as e:
+        st.error(f"Error loading job history: {e}")
+        import traceback
+        st.code(traceback.format_exc())
+
+
+# Export for use in main dashboard
+__all__ = ["show_scrapers_and_logs"]