mcli-framework 7.10.1__py3-none-any.whl → 7.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic. Click here for more details.
- mcli/app/commands_cmd.py +150 -58
- mcli/app/main.py +21 -27
- mcli/lib/custom_commands.py +62 -12
- mcli/lib/optional_deps.py +240 -0
- mcli/lib/paths.py +129 -5
- mcli/self/migrate_cmd.py +261 -0
- mcli/self/self_cmd.py +8 -0
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/__init__.py +16 -0
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/secrets/__init__.py +4 -0
- mcli/workflow/secrets/secrets_cmd.py +192 -0
- mcli/workflow/workflow.py +35 -5
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/METADATA +86 -55
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/RECORD +22 -34
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/__init__.py +0 -4
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/top_level.txt +0 -0
|
@@ -1,879 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Main workflow orchestrator for politician trading data collection
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import asyncio
|
|
6
|
-
import logging
|
|
7
|
-
import uuid
|
|
8
|
-
from datetime import datetime, timedelta
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
|
10
|
-
|
|
11
|
-
from .config import WorkflowConfig
|
|
12
|
-
from .database import PoliticianTradingDB
|
|
13
|
-
from .models import DataPullJob, Politician, PoliticianRole, TradingDisclosure
|
|
14
|
-
from .scrapers import (
|
|
15
|
-
CongressTradingScraper,
|
|
16
|
-
EUParliamentScraper,
|
|
17
|
-
PoliticianMatcher,
|
|
18
|
-
QuiverQuantScraper,
|
|
19
|
-
run_california_workflow,
|
|
20
|
-
run_eu_member_states_workflow,
|
|
21
|
-
run_uk_parliament_workflow,
|
|
22
|
-
run_us_states_workflow,
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
logger = logging.getLogger(__name__)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class PoliticianTradingWorkflow:
|
|
29
|
-
"""Main workflow for collecting politician trading data"""
|
|
30
|
-
|
|
31
|
-
def __init__(self, config: WorkflowConfig = None):
|
|
32
|
-
self.config = config or WorkflowConfig.default()
|
|
33
|
-
self.db = PoliticianTradingDB(self.config)
|
|
34
|
-
self.politicians: List[Politician] = []
|
|
35
|
-
|
|
36
|
-
async def run_full_collection(self) -> Dict[str, Any]:
|
|
37
|
-
"""Run complete data collection workflow"""
|
|
38
|
-
logger.info("Starting full politician trading data collection")
|
|
39
|
-
|
|
40
|
-
results = {
|
|
41
|
-
"started_at": datetime.utcnow().isoformat(),
|
|
42
|
-
"jobs": {},
|
|
43
|
-
"summary": {"total_new_disclosures": 0, "total_updated_disclosures": 0, "errors": []},
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
try:
|
|
47
|
-
# Ensure database schema
|
|
48
|
-
schema_ok = await self.db.ensure_schema()
|
|
49
|
-
if not schema_ok:
|
|
50
|
-
raise Exception("Database schema verification failed")
|
|
51
|
-
|
|
52
|
-
# Load existing politicians for matching
|
|
53
|
-
await self._load_politicians()
|
|
54
|
-
|
|
55
|
-
# Run US Congress collection
|
|
56
|
-
us_results = await self._collect_us_congress_data()
|
|
57
|
-
results["jobs"]["us_congress"] = us_results
|
|
58
|
-
results["summary"]["total_new_disclosures"] += us_results.get("new_disclosures", 0)
|
|
59
|
-
results["summary"]["total_updated_disclosures"] += us_results.get(
|
|
60
|
-
"updated_disclosures", 0
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
# Run EU Parliament collection
|
|
64
|
-
eu_results = await self._collect_eu_parliament_data()
|
|
65
|
-
results["jobs"]["eu_parliament"] = eu_results
|
|
66
|
-
results["summary"]["total_new_disclosures"] += eu_results.get("new_disclosures", 0)
|
|
67
|
-
results["summary"]["total_updated_disclosures"] += eu_results.get(
|
|
68
|
-
"updated_disclosures", 0
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
# Run California collection
|
|
72
|
-
ca_results = await self._collect_california_data()
|
|
73
|
-
results["jobs"]["california"] = ca_results
|
|
74
|
-
results["summary"]["total_new_disclosures"] += ca_results.get("new_disclosures", 0)
|
|
75
|
-
results["summary"]["total_updated_disclosures"] += ca_results.get(
|
|
76
|
-
"updated_disclosures", 0
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
# Run EU member states collection
|
|
80
|
-
eu_states_results = await self._collect_eu_member_states_data()
|
|
81
|
-
results["jobs"]["eu_member_states"] = eu_states_results
|
|
82
|
-
results["summary"]["total_new_disclosures"] += eu_states_results.get(
|
|
83
|
-
"new_disclosures", 0
|
|
84
|
-
)
|
|
85
|
-
results["summary"]["total_updated_disclosures"] += eu_states_results.get(
|
|
86
|
-
"updated_disclosures", 0
|
|
87
|
-
)
|
|
88
|
-
|
|
89
|
-
# Run US states collection
|
|
90
|
-
us_states_results = await self._collect_us_states_data()
|
|
91
|
-
results["jobs"]["us_states"] = us_states_results
|
|
92
|
-
results["summary"]["total_new_disclosures"] += us_states_results.get(
|
|
93
|
-
"new_disclosures", 0
|
|
94
|
-
)
|
|
95
|
-
results["summary"]["total_updated_disclosures"] += us_states_results.get(
|
|
96
|
-
"updated_disclosures", 0
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
results["completed_at"] = datetime.utcnow().isoformat()
|
|
100
|
-
results["status"] = "completed"
|
|
101
|
-
|
|
102
|
-
except Exception as e:
|
|
103
|
-
logger.error(f"Full collection workflow failed: {e}")
|
|
104
|
-
results["error"] = str(e)
|
|
105
|
-
results["status"] = "failed"
|
|
106
|
-
results["summary"]["errors"].append(str(e))
|
|
107
|
-
|
|
108
|
-
logger.info(f"Workflow completed: {results['summary']}")
|
|
109
|
-
return results
|
|
110
|
-
|
|
111
|
-
async def _load_politicians(self):
|
|
112
|
-
"""Load politicians from database for matching"""
|
|
113
|
-
try:
|
|
114
|
-
# For now, create some sample politicians
|
|
115
|
-
# In production, you'd load from a politicians API or database
|
|
116
|
-
sample_politicians = [
|
|
117
|
-
Politician(
|
|
118
|
-
id=str(uuid.uuid4()),
|
|
119
|
-
first_name="Nancy",
|
|
120
|
-
last_name="Pelosi",
|
|
121
|
-
full_name="Nancy Pelosi",
|
|
122
|
-
role=PoliticianRole.US_HOUSE_REP,
|
|
123
|
-
party="Democratic",
|
|
124
|
-
state_or_country="CA",
|
|
125
|
-
district="5",
|
|
126
|
-
bioguide_id="P000197",
|
|
127
|
-
),
|
|
128
|
-
Politician(
|
|
129
|
-
id=str(uuid.uuid4()),
|
|
130
|
-
first_name="Ted",
|
|
131
|
-
last_name="Cruz",
|
|
132
|
-
full_name="Ted Cruz",
|
|
133
|
-
role=PoliticianRole.US_SENATOR,
|
|
134
|
-
party="Republican",
|
|
135
|
-
state_or_country="TX",
|
|
136
|
-
bioguide_id="C001098",
|
|
137
|
-
),
|
|
138
|
-
]
|
|
139
|
-
|
|
140
|
-
# Store politicians in database
|
|
141
|
-
for politician in sample_politicians:
|
|
142
|
-
politician_id = await self.db.upsert_politician(politician)
|
|
143
|
-
politician.id = politician_id
|
|
144
|
-
self.politicians.append(politician)
|
|
145
|
-
|
|
146
|
-
logger.info(f"Loaded {len(self.politicians)} politicians for matching")
|
|
147
|
-
|
|
148
|
-
except Exception as e:
|
|
149
|
-
logger.error(f"Failed to load politicians: {e}")
|
|
150
|
-
self.politicians = []
|
|
151
|
-
|
|
152
|
-
async def _collect_us_congress_data(self) -> Dict[str, Any]:
|
|
153
|
-
"""Collect US Congress trading data"""
|
|
154
|
-
job_id = await self.db.create_data_pull_job(
|
|
155
|
-
"us_congress", self.config.to_serializable_dict()
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
job_result = {
|
|
159
|
-
"job_id": job_id,
|
|
160
|
-
"status": "running",
|
|
161
|
-
"new_disclosures": 0,
|
|
162
|
-
"updated_disclosures": 0,
|
|
163
|
-
"errors": [],
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
job = DataPullJob(
|
|
167
|
-
id=job_id, job_type="us_congress", status="running", started_at=datetime.utcnow()
|
|
168
|
-
)
|
|
169
|
-
|
|
170
|
-
try:
|
|
171
|
-
logger.info("Starting US Congress data collection")
|
|
172
|
-
|
|
173
|
-
# Initialize scrapers
|
|
174
|
-
congress_scraper = CongressTradingScraper(self.config.scraping)
|
|
175
|
-
quiver_scraper = QuiverQuantScraper(self.config.scraping)
|
|
176
|
-
|
|
177
|
-
all_disclosures = []
|
|
178
|
-
|
|
179
|
-
# Scrape official sources
|
|
180
|
-
async with congress_scraper:
|
|
181
|
-
house_disclosures = await congress_scraper.scrape_house_disclosures()
|
|
182
|
-
senate_disclosures = await congress_scraper.scrape_senate_disclosures()
|
|
183
|
-
all_disclosures.extend(house_disclosures)
|
|
184
|
-
all_disclosures.extend(senate_disclosures)
|
|
185
|
-
|
|
186
|
-
# Scrape backup sources
|
|
187
|
-
async with quiver_scraper:
|
|
188
|
-
quiver_trades = await quiver_scraper.scrape_congress_trades()
|
|
189
|
-
for trade_data in quiver_trades:
|
|
190
|
-
disclosure = quiver_scraper.parse_quiver_trade(trade_data)
|
|
191
|
-
if disclosure:
|
|
192
|
-
all_disclosures.append(disclosure)
|
|
193
|
-
|
|
194
|
-
job.records_found = len(all_disclosures)
|
|
195
|
-
|
|
196
|
-
# Process disclosures
|
|
197
|
-
matcher = PoliticianMatcher(self.politicians)
|
|
198
|
-
|
|
199
|
-
for disclosure in all_disclosures:
|
|
200
|
-
try:
|
|
201
|
-
# Find matching politician
|
|
202
|
-
politician_name = disclosure.raw_data.get("politician_name", "")
|
|
203
|
-
if not politician_name or politician_name.strip() == "":
|
|
204
|
-
logger.warning("Skipping disclosure with empty politician name")
|
|
205
|
-
job.records_failed += 1
|
|
206
|
-
continue
|
|
207
|
-
|
|
208
|
-
# Filter out obviously invalid politician names
|
|
209
|
-
if self._is_invalid_politician_name(politician_name):
|
|
210
|
-
logger.warning(
|
|
211
|
-
f"Skipping disclosure with invalid politician name: {politician_name}"
|
|
212
|
-
)
|
|
213
|
-
job.records_failed += 1
|
|
214
|
-
continue
|
|
215
|
-
|
|
216
|
-
politician = matcher.find_politician(politician_name)
|
|
217
|
-
|
|
218
|
-
if not politician:
|
|
219
|
-
# Create new politician with real name from scraper
|
|
220
|
-
logger.info(f"Creating new politician for: {politician_name}")
|
|
221
|
-
|
|
222
|
-
# Parse real name into first/last components
|
|
223
|
-
name_parts = politician_name.strip().split()
|
|
224
|
-
if len(name_parts) >= 2:
|
|
225
|
-
first_name = name_parts[0]
|
|
226
|
-
last_name = " ".join(name_parts[1:])
|
|
227
|
-
else:
|
|
228
|
-
first_name = politician_name.strip()
|
|
229
|
-
last_name = ""
|
|
230
|
-
|
|
231
|
-
# Create politician with real name - use generic role for now
|
|
232
|
-
new_politician = Politician(
|
|
233
|
-
first_name=first_name,
|
|
234
|
-
last_name=last_name,
|
|
235
|
-
full_name=politician_name.strip(),
|
|
236
|
-
role=PoliticianRole.US_HOUSE_REP, # Default role
|
|
237
|
-
)
|
|
238
|
-
politician_id = await self.db.upsert_politician(new_politician)
|
|
239
|
-
disclosure.politician_id = politician_id
|
|
240
|
-
else:
|
|
241
|
-
disclosure.politician_id = politician.id
|
|
242
|
-
|
|
243
|
-
# Check if disclosure already exists
|
|
244
|
-
existing = await self.db.find_disclosure_by_transaction(
|
|
245
|
-
disclosure.politician_id,
|
|
246
|
-
disclosure.transaction_date,
|
|
247
|
-
disclosure.asset_name,
|
|
248
|
-
disclosure.transaction_type.value,
|
|
249
|
-
)
|
|
250
|
-
|
|
251
|
-
if existing:
|
|
252
|
-
# Update existing record
|
|
253
|
-
disclosure.id = existing.id
|
|
254
|
-
if await self.db.update_disclosure(disclosure):
|
|
255
|
-
job.records_updated += 1
|
|
256
|
-
job_result["updated_disclosures"] += 1
|
|
257
|
-
else:
|
|
258
|
-
job.records_failed += 1
|
|
259
|
-
else:
|
|
260
|
-
# Insert new record
|
|
261
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
262
|
-
if disclosure_id:
|
|
263
|
-
job.records_new += 1
|
|
264
|
-
job_result["new_disclosures"] += 1
|
|
265
|
-
else:
|
|
266
|
-
job.records_failed += 1
|
|
267
|
-
|
|
268
|
-
job.records_processed += 1
|
|
269
|
-
|
|
270
|
-
except Exception as e:
|
|
271
|
-
logger.error(f"Failed to process disclosure: {e}")
|
|
272
|
-
job.records_failed += 1
|
|
273
|
-
job_result["errors"].append(str(e))
|
|
274
|
-
|
|
275
|
-
job.status = "completed"
|
|
276
|
-
job.completed_at = datetime.utcnow()
|
|
277
|
-
job_result["status"] = "completed"
|
|
278
|
-
|
|
279
|
-
except Exception as e:
|
|
280
|
-
logger.error(f"US Congress collection failed: {e}")
|
|
281
|
-
job.status = "failed"
|
|
282
|
-
job.error_message = str(e)
|
|
283
|
-
job.completed_at = datetime.utcnow()
|
|
284
|
-
job_result["status"] = "failed"
|
|
285
|
-
job_result["errors"].append(str(e))
|
|
286
|
-
|
|
287
|
-
# Update job status
|
|
288
|
-
await self.db.update_data_pull_job(job)
|
|
289
|
-
|
|
290
|
-
return job_result
|
|
291
|
-
|
|
292
|
-
async def _collect_eu_parliament_data(self) -> Dict[str, Any]:
|
|
293
|
-
"""Collect EU Parliament trading/financial data"""
|
|
294
|
-
job_id = await self.db.create_data_pull_job(
|
|
295
|
-
"eu_parliament", self.config.to_serializable_dict()
|
|
296
|
-
)
|
|
297
|
-
|
|
298
|
-
job_result = {
|
|
299
|
-
"job_id": job_id,
|
|
300
|
-
"status": "running",
|
|
301
|
-
"new_disclosures": 0,
|
|
302
|
-
"updated_disclosures": 0,
|
|
303
|
-
"errors": [],
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
job = DataPullJob(
|
|
307
|
-
id=job_id, job_type="eu_parliament", status="running", started_at=datetime.utcnow()
|
|
308
|
-
)
|
|
309
|
-
|
|
310
|
-
try:
|
|
311
|
-
logger.info("Starting EU Parliament data collection")
|
|
312
|
-
|
|
313
|
-
scraper = EUParliamentScraper(self.config.scraping)
|
|
314
|
-
|
|
315
|
-
async with scraper:
|
|
316
|
-
disclosures = await scraper.scrape_mep_declarations()
|
|
317
|
-
|
|
318
|
-
job.records_found = len(disclosures)
|
|
319
|
-
|
|
320
|
-
# Process EU disclosures (similar to US processing)
|
|
321
|
-
for disclosure in disclosures:
|
|
322
|
-
try:
|
|
323
|
-
# For EU, we'd need a different politician matching strategy
|
|
324
|
-
# For now, create a sample politician
|
|
325
|
-
if not disclosure.politician_id:
|
|
326
|
-
# Create placeholder politician
|
|
327
|
-
eu_politician = Politician(
|
|
328
|
-
first_name="Sample",
|
|
329
|
-
last_name="MEP",
|
|
330
|
-
full_name="Sample MEP",
|
|
331
|
-
role=PoliticianRole.EU_MEP,
|
|
332
|
-
state_or_country="EU",
|
|
333
|
-
)
|
|
334
|
-
politician_id = await self.db.upsert_politician(eu_politician)
|
|
335
|
-
disclosure.politician_id = politician_id
|
|
336
|
-
|
|
337
|
-
# Insert disclosure
|
|
338
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
339
|
-
if disclosure_id:
|
|
340
|
-
job.records_new += 1
|
|
341
|
-
job_result["new_disclosures"] += 1
|
|
342
|
-
else:
|
|
343
|
-
job.records_failed += 1
|
|
344
|
-
|
|
345
|
-
job.records_processed += 1
|
|
346
|
-
|
|
347
|
-
except Exception as e:
|
|
348
|
-
logger.error(f"Failed to process EU disclosure: {e}")
|
|
349
|
-
job.records_failed += 1
|
|
350
|
-
job_result["errors"].append(str(e))
|
|
351
|
-
|
|
352
|
-
job.status = "completed"
|
|
353
|
-
job.completed_at = datetime.utcnow()
|
|
354
|
-
job_result["status"] = "completed"
|
|
355
|
-
|
|
356
|
-
except Exception as e:
|
|
357
|
-
logger.error(f"EU Parliament collection failed: {e}")
|
|
358
|
-
job.status = "failed"
|
|
359
|
-
job.error_message = str(e)
|
|
360
|
-
job.completed_at = datetime.utcnow()
|
|
361
|
-
job_result["status"] = "failed"
|
|
362
|
-
job_result["errors"].append(str(e))
|
|
363
|
-
|
|
364
|
-
# Update job status
|
|
365
|
-
await self.db.update_data_pull_job(job)
|
|
366
|
-
|
|
367
|
-
return job_result
|
|
368
|
-
|
|
369
|
-
async def _collect_uk_parliament_data(self) -> Dict[str, Any]:
|
|
370
|
-
"""Collect UK Parliament financial interests data"""
|
|
371
|
-
job_id = await self.db.create_data_pull_job(
|
|
372
|
-
"uk_parliament", self.config.to_serializable_dict()
|
|
373
|
-
)
|
|
374
|
-
|
|
375
|
-
job_result = {
|
|
376
|
-
"job_id": job_id,
|
|
377
|
-
"status": "running",
|
|
378
|
-
"new_disclosures": 0,
|
|
379
|
-
"updated_disclosures": 0,
|
|
380
|
-
"errors": [],
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
job = DataPullJob(
|
|
384
|
-
id=job_id,
|
|
385
|
-
job_type="uk_parliament",
|
|
386
|
-
status="running",
|
|
387
|
-
started_at=datetime.utcnow(),
|
|
388
|
-
)
|
|
389
|
-
|
|
390
|
-
try:
|
|
391
|
-
# Collect UK Parliament financial interests
|
|
392
|
-
logger.info("Starting UK Parliament financial interests collection")
|
|
393
|
-
uk_disclosures = await run_uk_parliament_workflow(self.config.scraping)
|
|
394
|
-
|
|
395
|
-
job.records_found = len(uk_disclosures)
|
|
396
|
-
|
|
397
|
-
# Process each disclosure
|
|
398
|
-
matcher = PoliticianMatcher(self.politicians)
|
|
399
|
-
|
|
400
|
-
for disclosure in uk_disclosures:
|
|
401
|
-
try:
|
|
402
|
-
# For UK Parliament, find or create politician using real names from scrapers
|
|
403
|
-
if not disclosure.politician_id:
|
|
404
|
-
# Extract real politician name from raw data
|
|
405
|
-
politician_name = disclosure.raw_data.get("politician_name", "")
|
|
406
|
-
|
|
407
|
-
if not politician_name or politician_name.strip() == "":
|
|
408
|
-
# Fallback to using member ID if no name available
|
|
409
|
-
if disclosure.raw_data.get("uk_member_id"):
|
|
410
|
-
logger.warning(
|
|
411
|
-
f"Using member ID as fallback for UK disclosure: {disclosure.raw_data.get('uk_member_id')}"
|
|
412
|
-
)
|
|
413
|
-
uk_politician = Politician(
|
|
414
|
-
first_name="UK",
|
|
415
|
-
last_name="MP",
|
|
416
|
-
full_name=f"UK MP {disclosure.raw_data.get('uk_member_id')}",
|
|
417
|
-
role=PoliticianRole.UK_MP,
|
|
418
|
-
state_or_country="UK",
|
|
419
|
-
)
|
|
420
|
-
politician_id = await self.db.upsert_politician(uk_politician)
|
|
421
|
-
disclosure.politician_id = politician_id
|
|
422
|
-
else:
|
|
423
|
-
logger.warning(
|
|
424
|
-
"Skipping UK disclosure with no politician name or member ID"
|
|
425
|
-
)
|
|
426
|
-
job.records_failed += 1
|
|
427
|
-
continue
|
|
428
|
-
else:
|
|
429
|
-
# Filter out obviously invalid politician names
|
|
430
|
-
if self._is_invalid_politician_name(politician_name):
|
|
431
|
-
logger.warning(
|
|
432
|
-
f"Skipping UK disclosure with invalid politician name: {politician_name}"
|
|
433
|
-
)
|
|
434
|
-
job.records_failed += 1
|
|
435
|
-
continue
|
|
436
|
-
|
|
437
|
-
# Try to find existing politician
|
|
438
|
-
politician = matcher.find_politician(politician_name)
|
|
439
|
-
|
|
440
|
-
if not politician:
|
|
441
|
-
# Create new politician with real name from scraper
|
|
442
|
-
# Parse real name into first/last components
|
|
443
|
-
name_parts = politician_name.strip().split()
|
|
444
|
-
if len(name_parts) >= 2:
|
|
445
|
-
first_name = name_parts[0]
|
|
446
|
-
last_name = " ".join(name_parts[1:])
|
|
447
|
-
else:
|
|
448
|
-
first_name = politician_name.strip()
|
|
449
|
-
last_name = ""
|
|
450
|
-
|
|
451
|
-
# Create politician with REAL name
|
|
452
|
-
uk_politician = Politician(
|
|
453
|
-
first_name=first_name,
|
|
454
|
-
last_name=last_name,
|
|
455
|
-
full_name=politician_name.strip(),
|
|
456
|
-
role=PoliticianRole.UK_MP,
|
|
457
|
-
state_or_country="UK",
|
|
458
|
-
)
|
|
459
|
-
politician_id = await self.db.upsert_politician(uk_politician)
|
|
460
|
-
disclosure.politician_id = politician_id
|
|
461
|
-
logger.info(f"Created new UK MP: {politician_name}")
|
|
462
|
-
else:
|
|
463
|
-
disclosure.politician_id = politician.id
|
|
464
|
-
|
|
465
|
-
# Insert disclosure
|
|
466
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
467
|
-
if disclosure_id:
|
|
468
|
-
job.records_new += 1
|
|
469
|
-
job_result["new_disclosures"] += 1
|
|
470
|
-
else:
|
|
471
|
-
job.records_failed += 1
|
|
472
|
-
|
|
473
|
-
job.records_processed += 1
|
|
474
|
-
|
|
475
|
-
except Exception as e:
|
|
476
|
-
logger.error(f"Failed to process UK Parliament disclosure: {e}")
|
|
477
|
-
job.records_failed += 1
|
|
478
|
-
job_result["errors"].append(str(e))
|
|
479
|
-
|
|
480
|
-
job.status = "completed"
|
|
481
|
-
job.completed_at = datetime.utcnow()
|
|
482
|
-
job_result["status"] = "completed"
|
|
483
|
-
|
|
484
|
-
except Exception as e:
|
|
485
|
-
logger.error(f"UK Parliament collection failed: {e}")
|
|
486
|
-
job.status = "failed"
|
|
487
|
-
job.error_message = str(e)
|
|
488
|
-
job.completed_at = datetime.utcnow()
|
|
489
|
-
job_result["status"] = "failed"
|
|
490
|
-
job_result["errors"].append(str(e))
|
|
491
|
-
|
|
492
|
-
# Update job status
|
|
493
|
-
await self.db.update_data_pull_job(job)
|
|
494
|
-
|
|
495
|
-
return job_result
|
|
496
|
-
|
|
497
|
-
async def _collect_california_data(self) -> Dict[str, Any]:
|
|
498
|
-
"""Collect California NetFile and state disclosure data"""
|
|
499
|
-
job_id = await self.db.create_data_pull_job(
|
|
500
|
-
"california", self.config.to_serializable_dict()
|
|
501
|
-
)
|
|
502
|
-
|
|
503
|
-
job_result = {
|
|
504
|
-
"job_id": job_id,
|
|
505
|
-
"status": "running",
|
|
506
|
-
"new_disclosures": 0,
|
|
507
|
-
"updated_disclosures": 0,
|
|
508
|
-
"errors": [],
|
|
509
|
-
}
|
|
510
|
-
|
|
511
|
-
job = DataPullJob(
|
|
512
|
-
id=job_id,
|
|
513
|
-
job_type="california",
|
|
514
|
-
status="running",
|
|
515
|
-
started_at=datetime.utcnow(),
|
|
516
|
-
)
|
|
517
|
-
|
|
518
|
-
try:
|
|
519
|
-
# Collect California financial disclosures
|
|
520
|
-
logger.info("Starting California financial disclosures collection")
|
|
521
|
-
california_disclosures = await run_california_workflow(self.config.scraping)
|
|
522
|
-
|
|
523
|
-
job.records_found = len(california_disclosures)
|
|
524
|
-
|
|
525
|
-
# Process each disclosure
|
|
526
|
-
matcher = PoliticianMatcher(self.politicians)
|
|
527
|
-
|
|
528
|
-
for disclosure in california_disclosures:
|
|
529
|
-
try:
|
|
530
|
-
# For California, create politician if needed
|
|
531
|
-
if not disclosure.politician_id:
|
|
532
|
-
# Extract politician name from raw data or create placeholder
|
|
533
|
-
politician_name = disclosure.raw_data.get("politician_name", "")
|
|
534
|
-
if not politician_name:
|
|
535
|
-
# Create placeholder for California politician
|
|
536
|
-
ca_politician = Politician(
|
|
537
|
-
first_name="California",
|
|
538
|
-
last_name="Politician",
|
|
539
|
-
full_name=f"California Politician {disclosure.raw_data.get('jurisdiction', 'Unknown')}",
|
|
540
|
-
role=PoliticianRole.US_HOUSE_REP, # Could be state-level role
|
|
541
|
-
state_or_country="CA",
|
|
542
|
-
)
|
|
543
|
-
politician_id = await self.db.upsert_politician(ca_politician)
|
|
544
|
-
disclosure.politician_id = politician_id
|
|
545
|
-
|
|
546
|
-
# Insert disclosure
|
|
547
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
548
|
-
if disclosure_id:
|
|
549
|
-
job.records_new += 1
|
|
550
|
-
job_result["new_disclosures"] += 1
|
|
551
|
-
else:
|
|
552
|
-
job.records_failed += 1
|
|
553
|
-
|
|
554
|
-
job.records_processed += 1
|
|
555
|
-
|
|
556
|
-
except Exception as e:
|
|
557
|
-
logger.error(f"Failed to process California disclosure: {e}")
|
|
558
|
-
job.records_failed += 1
|
|
559
|
-
job_result["errors"].append(str(e))
|
|
560
|
-
|
|
561
|
-
job.status = "completed"
|
|
562
|
-
job.completed_at = datetime.utcnow()
|
|
563
|
-
job_result["status"] = "completed"
|
|
564
|
-
|
|
565
|
-
except Exception as e:
|
|
566
|
-
logger.error(f"California collection failed: {e}")
|
|
567
|
-
job.status = "failed"
|
|
568
|
-
job.error_message = str(e)
|
|
569
|
-
job.completed_at = datetime.utcnow()
|
|
570
|
-
job_result["status"] = "failed"
|
|
571
|
-
job_result["errors"].append(str(e))
|
|
572
|
-
|
|
573
|
-
# Update job status
|
|
574
|
-
await self.db.update_data_pull_job(job)
|
|
575
|
-
|
|
576
|
-
return job_result
|
|
577
|
-
|
|
578
|
-
async def _collect_eu_member_states_data(self) -> Dict[str, Any]:
|
|
579
|
-
"""Collect EU member states financial disclosure data"""
|
|
580
|
-
job_id = await self.db.create_data_pull_job(
|
|
581
|
-
"eu_member_states", self.config.to_serializable_dict()
|
|
582
|
-
)
|
|
583
|
-
|
|
584
|
-
job_result = {
|
|
585
|
-
"job_id": job_id,
|
|
586
|
-
"status": "running",
|
|
587
|
-
"new_disclosures": 0,
|
|
588
|
-
"updated_disclosures": 0,
|
|
589
|
-
"errors": [],
|
|
590
|
-
}
|
|
591
|
-
|
|
592
|
-
job = DataPullJob(
|
|
593
|
-
id=job_id,
|
|
594
|
-
job_type="eu_member_states",
|
|
595
|
-
status="running",
|
|
596
|
-
started_at=datetime.utcnow(),
|
|
597
|
-
)
|
|
598
|
-
|
|
599
|
-
try:
|
|
600
|
-
# Collect EU member states financial disclosures
|
|
601
|
-
logger.info("Starting EU member states financial disclosures collection")
|
|
602
|
-
eu_states_disclosures = await run_eu_member_states_workflow(self.config.scraping)
|
|
603
|
-
|
|
604
|
-
job.records_found = len(eu_states_disclosures)
|
|
605
|
-
|
|
606
|
-
# Process each disclosure
|
|
607
|
-
matcher = PoliticianMatcher(self.politicians)
|
|
608
|
-
|
|
609
|
-
for disclosure in eu_states_disclosures:
|
|
610
|
-
try:
|
|
611
|
-
# For EU member states, create politician if needed
|
|
612
|
-
if not disclosure.politician_id:
|
|
613
|
-
# Extract politician details from raw data
|
|
614
|
-
country = disclosure.raw_data.get("country", "Unknown")
|
|
615
|
-
source = disclosure.raw_data.get("source", "unknown")
|
|
616
|
-
|
|
617
|
-
# Map country to appropriate role
|
|
618
|
-
role_map = {
|
|
619
|
-
"Germany": PoliticianRole.GERMAN_BUNDESTAG,
|
|
620
|
-
"France": PoliticianRole.FRENCH_DEPUTY,
|
|
621
|
-
"Italy": PoliticianRole.ITALIAN_DEPUTY,
|
|
622
|
-
"Spain": PoliticianRole.SPANISH_DEPUTY,
|
|
623
|
-
"Netherlands": PoliticianRole.DUTCH_MP,
|
|
624
|
-
}
|
|
625
|
-
|
|
626
|
-
politician_role = role_map.get(country, PoliticianRole.EU_MEP)
|
|
627
|
-
|
|
628
|
-
# Create placeholder politician
|
|
629
|
-
eu_politician = Politician(
|
|
630
|
-
first_name=country,
|
|
631
|
-
last_name="Politician",
|
|
632
|
-
full_name=f"{country} Politician ({source})",
|
|
633
|
-
role=politician_role,
|
|
634
|
-
state_or_country=country,
|
|
635
|
-
)
|
|
636
|
-
politician_id = await self.db.upsert_politician(eu_politician)
|
|
637
|
-
disclosure.politician_id = politician_id
|
|
638
|
-
|
|
639
|
-
# Insert disclosure
|
|
640
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
641
|
-
if disclosure_id:
|
|
642
|
-
job.records_new += 1
|
|
643
|
-
job_result["new_disclosures"] += 1
|
|
644
|
-
else:
|
|
645
|
-
job.records_failed += 1
|
|
646
|
-
|
|
647
|
-
job.records_processed += 1
|
|
648
|
-
|
|
649
|
-
except Exception as e:
|
|
650
|
-
logger.error(f"Failed to process EU member state disclosure: {e}")
|
|
651
|
-
job.records_failed += 1
|
|
652
|
-
job_result["errors"].append(str(e))
|
|
653
|
-
|
|
654
|
-
job.status = "completed"
|
|
655
|
-
job.completed_at = datetime.utcnow()
|
|
656
|
-
job_result["status"] = "completed"
|
|
657
|
-
|
|
658
|
-
except Exception as e:
|
|
659
|
-
logger.error(f"EU member states collection failed: {e}")
|
|
660
|
-
job.status = "failed"
|
|
661
|
-
job.error_message = str(e)
|
|
662
|
-
job.completed_at = datetime.utcnow()
|
|
663
|
-
job_result["status"] = "failed"
|
|
664
|
-
job_result["errors"].append(str(e))
|
|
665
|
-
|
|
666
|
-
# Update job status
|
|
667
|
-
await self.db.update_data_pull_job(job)
|
|
668
|
-
|
|
669
|
-
return job_result
|
|
670
|
-
|
|
671
|
-
async def _collect_us_states_data(self) -> Dict[str, Any]:
|
|
672
|
-
"""Collect US states financial disclosure data"""
|
|
673
|
-
job_id = await self.db.create_data_pull_job("us_states", self.config.to_serializable_dict())
|
|
674
|
-
|
|
675
|
-
job_result = {
|
|
676
|
-
"job_id": job_id,
|
|
677
|
-
"status": "running",
|
|
678
|
-
"new_disclosures": 0,
|
|
679
|
-
"updated_disclosures": 0,
|
|
680
|
-
"errors": [],
|
|
681
|
-
}
|
|
682
|
-
|
|
683
|
-
job = DataPullJob(
|
|
684
|
-
id=job_id,
|
|
685
|
-
job_type="us_states",
|
|
686
|
-
status="running",
|
|
687
|
-
started_at=datetime.utcnow(),
|
|
688
|
-
)
|
|
689
|
-
|
|
690
|
-
try:
|
|
691
|
-
# Collect US states financial disclosures
|
|
692
|
-
logger.info("Starting US states financial disclosures collection")
|
|
693
|
-
us_states_disclosures = await run_us_states_workflow(self.config.scraping)
|
|
694
|
-
|
|
695
|
-
job.records_found = len(us_states_disclosures)
|
|
696
|
-
|
|
697
|
-
# Process each disclosure
|
|
698
|
-
matcher = PoliticianMatcher(self.politicians)
|
|
699
|
-
|
|
700
|
-
for disclosure in us_states_disclosures:
|
|
701
|
-
try:
|
|
702
|
-
# For US states, find or create politician using real names from scrapers
|
|
703
|
-
if not disclosure.politician_id:
|
|
704
|
-
# Extract real politician name from raw data
|
|
705
|
-
politician_name = disclosure.raw_data.get("politician_name", "")
|
|
706
|
-
if not politician_name or politician_name.strip() == "":
|
|
707
|
-
logger.warning(
|
|
708
|
-
"Skipping US states disclosure with empty politician name"
|
|
709
|
-
)
|
|
710
|
-
job.records_failed += 1
|
|
711
|
-
continue
|
|
712
|
-
|
|
713
|
-
# Filter out obviously invalid politician names
|
|
714
|
-
if self._is_invalid_politician_name(politician_name):
|
|
715
|
-
logger.warning(
|
|
716
|
-
f"Skipping US states disclosure with invalid politician name: {politician_name}"
|
|
717
|
-
)
|
|
718
|
-
job.records_failed += 1
|
|
719
|
-
continue
|
|
720
|
-
|
|
721
|
-
# Try to find existing politician
|
|
722
|
-
politician = matcher.find_politician(politician_name)
|
|
723
|
-
|
|
724
|
-
if not politician:
|
|
725
|
-
# Create new politician with real name from scraper
|
|
726
|
-
state = disclosure.raw_data.get("state", "Unknown")
|
|
727
|
-
source = disclosure.raw_data.get("source", "unknown")
|
|
728
|
-
|
|
729
|
-
# Map state to appropriate role
|
|
730
|
-
role_map = {
|
|
731
|
-
"Texas": PoliticianRole.TEXAS_STATE_OFFICIAL,
|
|
732
|
-
"New York": PoliticianRole.NEW_YORK_STATE_OFFICIAL,
|
|
733
|
-
"Florida": PoliticianRole.FLORIDA_STATE_OFFICIAL,
|
|
734
|
-
"Illinois": PoliticianRole.ILLINOIS_STATE_OFFICIAL,
|
|
735
|
-
"Pennsylvania": PoliticianRole.PENNSYLVANIA_STATE_OFFICIAL,
|
|
736
|
-
"Massachusetts": PoliticianRole.MASSACHUSETTS_STATE_OFFICIAL,
|
|
737
|
-
"California": PoliticianRole.CALIFORNIA_STATE_OFFICIAL,
|
|
738
|
-
}
|
|
739
|
-
|
|
740
|
-
politician_role = role_map.get(state, PoliticianRole.US_HOUSE_REP)
|
|
741
|
-
|
|
742
|
-
# Parse real name into first/last components
|
|
743
|
-
name_parts = politician_name.strip().split()
|
|
744
|
-
if len(name_parts) >= 2:
|
|
745
|
-
first_name = name_parts[0]
|
|
746
|
-
last_name = " ".join(name_parts[1:])
|
|
747
|
-
else:
|
|
748
|
-
first_name = politician_name.strip()
|
|
749
|
-
last_name = ""
|
|
750
|
-
|
|
751
|
-
# Create politician with REAL name
|
|
752
|
-
state_politician = Politician(
|
|
753
|
-
first_name=first_name,
|
|
754
|
-
last_name=last_name,
|
|
755
|
-
full_name=politician_name.strip(),
|
|
756
|
-
role=politician_role,
|
|
757
|
-
state_or_country=state,
|
|
758
|
-
)
|
|
759
|
-
politician_id = await self.db.upsert_politician(state_politician)
|
|
760
|
-
disclosure.politician_id = politician_id
|
|
761
|
-
logger.info(f"Created new US state politician: {politician_name}")
|
|
762
|
-
else:
|
|
763
|
-
disclosure.politician_id = politician.id
|
|
764
|
-
|
|
765
|
-
# Insert disclosure
|
|
766
|
-
disclosure_id = await self.db.insert_disclosure(disclosure)
|
|
767
|
-
if disclosure_id:
|
|
768
|
-
job.records_new += 1
|
|
769
|
-
job_result["new_disclosures"] += 1
|
|
770
|
-
else:
|
|
771
|
-
job.records_failed += 1
|
|
772
|
-
|
|
773
|
-
job.records_processed += 1
|
|
774
|
-
|
|
775
|
-
except Exception as e:
|
|
776
|
-
logger.error(f"Failed to process US state disclosure: {e}")
|
|
777
|
-
job.records_failed += 1
|
|
778
|
-
job_result["errors"].append(str(e))
|
|
779
|
-
|
|
780
|
-
job.status = "completed"
|
|
781
|
-
job.completed_at = datetime.utcnow()
|
|
782
|
-
job_result["status"] = "completed"
|
|
783
|
-
|
|
784
|
-
except Exception as e:
|
|
785
|
-
logger.error(f"US states collection failed: {e}")
|
|
786
|
-
job.status = "failed"
|
|
787
|
-
job.error_message = str(e)
|
|
788
|
-
job.completed_at = datetime.utcnow()
|
|
789
|
-
job_result["status"] = "failed"
|
|
790
|
-
job_result["errors"].append(str(e))
|
|
791
|
-
|
|
792
|
-
# Update job status
|
|
793
|
-
await self.db.update_data_pull_job(job)
|
|
794
|
-
|
|
795
|
-
return job_result
|
|
796
|
-
|
|
797
|
-
async def get_status(self) -> Dict[str, Any]:
    """Query the database layer for the current workflow status.

    Returns the job-status payload from the database, or a dict with a
    single "error" key if the lookup raises.
    """
    try:
        status = await self.db.get_job_status()
    except Exception as e:
        logger.error(f"Failed to get status: {e}")
        return {"error": str(e)}
    return status
async def run_quick_check(self) -> Dict[str, Any]:
    """Lightweight health check that avoids full data collection.

    Fetches the current status and augments it with database-connection,
    config, and timestamp fields. On any failure, returns an error
    payload carrying a timestamp and a "failed" status marker.
    """
    try:
        result = await self.get_status()
        # Cheap supplemental checks layered on top of the status payload.
        result.update(
            database_connection="ok" if self.db.client else "failed",
            config_loaded="ok" if self.config else "failed",
            timestamp=datetime.utcnow().isoformat(),
        )
        return result
    except Exception as e:
        return {"error": str(e), "timestamp": datetime.utcnow().isoformat(), "status": "failed"}
def _is_invalid_politician_name(self, name: str) -> bool:
|
|
821
|
-
"""Check if a name is obviously not a politician name"""
|
|
822
|
-
if not name or len(name.strip()) < 2:
|
|
823
|
-
return True
|
|
824
|
-
|
|
825
|
-
# Check for proper name structure first (before converting to uppercase)
|
|
826
|
-
original_name = name.strip()
|
|
827
|
-
import re
|
|
828
|
-
|
|
829
|
-
if not re.search(r"[A-Za-z]", original_name): # Should have at least one letter
|
|
830
|
-
return True
|
|
831
|
-
if re.search(r"^\d+", original_name): # Starting with numbers
|
|
832
|
-
return True
|
|
833
|
-
|
|
834
|
-
# Now convert to uppercase for pattern matching
|
|
835
|
-
name = original_name.upper()
|
|
836
|
-
|
|
837
|
-
# Filter out obvious non-names
|
|
838
|
-
invalid_patterns = [
|
|
839
|
-
# Asset tickers and financial instruments
|
|
840
|
-
r"^-.*CT$", # -ETHEREUMCT, -DOGCT patterns
|
|
841
|
-
r"^[A-Z]{2,5}$", # Short all-caps (likely tickers)
|
|
842
|
-
r"^\$", # Starting with $
|
|
843
|
-
# Municipal and financial terms
|
|
844
|
-
r"MUNICIPAL",
|
|
845
|
-
r"BOND",
|
|
846
|
-
r"TRUST",
|
|
847
|
-
r"FUND",
|
|
848
|
-
r"CORP",
|
|
849
|
-
r"INC\.$",
|
|
850
|
-
r"LLC$",
|
|
851
|
-
r"LP$",
|
|
852
|
-
# Common non-name patterns
|
|
853
|
-
r"^UNKNOWN",
|
|
854
|
-
r"^TEST",
|
|
855
|
-
r"^SAMPLE",
|
|
856
|
-
# Crypto/financial asset patterns
|
|
857
|
-
r"ETHEREUM",
|
|
858
|
-
r"BITCOIN",
|
|
859
|
-
r"CRYPTO",
|
|
860
|
-
]
|
|
861
|
-
|
|
862
|
-
for pattern in invalid_patterns:
|
|
863
|
-
if re.search(pattern, name):
|
|
864
|
-
return True
|
|
865
|
-
|
|
866
|
-
return False
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
# Standalone functions for cron job usage
async def run_politician_trading_collection() -> Dict[str, Any]:
    """Cron entry point: run the full politician-trading data collection.

    Builds a fresh workflow instance and delegates to its
    run_full_collection coroutine, returning its result dict.
    """
    return await PoliticianTradingWorkflow().run_full_collection()
876
|
-
async def check_politician_trading_status() -> Dict[str, Any]:
    """Cron entry point: run a quick workflow health/status check.

    Builds a fresh workflow instance and delegates to its
    run_quick_check coroutine, returning its result dict.
    """
    return await PoliticianTradingWorkflow().run_quick_check()
|