mcli-framework 7.10.1__py3-none-any.whl → 7.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic; see the release advisory for more details.

Files changed (43)
  1. mcli/app/commands_cmd.py +150 -58
  2. mcli/app/main.py +21 -27
  3. mcli/lib/custom_commands.py +62 -12
  4. mcli/lib/optional_deps.py +240 -0
  5. mcli/lib/paths.py +129 -5
  6. mcli/self/migrate_cmd.py +261 -0
  7. mcli/self/self_cmd.py +8 -0
  8. mcli/workflow/git_commit/ai_service.py +13 -2
  9. mcli/workflow/notebook/__init__.py +16 -0
  10. mcli/workflow/notebook/converter.py +375 -0
  11. mcli/workflow/notebook/notebook_cmd.py +441 -0
  12. mcli/workflow/notebook/schema.py +402 -0
  13. mcli/workflow/notebook/validator.py +313 -0
  14. mcli/workflow/secrets/__init__.py +4 -0
  15. mcli/workflow/secrets/secrets_cmd.py +192 -0
  16. mcli/workflow/workflow.py +35 -5
  17. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/METADATA +86 -55
  18. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/RECORD +22 -34
  19. mcli/ml/features/political_features.py +0 -677
  20. mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
  21. mcli/workflow/politician_trading/__init__.py +0 -4
  22. mcli/workflow/politician_trading/config.py +0 -134
  23. mcli/workflow/politician_trading/connectivity.py +0 -492
  24. mcli/workflow/politician_trading/data_sources.py +0 -654
  25. mcli/workflow/politician_trading/database.py +0 -412
  26. mcli/workflow/politician_trading/demo.py +0 -249
  27. mcli/workflow/politician_trading/models.py +0 -327
  28. mcli/workflow/politician_trading/monitoring.py +0 -413
  29. mcli/workflow/politician_trading/scrapers.py +0 -1074
  30. mcli/workflow/politician_trading/scrapers_california.py +0 -434
  31. mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
  32. mcli/workflow/politician_trading/scrapers_eu.py +0 -376
  33. mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
  34. mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
  35. mcli/workflow/politician_trading/scrapers_uk.py +0 -378
  36. mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
  37. mcli/workflow/politician_trading/seed_database.py +0 -520
  38. mcli/workflow/politician_trading/supabase_functions.py +0 -354
  39. mcli/workflow/politician_trading/workflow.py +0 -879
  40. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/WHEEL +0 -0
  41. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/entry_points.txt +0 -0
  42. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/licenses/LICENSE +0 -0
  43. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/top_level.txt +0 -0
@@ -1,879 +0,0 @@
1
- """
2
- Main workflow orchestrator for politician trading data collection
3
- """
4
-
5
- import asyncio
6
- import logging
7
- import uuid
8
- from datetime import datetime, timedelta
9
- from typing import Any, Dict, List, Optional
10
-
11
- from .config import WorkflowConfig
12
- from .database import PoliticianTradingDB
13
- from .models import DataPullJob, Politician, PoliticianRole, TradingDisclosure
14
- from .scrapers import (
15
- CongressTradingScraper,
16
- EUParliamentScraper,
17
- PoliticianMatcher,
18
- QuiverQuantScraper,
19
- run_california_workflow,
20
- run_eu_member_states_workflow,
21
- run_uk_parliament_workflow,
22
- run_us_states_workflow,
23
- )
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
-
28
- class PoliticianTradingWorkflow:
29
- """Main workflow for collecting politician trading data"""
30
-
31
- def __init__(self, config: WorkflowConfig = None):
32
- self.config = config or WorkflowConfig.default()
33
- self.db = PoliticianTradingDB(self.config)
34
- self.politicians: List[Politician] = []
35
-
36
- async def run_full_collection(self) -> Dict[str, Any]:
37
- """Run complete data collection workflow"""
38
- logger.info("Starting full politician trading data collection")
39
-
40
- results = {
41
- "started_at": datetime.utcnow().isoformat(),
42
- "jobs": {},
43
- "summary": {"total_new_disclosures": 0, "total_updated_disclosures": 0, "errors": []},
44
- }
45
-
46
- try:
47
- # Ensure database schema
48
- schema_ok = await self.db.ensure_schema()
49
- if not schema_ok:
50
- raise Exception("Database schema verification failed")
51
-
52
- # Load existing politicians for matching
53
- await self._load_politicians()
54
-
55
- # Run US Congress collection
56
- us_results = await self._collect_us_congress_data()
57
- results["jobs"]["us_congress"] = us_results
58
- results["summary"]["total_new_disclosures"] += us_results.get("new_disclosures", 0)
59
- results["summary"]["total_updated_disclosures"] += us_results.get(
60
- "updated_disclosures", 0
61
- )
62
-
63
- # Run EU Parliament collection
64
- eu_results = await self._collect_eu_parliament_data()
65
- results["jobs"]["eu_parliament"] = eu_results
66
- results["summary"]["total_new_disclosures"] += eu_results.get("new_disclosures", 0)
67
- results["summary"]["total_updated_disclosures"] += eu_results.get(
68
- "updated_disclosures", 0
69
- )
70
-
71
- # Run California collection
72
- ca_results = await self._collect_california_data()
73
- results["jobs"]["california"] = ca_results
74
- results["summary"]["total_new_disclosures"] += ca_results.get("new_disclosures", 0)
75
- results["summary"]["total_updated_disclosures"] += ca_results.get(
76
- "updated_disclosures", 0
77
- )
78
-
79
- # Run EU member states collection
80
- eu_states_results = await self._collect_eu_member_states_data()
81
- results["jobs"]["eu_member_states"] = eu_states_results
82
- results["summary"]["total_new_disclosures"] += eu_states_results.get(
83
- "new_disclosures", 0
84
- )
85
- results["summary"]["total_updated_disclosures"] += eu_states_results.get(
86
- "updated_disclosures", 0
87
- )
88
-
89
- # Run US states collection
90
- us_states_results = await self._collect_us_states_data()
91
- results["jobs"]["us_states"] = us_states_results
92
- results["summary"]["total_new_disclosures"] += us_states_results.get(
93
- "new_disclosures", 0
94
- )
95
- results["summary"]["total_updated_disclosures"] += us_states_results.get(
96
- "updated_disclosures", 0
97
- )
98
-
99
- results["completed_at"] = datetime.utcnow().isoformat()
100
- results["status"] = "completed"
101
-
102
- except Exception as e:
103
- logger.error(f"Full collection workflow failed: {e}")
104
- results["error"] = str(e)
105
- results["status"] = "failed"
106
- results["summary"]["errors"].append(str(e))
107
-
108
- logger.info(f"Workflow completed: {results['summary']}")
109
- return results
110
-
111
- async def _load_politicians(self):
112
- """Load politicians from database for matching"""
113
- try:
114
- # For now, create some sample politicians
115
- # In production, you'd load from a politicians API or database
116
- sample_politicians = [
117
- Politician(
118
- id=str(uuid.uuid4()),
119
- first_name="Nancy",
120
- last_name="Pelosi",
121
- full_name="Nancy Pelosi",
122
- role=PoliticianRole.US_HOUSE_REP,
123
- party="Democratic",
124
- state_or_country="CA",
125
- district="5",
126
- bioguide_id="P000197",
127
- ),
128
- Politician(
129
- id=str(uuid.uuid4()),
130
- first_name="Ted",
131
- last_name="Cruz",
132
- full_name="Ted Cruz",
133
- role=PoliticianRole.US_SENATOR,
134
- party="Republican",
135
- state_or_country="TX",
136
- bioguide_id="C001098",
137
- ),
138
- ]
139
-
140
- # Store politicians in database
141
- for politician in sample_politicians:
142
- politician_id = await self.db.upsert_politician(politician)
143
- politician.id = politician_id
144
- self.politicians.append(politician)
145
-
146
- logger.info(f"Loaded {len(self.politicians)} politicians for matching")
147
-
148
- except Exception as e:
149
- logger.error(f"Failed to load politicians: {e}")
150
- self.politicians = []
151
-
152
- async def _collect_us_congress_data(self) -> Dict[str, Any]:
153
- """Collect US Congress trading data"""
154
- job_id = await self.db.create_data_pull_job(
155
- "us_congress", self.config.to_serializable_dict()
156
- )
157
-
158
- job_result = {
159
- "job_id": job_id,
160
- "status": "running",
161
- "new_disclosures": 0,
162
- "updated_disclosures": 0,
163
- "errors": [],
164
- }
165
-
166
- job = DataPullJob(
167
- id=job_id, job_type="us_congress", status="running", started_at=datetime.utcnow()
168
- )
169
-
170
- try:
171
- logger.info("Starting US Congress data collection")
172
-
173
- # Initialize scrapers
174
- congress_scraper = CongressTradingScraper(self.config.scraping)
175
- quiver_scraper = QuiverQuantScraper(self.config.scraping)
176
-
177
- all_disclosures = []
178
-
179
- # Scrape official sources
180
- async with congress_scraper:
181
- house_disclosures = await congress_scraper.scrape_house_disclosures()
182
- senate_disclosures = await congress_scraper.scrape_senate_disclosures()
183
- all_disclosures.extend(house_disclosures)
184
- all_disclosures.extend(senate_disclosures)
185
-
186
- # Scrape backup sources
187
- async with quiver_scraper:
188
- quiver_trades = await quiver_scraper.scrape_congress_trades()
189
- for trade_data in quiver_trades:
190
- disclosure = quiver_scraper.parse_quiver_trade(trade_data)
191
- if disclosure:
192
- all_disclosures.append(disclosure)
193
-
194
- job.records_found = len(all_disclosures)
195
-
196
- # Process disclosures
197
- matcher = PoliticianMatcher(self.politicians)
198
-
199
- for disclosure in all_disclosures:
200
- try:
201
- # Find matching politician
202
- politician_name = disclosure.raw_data.get("politician_name", "")
203
- if not politician_name or politician_name.strip() == "":
204
- logger.warning("Skipping disclosure with empty politician name")
205
- job.records_failed += 1
206
- continue
207
-
208
- # Filter out obviously invalid politician names
209
- if self._is_invalid_politician_name(politician_name):
210
- logger.warning(
211
- f"Skipping disclosure with invalid politician name: {politician_name}"
212
- )
213
- job.records_failed += 1
214
- continue
215
-
216
- politician = matcher.find_politician(politician_name)
217
-
218
- if not politician:
219
- # Create new politician with real name from scraper
220
- logger.info(f"Creating new politician for: {politician_name}")
221
-
222
- # Parse real name into first/last components
223
- name_parts = politician_name.strip().split()
224
- if len(name_parts) >= 2:
225
- first_name = name_parts[0]
226
- last_name = " ".join(name_parts[1:])
227
- else:
228
- first_name = politician_name.strip()
229
- last_name = ""
230
-
231
- # Create politician with real name - use generic role for now
232
- new_politician = Politician(
233
- first_name=first_name,
234
- last_name=last_name,
235
- full_name=politician_name.strip(),
236
- role=PoliticianRole.US_HOUSE_REP, # Default role
237
- )
238
- politician_id = await self.db.upsert_politician(new_politician)
239
- disclosure.politician_id = politician_id
240
- else:
241
- disclosure.politician_id = politician.id
242
-
243
- # Check if disclosure already exists
244
- existing = await self.db.find_disclosure_by_transaction(
245
- disclosure.politician_id,
246
- disclosure.transaction_date,
247
- disclosure.asset_name,
248
- disclosure.transaction_type.value,
249
- )
250
-
251
- if existing:
252
- # Update existing record
253
- disclosure.id = existing.id
254
- if await self.db.update_disclosure(disclosure):
255
- job.records_updated += 1
256
- job_result["updated_disclosures"] += 1
257
- else:
258
- job.records_failed += 1
259
- else:
260
- # Insert new record
261
- disclosure_id = await self.db.insert_disclosure(disclosure)
262
- if disclosure_id:
263
- job.records_new += 1
264
- job_result["new_disclosures"] += 1
265
- else:
266
- job.records_failed += 1
267
-
268
- job.records_processed += 1
269
-
270
- except Exception as e:
271
- logger.error(f"Failed to process disclosure: {e}")
272
- job.records_failed += 1
273
- job_result["errors"].append(str(e))
274
-
275
- job.status = "completed"
276
- job.completed_at = datetime.utcnow()
277
- job_result["status"] = "completed"
278
-
279
- except Exception as e:
280
- logger.error(f"US Congress collection failed: {e}")
281
- job.status = "failed"
282
- job.error_message = str(e)
283
- job.completed_at = datetime.utcnow()
284
- job_result["status"] = "failed"
285
- job_result["errors"].append(str(e))
286
-
287
- # Update job status
288
- await self.db.update_data_pull_job(job)
289
-
290
- return job_result
291
-
292
- async def _collect_eu_parliament_data(self) -> Dict[str, Any]:
293
- """Collect EU Parliament trading/financial data"""
294
- job_id = await self.db.create_data_pull_job(
295
- "eu_parliament", self.config.to_serializable_dict()
296
- )
297
-
298
- job_result = {
299
- "job_id": job_id,
300
- "status": "running",
301
- "new_disclosures": 0,
302
- "updated_disclosures": 0,
303
- "errors": [],
304
- }
305
-
306
- job = DataPullJob(
307
- id=job_id, job_type="eu_parliament", status="running", started_at=datetime.utcnow()
308
- )
309
-
310
- try:
311
- logger.info("Starting EU Parliament data collection")
312
-
313
- scraper = EUParliamentScraper(self.config.scraping)
314
-
315
- async with scraper:
316
- disclosures = await scraper.scrape_mep_declarations()
317
-
318
- job.records_found = len(disclosures)
319
-
320
- # Process EU disclosures (similar to US processing)
321
- for disclosure in disclosures:
322
- try:
323
- # For EU, we'd need a different politician matching strategy
324
- # For now, create a sample politician
325
- if not disclosure.politician_id:
326
- # Create placeholder politician
327
- eu_politician = Politician(
328
- first_name="Sample",
329
- last_name="MEP",
330
- full_name="Sample MEP",
331
- role=PoliticianRole.EU_MEP,
332
- state_or_country="EU",
333
- )
334
- politician_id = await self.db.upsert_politician(eu_politician)
335
- disclosure.politician_id = politician_id
336
-
337
- # Insert disclosure
338
- disclosure_id = await self.db.insert_disclosure(disclosure)
339
- if disclosure_id:
340
- job.records_new += 1
341
- job_result["new_disclosures"] += 1
342
- else:
343
- job.records_failed += 1
344
-
345
- job.records_processed += 1
346
-
347
- except Exception as e:
348
- logger.error(f"Failed to process EU disclosure: {e}")
349
- job.records_failed += 1
350
- job_result["errors"].append(str(e))
351
-
352
- job.status = "completed"
353
- job.completed_at = datetime.utcnow()
354
- job_result["status"] = "completed"
355
-
356
- except Exception as e:
357
- logger.error(f"EU Parliament collection failed: {e}")
358
- job.status = "failed"
359
- job.error_message = str(e)
360
- job.completed_at = datetime.utcnow()
361
- job_result["status"] = "failed"
362
- job_result["errors"].append(str(e))
363
-
364
- # Update job status
365
- await self.db.update_data_pull_job(job)
366
-
367
- return job_result
368
-
369
- async def _collect_uk_parliament_data(self) -> Dict[str, Any]:
370
- """Collect UK Parliament financial interests data"""
371
- job_id = await self.db.create_data_pull_job(
372
- "uk_parliament", self.config.to_serializable_dict()
373
- )
374
-
375
- job_result = {
376
- "job_id": job_id,
377
- "status": "running",
378
- "new_disclosures": 0,
379
- "updated_disclosures": 0,
380
- "errors": [],
381
- }
382
-
383
- job = DataPullJob(
384
- id=job_id,
385
- job_type="uk_parliament",
386
- status="running",
387
- started_at=datetime.utcnow(),
388
- )
389
-
390
- try:
391
- # Collect UK Parliament financial interests
392
- logger.info("Starting UK Parliament financial interests collection")
393
- uk_disclosures = await run_uk_parliament_workflow(self.config.scraping)
394
-
395
- job.records_found = len(uk_disclosures)
396
-
397
- # Process each disclosure
398
- matcher = PoliticianMatcher(self.politicians)
399
-
400
- for disclosure in uk_disclosures:
401
- try:
402
- # For UK Parliament, find or create politician using real names from scrapers
403
- if not disclosure.politician_id:
404
- # Extract real politician name from raw data
405
- politician_name = disclosure.raw_data.get("politician_name", "")
406
-
407
- if not politician_name or politician_name.strip() == "":
408
- # Fallback to using member ID if no name available
409
- if disclosure.raw_data.get("uk_member_id"):
410
- logger.warning(
411
- f"Using member ID as fallback for UK disclosure: {disclosure.raw_data.get('uk_member_id')}"
412
- )
413
- uk_politician = Politician(
414
- first_name="UK",
415
- last_name="MP",
416
- full_name=f"UK MP {disclosure.raw_data.get('uk_member_id')}",
417
- role=PoliticianRole.UK_MP,
418
- state_or_country="UK",
419
- )
420
- politician_id = await self.db.upsert_politician(uk_politician)
421
- disclosure.politician_id = politician_id
422
- else:
423
- logger.warning(
424
- "Skipping UK disclosure with no politician name or member ID"
425
- )
426
- job.records_failed += 1
427
- continue
428
- else:
429
- # Filter out obviously invalid politician names
430
- if self._is_invalid_politician_name(politician_name):
431
- logger.warning(
432
- f"Skipping UK disclosure with invalid politician name: {politician_name}"
433
- )
434
- job.records_failed += 1
435
- continue
436
-
437
- # Try to find existing politician
438
- politician = matcher.find_politician(politician_name)
439
-
440
- if not politician:
441
- # Create new politician with real name from scraper
442
- # Parse real name into first/last components
443
- name_parts = politician_name.strip().split()
444
- if len(name_parts) >= 2:
445
- first_name = name_parts[0]
446
- last_name = " ".join(name_parts[1:])
447
- else:
448
- first_name = politician_name.strip()
449
- last_name = ""
450
-
451
- # Create politician with REAL name
452
- uk_politician = Politician(
453
- first_name=first_name,
454
- last_name=last_name,
455
- full_name=politician_name.strip(),
456
- role=PoliticianRole.UK_MP,
457
- state_or_country="UK",
458
- )
459
- politician_id = await self.db.upsert_politician(uk_politician)
460
- disclosure.politician_id = politician_id
461
- logger.info(f"Created new UK MP: {politician_name}")
462
- else:
463
- disclosure.politician_id = politician.id
464
-
465
- # Insert disclosure
466
- disclosure_id = await self.db.insert_disclosure(disclosure)
467
- if disclosure_id:
468
- job.records_new += 1
469
- job_result["new_disclosures"] += 1
470
- else:
471
- job.records_failed += 1
472
-
473
- job.records_processed += 1
474
-
475
- except Exception as e:
476
- logger.error(f"Failed to process UK Parliament disclosure: {e}")
477
- job.records_failed += 1
478
- job_result["errors"].append(str(e))
479
-
480
- job.status = "completed"
481
- job.completed_at = datetime.utcnow()
482
- job_result["status"] = "completed"
483
-
484
- except Exception as e:
485
- logger.error(f"UK Parliament collection failed: {e}")
486
- job.status = "failed"
487
- job.error_message = str(e)
488
- job.completed_at = datetime.utcnow()
489
- job_result["status"] = "failed"
490
- job_result["errors"].append(str(e))
491
-
492
- # Update job status
493
- await self.db.update_data_pull_job(job)
494
-
495
- return job_result
496
-
497
- async def _collect_california_data(self) -> Dict[str, Any]:
498
- """Collect California NetFile and state disclosure data"""
499
- job_id = await self.db.create_data_pull_job(
500
- "california", self.config.to_serializable_dict()
501
- )
502
-
503
- job_result = {
504
- "job_id": job_id,
505
- "status": "running",
506
- "new_disclosures": 0,
507
- "updated_disclosures": 0,
508
- "errors": [],
509
- }
510
-
511
- job = DataPullJob(
512
- id=job_id,
513
- job_type="california",
514
- status="running",
515
- started_at=datetime.utcnow(),
516
- )
517
-
518
- try:
519
- # Collect California financial disclosures
520
- logger.info("Starting California financial disclosures collection")
521
- california_disclosures = await run_california_workflow(self.config.scraping)
522
-
523
- job.records_found = len(california_disclosures)
524
-
525
- # Process each disclosure
526
- matcher = PoliticianMatcher(self.politicians)
527
-
528
- for disclosure in california_disclosures:
529
- try:
530
- # For California, create politician if needed
531
- if not disclosure.politician_id:
532
- # Extract politician name from raw data or create placeholder
533
- politician_name = disclosure.raw_data.get("politician_name", "")
534
- if not politician_name:
535
- # Create placeholder for California politician
536
- ca_politician = Politician(
537
- first_name="California",
538
- last_name="Politician",
539
- full_name=f"California Politician {disclosure.raw_data.get('jurisdiction', 'Unknown')}",
540
- role=PoliticianRole.US_HOUSE_REP, # Could be state-level role
541
- state_or_country="CA",
542
- )
543
- politician_id = await self.db.upsert_politician(ca_politician)
544
- disclosure.politician_id = politician_id
545
-
546
- # Insert disclosure
547
- disclosure_id = await self.db.insert_disclosure(disclosure)
548
- if disclosure_id:
549
- job.records_new += 1
550
- job_result["new_disclosures"] += 1
551
- else:
552
- job.records_failed += 1
553
-
554
- job.records_processed += 1
555
-
556
- except Exception as e:
557
- logger.error(f"Failed to process California disclosure: {e}")
558
- job.records_failed += 1
559
- job_result["errors"].append(str(e))
560
-
561
- job.status = "completed"
562
- job.completed_at = datetime.utcnow()
563
- job_result["status"] = "completed"
564
-
565
- except Exception as e:
566
- logger.error(f"California collection failed: {e}")
567
- job.status = "failed"
568
- job.error_message = str(e)
569
- job.completed_at = datetime.utcnow()
570
- job_result["status"] = "failed"
571
- job_result["errors"].append(str(e))
572
-
573
- # Update job status
574
- await self.db.update_data_pull_job(job)
575
-
576
- return job_result
577
-
578
- async def _collect_eu_member_states_data(self) -> Dict[str, Any]:
579
- """Collect EU member states financial disclosure data"""
580
- job_id = await self.db.create_data_pull_job(
581
- "eu_member_states", self.config.to_serializable_dict()
582
- )
583
-
584
- job_result = {
585
- "job_id": job_id,
586
- "status": "running",
587
- "new_disclosures": 0,
588
- "updated_disclosures": 0,
589
- "errors": [],
590
- }
591
-
592
- job = DataPullJob(
593
- id=job_id,
594
- job_type="eu_member_states",
595
- status="running",
596
- started_at=datetime.utcnow(),
597
- )
598
-
599
- try:
600
- # Collect EU member states financial disclosures
601
- logger.info("Starting EU member states financial disclosures collection")
602
- eu_states_disclosures = await run_eu_member_states_workflow(self.config.scraping)
603
-
604
- job.records_found = len(eu_states_disclosures)
605
-
606
- # Process each disclosure
607
- matcher = PoliticianMatcher(self.politicians)
608
-
609
- for disclosure in eu_states_disclosures:
610
- try:
611
- # For EU member states, create politician if needed
612
- if not disclosure.politician_id:
613
- # Extract politician details from raw data
614
- country = disclosure.raw_data.get("country", "Unknown")
615
- source = disclosure.raw_data.get("source", "unknown")
616
-
617
- # Map country to appropriate role
618
- role_map = {
619
- "Germany": PoliticianRole.GERMAN_BUNDESTAG,
620
- "France": PoliticianRole.FRENCH_DEPUTY,
621
- "Italy": PoliticianRole.ITALIAN_DEPUTY,
622
- "Spain": PoliticianRole.SPANISH_DEPUTY,
623
- "Netherlands": PoliticianRole.DUTCH_MP,
624
- }
625
-
626
- politician_role = role_map.get(country, PoliticianRole.EU_MEP)
627
-
628
- # Create placeholder politician
629
- eu_politician = Politician(
630
- first_name=country,
631
- last_name="Politician",
632
- full_name=f"{country} Politician ({source})",
633
- role=politician_role,
634
- state_or_country=country,
635
- )
636
- politician_id = await self.db.upsert_politician(eu_politician)
637
- disclosure.politician_id = politician_id
638
-
639
- # Insert disclosure
640
- disclosure_id = await self.db.insert_disclosure(disclosure)
641
- if disclosure_id:
642
- job.records_new += 1
643
- job_result["new_disclosures"] += 1
644
- else:
645
- job.records_failed += 1
646
-
647
- job.records_processed += 1
648
-
649
- except Exception as e:
650
- logger.error(f"Failed to process EU member state disclosure: {e}")
651
- job.records_failed += 1
652
- job_result["errors"].append(str(e))
653
-
654
- job.status = "completed"
655
- job.completed_at = datetime.utcnow()
656
- job_result["status"] = "completed"
657
-
658
- except Exception as e:
659
- logger.error(f"EU member states collection failed: {e}")
660
- job.status = "failed"
661
- job.error_message = str(e)
662
- job.completed_at = datetime.utcnow()
663
- job_result["status"] = "failed"
664
- job_result["errors"].append(str(e))
665
-
666
- # Update job status
667
- await self.db.update_data_pull_job(job)
668
-
669
- return job_result
670
-
671
- async def _collect_us_states_data(self) -> Dict[str, Any]:
672
- """Collect US states financial disclosure data"""
673
- job_id = await self.db.create_data_pull_job("us_states", self.config.to_serializable_dict())
674
-
675
- job_result = {
676
- "job_id": job_id,
677
- "status": "running",
678
- "new_disclosures": 0,
679
- "updated_disclosures": 0,
680
- "errors": [],
681
- }
682
-
683
- job = DataPullJob(
684
- id=job_id,
685
- job_type="us_states",
686
- status="running",
687
- started_at=datetime.utcnow(),
688
- )
689
-
690
- try:
691
- # Collect US states financial disclosures
692
- logger.info("Starting US states financial disclosures collection")
693
- us_states_disclosures = await run_us_states_workflow(self.config.scraping)
694
-
695
- job.records_found = len(us_states_disclosures)
696
-
697
- # Process each disclosure
698
- matcher = PoliticianMatcher(self.politicians)
699
-
700
- for disclosure in us_states_disclosures:
701
- try:
702
- # For US states, find or create politician using real names from scrapers
703
- if not disclosure.politician_id:
704
- # Extract real politician name from raw data
705
- politician_name = disclosure.raw_data.get("politician_name", "")
706
- if not politician_name or politician_name.strip() == "":
707
- logger.warning(
708
- "Skipping US states disclosure with empty politician name"
709
- )
710
- job.records_failed += 1
711
- continue
712
-
713
- # Filter out obviously invalid politician names
714
- if self._is_invalid_politician_name(politician_name):
715
- logger.warning(
716
- f"Skipping US states disclosure with invalid politician name: {politician_name}"
717
- )
718
- job.records_failed += 1
719
- continue
720
-
721
- # Try to find existing politician
722
- politician = matcher.find_politician(politician_name)
723
-
724
- if not politician:
725
- # Create new politician with real name from scraper
726
- state = disclosure.raw_data.get("state", "Unknown")
727
- source = disclosure.raw_data.get("source", "unknown")
728
-
729
- # Map state to appropriate role
730
- role_map = {
731
- "Texas": PoliticianRole.TEXAS_STATE_OFFICIAL,
732
- "New York": PoliticianRole.NEW_YORK_STATE_OFFICIAL,
733
- "Florida": PoliticianRole.FLORIDA_STATE_OFFICIAL,
734
- "Illinois": PoliticianRole.ILLINOIS_STATE_OFFICIAL,
735
- "Pennsylvania": PoliticianRole.PENNSYLVANIA_STATE_OFFICIAL,
736
- "Massachusetts": PoliticianRole.MASSACHUSETTS_STATE_OFFICIAL,
737
- "California": PoliticianRole.CALIFORNIA_STATE_OFFICIAL,
738
- }
739
-
740
- politician_role = role_map.get(state, PoliticianRole.US_HOUSE_REP)
741
-
742
- # Parse real name into first/last components
743
- name_parts = politician_name.strip().split()
744
- if len(name_parts) >= 2:
745
- first_name = name_parts[0]
746
- last_name = " ".join(name_parts[1:])
747
- else:
748
- first_name = politician_name.strip()
749
- last_name = ""
750
-
751
- # Create politician with REAL name
752
- state_politician = Politician(
753
- first_name=first_name,
754
- last_name=last_name,
755
- full_name=politician_name.strip(),
756
- role=politician_role,
757
- state_or_country=state,
758
- )
759
- politician_id = await self.db.upsert_politician(state_politician)
760
- disclosure.politician_id = politician_id
761
- logger.info(f"Created new US state politician: {politician_name}")
762
- else:
763
- disclosure.politician_id = politician.id
764
-
765
- # Insert disclosure
766
- disclosure_id = await self.db.insert_disclosure(disclosure)
767
- if disclosure_id:
768
- job.records_new += 1
769
- job_result["new_disclosures"] += 1
770
- else:
771
- job.records_failed += 1
772
-
773
- job.records_processed += 1
774
-
775
- except Exception as e:
776
- logger.error(f"Failed to process US state disclosure: {e}")
777
- job.records_failed += 1
778
- job_result["errors"].append(str(e))
779
-
780
- job.status = "completed"
781
- job.completed_at = datetime.utcnow()
782
- job_result["status"] = "completed"
783
-
784
- except Exception as e:
785
- logger.error(f"US states collection failed: {e}")
786
- job.status = "failed"
787
- job.error_message = str(e)
788
- job.completed_at = datetime.utcnow()
789
- job_result["status"] = "failed"
790
- job_result["errors"].append(str(e))
791
-
792
- # Update job status
793
- await self.db.update_data_pull_job(job)
794
-
795
- return job_result
796
-
797
- async def get_status(self) -> Dict[str, Any]:
798
- """Get current workflow status"""
799
- try:
800
- return await self.db.get_job_status()
801
- except Exception as e:
802
- logger.error(f"Failed to get status: {e}")
803
- return {"error": str(e)}
804
-
805
- async def run_quick_check(self) -> Dict[str, Any]:
806
- """Run a quick status check without full data collection"""
807
- try:
808
- status = await self.get_status()
809
-
810
- # Add some additional quick checks
811
- status["database_connection"] = "ok" if self.db.client else "failed"
812
- status["config_loaded"] = "ok" if self.config else "failed"
813
- status["timestamp"] = datetime.utcnow().isoformat()
814
-
815
- return status
816
-
817
- except Exception as e:
818
- return {"error": str(e), "timestamp": datetime.utcnow().isoformat(), "status": "failed"}
819
-
820
- def _is_invalid_politician_name(self, name: str) -> bool:
821
- """Check if a name is obviously not a politician name"""
822
- if not name or len(name.strip()) < 2:
823
- return True
824
-
825
- # Check for proper name structure first (before converting to uppercase)
826
- original_name = name.strip()
827
- import re
828
-
829
- if not re.search(r"[A-Za-z]", original_name): # Should have at least one letter
830
- return True
831
- if re.search(r"^\d+", original_name): # Starting with numbers
832
- return True
833
-
834
- # Now convert to uppercase for pattern matching
835
- name = original_name.upper()
836
-
837
- # Filter out obvious non-names
838
- invalid_patterns = [
839
- # Asset tickers and financial instruments
840
- r"^-.*CT$", # -ETHEREUMCT, -DOGCT patterns
841
- r"^[A-Z]{2,5}$", # Short all-caps (likely tickers)
842
- r"^\$", # Starting with $
843
- # Municipal and financial terms
844
- r"MUNICIPAL",
845
- r"BOND",
846
- r"TRUST",
847
- r"FUND",
848
- r"CORP",
849
- r"INC\.$",
850
- r"LLC$",
851
- r"LP$",
852
- # Common non-name patterns
853
- r"^UNKNOWN",
854
- r"^TEST",
855
- r"^SAMPLE",
856
- # Crypto/financial asset patterns
857
- r"ETHEREUM",
858
- r"BITCOIN",
859
- r"CRYPTO",
860
- ]
861
-
862
- for pattern in invalid_patterns:
863
- if re.search(pattern, name):
864
- return True
865
-
866
- return False
867
-
868
-
869
- # Standalone functions for cron job usage
870
- async def run_politician_trading_collection() -> Dict[str, Any]:
871
- """Standalone function for cron job execution"""
872
- workflow = PoliticianTradingWorkflow()
873
- return await workflow.run_full_collection()
874
-
875
-
876
- async def check_politician_trading_status() -> Dict[str, Any]:
877
- """Standalone function for status checking"""
878
- workflow = PoliticianTradingWorkflow()
879
- return await workflow.run_quick_check()