mcli-framework 7.10.0__py3-none-any.whl → 7.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mcli-framework might be problematic.

Files changed (42)
  1. mcli/lib/custom_commands.py +10 -0
  2. mcli/lib/optional_deps.py +240 -0
  3. mcli/ml/backtesting/run.py +5 -3
  4. mcli/ml/models/ensemble_models.py +1 -0
  5. mcli/ml/models/recommendation_models.py +1 -0
  6. mcli/ml/optimization/optimize.py +6 -4
  7. mcli/ml/serving/serve.py +2 -2
  8. mcli/ml/training/train.py +14 -7
  9. mcli/self/completion_cmd.py +2 -2
  10. mcli/workflow/doc_convert.py +82 -112
  11. mcli/workflow/git_commit/ai_service.py +13 -2
  12. mcli/workflow/notebook/converter.py +375 -0
  13. mcli/workflow/notebook/notebook_cmd.py +441 -0
  14. mcli/workflow/notebook/schema.py +402 -0
  15. mcli/workflow/notebook/validator.py +313 -0
  16. mcli/workflow/workflow.py +14 -0
  17. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/METADATA +37 -3
  18. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/RECORD +22 -37
  19. mcli/ml/features/political_features.py +0 -677
  20. mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
  21. mcli/workflow/politician_trading/config.py +0 -134
  22. mcli/workflow/politician_trading/connectivity.py +0 -492
  23. mcli/workflow/politician_trading/data_sources.py +0 -654
  24. mcli/workflow/politician_trading/database.py +0 -412
  25. mcli/workflow/politician_trading/demo.py +0 -249
  26. mcli/workflow/politician_trading/models.py +0 -327
  27. mcli/workflow/politician_trading/monitoring.py +0 -413
  28. mcli/workflow/politician_trading/scrapers.py +0 -1074
  29. mcli/workflow/politician_trading/scrapers_california.py +0 -434
  30. mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
  31. mcli/workflow/politician_trading/scrapers_eu.py +0 -376
  32. mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
  33. mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
  34. mcli/workflow/politician_trading/scrapers_uk.py +0 -378
  35. mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
  36. mcli/workflow/politician_trading/seed_database.py +0 -520
  37. mcli/workflow/politician_trading/supabase_functions.py +0 -354
  38. mcli/workflow/politician_trading/workflow.py +0 -879
  39. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/WHEEL +0 -0
  40. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/entry_points.txt +0 -0
  41. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/licenses/LICENSE +0 -0
  42. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/top_level.txt +0 -0
mcli/workflow/politician_trading/scrapers.py (removed)
@@ -1,1074 +0,0 @@
- """
- Web scrapers for politician trading data
- """
-
- import asyncio
- import logging
- import re
- from datetime import datetime, timedelta
- from decimal import Decimal
- from typing import Any, Dict, List, Optional, Tuple
- from urllib.parse import urljoin, urlparse
-
- import aiohttp
- from bs4 import BeautifulSoup
-
- from .config import ScrapingConfig
- from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
-
- logger = logging.getLogger(__name__)
-
-
- class BaseScraper:
-     """Base class for all scrapers"""
-
-     def __init__(self, config: ScrapingConfig):
-         self.config = config
-         self.session: Optional[aiohttp.ClientSession] = None
-
-     async def __aenter__(self):
-         """Async context manager entry"""
-         self.session = aiohttp.ClientSession(
-             timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-             headers={"User-Agent": self.config.user_agent},
-         )
-         return self
-
-     async def __aexit__(self, exc_type, exc_val, exc_tb):
-         """Async context manager exit"""
-         if self.session:
-             await self.session.close()
-
-     async def fetch_page(self, url: str, **kwargs) -> Optional[str]:
-         """Fetch a web page with error handling and rate limiting"""
-         for attempt in range(self.config.max_retries):
-             try:
-                 await asyncio.sleep(self.config.request_delay)
-
-                 async with self.session.get(url, **kwargs) as response:
-                     if response.status == 200:
-                         return await response.text()
-                     else:
-                         logger.warning(f"HTTP {response.status} for {url}")
-                         if response.status == 429:  # Rate limited
-                             await asyncio.sleep(self.config.request_delay * 2)
-
-             except Exception as e:
-                 logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
-                 if attempt < self.config.max_retries - 1:
-                     await asyncio.sleep(self.config.request_delay * (attempt + 1))
-
-         return None
-
-     def parse_amount_range(
-         self, amount_text: str
-     ) -> Tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
-         """Parse amount text into range values"""
-         if not amount_text:
-             return None, None, None
-
-         amount_text = amount_text.replace(",", "").replace("$", "").strip()
-
-         # Look for range patterns like "$1,001 - $15,000"
-         range_match = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_text)
-         if range_match:
-             min_val = Decimal(range_match.group(1))
-             max_val = Decimal(range_match.group(2))
-             return min_val, max_val, None
-
-         # Look for exact amounts
-         exact_match = re.search(r"(\d+(?:\.\d{2})?)", amount_text)
-         if exact_match:
-             exact_val = Decimal(exact_match.group(1))
-             return None, None, exact_val
-
-         # Handle standard ranges
-         range_mappings = {
-             "$1,001 - $15,000": (Decimal("1001"), Decimal("15000")),
-             "$15,001 - $50,000": (Decimal("15001"), Decimal("50000")),
-             "$50,001 - $100,000": (Decimal("50001"), Decimal("100000")),
-             "$100,001 - $250,000": (Decimal("100001"), Decimal("250000")),
-             "$250,001 - $500,000": (Decimal("250001"), Decimal("500000")),
-             "$500,001 - $1,000,000": (Decimal("500001"), Decimal("1000000")),
-             "$1,000,001 - $5,000,000": (Decimal("1000001"), Decimal("5000000")),
-             "$5,000,001 - $25,000,000": (Decimal("5000001"), Decimal("25000000")),
-             "$25,000,001 - $50,000,000": (Decimal("25000001"), Decimal("50000000")),
-             "Over $50,000,000": (Decimal("50000001"), None),
-         }
-
-         for pattern, (min_val, max_val) in range_mappings.items():
-             if pattern.lower() in amount_text.lower():
-                 return min_val, max_val, None
-
-         return None, None, None
-
-
- class CongressTradingScraper(BaseScraper):
-     """Scraper for US Congress trading data"""
-
-     async def scrape_house_disclosures(self) -> List[TradingDisclosure]:
-         """Scrape House financial disclosures from the official database"""
-         disclosures = []
-         base_url = "https://disclosures-clerk.house.gov"
-         search_url = f"{base_url}/FinancialDisclosure"
-
-         try:
-             logger.info("Starting House disclosures scrape from official database")
-
-             async with aiohttp.ClientSession(
-                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                 headers={"User-Agent": self.config.user_agent},
-             ) as session:
-
-                 # Get the ViewSearch form page
-                 view_search_url = f"{base_url}/FinancialDisclosure/ViewSearch"
-                 async with session.get(view_search_url) as response:
-                     if response.status == 200:
-                         html = await response.text()
-                         logger.info("Successfully accessed House financial disclosure search form")
-
-                         # Extract form data for ASPX
-                         soup = BeautifulSoup(html, "html.parser")
-
-                         # Look for common ASPX form fields
-                         form_fields = {}
-                         for field_name in [
-                             "__VIEWSTATE",
-                             "__VIEWSTATEGENERATOR",
-                             "__EVENTVALIDATION",
-                             "__REQUESTVERIFICATIONTOKEN",
-                         ]:
-                             field = soup.find("input", {"name": field_name})
-                             if field and field.get("value"):
-                                 form_fields[field_name] = field["value"]
-
-                         if "__VIEWSTATE" in form_fields:
-                             logger.info("Found required ASPX form fields")
-
-                             # Search for recent disclosures - try different form field names
-                             current_year = str(datetime.now().year)
-
-                             # Search for common politician last names to get real data
-                             common_names = [
-                                 "Smith",
-                                 "Johnson",
-                                 "Brown",
-                                 "Davis",
-                                 "Wilson",
-                                 "Miller",
-                                 "Garcia",
-                             ]
-
-                             # Try different form patterns with actual names
-                             possible_form_data_sets = []
-
-                             for name in common_names:
-                                 possible_form_data_sets.extend(
-                                     [
-                                         {
-                                             **form_fields,
-                                             "ctl00$MainContent$txtLastName": name,
-                                             "ctl00$MainContent$ddlFilingYear": current_year,
-                                             "ctl00$MainContent$btnSearch": "Search",
-                                         },
-                                         {
-                                             **form_fields,
-                                             "ctl00$ContentPlaceHolder1$txtLastName": name,
-                                             "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
-                                             "ctl00$ContentPlaceHolder1$btnSearch": "Search",
-                                         },
-                                         {
-                                             **form_fields,
-                                             "LastName": name,
-                                             "FilingYear": current_year,
-                                             "Search": "Search",
-                                         },
-                                     ]
-                                 )
-
-                             # Also try without names (all results)
-                             possible_form_data_sets.extend(
-                                 [
-                                     {
-                                         **form_fields,
-                                         "ctl00$MainContent$ddlFilingYear": current_year,
-                                         "ctl00$MainContent$btnSearch": "Search",
-                                     },
-                                     {
-                                         **form_fields,
-                                         "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
-                                         "ctl00$ContentPlaceHolder1$btnSearch": "Search",
-                                     },
-                                 ]
-                             )
-
-                             # Try each form configuration
-                             for i, form_data in enumerate(possible_form_data_sets):
-                                 try:
-                                     logger.info(f"Attempting search with form configuration {i+1}")
-                                     async with session.post(
-                                         view_search_url, data=form_data
-                                     ) as search_response:
-                                         if search_response.status == 200:
-                                             results_html = await search_response.text()
-                                             if (
-                                                 "search results" in results_html.lower()
-                                                 or "disclosure" in results_html.lower()
-                                             ):
-                                                 disclosures = await self._parse_house_results(
-                                                     results_html, base_url
-                                                 )
-                                                 logger.info(
-                                                     f"Successfully found {len(disclosures)} House disclosures"
-                                                 )
-                                                 break
-                                             else:
-                                                 logger.debug(
-                                                     f"Form config {i+1} didn't return results"
-                                                 )
-                                         else:
-                                             logger.debug(
-                                                 f"Form config {i+1} failed with status {search_response.status}"
-                                             )
-                                 except Exception as e:
-                                     logger.debug(f"Form config {i+1} failed: {e}")
-                             else:
-                                 logger.warning(
-                                     "All form configurations failed, using basic page scraping"
-                                 )
-                                 # Fall back to scraping any existing disclosure links on the page
-                                 disclosures = await self._parse_house_results(html, base_url)
-                         else:
-                             logger.warning(
-                                 "Could not find required ASPX form fields, using basic page scraping"
-                             )
-                             # Fall back to parsing any existing links
-                             disclosures = await self._parse_house_results(html, base_url)
-                     else:
-                         logger.warning(f"Failed to access House disclosure site: {response.status}")
-
-             # Rate limiting
-             await asyncio.sleep(self.config.request_delay)
-
-         except Exception as e:
-             logger.error(f"House disclosures scrape failed: {e}")
-             # Return empty list on error rather than sample data
-
-         return disclosures
-
-     async def scrape_senate_disclosures(self) -> List[TradingDisclosure]:
-         """Scrape Senate financial disclosures from the official EFD database"""
-         disclosures = []
-         base_url = "https://efdsearch.senate.gov"
-         search_url = f"{base_url}/search/"
-
-         try:
-             logger.info("Starting Senate disclosures scrape from EFD database")
-
-             async with aiohttp.ClientSession(
-                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                 headers={"User-Agent": self.config.user_agent},
-             ) as session:
-
-                 # Search for recent periodic transaction reports (PTRs)
-                 search_params = {
-                     "report_type": "11",  # Periodic Transaction Report
-                     "submitted_start_date": (datetime.now() - timedelta(days=90)).strftime(
-                         "%m/%d/%Y"
-                     ),
-                     "submitted_end_date": datetime.now().strftime("%m/%d/%Y"),
-                 }
-
-                 async with session.get(search_url, params=search_params) as response:
-                     if response.status == 200:
-                         html = await response.text()
-                         disclosures = await self._parse_senate_results(html, base_url)
-                         logger.info(f"Found {len(disclosures)} Senate disclosures")
-                     else:
-                         logger.warning(f"Senate search failed with status {response.status}")
-
-             # Rate limiting
-             await asyncio.sleep(self.config.request_delay)
-
-         except Exception as e:
-             logger.error(f"Senate disclosures scrape failed: {e}")
-             # Return empty list on error rather than sample data
-
-         return disclosures
-
-     async def _parse_house_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
-         """Parse House disclosure search results"""
-         disclosures = []
-
-         try:
-             soup = BeautifulSoup(html, "html.parser")
-
-             # Look for disclosure result rows - try multiple selectors
-             result_rows = (
-                 soup.find_all("tr", class_="disclosure-row")
-                 or soup.select('tr[id*="GridView"]')
-                 or soup.select("table tr")
-                 or soup.find_all("tr")
-             )
-
-             logger.info(f"Found {len(result_rows)} potential result rows")
-
-             for row in result_rows[:20]:  # Limit to 20 most recent
-                 cells = row.find_all("td")
-                 if len(cells) >= 3:  # At least 3 columns
-                     # Extract text from each cell to identify the structure
-                     cell_texts = [
-                         cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)
-                     ]
-
-                     if not cell_texts:
-                         continue
-
-                     # Try to identify which cell contains the politician name
-                     # Names usually contain letters and may have titles like "Rep.", "Hon."
-                     politician_name = ""
-
-                     for text in cell_texts:
-                         # Look for text that looks like a person's name
-                         if (
-                             len(text) > 3
-                             and any(c.isalpha() for c in text)
-                             and not text.isdigit()
-                             and not text.startswith("20")  # Not a year
-                             and "pdf" not in text.lower()
-                             and "view" not in text.lower()
-                         ):
-
-                             # Clean up potential name
-                             clean_name = (
-                                 text.replace("Hon.", "")
-                                 .replace("Rep.", "")
-                                 .replace("Sen.", "")
-                                 .strip()
-                             )
-                             if len(clean_name) > 3 and " " in clean_name:  # Likely full name
-                                 politician_name = clean_name
-                                 break
-
-                     if not politician_name:
-                         politician_name = cell_texts[0]  # Fallback to first cell
-
-                     # Extract other information
-                     filing_year = next(
-                         (
-                             text
-                             for text in cell_texts
-                             if text.isdigit() and len(text) == 4 and text.startswith("20")
-                         ),
-                         "",
-                     )
-                     filing_type = next(
-                         (
-                             text
-                             for text in cell_texts
-                             if "periodic" in text.lower() or "annual" in text.lower()
-                         ),
-                         "",
-                     )
-
-                     # Look for PDF link
-                     pdf_link = row.find("a", href=True)
-                     if pdf_link:
-                         pdf_url = urljoin(base_url, pdf_link["href"])
-
-                         # Create basic disclosure entry
-                         # Note: Actual transaction details would require PDF parsing
-                         disclosure = TradingDisclosure(
-                             politician_id="",  # To be filled by matcher
-                             transaction_date=datetime.now() - timedelta(days=30),  # Estimate
-                             disclosure_date=datetime.now() - timedelta(days=15),  # Estimate
-                             transaction_type=TransactionType.PURCHASE,  # Default
-                             asset_name="Unknown Asset",  # Would need PDF parsing
-                             asset_type="stock",
-                             amount_range_min=Decimal("1001"),
-                             amount_range_max=Decimal("15000"),
-                             source_url=pdf_url,
-                             raw_data={
-                                 "politician_name": politician_name,
-                                 "filing_year": filing_year,
-                                 "filing_type": filing_type,
-                                 "requires_pdf_parsing": True,
-                                 "extraction_method": "house_search_results",
-                             },
-                         )
-                         disclosures.append(disclosure)
-
-         except Exception as e:
-             logger.error(f"Error parsing House results: {e}")
-
-         return disclosures
-
-     async def _parse_senate_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
-         """Parse Senate EFD search results"""
-         disclosures = []
-
-         try:
-             soup = BeautifulSoup(html, "html.parser")
-
-             # Look for search result rows
-             result_rows = soup.find_all("tr", class_="searchresult") or soup.select("tbody tr")
-
-             for row in result_rows[:20]:  # Limit to 20 most recent
-                 cells = row.find_all("td")
-                 if len(cells) >= 4:
-                     # Extract information
-                     name = cells[0].get_text(strip=True) if cells[0] else ""
-                     report_type = cells[1].get_text(strip=True) if cells[1] else ""
-                     filing_date = cells[2].get_text(strip=True) if cells[2] else ""
-
-                     # Look for report link
-                     report_link = row.find("a", href=True)
-                     if report_link and "ptr" in report_type.lower():  # Periodic Transaction Report
-                         report_url = urljoin(base_url, report_link["href"])
-
-                         # Create disclosure entry
-                         # Note: Actual transaction details would require report parsing
-                         disclosure = TradingDisclosure(
-                             politician_id="",  # To be filled by matcher
-                             transaction_date=datetime.now() - timedelta(days=30),  # Estimate
-                             disclosure_date=self._parse_date(filing_date) or datetime.now(),
-                             transaction_type=TransactionType.PURCHASE,  # Default
-                             asset_name="Unknown Asset",  # Would need report parsing
-                             asset_type="stock",
-                             amount_range_min=Decimal("1001"),
-                             amount_range_max=Decimal("50000"),
-                             source_url=report_url,
-                             raw_data={
-                                 "politician_name": name,
-                                 "report_type": report_type,
-                                 "filing_date": filing_date,
-                                 "requires_report_parsing": True,
-                             },
-                         )
-                         disclosures.append(disclosure)
-
-         except Exception as e:
-             logger.error(f"Error parsing Senate results: {e}")
-
-         return disclosures
-
-     def _parse_date(self, date_str: str) -> Optional[datetime]:
-         """Parse various date formats from disclosure sites"""
-         if not date_str:
-             return None
-
-         date_formats = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]
-
-         for fmt in date_formats:
-             try:
-                 return datetime.strptime(date_str.strip(), fmt)
-             except ValueError:
-                 continue
-
-         logger.warning(f"Could not parse date: {date_str}")
-         return None
-
-
- class QuiverQuantScraper(BaseScraper):
-     """Scraper for QuiverQuant congress trading data as a backup source"""
-
-     async def scrape_congress_trades(self) -> List[Dict[str, Any]]:
-         """Scrape congress trading data from QuiverQuant"""
-         trades = []
-
-         try:
-             # This would implement scraping from QuiverQuant's public data
-             # Note: Respect their robots.txt and terms of service
-             logger.info("Starting QuiverQuant scrape")
-
-             url = "https://www.quiverquant.com/congresstrading/"
-             html = await self.fetch_page(url)
-
-             if html:
-                 soup = BeautifulSoup(html, "html.parser")
-
-                 # Parse the trading data table (simplified example)
-                 # In reality, this might require handling JavaScript rendering
-                 trade_rows = soup.select("table tr")
-
-                 for row in trade_rows[1:10]:  # Skip header, limit to 10 for example
-                     cells = row.select("td")
-                     if len(cells) >= 4:
-                         # Extract cell contents and try to identify the correct fields
-                         cell_texts = [cell.get_text(strip=True) for cell in cells]
-
-                         # Try to identify which cell contains what data
-                         politician_name = cell_texts[0] if len(cell_texts) > 0 else ""
-
-                         # Look for date-like patterns (YYYY-MM-DD, MM/DD/YYYY, etc.)
-                         transaction_date = ""
-                         ticker = ""
-                         transaction_type = ""
-                         amount = ""
-
-                         for i, text in enumerate(
-                             cell_texts[1:], 1
-                         ):  # Skip first cell (politician name)
-                             # Check if this looks like a date
-                             if self._looks_like_date(text):
-                                 transaction_date = text
-                             # Check if this looks like a ticker (all caps, short)
-                             elif text.isupper() and len(text) <= 5 and text.isalpha():
-                                 ticker = text
-                             # Check if this contains transaction type keywords
-                             elif any(
-                                 word in text.lower() for word in ["purchase", "sale", "buy", "sell"]
-                             ):
-                                 # Split transaction type and amount if combined
-                                 if "$" in text:
-                                     # Split on $ to separate transaction type from amount
-                                     parts = text.split("$", 1)
-                                     transaction_type = parts[0].strip()
-                                     amount = "$" + parts[1] if len(parts) > 1 else ""
-                                 else:
-                                     transaction_type = text
-                             # Check if this looks like an amount (contains $ or numbers with ,)
-                             elif "$" in text or ("," in text and any(c.isdigit() for c in text)):
-                                 amount = text
-
-                         # Only create trade data if we have essential fields
-                         if politician_name and (transaction_date or ticker):
-                             trade_data = {
-                                 "politician_name": politician_name,
-                                 "transaction_date": transaction_date,
-                                 "ticker": ticker,
-                                 "transaction_type": transaction_type,
-                                 "amount": amount,
-                                 "source": "quiverquant",
-                             }
-                             trades.append(trade_data)
-
-         except Exception as e:
-             logger.error(f"QuiverQuant scrape failed: {e}")
-
-         return trades
-
-     def _looks_like_date(self, text: str) -> bool:
-         """Check if a string looks like a date"""
-         if not text or len(text) < 8:
-             return False
-
-         # Common date patterns
-         date_patterns = [
-             r"\d{4}-\d{1,2}-\d{1,2}",  # YYYY-MM-DD
-             r"\d{1,2}/\d{1,2}/\d{4}",  # MM/DD/YYYY
-             r"\d{1,2}-\d{1,2}-\d{4}",  # MM-DD-YYYY
-             r"\w{3}\s+\d{1,2},?\s+\d{4}",  # Month DD, YYYY
-         ]
-
-         import re
-
-         for pattern in date_patterns:
-             if re.search(pattern, text):
-                 return True
-         return False
-
-     def parse_quiver_trade(self, trade_data: Dict[str, Any]) -> Optional[TradingDisclosure]:
-         """Parse QuiverQuant trade data into TradingDisclosure"""
-         try:
-             # Debug: Log the trade data structure
-             logger.debug(f"Parsing QuiverQuant trade data: {trade_data}")
-             # Parse transaction type
-             transaction_type_map = {
-                 "purchase": TransactionType.PURCHASE,
-                 "sale": TransactionType.SALE,
-                 "buy": TransactionType.PURCHASE,
-                 "sell": TransactionType.SALE,
-             }
-
-             transaction_type = transaction_type_map.get(
-                 trade_data.get("transaction_type", "").lower(), TransactionType.PURCHASE
-             )
-
-             # Parse date
-             date_str = trade_data.get("transaction_date", "")
-             if not date_str or date_str.strip() == "" or not self._looks_like_date(date_str):
-                 # Use estimated date if no valid date found
-                 transaction_date = datetime.now() - timedelta(days=30)  # Estimate 30 days ago
-             else:
-                 # Try multiple date formats
-                 try:
-                     # Standard format
-                     transaction_date = datetime.strptime(date_str, "%Y-%m-%d")
-                 except ValueError:
-                     try:
-                         # Alternative format
-                         transaction_date = datetime.strptime(date_str, "%m/%d/%Y")
-                     except ValueError:
-                         try:
-                             # Try MM-DD-YYYY
-                             transaction_date = datetime.strptime(date_str, "%m-%d-%Y")
-                         except ValueError:
-                             logger.warning(
-                                 f"Could not parse date '{date_str}', using estimated date"
-                             )
-                             transaction_date = datetime.now() - timedelta(days=30)
-
-             # Parse amount
-             amount_min, amount_max, amount_exact = self.parse_amount_range(
-                 trade_data.get("amount", "")
-             )
-
-             disclosure = TradingDisclosure(
-                 politician_id="",  # Will be filled after politician matching
-                 transaction_date=transaction_date,
-                 disclosure_date=datetime.now(),  # QuiverQuant aggregation date
-                 transaction_type=transaction_type,
-                 asset_name=trade_data.get("ticker", ""),
-                 asset_ticker=trade_data.get("ticker", ""),
-                 asset_type="stock",
-                 amount_range_min=amount_min,
-                 amount_range_max=amount_max,
-                 amount_exact=amount_exact,
-                 source_url="https://www.quiverquant.com/congresstrading/",
-                 raw_data=trade_data,
-             )
-
-             return disclosure
-
-         except Exception as e:
-             logger.error(f"Failed to parse QuiverQuant trade: {e}")
-             return None
-
-
- class EUParliamentScraper(BaseScraper):
-     """Scraper for EU Parliament member declarations"""
-
-     async def scrape_mep_declarations(self) -> List[TradingDisclosure]:
-         """Scrape MEP financial declarations from official EU Parliament site"""
-         disclosures = []
-
-         try:
-             logger.info("Starting EU Parliament MEP declarations scrape")
-             base_url = "https://www.europarl.europa.eu"
-
-             async with aiohttp.ClientSession(
-                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                 headers={"User-Agent": self.config.user_agent},
-             ) as session:
-
-                 # Get list of current MEPs
-                 mep_list_url = f"{base_url}/meps/en/full-list/all"
-
-                 async with session.get(mep_list_url) as response:
-                     if response.status == 200:
-                         html = await response.text()
-                         mep_data = await self._extract_mep_urls(html, base_url)
-                         logger.info(f"Found {len(mep_data)} MEP profiles to check")
-
-                         # Check declarations for a subset of MEPs (to avoid overwhelming the server)
-                         for i, mep_info in enumerate(mep_data[:50]):  # Limit to 50 MEPs
-                             try:
-                                 mep_disclosures = await self._scrape_mep_profile(
-                                     session, mep_info["url"], mep_info
-                                 )
-                                 disclosures.extend(mep_disclosures)
-
-                                 # Rate limiting - EU Parliament is more sensitive
-                                 await asyncio.sleep(self.config.request_delay * 2)
-
-                                 if i > 0 and i % 10 == 0:
-                                     logger.info(f"Processed {i} MEP profiles")
-
-                             except Exception as e:
-                                 logger.warning(f"Failed to process MEP profile {mep_url}: {e}")
-                                 continue
-                     else:
-                         logger.warning(f"Failed to access MEP list: {response.status}")
-
-             logger.info(f"Collected {len(disclosures)} EU Parliament disclosures")
-
-         except Exception as e:
-             logger.error(f"EU Parliament scrape failed: {e}")
-
-         return disclosures
-
-     async def _extract_mep_urls(self, html: str, base_url: str) -> List[Dict[str, str]]:
-         """Extract MEP profile URLs and names from the MEP list page"""
-         mep_data = []
-
-         try:
-             soup = BeautifulSoup(html, "html.parser")
-
-             # Look for MEP profile links - they usually contain both name and link
-             mep_links = soup.find_all("a", href=True)
-
-             seen_urls = set()
-
-             for link in mep_links:
-                 href = link.get("href", "")
-                 if "/meps/en/" in href and "/home" in href:
-                     full_url = urljoin(base_url, href)
-
-                     if full_url not in seen_urls:
-                         # Extract MEP name from link text or nearby elements
-                         mep_name = ""
-
-                         # Try to get name from link text
-                         link_text = link.get_text(strip=True)
-                         if (
-                             link_text
-                             and len(link_text) > 3
-                             and not link_text.lower().startswith("http")
-                         ):
-                             mep_name = link_text
-
-                         # If no name in link, look in parent elements
-                         if not mep_name:
-                             parent = link.parent
-                             if parent:
-                                 # Look for text that looks like a name
-                                 for text_node in parent.stripped_strings:
-                                     if (
-                                         len(text_node) > 3
-                                         and " " in text_node
-                                         and not text_node.startswith("http")
-                                         and not text_node.isdigit()
-                                     ):
-                                         mep_name = text_node
-                                         break
-
-                         # Extract country/party info if available
-                         country = ""
-                         party = ""
-
-                         # Look for country and party info near the link
-                         container = link.find_parent(["div", "article", "section"])
-                         if container:
-                             text_elements = list(container.stripped_strings)
-                             for i, text in enumerate(text_elements):
-                                 if text == mep_name and i < len(text_elements) - 2:
-                                     # Country and party usually come after name
-                                     country = (
-                                         text_elements[i + 1] if i + 1 < len(text_elements) else ""
-                                     )
-                                     party = (
-                                         text_elements[i + 2] if i + 2 < len(text_elements) else ""
-                                     )
-
-                         if mep_name:  # Only add if we found a name
-                             mep_data.append(
-                                 {
-                                     "url": full_url,
-                                     "name": mep_name,
-                                     "country": country,
-                                     "party": party,
-                                 }
-                             )
-                             seen_urls.add(full_url)
-
-                             # Limit to prevent overwhelming the servers
-                             if len(mep_data) >= 50:
-                                 break
-
-         except Exception as e:
-             logger.error(f"Error extracting MEP data: {e}")
-
-         return mep_data
-
-     async def _scrape_mep_profile(
-         self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None
-     ) -> List[TradingDisclosure]:
-         """Scrape financial interests from an individual MEP profile"""
-         disclosures = []
-
-         try:
-             async with session.get(mep_url) as response:
-                 if response.status == 200:
-                     html = await response.text()
-                     soup = BeautifulSoup(html, "html.parser")
-
-                     # Use extracted MEP name from list, or try to extract from profile
-                     if mep_info and mep_info.get("name"):
-                         mep_name = mep_info["name"]
-                         mep_country = mep_info.get("country", "")
-                         mep_party = mep_info.get("party", "")
-                     else:
-                         # Fallback: extract from profile page
-                         name_element = soup.find("h1", class_="ep-header-title")
-                         mep_name = (
-                             name_element.get_text(strip=True) if name_element else "Unknown MEP"
-                         )
-                         mep_country = ""
-                         mep_party = ""
-
-                     # Look for financial interests section
-                     # EU Parliament declarations are typically in a specific section
-                     interests_section = (
-                         soup.find("div", id="financial-interests")
-                         or soup.find("section", class_="ep-a-section")
-                         or soup.find("div", class_="ep-m-content-block")
-                     )
-
-                     if interests_section:
-                         # Parse financial interests
-                         # Note: EU declarations focus more on activities and interests than specific trades
-                         interest_items = interests_section.find_all(
-                             ["p", "li", "div"], recursive=True
-                         )
-
-                         for item in interest_items:
-                             item_text = item.get_text(strip=True).lower()
-
-                             # Look for financial keywords
-                             if any(
-                                 keyword in item_text
-                                 for keyword in [
-                                     "shareholding",
-                                     "investment",
-                                     "director",
-                                     "board",
-                                     "financial interest",
-                                     "remuneration",
-                                     "consulting",
-                                 ]
-                             ):
-                                 # Create disclosure for detected financial interest
-                                 disclosure = TradingDisclosure(
-                                     politician_id="",  # To be filled by matcher
-                                     transaction_date=datetime.now()
-                                     - timedelta(days=90),  # Estimate
-                                     disclosure_date=datetime.now() - timedelta(days=60),  # Estimate
-                                     transaction_type=TransactionType.PURCHASE,  # Default for interests
-                                     asset_name=self._extract_company_name(item_text),
-                                     asset_type="interest",
-                                     amount_range_min=Decimal(
-                                         "0"
-                                     ),  # EU doesn't always specify amounts
-                                     amount_range_max=Decimal("0"),
-                                     source_url=mep_url,
-                                     raw_data={
-                                         "politician_name": mep_name,
-                                         "country": mep_country,
-                                         "party": mep_party,
-                                         "interest_type": "financial_activity",
-                                         "interest_description": item.get_text(strip=True)[
-                                             :500
-                                         ],  # Truncate
-                                         "region": "eu",
-                                         "extraction_method": "mep_profile_scraping",
-                                         "requires_manual_review": True,
-                                     },
-                                 )
-                                 disclosures.append(disclosure)
-
-         except Exception as e:
-             logger.warning(f"Error scraping MEP profile {mep_url}: {e}")
-
-         return disclosures
-
-     def _extract_company_name(self, text: str) -> str:
-         """Extract company/organization name from interest description"""
-         # Simple heuristic to extract potential company names
-         words = text.split()
-
-         # Look for capitalized sequences that might be company names
-         potential_names = []
-         current_name = []
-
-         for word in words:
-             if word[0].isupper() and len(word) > 2:
-                 current_name.append(word)
-             else:
-                 if current_name and len(current_name) <= 4:  # Reasonable company name length
-                     potential_names.append(" ".join(current_name))
-                 current_name = []
-
-         if current_name and len(current_name) <= 4:
-             potential_names.append(" ".join(current_name))
-
-         # Return the first reasonable candidate or default
-         return potential_names[0] if potential_names else "Financial Interest"
-
-
- class PoliticianMatcher:
-     """Matches scraped names to politician records"""
-
-     def __init__(self, politicians: List[Politician]):
-         self.politicians = politicians
-         self._build_lookup()
-
-     def _build_lookup(self):
-         """Build lookup dictionaries for fast matching"""
-         self.name_lookup = {}
-         self.bioguide_lookup = {}
-
-         for politician in self.politicians:
-             # Full name variations
-             full_name = politician.full_name.lower()
-             self.name_lookup[full_name] = politician
-
-             # Last, First format
-             if politician.first_name and politician.last_name:
-                 last_first = f"{politician.last_name.lower()}, {politician.first_name.lower()}"
-                 self.name_lookup[last_first] = politician
-
-                 # First Last format
-                 first_last = f"{politician.first_name.lower()} {politician.last_name.lower()}"
-                 self.name_lookup[first_last] = politician
-
-             # Bioguide ID lookup
-             if politician.bioguide_id:
-                 self.bioguide_lookup[politician.bioguide_id] = politician
-
-     def find_politician(self, name: str, bioguide_id: str = None) -> Optional[Politician]:
-         """Find politician by name or bioguide ID"""
-         if bioguide_id and bioguide_id in self.bioguide_lookup:
-             return self.bioguide_lookup[bioguide_id]
-
-         if name:
-             name_clean = name.lower().strip()
-
-             # Direct match
-             if name_clean in self.name_lookup:
-                 return self.name_lookup[name_clean]
-
-             # Fuzzy matching (simplified)
-             for lookup_name, politician in self.name_lookup.items():
-                 if self._names_similar(name_clean, lookup_name):
-                     return politician
-
-         return None
-
-     def _names_similar(self, name1: str, name2: str) -> bool:
-         """Simple similarity check for names"""
-         # Remove common prefixes/suffixes
-         prefixes = ["rep.", "sen.", "senator", "representative", "mr.", "mrs.", "ms."]
-         suffixes = ["jr.", "sr.", "ii", "iii", "iv"]
-
-         for prefix in prefixes:
-             name1 = name1.replace(prefix, "").strip()
-             name2 = name2.replace(prefix, "").strip()
-
-         for suffix in suffixes:
-             name1 = name1.replace(suffix, "").strip()
-             name2 = name2.replace(suffix, "").strip()
-
-         # Check if one name contains the other
-         return name1 in name2 or name2 in name1
-
-
- # Import specialized scrapers after base classes are defined
- try:
-     from .scrapers_uk import UKParliamentScraper, run_uk_parliament_collection
-
-     UK_SCRAPER_AVAILABLE = True
- except Exception as e:
-     logger.debug(f"UK scraper import failed: {e}")
-     UKParliamentScraper = None
-     run_uk_parliament_collection = None
-     UK_SCRAPER_AVAILABLE = False
-
- try:
-     from .scrapers_california import CaliforniaNetFileScraper, run_california_collection
-
-     CALIFORNIA_SCRAPER_AVAILABLE = True
- except Exception as e:
-     logger.debug(f"California scraper import failed: {e}")
-     CaliforniaNetFileScraper = None
-     run_california_collection = None
-     CALIFORNIA_SCRAPER_AVAILABLE = False
-
- try:
-     from .scrapers_eu import EUMemberStatesScraper, run_eu_member_states_collection
-
-     EU_MEMBER_STATES_SCRAPER_AVAILABLE = True
- except Exception as e:
-     logger.debug(f"EU member states scraper import failed: {e}")
-     EUMemberStatesScraper = None
-     run_eu_member_states_collection = None
-     EU_MEMBER_STATES_SCRAPER_AVAILABLE = False
-
- try:
-     from .scrapers_us_states import USStatesScraper, run_us_states_collection
-
-     US_STATES_SCRAPER_AVAILABLE = True
- except Exception as e:
-     logger.debug(f"US states scraper import failed: {e}")
-     USStatesScraper = None
-     run_us_states_collection = None
-     US_STATES_SCRAPER_AVAILABLE = False
-
-
- # Workflow functions using imported scrapers
- async def run_uk_parliament_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-     """Run UK Parliament data collection workflow"""
-     if not UK_SCRAPER_AVAILABLE:
-         logger.warning("UK Parliament scraper not available")
-         return []
-
-     logger.info("Starting UK Parliament financial interests collection")
-     try:
-         disclosures = await run_uk_parliament_collection(config)
-         logger.info(f"Successfully collected {len(disclosures)} UK Parliament disclosures")
-         return disclosures
-     except Exception as e:
-         logger.error(f"UK Parliament collection failed: {e}")
-         return []
-
-
- async def run_california_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-     """Run California NetFile and state disclosure collection workflow"""
-     if not CALIFORNIA_SCRAPER_AVAILABLE:
-         logger.warning("California scraper not available")
-         return []
-
-     logger.info("Starting California financial disclosures collection")
-     try:
-         disclosures = await run_california_collection(config)
-         logger.info(f"Successfully collected {len(disclosures)} California disclosures")
-         return disclosures
-     except Exception as e:
-         logger.error(f"California collection failed: {e}")
-         return []
-
-
- async def run_eu_member_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-     """Run EU member states financial disclosure collection workflow"""
-     if not EU_MEMBER_STATES_SCRAPER_AVAILABLE:
-         logger.warning("EU member states scraper not available")
-         return []
-
-     logger.info("Starting EU member states financial disclosures collection")
-     try:
-         disclosures = await run_eu_member_states_collection(config)
-         logger.info(f"Successfully collected {len(disclosures)} EU member states disclosures")
-         return disclosures
-     except Exception as e:
-         logger.error(f"EU member states collection failed: {e}")
-         return []
-
-
- async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-     """Run US states financial disclosure collection workflow"""
-     if not US_STATES_SCRAPER_AVAILABLE:
-         logger.warning("US states scraper not available")
-         return []
-
-     logger.info("Starting US states financial disclosures collection")
-     try:
-         disclosures = await run_us_states_collection(config)
-         logger.info(f"Successfully collected {len(disclosures)} US states disclosures")
-         return disclosures
-     except Exception as e:
-         logger.error(f"US states collection failed: {e}")
-         return []
-
-
- # Export the new workflow function
- __all__ = [
-     "BaseScraper",
-     "CongressTradingScraper",
-     "QuiverQuantScraper",
-     "EUParliamentScraper",
-     "PoliticianMatcher",
-     "run_uk_parliament_workflow",
-     "run_california_workflow",
-     "run_eu_member_states_workflow",
-     "run_us_states_workflow",
- ]
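
The politician_trading package (items 19-38 in the file list) is removed outright in 7.10.2. For anyone still pinned to 7.10.0, a minimal sketch of how the removed scrapers were typically driven is shown below. It is based only on the code in the hunk above and assumes ScrapingConfig() can be constructed with its defaults (timeout, user_agent, request_delay, and max_retries are the attributes BaseScraper reads); none of this applies to the 7.10.2 API.

import asyncio

from mcli.workflow.politician_trading.config import ScrapingConfig
from mcli.workflow.politician_trading.scrapers import CongressTradingScraper


async def main() -> None:
    # Assumption: ScrapingConfig() provides usable defaults; adjust fields as needed.
    config = ScrapingConfig()
    # BaseScraper is an async context manager: __aenter__ opens the shared
    # aiohttp session and __aexit__ closes it.
    async with CongressTradingScraper(config) as scraper:
        house = await scraper.scrape_house_disclosures()
        senate = await scraper.scrape_senate_disclosures()
    print(f"Collected {len(house)} House and {len(senate)} Senate disclosures")


if __name__ == "__main__":
    asyncio.run(main())

The returned TradingDisclosure records carry an empty politician_id; PoliticianMatcher.find_politician was the intended way to resolve them against known Politician records.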