mcli-framework 7.1.0__py3-none-any.whl → 7.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mcli-framework might be problematic.

Files changed (94)
  1. mcli/app/completion_cmd.py +59 -49
  2. mcli/app/completion_helpers.py +60 -138
  3. mcli/app/logs_cmd.py +46 -13
  4. mcli/app/main.py +17 -14
  5. mcli/app/model_cmd.py +19 -4
  6. mcli/chat/chat.py +3 -2
  7. mcli/lib/search/cached_vectorizer.py +1 -0
  8. mcli/lib/services/data_pipeline.py +12 -5
  9. mcli/lib/services/lsh_client.py +69 -58
  10. mcli/ml/api/app.py +28 -36
  11. mcli/ml/api/middleware.py +8 -16
  12. mcli/ml/api/routers/admin_router.py +3 -1
  13. mcli/ml/api/routers/auth_router.py +32 -56
  14. mcli/ml/api/routers/backtest_router.py +3 -1
  15. mcli/ml/api/routers/data_router.py +3 -1
  16. mcli/ml/api/routers/model_router.py +35 -74
  17. mcli/ml/api/routers/monitoring_router.py +3 -1
  18. mcli/ml/api/routers/portfolio_router.py +3 -1
  19. mcli/ml/api/routers/prediction_router.py +60 -65
  20. mcli/ml/api/routers/trade_router.py +6 -2
  21. mcli/ml/api/routers/websocket_router.py +12 -9
  22. mcli/ml/api/schemas.py +10 -2
  23. mcli/ml/auth/auth_manager.py +49 -114
  24. mcli/ml/auth/models.py +30 -15
  25. mcli/ml/auth/permissions.py +12 -19
  26. mcli/ml/backtesting/backtest_engine.py +134 -108
  27. mcli/ml/backtesting/performance_metrics.py +142 -108
  28. mcli/ml/cache.py +12 -18
  29. mcli/ml/cli/main.py +37 -23
  30. mcli/ml/config/settings.py +29 -12
  31. mcli/ml/dashboard/app.py +122 -130
  32. mcli/ml/dashboard/app_integrated.py +283 -152
  33. mcli/ml/dashboard/app_supabase.py +176 -108
  34. mcli/ml/dashboard/app_training.py +212 -206
  35. mcli/ml/dashboard/cli.py +14 -5
  36. mcli/ml/data_ingestion/api_connectors.py +51 -81
  37. mcli/ml/data_ingestion/data_pipeline.py +127 -125
  38. mcli/ml/data_ingestion/stream_processor.py +72 -80
  39. mcli/ml/database/migrations/env.py +3 -2
  40. mcli/ml/database/models.py +112 -79
  41. mcli/ml/database/session.py +6 -5
  42. mcli/ml/experimentation/ab_testing.py +149 -99
  43. mcli/ml/features/ensemble_features.py +9 -8
  44. mcli/ml/features/political_features.py +6 -5
  45. mcli/ml/features/recommendation_engine.py +15 -14
  46. mcli/ml/features/stock_features.py +7 -6
  47. mcli/ml/features/test_feature_engineering.py +8 -7
  48. mcli/ml/logging.py +10 -15
  49. mcli/ml/mlops/data_versioning.py +57 -64
  50. mcli/ml/mlops/experiment_tracker.py +49 -41
  51. mcli/ml/mlops/model_serving.py +59 -62
  52. mcli/ml/mlops/pipeline_orchestrator.py +203 -149
  53. mcli/ml/models/base_models.py +8 -7
  54. mcli/ml/models/ensemble_models.py +6 -5
  55. mcli/ml/models/recommendation_models.py +7 -6
  56. mcli/ml/models/test_models.py +18 -14
  57. mcli/ml/monitoring/drift_detection.py +95 -74
  58. mcli/ml/monitoring/metrics.py +10 -22
  59. mcli/ml/optimization/portfolio_optimizer.py +172 -132
  60. mcli/ml/predictions/prediction_engine.py +235 -0
  61. mcli/ml/preprocessing/data_cleaners.py +6 -5
  62. mcli/ml/preprocessing/feature_extractors.py +7 -6
  63. mcli/ml/preprocessing/ml_pipeline.py +3 -2
  64. mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
  65. mcli/ml/preprocessing/test_preprocessing.py +4 -4
  66. mcli/ml/scripts/populate_sample_data.py +36 -16
  67. mcli/ml/tasks.py +82 -83
  68. mcli/ml/tests/test_integration.py +86 -76
  69. mcli/ml/tests/test_training_dashboard.py +169 -142
  70. mcli/mygroup/test_cmd.py +2 -1
  71. mcli/self/self_cmd.py +38 -18
  72. mcli/self/test_cmd.py +2 -1
  73. mcli/workflow/dashboard/dashboard_cmd.py +13 -6
  74. mcli/workflow/lsh_integration.py +46 -58
  75. mcli/workflow/politician_trading/commands.py +576 -427
  76. mcli/workflow/politician_trading/config.py +7 -7
  77. mcli/workflow/politician_trading/connectivity.py +35 -33
  78. mcli/workflow/politician_trading/data_sources.py +72 -71
  79. mcli/workflow/politician_trading/database.py +18 -16
  80. mcli/workflow/politician_trading/demo.py +4 -3
  81. mcli/workflow/politician_trading/models.py +5 -5
  82. mcli/workflow/politician_trading/monitoring.py +13 -13
  83. mcli/workflow/politician_trading/scrapers.py +332 -224
  84. mcli/workflow/politician_trading/scrapers_california.py +116 -94
  85. mcli/workflow/politician_trading/scrapers_eu.py +70 -71
  86. mcli/workflow/politician_trading/scrapers_uk.py +118 -90
  87. mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
  88. mcli/workflow/politician_trading/workflow.py +98 -71
  89. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +2 -2
  90. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -93
  91. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
  92. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
  93. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
  94. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
mcli/workflow/politician_trading/scrapers.py

@@ -7,14 +7,14 @@ import logging
  import re
  from datetime import datetime, timedelta
  from decimal import Decimal
- from typing import List, Dict, Any, Optional, Tuple
+ from typing import Any, Dict, List, Optional, Tuple
  from urllib.parse import urljoin, urlparse

  import aiohttp
  from bs4 import BeautifulSoup

- from .models import Politician, TradingDisclosure, TransactionType, PoliticianRole
  from .config import ScrapingConfig
+ from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType

  logger = logging.getLogger(__name__)
@@ -114,105 +114,139 @@ class CongressTradingScraper(BaseScraper):

  try:
  logger.info("Starting House disclosures scrape from official database")
-
+
  async with aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(total=self.config.timeout),
- headers={'User-Agent': self.config.user_agent}
+ headers={"User-Agent": self.config.user_agent},
  ) as session:
-
+
  # Get the ViewSearch form page
  view_search_url = f"{base_url}/FinancialDisclosure/ViewSearch"
  async with session.get(view_search_url) as response:
  if response.status == 200:
  html = await response.text()
  logger.info("Successfully accessed House financial disclosure search form")
-
+
  # Extract form data for ASPX
- soup = BeautifulSoup(html, 'html.parser')
-
+ soup = BeautifulSoup(html, "html.parser")
+
  # Look for common ASPX form fields
  form_fields = {}
- for field_name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION', '__REQUESTVERIFICATIONTOKEN']:
- field = soup.find('input', {'name': field_name})
- if field and field.get('value'):
- form_fields[field_name] = field['value']
-
- if '__VIEWSTATE' in form_fields:
+ for field_name in [
+ "__VIEWSTATE",
+ "__VIEWSTATEGENERATOR",
+ "__EVENTVALIDATION",
+ "__REQUESTVERIFICATIONTOKEN",
+ ]:
+ field = soup.find("input", {"name": field_name})
+ if field and field.get("value"):
+ form_fields[field_name] = field["value"]
+
+ if "__VIEWSTATE" in form_fields:
  logger.info("Found required ASPX form fields")
-
+
  # Search for recent disclosures - try different form field names
  current_year = str(datetime.now().year)
-
+
  # Search for common politician last names to get real data
- common_names = ['Smith', 'Johnson', 'Brown', 'Davis', 'Wilson', 'Miller', 'Garcia']
-
+ common_names = [
+ "Smith",
+ "Johnson",
+ "Brown",
+ "Davis",
+ "Wilson",
+ "Miller",
+ "Garcia",
+ ]
+
  # Try different form patterns with actual names
  possible_form_data_sets = []
-
+
  for name in common_names:
- possible_form_data_sets.extend([
+ possible_form_data_sets.extend(
+ [
+ {
+ **form_fields,
+ "ctl00$MainContent$txtLastName": name,
+ "ctl00$MainContent$ddlFilingYear": current_year,
+ "ctl00$MainContent$btnSearch": "Search",
+ },
+ {
+ **form_fields,
+ "ctl00$ContentPlaceHolder1$txtLastName": name,
+ "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
+ "ctl00$ContentPlaceHolder1$btnSearch": "Search",
+ },
+ {
+ **form_fields,
+ "LastName": name,
+ "FilingYear": current_year,
+ "Search": "Search",
+ },
+ ]
+ )
+
+ # Also try without names (all results)
+ possible_form_data_sets.extend(
+ [
  {
  **form_fields,
- 'ctl00$MainContent$txtLastName': name,
- 'ctl00$MainContent$ddlFilingYear': current_year,
- 'ctl00$MainContent$btnSearch': 'Search'
+ "ctl00$MainContent$ddlFilingYear": current_year,
+ "ctl00$MainContent$btnSearch": "Search",
  },
  {
  **form_fields,
- 'ctl00$ContentPlaceHolder1$txtLastName': name,
- 'ctl00$ContentPlaceHolder1$ddlFilingYear': current_year,
- 'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
+ "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
+ "ctl00$ContentPlaceHolder1$btnSearch": "Search",
  },
- {
- **form_fields,
- 'LastName': name,
- 'FilingYear': current_year,
- 'Search': 'Search'
- }
- ])
-
- # Also try without names (all results)
- possible_form_data_sets.extend([
- {
- **form_fields,
- 'ctl00$MainContent$ddlFilingYear': current_year,
- 'ctl00$MainContent$btnSearch': 'Search'
- },
- {
- **form_fields,
- 'ctl00$ContentPlaceHolder1$ddlFilingYear': current_year,
- 'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
- }
- ])
-
+ ]
+ )
+
  # Try each form configuration
  for i, form_data in enumerate(possible_form_data_sets):
  try:
  logger.info(f"Attempting search with form configuration {i+1}")
- async with session.post(view_search_url, data=form_data) as search_response:
+ async with session.post(
+ view_search_url, data=form_data
+ ) as search_response:
  if search_response.status == 200:
  results_html = await search_response.text()
- if 'search results' in results_html.lower() or 'disclosure' in results_html.lower():
- disclosures = await self._parse_house_results(results_html, base_url)
- logger.info(f"Successfully found {len(disclosures)} House disclosures")
+ if (
+ "search results" in results_html.lower()
+ or "disclosure" in results_html.lower()
+ ):
+ disclosures = await self._parse_house_results(
+ results_html, base_url
+ )
+ logger.info(
+ f"Successfully found {len(disclosures)} House disclosures"
+ )
  break
  else:
- logger.debug(f"Form config {i+1} didn't return results")
+ logger.debug(
+ f"Form config {i+1} didn't return results"
+ )
  else:
- logger.debug(f"Form config {i+1} failed with status {search_response.status}")
+ logger.debug(
+ f"Form config {i+1} failed with status {search_response.status}"
+ )
  except Exception as e:
  logger.debug(f"Form config {i+1} failed: {e}")
  else:
- logger.warning("All form configurations failed, using basic page scraping")
+ logger.warning(
+ "All form configurations failed, using basic page scraping"
+ )
  # Fall back to scraping any existing disclosure links on the page
  disclosures = await self._parse_house_results(html, base_url)
  else:
- logger.warning("Could not find required ASPX form fields, using basic page scraping")
+ logger.warning(
+ "Could not find required ASPX form fields, using basic page scraping"
+ )
  # Fall back to parsing any existing links
  disclosures = await self._parse_house_results(html, base_url)
  else:
  logger.warning(f"Failed to access House disclosure site: {response.status}")
-
+
  # Rate limiting
  await asyncio.sleep(self.config.request_delay)
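The House flow above depends on first harvesting the hidden ASP.NET postback fields from the ViewSearch page, then replaying them with each candidate form layout. A minimal standalone sketch of that extraction step, using only BeautifulSoup and the field names visible in the diff (the function name and surrounding structure are illustrative, not part of the package's API):

    from bs4 import BeautifulSoup

    ASPX_FIELDS = [
        "__VIEWSTATE",
        "__VIEWSTATEGENERATOR",
        "__EVENTVALIDATION",
        "__REQUESTVERIFICATIONTOKEN",
    ]

    def extract_aspx_fields(html: str) -> dict:
        """Collect the hidden ASPX inputs needed to replay a search postback."""
        soup = BeautifulSoup(html, "html.parser")
        fields = {}
        for name in ASPX_FIELDS:
            field = soup.find("input", {"name": name})
            if field and field.get("value"):
                fields[name] = field["value"]
        return fields

The extracted dict is then merged (via **form_fields) into each candidate payload before POSTing back to the same URL, which is what the form-configuration loop above does.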
@@ -230,19 +264,21 @@ class CongressTradingScraper(BaseScraper):

  try:
  logger.info("Starting Senate disclosures scrape from EFD database")
-
+
  async with aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(total=self.config.timeout),
- headers={'User-Agent': self.config.user_agent}
+ headers={"User-Agent": self.config.user_agent},
  ) as session:
-
+
  # Search for recent periodic transaction reports (PTRs)
  search_params = {
- 'report_type': '11', # Periodic Transaction Report
- 'submitted_start_date': (datetime.now() - timedelta(days=90)).strftime('%m/%d/%Y'),
- 'submitted_end_date': datetime.now().strftime('%m/%d/%Y')
+ "report_type": "11", # Periodic Transaction Report
+ "submitted_start_date": (datetime.now() - timedelta(days=90)).strftime(
+ "%m/%d/%Y"
+ ),
+ "submitted_end_date": datetime.now().strftime("%m/%d/%Y"),
  }
-
+
  async with session.get(search_url, params=search_params) as response:
  if response.status == 200:
  html = await response.text()
@@ -250,7 +286,7 @@ class CongressTradingScraper(BaseScraper):
  logger.info(f"Found {len(disclosures)} Senate disclosures")
  else:
  logger.warning(f"Senate search failed with status {response.status}")
-
+
  # Rate limiting
  await asyncio.sleep(self.config.request_delay)
@@ -263,64 +299,89 @@ class CongressTradingScraper(BaseScraper):
  async def _parse_house_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
  """Parse House disclosure search results"""
  disclosures = []
-
+
  try:
- soup = BeautifulSoup(html, 'html.parser')
-
+ soup = BeautifulSoup(html, "html.parser")
+
  # Look for disclosure result rows - try multiple selectors
- result_rows = (soup.find_all('tr', class_='disclosure-row') or
- soup.select('tr[id*="GridView"]') or
- soup.select('table tr') or
- soup.find_all('tr'))
-
+ result_rows = (
+ soup.find_all("tr", class_="disclosure-row")
+ or soup.select('tr[id*="GridView"]')
+ or soup.select("table tr")
+ or soup.find_all("tr")
+ )
+
  logger.info(f"Found {len(result_rows)} potential result rows")
-
+
  for row in result_rows[:20]: # Limit to 20 most recent
- cells = row.find_all('td')
+ cells = row.find_all("td")
  if len(cells) >= 3: # At least 3 columns
  # Extract text from each cell to identify the structure
- cell_texts = [cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)]
-
+ cell_texts = [
+ cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)
+ ]
+
  if not cell_texts:
  continue
-
+
  # Try to identify which cell contains the politician name
  # Names usually contain letters and may have titles like "Rep.", "Hon."
  politician_name = ""
-
+
  for text in cell_texts:
  # Look for text that looks like a person's name
- if (len(text) > 3 and
- any(c.isalpha() for c in text) and
- not text.isdigit() and
- not text.startswith('20') and # Not a year
- 'pdf' not in text.lower() and
- 'view' not in text.lower()):
-
+ if (
+ len(text) > 3
+ and any(c.isalpha() for c in text)
+ and not text.isdigit()
+ and not text.startswith("20") # Not a year
+ and "pdf" not in text.lower()
+ and "view" not in text.lower()
+ ):
+
  # Clean up potential name
- clean_name = text.replace('Hon.', '').replace('Rep.', '').replace('Sen.', '').strip()
- if len(clean_name) > 3 and ' ' in clean_name: # Likely full name
+ clean_name = (
+ text.replace("Hon.", "")
+ .replace("Rep.", "")
+ .replace("Sen.", "")
+ .strip()
+ )
+ if len(clean_name) > 3 and " " in clean_name: # Likely full name
  politician_name = clean_name
  break
-
+
  if not politician_name:
  politician_name = cell_texts[0] # Fallback to first cell
-
+
  # Extract other information
- filing_year = next((text for text in cell_texts if text.isdigit() and len(text) == 4 and text.startswith('20')), "")
- filing_type = next((text for text in cell_texts if 'periodic' in text.lower() or 'annual' in text.lower()), "")
-
+ filing_year = next(
+ (
+ text
+ for text in cell_texts
+ if text.isdigit() and len(text) == 4 and text.startswith("20")
+ ),
+ "",
+ )
+ filing_type = next(
+ (
+ text
+ for text in cell_texts
+ if "periodic" in text.lower() or "annual" in text.lower()
+ ),
+ "",
+ )
+
  # Look for PDF link
- pdf_link = row.find('a', href=True)
+ pdf_link = row.find("a", href=True)
  if pdf_link:
- pdf_url = urljoin(base_url, pdf_link['href'])
-
+ pdf_url = urljoin(base_url, pdf_link["href"])
+
  # Create basic disclosure entry
  # Note: Actual transaction details would require PDF parsing
  disclosure = TradingDisclosure(
  politician_id="", # To be filled by matcher
  transaction_date=datetime.now() - timedelta(days=30), # Estimate
- disclosure_date=datetime.now() - timedelta(days=15), # Estimate
+ disclosure_date=datetime.now() - timedelta(days=15), # Estimate
  transaction_type=TransactionType.PURCHASE, # Default
  asset_name="Unknown Asset", # Would need PDF parsing
  asset_type="stock",
@@ -332,39 +393,39 @@ class CongressTradingScraper(BaseScraper):
  "filing_year": filing_year,
  "filing_type": filing_type,
  "requires_pdf_parsing": True,
- "extraction_method": "house_search_results"
- }
+ "extraction_method": "house_search_results",
+ },
  )
  disclosures.append(disclosure)
-
+
  except Exception as e:
  logger.error(f"Error parsing House results: {e}")
-
+
  return disclosures

  async def _parse_senate_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
  """Parse Senate EFD search results"""
  disclosures = []
-
+
  try:
- soup = BeautifulSoup(html, 'html.parser')
-
+ soup = BeautifulSoup(html, "html.parser")
+
  # Look for search result rows
- result_rows = soup.find_all('tr', class_='searchresult') or soup.select('tbody tr')
-
+ result_rows = soup.find_all("tr", class_="searchresult") or soup.select("tbody tr")
+
  for row in result_rows[:20]: # Limit to 20 most recent
- cells = row.find_all('td')
+ cells = row.find_all("td")
  if len(cells) >= 4:
  # Extract information
  name = cells[0].get_text(strip=True) if cells[0] else ""
  report_type = cells[1].get_text(strip=True) if cells[1] else ""
  filing_date = cells[2].get_text(strip=True) if cells[2] else ""
-
+
  # Look for report link
- report_link = row.find('a', href=True)
- if report_link and 'ptr' in report_type.lower(): # Periodic Transaction Report
- report_url = urljoin(base_url, report_link['href'])
-
+ report_link = row.find("a", href=True)
+ if report_link and "ptr" in report_type.lower(): # Periodic Transaction Report
+ report_url = urljoin(base_url, report_link["href"])
+
  # Create disclosure entry
  # Note: Actual transaction details would require report parsing
  disclosure = TradingDisclosure(
@@ -381,34 +442,29 @@ class CongressTradingScraper(BaseScraper):
  "politician_name": name,
  "report_type": report_type,
  "filing_date": filing_date,
- "requires_report_parsing": True
- }
+ "requires_report_parsing": True,
+ },
  )
  disclosures.append(disclosure)
-
+
  except Exception as e:
  logger.error(f"Error parsing Senate results: {e}")
-
+
  return disclosures

  def _parse_date(self, date_str: str) -> Optional[datetime]:
  """Parse various date formats from disclosure sites"""
  if not date_str:
  return None
-
- date_formats = [
- '%m/%d/%Y',
- '%Y-%m-%d',
- '%m-%d-%Y',
- '%B %d, %Y'
- ]
-
+
+ date_formats = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]
+
  for fmt in date_formats:
  try:
  return datetime.strptime(date_str.strip(), fmt)
  except ValueError:
  continue
-
+
  logger.warning(f"Could not parse date: {date_str}")
  return None
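_parse_date collapses its format list onto one line but the behavior is unchanged: each format is tried with datetime.strptime until one succeeds. A self-contained sketch of the same fallback loop (the standalone function name here is illustrative):

    from datetime import datetime
    from typing import Optional

    DATE_FORMATS = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]

    def parse_disclosure_date(date_str: str) -> Optional[datetime]:
        """Return the first format that parses; None if nothing matches."""
        if not date_str:
            return None
        for fmt in DATE_FORMATS:
            try:
                return datetime.strptime(date_str.strip(), fmt)
            except ValueError:
                continue
        return None  # the real method logs a warning before returning None

    # parse_disclosure_date("March 5, 2024") -> datetime(2024, 3, 5, 0, 0)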
@@ -440,17 +496,19 @@ class QuiverQuantScraper(BaseScraper):
  if len(cells) >= 4:
  # Extract cell contents and try to identify the correct fields
  cell_texts = [cell.get_text(strip=True) for cell in cells]
-
+
  # Try to identify which cell contains what data
  politician_name = cell_texts[0] if len(cell_texts) > 0 else ""
-
+
  # Look for date-like patterns (YYYY-MM-DD, MM/DD/YYYY, etc.)
  transaction_date = ""
  ticker = ""
  transaction_type = ""
  amount = ""
-
- for i, text in enumerate(cell_texts[1:], 1): # Skip first cell (politician name)
+
+ for i, text in enumerate(
+ cell_texts[1:], 1
+ ): # Skip first cell (politician name)
  # Check if this looks like a date
  if self._looks_like_date(text):
  transaction_date = text
@@ -458,19 +516,21 @@ class QuiverQuantScraper(BaseScraper):
  elif text.isupper() and len(text) <= 5 and text.isalpha():
  ticker = text
  # Check if this contains transaction type keywords
- elif any(word in text.lower() for word in ['purchase', 'sale', 'buy', 'sell']):
+ elif any(
+ word in text.lower() for word in ["purchase", "sale", "buy", "sell"]
+ ):
  # Split transaction type and amount if combined
- if '$' in text:
+ if "$" in text:
  # Split on $ to separate transaction type from amount
- parts = text.split('$', 1)
+ parts = text.split("$", 1)
  transaction_type = parts[0].strip()
- amount = '$' + parts[1] if len(parts) > 1 else ""
+ amount = "$" + parts[1] if len(parts) > 1 else ""
  else:
  transaction_type = text
  # Check if this looks like an amount (contains $ or numbers with ,)
- elif '$' in text or (',' in text and any(c.isdigit() for c in text)):
+ elif "$" in text or ("," in text and any(c.isdigit() for c in text)):
  amount = text
-
+
  # Only create trade data if we have essential fields
  if politician_name and (transaction_date or ticker):
  trade_data = {
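The QuiverQuant row parser classifies cells by shape rather than by column position: date-looking strings, short all-caps tokens as tickers, keyword matches for the transaction type, and dollar/comma patterns for the amount. A rough sketch of those same heuristics collected into one helper (the helper name is hypothetical; the individual checks are taken from the diff):

    import re

    DATE_RE = re.compile(r"\d{4}-\d{1,2}-\d{1,2}|\d{1,2}[/-]\d{1,2}[/-]\d{4}")

    def classify_cell(text: str) -> str:
        """Guess what a scraped table cell contains."""
        if DATE_RE.search(text):
            return "date"
        if text.isupper() and text.isalpha() and len(text) <= 5:
            return "ticker"
        if any(word in text.lower() for word in ["purchase", "sale", "buy", "sell"]):
            return "transaction_type"
        if "$" in text or ("," in text and any(c.isdigit() for c in text)):
            return "amount"
        return "other"

    # classify_cell("AAPL") -> "ticker"
    # classify_cell("Purchase $1,001 - $15,000") -> "transaction_type"

In the real method the transaction-type branch additionally splits on "$" so that a combined cell yields both the type and the amount.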
@@ -492,16 +552,17 @@ class QuiverQuantScraper(BaseScraper):
  """Check if a string looks like a date"""
  if not text or len(text) < 8:
  return False
-
+
  # Common date patterns
  date_patterns = [
- r'\d{4}-\d{1,2}-\d{1,2}', # YYYY-MM-DD
- r'\d{1,2}/\d{1,2}/\d{4}', # MM/DD/YYYY
- r'\d{1,2}-\d{1,2}-\d{4}', # MM-DD-YYYY
- r'\w{3}\s+\d{1,2},?\s+\d{4}', # Month DD, YYYY
+ r"\d{4}-\d{1,2}-\d{1,2}", # YYYY-MM-DD
+ r"\d{1,2}/\d{1,2}/\d{4}", # MM/DD/YYYY
+ r"\d{1,2}-\d{1,2}-\d{4}", # MM-DD-YYYY
+ r"\w{3}\s+\d{1,2},?\s+\d{4}", # Month DD, YYYY
  ]
-
+
  import re
+
  for pattern in date_patterns:
  if re.search(pattern, text):
  return True
@@ -543,7 +604,9 @@ class QuiverQuantScraper(BaseScraper):
  # Try MM-DD-YYYY
  transaction_date = datetime.strptime(date_str, "%m-%d-%Y")
  except ValueError:
- logger.warning(f"Could not parse date '{date_str}', using estimated date")
+ logger.warning(
+ f"Could not parse date '{date_str}', using estimated date"
+ )
  transaction_date = datetime.now() - timedelta(days=30)

  # Parse amount
@@ -583,39 +646,41 @@ class EUParliamentScraper(BaseScraper):
  try:
  logger.info("Starting EU Parliament MEP declarations scrape")
  base_url = "https://www.europarl.europa.eu"
-
+
  async with aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(total=self.config.timeout),
- headers={'User-Agent': self.config.user_agent}
+ headers={"User-Agent": self.config.user_agent},
  ) as session:
-
+
  # Get list of current MEPs
  mep_list_url = f"{base_url}/meps/en/full-list/all"
-
+
  async with session.get(mep_list_url) as response:
  if response.status == 200:
  html = await response.text()
  mep_data = await self._extract_mep_urls(html, base_url)
  logger.info(f"Found {len(mep_data)} MEP profiles to check")
-
+
  # Check declarations for a subset of MEPs (to avoid overwhelming the server)
  for i, mep_info in enumerate(mep_data[:50]): # Limit to 50 MEPs
  try:
- mep_disclosures = await self._scrape_mep_profile(session, mep_info['url'], mep_info)
+ mep_disclosures = await self._scrape_mep_profile(
+ session, mep_info["url"], mep_info
+ )
  disclosures.extend(mep_disclosures)
-
+
  # Rate limiting - EU Parliament is more sensitive
  await asyncio.sleep(self.config.request_delay * 2)
-
+
  if i > 0 and i % 10 == 0:
  logger.info(f"Processed {i} MEP profiles")
-
+
  except Exception as e:
  logger.warning(f"Failed to process MEP profile {mep_url}: {e}")
  continue
  else:
  logger.warning(f"Failed to access MEP list: {response.status}")
-
+
  logger.info(f"Collected {len(disclosures)} EU Parliament disclosures")

  except Exception as e:
@@ -626,124 +691,155 @@ class EUParliamentScraper(BaseScraper):
  async def _extract_mep_urls(self, html: str, base_url: str) -> List[Dict[str, str]]:
  """Extract MEP profile URLs and names from the MEP list page"""
  mep_data = []
-
+
  try:
- soup = BeautifulSoup(html, 'html.parser')
-
+ soup = BeautifulSoup(html, "html.parser")
+
  # Look for MEP profile links - they usually contain both name and link
- mep_links = soup.find_all('a', href=True)
-
+ mep_links = soup.find_all("a", href=True)
+
  seen_urls = set()
-
+
  for link in mep_links:
- href = link.get('href', '')
- if '/meps/en/' in href and '/home' in href:
+ href = link.get("href", "")
+ if "/meps/en/" in href and "/home" in href:
  full_url = urljoin(base_url, href)
-
+
  if full_url not in seen_urls:
  # Extract MEP name from link text or nearby elements
  mep_name = ""
-
+
  # Try to get name from link text
  link_text = link.get_text(strip=True)
- if link_text and len(link_text) > 3 and not link_text.lower().startswith('http'):
+ if (
+ link_text
+ and len(link_text) > 3
+ and not link_text.lower().startswith("http")
+ ):
  mep_name = link_text
-
+
  # If no name in link, look in parent elements
  if not mep_name:
  parent = link.parent
  if parent:
  # Look for text that looks like a name
  for text_node in parent.stripped_strings:
- if (len(text_node) > 3 and
- ' ' in text_node and
- not text_node.startswith('http') and
- not text_node.isdigit()):
+ if (
+ len(text_node) > 3
+ and " " in text_node
+ and not text_node.startswith("http")
+ and not text_node.isdigit()
+ ):
  mep_name = text_node
  break
-
+
  # Extract country/party info if available
  country = ""
  party = ""
-
+
  # Look for country and party info near the link
- container = link.find_parent(['div', 'article', 'section'])
+ container = link.find_parent(["div", "article", "section"])
  if container:
  text_elements = list(container.stripped_strings)
  for i, text in enumerate(text_elements):
  if text == mep_name and i < len(text_elements) - 2:
  # Country and party usually come after name
- country = text_elements[i + 1] if i + 1 < len(text_elements) else ""
- party = text_elements[i + 2] if i + 2 < len(text_elements) else ""
-
+ country = (
+ text_elements[i + 1] if i + 1 < len(text_elements) else ""
+ )
+ party = (
+ text_elements[i + 2] if i + 2 < len(text_elements) else ""
+ )
+
  if mep_name: # Only add if we found a name
- mep_data.append({
- 'url': full_url,
- 'name': mep_name,
- 'country': country,
- 'party': party
- })
+ mep_data.append(
+ {
+ "url": full_url,
+ "name": mep_name,
+ "country": country,
+ "party": party,
+ }
+ )
  seen_urls.add(full_url)
-
+
  # Limit to prevent overwhelming the servers
  if len(mep_data) >= 50:
  break
-
+
  except Exception as e:
  logger.error(f"Error extracting MEP data: {e}")
-
+
  return mep_data

- async def _scrape_mep_profile(self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None) -> List[TradingDisclosure]:
+ async def _scrape_mep_profile(
+ self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None
+ ) -> List[TradingDisclosure]:
  """Scrape financial interests from an individual MEP profile"""
  disclosures = []
-
+
  try:
  async with session.get(mep_url) as response:
  if response.status == 200:
  html = await response.text()
- soup = BeautifulSoup(html, 'html.parser')
-
+ soup = BeautifulSoup(html, "html.parser")
+
  # Use extracted MEP name from list, or try to extract from profile
- if mep_info and mep_info.get('name'):
- mep_name = mep_info['name']
- mep_country = mep_info.get('country', '')
- mep_party = mep_info.get('party', '')
+ if mep_info and mep_info.get("name"):
+ mep_name = mep_info["name"]
+ mep_country = mep_info.get("country", "")
+ mep_party = mep_info.get("party", "")
  else:
  # Fallback: extract from profile page
- name_element = soup.find('h1', class_='ep-header-title')
- mep_name = name_element.get_text(strip=True) if name_element else "Unknown MEP"
+ name_element = soup.find("h1", class_="ep-header-title")
+ mep_name = (
+ name_element.get_text(strip=True) if name_element else "Unknown MEP"
+ )
  mep_country = ""
  mep_party = ""
-
+
  # Look for financial interests section
  # EU Parliament declarations are typically in a specific section
- interests_section = soup.find('div', id='financial-interests') or \
- soup.find('section', class_='ep-a-section') or \
- soup.find('div', class_='ep-m-content-block')
-
+ interests_section = (
+ soup.find("div", id="financial-interests")
+ or soup.find("section", class_="ep-a-section")
+ or soup.find("div", class_="ep-m-content-block")
+ )
+
  if interests_section:
  # Parse financial interests
  # Note: EU declarations focus more on activities and interests than specific trades
- interest_items = interests_section.find_all(['p', 'li', 'div'], recursive=True)
-
+ interest_items = interests_section.find_all(
+ ["p", "li", "div"], recursive=True
+ )
+
  for item in interest_items:
  item_text = item.get_text(strip=True).lower()
-
+
  # Look for financial keywords
- if any(keyword in item_text for keyword in [
- 'shareholding', 'investment', 'director', 'board',
- 'financial interest', 'remuneration', 'consulting'
- ]):
+ if any(
+ keyword in item_text
+ for keyword in [
+ "shareholding",
+ "investment",
+ "director",
+ "board",
+ "financial interest",
+ "remuneration",
+ "consulting",
+ ]
+ ):
  # Create disclosure for detected financial interest
  disclosure = TradingDisclosure(
  politician_id="", # To be filled by matcher
- transaction_date=datetime.now() - timedelta(days=90), # Estimate
- disclosure_date=datetime.now() - timedelta(days=60), # Estimate
+ transaction_date=datetime.now()
+ - timedelta(days=90), # Estimate
+ disclosure_date=datetime.now() - timedelta(days=60), # Estimate
  transaction_type=TransactionType.PURCHASE, # Default for interests
  asset_name=self._extract_company_name(item_text),
  asset_type="interest",
- amount_range_min=Decimal("0"), # EU doesn't always specify amounts
+ amount_range_min=Decimal(
+ "0"
+ ), # EU doesn't always specify amounts
  amount_range_max=Decimal("0"),
  source_url=mep_url,
  raw_data={
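_extract_mep_urls and _scrape_mep_profile together drive the EU collection: the first keeps only anchor tags whose href contains both "/meps/en/" and "/home", deduplicated by absolute URL, and the second fetches each profile and keyword-scans its financial-interests section. A compact sketch of just the link-filtering step (standalone function, name illustrative):

    from urllib.parse import urljoin

    from bs4 import BeautifulSoup

    def extract_mep_profile_urls(html: str, base_url: str, limit: int = 50) -> list:
        """Collect unique MEP profile URLs from the full-list page."""
        soup = BeautifulSoup(html, "html.parser")
        urls, seen = [], set()
        for link in soup.find_all("a", href=True):
            href = link.get("href", "")
            if "/meps/en/" in href and "/home" in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen:
                    seen.add(full_url)
                    urls.append(full_url)
            if len(urls) >= limit:
                break
        return urls

The real method additionally tries to pull the MEP's name, country, and party from the surrounding markup before appending the record.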
@@ -751,28 +847,30 @@ class EUParliamentScraper(BaseScraper):
  "country": mep_country,
  "party": mep_party,
  "interest_type": "financial_activity",
- "interest_description": item.get_text(strip=True)[:500], # Truncate
+ "interest_description": item.get_text(strip=True)[
+ :500
+ ], # Truncate
  "region": "eu",
  "extraction_method": "mep_profile_scraping",
- "requires_manual_review": True
- }
+ "requires_manual_review": True,
+ },
  )
  disclosures.append(disclosure)
-
+
  except Exception as e:
  logger.warning(f"Error scraping MEP profile {mep_url}: {e}")
-
+
  return disclosures

  def _extract_company_name(self, text: str) -> str:
  """Extract company/organization name from interest description"""
  # Simple heuristic to extract potential company names
  words = text.split()
-
+
  # Look for capitalized sequences that might be company names
  potential_names = []
  current_name = []
-
+
  for word in words:
  if word[0].isupper() and len(word) > 2:
  current_name.append(word)
@@ -780,10 +878,10 @@ class EUParliamentScraper(BaseScraper):
  if current_name and len(current_name) <= 4: # Reasonable company name length
  potential_names.append(" ".join(current_name))
  current_name = []
-
+
  if current_name and len(current_name) <= 4:
  potential_names.append(" ".join(current_name))
-
+
  # Return the first reasonable candidate or default
  return potential_names[0] if potential_names else "Financial Interest"
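_extract_company_name keeps short runs of capitalized words from the declaration text as candidate organization names and falls back to a generic label. The same heuristic in isolation (illustrative standalone name, logic as in the diff):

    def extract_company_name(text: str) -> str:
        """Return the first run of up to four capitalized words, else a default."""
        potential_names, current_name = [], []
        for word in text.split():
            if word[0].isupper() and len(word) > 2:
                current_name.append(word)
            else:
                if current_name and len(current_name) <= 4:
                    potential_names.append(" ".join(current_name))
                current_name = []
        if current_name and len(current_name) <= 4:
            potential_names.append(" ".join(current_name))
        return potential_names[0] if potential_names else "Financial Interest"

    # extract_company_name("member of the board of Deutsche Bank AG") -> "Deutsche Bank"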
@@ -858,6 +956,7 @@ class PoliticianMatcher:
  # Import specialized scrapers after base classes are defined
  try:
  from .scrapers_uk import UKParliamentScraper, run_uk_parliament_collection
+
  UK_SCRAPER_AVAILABLE = True
  except Exception as e:
  logger.debug(f"UK scraper import failed: {e}")
@@ -867,6 +966,7 @@ except Exception as e:

  try:
  from .scrapers_california import CaliforniaNetFileScraper, run_california_collection
+
  CALIFORNIA_SCRAPER_AVAILABLE = True
  except Exception as e:
  logger.debug(f"California scraper import failed: {e}")
@@ -876,6 +976,7 @@ except Exception as e:

  try:
  from .scrapers_eu import EUMemberStatesScraper, run_eu_member_states_collection
+
  EU_MEMBER_STATES_SCRAPER_AVAILABLE = True
  except Exception as e:
  logger.debug(f"EU member states scraper import failed: {e}")
@@ -885,6 +986,7 @@ except Exception as e:

  try:
  from .scrapers_us_states import USStatesScraper, run_us_states_collection
+
  US_STATES_SCRAPER_AVAILABLE = True
  except Exception as e:
  logger.debug(f"US states scraper import failed: {e}")
@@ -899,7 +1001,7 @@ async def run_uk_parliament_workflow(config: ScrapingConfig) -> List[TradingDisc
  if not UK_SCRAPER_AVAILABLE:
  logger.warning("UK Parliament scraper not available")
  return []
-
+
  logger.info("Starting UK Parliament financial interests collection")
  try:
  disclosures = await run_uk_parliament_collection(config)
@@ -915,7 +1017,7 @@ async def run_california_workflow(config: ScrapingConfig) -> List[TradingDisclos
  if not CALIFORNIA_SCRAPER_AVAILABLE:
  logger.warning("California scraper not available")
  return []
-
+
  logger.info("Starting California financial disclosures collection")
  try:
  disclosures = await run_california_collection(config)
@@ -931,7 +1033,7 @@ async def run_eu_member_states_workflow(config: ScrapingConfig) -> List[TradingD
  if not EU_MEMBER_STATES_SCRAPER_AVAILABLE:
  logger.warning("EU member states scraper not available")
  return []
-
+
  logger.info("Starting EU member states financial disclosures collection")
  try:
  disclosures = await run_eu_member_states_collection(config)
@@ -947,7 +1049,7 @@ async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosu
  if not US_STATES_SCRAPER_AVAILABLE:
  logger.warning("US states scraper not available")
  return []
-
+
  logger.info("Starting US states financial disclosures collection")
  try:
  disclosures = await run_us_states_collection(config)
@@ -960,7 +1062,13 @@ async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosu

  # Export the new workflow function
  __all__ = [
- 'BaseScraper', 'CongressTradingScraper', 'QuiverQuantScraper', 'EUParliamentScraper',
- 'PoliticianMatcher', 'run_uk_parliament_workflow', 'run_california_workflow',
- 'run_eu_member_states_workflow', 'run_us_states_workflow'
+ "BaseScraper",
+ "CongressTradingScraper",
+ "QuiverQuantScraper",
+ "EUParliamentScraper",
+ "PoliticianMatcher",
+ "run_uk_parliament_workflow",
+ "run_california_workflow",
+ "run_eu_member_states_workflow",
+ "run_us_states_workflow",
  ]
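For orientation, the exported run_*_workflow helpers share the signature shown in the hunk headers above: they take a ScrapingConfig and return a list of TradingDisclosure objects, and each returns an empty list when its regional scraper failed to import. A hedged usage sketch (it assumes ScrapingConfig can be constructed with defaults, which the diff does not show):

    import asyncio

    from mcli.workflow.politician_trading.config import ScrapingConfig
    from mcli.workflow.politician_trading.scrapers import run_uk_parliament_workflow

    async def main() -> None:
        config = ScrapingConfig()  # assumption: default construction is valid
        disclosures = await run_uk_parliament_workflow(config)
        print(f"Collected {len(disclosures)} UK disclosures")

    if __name__ == "__main__":
        asyncio.run(main())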