mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/workflow/politician_trading/scrapers.py
@@ -0,0 +1,966 @@
+ """
+ Web scrapers for politician trading data
+ """
+
+ import asyncio
+ import logging
+ import re
+ from datetime import datetime, timedelta
+ from decimal import Decimal
+ from typing import List, Dict, Any, Optional, Tuple
+ from urllib.parse import urljoin, urlparse
+
+ import aiohttp
+ from bs4 import BeautifulSoup
+
+ from .models import Politician, TradingDisclosure, TransactionType, PoliticianRole
+ from .config import ScrapingConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseScraper:
+     """Base class for all scrapers"""
+
+     def __init__(self, config: ScrapingConfig):
+         self.config = config
+         self.session: Optional[aiohttp.ClientSession] = None
+
+     async def __aenter__(self):
+         """Async context manager entry"""
+         self.session = aiohttp.ClientSession(
+             timeout=aiohttp.ClientTimeout(total=self.config.timeout),
+             headers={"User-Agent": self.config.user_agent},
+         )
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit"""
+         if self.session:
+             await self.session.close()
+
+     async def fetch_page(self, url: str, **kwargs) -> Optional[str]:
+         """Fetch a web page with error handling and rate limiting"""
+         for attempt in range(self.config.max_retries):
+             try:
+                 await asyncio.sleep(self.config.request_delay)
+
+                 async with self.session.get(url, **kwargs) as response:
+                     if response.status == 200:
+                         return await response.text()
+                     else:
+                         logger.warning(f"HTTP {response.status} for {url}")
+                         if response.status == 429:  # Rate limited
+                             await asyncio.sleep(self.config.request_delay * 2)
+
+             except Exception as e:
+                 logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
+                 if attempt < self.config.max_retries - 1:
+                     await asyncio.sleep(self.config.request_delay * (attempt + 1))
+
+         return None
+
+     def parse_amount_range(
+         self, amount_text: str
+     ) -> Tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
+         """Parse amount text into range values"""
+         if not amount_text:
+             return None, None, None
+
+         amount_text = amount_text.replace(",", "").replace("$", "").strip()
+
+         # Look for range patterns like "$1,001 - $15,000"
+         range_match = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_text)
+         if range_match:
+             min_val = Decimal(range_match.group(1))
+             max_val = Decimal(range_match.group(2))
+             return min_val, max_val, None
+
+         # Look for exact amounts
+         exact_match = re.search(r"(\d+(?:\.\d{2})?)", amount_text)
+         if exact_match:
+             exact_val = Decimal(exact_match.group(1))
+             return None, None, exact_val
+
+         # Handle standard ranges
+         range_mappings = {
+             "$1,001 - $15,000": (Decimal("1001"), Decimal("15000")),
+             "$15,001 - $50,000": (Decimal("15001"), Decimal("50000")),
+             "$50,001 - $100,000": (Decimal("50001"), Decimal("100000")),
+             "$100,001 - $250,000": (Decimal("100001"), Decimal("250000")),
+             "$250,001 - $500,000": (Decimal("250001"), Decimal("500000")),
+             "$500,001 - $1,000,000": (Decimal("500001"), Decimal("1000000")),
+             "$1,000,001 - $5,000,000": (Decimal("1000001"), Decimal("5000000")),
+             "$5,000,001 - $25,000,000": (Decimal("5000001"), Decimal("25000000")),
+             "$25,000,001 - $50,000,000": (Decimal("25000001"), Decimal("50000000")),
+             "Over $50,000,000": (Decimal("50000001"), None),
+         }
+
+         for pattern, (min_val, max_val) in range_mappings.items():
+             if pattern.lower() in amount_text.lower():
+                 return min_val, max_val, None
+
+         return None, None, None
+
+
+ class CongressTradingScraper(BaseScraper):
107
+ """Scraper for US Congress trading data"""
108
+
109
+ async def scrape_house_disclosures(self) -> List[TradingDisclosure]:
110
+ """Scrape House financial disclosures from the official database"""
111
+ disclosures = []
112
+ base_url = "https://disclosures-clerk.house.gov"
113
+ search_url = f"{base_url}/FinancialDisclosure"
114
+
115
+ try:
116
+ logger.info("Starting House disclosures scrape from official database")
117
+
118
+ async with aiohttp.ClientSession(
119
+ timeout=aiohttp.ClientTimeout(total=self.config.timeout),
120
+ headers={'User-Agent': self.config.user_agent}
121
+ ) as session:
122
+
123
+ # Get the ViewSearch form page
124
+ view_search_url = f"{base_url}/FinancialDisclosure/ViewSearch"
125
+ async with session.get(view_search_url) as response:
126
+ if response.status == 200:
127
+ html = await response.text()
128
+ logger.info("Successfully accessed House financial disclosure search form")
129
+
130
+ # Extract form data for ASPX
131
+ soup = BeautifulSoup(html, 'html.parser')
132
+
133
+ # Look for common ASPX form fields
134
+ form_fields = {}
135
+ for field_name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION', '__REQUESTVERIFICATIONTOKEN']:
136
+ field = soup.find('input', {'name': field_name})
137
+ if field and field.get('value'):
138
+ form_fields[field_name] = field['value']
139
+
140
+ if '__VIEWSTATE' in form_fields:
141
+ logger.info("Found required ASPX form fields")
142
+
143
+ # Search for recent disclosures - try different form field names
144
+ current_year = str(datetime.now().year)
145
+
146
+ # Search for common politician last names to get real data
147
+ common_names = ['Smith', 'Johnson', 'Brown', 'Davis', 'Wilson', 'Miller', 'Garcia']
148
+
149
+ # Try different form patterns with actual names
150
+ possible_form_data_sets = []
151
+
152
+ for name in common_names:
153
+ possible_form_data_sets.extend([
154
+ {
155
+ **form_fields,
156
+ 'ctl00$MainContent$txtLastName': name,
157
+ 'ctl00$MainContent$ddlFilingYear': current_year,
158
+ 'ctl00$MainContent$btnSearch': 'Search'
159
+ },
160
+ {
161
+ **form_fields,
162
+ 'ctl00$ContentPlaceHolder1$txtLastName': name,
163
+ 'ctl00$ContentPlaceHolder1$ddlFilingYear': current_year,
164
+ 'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
165
+ },
166
+ {
167
+ **form_fields,
168
+ 'LastName': name,
169
+ 'FilingYear': current_year,
170
+ 'Search': 'Search'
171
+ }
172
+ ])
173
+
174
+ # Also try without names (all results)
175
+ possible_form_data_sets.extend([
176
+ {
177
+ **form_fields,
178
+ 'ctl00$MainContent$ddlFilingYear': current_year,
179
+ 'ctl00$MainContent$btnSearch': 'Search'
180
+ },
181
+ {
182
+ **form_fields,
183
+ 'ctl00$ContentPlaceHolder1$ddlFilingYear': current_year,
184
+ 'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
185
+ }
186
+ ])
187
+
188
+ # Try each form configuration
189
+ for i, form_data in enumerate(possible_form_data_sets):
190
+ try:
191
+ logger.info(f"Attempting search with form configuration {i+1}")
192
+ async with session.post(view_search_url, data=form_data) as search_response:
193
+ if search_response.status == 200:
194
+ results_html = await search_response.text()
195
+ if 'search results' in results_html.lower() or 'disclosure' in results_html.lower():
196
+ disclosures = await self._parse_house_results(results_html, base_url)
197
+ logger.info(f"Successfully found {len(disclosures)} House disclosures")
198
+ break
199
+ else:
200
+ logger.debug(f"Form config {i+1} didn't return results")
201
+ else:
202
+ logger.debug(f"Form config {i+1} failed with status {search_response.status}")
203
+ except Exception as e:
204
+ logger.debug(f"Form config {i+1} failed: {e}")
205
+ else:
206
+ logger.warning("All form configurations failed, using basic page scraping")
207
+ # Fall back to scraping any existing disclosure links on the page
208
+ disclosures = await self._parse_house_results(html, base_url)
209
+ else:
210
+ logger.warning("Could not find required ASPX form fields, using basic page scraping")
211
+ # Fall back to parsing any existing links
212
+ disclosures = await self._parse_house_results(html, base_url)
213
+ else:
214
+ logger.warning(f"Failed to access House disclosure site: {response.status}")
215
+
216
+ # Rate limiting
217
+ await asyncio.sleep(self.config.request_delay)
218
+
219
+ except Exception as e:
220
+ logger.error(f"House disclosures scrape failed: {e}")
221
+ # Return empty list on error rather than sample data
222
+
223
+ return disclosures
224
+
+     async def scrape_senate_disclosures(self) -> List[TradingDisclosure]:
+         """Scrape Senate financial disclosures from the official EFD database"""
+         disclosures = []
+         base_url = "https://efdsearch.senate.gov"
+         search_url = f"{base_url}/search/"
+
+         try:
+             logger.info("Starting Senate disclosures scrape from EFD database")
+
+             async with aiohttp.ClientSession(
+                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
+                 headers={'User-Agent': self.config.user_agent}
+             ) as session:
+
+                 # Search for recent periodic transaction reports (PTRs)
+                 search_params = {
+                     'report_type': '11',  # Periodic Transaction Report
+                     'submitted_start_date': (datetime.now() - timedelta(days=90)).strftime('%m/%d/%Y'),
+                     'submitted_end_date': datetime.now().strftime('%m/%d/%Y')
+                 }
+
+                 async with session.get(search_url, params=search_params) as response:
+                     if response.status == 200:
+                         html = await response.text()
+                         disclosures = await self._parse_senate_results(html, base_url)
+                         logger.info(f"Found {len(disclosures)} Senate disclosures")
+                     else:
+                         logger.warning(f"Senate search failed with status {response.status}")
+
+             # Rate limiting
+             await asyncio.sleep(self.config.request_delay)
+
+         except Exception as e:
+             logger.error(f"Senate disclosures scrape failed: {e}")
+             # Return empty list on error rather than sample data
+
+         return disclosures
+
+     async def _parse_house_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
+         """Parse House disclosure search results"""
+         disclosures = []
+
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+
+             # Look for disclosure result rows - try multiple selectors
+             result_rows = (soup.find_all('tr', class_='disclosure-row') or
+                            soup.select('tr[id*="GridView"]') or
+                            soup.select('table tr') or
+                            soup.find_all('tr'))
+
+             logger.info(f"Found {len(result_rows)} potential result rows")
+
+             for row in result_rows[:20]:  # Limit to 20 most recent
+                 cells = row.find_all('td')
+                 if len(cells) >= 3:  # At least 3 columns
+                     # Extract text from each cell to identify the structure
+                     cell_texts = [cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)]
+
+                     if not cell_texts:
+                         continue
+
+                     # Try to identify which cell contains the politician name
+                     # Names usually contain letters and may have titles like "Rep.", "Hon."
+                     politician_name = ""
+
+                     for text in cell_texts:
+                         # Look for text that looks like a person's name
+                         if (len(text) > 3 and
+                                 any(c.isalpha() for c in text) and
+                                 not text.isdigit() and
+                                 not text.startswith('20') and  # Not a year
+                                 'pdf' not in text.lower() and
+                                 'view' not in text.lower()):
+
+                             # Clean up potential name
+                             clean_name = text.replace('Hon.', '').replace('Rep.', '').replace('Sen.', '').strip()
+                             if len(clean_name) > 3 and ' ' in clean_name:  # Likely full name
+                                 politician_name = clean_name
+                                 break
+
+                     if not politician_name:
+                         politician_name = cell_texts[0]  # Fallback to first cell
+
+                     # Extract other information
+                     filing_year = next((text for text in cell_texts if text.isdigit() and len(text) == 4 and text.startswith('20')), "")
+                     filing_type = next((text for text in cell_texts if 'periodic' in text.lower() or 'annual' in text.lower()), "")
+
+                     # Look for PDF link
+                     pdf_link = row.find('a', href=True)
+                     if pdf_link:
+                         pdf_url = urljoin(base_url, pdf_link['href'])
+
+                         # Create basic disclosure entry
+                         # Note: Actual transaction details would require PDF parsing
+                         disclosure = TradingDisclosure(
+                             politician_id="",  # To be filled by matcher
+                             transaction_date=datetime.now() - timedelta(days=30),  # Estimate
+                             disclosure_date=datetime.now() - timedelta(days=15),  # Estimate
+                             transaction_type=TransactionType.PURCHASE,  # Default
+                             asset_name="Unknown Asset",  # Would need PDF parsing
+                             asset_type="stock",
+                             amount_range_min=Decimal("1001"),
+                             amount_range_max=Decimal("15000"),
+                             source_url=pdf_url,
+                             raw_data={
+                                 "politician_name": politician_name,
+                                 "filing_year": filing_year,
+                                 "filing_type": filing_type,
+                                 "requires_pdf_parsing": True,
+                                 "extraction_method": "house_search_results"
+                             }
+                         )
+                         disclosures.append(disclosure)
+
+         except Exception as e:
+             logger.error(f"Error parsing House results: {e}")
+
+         return disclosures
+
+     async def _parse_senate_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
+         """Parse Senate EFD search results"""
+         disclosures = []
+
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+
+             # Look for search result rows
+             result_rows = soup.find_all('tr', class_='searchresult') or soup.select('tbody tr')
+
+             for row in result_rows[:20]:  # Limit to 20 most recent
+                 cells = row.find_all('td')
+                 if len(cells) >= 4:
+                     # Extract information
+                     name = cells[0].get_text(strip=True) if cells[0] else ""
+                     report_type = cells[1].get_text(strip=True) if cells[1] else ""
+                     filing_date = cells[2].get_text(strip=True) if cells[2] else ""
+
+                     # Look for report link
+                     report_link = row.find('a', href=True)
+                     if report_link and 'ptr' in report_type.lower():  # Periodic Transaction Report
+                         report_url = urljoin(base_url, report_link['href'])
+
+                         # Create disclosure entry
+                         # Note: Actual transaction details would require report parsing
+                         disclosure = TradingDisclosure(
+                             politician_id="",  # To be filled by matcher
+                             transaction_date=datetime.now() - timedelta(days=30),  # Estimate
+                             disclosure_date=self._parse_date(filing_date) or datetime.now(),
+                             transaction_type=TransactionType.PURCHASE,  # Default
+                             asset_name="Unknown Asset",  # Would need report parsing
+                             asset_type="stock",
+                             amount_range_min=Decimal("1001"),
+                             amount_range_max=Decimal("50000"),
+                             source_url=report_url,
+                             raw_data={
+                                 "politician_name": name,
+                                 "report_type": report_type,
+                                 "filing_date": filing_date,
+                                 "requires_report_parsing": True
+                             }
+                         )
+                         disclosures.append(disclosure)
+
+         except Exception as e:
+             logger.error(f"Error parsing Senate results: {e}")
+
+         return disclosures
+
+     def _parse_date(self, date_str: str) -> Optional[datetime]:
+         """Parse various date formats from disclosure sites"""
+         if not date_str:
+             return None
+
+         date_formats = [
+             '%m/%d/%Y',
+             '%Y-%m-%d',
+             '%m-%d-%Y',
+             '%B %d, %Y'
+         ]
+
+         for fmt in date_formats:
+             try:
+                 return datetime.strptime(date_str.strip(), fmt)
+             except ValueError:
+                 continue
+
+         logger.warning(f"Could not parse date: {date_str}")
+         return None
+
+
+ class QuiverQuantScraper(BaseScraper):
+     """Scraper for QuiverQuant congress trading data as a backup source"""
+
+     async def scrape_congress_trades(self) -> List[Dict[str, Any]]:
+         """Scrape congress trading data from QuiverQuant"""
+         trades = []
+
+         try:
+             # This would implement scraping from QuiverQuant's public data
+             # Note: Respect their robots.txt and terms of service
+             logger.info("Starting QuiverQuant scrape")
+
+             url = "https://www.quiverquant.com/congresstrading/"
+             html = await self.fetch_page(url)
+
+             if html:
+                 soup = BeautifulSoup(html, "html.parser")
+
+                 # Parse the trading data table (simplified example)
+                 # In reality, this might require handling JavaScript rendering
+                 trade_rows = soup.select("table tr")
+
+                 for row in trade_rows[1:10]:  # Skip header, limit to 10 for example
+                     cells = row.select("td")
+                     if len(cells) >= 4:
+                         # Extract cell contents and try to identify the correct fields
+                         cell_texts = [cell.get_text(strip=True) for cell in cells]
+
+                         # Try to identify which cell contains what data
+                         politician_name = cell_texts[0] if len(cell_texts) > 0 else ""
+
+                         # Look for date-like patterns (YYYY-MM-DD, MM/DD/YYYY, etc.)
+                         transaction_date = ""
+                         ticker = ""
+                         transaction_type = ""
+                         amount = ""
+
+                         for i, text in enumerate(cell_texts[1:], 1):  # Skip first cell (politician name)
+                             # Check if this looks like a date
+                             if self._looks_like_date(text):
+                                 transaction_date = text
+                             # Check if this looks like a ticker (all caps, short)
+                             elif text.isupper() and len(text) <= 5 and text.isalpha():
+                                 ticker = text
+                             # Check if this contains transaction type keywords
+                             elif any(word in text.lower() for word in ['purchase', 'sale', 'buy', 'sell']):
+                                 # Split transaction type and amount if combined
+                                 if '$' in text:
+                                     # Split on $ to separate transaction type from amount
+                                     parts = text.split('$', 1)
+                                     transaction_type = parts[0].strip()
+                                     amount = '$' + parts[1] if len(parts) > 1 else ""
+                                 else:
+                                     transaction_type = text
+                             # Check if this looks like an amount (contains $ or numbers with ,)
+                             elif '$' in text or (',' in text and any(c.isdigit() for c in text)):
+                                 amount = text
+
+                         # Only create trade data if we have essential fields
+                         if politician_name and (transaction_date or ticker):
+                             trade_data = {
+                                 "politician_name": politician_name,
+                                 "transaction_date": transaction_date,
+                                 "ticker": ticker,
+                                 "transaction_type": transaction_type,
+                                 "amount": amount,
+                                 "source": "quiverquant",
+                             }
+                             trades.append(trade_data)
+
+         except Exception as e:
+             logger.error(f"QuiverQuant scrape failed: {e}")
+
+         return trades
+
+     def _looks_like_date(self, text: str) -> bool:
+         """Check if a string looks like a date"""
+         if not text or len(text) < 8:
+             return False
+
+         # Common date patterns
+         date_patterns = [
+             r'\d{4}-\d{1,2}-\d{1,2}',      # YYYY-MM-DD
+             r'\d{1,2}/\d{1,2}/\d{4}',      # MM/DD/YYYY
+             r'\d{1,2}-\d{1,2}-\d{4}',      # MM-DD-YYYY
+             r'\w{3}\s+\d{1,2},?\s+\d{4}',  # Month DD, YYYY
+         ]
+
+         import re
+         for pattern in date_patterns:
+             if re.search(pattern, text):
+                 return True
+         return False
+
+     def parse_quiver_trade(self, trade_data: Dict[str, Any]) -> Optional[TradingDisclosure]:
+         """Parse QuiverQuant trade data into TradingDisclosure"""
+         try:
+             # Debug: Log the trade data structure
+             logger.debug(f"Parsing QuiverQuant trade data: {trade_data}")
+             # Parse transaction type
+             transaction_type_map = {
+                 "purchase": TransactionType.PURCHASE,
+                 "sale": TransactionType.SALE,
+                 "buy": TransactionType.PURCHASE,
+                 "sell": TransactionType.SALE,
+             }
+
+             transaction_type = transaction_type_map.get(
+                 trade_data.get("transaction_type", "").lower(), TransactionType.PURCHASE
+             )
+
+             # Parse date
+             date_str = trade_data.get("transaction_date", "")
+             if not date_str or date_str.strip() == "" or not self._looks_like_date(date_str):
+                 # Use estimated date if no valid date found
+                 transaction_date = datetime.now() - timedelta(days=30)  # Estimate 30 days ago
+             else:
+                 # Try multiple date formats
+                 try:
+                     # Standard format
+                     transaction_date = datetime.strptime(date_str, "%Y-%m-%d")
+                 except ValueError:
+                     try:
+                         # Alternative format
+                         transaction_date = datetime.strptime(date_str, "%m/%d/%Y")
+                     except ValueError:
+                         try:
+                             # Try MM-DD-YYYY
+                             transaction_date = datetime.strptime(date_str, "%m-%d-%Y")
+                         except ValueError:
+                             logger.warning(f"Could not parse date '{date_str}', using estimated date")
+                             transaction_date = datetime.now() - timedelta(days=30)
+
+             # Parse amount
+             amount_min, amount_max, amount_exact = self.parse_amount_range(
+                 trade_data.get("amount", "")
+             )
+
+             disclosure = TradingDisclosure(
+                 politician_id="",  # Will be filled after politician matching
+                 transaction_date=transaction_date,
+                 disclosure_date=datetime.now(),  # QuiverQuant aggregation date
+                 transaction_type=transaction_type,
+                 asset_name=trade_data.get("ticker", ""),
+                 asset_ticker=trade_data.get("ticker", ""),
+                 asset_type="stock",
+                 amount_range_min=amount_min,
+                 amount_range_max=amount_max,
+                 amount_exact=amount_exact,
+                 source_url="https://www.quiverquant.com/congresstrading/",
+                 raw_data=trade_data,
+             )
+
+             return disclosure
+
+         except Exception as e:
+             logger.error(f"Failed to parse QuiverQuant trade: {e}")
+             return None
+
+
+ class EUParliamentScraper(BaseScraper):
+     """Scraper for EU Parliament member declarations"""
+
+     async def scrape_mep_declarations(self) -> List[TradingDisclosure]:
+         """Scrape MEP financial declarations from official EU Parliament site"""
+         disclosures = []
+
+         try:
+             logger.info("Starting EU Parliament MEP declarations scrape")
+             base_url = "https://www.europarl.europa.eu"
+
+             async with aiohttp.ClientSession(
+                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
+                 headers={'User-Agent': self.config.user_agent}
+             ) as session:
+
+                 # Get list of current MEPs
+                 mep_list_url = f"{base_url}/meps/en/full-list/all"
+
+                 async with session.get(mep_list_url) as response:
+                     if response.status == 200:
+                         html = await response.text()
+                         mep_data = await self._extract_mep_urls(html, base_url)
+                         logger.info(f"Found {len(mep_data)} MEP profiles to check")
+
+                         # Check declarations for a subset of MEPs (to avoid overwhelming the server)
+                         for i, mep_info in enumerate(mep_data[:50]):  # Limit to 50 MEPs
+                             try:
+                                 mep_disclosures = await self._scrape_mep_profile(session, mep_info['url'], mep_info)
+                                 disclosures.extend(mep_disclosures)
+
+                                 # Rate limiting - EU Parliament is more sensitive
+                                 await asyncio.sleep(self.config.request_delay * 2)
+
+                                 if i > 0 and i % 10 == 0:
+                                     logger.info(f"Processed {i} MEP profiles")
+
+                             except Exception as e:
+                                 logger.warning(f"Failed to process MEP profile {mep_info['url']}: {e}")
+                                 continue
+                     else:
+                         logger.warning(f"Failed to access MEP list: {response.status}")
+
+             logger.info(f"Collected {len(disclosures)} EU Parliament disclosures")
+
+         except Exception as e:
+             logger.error(f"EU Parliament scrape failed: {e}")
+
+         return disclosures
+
+     async def _extract_mep_urls(self, html: str, base_url: str) -> List[Dict[str, str]]:
+         """Extract MEP profile URLs and names from the MEP list page"""
+         mep_data = []
+
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+
+             # Look for MEP profile links - they usually contain both name and link
+             mep_links = soup.find_all('a', href=True)
+
+             seen_urls = set()
+
+             for link in mep_links:
+                 href = link.get('href', '')
+                 if '/meps/en/' in href and '/home' in href:
+                     full_url = urljoin(base_url, href)
+
+                     if full_url not in seen_urls:
+                         # Extract MEP name from link text or nearby elements
+                         mep_name = ""
+
+                         # Try to get name from link text
+                         link_text = link.get_text(strip=True)
+                         if link_text and len(link_text) > 3 and not link_text.lower().startswith('http'):
+                             mep_name = link_text
+
+                         # If no name in link, look in parent elements
+                         if not mep_name:
+                             parent = link.parent
+                             if parent:
+                                 # Look for text that looks like a name
+                                 for text_node in parent.stripped_strings:
+                                     if (len(text_node) > 3 and
+                                             ' ' in text_node and
+                                             not text_node.startswith('http') and
+                                             not text_node.isdigit()):
+                                         mep_name = text_node
+                                         break
+
+                         # Extract country/party info if available
+                         country = ""
+                         party = ""
+
+                         # Look for country and party info near the link
+                         container = link.find_parent(['div', 'article', 'section'])
+                         if container:
+                             text_elements = list(container.stripped_strings)
+                             for i, text in enumerate(text_elements):
+                                 if text == mep_name and i < len(text_elements) - 2:
+                                     # Country and party usually come after name
+                                     country = text_elements[i + 1] if i + 1 < len(text_elements) else ""
+                                     party = text_elements[i + 2] if i + 2 < len(text_elements) else ""
+
+                         if mep_name:  # Only add if we found a name
+                             mep_data.append({
+                                 'url': full_url,
+                                 'name': mep_name,
+                                 'country': country,
+                                 'party': party
+                             })
+                             seen_urls.add(full_url)
+
+                 # Limit to prevent overwhelming the servers
+                 if len(mep_data) >= 50:
+                     break
+
+         except Exception as e:
+             logger.error(f"Error extracting MEP data: {e}")
+
+         return mep_data
+
+     async def _scrape_mep_profile(self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None) -> List[TradingDisclosure]:
+         """Scrape financial interests from an individual MEP profile"""
+         disclosures = []
+
+         try:
+             async with session.get(mep_url) as response:
+                 if response.status == 200:
+                     html = await response.text()
+                     soup = BeautifulSoup(html, 'html.parser')
+
+                     # Use extracted MEP name from list, or try to extract from profile
+                     if mep_info and mep_info.get('name'):
+                         mep_name = mep_info['name']
+                         mep_country = mep_info.get('country', '')
+                         mep_party = mep_info.get('party', '')
+                     else:
+                         # Fallback: extract from profile page
+                         name_element = soup.find('h1', class_='ep-header-title')
+                         mep_name = name_element.get_text(strip=True) if name_element else "Unknown MEP"
+                         mep_country = ""
+                         mep_party = ""
+
+                     # Look for financial interests section
+                     # EU Parliament declarations are typically in a specific section
+                     interests_section = soup.find('div', id='financial-interests') or \
+                                         soup.find('section', class_='ep-a-section') or \
+                                         soup.find('div', class_='ep-m-content-block')
+
+                     if interests_section:
+                         # Parse financial interests
+                         # Note: EU declarations focus more on activities and interests than specific trades
+                         interest_items = interests_section.find_all(['p', 'li', 'div'], recursive=True)
+
+                         for item in interest_items:
+                             item_text = item.get_text(strip=True).lower()
+
+                             # Look for financial keywords
+                             if any(keyword in item_text for keyword in [
+                                 'shareholding', 'investment', 'director', 'board',
+                                 'financial interest', 'remuneration', 'consulting'
+                             ]):
+                                 # Create disclosure for detected financial interest
+                                 disclosure = TradingDisclosure(
+                                     politician_id="",  # To be filled by matcher
+                                     transaction_date=datetime.now() - timedelta(days=90),  # Estimate
+                                     disclosure_date=datetime.now() - timedelta(days=60),  # Estimate
+                                     transaction_type=TransactionType.PURCHASE,  # Default for interests
+                                     asset_name=self._extract_company_name(item.get_text(strip=True)),  # Original casing, so capitalized names survive
+                                     asset_type="interest",
+                                     amount_range_min=Decimal("0"),  # EU doesn't always specify amounts
+                                     amount_range_max=Decimal("0"),
+                                     source_url=mep_url,
+                                     raw_data={
+                                         "politician_name": mep_name,
+                                         "country": mep_country,
+                                         "party": mep_party,
+                                         "interest_type": "financial_activity",
+                                         "interest_description": item.get_text(strip=True)[:500],  # Truncate
+                                         "region": "eu",
+                                         "extraction_method": "mep_profile_scraping",
+                                         "requires_manual_review": True
+                                     }
+                                 )
+                                 disclosures.append(disclosure)
+
+         except Exception as e:
+             logger.warning(f"Error scraping MEP profile {mep_url}: {e}")
+
+         return disclosures
+
+     def _extract_company_name(self, text: str) -> str:
+         """Extract company/organization name from interest description"""
+         # Simple heuristic to extract potential company names
+         words = text.split()
+
+         # Look for capitalized sequences that might be company names
+         potential_names = []
+         current_name = []
+
+         for word in words:
+             if word[0].isupper() and len(word) > 2:
+                 current_name.append(word)
+             else:
+                 if current_name and len(current_name) <= 4:  # Reasonable company name length
+                     potential_names.append(" ".join(current_name))
+                 current_name = []
+
+         if current_name and len(current_name) <= 4:
+             potential_names.append(" ".join(current_name))
+
+         # Return the first reasonable candidate or default
+         return potential_names[0] if potential_names else "Financial Interest"
+
+
+ class PoliticianMatcher:
+     """Matches scraped names to politician records"""
+
+     def __init__(self, politicians: List[Politician]):
+         self.politicians = politicians
+         self._build_lookup()
+
+     def _build_lookup(self):
+         """Build lookup dictionaries for fast matching"""
+         self.name_lookup = {}
+         self.bioguide_lookup = {}
+
+         for politician in self.politicians:
+             # Full name variations
+             full_name = politician.full_name.lower()
+             self.name_lookup[full_name] = politician
+
+             # Last, First format
+             if politician.first_name and politician.last_name:
+                 last_first = f"{politician.last_name.lower()}, {politician.first_name.lower()}"
+                 self.name_lookup[last_first] = politician
+
+                 # First Last format
+                 first_last = f"{politician.first_name.lower()} {politician.last_name.lower()}"
+                 self.name_lookup[first_last] = politician
+
+             # Bioguide ID lookup
+             if politician.bioguide_id:
+                 self.bioguide_lookup[politician.bioguide_id] = politician
+
+     def find_politician(self, name: str, bioguide_id: str = None) -> Optional[Politician]:
+         """Find politician by name or bioguide ID"""
+         if bioguide_id and bioguide_id in self.bioguide_lookup:
+             return self.bioguide_lookup[bioguide_id]
+
+         if name:
+             name_clean = name.lower().strip()
+
+             # Direct match
+             if name_clean in self.name_lookup:
+                 return self.name_lookup[name_clean]
+
+             # Fuzzy matching (simplified)
+             for lookup_name, politician in self.name_lookup.items():
+                 if self._names_similar(name_clean, lookup_name):
+                     return politician
+
+         return None
+
+     def _names_similar(self, name1: str, name2: str) -> bool:
+         """Simple similarity check for names"""
+         # Remove common prefixes/suffixes
+         prefixes = ["rep.", "sen.", "senator", "representative", "mr.", "mrs.", "ms."]
+         suffixes = ["jr.", "sr.", "ii", "iii", "iv"]
+
+         for prefix in prefixes:
+             name1 = name1.replace(prefix, "").strip()
+             name2 = name2.replace(prefix, "").strip()
+
+         for suffix in suffixes:
+             name1 = name1.replace(suffix, "").strip()
+             name2 = name2.replace(suffix, "").strip()
+
+         # Check if one name contains the other
+         return name1 in name2 or name2 in name1
+
+
+ # Import specialized scrapers after base classes are defined
+ try:
+     from .scrapers_uk import UKParliamentScraper, run_uk_parliament_collection
+     UK_SCRAPER_AVAILABLE = True
+ except Exception as e:
+     logger.debug(f"UK scraper import failed: {e}")
+     UKParliamentScraper = None
+     run_uk_parliament_collection = None
+     UK_SCRAPER_AVAILABLE = False
+
+ try:
+     from .scrapers_california import CaliforniaNetFileScraper, run_california_collection
+     CALIFORNIA_SCRAPER_AVAILABLE = True
+ except Exception as e:
+     logger.debug(f"California scraper import failed: {e}")
+     CaliforniaNetFileScraper = None
+     run_california_collection = None
+     CALIFORNIA_SCRAPER_AVAILABLE = False
+
+ try:
+     from .scrapers_eu import EUMemberStatesScraper, run_eu_member_states_collection
+     EU_MEMBER_STATES_SCRAPER_AVAILABLE = True
+ except Exception as e:
+     logger.debug(f"EU member states scraper import failed: {e}")
+     EUMemberStatesScraper = None
+     run_eu_member_states_collection = None
+     EU_MEMBER_STATES_SCRAPER_AVAILABLE = False
+
+ try:
+     from .scrapers_us_states import USStatesScraper, run_us_states_collection
+     US_STATES_SCRAPER_AVAILABLE = True
+ except Exception as e:
+     logger.debug(f"US states scraper import failed: {e}")
+     USStatesScraper = None
+     run_us_states_collection = None
+     US_STATES_SCRAPER_AVAILABLE = False
+
+
+ # Workflow functions using imported scrapers
+ async def run_uk_parliament_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
+     """Run UK Parliament data collection workflow"""
+     if not UK_SCRAPER_AVAILABLE:
+         logger.warning("UK Parliament scraper not available")
+         return []
+
+     logger.info("Starting UK Parliament financial interests collection")
+     try:
+         disclosures = await run_uk_parliament_collection(config)
+         logger.info(f"Successfully collected {len(disclosures)} UK Parliament disclosures")
+         return disclosures
+     except Exception as e:
+         logger.error(f"UK Parliament collection failed: {e}")
+         return []
+
+
+ async def run_california_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
+     """Run California NetFile and state disclosure collection workflow"""
+     if not CALIFORNIA_SCRAPER_AVAILABLE:
+         logger.warning("California scraper not available")
+         return []
+
+     logger.info("Starting California financial disclosures collection")
+     try:
+         disclosures = await run_california_collection(config)
+         logger.info(f"Successfully collected {len(disclosures)} California disclosures")
+         return disclosures
+     except Exception as e:
+         logger.error(f"California collection failed: {e}")
+         return []
+
+
+ async def run_eu_member_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
+     """Run EU member states financial disclosure collection workflow"""
+     if not EU_MEMBER_STATES_SCRAPER_AVAILABLE:
+         logger.warning("EU member states scraper not available")
+         return []
+
+     logger.info("Starting EU member states financial disclosures collection")
+     try:
+         disclosures = await run_eu_member_states_collection(config)
+         logger.info(f"Successfully collected {len(disclosures)} EU member states disclosures")
+         return disclosures
+     except Exception as e:
+         logger.error(f"EU member states collection failed: {e}")
+         return []
+
+
+ async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
+     """Run US states financial disclosure collection workflow"""
+     if not US_STATES_SCRAPER_AVAILABLE:
+         logger.warning("US states scraper not available")
+         return []
+
+     logger.info("Starting US states financial disclosures collection")
+     try:
+         disclosures = await run_us_states_collection(config)
+         logger.info(f"Successfully collected {len(disclosures)} US states disclosures")
+         return disclosures
+     except Exception as e:
+         logger.error(f"US states collection failed: {e}")
+         return []
+
+
+ # Export the new workflow function
+ __all__ = [
+     'BaseScraper', 'CongressTradingScraper', 'QuiverQuantScraper', 'EUParliamentScraper',
+     'PoliticianMatcher', 'run_uk_parliament_workflow', 'run_california_workflow',
+     'run_eu_member_states_workflow', 'run_us_states_workflow'
+ ]