mcli-framework 7.10.1__py3-none-any.whl → 7.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/commands_cmd.py +150 -58
- mcli/app/main.py +21 -27
- mcli/lib/custom_commands.py +62 -12
- mcli/lib/optional_deps.py +240 -0
- mcli/lib/paths.py +129 -5
- mcli/self/migrate_cmd.py +261 -0
- mcli/self/self_cmd.py +8 -0
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/__init__.py +16 -0
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/secrets/__init__.py +4 -0
- mcli/workflow/secrets/secrets_cmd.py +192 -0
- mcli/workflow/workflow.py +35 -5
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/METADATA +86 -55
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/RECORD +22 -34
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/__init__.py +0 -4
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/top_level.txt +0 -0

mcli/workflow/politician_trading/scrapers.py (removed in 7.11.0)
@@ -1,1074 +0,0 @@
-"""
-Web scrapers for politician trading data
-"""
-
-import asyncio
-import logging
-import re
-from datetime import datetime, timedelta
-from decimal import Decimal
-from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import urljoin, urlparse
-
-import aiohttp
-from bs4 import BeautifulSoup
-
-from .config import ScrapingConfig
-from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
-
-logger = logging.getLogger(__name__)
-
-
-class BaseScraper:
-    """Base class for all scrapers"""
-
-    def __init__(self, config: ScrapingConfig):
-        self.config = config
-        self.session: Optional[aiohttp.ClientSession] = None
-
-    async def __aenter__(self):
-        """Async context manager entry"""
-        self.session = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-            headers={"User-Agent": self.config.user_agent},
-        )
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit"""
-        if self.session:
-            await self.session.close()
-
-    async def fetch_page(self, url: str, **kwargs) -> Optional[str]:
-        """Fetch a web page with error handling and rate limiting"""
-        for attempt in range(self.config.max_retries):
-            try:
-                await asyncio.sleep(self.config.request_delay)
-
-                async with self.session.get(url, **kwargs) as response:
-                    if response.status == 200:
-                        return await response.text()
-                    else:
-                        logger.warning(f"HTTP {response.status} for {url}")
-                        if response.status == 429:  # Rate limited
-                            await asyncio.sleep(self.config.request_delay * 2)
-
-            except Exception as e:
-                logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
-                if attempt < self.config.max_retries - 1:
-                    await asyncio.sleep(self.config.request_delay * (attempt + 1))
-
-        return None
-
-    def parse_amount_range(
-        self, amount_text: str
-    ) -> Tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
-        """Parse amount text into range values"""
-        if not amount_text:
-            return None, None, None
-
-        amount_text = amount_text.replace(",", "").replace("$", "").strip()
-
-        # Look for range patterns like "$1,001 - $15,000"
-        range_match = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_text)
-        if range_match:
-            min_val = Decimal(range_match.group(1))
-            max_val = Decimal(range_match.group(2))
-            return min_val, max_val, None
-
-        # Look for exact amounts
-        exact_match = re.search(r"(\d+(?:\.\d{2})?)", amount_text)
-        if exact_match:
-            exact_val = Decimal(exact_match.group(1))
-            return None, None, exact_val
-
-        # Handle standard ranges
-        range_mappings = {
-            "$1,001 - $15,000": (Decimal("1001"), Decimal("15000")),
-            "$15,001 - $50,000": (Decimal("15001"), Decimal("50000")),
-            "$50,001 - $100,000": (Decimal("50001"), Decimal("100000")),
-            "$100,001 - $250,000": (Decimal("100001"), Decimal("250000")),
-            "$250,001 - $500,000": (Decimal("250001"), Decimal("500000")),
-            "$500,001 - $1,000,000": (Decimal("500001"), Decimal("1000000")),
-            "$1,000,001 - $5,000,000": (Decimal("1000001"), Decimal("5000000")),
-            "$5,000,001 - $25,000,000": (Decimal("5000001"), Decimal("25000000")),
-            "$25,000,001 - $50,000,000": (Decimal("25000001"), Decimal("50000000")),
-            "Over $50,000,000": (Decimal("50000001"), None),
-        }
-
-        for pattern, (min_val, max_val) in range_mappings.items():
-            if pattern.lower() in amount_text.lower():
-                return min_val, max_val, None
-
-        return None, None, None
-
-
-class CongressTradingScraper(BaseScraper):
-    """Scraper for US Congress trading data"""
-
-    async def scrape_house_disclosures(self) -> List[TradingDisclosure]:
-        """Scrape House financial disclosures from the official database"""
-        disclosures = []
-        base_url = "https://disclosures-clerk.house.gov"
-        search_url = f"{base_url}/FinancialDisclosure"
-
-        try:
-            logger.info("Starting House disclosures scrape from official database")
-
-            async with aiohttp.ClientSession(
-                timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={"User-Agent": self.config.user_agent},
-            ) as session:
-
-                # Get the ViewSearch form page
-                view_search_url = f"{base_url}/FinancialDisclosure/ViewSearch"
-                async with session.get(view_search_url) as response:
-                    if response.status == 200:
-                        html = await response.text()
-                        logger.info("Successfully accessed House financial disclosure search form")
-
-                        # Extract form data for ASPX
-                        soup = BeautifulSoup(html, "html.parser")
-
-                        # Look for common ASPX form fields
-                        form_fields = {}
-                        for field_name in [
-                            "__VIEWSTATE",
-                            "__VIEWSTATEGENERATOR",
-                            "__EVENTVALIDATION",
-                            "__REQUESTVERIFICATIONTOKEN",
-                        ]:
-                            field = soup.find("input", {"name": field_name})
-                            if field and field.get("value"):
-                                form_fields[field_name] = field["value"]
-
-                        if "__VIEWSTATE" in form_fields:
-                            logger.info("Found required ASPX form fields")
-
-                            # Search for recent disclosures - try different form field names
-                            current_year = str(datetime.now().year)
-
-                            # Search for common politician last names to get real data
-                            common_names = [
-                                "Smith",
-                                "Johnson",
-                                "Brown",
-                                "Davis",
-                                "Wilson",
-                                "Miller",
-                                "Garcia",
-                            ]
-
-                            # Try different form patterns with actual names
-                            possible_form_data_sets = []
-
-                            for name in common_names:
-                                possible_form_data_sets.extend(
-                                    [
-                                        {
-                                            **form_fields,
-                                            "ctl00$MainContent$txtLastName": name,
-                                            "ctl00$MainContent$ddlFilingYear": current_year,
-                                            "ctl00$MainContent$btnSearch": "Search",
-                                        },
-                                        {
-                                            **form_fields,
-                                            "ctl00$ContentPlaceHolder1$txtLastName": name,
-                                            "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
-                                            "ctl00$ContentPlaceHolder1$btnSearch": "Search",
-                                        },
-                                        {
-                                            **form_fields,
-                                            "LastName": name,
-                                            "FilingYear": current_year,
-                                            "Search": "Search",
-                                        },
-                                    ]
-                                )
-
-                            # Also try without names (all results)
-                            possible_form_data_sets.extend(
-                                [
-                                    {
-                                        **form_fields,
-                                        "ctl00$MainContent$ddlFilingYear": current_year,
-                                        "ctl00$MainContent$btnSearch": "Search",
-                                    },
-                                    {
-                                        **form_fields,
-                                        "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
-                                        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
-                                    },
-                                ]
-                            )
-
-                            # Try each form configuration
-                            for i, form_data in enumerate(possible_form_data_sets):
-                                try:
-                                    logger.info(f"Attempting search with form configuration {i+1}")
-                                    async with session.post(
-                                        view_search_url, data=form_data
-                                    ) as search_response:
-                                        if search_response.status == 200:
-                                            results_html = await search_response.text()
-                                            if (
-                                                "search results" in results_html.lower()
-                                                or "disclosure" in results_html.lower()
-                                            ):
-                                                disclosures = await self._parse_house_results(
-                                                    results_html, base_url
-                                                )
-                                                logger.info(
-                                                    f"Successfully found {len(disclosures)} House disclosures"
-                                                )
-                                                break
-                                            else:
-                                                logger.debug(
-                                                    f"Form config {i+1} didn't return results"
-                                                )
-                                        else:
-                                            logger.debug(
-                                                f"Form config {i+1} failed with status {search_response.status}"
-                                            )
-                                except Exception as e:
-                                    logger.debug(f"Form config {i+1} failed: {e}")
-                            else:
-                                logger.warning(
-                                    "All form configurations failed, using basic page scraping"
-                                )
-                                # Fall back to scraping any existing disclosure links on the page
-                                disclosures = await self._parse_house_results(html, base_url)
-                        else:
-                            logger.warning(
-                                "Could not find required ASPX form fields, using basic page scraping"
-                            )
-                            # Fall back to parsing any existing links
-                            disclosures = await self._parse_house_results(html, base_url)
-                    else:
-                        logger.warning(f"Failed to access House disclosure site: {response.status}")
-
-            # Rate limiting
-            await asyncio.sleep(self.config.request_delay)
-
-        except Exception as e:
-            logger.error(f"House disclosures scrape failed: {e}")
-            # Return empty list on error rather than sample data
-
-        return disclosures
-
-    async def scrape_senate_disclosures(self) -> List[TradingDisclosure]:
-        """Scrape Senate financial disclosures from the official EFD database"""
-        disclosures = []
-        base_url = "https://efdsearch.senate.gov"
-        search_url = f"{base_url}/search/"
-
-        try:
-            logger.info("Starting Senate disclosures scrape from EFD database")
-
-            async with aiohttp.ClientSession(
-                timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={"User-Agent": self.config.user_agent},
-            ) as session:
-
-                # Search for recent periodic transaction reports (PTRs)
-                search_params = {
-                    "report_type": "11",  # Periodic Transaction Report
-                    "submitted_start_date": (datetime.now() - timedelta(days=90)).strftime(
-                        "%m/%d/%Y"
-                    ),
-                    "submitted_end_date": datetime.now().strftime("%m/%d/%Y"),
-                }
-
-                async with session.get(search_url, params=search_params) as response:
-                    if response.status == 200:
-                        html = await response.text()
-                        disclosures = await self._parse_senate_results(html, base_url)
-                        logger.info(f"Found {len(disclosures)} Senate disclosures")
-                    else:
-                        logger.warning(f"Senate search failed with status {response.status}")
-
-            # Rate limiting
-            await asyncio.sleep(self.config.request_delay)
-
-        except Exception as e:
-            logger.error(f"Senate disclosures scrape failed: {e}")
-            # Return empty list on error rather than sample data
-
-        return disclosures
-
-    async def _parse_house_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
-        """Parse House disclosure search results"""
-        disclosures = []
-
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-
-            # Look for disclosure result rows - try multiple selectors
-            result_rows = (
-                soup.find_all("tr", class_="disclosure-row")
-                or soup.select('tr[id*="GridView"]')
-                or soup.select("table tr")
-                or soup.find_all("tr")
-            )
-
-            logger.info(f"Found {len(result_rows)} potential result rows")
-
-            for row in result_rows[:20]:  # Limit to 20 most recent
-                cells = row.find_all("td")
-                if len(cells) >= 3:  # At least 3 columns
-                    # Extract text from each cell to identify the structure
-                    cell_texts = [
-                        cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)
-                    ]
-
-                    if not cell_texts:
-                        continue
-
-                    # Try to identify which cell contains the politician name
-                    # Names usually contain letters and may have titles like "Rep.", "Hon."
-                    politician_name = ""
-
-                    for text in cell_texts:
-                        # Look for text that looks like a person's name
-                        if (
-                            len(text) > 3
-                            and any(c.isalpha() for c in text)
-                            and not text.isdigit()
-                            and not text.startswith("20")  # Not a year
-                            and "pdf" not in text.lower()
-                            and "view" not in text.lower()
-                        ):
-
-                            # Clean up potential name
-                            clean_name = (
-                                text.replace("Hon.", "")
-                                .replace("Rep.", "")
-                                .replace("Sen.", "")
-                                .strip()
-                            )
-                            if len(clean_name) > 3 and " " in clean_name:  # Likely full name
-                                politician_name = clean_name
-                                break
-
-                    if not politician_name:
-                        politician_name = cell_texts[0]  # Fallback to first cell
-
-                    # Extract other information
-                    filing_year = next(
-                        (
-                            text
-                            for text in cell_texts
-                            if text.isdigit() and len(text) == 4 and text.startswith("20")
-                        ),
-                        "",
-                    )
-                    filing_type = next(
-                        (
-                            text
-                            for text in cell_texts
-                            if "periodic" in text.lower() or "annual" in text.lower()
-                        ),
-                        "",
-                    )
-
-                    # Look for PDF link
-                    pdf_link = row.find("a", href=True)
-                    if pdf_link:
-                        pdf_url = urljoin(base_url, pdf_link["href"])
-
-                        # Create basic disclosure entry
-                        # Note: Actual transaction details would require PDF parsing
-                        disclosure = TradingDisclosure(
-                            politician_id="",  # To be filled by matcher
-                            transaction_date=datetime.now() - timedelta(days=30),  # Estimate
-                            disclosure_date=datetime.now() - timedelta(days=15),  # Estimate
-                            transaction_type=TransactionType.PURCHASE,  # Default
-                            asset_name="Unknown Asset",  # Would need PDF parsing
-                            asset_type="stock",
-                            amount_range_min=Decimal("1001"),
-                            amount_range_max=Decimal("15000"),
-                            source_url=pdf_url,
-                            raw_data={
-                                "politician_name": politician_name,
-                                "filing_year": filing_year,
-                                "filing_type": filing_type,
-                                "requires_pdf_parsing": True,
-                                "extraction_method": "house_search_results",
-                            },
-                        )
-                        disclosures.append(disclosure)
-
-        except Exception as e:
-            logger.error(f"Error parsing House results: {e}")
-
-        return disclosures
-
-    async def _parse_senate_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
-        """Parse Senate EFD search results"""
-        disclosures = []
-
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-
-            # Look for search result rows
-            result_rows = soup.find_all("tr", class_="searchresult") or soup.select("tbody tr")
-
-            for row in result_rows[:20]:  # Limit to 20 most recent
-                cells = row.find_all("td")
-                if len(cells) >= 4:
-                    # Extract information
-                    name = cells[0].get_text(strip=True) if cells[0] else ""
-                    report_type = cells[1].get_text(strip=True) if cells[1] else ""
-                    filing_date = cells[2].get_text(strip=True) if cells[2] else ""
-
-                    # Look for report link
-                    report_link = row.find("a", href=True)
-                    if report_link and "ptr" in report_type.lower():  # Periodic Transaction Report
-                        report_url = urljoin(base_url, report_link["href"])
-
-                        # Create disclosure entry
-                        # Note: Actual transaction details would require report parsing
-                        disclosure = TradingDisclosure(
-                            politician_id="",  # To be filled by matcher
-                            transaction_date=datetime.now() - timedelta(days=30),  # Estimate
-                            disclosure_date=self._parse_date(filing_date) or datetime.now(),
-                            transaction_type=TransactionType.PURCHASE,  # Default
-                            asset_name="Unknown Asset",  # Would need report parsing
-                            asset_type="stock",
-                            amount_range_min=Decimal("1001"),
-                            amount_range_max=Decimal("50000"),
-                            source_url=report_url,
-                            raw_data={
-                                "politician_name": name,
-                                "report_type": report_type,
-                                "filing_date": filing_date,
-                                "requires_report_parsing": True,
-                            },
-                        )
-                        disclosures.append(disclosure)
-
-        except Exception as e:
-            logger.error(f"Error parsing Senate results: {e}")
-
-        return disclosures
-
-    def _parse_date(self, date_str: str) -> Optional[datetime]:
-        """Parse various date formats from disclosure sites"""
-        if not date_str:
-            return None
-
-        date_formats = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]
-
-        for fmt in date_formats:
-            try:
-                return datetime.strptime(date_str.strip(), fmt)
-            except ValueError:
-                continue
-
-        logger.warning(f"Could not parse date: {date_str}")
-        return None
-
-
-class QuiverQuantScraper(BaseScraper):
-    """Scraper for QuiverQuant congress trading data as a backup source"""
-
-    async def scrape_congress_trades(self) -> List[Dict[str, Any]]:
-        """Scrape congress trading data from QuiverQuant"""
-        trades = []
-
-        try:
-            # This would implement scraping from QuiverQuant's public data
-            # Note: Respect their robots.txt and terms of service
-            logger.info("Starting QuiverQuant scrape")
-
-            url = "https://www.quiverquant.com/congresstrading/"
-            html = await self.fetch_page(url)
-
-            if html:
-                soup = BeautifulSoup(html, "html.parser")
-
-                # Parse the trading data table (simplified example)
-                # In reality, this might require handling JavaScript rendering
-                trade_rows = soup.select("table tr")
-
-                for row in trade_rows[1:10]:  # Skip header, limit to 10 for example
-                    cells = row.select("td")
-                    if len(cells) >= 4:
-                        # Extract cell contents and try to identify the correct fields
-                        cell_texts = [cell.get_text(strip=True) for cell in cells]
-
-                        # Try to identify which cell contains what data
-                        politician_name = cell_texts[0] if len(cell_texts) > 0 else ""
-
-                        # Look for date-like patterns (YYYY-MM-DD, MM/DD/YYYY, etc.)
-                        transaction_date = ""
-                        ticker = ""
-                        transaction_type = ""
-                        amount = ""
-
-                        for i, text in enumerate(
-                            cell_texts[1:], 1
-                        ):  # Skip first cell (politician name)
-                            # Check if this looks like a date
-                            if self._looks_like_date(text):
-                                transaction_date = text
-                            # Check if this looks like a ticker (all caps, short)
-                            elif text.isupper() and len(text) <= 5 and text.isalpha():
-                                ticker = text
-                            # Check if this contains transaction type keywords
-                            elif any(
-                                word in text.lower() for word in ["purchase", "sale", "buy", "sell"]
-                            ):
-                                # Split transaction type and amount if combined
-                                if "$" in text:
-                                    # Split on $ to separate transaction type from amount
-                                    parts = text.split("$", 1)
-                                    transaction_type = parts[0].strip()
-                                    amount = "$" + parts[1] if len(parts) > 1 else ""
-                                else:
-                                    transaction_type = text
-                            # Check if this looks like an amount (contains $ or numbers with ,)
-                            elif "$" in text or ("," in text and any(c.isdigit() for c in text)):
-                                amount = text
-
-                        # Only create trade data if we have essential fields
-                        if politician_name and (transaction_date or ticker):
-                            trade_data = {
-                                "politician_name": politician_name,
-                                "transaction_date": transaction_date,
-                                "ticker": ticker,
-                                "transaction_type": transaction_type,
-                                "amount": amount,
-                                "source": "quiverquant",
-                            }
-                            trades.append(trade_data)
-
-        except Exception as e:
-            logger.error(f"QuiverQuant scrape failed: {e}")
-
-        return trades
-
-    def _looks_like_date(self, text: str) -> bool:
-        """Check if a string looks like a date"""
-        if not text or len(text) < 8:
-            return False
-
-        # Common date patterns
-        date_patterns = [
-            r"\d{4}-\d{1,2}-\d{1,2}",  # YYYY-MM-DD
-            r"\d{1,2}/\d{1,2}/\d{4}",  # MM/DD/YYYY
-            r"\d{1,2}-\d{1,2}-\d{4}",  # MM-DD-YYYY
-            r"\w{3}\s+\d{1,2},?\s+\d{4}",  # Month DD, YYYY
-        ]
-
-        import re
-
-        for pattern in date_patterns:
-            if re.search(pattern, text):
-                return True
-        return False
-
-    def parse_quiver_trade(self, trade_data: Dict[str, Any]) -> Optional[TradingDisclosure]:
-        """Parse QuiverQuant trade data into TradingDisclosure"""
-        try:
-            # Debug: Log the trade data structure
-            logger.debug(f"Parsing QuiverQuant trade data: {trade_data}")
-            # Parse transaction type
-            transaction_type_map = {
-                "purchase": TransactionType.PURCHASE,
-                "sale": TransactionType.SALE,
-                "buy": TransactionType.PURCHASE,
-                "sell": TransactionType.SALE,
-            }
-
-            transaction_type = transaction_type_map.get(
-                trade_data.get("transaction_type", "").lower(), TransactionType.PURCHASE
-            )
-
-            # Parse date
-            date_str = trade_data.get("transaction_date", "")
-            if not date_str or date_str.strip() == "" or not self._looks_like_date(date_str):
-                # Use estimated date if no valid date found
-                transaction_date = datetime.now() - timedelta(days=30)  # Estimate 30 days ago
-            else:
-                # Try multiple date formats
-                try:
-                    # Standard format
-                    transaction_date = datetime.strptime(date_str, "%Y-%m-%d")
-                except ValueError:
-                    try:
-                        # Alternative format
-                        transaction_date = datetime.strptime(date_str, "%m/%d/%Y")
-                    except ValueError:
-                        try:
-                            # Try MM-DD-YYYY
-                            transaction_date = datetime.strptime(date_str, "%m-%d-%Y")
-                        except ValueError:
-                            logger.warning(
-                                f"Could not parse date '{date_str}', using estimated date"
-                            )
-                            transaction_date = datetime.now() - timedelta(days=30)
-
-            # Parse amount
-            amount_min, amount_max, amount_exact = self.parse_amount_range(
-                trade_data.get("amount", "")
-            )
-
-            disclosure = TradingDisclosure(
-                politician_id="",  # Will be filled after politician matching
-                transaction_date=transaction_date,
-                disclosure_date=datetime.now(),  # QuiverQuant aggregation date
-                transaction_type=transaction_type,
-                asset_name=trade_data.get("ticker", ""),
-                asset_ticker=trade_data.get("ticker", ""),
-                asset_type="stock",
-                amount_range_min=amount_min,
-                amount_range_max=amount_max,
-                amount_exact=amount_exact,
-                source_url="https://www.quiverquant.com/congresstrading/",
-                raw_data=trade_data,
-            )
-
-            return disclosure
-
-        except Exception as e:
-            logger.error(f"Failed to parse QuiverQuant trade: {e}")
-            return None
-
-
-class EUParliamentScraper(BaseScraper):
-    """Scraper for EU Parliament member declarations"""
-
-    async def scrape_mep_declarations(self) -> List[TradingDisclosure]:
-        """Scrape MEP financial declarations from official EU Parliament site"""
-        disclosures = []
-
-        try:
-            logger.info("Starting EU Parliament MEP declarations scrape")
-            base_url = "https://www.europarl.europa.eu"
-
-            async with aiohttp.ClientSession(
-                timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={"User-Agent": self.config.user_agent},
-            ) as session:
-
-                # Get list of current MEPs
-                mep_list_url = f"{base_url}/meps/en/full-list/all"
-
-                async with session.get(mep_list_url) as response:
-                    if response.status == 200:
-                        html = await response.text()
-                        mep_data = await self._extract_mep_urls(html, base_url)
-                        logger.info(f"Found {len(mep_data)} MEP profiles to check")
-
-                        # Check declarations for a subset of MEPs (to avoid overwhelming the server)
-                        for i, mep_info in enumerate(mep_data[:50]):  # Limit to 50 MEPs
-                            try:
-                                mep_disclosures = await self._scrape_mep_profile(
-                                    session, mep_info["url"], mep_info
-                                )
-                                disclosures.extend(mep_disclosures)
-
-                                # Rate limiting - EU Parliament is more sensitive
-                                await asyncio.sleep(self.config.request_delay * 2)
-
-                                if i > 0 and i % 10 == 0:
-                                    logger.info(f"Processed {i} MEP profiles")
-
-                            except Exception as e:
-                                logger.warning(f"Failed to process MEP profile {mep_url}: {e}")
-                                continue
-                    else:
-                        logger.warning(f"Failed to access MEP list: {response.status}")
-
-            logger.info(f"Collected {len(disclosures)} EU Parliament disclosures")
-
-        except Exception as e:
-            logger.error(f"EU Parliament scrape failed: {e}")
-
-        return disclosures
-
-    async def _extract_mep_urls(self, html: str, base_url: str) -> List[Dict[str, str]]:
-        """Extract MEP profile URLs and names from the MEP list page"""
-        mep_data = []
-
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-
-            # Look for MEP profile links - they usually contain both name and link
-            mep_links = soup.find_all("a", href=True)
-
-            seen_urls = set()
-
-            for link in mep_links:
-                href = link.get("href", "")
-                if "/meps/en/" in href and "/home" in href:
-                    full_url = urljoin(base_url, href)
-
-                    if full_url not in seen_urls:
-                        # Extract MEP name from link text or nearby elements
-                        mep_name = ""
-
-                        # Try to get name from link text
-                        link_text = link.get_text(strip=True)
-                        if (
-                            link_text
-                            and len(link_text) > 3
-                            and not link_text.lower().startswith("http")
-                        ):
-                            mep_name = link_text
-
-                        # If no name in link, look in parent elements
-                        if not mep_name:
-                            parent = link.parent
-                            if parent:
-                                # Look for text that looks like a name
-                                for text_node in parent.stripped_strings:
-                                    if (
-                                        len(text_node) > 3
-                                        and " " in text_node
-                                        and not text_node.startswith("http")
-                                        and not text_node.isdigit()
-                                    ):
-                                        mep_name = text_node
-                                        break
-
-                        # Extract country/party info if available
-                        country = ""
-                        party = ""
-
-                        # Look for country and party info near the link
-                        container = link.find_parent(["div", "article", "section"])
-                        if container:
-                            text_elements = list(container.stripped_strings)
-                            for i, text in enumerate(text_elements):
-                                if text == mep_name and i < len(text_elements) - 2:
-                                    # Country and party usually come after name
-                                    country = (
-                                        text_elements[i + 1] if i + 1 < len(text_elements) else ""
-                                    )
-                                    party = (
-                                        text_elements[i + 2] if i + 2 < len(text_elements) else ""
-                                    )
-
-                        if mep_name:  # Only add if we found a name
-                            mep_data.append(
-                                {
-                                    "url": full_url,
-                                    "name": mep_name,
-                                    "country": country,
-                                    "party": party,
-                                }
-                            )
-                            seen_urls.add(full_url)
-
-                        # Limit to prevent overwhelming the servers
-                        if len(mep_data) >= 50:
-                            break
-
-        except Exception as e:
-            logger.error(f"Error extracting MEP data: {e}")
-
-        return mep_data
-
-    async def _scrape_mep_profile(
-        self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None
-    ) -> List[TradingDisclosure]:
-        """Scrape financial interests from an individual MEP profile"""
-        disclosures = []
-
-        try:
-            async with session.get(mep_url) as response:
-                if response.status == 200:
-                    html = await response.text()
-                    soup = BeautifulSoup(html, "html.parser")
-
-                    # Use extracted MEP name from list, or try to extract from profile
-                    if mep_info and mep_info.get("name"):
-                        mep_name = mep_info["name"]
-                        mep_country = mep_info.get("country", "")
-                        mep_party = mep_info.get("party", "")
-                    else:
-                        # Fallback: extract from profile page
-                        name_element = soup.find("h1", class_="ep-header-title")
-                        mep_name = (
-                            name_element.get_text(strip=True) if name_element else "Unknown MEP"
-                        )
-                        mep_country = ""
-                        mep_party = ""
-
-                    # Look for financial interests section
-                    # EU Parliament declarations are typically in a specific section
-                    interests_section = (
-                        soup.find("div", id="financial-interests")
-                        or soup.find("section", class_="ep-a-section")
-                        or soup.find("div", class_="ep-m-content-block")
-                    )
-
-                    if interests_section:
-                        # Parse financial interests
-                        # Note: EU declarations focus more on activities and interests than specific trades
-                        interest_items = interests_section.find_all(
-                            ["p", "li", "div"], recursive=True
-                        )
-
-                        for item in interest_items:
-                            item_text = item.get_text(strip=True).lower()
-
-                            # Look for financial keywords
-                            if any(
-                                keyword in item_text
-                                for keyword in [
-                                    "shareholding",
-                                    "investment",
-                                    "director",
-                                    "board",
-                                    "financial interest",
-                                    "remuneration",
-                                    "consulting",
-                                ]
-                            ):
-                                # Create disclosure for detected financial interest
-                                disclosure = TradingDisclosure(
-                                    politician_id="",  # To be filled by matcher
-                                    transaction_date=datetime.now()
-                                    - timedelta(days=90),  # Estimate
-                                    disclosure_date=datetime.now() - timedelta(days=60),  # Estimate
-                                    transaction_type=TransactionType.PURCHASE,  # Default for interests
-                                    asset_name=self._extract_company_name(item_text),
-                                    asset_type="interest",
-                                    amount_range_min=Decimal(
-                                        "0"
-                                    ),  # EU doesn't always specify amounts
-                                    amount_range_max=Decimal("0"),
-                                    source_url=mep_url,
-                                    raw_data={
-                                        "politician_name": mep_name,
-                                        "country": mep_country,
-                                        "party": mep_party,
-                                        "interest_type": "financial_activity",
-                                        "interest_description": item.get_text(strip=True)[
-                                            :500
-                                        ],  # Truncate
-                                        "region": "eu",
-                                        "extraction_method": "mep_profile_scraping",
-                                        "requires_manual_review": True,
-                                    },
-                                )
-                                disclosures.append(disclosure)
-
-        except Exception as e:
-            logger.warning(f"Error scraping MEP profile {mep_url}: {e}")
-
-        return disclosures
-
-    def _extract_company_name(self, text: str) -> str:
-        """Extract company/organization name from interest description"""
-        # Simple heuristic to extract potential company names
-        words = text.split()
-
-        # Look for capitalized sequences that might be company names
-        potential_names = []
-        current_name = []
-
-        for word in words:
-            if word[0].isupper() and len(word) > 2:
-                current_name.append(word)
-            else:
-                if current_name and len(current_name) <= 4:  # Reasonable company name length
-                    potential_names.append(" ".join(current_name))
-                current_name = []
-
-        if current_name and len(current_name) <= 4:
-            potential_names.append(" ".join(current_name))
-
-        # Return the first reasonable candidate or default
-        return potential_names[0] if potential_names else "Financial Interest"
-
-
-class PoliticianMatcher:
-    """Matches scraped names to politician records"""
-
-    def __init__(self, politicians: List[Politician]):
-        self.politicians = politicians
-        self._build_lookup()
-
-    def _build_lookup(self):
-        """Build lookup dictionaries for fast matching"""
-        self.name_lookup = {}
-        self.bioguide_lookup = {}
-
-        for politician in self.politicians:
-            # Full name variations
-            full_name = politician.full_name.lower()
-            self.name_lookup[full_name] = politician
-
-            # Last, First format
-            if politician.first_name and politician.last_name:
-                last_first = f"{politician.last_name.lower()}, {politician.first_name.lower()}"
-                self.name_lookup[last_first] = politician
-
-                # First Last format
-                first_last = f"{politician.first_name.lower()} {politician.last_name.lower()}"
-                self.name_lookup[first_last] = politician
-
-            # Bioguide ID lookup
-            if politician.bioguide_id:
-                self.bioguide_lookup[politician.bioguide_id] = politician
-
-    def find_politician(self, name: str, bioguide_id: str = None) -> Optional[Politician]:
-        """Find politician by name or bioguide ID"""
-        if bioguide_id and bioguide_id in self.bioguide_lookup:
-            return self.bioguide_lookup[bioguide_id]
-
-        if name:
-            name_clean = name.lower().strip()
-
-            # Direct match
-            if name_clean in self.name_lookup:
-                return self.name_lookup[name_clean]
-
-            # Fuzzy matching (simplified)
-            for lookup_name, politician in self.name_lookup.items():
-                if self._names_similar(name_clean, lookup_name):
-                    return politician
-
-        return None
-
-    def _names_similar(self, name1: str, name2: str) -> bool:
-        """Simple similarity check for names"""
-        # Remove common prefixes/suffixes
-        prefixes = ["rep.", "sen.", "senator", "representative", "mr.", "mrs.", "ms."]
-        suffixes = ["jr.", "sr.", "ii", "iii", "iv"]
-
-        for prefix in prefixes:
-            name1 = name1.replace(prefix, "").strip()
-            name2 = name2.replace(prefix, "").strip()
-
-        for suffix in suffixes:
-            name1 = name1.replace(suffix, "").strip()
-            name2 = name2.replace(suffix, "").strip()
-
-        # Check if one name contains the other
-        return name1 in name2 or name2 in name1
-
-
-# Import specialized scrapers after base classes are defined
-try:
-    from .scrapers_uk import UKParliamentScraper, run_uk_parliament_collection
-
-    UK_SCRAPER_AVAILABLE = True
-except Exception as e:
-    logger.debug(f"UK scraper import failed: {e}")
-    UKParliamentScraper = None
-    run_uk_parliament_collection = None
-    UK_SCRAPER_AVAILABLE = False
-
-try:
-    from .scrapers_california import CaliforniaNetFileScraper, run_california_collection
-
-    CALIFORNIA_SCRAPER_AVAILABLE = True
-except Exception as e:
-    logger.debug(f"California scraper import failed: {e}")
-    CaliforniaNetFileScraper = None
-    run_california_collection = None
-    CALIFORNIA_SCRAPER_AVAILABLE = False
-
-try:
-    from .scrapers_eu import EUMemberStatesScraper, run_eu_member_states_collection
-
-    EU_MEMBER_STATES_SCRAPER_AVAILABLE = True
-except Exception as e:
-    logger.debug(f"EU member states scraper import failed: {e}")
-    EUMemberStatesScraper = None
-    run_eu_member_states_collection = None
-    EU_MEMBER_STATES_SCRAPER_AVAILABLE = False
-
-try:
-    from .scrapers_us_states import USStatesScraper, run_us_states_collection
-
-    US_STATES_SCRAPER_AVAILABLE = True
-except Exception as e:
-    logger.debug(f"US states scraper import failed: {e}")
-    USStatesScraper = None
-    run_us_states_collection = None
-    US_STATES_SCRAPER_AVAILABLE = False
-
-
-# Workflow functions using imported scrapers
-async def run_uk_parliament_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-    """Run UK Parliament data collection workflow"""
-    if not UK_SCRAPER_AVAILABLE:
-        logger.warning("UK Parliament scraper not available")
-        return []
-
-    logger.info("Starting UK Parliament financial interests collection")
-    try:
-        disclosures = await run_uk_parliament_collection(config)
-        logger.info(f"Successfully collected {len(disclosures)} UK Parliament disclosures")
-        return disclosures
-    except Exception as e:
-        logger.error(f"UK Parliament collection failed: {e}")
-        return []
-
-
-async def run_california_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-    """Run California NetFile and state disclosure collection workflow"""
-    if not CALIFORNIA_SCRAPER_AVAILABLE:
-        logger.warning("California scraper not available")
-        return []
-
-    logger.info("Starting California financial disclosures collection")
-    try:
-        disclosures = await run_california_collection(config)
-        logger.info(f"Successfully collected {len(disclosures)} California disclosures")
-        return disclosures
-    except Exception as e:
-        logger.error(f"California collection failed: {e}")
-        return []
-
-
-async def run_eu_member_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-    """Run EU member states financial disclosure collection workflow"""
-    if not EU_MEMBER_STATES_SCRAPER_AVAILABLE:
-        logger.warning("EU member states scraper not available")
-        return []
-
-    logger.info("Starting EU member states financial disclosures collection")
-    try:
-        disclosures = await run_eu_member_states_collection(config)
-        logger.info(f"Successfully collected {len(disclosures)} EU member states disclosures")
-        return disclosures
-    except Exception as e:
-        logger.error(f"EU member states collection failed: {e}")
-        return []
-
-
-async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
-    """Run US states financial disclosure collection workflow"""
-    if not US_STATES_SCRAPER_AVAILABLE:
-        logger.warning("US states scraper not available")
-        return []
-
-    logger.info("Starting US states financial disclosures collection")
-    try:
-        disclosures = await run_us_states_collection(config)
-        logger.info(f"Successfully collected {len(disclosures)} US states disclosures")
-        return disclosures
-    except Exception as e:
-        logger.error(f"US states collection failed: {e}")
-        return []
-
-
-# Export the new workflow function
-__all__ = [
-    "BaseScraper",
-    "CongressTradingScraper",
-    "QuiverQuantScraper",
-    "EUParliamentScraper",
-    "PoliticianMatcher",
-    "run_uk_parliament_workflow",
-    "run_california_workflow",
-    "run_eu_member_states_workflow",
-    "run_us_states_workflow",
-]