mcli-framework 7.1.0-py3-none-any.whl → 7.1.2-py3-none-any.whl
This diff compares publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of mcli-framework has been flagged as possibly problematic.
- mcli/app/completion_cmd.py +59 -49
- mcli/app/completion_helpers.py +60 -138
- mcli/app/logs_cmd.py +46 -13
- mcli/app/main.py +17 -14
- mcli/app/model_cmd.py +19 -4
- mcli/chat/chat.py +3 -2
- mcli/lib/search/cached_vectorizer.py +1 -0
- mcli/lib/services/data_pipeline.py +12 -5
- mcli/lib/services/lsh_client.py +69 -58
- mcli/ml/api/app.py +28 -36
- mcli/ml/api/middleware.py +8 -16
- mcli/ml/api/routers/admin_router.py +3 -1
- mcli/ml/api/routers/auth_router.py +32 -56
- mcli/ml/api/routers/backtest_router.py +3 -1
- mcli/ml/api/routers/data_router.py +3 -1
- mcli/ml/api/routers/model_router.py +35 -74
- mcli/ml/api/routers/monitoring_router.py +3 -1
- mcli/ml/api/routers/portfolio_router.py +3 -1
- mcli/ml/api/routers/prediction_router.py +60 -65
- mcli/ml/api/routers/trade_router.py +6 -2
- mcli/ml/api/routers/websocket_router.py +12 -9
- mcli/ml/api/schemas.py +10 -2
- mcli/ml/auth/auth_manager.py +49 -114
- mcli/ml/auth/models.py +30 -15
- mcli/ml/auth/permissions.py +12 -19
- mcli/ml/backtesting/backtest_engine.py +134 -108
- mcli/ml/backtesting/performance_metrics.py +142 -108
- mcli/ml/cache.py +12 -18
- mcli/ml/cli/main.py +37 -23
- mcli/ml/config/settings.py +29 -12
- mcli/ml/dashboard/app.py +122 -130
- mcli/ml/dashboard/app_integrated.py +283 -152
- mcli/ml/dashboard/app_supabase.py +176 -108
- mcli/ml/dashboard/app_training.py +212 -206
- mcli/ml/dashboard/cli.py +14 -5
- mcli/ml/data_ingestion/api_connectors.py +51 -81
- mcli/ml/data_ingestion/data_pipeline.py +127 -125
- mcli/ml/data_ingestion/stream_processor.py +72 -80
- mcli/ml/database/migrations/env.py +3 -2
- mcli/ml/database/models.py +112 -79
- mcli/ml/database/session.py +6 -5
- mcli/ml/experimentation/ab_testing.py +149 -99
- mcli/ml/features/ensemble_features.py +9 -8
- mcli/ml/features/political_features.py +6 -5
- mcli/ml/features/recommendation_engine.py +15 -14
- mcli/ml/features/stock_features.py +7 -6
- mcli/ml/features/test_feature_engineering.py +8 -7
- mcli/ml/logging.py +10 -15
- mcli/ml/mlops/data_versioning.py +57 -64
- mcli/ml/mlops/experiment_tracker.py +49 -41
- mcli/ml/mlops/model_serving.py +59 -62
- mcli/ml/mlops/pipeline_orchestrator.py +203 -149
- mcli/ml/models/base_models.py +8 -7
- mcli/ml/models/ensemble_models.py +6 -5
- mcli/ml/models/recommendation_models.py +7 -6
- mcli/ml/models/test_models.py +18 -14
- mcli/ml/monitoring/drift_detection.py +95 -74
- mcli/ml/monitoring/metrics.py +10 -22
- mcli/ml/optimization/portfolio_optimizer.py +172 -132
- mcli/ml/predictions/prediction_engine.py +235 -0
- mcli/ml/preprocessing/data_cleaners.py +6 -5
- mcli/ml/preprocessing/feature_extractors.py +7 -6
- mcli/ml/preprocessing/ml_pipeline.py +3 -2
- mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
- mcli/ml/preprocessing/test_preprocessing.py +4 -4
- mcli/ml/scripts/populate_sample_data.py +36 -16
- mcli/ml/tasks.py +82 -83
- mcli/ml/tests/test_integration.py +86 -76
- mcli/ml/tests/test_training_dashboard.py +169 -142
- mcli/mygroup/test_cmd.py +2 -1
- mcli/self/self_cmd.py +38 -18
- mcli/self/test_cmd.py +2 -1
- mcli/workflow/dashboard/dashboard_cmd.py +13 -6
- mcli/workflow/lsh_integration.py +46 -58
- mcli/workflow/politician_trading/commands.py +576 -427
- mcli/workflow/politician_trading/config.py +7 -7
- mcli/workflow/politician_trading/connectivity.py +35 -33
- mcli/workflow/politician_trading/data_sources.py +72 -71
- mcli/workflow/politician_trading/database.py +18 -16
- mcli/workflow/politician_trading/demo.py +4 -3
- mcli/workflow/politician_trading/models.py +5 -5
- mcli/workflow/politician_trading/monitoring.py +13 -13
- mcli/workflow/politician_trading/scrapers.py +332 -224
- mcli/workflow/politician_trading/scrapers_california.py +116 -94
- mcli/workflow/politician_trading/scrapers_eu.py +70 -71
- mcli/workflow/politician_trading/scrapers_uk.py +118 -90
- mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
- mcli/workflow/politician_trading/workflow.py +98 -71
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +2 -2
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -93
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
@@ -7,14 +7,14 @@ import logging
 import re
 from datetime import datetime, timedelta
 from decimal import Decimal
-from typing import
+from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin, urlparse
 
 import aiohttp
 from bs4 import BeautifulSoup
 
-from .models import Politician, TradingDisclosure, TransactionType, PoliticianRole
 from .config import ScrapingConfig
+from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
 
 logger = logging.getLogger(__name__)
 
@@ -114,105 +114,139 @@ class CongressTradingScraper(BaseScraper):
 
         try:
             logger.info("Starting House disclosures scrape from official database")
-
+
             async with aiohttp.ClientSession(
                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={
+                headers={"User-Agent": self.config.user_agent},
             ) as session:
-
+
                 # Get the ViewSearch form page
                 view_search_url = f"{base_url}/FinancialDisclosure/ViewSearch"
                 async with session.get(view_search_url) as response:
                     if response.status == 200:
                         html = await response.text()
                         logger.info("Successfully accessed House financial disclosure search form")
-
+
                         # Extract form data for ASPX
-                        soup = BeautifulSoup(html,
-
+                        soup = BeautifulSoup(html, "html.parser")
+
                         # Look for common ASPX form fields
                         form_fields = {}
-                        for field_name in [
-
-
-
-
-
+                        for field_name in [
+                            "__VIEWSTATE",
+                            "__VIEWSTATEGENERATOR",
+                            "__EVENTVALIDATION",
+                            "__REQUESTVERIFICATIONTOKEN",
+                        ]:
+                            field = soup.find("input", {"name": field_name})
+                            if field and field.get("value"):
+                                form_fields[field_name] = field["value"]
+
+                        if "__VIEWSTATE" in form_fields:
                             logger.info("Found required ASPX form fields")
-
+
                             # Search for recent disclosures - try different form field names
                             current_year = str(datetime.now().year)
-
+
                             # Search for common politician last names to get real data
-                            common_names = [
-
+                            common_names = [
+                                "Smith",
+                                "Johnson",
+                                "Brown",
+                                "Davis",
+                                "Wilson",
+                                "Miller",
+                                "Garcia",
+                            ]
+
                             # Try different form patterns with actual names
                             possible_form_data_sets = []
-
+
                             for name in common_names:
-                                possible_form_data_sets.extend(
+                                possible_form_data_sets.extend(
+                                    [
+                                        {
+                                            **form_fields,
+                                            "ctl00$MainContent$txtLastName": name,
+                                            "ctl00$MainContent$ddlFilingYear": current_year,
+                                            "ctl00$MainContent$btnSearch": "Search",
+                                        },
+                                        {
+                                            **form_fields,
+                                            "ctl00$ContentPlaceHolder1$txtLastName": name,
+                                            "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
+                                            "ctl00$ContentPlaceHolder1$btnSearch": "Search",
+                                        },
+                                        {
+                                            **form_fields,
+                                            "LastName": name,
+                                            "FilingYear": current_year,
+                                            "Search": "Search",
+                                        },
+                                    ]
+                                )
+
+                            # Also try without names (all results)
+                            possible_form_data_sets.extend(
+                                [
                                     {
                                         **form_fields,
-
-
-                                        'ctl00$MainContent$btnSearch': 'Search'
+                                        "ctl00$MainContent$ddlFilingYear": current_year,
+                                        "ctl00$MainContent$btnSearch": "Search",
                                     },
                                     {
                                         **form_fields,
-
-
-                                        'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
+                                        "ctl00$ContentPlaceHolder1$ddlFilingYear": current_year,
+                                        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
                                     },
-
-
-
-                                        'FilingYear': current_year,
-                                        'Search': 'Search'
-                                    }
-                                ])
-
-                            # Also try without names (all results)
-                            possible_form_data_sets.extend([
-                                {
-                                    **form_fields,
-                                    'ctl00$MainContent$ddlFilingYear': current_year,
-                                    'ctl00$MainContent$btnSearch': 'Search'
-                                },
-                                {
-                                    **form_fields,
-                                    'ctl00$ContentPlaceHolder1$ddlFilingYear': current_year,
-                                    'ctl00$ContentPlaceHolder1$btnSearch': 'Search'
-                                }
-                            ])
-
+                                ]
+                            )
+
                             # Try each form configuration
                             for i, form_data in enumerate(possible_form_data_sets):
                                 try:
                                     logger.info(f"Attempting search with form configuration {i+1}")
-                                    async with session.post(
+                                    async with session.post(
+                                        view_search_url, data=form_data
+                                    ) as search_response:
                                         if search_response.status == 200:
                                             results_html = await search_response.text()
-                                            if
-
-
+                                            if (
+                                                "search results" in results_html.lower()
+                                                or "disclosure" in results_html.lower()
+                                            ):
+                                                disclosures = await self._parse_house_results(
+                                                    results_html, base_url
+                                                )
+                                                logger.info(
+                                                    f"Successfully found {len(disclosures)} House disclosures"
+                                                )
                                                 break
                                             else:
-                                                logger.debug(
+                                                logger.debug(
+                                                    f"Form config {i+1} didn't return results"
+                                                )
                                         else:
-                                            logger.debug(
+                                            logger.debug(
+                                                f"Form config {i+1} failed with status {search_response.status}"
+                                            )
                                 except Exception as e:
                                     logger.debug(f"Form config {i+1} failed: {e}")
                             else:
-                                logger.warning(
+                                logger.warning(
+                                    "All form configurations failed, using basic page scraping"
+                                )
                                 # Fall back to scraping any existing disclosure links on the page
                                 disclosures = await self._parse_house_results(html, base_url)
                         else:
-                            logger.warning(
+                            logger.warning(
+                                "Could not find required ASPX form fields, using basic page scraping"
+                            )
                             # Fall back to parsing any existing links
                             disclosures = await self._parse_house_results(html, base_url)
                     else:
                         logger.warning(f"Failed to access House disclosure site: {response.status}")
-
+
             # Rate limiting
             await asyncio.sleep(self.config.request_delay)
 
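The reworked House scraper above collects the hidden ASPX form fields from the search page and replays them in every search POST. A minimal standalone sketch of just that extraction step, using only the field names visible in the diff (the final merge with visible search inputs is a hypothetical usage note, not code from the package):

from bs4 import BeautifulSoup

ASPX_HIDDEN_FIELDS = [
    "__VIEWSTATE",
    "__VIEWSTATEGENERATOR",
    "__EVENTVALIDATION",
    "__REQUESTVERIFICATIONTOKEN",
]

def extract_aspx_fields(html: str) -> dict:
    """Return the hidden ASPX inputs that must be echoed back when posting the form."""
    soup = BeautifulSoup(html, "html.parser")
    fields = {}
    for name in ASPX_HIDDEN_FIELDS:
        field = soup.find("input", {"name": name})
        if field and field.get("value"):
            fields[name] = field["value"]
    return fields

# Hypothetical usage: combine the hidden fields with the visible search inputs
# form_data = {**extract_aspx_fields(html), "LastName": "Smith", "FilingYear": "2025"}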
@@ -230,19 +264,21 @@ class CongressTradingScraper(BaseScraper):
 
         try:
             logger.info("Starting Senate disclosures scrape from EFD database")
-
+
             async with aiohttp.ClientSession(
                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={
+                headers={"User-Agent": self.config.user_agent},
             ) as session:
-
+
                 # Search for recent periodic transaction reports (PTRs)
                 search_params = {
-
-
-
+                    "report_type": "11",  # Periodic Transaction Report
+                    "submitted_start_date": (datetime.now() - timedelta(days=90)).strftime(
+                        "%m/%d/%Y"
+                    ),
+                    "submitted_end_date": datetime.now().strftime("%m/%d/%Y"),
                 }
-
+
                 async with session.get(search_url, params=search_params) as response:
                     if response.status == 200:
                         html = await response.text()
@@ -250,7 +286,7 @@ class CongressTradingScraper(BaseScraper):
                         logger.info(f"Found {len(disclosures)} Senate disclosures")
                     else:
                         logger.warning(f"Senate search failed with status {response.status}")
-
+
             # Rate limiting
             await asyncio.sleep(self.config.request_delay)
 
@@ -263,64 +299,89 @@ class CongressTradingScraper(BaseScraper):
     async def _parse_house_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
         """Parse House disclosure search results"""
         disclosures = []
-
+
         try:
-            soup = BeautifulSoup(html,
-
+            soup = BeautifulSoup(html, "html.parser")
+
             # Look for disclosure result rows - try multiple selectors
-            result_rows = (
-
-
-
-
+            result_rows = (
+                soup.find_all("tr", class_="disclosure-row")
+                or soup.select('tr[id*="GridView"]')
+                or soup.select("table tr")
+                or soup.find_all("tr")
+            )
+
             logger.info(f"Found {len(result_rows)} potential result rows")
-
+
             for row in result_rows[:20]:  # Limit to 20 most recent
-                cells = row.find_all(
+                cells = row.find_all("td")
                 if len(cells) >= 3:  # At least 3 columns
                     # Extract text from each cell to identify the structure
-                    cell_texts = [
-
+                    cell_texts = [
+                        cell.get_text(strip=True) for cell in cells if cell.get_text(strip=True)
+                    ]
+
                     if not cell_texts:
                         continue
-
+
                     # Try to identify which cell contains the politician name
                     # Names usually contain letters and may have titles like "Rep.", "Hon."
                     politician_name = ""
-
+
                     for text in cell_texts:
                         # Look for text that looks like a person's name
-                        if (
-
-
-                            not text.
-
-
-
+                        if (
+                            len(text) > 3
+                            and any(c.isalpha() for c in text)
+                            and not text.isdigit()
+                            and not text.startswith("20")  # Not a year
+                            and "pdf" not in text.lower()
+                            and "view" not in text.lower()
+                        ):
+
                             # Clean up potential name
-                            clean_name =
-
+                            clean_name = (
+                                text.replace("Hon.", "")
+                                .replace("Rep.", "")
+                                .replace("Sen.", "")
+                                .strip()
+                            )
+                            if len(clean_name) > 3 and " " in clean_name:  # Likely full name
                                 politician_name = clean_name
                                 break
-
+
                     if not politician_name:
                         politician_name = cell_texts[0]  # Fallback to first cell
-
+
                     # Extract other information
-                    filing_year = next(
-
-
+                    filing_year = next(
+                        (
+                            text
+                            for text in cell_texts
+                            if text.isdigit() and len(text) == 4 and text.startswith("20")
+                        ),
+                        "",
+                    )
+                    filing_type = next(
+                        (
+                            text
+                            for text in cell_texts
+                            if "periodic" in text.lower() or "annual" in text.lower()
+                        ),
+                        "",
+                    )
+
                     # Look for PDF link
-                    pdf_link = row.find(
+                    pdf_link = row.find("a", href=True)
                     if pdf_link:
-                        pdf_url = urljoin(base_url, pdf_link[
-
+                        pdf_url = urljoin(base_url, pdf_link["href"])
+
                         # Create basic disclosure entry
                         # Note: Actual transaction details would require PDF parsing
                         disclosure = TradingDisclosure(
                             politician_id="",  # To be filled by matcher
                             transaction_date=datetime.now() - timedelta(days=30),  # Estimate
-                            disclosure_date=datetime.now() - timedelta(days=15),
+                            disclosure_date=datetime.now() - timedelta(days=15),  # Estimate
                             transaction_type=TransactionType.PURCHASE,  # Default
                             asset_name="Unknown Asset",  # Would need PDF parsing
                             asset_type="stock",
@@ -332,39 +393,39 @@ class CongressTradingScraper(BaseScraper):
                                 "filing_year": filing_year,
                                 "filing_type": filing_type,
                                 "requires_pdf_parsing": True,
-                                "extraction_method": "house_search_results"
-                            }
+                                "extraction_method": "house_search_results",
+                            },
                         )
                         disclosures.append(disclosure)
-
+
         except Exception as e:
             logger.error(f"Error parsing House results: {e}")
-
+
         return disclosures
 
     async def _parse_senate_results(self, html: str, base_url: str) -> List[TradingDisclosure]:
         """Parse Senate EFD search results"""
         disclosures = []
-
+
         try:
-            soup = BeautifulSoup(html,
-
+            soup = BeautifulSoup(html, "html.parser")
+
             # Look for search result rows
-            result_rows = soup.find_all(
-
+            result_rows = soup.find_all("tr", class_="searchresult") or soup.select("tbody tr")
+
             for row in result_rows[:20]:  # Limit to 20 most recent
-                cells = row.find_all(
+                cells = row.find_all("td")
                 if len(cells) >= 4:
                     # Extract information
                     name = cells[0].get_text(strip=True) if cells[0] else ""
                     report_type = cells[1].get_text(strip=True) if cells[1] else ""
                     filing_date = cells[2].get_text(strip=True) if cells[2] else ""
-
+
                     # Look for report link
-                    report_link = row.find(
-                    if report_link and
-                        report_url = urljoin(base_url, report_link[
-
+                    report_link = row.find("a", href=True)
+                    if report_link and "ptr" in report_type.lower():  # Periodic Transaction Report
+                        report_url = urljoin(base_url, report_link["href"])
+
                         # Create disclosure entry
                         # Note: Actual transaction details would require report parsing
                         disclosure = TradingDisclosure(
@@ -381,34 +442,29 @@ class CongressTradingScraper(BaseScraper):
                                 "politician_name": name,
                                 "report_type": report_type,
                                 "filing_date": filing_date,
-                                "requires_report_parsing": True
-                            }
+                                "requires_report_parsing": True,
+                            },
                         )
                         disclosures.append(disclosure)
-
+
         except Exception as e:
             logger.error(f"Error parsing Senate results: {e}")
-
+
         return disclosures
 
     def _parse_date(self, date_str: str) -> Optional[datetime]:
         """Parse various date formats from disclosure sites"""
         if not date_str:
             return None
-
-        date_formats = [
-
-            '%Y-%m-%d',
-            '%m-%d-%Y',
-            '%B %d, %Y'
-        ]
-
+
+        date_formats = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]
+
         for fmt in date_formats:
             try:
                 return datetime.strptime(date_str.strip(), fmt)
             except ValueError:
                 continue
-
+
         logger.warning(f"Could not parse date: {date_str}")
         return None
 
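The consolidated date_formats list in _parse_date tries each format in turn and falls back to None; a self-contained sketch of the same approach, with the format list copied from the diff:

from datetime import datetime
from typing import Optional

DATE_FORMATS = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%B %d, %Y"]

def parse_date(date_str: str) -> Optional[datetime]:
    """Try each known disclosure-site date format; return None if nothing matches."""
    if not date_str:
        return None
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except ValueError:
            continue
    return None

# parse_date("03/15/2025") and parse_date("March 15, 2025") both yield datetime(2025, 3, 15, 0, 0)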
@@ -440,17 +496,19 @@ class QuiverQuantScraper(BaseScraper):
                 if len(cells) >= 4:
                     # Extract cell contents and try to identify the correct fields
                     cell_texts = [cell.get_text(strip=True) for cell in cells]
-
+
                     # Try to identify which cell contains what data
                     politician_name = cell_texts[0] if len(cell_texts) > 0 else ""
-
+
                     # Look for date-like patterns (YYYY-MM-DD, MM/DD/YYYY, etc.)
                     transaction_date = ""
                     ticker = ""
                     transaction_type = ""
                     amount = ""
-
-                    for i, text in enumerate(
+
+                    for i, text in enumerate(
+                        cell_texts[1:], 1
+                    ):  # Skip first cell (politician name)
                         # Check if this looks like a date
                         if self._looks_like_date(text):
                             transaction_date = text
@@ -458,19 +516,21 @@ class QuiverQuantScraper(BaseScraper):
                         elif text.isupper() and len(text) <= 5 and text.isalpha():
                             ticker = text
                         # Check if this contains transaction type keywords
-                        elif any(
+                        elif any(
+                            word in text.lower() for word in ["purchase", "sale", "buy", "sell"]
+                        ):
                             # Split transaction type and amount if combined
-                            if
+                            if "$" in text:
                                 # Split on $ to separate transaction type from amount
-                                parts = text.split(
+                                parts = text.split("$", 1)
                                 transaction_type = parts[0].strip()
-                                amount =
+                                amount = "$" + parts[1] if len(parts) > 1 else ""
                             else:
                                 transaction_type = text
                         # Check if this looks like an amount (contains $ or numbers with ,)
-                        elif
+                        elif "$" in text or ("," in text and any(c.isdigit() for c in text)):
                             amount = text
-
+
                     # Only create trade data if we have essential fields
                     if politician_name and (transaction_date or ticker):
                         trade_data = {
@@ -492,16 +552,17 @@ class QuiverQuantScraper(BaseScraper):
         """Check if a string looks like a date"""
         if not text or len(text) < 8:
             return False
-
+
         # Common date patterns
         date_patterns = [
-            r
-            r
-            r
-            r
+            r"\d{4}-\d{1,2}-\d{1,2}",  # YYYY-MM-DD
+            r"\d{1,2}/\d{1,2}/\d{4}",  # MM/DD/YYYY
+            r"\d{1,2}-\d{1,2}-\d{4}",  # MM-DD-YYYY
+            r"\w{3}\s+\d{1,2},?\s+\d{4}",  # Month DD, YYYY
         ]
-
+
         import re
+
         for pattern in date_patterns:
             if re.search(pattern, text):
                 return True
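The regex patterns added to _looks_like_date can be exercised on their own; a short sketch with the pattern list copied from the diff:

import re

DATE_PATTERNS = [
    r"\d{4}-\d{1,2}-\d{1,2}",  # YYYY-MM-DD
    r"\d{1,2}/\d{1,2}/\d{4}",  # MM/DD/YYYY
    r"\d{1,2}-\d{1,2}-\d{4}",  # MM-DD-YYYY
    r"\w{3}\s+\d{1,2},?\s+\d{4}",  # Month DD, YYYY
]

def looks_like_date(text: str) -> bool:
    """Heuristic used to decide whether a table cell should be treated as a date."""
    if not text or len(text) < 8:
        return False
    return any(re.search(pattern, text) for pattern in DATE_PATTERNS)

# looks_like_date("12/31/2024") -> True; looks_like_date("AAPL") -> False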
@@ -543,7 +604,9 @@ class QuiverQuantScraper(BaseScraper):
                 # Try MM-DD-YYYY
                 transaction_date = datetime.strptime(date_str, "%m-%d-%Y")
             except ValueError:
-                logger.warning(
+                logger.warning(
+                    f"Could not parse date '{date_str}', using estimated date"
+                )
                 transaction_date = datetime.now() - timedelta(days=30)
 
         # Parse amount
@@ -583,39 +646,41 @@ class EUParliamentScraper(BaseScraper):
         try:
             logger.info("Starting EU Parliament MEP declarations scrape")
             base_url = "https://www.europarl.europa.eu"
-
+
             async with aiohttp.ClientSession(
                 timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-                headers={
+                headers={"User-Agent": self.config.user_agent},
             ) as session:
-
+
                 # Get list of current MEPs
                 mep_list_url = f"{base_url}/meps/en/full-list/all"
-
+
                 async with session.get(mep_list_url) as response:
                     if response.status == 200:
                         html = await response.text()
                         mep_data = await self._extract_mep_urls(html, base_url)
                         logger.info(f"Found {len(mep_data)} MEP profiles to check")
-
+
                         # Check declarations for a subset of MEPs (to avoid overwhelming the server)
                         for i, mep_info in enumerate(mep_data[:50]):  # Limit to 50 MEPs
                             try:
-                                mep_disclosures = await self._scrape_mep_profile(
+                                mep_disclosures = await self._scrape_mep_profile(
+                                    session, mep_info["url"], mep_info
+                                )
                                 disclosures.extend(mep_disclosures)
-
+
                                 # Rate limiting - EU Parliament is more sensitive
                                 await asyncio.sleep(self.config.request_delay * 2)
-
+
                                 if i > 0 and i % 10 == 0:
                                     logger.info(f"Processed {i} MEP profiles")
-
+
                             except Exception as e:
                                 logger.warning(f"Failed to process MEP profile {mep_url}: {e}")
                                 continue
                     else:
                         logger.warning(f"Failed to access MEP list: {response.status}")
-
+
             logger.info(f"Collected {len(disclosures)} EU Parliament disclosures")
 
         except Exception as e:
@@ -626,124 +691,155 @@ class EUParliamentScraper(BaseScraper):
     async def _extract_mep_urls(self, html: str, base_url: str) -> List[Dict[str, str]]:
         """Extract MEP profile URLs and names from the MEP list page"""
         mep_data = []
-
+
         try:
-            soup = BeautifulSoup(html,
-
+            soup = BeautifulSoup(html, "html.parser")
+
             # Look for MEP profile links - they usually contain both name and link
-            mep_links = soup.find_all(
-
+            mep_links = soup.find_all("a", href=True)
+
             seen_urls = set()
-
+
             for link in mep_links:
-                href = link.get(
-                if
+                href = link.get("href", "")
+                if "/meps/en/" in href and "/home" in href:
                     full_url = urljoin(base_url, href)
-
+
                     if full_url not in seen_urls:
                         # Extract MEP name from link text or nearby elements
                         mep_name = ""
-
+
                         # Try to get name from link text
                         link_text = link.get_text(strip=True)
-                        if
+                        if (
+                            link_text
+                            and len(link_text) > 3
+                            and not link_text.lower().startswith("http")
+                        ):
                             mep_name = link_text
-
+
                         # If no name in link, look in parent elements
                         if not mep_name:
                             parent = link.parent
                             if parent:
                                 # Look for text that looks like a name
                                 for text_node in parent.stripped_strings:
-                                    if (
-
-
-                                        not text_node.
+                                    if (
+                                        len(text_node) > 3
+                                        and " " in text_node
+                                        and not text_node.startswith("http")
+                                        and not text_node.isdigit()
+                                    ):
                                         mep_name = text_node
                                         break
-
+
                         # Extract country/party info if available
                         country = ""
                         party = ""
-
+
                         # Look for country and party info near the link
-                        container = link.find_parent([
+                        container = link.find_parent(["div", "article", "section"])
                         if container:
                             text_elements = list(container.stripped_strings)
                             for i, text in enumerate(text_elements):
                                 if text == mep_name and i < len(text_elements) - 2:
                                     # Country and party usually come after name
-                                    country =
-
-
+                                    country = (
+                                        text_elements[i + 1] if i + 1 < len(text_elements) else ""
+                                    )
+                                    party = (
+                                        text_elements[i + 2] if i + 2 < len(text_elements) else ""
+                                    )
+
                         if mep_name:  # Only add if we found a name
-                            mep_data.append(
-
-
-
-
-
+                            mep_data.append(
+                                {
+                                    "url": full_url,
+                                    "name": mep_name,
+                                    "country": country,
+                                    "party": party,
+                                }
+                            )
                             seen_urls.add(full_url)
-
+
                 # Limit to prevent overwhelming the servers
                 if len(mep_data) >= 50:
                     break
-
+
         except Exception as e:
             logger.error(f"Error extracting MEP data: {e}")
-
+
         return mep_data
 
-    async def _scrape_mep_profile(
+    async def _scrape_mep_profile(
+        self, session: aiohttp.ClientSession, mep_url: str, mep_info: Dict[str, str] = None
+    ) -> List[TradingDisclosure]:
         """Scrape financial interests from an individual MEP profile"""
         disclosures = []
-
+
         try:
             async with session.get(mep_url) as response:
                 if response.status == 200:
                     html = await response.text()
-                    soup = BeautifulSoup(html,
-
+                    soup = BeautifulSoup(html, "html.parser")
+
                     # Use extracted MEP name from list, or try to extract from profile
-                    if mep_info and mep_info.get(
-                        mep_name = mep_info[
-                        mep_country = mep_info.get(
-                        mep_party = mep_info.get(
+                    if mep_info and mep_info.get("name"):
+                        mep_name = mep_info["name"]
+                        mep_country = mep_info.get("country", "")
+                        mep_party = mep_info.get("party", "")
                     else:
                         # Fallback: extract from profile page
-                        name_element = soup.find(
-                        mep_name =
+                        name_element = soup.find("h1", class_="ep-header-title")
+                        mep_name = (
+                            name_element.get_text(strip=True) if name_element else "Unknown MEP"
+                        )
                         mep_country = ""
                         mep_party = ""
-
+
                     # Look for financial interests section
                     # EU Parliament declarations are typically in a specific section
-                    interests_section =
-
-
-
+                    interests_section = (
+                        soup.find("div", id="financial-interests")
+                        or soup.find("section", class_="ep-a-section")
+                        or soup.find("div", class_="ep-m-content-block")
+                    )
+
                     if interests_section:
                         # Parse financial interests
                         # Note: EU declarations focus more on activities and interests than specific trades
-                        interest_items = interests_section.find_all(
-
+                        interest_items = interests_section.find_all(
+                            ["p", "li", "div"], recursive=True
+                        )
+
                         for item in interest_items:
                             item_text = item.get_text(strip=True).lower()
-
+
                             # Look for financial keywords
-                            if any(
-
-
-
+                            if any(
+                                keyword in item_text
+                                for keyword in [
+                                    "shareholding",
+                                    "investment",
+                                    "director",
+                                    "board",
+                                    "financial interest",
+                                    "remuneration",
+                                    "consulting",
+                                ]
+                            ):
                                 # Create disclosure for detected financial interest
                                 disclosure = TradingDisclosure(
                                     politician_id="",  # To be filled by matcher
-                                    transaction_date=datetime.now()
-
+                                    transaction_date=datetime.now()
+                                    - timedelta(days=90),  # Estimate
+                                    disclosure_date=datetime.now() - timedelta(days=60),  # Estimate
                                     transaction_type=TransactionType.PURCHASE,  # Default for interests
                                     asset_name=self._extract_company_name(item_text),
                                     asset_type="interest",
-                                    amount_range_min=Decimal(
+                                    amount_range_min=Decimal(
+                                        "0"
+                                    ),  # EU doesn't always specify amounts
                                     amount_range_max=Decimal("0"),
                                     source_url=mep_url,
                                     raw_data={
@@ -751,28 +847,30 @@ class EUParliamentScraper(BaseScraper):
                                         "country": mep_country,
                                         "party": mep_party,
                                         "interest_type": "financial_activity",
-                                        "interest_description": item.get_text(strip=True)[
+                                        "interest_description": item.get_text(strip=True)[
+                                            :500
+                                        ],  # Truncate
                                         "region": "eu",
                                         "extraction_method": "mep_profile_scraping",
-                                        "requires_manual_review": True
-                                    }
+                                        "requires_manual_review": True,
+                                    },
                                 )
                                 disclosures.append(disclosure)
-
+
         except Exception as e:
             logger.warning(f"Error scraping MEP profile {mep_url}: {e}")
-
+
         return disclosures
 
     def _extract_company_name(self, text: str) -> str:
         """Extract company/organization name from interest description"""
         # Simple heuristic to extract potential company names
         words = text.split()
-
+
         # Look for capitalized sequences that might be company names
         potential_names = []
         current_name = []
-
+
         for word in words:
             if word[0].isupper() and len(word) > 2:
                 current_name.append(word)
@@ -780,10 +878,10 @@ class EUParliamentScraper(BaseScraper):
                 if current_name and len(current_name) <= 4:  # Reasonable company name length
                     potential_names.append(" ".join(current_name))
                     current_name = []
-
+
         if current_name and len(current_name) <= 4:
             potential_names.append(" ".join(current_name))
-
+
         # Return the first reasonable candidate or default
         return potential_names[0] if potential_names else "Financial Interest"
 
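Taken together, the two hunks above reconstruct almost all of _extract_company_name. A sketch of the full heuristic, filling in the one elided else: branch in the obvious way, so treat it as an approximation rather than the exact shipped code:

def extract_company_name(text: str) -> str:
    """Collect runs of capitalized words as candidate company names; fall back to a generic label."""
    potential_names = []
    current_name = []
    for word in text.split():
        if word[0].isupper() and len(word) > 2:
            current_name.append(word)
        else:  # assumed branch: flush the current run when a non-capitalized word appears
            if current_name and len(current_name) <= 4:  # Reasonable company name length
                potential_names.append(" ".join(current_name))
            current_name = []
    if current_name and len(current_name) <= 4:
        potential_names.append(" ".join(current_name))
    return potential_names[0] if potential_names else "Financial Interest"

# extract_company_name("member of the board of Acme Holdings since 2021") -> "Acme Holdings"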
@@ -858,6 +956,7 @@ class PoliticianMatcher:
 # Import specialized scrapers after base classes are defined
 try:
     from .scrapers_uk import UKParliamentScraper, run_uk_parliament_collection
+
     UK_SCRAPER_AVAILABLE = True
 except Exception as e:
     logger.debug(f"UK scraper import failed: {e}")
@@ -867,6 +966,7 @@ except Exception as e:
 
 try:
     from .scrapers_california import CaliforniaNetFileScraper, run_california_collection
+
     CALIFORNIA_SCRAPER_AVAILABLE = True
 except Exception as e:
     logger.debug(f"California scraper import failed: {e}")
@@ -876,6 +976,7 @@ except Exception as e:
 
 try:
     from .scrapers_eu import EUMemberStatesScraper, run_eu_member_states_collection
+
     EU_MEMBER_STATES_SCRAPER_AVAILABLE = True
 except Exception as e:
     logger.debug(f"EU member states scraper import failed: {e}")
@@ -885,6 +986,7 @@ except Exception as e:
 
 try:
     from .scrapers_us_states import USStatesScraper, run_us_states_collection
+
     US_STATES_SCRAPER_AVAILABLE = True
 except Exception as e:
     logger.debug(f"US states scraper import failed: {e}")
@@ -899,7 +1001,7 @@ async def run_uk_parliament_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
     if not UK_SCRAPER_AVAILABLE:
         logger.warning("UK Parliament scraper not available")
         return []
-
+
     logger.info("Starting UK Parliament financial interests collection")
     try:
         disclosures = await run_uk_parliament_collection(config)
@@ -915,7 +1017,7 @@ async def run_california_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
     if not CALIFORNIA_SCRAPER_AVAILABLE:
         logger.warning("California scraper not available")
         return []
-
+
     logger.info("Starting California financial disclosures collection")
     try:
         disclosures = await run_california_collection(config)
@@ -931,7 +1033,7 @@ async def run_eu_member_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
     if not EU_MEMBER_STATES_SCRAPER_AVAILABLE:
         logger.warning("EU member states scraper not available")
         return []
-
+
     logger.info("Starting EU member states financial disclosures collection")
     try:
         disclosures = await run_eu_member_states_collection(config)
@@ -947,7 +1049,7 @@ async def run_us_states_workflow(config: ScrapingConfig) -> List[TradingDisclosure]:
     if not US_STATES_SCRAPER_AVAILABLE:
         logger.warning("US states scraper not available")
         return []
-
+
     logger.info("Starting US states financial disclosures collection")
     try:
         disclosures = await run_us_states_collection(config)
@@ -960,7 +1062,13 @@
 
 # Export the new workflow function
 __all__ = [
-
-
-
+    "BaseScraper",
+    "CongressTradingScraper",
+    "QuiverQuantScraper",
+    "EUParliamentScraper",
+    "PoliticianMatcher",
+    "run_uk_parliament_workflow",
+    "run_california_workflow",
+    "run_eu_member_states_workflow",
+    "run_us_states_workflow",
 ]
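With __all__ now enumerating the scrapers and workflow helpers, downstream code can import and drive them explicitly. A hedged usage sketch (module paths taken from the file list above; ScrapingConfig is assumed to construct with usable defaults, which this diff does not show):

import asyncio

from mcli.workflow.politician_trading.config import ScrapingConfig
from mcli.workflow.politician_trading.scrapers import run_uk_parliament_workflow

async def main() -> None:
    config = ScrapingConfig()  # assumption: defaults are sufficient for a dry run
    disclosures = await run_uk_parliament_workflow(config)
    print(f"Collected {len(disclosures)} UK disclosures")

asyncio.run(main())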