mcli-framework 7.1.1__py3-none-any.whl → 7.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mcli-framework might be problematic; see the registry listing for details.
- mcli/app/completion_cmd.py +59 -49
- mcli/app/completion_helpers.py +60 -138
- mcli/app/logs_cmd.py +6 -2
- mcli/app/main.py +17 -14
- mcli/app/model_cmd.py +19 -4
- mcli/chat/chat.py +3 -2
- mcli/lib/search/cached_vectorizer.py +1 -0
- mcli/lib/services/data_pipeline.py +12 -5
- mcli/lib/services/lsh_client.py +68 -57
- mcli/ml/api/app.py +28 -36
- mcli/ml/api/middleware.py +8 -16
- mcli/ml/api/routers/admin_router.py +3 -1
- mcli/ml/api/routers/auth_router.py +32 -56
- mcli/ml/api/routers/backtest_router.py +3 -1
- mcli/ml/api/routers/data_router.py +3 -1
- mcli/ml/api/routers/model_router.py +35 -74
- mcli/ml/api/routers/monitoring_router.py +3 -1
- mcli/ml/api/routers/portfolio_router.py +3 -1
- mcli/ml/api/routers/prediction_router.py +60 -65
- mcli/ml/api/routers/trade_router.py +6 -2
- mcli/ml/api/routers/websocket_router.py +12 -9
- mcli/ml/api/schemas.py +10 -2
- mcli/ml/auth/auth_manager.py +49 -114
- mcli/ml/auth/models.py +30 -15
- mcli/ml/auth/permissions.py +12 -19
- mcli/ml/backtesting/backtest_engine.py +134 -108
- mcli/ml/backtesting/performance_metrics.py +142 -108
- mcli/ml/cache.py +12 -18
- mcli/ml/cli/main.py +37 -23
- mcli/ml/config/settings.py +29 -12
- mcli/ml/dashboard/app.py +122 -130
- mcli/ml/dashboard/app_integrated.py +955 -154
- mcli/ml/dashboard/app_supabase.py +176 -108
- mcli/ml/dashboard/app_training.py +212 -206
- mcli/ml/dashboard/cli.py +14 -5
- mcli/ml/data_ingestion/api_connectors.py +51 -81
- mcli/ml/data_ingestion/data_pipeline.py +127 -125
- mcli/ml/data_ingestion/stream_processor.py +72 -80
- mcli/ml/database/migrations/env.py +3 -2
- mcli/ml/database/models.py +112 -79
- mcli/ml/database/session.py +6 -5
- mcli/ml/experimentation/ab_testing.py +149 -99
- mcli/ml/features/ensemble_features.py +9 -8
- mcli/ml/features/political_features.py +6 -5
- mcli/ml/features/recommendation_engine.py +15 -14
- mcli/ml/features/stock_features.py +7 -6
- mcli/ml/features/test_feature_engineering.py +8 -7
- mcli/ml/logging.py +10 -15
- mcli/ml/mlops/data_versioning.py +57 -64
- mcli/ml/mlops/experiment_tracker.py +49 -41
- mcli/ml/mlops/model_serving.py +59 -62
- mcli/ml/mlops/pipeline_orchestrator.py +203 -149
- mcli/ml/models/base_models.py +8 -7
- mcli/ml/models/ensemble_models.py +6 -5
- mcli/ml/models/recommendation_models.py +7 -6
- mcli/ml/models/test_models.py +18 -14
- mcli/ml/monitoring/drift_detection.py +95 -74
- mcli/ml/monitoring/metrics.py +10 -22
- mcli/ml/optimization/portfolio_optimizer.py +172 -132
- mcli/ml/predictions/prediction_engine.py +62 -50
- mcli/ml/preprocessing/data_cleaners.py +6 -5
- mcli/ml/preprocessing/feature_extractors.py +7 -6
- mcli/ml/preprocessing/ml_pipeline.py +3 -2
- mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
- mcli/ml/preprocessing/test_preprocessing.py +4 -4
- mcli/ml/scripts/populate_sample_data.py +36 -16
- mcli/ml/tasks.py +82 -83
- mcli/ml/tests/test_integration.py +86 -76
- mcli/ml/tests/test_training_dashboard.py +169 -142
- mcli/mygroup/test_cmd.py +2 -1
- mcli/self/self_cmd.py +31 -16
- mcli/self/test_cmd.py +2 -1
- mcli/workflow/dashboard/dashboard_cmd.py +13 -6
- mcli/workflow/lsh_integration.py +46 -58
- mcli/workflow/politician_trading/commands.py +576 -427
- mcli/workflow/politician_trading/config.py +7 -7
- mcli/workflow/politician_trading/connectivity.py +35 -33
- mcli/workflow/politician_trading/data_sources.py +72 -71
- mcli/workflow/politician_trading/database.py +18 -16
- mcli/workflow/politician_trading/demo.py +4 -3
- mcli/workflow/politician_trading/models.py +5 -5
- mcli/workflow/politician_trading/monitoring.py +13 -13
- mcli/workflow/politician_trading/scrapers.py +332 -224
- mcli/workflow/politician_trading/scrapers_california.py +116 -94
- mcli/workflow/politician_trading/scrapers_eu.py +70 -71
- mcli/workflow/politician_trading/scrapers_uk.py +118 -90
- mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
- mcli/workflow/politician_trading/workflow.py +98 -71
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/METADATA +1 -1
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/RECORD +94 -94
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.3.dist-info}/top_level.txt +0 -0
mcli/workflow/politician_trading/scrapers_california.py

```diff
@@ -7,21 +7,22 @@ including NetFile public portals and Cal-Access data.
 
 import asyncio
 import logging
-from datetime import datetime, timedelta
-from typing import List, Dict, Any, Optional
-import aiohttp
 import re
+from datetime import datetime, timedelta
 from decimal import Decimal
+from typing import Any, Dict, List, Optional
+
+import aiohttp
 
+from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
 from .scrapers import BaseScraper
-from .models import TradingDisclosure, Politician, PoliticianRole, TransactionType
 
 logger = logging.getLogger(__name__)
 
 
 class CaliforniaNetFileScraper(BaseScraper):
     """Scraper for California NetFile public disclosure portals"""
-
+
     def __init__(self, config, test_mode=True):
         super().__init__(config)
         self.test_mode = test_mode  # Skip network calls for testing
```
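The reordered imports above land in the standard three groups (standard library, third-party, first-party), each alphabetized, and the rest of the release is dominated by the same kind of mechanical change: trailing commas added, long calls split or joined, whitespace-only blank lines cleaned. That is consistent with an isort + Black formatting pass; the tools are an inference from the diff's shape, not something the release confirms. A minimal sketch of reproducing such a pass programmatically, assuming both packages are installed:

```python
# Inferred, not confirmed: reproduce an isort + Black pass on a snippet
# shaped like the old code. Requires `pip install black isort`.
import black
import isort

OLD_STYLE = '''\
from typing import List, Dict, Any, Optional
import aiohttp
import re

jurisdiction_map = {"VCO": "Ventura County",
    "LAC": "Los Angeles County"
}
'''

sorted_src = isort.code(OLD_STYLE, profile="black")  # regroup and alphabetize imports
formatted = black.format_str(sorted_src, mode=black.Mode())  # commas, line width, blank lines
print(formatted)
```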
```diff
@@ -33,33 +34,33 @@ class CaliforniaNetFileScraper(BaseScraper):
             "https://public.netfile.com/pub2/?AID=LAC",  # Los Angeles County
         ]
         self.session: Optional[aiohttp.ClientSession] = None
-
+
     async def __aenter__(self):
         """Async context manager entry"""
         self.session = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-            headers={
+            headers={"User-Agent": self.config.user_agent},
         )
         return self
-
+
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Async context manager exit"""
         if self.session:
             await self.session.close()
-
+
     async def scrape_california_disclosures(self) -> List[TradingDisclosure]:
         """Scrape California financial disclosures from NetFile portals"""
         logger.info("Starting California NetFile disclosures collection")
-
+
         if not self.session:
             raise RuntimeError("Session not initialized. Use async context manager.")
-
+
         all_disclosures = []
-
+
         # California state-level disclosures
         state_disclosures = await self._scrape_cal_access_data()
         all_disclosures.extend(state_disclosures)
-
+
         # County-level NetFile portals
         for portal_url in self.public_portals:
             try:
@@ -68,33 +69,38 @@ class CaliforniaNetFileScraper(BaseScraper):
                 await asyncio.sleep(self.config.request_delay)
             except Exception as e:
                 logger.error(f"Failed to scrape NetFile portal {portal_url}: {e}")
-
+
         logger.info(f"Collected {len(all_disclosures)} California disclosures")
         return all_disclosures
-
+
     async def _scrape_cal_access_data(self) -> List[TradingDisclosure]:
         """Scrape California Secretary of State Cal-Access data"""
         disclosures = []
-
+
         try:
             logger.debug("Scraping Cal-Access state-level data")
-
+
             # Cal-Access API endpoints (simplified - actual implementation would need
             # to handle their specific data format and authentication)
             cal_access_url = "https://www.sos.ca.gov/campaign-lobbying/cal-access-resources"
-
+
             # This is a placeholder for actual Cal-Access API implementation
             # The real implementation would:
             # 1. Access Cal-Access database exports
             # 2. Parse the fixed-width format files
             # 3. Extract candidate and committee financial data
-
+
             # Sample disclosures with real California politician names for demonstration
             ca_politicians = [
-                "Gavin Newsom",
-                "
+                "Gavin Newsom",
+                "Rob Bonta",
+                "Tony Thurmond",
+                "Fiona Ma",
+                "Betty Yee",
+                "Ricardo Lara",
+                "Shirley Weber",
             ]
-
+
             for politician in ca_politicians[:3]:  # Create a few sample disclosures
                 sample_disclosure = TradingDisclosure(
                     politician_id="",  # Will be filled during politician matching
@@ -110,47 +116,51 @@ class CaliforniaNetFileScraper(BaseScraper):
                         "source": "cal_access",
                         "jurisdiction": "california_state",
                         "politician_name": politician,
-                        "sample": False
-                    }
+                        "sample": False,
+                    },
                 )
                 disclosures.append(sample_disclosure)
-
+
         except Exception as e:
             logger.error(f"Failed to scrape Cal-Access data: {e}")
-
+
         return disclosures
-
+
     async def _scrape_netfile_portal(self, portal_url: str) -> List[TradingDisclosure]:
         """Scrape a specific NetFile public portal"""
         disclosures = []
-
+
         try:
             # Extract jurisdiction from URL
             jurisdiction = self._extract_jurisdiction(portal_url)
             logger.debug(f"Scraping NetFile portal for {jurisdiction}")
-
+
             # NetFile servers are often overloaded, use special handling
             # Skip network calls in test mode due to server unreliability
             if not self.test_mode:
                 try:
                     html = await self._fetch_netfile_with_backoff(portal_url)
                     if not html:
-                        logger.warning(
+                        logger.warning(
+                            f"Could not access NetFile portal for {jurisdiction} - servers may be overloaded, using sample data"
+                        )
                 except Exception as e:
-                    logger.warning(
+                    logger.warning(
+                        f"NetFile portal {jurisdiction} unavailable: {e}, using sample data"
+                    )
             else:
                 logger.info(f"Test mode enabled - using sample data for {jurisdiction}")
-
+
             # NetFile portals typically have search forms and results tables
             # This is a simplified implementation - real scraper would:
             # 1. Navigate search forms for candidate/committee data
             # 2. Parse results tables with transaction data
             # 3. Handle pagination for large result sets
             # 4. Extract specific financial disclosure information
-
+
             # Create sample data with local politician names for this jurisdiction
             local_politicians = self._get_sample_local_politicians(jurisdiction)
-
+
             for politician_name in local_politicians[:2]:  # Create 2 disclosures per portal
                 sample_disclosure = TradingDisclosure(
                     politician_id="",
@@ -167,34 +177,34 @@ class CaliforniaNetFileScraper(BaseScraper):
                         "jurisdiction": jurisdiction,
                         "portal_url": portal_url,
                         "politician_name": politician_name,
-                        "sample": False
-                    }
+                        "sample": False,
+                    },
                 )
                 disclosures.append(sample_disclosure)
-
+
         except Exception as e:
             logger.error(f"Failed to scrape NetFile portal {portal_url}: {e}")
-
+
         return disclosures
-
+
     def _extract_jurisdiction(self, portal_url: str) -> str:
         """Extract jurisdiction name from NetFile portal URL"""
         jurisdiction_map = {
             "VCO": "Ventura County",
             "SFO": "San Francisco",
-            "SCC": "Santa Clara County",
+            "SCC": "Santa Clara County",
             "SAC": "Sacramento County",
-            "LAC": "Los Angeles County"
+            "LAC": "Los Angeles County",
         }
-
+
         # Extract AID parameter from URL
-        aid_match = re.search(r
+        aid_match = re.search(r"AID=([A-Z]+)", portal_url)
         if aid_match:
             aid = aid_match.group(1)
             return jurisdiction_map.get(aid, f"California {aid}")
-
+
         return "California Unknown"
-
+
     def _get_sample_local_politicians(self, jurisdiction: str) -> List[str]:
         """Get sample local politician names for a jurisdiction"""
         politician_map = {
@@ -202,25 +212,24 @@ class CaliforniaNetFileScraper(BaseScraper):
             "San Francisco": ["London Breed", "Aaron Peskin", "Matt Dorsey", "Connie Chan"],
             "Santa Clara County": ["Cindy Chavez", "Susan Ellenberg", "Joe Simitian"],
             "Sacramento County": ["Phil Serna", "Rich Desmond", "Don Nottoli"],
-            "Los Angeles County": ["Hilda Solis", "Sheila Kuehl", "Janice Hahn", "Holly Mitchell"]
+            "Los Angeles County": ["Hilda Solis", "Sheila Kuehl", "Janice Hahn", "Holly Mitchell"],
         }
-
+
         return politician_map.get(jurisdiction, ["California Local Politician"])
-
+
     async def _fetch_netfile_with_backoff(self, url: str) -> Optional[str]:
         """Fetch NetFile page with progressive backoff for server overload"""
         if not self.session:
             return None
-
+
         # NetFile servers are notoriously slow and overloaded, use shorter delays for testing
         delays = [1, 2]  # Quick attempts only for testing
-
+
         for attempt, delay in enumerate(delays):
             try:
                 # Use shorter timeout for testing
                 async with self.session.get(
-                    url,
-                    timeout=aiohttp.ClientTimeout(total=5)  # 5 second timeout for testing
+                    url, timeout=aiohttp.ClientTimeout(total=5)  # 5 second timeout for testing
                 ) as response:
                     if response.status == 200:
                         return await response.text()
@@ -232,20 +241,24 @@ class CaliforniaNetFileScraper(BaseScraper):
                             await asyncio.sleep(delay)
                     else:
                         logger.warning(f"NetFile returned HTTP {response.status} for {url}")
-
+
             except asyncio.TimeoutError:
-                logger.info(
+                logger.info(
+                    f"NetFile timeout (attempt {attempt + 1}/{len(delays)}), waiting {delay} seconds"
+                )
                 if attempt < len(delays) - 1:
                     await asyncio.sleep(delay)
             except Exception as e:
                 logger.warning(f"NetFile error (attempt {attempt + 1}/{len(delays)}): {e}")
                 if attempt < len(delays) - 1:
                     await asyncio.sleep(delay)
-
+
         logger.error(f"NetFile portal {url} unavailable after {len(delays)} attempts")
         return None
-
-    def _parse_netfile_transaction(
+
+    def _parse_netfile_transaction(
+        self, transaction_data: Dict[str, Any]
+    ) -> Optional[TradingDisclosure]:
         """Parse NetFile transaction data into TradingDisclosure format"""
         try:
             # Parse transaction type
```
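The backoff hunks above are pure reflow: the request arguments collapse onto one line and the retry log calls split across three, but the behavior (a fixed ladder of short delays, a 5-second per-request timeout, retry on timeout, error, or non-200 status) is unchanged. A self-contained sketch of the same pattern, as a simplified stand-in for `_fetch_netfile_with_backoff` rather than the package's code verbatim:

```python
import asyncio
import logging
from typing import Optional

import aiohttp

logger = logging.getLogger(__name__)


async def fetch_with_backoff(session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch a page, retrying briefly when the server looks overloaded (stand-in)."""
    delays = [1, 2]  # quick attempts only, matching the diff
    for attempt, delay in enumerate(delays):
        try:
            async with session.get(
                url, timeout=aiohttp.ClientTimeout(total=5)  # short per-request timeout
            ) as response:
                if response.status == 200:
                    return await response.text()
                logger.warning(f"HTTP {response.status} for {url}")
        except asyncio.TimeoutError:
            logger.info(f"timeout (attempt {attempt + 1}/{len(delays)})")
        except aiohttp.ClientError as e:
            logger.warning(f"error (attempt {attempt + 1}/{len(delays)}): {e}")
        if attempt < len(delays) - 1:
            await asyncio.sleep(delay)  # back off before the next attempt
    logger.error(f"{url} unavailable after {len(delays)} attempts")
    return None
```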
```diff
@@ -253,12 +266,12 @@ class CaliforniaNetFileScraper(BaseScraper):
                 "contribution": TransactionType.PURCHASE,
                 "expenditure": TransactionType.SALE,
                 "investment": TransactionType.PURCHASE,
-                "loan": TransactionType.PURCHASE
+                "loan": TransactionType.PURCHASE,
             }
-
+
             raw_type = transaction_data.get("transaction_type", "").lower()
             transaction_type = transaction_type_map.get(raw_type, TransactionType.PURCHASE)
-
+
             # Parse date
             date_str = transaction_data.get("transaction_date", "")
             try:
@@ -268,11 +281,11 @@ class CaliforniaNetFileScraper(BaseScraper):
                 transaction_date = datetime.strptime(date_str, "%m/%d/%Y")
             except ValueError:
                 transaction_date = datetime.now()
-
+
             # Parse amount
             amount_str = transaction_data.get("amount", "")
             amount_min, amount_max, amount_exact = self._parse_california_amount(amount_str)
-
+
             disclosure = TradingDisclosure(
                 politician_id="",  # Will be filled after politician matching
                 transaction_date=transaction_date,
@@ -285,23 +298,25 @@ class CaliforniaNetFileScraper(BaseScraper):
                 amount_range_max=amount_max,
                 amount_exact=amount_exact,
                 source_url=transaction_data.get("source_url", ""),
-                raw_data=transaction_data
+                raw_data=transaction_data,
             )
-
+
             return disclosure
-
+
         except Exception as e:
             logger.error(f"Failed to parse NetFile transaction: {e}")
             return None
-
-    def _parse_california_amount(
+
+    def _parse_california_amount(
+        self, amount_text: str
+    ) -> tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
         """Parse California-specific amount formats"""
         if not amount_text:
             return None, None, None
-
+
         # Clean amount text
         amount_clean = amount_text.replace(",", "").replace("$", "").strip()
-
+
         # California disclosure thresholds
         ca_thresholds = {
             "under $100": (None, Decimal("100")),
@@ -309,50 +324,55 @@ class CaliforniaNetFileScraper(BaseScraper):
             "$500 - $999": (Decimal("500"), Decimal("999")),
             "$1,000 - $9,999": (Decimal("1000"), Decimal("9999")),
             "$10,000 - $99,999": (Decimal("10000"), Decimal("99999")),
-            "$100,000+": (Decimal("100000"), None)
+            "$100,000+": (Decimal("100000"), None),
         }
-
+
         # Check threshold patterns
         for threshold_text, (min_val, max_val) in ca_thresholds.items():
             if threshold_text.lower() in amount_text.lower():
                 return min_val, max_val, None
-
+
         # Try exact amount parsing
         try:
             exact_amount = Decimal(amount_clean)
             return None, None, exact_amount
         except:
             pass
-
+
         # Try range parsing
-        range_match = re.search(r
+        range_match = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_clean)
         if range_match:
             min_val = Decimal(range_match.group(1))
             max_val = Decimal(range_match.group(2))
             return min_val, max_val, None
-
+
         return None, None, None
 
 
 class CaliforniaStateLegislatureScraper(BaseScraper):
     """Scraper for California State Legislature financial disclosures"""
-
+
     async def scrape_legislature_disclosures(self) -> List[TradingDisclosure]:
         """Scrape California State Legislature member financial disclosures"""
         logger.info("Starting California Legislature disclosures collection")
-
+
         disclosures = []
-
+
         try:
             # California Legislature financial disclosure system
             # Would integrate with FPPC (Fair Political Practices Commission) data
-
+
             # Sample disclosures with real California legislators
             ca_legislators = [
-                "Toni Atkins",
-                "
+                "Toni Atkins",
+                "Robert Rivas",
+                "Scott Wiener",
+                "Nancy Skinner",
+                "Anthony Portantino",
+                "Maria Elena Durazo",
+                "Alex Padilla",
             ]
-
+
             for legislator in ca_legislators[:2]:  # Create sample disclosures
                 sample_disclosure = TradingDisclosure(
                     politician_id="",
```
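The `_parse_california_amount` hunks above keep the same three-stage fallback while adding trailing commas and splitting the signature; note the annotation now spelled `tuple[...]` (lowercase) relies on the built-in generic syntax available from Python 3.9. The method tries named California disclosure threshold bands first, then an exact decimal, then a free-form low-to-high range. A self-contained sketch of that order, a trimmed stand-in rather than the package's code verbatim:

```python
import re
from decimal import Decimal, InvalidOperation
from typing import Optional

# Trimmed threshold table; the diff carries the full set of bands.
CA_THRESHOLDS = {
    "under $100": (None, Decimal("100")),
    "$1,000 - $9,999": (Decimal("1000"), Decimal("9999")),
    "$100,000+": (Decimal("100000"), None),
}


def parse_california_amount(
    amount_text: str,
) -> tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
    """Return (range_min, range_max, exact) for a disclosure amount string."""
    if not amount_text:
        return None, None, None
    amount_clean = amount_text.replace(",", "").replace("$", "").strip()

    # 1. Named threshold bands take precedence.
    for threshold_text, (min_val, max_val) in CA_THRESHOLDS.items():
        if threshold_text.lower() in amount_text.lower():
            return min_val, max_val, None

    # 2. Exact dollar figure.
    try:
        return None, None, Decimal(amount_clean)
    except InvalidOperation:
        pass

    # 3. Free-form "low - high" range (hyphen or en dash, as in the diff's regex).
    m = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_clean)
    if m:
        return Decimal(m.group(1)), Decimal(m.group(2)), None
    return None, None, None


assert parse_california_amount("$1,000 - $9,999") == (Decimal("1000"), Decimal("9999"), None)
assert parse_california_amount("$2500") == (None, None, Decimal("2500"))
```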
```diff
@@ -368,45 +388,47 @@ class CaliforniaStateLegislatureScraper(BaseScraper):
                         "source": "ca_legislature",
                         "fppc_form": "Form 700",
                         "politician_name": legislator,
-                        "sample": False
-                    }
+                        "sample": False,
+                    },
                 )
                 disclosures.append(sample_disclosure)
-
+
         except Exception as e:
             logger.error(f"Failed to scrape California Legislature data: {e}")
-
+
         return disclosures
 
 
 async def run_california_collection(config) -> List[TradingDisclosure]:
     """Main function to run California data collection"""
     all_disclosures = []
-
+
     # NetFile portals
     async with CaliforniaNetFileScraper(config) as netfile_scraper:
         netfile_disclosures = await netfile_scraper.scrape_california_disclosures()
         all_disclosures.extend(netfile_disclosures)
-
+
     # State Legislature
     legislature_scraper = CaliforniaStateLegislatureScraper(config)
     async with legislature_scraper:
         legislature_disclosures = await legislature_scraper.scrape_legislature_disclosures()
         all_disclosures.extend(legislature_disclosures)
-
+
     return all_disclosures
 
 
 # Example usage for testing
 if __name__ == "__main__":
     from .config import WorkflowConfig
-
+
     async def main():
         config = WorkflowConfig.default()
         disclosures = await run_california_collection(config.scraping)
         print(f"Collected {len(disclosures)} California financial disclosures")
-
+
         for disclosure in disclosures[:3]:  # Show first 3
-            print(
-
-
+            print(
+                f"- {disclosure.asset_name} ({disclosure.raw_data.get('jurisdiction', 'Unknown')})"
+            )
+
+    asyncio.run(main())
```