mcli-framework 7.10.0-py3-none-any.whl → 7.10.2-py3-none-any.whl
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/lib/custom_commands.py +10 -0
- mcli/lib/optional_deps.py +240 -0
- mcli/ml/backtesting/run.py +5 -3
- mcli/ml/models/ensemble_models.py +1 -0
- mcli/ml/models/recommendation_models.py +1 -0
- mcli/ml/optimization/optimize.py +6 -4
- mcli/ml/serving/serve.py +2 -2
- mcli/ml/training/train.py +14 -7
- mcli/self/completion_cmd.py +2 -2
- mcli/workflow/doc_convert.py +82 -112
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/workflow.py +14 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/METADATA +37 -3
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/RECORD +22 -37
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/top_level.txt +0 -0
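Besides removing the entire mcli/workflow/politician_trading package, 7.10.2 adds a notebook workflow (converter, schema, validator, command) and an optional-dependency helper, mcli/lib/optional_deps.py. The diff summary does not show that helper's contents; as a hedged illustration only, a module like it conventionally wraps imports as in the sketch below, where the names try_import and require are hypothetical and not taken from mcli's actual API:

# Hypothetical sketch only: the real mcli/lib/optional_deps.py is not
# shown in this diff, and try_import/require are illustrative names.
import importlib
from types import ModuleType
from typing import Optional


def try_import(name: str) -> Optional[ModuleType]:
    """Return the named module if it is installed, else None."""
    try:
        return importlib.import_module(name)
    except ImportError:
        return None


def require(name: str, hint: str) -> ModuleType:
    """Import a module or raise with an actionable install hint."""
    module = try_import(name)
    if module is None:
        raise ImportError(f"{name} is required here; install it with: {hint}")
    return module

A caller would then write, for example, pandas = require("pandas", "pip install pandas") at the top of a command that needs it, keeping the dependency out of the base install.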
The hunk below shows the full contents of the removed mcli/workflow/politician_trading/scrapers_california.py (434 lines deleted):

@@ -1,434 +0,0 @@
-"""
-California NetFile and Secretary of State scraper for political financial disclosures
-
-This module implements scrapers for California's campaign finance disclosure systems,
-including NetFile public portals and Cal-Access data.
-"""
-
-import asyncio
-import logging
-import re
-from datetime import datetime, timedelta
-from decimal import Decimal
-from typing import Any, Dict, List, Optional
-
-import aiohttp
-
-from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
-from .scrapers import BaseScraper
-
-logger = logging.getLogger(__name__)
-
-
-class CaliforniaNetFileScraper(BaseScraper):
-    """Scraper for California NetFile public disclosure portals"""
-
-    def __init__(self, config, test_mode=True):
-        super().__init__(config)
-        self.test_mode = test_mode  # Skip network calls for testing
-        self.public_portals = [
-            "https://public.netfile.com/pub2/?AID=VCO",  # Ventura County
-            "https://public.netfile.com/pub2/?AID=SFO",  # San Francisco
-            "https://public.netfile.com/pub2/?AID=SCC",  # Santa Clara County
-            "https://public.netfile.com/pub2/?AID=SAC",  # Sacramento County
-            "https://public.netfile.com/pub2/?AID=LAC",  # Los Angeles County
-        ]
-        self.session: Optional[aiohttp.ClientSession] = None
-
-    async def __aenter__(self):
-        """Async context manager entry"""
-        self.session = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(total=self.config.timeout),
-            headers={"User-Agent": self.config.user_agent},
-        )
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit"""
-        if self.session:
-            await self.session.close()
-
-    async def scrape_california_disclosures(self) -> List[TradingDisclosure]:
-        """Scrape California financial disclosures from NetFile portals"""
-        logger.info("Starting California NetFile disclosures collection")
-
-        if not self.session:
-            raise RuntimeError("Session not initialized. Use async context manager.")
-
-        all_disclosures = []
-
-        # California state-level disclosures
-        state_disclosures = await self._scrape_cal_access_data()
-        all_disclosures.extend(state_disclosures)
-
-        # County-level NetFile portals
-        for portal_url in self.public_portals:
-            try:
-                county_disclosures = await self._scrape_netfile_portal(portal_url)
-                all_disclosures.extend(county_disclosures)
-                await asyncio.sleep(self.config.request_delay)
-            except Exception as e:
-                logger.error(f"Failed to scrape NetFile portal {portal_url}: {e}")
-
-        logger.info(f"Collected {len(all_disclosures)} California disclosures")
-        return all_disclosures
-
-    async def _scrape_cal_access_data(self) -> List[TradingDisclosure]:
-        """Scrape California Secretary of State Cal-Access data"""
-        disclosures = []
-
-        try:
-            logger.debug("Scraping Cal-Access state-level data")
-
-            # Cal-Access API endpoints (simplified - actual implementation would need
-            # to handle their specific data format and authentication)
-            cal_access_url = "https://www.sos.ca.gov/campaign-lobbying/cal-access-resources"
-
-            # This is a placeholder for actual Cal-Access API implementation
-            # The real implementation would:
-            # 1. Access Cal-Access database exports
-            # 2. Parse the fixed-width format files
-            # 3. Extract candidate and committee financial data
-
-            # Sample disclosures with real California politician names for demonstration
-            ca_politicians = [
-                "Gavin Newsom",
-                "Rob Bonta",
-                "Tony Thurmond",
-                "Fiona Ma",
-                "Betty Yee",
-                "Ricardo Lara",
-                "Shirley Weber",
-            ]
-
-            for politician in ca_politicians[:3]:  # Create a few sample disclosures
-                sample_disclosure = TradingDisclosure(
-                    politician_id="",  # Will be filled during politician matching
-                    transaction_date=datetime.now() - timedelta(days=30),
-                    disclosure_date=datetime.now() - timedelta(days=15),
-                    transaction_type=TransactionType.PURCHASE,
-                    asset_name=f"California State Investment Fund",
-                    asset_type="investment",
-                    amount_range_min=Decimal("1000"),
-                    amount_range_max=Decimal("10000"),
-                    source_url=cal_access_url,
-                    raw_data={
-                        "source": "cal_access",
-                        "jurisdiction": "california_state",
-                        "politician_name": politician,
-                        "sample": False,
-                    },
-                )
-                disclosures.append(sample_disclosure)
-
-        except Exception as e:
-            logger.error(f"Failed to scrape Cal-Access data: {e}")
-
-        return disclosures
-
-    async def _scrape_netfile_portal(self, portal_url: str) -> List[TradingDisclosure]:
-        """Scrape a specific NetFile public portal"""
-        disclosures = []
-
-        try:
-            # Extract jurisdiction from URL
-            jurisdiction = self._extract_jurisdiction(portal_url)
-            logger.debug(f"Scraping NetFile portal for {jurisdiction}")
-
-            # NetFile servers are often overloaded, use special handling
-            # Skip network calls in test mode due to server unreliability
-            if not self.test_mode:
-                try:
-                    html = await self._fetch_netfile_with_backoff(portal_url)
-                    if not html:
-                        logger.warning(
-                            f"Could not access NetFile portal for {jurisdiction} - servers may be overloaded, using sample data"
-                        )
-                except Exception as e:
-                    logger.warning(
-                        f"NetFile portal {jurisdiction} unavailable: {e}, using sample data"
-                    )
-            else:
-                logger.info(f"Test mode enabled - using sample data for {jurisdiction}")
-
-            # NetFile portals typically have search forms and results tables
-            # This is a simplified implementation - real scraper would:
-            # 1. Navigate search forms for candidate/committee data
-            # 2. Parse results tables with transaction data
-            # 3. Handle pagination for large result sets
-            # 4. Extract specific financial disclosure information
-
-            # Create sample data with local politician names for this jurisdiction
-            local_politicians = self._get_sample_local_politicians(jurisdiction)
-
-            for politician_name in local_politicians[:2]:  # Create 2 disclosures per portal
-                sample_disclosure = TradingDisclosure(
-                    politician_id="",
-                    transaction_date=datetime.now() - timedelta(days=45),
-                    disclosure_date=datetime.now() - timedelta(days=20),
-                    transaction_type=TransactionType.SALE,
-                    asset_name=f"Municipal Investment - {jurisdiction}",
-                    asset_type="municipal_investment",
-                    amount_range_min=Decimal("5000"),
-                    amount_range_max=Decimal("25000"),
-                    source_url=portal_url,
-                    raw_data={
-                        "source": "netfile_portal",
-                        "jurisdiction": jurisdiction,
-                        "portal_url": portal_url,
-                        "politician_name": politician_name,
-                        "sample": False,
-                    },
-                )
-                disclosures.append(sample_disclosure)
-
-        except Exception as e:
-            logger.error(f"Failed to scrape NetFile portal {portal_url}: {e}")
-
-        return disclosures
-
-    def _extract_jurisdiction(self, portal_url: str) -> str:
-        """Extract jurisdiction name from NetFile portal URL"""
-        jurisdiction_map = {
-            "VCO": "Ventura County",
-            "SFO": "San Francisco",
-            "SCC": "Santa Clara County",
-            "SAC": "Sacramento County",
-            "LAC": "Los Angeles County",
-        }
-
-        # Extract AID parameter from URL
-        aid_match = re.search(r"AID=([A-Z]+)", portal_url)
-        if aid_match:
-            aid = aid_match.group(1)
-            return jurisdiction_map.get(aid, f"California {aid}")
-
-        return "California Unknown"
-
-    def _get_sample_local_politicians(self, jurisdiction: str) -> List[str]:
-        """Get sample local politician names for a jurisdiction"""
-        politician_map = {
-            "Ventura County": ["Matt LaVere", "Carmen Ramirez", "Jeff Gorell"],
-            "San Francisco": ["London Breed", "Aaron Peskin", "Matt Dorsey", "Connie Chan"],
-            "Santa Clara County": ["Cindy Chavez", "Susan Ellenberg", "Joe Simitian"],
-            "Sacramento County": ["Phil Serna", "Rich Desmond", "Don Nottoli"],
-            "Los Angeles County": ["Hilda Solis", "Sheila Kuehl", "Janice Hahn", "Holly Mitchell"],
-        }
-
-        return politician_map.get(jurisdiction, ["California Local Politician"])
-
-    async def _fetch_netfile_with_backoff(self, url: str) -> Optional[str]:
-        """Fetch NetFile page with progressive backoff for server overload"""
-        if not self.session:
-            return None
-
-        # NetFile servers are notoriously slow and overloaded, use shorter delays for testing
-        delays = [1, 2]  # Quick attempts only for testing
-
-        for attempt, delay in enumerate(delays):
-            try:
-                # Use shorter timeout for testing
-                async with self.session.get(
-                    url, timeout=aiohttp.ClientTimeout(total=5)  # 5 second timeout for testing
-                ) as response:
-                    if response.status == 200:
-                        return await response.text()
-                    elif response.status == 429:  # Rate limited
-                        logger.info(f"NetFile rate limited, waiting {delay * 2} seconds")
-                        await asyncio.sleep(delay * 2)
-                    elif response.status in [503, 504]:  # Server overloaded
-                        logger.info(f"NetFile server overloaded, waiting {delay} seconds")
-                        await asyncio.sleep(delay)
-                    else:
-                        logger.warning(f"NetFile returned HTTP {response.status} for {url}")
-
-            except asyncio.TimeoutError:
-                logger.info(
-                    f"NetFile timeout (attempt {attempt + 1}/{len(delays)}), waiting {delay} seconds"
-                )
-                if attempt < len(delays) - 1:
-                    await asyncio.sleep(delay)
-            except Exception as e:
-                logger.warning(f"NetFile error (attempt {attempt + 1}/{len(delays)}): {e}")
-                if attempt < len(delays) - 1:
-                    await asyncio.sleep(delay)
-
-        logger.error(f"NetFile portal {url} unavailable after {len(delays)} attempts")
-        return None
-
-    def _parse_netfile_transaction(
-        self, transaction_data: Dict[str, Any]
-    ) -> Optional[TradingDisclosure]:
-        """Parse NetFile transaction data into TradingDisclosure format"""
-        try:
-            # Parse transaction type
-            transaction_type_map = {
-                "contribution": TransactionType.PURCHASE,
-                "expenditure": TransactionType.SALE,
-                "investment": TransactionType.PURCHASE,
-                "loan": TransactionType.PURCHASE,
-            }
-
-            raw_type = transaction_data.get("transaction_type", "").lower()
-            transaction_type = transaction_type_map.get(raw_type, TransactionType.PURCHASE)
-
-            # Parse date
-            date_str = transaction_data.get("transaction_date", "")
-            try:
-                transaction_date = datetime.strptime(date_str, "%Y-%m-%d")
-            except ValueError:
-                try:
-                    transaction_date = datetime.strptime(date_str, "%m/%d/%Y")
-                except ValueError:
-                    transaction_date = datetime.now()
-
-            # Parse amount
-            amount_str = transaction_data.get("amount", "")
-            amount_min, amount_max, amount_exact = self._parse_california_amount(amount_str)
-
-            disclosure = TradingDisclosure(
-                politician_id="",  # Will be filled after politician matching
-                transaction_date=transaction_date,
-                disclosure_date=datetime.now(),
-                transaction_type=transaction_type,
-                asset_name=transaction_data.get("description", ""),
-                asset_ticker=None,
-                asset_type="california_disclosure",
-                amount_range_min=amount_min,
-                amount_range_max=amount_max,
-                amount_exact=amount_exact,
-                source_url=transaction_data.get("source_url", ""),
-                raw_data=transaction_data,
-            )
-
-            return disclosure
-
-        except Exception as e:
-            logger.error(f"Failed to parse NetFile transaction: {e}")
-            return None
-
-    def _parse_california_amount(
-        self, amount_text: str
-    ) -> tuple[Optional[Decimal], Optional[Decimal], Optional[Decimal]]:
-        """Parse California-specific amount formats"""
-        if not amount_text:
-            return None, None, None
-
-        # Clean amount text
-        amount_clean = amount_text.replace(",", "").replace("$", "").strip()
-
-        # California disclosure thresholds
-        ca_thresholds = {
-            "under $100": (None, Decimal("100")),
-            "$100 - $499": (Decimal("100"), Decimal("499")),
-            "$500 - $999": (Decimal("500"), Decimal("999")),
-            "$1,000 - $9,999": (Decimal("1000"), Decimal("9999")),
-            "$10,000 - $99,999": (Decimal("10000"), Decimal("99999")),
-            "$100,000+": (Decimal("100000"), None),
-        }
-
-        # Check threshold patterns
-        for threshold_text, (min_val, max_val) in ca_thresholds.items():
-            if threshold_text.lower() in amount_text.lower():
-                return min_val, max_val, None
-
-        # Try exact amount parsing
-        try:
-            exact_amount = Decimal(amount_clean)
-            return None, None, exact_amount
-        except:
-            pass
-
-        # Try range parsing
-        range_match = re.search(r"(\d+(?:\.\d{2})?)\s*[-–]\s*(\d+(?:\.\d{2})?)", amount_clean)
-        if range_match:
-            min_val = Decimal(range_match.group(1))
-            max_val = Decimal(range_match.group(2))
-            return min_val, max_val, None
-
-        return None, None, None
-
-
-class CaliforniaStateLegislatureScraper(BaseScraper):
-    """Scraper for California State Legislature financial disclosures"""
-
-    async def scrape_legislature_disclosures(self) -> List[TradingDisclosure]:
-        """Scrape California State Legislature member financial disclosures"""
-        logger.info("Starting California Legislature disclosures collection")
-
-        disclosures = []
-
-        try:
-            # California Legislature financial disclosure system
-            # Would integrate with FPPC (Fair Political Practices Commission) data
-
-            # Sample disclosures with real California legislators
-            ca_legislators = [
-                "Toni Atkins",
-                "Robert Rivas",
-                "Scott Wiener",
-                "Nancy Skinner",
-                "Anthony Portantino",
-                "Maria Elena Durazo",
-                "Alex Padilla",
-            ]
-
-            for legislator in ca_legislators[:2]:  # Create sample disclosures
-                sample_disclosure = TradingDisclosure(
-                    politician_id="",
-                    transaction_date=datetime.now() - timedelta(days=60),
-                    disclosure_date=datetime.now() - timedelta(days=30),
-                    transaction_type=TransactionType.PURCHASE,
-                    asset_name="California Legislature Investment",
-                    asset_type="legislative_investment",
-                    amount_range_min=Decimal("10000"),
-                    amount_range_max=Decimal("100000"),
-                    source_url="https://www.fppc.ca.gov/",
-                    raw_data={
-                        "source": "ca_legislature",
-                        "fppc_form": "Form 700",
-                        "politician_name": legislator,
-                        "sample": False,
-                    },
-                )
-                disclosures.append(sample_disclosure)
-
-        except Exception as e:
-            logger.error(f"Failed to scrape California Legislature data: {e}")
-
-        return disclosures
-
-
-async def run_california_collection(config) -> List[TradingDisclosure]:
-    """Main function to run California data collection"""
-    all_disclosures = []
-
-    # NetFile portals
-    async with CaliforniaNetFileScraper(config) as netfile_scraper:
-        netfile_disclosures = await netfile_scraper.scrape_california_disclosures()
-        all_disclosures.extend(netfile_disclosures)
-
-    # State Legislature
-    legislature_scraper = CaliforniaStateLegislatureScraper(config)
-    async with legislature_scraper:
-        legislature_disclosures = await legislature_scraper.scrape_legislature_disclosures()
-        all_disclosures.extend(legislature_disclosures)
-
-    return all_disclosures
-
-
-# Example usage for testing
-if __name__ == "__main__":
-    from .config import WorkflowConfig
-
-    async def main():
-        config = WorkflowConfig.default()
-        disclosures = await run_california_collection(config.scraping)
-        print(f"Collected {len(disclosures)} California financial disclosures")
-
-        for disclosure in disclosures[:3]:  # Show first 3
-            print(
-                f"- {disclosure.asset_name} ({disclosure.raw_data.get('jurisdiction', 'Unknown')})"
-            )
-
-    asyncio.run(main())
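Of the removed code, _fetch_netfile_with_backoff is the most reusable piece: a progressive-backoff HTTP fetch that treats rate limiting (HTTP 429) differently from server overload (503/504). A minimal standalone sketch of that pattern, assuming only aiohttp and with an illustrative retry schedule and timeout:

# Minimal standalone sketch of the progressive-backoff fetch pattern from the
# removed _fetch_netfile_with_backoff; delays and timeout values are illustrative.
import asyncio
from typing import Optional

import aiohttp


async def fetch_with_backoff(url: str, delays: tuple = (1, 2, 4)) -> Optional[str]:
    async with aiohttp.ClientSession() as session:
        for attempt, delay in enumerate(delays):
            try:
                async with session.get(
                    url, timeout=aiohttp.ClientTimeout(total=5)
                ) as resp:
                    if resp.status == 200:
                        return await resp.text()
                    # 429 means throttled, 503/504 mean overloaded: wait longer when throttled
                    await asyncio.sleep(delay * 2 if resp.status == 429 else delay)
            except (asyncio.TimeoutError, aiohttp.ClientError):
                # Transient network failure: retry after the scheduled delay
                if attempt < len(delays) - 1:
                    await asyncio.sleep(delay)
    return None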