mcli-framework 7.10.1__py3-none-any.whl → 7.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic. Click here for more details.
- mcli/app/commands_cmd.py +150 -58
- mcli/app/main.py +21 -27
- mcli/lib/custom_commands.py +62 -12
- mcli/lib/optional_deps.py +240 -0
- mcli/lib/paths.py +129 -5
- mcli/self/migrate_cmd.py +261 -0
- mcli/self/self_cmd.py +8 -0
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/__init__.py +16 -0
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/secrets/__init__.py +4 -0
- mcli/workflow/secrets/secrets_cmd.py +192 -0
- mcli/workflow/workflow.py +35 -5
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/METADATA +86 -55
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/RECORD +22 -34
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/__init__.py +0 -4
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/top_level.txt +0 -0
|
@@ -1,378 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
UK Parliament API scraper for financial interests register data
|
|
3
|
-
|
|
4
|
-
This module implements scrapers for the UK Parliament's Register of Interests API
|
|
5
|
-
to collect MP financial disclosure data.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import asyncio
|
|
9
|
-
import logging
|
|
10
|
-
from datetime import datetime
|
|
11
|
-
from typing import Any, Dict, List, Optional
|
|
12
|
-
|
|
13
|
-
import aiohttp
|
|
14
|
-
|
|
15
|
-
from .models import Politician, PoliticianRole, TradingDisclosure, TransactionType
|
|
16
|
-
from .scrapers import BaseScraper
|
|
17
|
-
|
|
18
|
-
logger = logging.getLogger(__name__)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class UKParliamentScraper(BaseScraper):
    """Scraper for the UK Parliament Register of Interests API.

    Use it as an async context manager: entering creates the ``aiohttp``
    session, leaving closes it.  ``fetch_members_interests`` is the main
    entry point; it walks every financially relevant interest category and
    converts each registered interest into a ``TradingDisclosure``.
    """

    # Case-insensitive substrings; a category whose name contains any of
    # these is treated as financial/investment related.
    _FINANCIAL_KEYWORDS = (
        "shareholding",
        "share",
        "investment",
        "financial",
        "company",
        "directorship",
        "employment",
        "remuneration",
        "sponsorship",
        "gift",
        "benefit",
        "land",
        "property",
    )

    def __init__(self, config):
        """Store the API base URL; the HTTP session is created on ``__aenter__``.

        Args:
            config: Scraping configuration.  This class reads ``timeout``,
                ``user_agent`` and ``request_delay`` from it.
        """
        super().__init__(config)
        self.base_url = "https://interests-api.parliament.uk/api/v1"
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self):
        """Async context manager entry: open the HTTP session."""
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.config.timeout),
            headers={"User-Agent": self.config.user_agent},
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close and forget the HTTP session."""
        # Drop the reference first so a closed session is never mistaken for
        # a usable one by the guard in fetch_members_interests().
        session, self.session = self.session, None
        if session:
            await session.close()

    async def fetch_members_interests(self) -> List[TradingDisclosure]:
        """Fetch all MP financial interests from the API.

        Returns:
            Disclosures collected from every financial category.

        Raises:
            RuntimeError: If called outside the async context manager.
            Exception: Any error raised while listing the categories is
                logged and re-raised.  Errors inside a single category are
                handled in ``_fetch_interests_by_category``.
        """
        logger.info("Starting UK Parliament financial interests collection")

        if not self.session:
            raise RuntimeError("Session not initialized. Use async context manager.")

        disclosures: List[TradingDisclosure] = []

        try:
            # First, get all interest categories to understand what types of
            # interests exist.
            categories = await self._fetch_categories()
            logger.info("Found %d interest categories", len(categories))

            # Only the financial/investment categories are collected.
            for category in self._filter_financial_categories(categories):
                disclosures.extend(await self._fetch_interests_by_category(category))

                # Rate limiting between categories.
                await asyncio.sleep(self.config.request_delay)
        except Exception as e:
            logger.error("Failed to fetch UK Parliament interests: %s", e)
            raise

        logger.info("Collected %d UK Parliament financial interests", len(disclosures))
        return disclosures

    async def _fetch_categories(self) -> List[Dict[str, Any]]:
        """Fetch all interest categories from the API.

        Returns:
            The ``items`` list of the ``/Categories`` response (empty when
            the key is missing or null).

        Raises:
            aiohttp.ClientResponseError: On a non-success HTTP status.
        """
        url = f"{self.base_url}/Categories"
        params = {"Take": 100}  # Get up to 100 categories

        async with self.session.get(url, params=params) as response:
            response.raise_for_status()
            data = await response.json()
            return data.get("items") or []

    def _filter_financial_categories(
        self, categories: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Filter categories to include only financial/investment related ones.

        A category is kept when its ``name`` contains, case-insensitively,
        any entry of ``_FINANCIAL_KEYWORDS``.  Categories with a missing or
        null name are skipped instead of raising.
        """
        financial_categories = []
        for category in categories:
            # ``or ""`` also covers an explicit JSON null, which
            # ``.get("name", "")`` would pass through as None.
            category_name = (category.get("name") or "").lower()
            if any(keyword in category_name for keyword in self._FINANCIAL_KEYWORDS):
                financial_categories.append(category)
                logger.debug("Including financial category: %s", category.get("name"))

        return financial_categories

    async def _fetch_interests_by_category(
        self, category: Dict[str, Any]
    ) -> List[TradingDisclosure]:
        """Fetch interests for a specific category, following pagination.

        Pages of 50 are requested through ``Skip``/``Take`` until an empty or
        short page is returned.  This is best-effort: a failure on any page
        is logged and ends the loop, and whatever was collected before the
        failure is still returned.
        """
        category_id = category.get("id")
        category_name = category.get("name")

        logger.debug(
            "Fetching interests for category: %s (ID: %s)", category_name, category_id
        )

        disclosures = []
        skip = 0
        take = 50
        url = f"{self.base_url}/Interests"

        while True:
            params = {"categoryId": category_id, "Skip": skip, "Take": take}

            try:
                async with self.session.get(url, params=params) as response:
                    response.raise_for_status()
                    data = await response.json()

                    interests = data.get("items") or []
                    if not interests:
                        break

                    for interest in interests:
                        disclosure = await self._parse_uk_interest(interest, category_name)
                        if disclosure:
                            disclosures.append(disclosure)

                    skip += take

                    # If we got fewer results than requested, we're done
                    if len(interests) < take:
                        break

            except Exception as e:
                logger.error(
                    "Failed to fetch interests for category %s: %s", category_name, e
                )
                break

        logger.debug("Found %d interests in category: %s", len(disclosures), category_name)
        return disclosures

    async def _parse_uk_interest(
        self, interest: Dict[str, Any], category_name: str
    ) -> Optional[TradingDisclosure]:
        """Parse a UK Parliament interest into a TradingDisclosure.

        Returns ``None`` when the interest has no ``member`` object or when
        parsing fails (the failure is logged).  The coroutine awaits nothing
        itself; it stays ``async`` so existing callers can keep awaiting it.
        """
        try:
            # Extract member information from the new API structure
            member_data = interest.get("member")
            if not member_data:
                return None

            member_id = member_data.get("id")
            politician_name = member_data.get("nameDisplayAs", "")

            # Get interest details.  ``or ""`` guards against a null summary,
            # which would otherwise make every string helper below fail.
            interest_id = interest.get("id")
            description = interest.get("summary") or ""
            registered_date = interest.get("registrationDate")

            # Parse dates; a missing registration date falls back to "now".
            transaction_date = (
                self._parse_date(registered_date) if registered_date else datetime.now()
            )
            disclosure_date = transaction_date  # UK system doesn't separate these

            # Determine transaction type from description
            transaction_type = self._infer_transaction_type(description, category_name)

            # Extract asset information from fields and description
            asset_name, asset_ticker = self._extract_asset_info_from_fields(
                interest, description, category_name
            )

            # Extract amount information (if available)
            amount_min, amount_max, amount_exact = self._extract_amount_info(description)

            return TradingDisclosure(
                id=f"uk_parliament_{interest_id}",
                politician_id="",  # Will be filled during politician matching
                transaction_date=transaction_date,
                disclosure_date=disclosure_date,
                transaction_type=transaction_type,
                asset_name=asset_name,
                asset_ticker=asset_ticker,
                asset_type="shareholding",  # Most UK disclosures are shareholdings
                amount_range_min=amount_min,
                amount_range_max=amount_max,
                amount_exact=amount_exact,
                source_url="https://www.parliament.uk/mps-lords-and-offices/standards-and-financial-interests/",
                raw_data={
                    "uk_interest_id": interest_id,
                    "uk_member_id": member_id,
                    "description": description,
                    "category_name": category_name,
                    "registered_date": registered_date,
                    "source": "uk_parliament_api",
                    "politician_name": politician_name,
                },
            )

        except Exception as e:
            # Per-record boundary: one malformed interest must not abort the
            # whole category, so log and skip it.
            logger.error("Failed to parse UK interest: %s", e)
            return None

    async def _fetch_mp_name(self, member_id: int) -> str:
        """Fetch MP name from the Parliament API using member ID.

        Returns the stripped name, or ``""`` when there is no session, the
        request does not return HTTP 200, no name field is populated, or
        the request fails.
        """
        if not self.session:
            return ""

        try:
            # Try the Members endpoint to get MP details
            member_url = f"{self.base_url}/Members/{member_id}"

            async with self.session.get(member_url) as response:
                if response.status == 200:
                    data = await response.json()

                    # Preferred field first, then the alternative field name.
                    name = data.get("name") or data.get("displayAs") or ""
                    if not name:
                        # Combine first and last name if available
                        first_name = data.get("nameGiven", "")
                        last_name = data.get("nameFull", "") or data.get("nameFamily", "")
                        if first_name and last_name:
                            name = f"{first_name} {last_name}"

                    if name:
                        logger.debug("Found MP name for ID %s: %s", member_id, name)
                        return name.strip()
                else:
                    logger.debug(
                        "Could not fetch MP details for ID %s: HTTP %s",
                        member_id,
                        response.status,
                    )

        except Exception as e:
            logger.debug("Failed to fetch MP name for ID %s: %s", member_id, e)

        return ""

    def _parse_date(self, date_str: str) -> datetime:
        """Parse UK Parliament API date format (ISO 8601).

        A trailing ``Z`` is rewritten to ``+00:00`` so the value also parses
        on Python versions whose ``fromisoformat`` rejects ``Z``.  An
        unparseable value falls back to the current local time; the
        fallback is logged because it fabricates a date.
        """
        try:
            return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        except (ValueError, TypeError, AttributeError):
            logger.debug("Unparseable UK Parliament date %r; using current time", date_str)
            return datetime.now()

    def _infer_transaction_type(self, description: str, category_name: str) -> TransactionType:
        """Infer transaction type from description and category.

        UK Parliament disclosures are mostly about holdings, not
        transactions, so everything that does not explicitly mention a
        disposal is treated as a purchase.  ``category_name`` is accepted
        for interface compatibility; it does not change the result.
        """
        description_lower = (description or "").lower()

        if any(word in description_lower for word in ("sold", "disposed", "divested")):
            return TransactionType.SALE

        # Explicit acquisitions, shareholding categories and everything else
        # all map to PURCHASE (the default assumption).
        return TransactionType.PURCHASE

    def _extract_asset_info_from_fields(
        self, interest: Dict[str, Any], description: str, category_name: str
    ) -> tuple[str, Optional[str]]:
        """Extract asset name and ticker from interest fields.

        Uses the value of the first field named ``OrganisationName`` when it
        is non-empty; otherwise falls back to parsing the description.  The
        ticker is always ``None``.
        """
        # ``or []`` tolerates an explicit null ``fields`` value.
        fields = interest.get("fields") or []
        organization_name = next(
            (field.get("value") for field in fields if field.get("name") == "OrganisationName"),
            None,
        )

        if organization_name:
            return organization_name, None
        return self._extract_asset_info(description, category_name)

    def _extract_asset_info(
        self, description: str, category_name: str
    ) -> tuple[str, Optional[str]]:
        """Extract asset name and ticker from description.

        This is a simplified extraction.  For shareholding categories, the
        text after the first " in " is used (format is often
        "Shareholding in [Company Name] Ltd").  Otherwise, or when that text
        is empty, the first 100 characters of the description are used.
        The ticker is always ``None``.
        """
        if "shareholding" in category_name.lower():
            _, separator, remainder = description.partition(" in ")
            asset_name = remainder.strip().rstrip(".")
            # Never return an empty asset name when a description exists.
            if separator and asset_name:
                return asset_name, None

        # Fallback: use description as asset name
        return description[:100], None  # Truncate to reasonable length

    def _extract_amount_info(
        self, description: str
    ) -> tuple[Optional[float], Optional[float], Optional[float]]:
        """Extract amount information from description.

        UK Parliament disclosures often don't include specific amounts;
        they use threshold categories (£70,000+, etc.).

        Returns:
            ``(amount_min, amount_max, amount_exact)``:
            ``(70000.0, None, None)`` when the £70,000 threshold is
            mentioned, ``(x, None, x)`` for the first explicit ``£`` amount,
            and ``(None, None, None)`` otherwise.
        """
        import re

        if not description:
            return None, None, None

        # Match the threshold as a whole number only, so that "£170000" or
        # "£70,000,000" are not mistaken for the £70,000 threshold.
        threshold_pattern = r"£\s?70,000(?![,.]?\d)|(?<![\d,.])70000(?![,.]?\d)"
        if re.search(threshold_pattern, description):
            return 70000.0, None, None

        # First explicit pound amount, keeping any decimal part.
        match = re.search(r"£\s?(\d[\d,]*(?:\.\d+)?)", description)
        if match:
            amount = float(match.group(1).replace(",", ""))
            return amount, None, amount

        return None, None, None

    async def get_politicians(self) -> List[Politician]:
        """Fetch current MPs from the Members API.

        Not implemented yet: always returns an empty list.  A real
        implementation would need to call https://members-api.parliament.uk/.
        """
        logger.info("Fetching current UK MPs")
        return []
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
async def run_uk_parliament_collection(config) -> List[TradingDisclosure]:
    """Run a complete UK Parliament collection pass.

    Opens a ``UKParliamentScraper`` for ``config`` as an async context
    manager, collects the members' financial interests, and returns them
    once the scraper has been closed again.
    """
    async with UKParliamentScraper(config) as uk_scraper:
        collected = await uk_scraper.fetch_members_interests()
    return collected
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
# Manual smoke test: collect with the default configuration and print a sample.
if __name__ == "__main__":
    from .config import WorkflowConfig

    async def main():
        """Collect UK interests with the default scraping config and print three."""
        scraping_config = WorkflowConfig.default().scraping
        results = await run_uk_parliament_collection(scraping_config)
        print(f"Collected {len(results)} UK Parliament financial interests")

        # Only the first three entries are shown.
        for entry in results[:3]:
            print(
                f"- {entry.asset_name} by {entry.raw_data.get('politician_name', 'Unknown')}"
            )

    asyncio.run(main())
|