mcli-framework 7.10.0__py3-none-any.whl → 7.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/lib/custom_commands.py +10 -0
- mcli/lib/optional_deps.py +240 -0
- mcli/ml/backtesting/run.py +5 -3
- mcli/ml/models/ensemble_models.py +1 -0
- mcli/ml/models/recommendation_models.py +1 -0
- mcli/ml/optimization/optimize.py +6 -4
- mcli/ml/serving/serve.py +2 -2
- mcli/ml/training/train.py +14 -7
- mcli/self/completion_cmd.py +2 -2
- mcli/workflow/doc_convert.py +82 -112
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/workflow.py +14 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/METADATA +37 -3
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/RECORD +22 -37
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/top_level.txt +0 -0
@@ -1,677 +0,0 @@
-"""Political influence features for stock recommendation models"""
-
-import logging
-from collections import Counter, defaultdict
-from dataclasses import dataclass
-from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PoliticalFeatureConfig:
-    """Configuration for political feature extraction"""
-
-    # Politician influence scoring
-    committee_weights: Dict[str, float] = None
-    party_influence_weights: Dict[str, float] = None
-    position_weights: Dict[str, float] = None
-
-    # Trading pattern analysis
-    influence_lookback_days: int = 180
-    insider_threshold_days: int = 30
-    cluster_analysis_window: int = 60
-
-    # Policy impact modeling
-    sector_policy_mapping: Dict[str, List[str]] = None
-    policy_announcement_window: int = 7
-
-    def __post_init__(self):
-        if self.committee_weights is None:
-            self.committee_weights = {
-                "financial_services": 3.0,
-                "energy_commerce": 2.5,
-                "judiciary": 2.0,
-                "appropriations": 2.5,
-                "ways_means": 3.0,
-                "defense": 2.0,
-                "foreign_affairs": 1.5,
-                "healthcare": 2.0,
-                "technology": 2.5,
-            }
-
-        if self.party_influence_weights is None:
-            self.party_influence_weights = {
-                "majority_party": 1.2,
-                "minority_party": 0.8,
-                "leadership": 2.0,
-                "committee_chair": 1.8,
-                "ranking_member": 1.4,
-            }
-
-        if self.position_weights is None:
-            self.position_weights = {
-                "speaker": 3.0,
-                "majority_leader": 2.5,
-                "minority_leader": 2.0,
-                "committee_chair": 2.0,
-                "subcommittee_chair": 1.5,
-                "ranking_member": 1.3,
-                "member": 1.0,
-            }
-
-        if self.sector_policy_mapping is None:
-            self.sector_policy_mapping = {
-                "technology": ["tech_regulation", "data_privacy", "antitrust"],
-                "healthcare": ["medicare", "drug_pricing", "healthcare_reform"],
-                "energy": ["climate_policy", "renewable_energy", "oil_regulation"],
-                "financial": ["banking_regulation", "fintech", "cryptocurrency"],
-                "defense": ["defense_spending", "military_contracts", "cybersecurity"],
-            }
-
-
-class PoliticalInfluenceFeatures:
-    """Extract features based on political influence and power"""
-
-    def __init__(self, config: Optional[PoliticalFeatureConfig] = None):
-        self.config = config or PoliticalFeatureConfig()
-        self.politician_influence_cache = {}
-
-    def extract_influence_features(
-        self, trading_data: pd.DataFrame, politician_metadata: Optional[pd.DataFrame] = None
-    ) -> pd.DataFrame:
-        """Extract political influence features from trading data"""
-        df = trading_data.copy()
-
-        # Calculate politician influence scores
-        df = self._calculate_politician_influence(df, politician_metadata)
-
-        # Trading timing analysis
-        df = self._analyze_trading_timing(df)
-
-        # Committee and sector alignment
-        df = self._analyze_committee_sector_alignment(df, politician_metadata)
-
-        # Party clustering analysis
-        df = self._analyze_party_clustering(df)
-
-        # Seniority and experience features
-        df = self._extract_seniority_features(df, politician_metadata)
-
-        return df
-
-    def _calculate_politician_influence(
-        self, df: pd.DataFrame, metadata: Optional[pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Calculate comprehensive politician influence scores"""
-
-        # Base influence score from trading frequency and volume
-        politician_stats = (
-            df.groupby("politician_name_cleaned")
-            .agg(
-                {
-                    "transaction_amount_cleaned": ["count", "sum", "mean", "std"],
-                    "asset_name_cleaned": "nunique",
-                }
-            )
-            .round(2)
-        )
-
-        politician_stats.columns = [
-            "trade_count",
-            "total_volume",
-            "avg_trade_size",
-            "trade_size_std",
-            "unique_assets",
-        ]
-
-        # Calculate base influence from trading metrics
-        # More trades, higher volumes, and diverse assets = higher influence
-        politician_stats["trade_influence"] = (
-            np.log1p(politician_stats["trade_count"])
-            + np.log1p(politician_stats["total_volume"]) / 10
-            + np.log1p(politician_stats["unique_assets"]) * 2
-        )
-
-        # Normalize to 0-1 scale
-        politician_stats["trade_influence"] = (
-            politician_stats["trade_influence"] / politician_stats["trade_influence"].max()
-        )
-
-        # Add metadata-based influence if available
-        if metadata is not None:
-            politician_stats = self._add_metadata_influence(politician_stats, metadata)
-        else:
-            # Use default influence based on trading patterns
-            politician_stats["position_influence"] = 1.0
-            politician_stats["committee_influence"] = 1.0
-            politician_stats["party_influence"] = 1.0
-
-        # Combined influence score
-        politician_stats["total_influence"] = (
-            politician_stats["trade_influence"] * 0.4
-            + politician_stats["position_influence"] * 0.3
-            + politician_stats["committee_influence"] * 0.2
-            + politician_stats["party_influence"] * 0.1
-        )
-
-        # Merge back to main dataframe
-        df = df.merge(
-            politician_stats[["total_influence", "trade_influence"]],
-            left_on="politician_name_cleaned",
-            right_index=True,
-            how="left",
-        )
-
-        return df
-
-    def _add_metadata_influence(
-        self, stats_df: pd.DataFrame, metadata: pd.DataFrame
-    ) -> pd.DataFrame:
-        """Add influence scores based on politician metadata"""
-
-        # Position-based influence
-        if "position" in metadata.columns:
-            position_influence = metadata["position"].map(self.config.position_weights)
-            metadata["position_influence"] = position_influence.fillna(1.0)
-        else:
-            metadata["position_influence"] = 1.0
-
-        # Committee-based influence
-        if "committees" in metadata.columns:
-
-            def calculate_committee_influence(committees_str):
-                if pd.isna(committees_str):
-                    return 1.0
-                committees = str(committees_str).lower().split(",")
-                influence = 1.0
-                for committee in committees:
-                    committee = committee.strip()
-                    for key, weight in self.config.committee_weights.items():
-                        if key in committee:
-                            influence = max(influence, weight)
-                return influence
-
-            metadata["committee_influence"] = metadata["committees"].apply(
-                calculate_committee_influence
-            )
-        else:
-            metadata["committee_influence"] = 1.0
-
-        # Party-based influence (simplified)
-        if "party" in metadata.columns:
-            # Assume majority party has more influence (would need current data)
-            party_influence = metadata["party"].map({"Republican": 1.1, "Democrat": 1.0})
-            metadata["party_influence"] = party_influence.fillna(1.0)
-        else:
-            metadata["party_influence"] = 1.0
-
-        # Merge metadata influence scores
-        influence_cols = ["position_influence", "committee_influence", "party_influence"]
-        available_cols = [col for col in influence_cols if col in metadata.columns]
-
-        if available_cols:
-            stats_df = stats_df.merge(
-                metadata[["politician_name_cleaned"] + available_cols],
-                left_index=True,
-                right_on="politician_name_cleaned",
-                how="left",
-            )
-
-        # Fill missing values
-        for col in influence_cols:
-            if col not in stats_df.columns:
-                stats_df[col] = 1.0
-            else:
-                stats_df[col] = stats_df[col].fillna(1.0)
-
-        return stats_df
-
-    def _analyze_trading_timing(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Analyze timing patterns in political trading"""
-
-        # Convert date to datetime if not already
-        if "transaction_date_cleaned" in df.columns:
-            df["transaction_date_dt"] = pd.to_datetime(df["transaction_date_cleaned"])
-
-        # Days since last trade by politician
-        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])
-        df["days_since_last_trade"] = (
-            df.groupby("politician_name_cleaned")["transaction_date_dt"].diff().dt.days
-        )
-
-        # Trading frequency score (more frequent = higher score)
-        df["trading_frequency_score"] = np.where(
-            df["days_since_last_trade"].isna(),
-            1.0,
-            np.clip(30 / (df["days_since_last_trade"] + 1), 0, 2.0),
-        )
-
-        # Cluster trading detection (multiple trades in short timeframe)
-        df["cluster_trades"] = (
-            df.groupby("politician_name_cleaned")["days_since_last_trade"]
-            .rolling(window=5, min_periods=1)
-            .apply(lambda x: (x <= 7).sum())
-            .values
-        )
-
-        # Quarterly timing (end of quarter trading patterns)
-        df["quarter_end_trade"] = (
-            df["transaction_date_dt"].dt.month.isin([3, 6, 9, 12])
-            & (df["transaction_date_dt"].dt.day >= 25)
-        ).astype(int)
-
-        # Year-end trading
-        df["year_end_trade"] = (
-            (df["transaction_date_dt"].dt.month == 12) & (df["transaction_date_dt"].dt.day >= 20)
-        ).astype(int)
-
-        # Pre-earnings timing (approximate - would need earnings calendar)
-        df["potential_insider_timing"] = (df["days_since_last_trade"] <= 5).astype(int)
-
-        return df
-
-    def _analyze_committee_sector_alignment(
-        self, df: pd.DataFrame, metadata: Optional[pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Analyze alignment between committee assignments and traded sectors"""
-
-        # Simplified sector classification based on asset names
-        def classify_sector(asset_name):
-            if pd.isna(asset_name):
-                return "unknown"
-
-            asset_lower = str(asset_name).lower()
-
-            # Technology sector
-            tech_keywords = [
-                "tech",
-                "software",
-                "microsoft",
-                "apple",
-                "google",
-                "meta",
-                "facebook",
-                "amazon",
-                "netflix",
-                "tesla",
-                "nvidia",
-                "intel",
-            ]
-            if any(keyword in asset_lower for keyword in tech_keywords):
-                return "technology"
-
-            # Healthcare sector
-            health_keywords = [
-                "health",
-                "pharma",
-                "medical",
-                "bio",
-                "johnson",
-                "pfizer",
-                "merck",
-                "abbott",
-                "healthcare",
-            ]
-            if any(keyword in asset_lower for keyword in health_keywords):
-                return "healthcare"
-
-            # Financial sector
-            finance_keywords = [
-                "bank",
-                "financial",
-                "capital",
-                "credit",
-                "jpmorgan",
-                "bank of america",
-                "wells fargo",
-                "goldman",
-                "morgan stanley",
-            ]
-            if any(keyword in asset_lower for keyword in finance_keywords):
-                return "financial"
-
-            # Energy sector
-            energy_keywords = [
-                "energy",
-                "oil",
-                "gas",
-                "exxon",
-                "chevron",
-                "renewable",
-                "solar",
-                "wind",
-                "petroleum",
-            ]
-            if any(keyword in asset_lower for keyword in energy_keywords):
-                return "energy"
-
-            # Defense sector
-            defense_keywords = [
-                "defense",
-                "aerospace",
-                "boeing",
-                "lockheed",
-                "raytheon",
-                "general dynamics",
-                "northrop",
-            ]
-            if any(keyword in asset_lower for keyword in defense_keywords):
-                return "defense"
-
-            return "other"
-
-        df["sector_classification"] = df["asset_name_cleaned"].apply(classify_sector)
-
-        # Committee-sector alignment score
-        if metadata is not None and "committees" in metadata.columns:
-
-            def calculate_alignment_score(politician, sector):
-                politician_metadata = metadata[metadata["politician_name_cleaned"] == politician]
-                if politician_metadata.empty:
-                    return 0.5  # Neutral alignment
-
-                committees = str(politician_metadata.iloc[0]["committees"]).lower()
-
-                # Check for relevant committee memberships
-                alignment_score = 0.5  # Base neutral score
-
-                if sector == "technology" and any(
-                    keyword in committees for keyword in ["technology", "commerce", "judiciary"]
-                ):
-                    alignment_score = 0.9
-                elif sector == "healthcare" and "health" in committees:
-                    alignment_score = 0.9
-                elif sector == "financial" and "financial" in committees:
-                    alignment_score = 0.9
-                elif sector == "energy" and any(
-                    keyword in committees for keyword in ["energy", "environment"]
-                ):
-                    alignment_score = 0.9
-                elif sector == "defense" and any(
-                    keyword in committees for keyword in ["defense", "armed services"]
-                ):
-                    alignment_score = 0.9
-
-                return alignment_score
-
-            df["committee_sector_alignment"] = df.apply(
-                lambda row: calculate_alignment_score(
-                    row["politician_name_cleaned"], row["sector_classification"]
-                ),
-                axis=1,
-            )
-        else:
-            df["committee_sector_alignment"] = 0.5  # Neutral when no metadata
-
-        return df
-
-    def _analyze_party_clustering(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Analyze clustering of trades by party affiliation"""
-
-        # Mock party assignment based on politician name patterns
-        # In real implementation, this would come from metadata
-        def assign_party(name):
-            # This is a simplified mock assignment
-            # In practice, this would come from politician metadata
-            republican_indicators = ["mitch", "mcconnell", "cruz", "rubio", "romney"]
-            democrat_indicators = ["pelosi", "schumer", "warren", "sanders"]
-
-            name_lower = str(name).lower()
-            if any(indicator in name_lower for indicator in republican_indicators):
-                return "Republican"
-            elif any(indicator in name_lower for indicator in democrat_indicators):
-                return "Democrat"
-            else:
-                return "Independent"  # Default
-
-        df["estimated_party"] = df["politician_name_cleaned"].apply(assign_party)
-
-        # Party-based trading patterns
-        party_stats = (
-            df.groupby(["estimated_party", "sector_classification"])
-            .agg(
-                {
-                    "transaction_amount_cleaned": ["count", "mean"],
-                    "transaction_type_cleaned": lambda x: (x == "buy").mean(),
-                }
-            )
-            .round(3)
-        )
-
-        party_stats.columns = ["party_sector_trades", "party_avg_amount", "party_buy_ratio"]
-
-        # Calculate party consensus score for each trade
-        df = df.merge(
-            party_stats,
-            left_on=["estimated_party", "sector_classification"],
-            right_index=True,
-            how="left",
-        )
-
-        # Party divergence score (how much this trade differs from party norm)
-        df["party_divergence"] = abs(
-            (df["transaction_type_cleaned"] == "buy").astype(int) - df["party_buy_ratio"]
-        )
-
-        return df
-
-    def _extract_seniority_features(
-        self, df: pd.DataFrame, metadata: Optional[pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Extract features related to politician seniority and experience"""
-
-        # Estimate seniority based on trading patterns (mock implementation)
-        politician_first_trade = df.groupby("politician_name_cleaned")["transaction_date_dt"].min()
-
-        # Calculate trading experience (days since first recorded trade)
-        df = df.merge(
-            politician_first_trade.rename("first_trade_date"),
-            left_on="politician_name_cleaned",
-            right_index=True,
-            how="left",
-        )
-
-        df["trading_experience_days"] = (df["transaction_date_dt"] - df["first_trade_date"]).dt.days
-
-        # Experience categories
-        df["experience_category"] = pd.cut(
-            df["trading_experience_days"],
-            bins=[0, 90, 365, 1095, float("inf")],
-            labels=["novice", "intermediate", "experienced", "veteran"],
-        )
-
-        # Seniority influence score
-        df["seniority_influence"] = np.clip(np.log1p(df["trading_experience_days"]) / 10, 0, 2.0)
-
-        return df
-
-
-class CongressionalTrackingFeatures:
-    """Features based on congressional trading disclosure tracking"""
-
-    def __init__(self, config: Optional[PoliticalFeatureConfig] = None):
-        self.config = config or PoliticalFeatureConfig()
-
-    def extract_disclosure_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract features related to disclosure timing and patterns"""
-
-        # Disclosure delay analysis
-        if "disclosure_date" in df.columns and "transaction_date_cleaned" in df.columns:
-            df["disclosure_date_dt"] = pd.to_datetime(df["disclosure_date"])
-            df["disclosure_delay_days"] = (
-                df["disclosure_date_dt"] - df["transaction_date_dt"]
-            ).dt.days
-
-            # Disclosure compliance scoring
-            df["timely_disclosure"] = (df["disclosure_delay_days"] <= 45).astype(int)
-            df["late_disclosure"] = (df["disclosure_delay_days"] > 45).astype(int)
-            df["very_late_disclosure"] = (df["disclosure_delay_days"] > 90).astype(int)
-
-            # Disclosure pattern analysis
-            df["disclosure_compliance_score"] = np.clip(
-                1.0 - (df["disclosure_delay_days"] / 90), 0, 1
-            )
-        else:
-            # Default values when disclosure dates not available
-            df["disclosure_delay_days"] = 30
-            df["timely_disclosure"] = 1
-            df["disclosure_compliance_score"] = 0.8
-
-        # Transaction size vs disclosure timing
-        df["large_trade_late_disclosure"] = (
-            (df["transaction_amount_cleaned"] > df["transaction_amount_cleaned"].quantile(0.9))
-            & (df["disclosure_delay_days"] > 45)
-        ).astype(int)
-
-        return df
-
-    def extract_reporting_patterns(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract patterns in reporting behavior"""
-
-        # Reporting frequency by politician
-        politician_reporting = df.groupby("politician_name_cleaned").agg(
-            {
-                "disclosure_delay_days": ["mean", "std", "max"],
-                "timely_disclosure": "mean",
-                "transaction_amount_cleaned": "count",
-            }
-        )
-
-        politician_reporting.columns = [
-            "avg_disclosure_delay",
-            "disclosure_delay_std",
-            "max_disclosure_delay",
-            "timely_disclosure_rate",
-            "total_disclosures",
-        ]
-
-        # Reporting reliability score
-        politician_reporting["reporting_reliability"] = (
-            politician_reporting["timely_disclosure_rate"] * 0.7
-            + np.clip(1.0 - politician_reporting["avg_disclosure_delay"] / 90, 0, 1) * 0.3
-        )
-
-        # Merge back to main dataframe
-        df = df.merge(
-            politician_reporting[["reporting_reliability", "avg_disclosure_delay"]],
-            left_on="politician_name_cleaned",
-            right_index=True,
-            how="left",
-        )
-
-        return df
-
-
-class PolicyImpactFeatures:
-    """Features related to policy announcements and their market impact"""
-
-    def __init__(self, config: Optional[PoliticalFeatureConfig] = None):
-        self.config = config or PoliticalFeatureConfig()
-
-    def extract_policy_timing_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract features related to policy announcement timing"""
-
-        # Mock policy events (in practice, this would come from news/policy databases)
-        policy_events = self._generate_mock_policy_events(df)
-
-        if policy_events:
-            df = self._analyze_policy_trade_timing(df, policy_events)
-        else:
-            # Default values when no policy data available
-            df["days_to_policy_event"] = 999
-            df["pre_policy_trade"] = 0
-            df["post_policy_trade"] = 0
-            df["policy_relevant_trade"] = 0
-
-        return df
-
-    def _generate_mock_policy_events(self, df: pd.DataFrame) -> List[Dict]:
-        """Generate mock policy events for demonstration"""
-        # In practice, this would be loaded from external policy/news data
-
-        date_range = pd.date_range(
-            start=df["transaction_date_dt"].min(),
-            end=df["transaction_date_dt"].max(),
-            freq="30D",
-        )
-
-        policy_events = []
-        sectors = ["technology", "healthcare", "financial", "energy"]
-
-        for date in date_range:
-            for sector in sectors:
-                if np.random.random() < 0.1:  # 10% chance of policy event
-                    policy_events.append(
-                        {
-                            "date": date,
-                            "sector": sector,
-                            "event_type": np.random.choice(
-                                ["regulation", "legislation", "hearing"]
-                            ),
-                            "impact_score": np.random.uniform(0.1, 1.0),
-                        }
-                    )
-
-        return policy_events
-
-    def _analyze_policy_trade_timing(
-        self, df: pd.DataFrame, policy_events: List[Dict]
-    ) -> pd.DataFrame:
-        """Analyze timing of trades relative to policy events"""
-
-        # Convert policy events to DataFrame
-        policy_df = pd.DataFrame(policy_events)
-        policy_df["date"] = pd.to_datetime(policy_df["date"])
-
-        # For each trade, find the nearest policy event in the same sector
-        def find_nearest_policy_event(row):
-            sector = row["sector_classification"]
-            trade_date = row["transaction_date_dt"]
-
-            # Filter policy events for the same sector
-            sector_events = policy_df[policy_df["sector"] == sector]
-
-            if sector_events.empty:
-                return 999, 0  # No relevant events
-
-            # Calculate days to each event
-            days_diff = (sector_events["date"] - trade_date).dt.days
-
-            # Find nearest event (past or future)
-            abs_days = days_diff.abs()
-            nearest_idx = abs_days.idxmin()
-
-            nearest_days = days_diff.loc[nearest_idx]
-            impact_score = sector_events.loc[nearest_idx, "impact_score"]
-
-            return nearest_days, impact_score
-
-        # Apply to all trades
-        policy_analysis = df.apply(find_nearest_policy_event, axis=1, result_type="expand")
-        df["days_to_policy_event"] = policy_analysis[0]
-        df["policy_impact_score"] = policy_analysis[1]
-
-        # Policy-related trade flags
-        df["pre_policy_trade"] = (
-            (df["days_to_policy_event"] > 0) & (df["days_to_policy_event"] <= 7)
-        ).astype(int)
-
-        df["post_policy_trade"] = (
-            (df["days_to_policy_event"] < 0) & (df["days_to_policy_event"] >= -7)
-        ).astype(int)
-
-        df["policy_relevant_trade"] = (abs(df["days_to_policy_event"]) <= 7).astype(int)
-
-        # Potential insider trading indicator
-        df["potential_insider_policy"] = (
-            (df["pre_policy_trade"] == 1)
-            & (df["policy_impact_score"] > 0.7)
-            & (df["transaction_amount_cleaned"] > df["transaction_amount_cleaned"].quantile(0.8))
-        ).astype(int)
-
-        return df