mcli-framework 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic. Click here for more details.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
"""Feature extraction utilities for ML preprocessing"""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
import re
|
|
9
|
+
import logging
|
|
10
|
+
from collections import defaultdict, Counter
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FeatureExtractionStats:
    """Statistics about feature extraction operations"""

    # Total number of input records processed in the run.
    total_records: int
    # Number of feature values successfully extracted.
    features_extracted: int
    # Number of records/features where extraction failed.
    failed_extractions: int
    # Per-feature extraction counts, keyed by feature name.
    feature_counts: Dict[str, int]
    # Wall-clock duration of the extraction run, in seconds.
    extraction_time: float
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PoliticianFeatureExtractor:
    """Extracts features related to politicians.

    Operates on DataFrames produced by the cleaning stage (columns suffixed
    with ``_cleaned``). Each private helper is a no-op when its required
    columns are missing, so the extractor degrades gracefully on partial data.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the extractor.

        Args:
            config: Optional configuration mapping; defaults to empty.
        """
        self.config = config or {}
        self.politician_cache = {}  # reserved for per-politician lookups
        # Normalize free-text party labels to single-letter codes.
        self.party_mapping = {
            "democrat": "D",
            "democratic": "D",
            "republican": "R",
            "independent": "I",
            "libertarian": "L",
        }

    def extract_politician_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract politician-related features.

        Args:
            df: Input DataFrame with cleaned trading records.

        Returns:
            A copy of ``df`` augmented with name, trading-pattern,
            frequency, and timing feature columns.
        """
        df_features = df.copy()

        # Basic politician features (each helper checks its own prerequisites)
        df_features = self._extract_name_features(df_features)
        df_features = self._extract_trading_patterns(df_features)
        df_features = self._extract_frequency_features(df_features)
        df_features = self._extract_timing_features(df_features)

        return df_features

    def _extract_name_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract features from politician names (length, suffixes, frequency)."""
        if "politician_name_cleaned" not in df.columns:
            return df

        # Name length and word count
        df["politician_name_length"] = df["politician_name_cleaned"].str.len()
        df["politician_name_word_count"] = df["politician_name_cleaned"].str.split().str.len()

        # Common prefixes/suffixes. Non-capturing group (?:...) avoids the
        # pandas UserWarning about match groups in str.contains.
        df["has_jr_sr"] = df["politician_name_cleaned"].str.contains(
            r"\b(?:Jr|Sr|III|IV|II)\b", case=False
        )
        df["has_hyphen"] = df["politician_name_cleaned"].str.contains("-")

        # Name frequency encoding (politician trading frequency)
        name_counts = df["politician_name_cleaned"].value_counts()
        df["politician_trading_frequency"] = df["politician_name_cleaned"].map(name_counts)

        return df

    def _extract_trading_patterns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract per-politician aggregate trading-pattern features.

        Aggregates transaction amounts and type distributions per politician,
        derives buy/sell ratios and a volatility proxy, then merges the
        aggregates back onto every row.
        """
        if "politician_name_cleaned" not in df.columns:
            return df

        # Group by politician to calculate patterns
        politician_stats = (
            df.groupby("politician_name_cleaned")
            .agg(
                {
                    "transaction_amount_cleaned": ["count", "sum", "mean", "std", "min", "max"],
                    "transaction_type_cleaned": lambda x: x.value_counts().to_dict(),
                }
            )
            .reset_index()
        )

        # Flatten the MultiIndex columns produced by .agg
        politician_stats.columns = [
            "politician_name_cleaned",
            "total_transactions",
            "total_volume",
            "avg_transaction_size",
            "transaction_size_std",
            "min_transaction_size",
            "max_transaction_size",
            "transaction_type_dist",
        ]

        # Calculate buy/sell ratios from the per-politician type distribution
        def extract_buy_sell_ratio(type_dist):
            """Return (buy_ratio, total_buys, total_sells); neutral 0.5 when unknown."""
            if not isinstance(type_dist, dict):
                return 0.5, 0, 0

            buys = type_dist.get("buy", 0)
            sells = type_dist.get("sell", 0)
            total = buys + sells

            if total == 0:
                return 0.5, 0, 0

            buy_ratio = buys / total
            return buy_ratio, buys, sells

        politician_stats[["buy_ratio", "total_buys", "total_sells"]] = pd.DataFrame(
            politician_stats["transaction_type_dist"].apply(extract_buy_sell_ratio).tolist()
        )

        # Risk tolerance proxy: coefficient of variation of transaction sizes
        politician_stats["transaction_volatility"] = (
            politician_stats["transaction_size_std"] / politician_stats["avg_transaction_size"]
        ).fillna(0)

        # Merge the aggregates back onto every row of the main dataframe
        feature_cols = [
            "total_transactions",
            "total_volume",
            "avg_transaction_size",
            "transaction_size_std",
            "min_transaction_size",
            "max_transaction_size",
            "buy_ratio",
            "total_buys",
            "total_sells",
            "transaction_volatility",
        ]

        df = df.merge(
            politician_stats[["politician_name_cleaned"] + feature_cols],
            on="politician_name_cleaned",
            how="left",
        )

        return df

    def _extract_frequency_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract trading frequency features.

        Also creates the ``transaction_date_dt`` datetime column that
        ``_extract_timing_features`` depends on.
        """
        if not all(
            col in df.columns for col in ["politician_name_cleaned", "transaction_date_cleaned"]
        ):
            return df

        # Convert date to datetime
        df["transaction_date_dt"] = pd.to_datetime(df["transaction_date_cleaned"])

        # Sort by politician and date so diff() measures consecutive trades
        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])

        # Days between consecutive trades for each politician
        df["days_since_last_trade"] = (
            df.groupby("politician_name_cleaned")["transaction_date_dt"].diff().dt.days
        )

        # Per-politician trading frequency metrics
        politician_freq = (
            df.groupby("politician_name_cleaned")
            .agg({"days_since_last_trade": ["mean", "std", "min", "max"]})
            .reset_index()
        )

        politician_freq.columns = [
            "politician_name_cleaned",
            "avg_days_between_trades",
            "days_between_trades_std",
            "min_days_between_trades",
            "max_days_between_trades",
        ]

        # Trading consistency: 1 for perfectly regular traders, approaching 0
        # as the gap between trades becomes more erratic
        politician_freq["trading_consistency"] = 1 / (
            1 + politician_freq["days_between_trades_std"].fillna(0)
        )

        df = df.merge(politician_freq, on="politician_name_cleaned", how="left")

        return df

    def _extract_timing_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract calendar/timing features from ``transaction_date_dt``."""
        if "transaction_date_dt" not in df.columns:
            return df

        # Day of week (Monday=0, Sunday=6)
        df["transaction_day_of_week"] = df["transaction_date_dt"].dt.dayofweek

        # Month
        df["transaction_month"] = df["transaction_date_dt"].dt.month

        # Quarter
        df["transaction_quarter"] = df["transaction_date_dt"].dt.quarter

        # Year
        df["transaction_year"] = df["transaction_date_dt"].dt.year

        # Is weekend (Saturday=5, Sunday=6)
        df["is_weekend"] = df["transaction_day_of_week"].isin([5, 6])

        # Is end of month (day 25 or later)
        df["is_end_of_month"] = df["transaction_date_dt"].dt.day >= 25

        # Is end of quarter (end of month in March/June/September/December)
        df["is_end_of_quarter"] = (
            df["transaction_date_dt"].dt.month.isin([3, 6, 9, 12]) & df["is_end_of_month"]
        )

        return df
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class MarketFeatureExtractor:
    """Extracts market-related features.

    Derives features from asset names, tickers, and transaction amounts.
    Each helper is a no-op when its required column is absent.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the extractor.

        Args:
            config: Optional configuration mapping; defaults to empty.
        """
        self.config = config or {}
        self.sector_mapping = self._load_sector_mapping()

    def extract_market_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract market-related features.

        Args:
            df: Input DataFrame with cleaned trading records.

        Returns:
            A copy of ``df`` with asset, ticker, and market-cap features added.
        """
        df_features = df.copy()

        # Asset features (each helper checks its own prerequisites)
        df_features = self._extract_asset_features(df_features)
        df_features = self._extract_ticker_features(df_features)
        df_features = self._extract_market_cap_features(df_features)

        return df_features

    def _extract_asset_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract features from asset names (length, keyword flags, frequency)."""
        if "asset_name_cleaned" not in df.columns:
            return df

        # Asset name characteristics
        df["asset_name_length"] = df["asset_name_cleaned"].str.len()
        df["asset_name_word_count"] = df["asset_name_cleaned"].str.split().str.len()

        # Common asset types. Non-capturing groups (?:...) avoid the pandas
        # UserWarning about match groups in str.contains.
        df["is_tech_stock"] = df["asset_name_cleaned"].str.contains(
            r"\b(?:tech|software|computer|data|digital|cyber|internet|online|cloud)\b", case=False
        )

        df["is_bank_stock"] = df["asset_name_cleaned"].str.contains(
            r"\b(?:bank|financial|credit|capital|trust|investment)\b", case=False
        )

        df["is_pharma_stock"] = df["asset_name_cleaned"].str.contains(
            r"\b(?:pharma|biotech|medical|health|drug|therapeutic)\b", case=False
        )

        df["is_energy_stock"] = df["asset_name_cleaned"].str.contains(
            r"\b(?:energy|oil|gas|petroleum|renewable|solar|wind)\b", case=False
        )

        # Asset popularity (trading frequency)
        asset_counts = df["asset_name_cleaned"].value_counts()
        df["asset_trading_frequency"] = df["asset_name_cleaned"].map(asset_counts)

        return df

    def _extract_ticker_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract features from stock tickers."""
        if "ticker_cleaned" not in df.columns:
            return df

        # Ticker characteristics
        df["ticker_length"] = df["ticker_cleaned"].str.len()
        df["ticker_has_numbers"] = df["ticker_cleaned"].str.contains(r"\d")

        # Ticker popularity
        ticker_counts = df["ticker_cleaned"].value_counts()
        df["ticker_trading_frequency"] = df["ticker_cleaned"].map(ticker_counts)

        # Map to sectors (simplified); unmapped tickers become "unknown"
        df["estimated_sector"] = df["ticker_cleaned"].map(self.sector_mapping).fillna("unknown")

        return df

    def _extract_market_cap_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract market cap related features (placeholder).

        This would typically connect to external market-data APIs; for now
        a tier is estimated from the transaction amount alone.
        """
        if "transaction_amount_cleaned" not in df.columns:
            return df

        # Estimate market cap tier based on typical trading amounts
        def estimate_market_cap_tier(amount):
            """Map a transaction amount to a rough market-cap tier."""
            # Unknown amounts must not default to a tier: the original
            # comparisons sent NaN to "small_cap" because NaN < x is False.
            if pd.isna(amount):
                return "unknown"
            if amount < 10000:
                return "large_cap"  # Large institutions trade large caps in smaller amounts
            elif amount < 50000:
                return "mid_cap"
            else:
                return "small_cap"  # Large amounts might indicate smaller, riskier stocks

        df["estimated_market_cap_tier"] = df["transaction_amount_cleaned"].apply(
            estimate_market_cap_tier
        )

        return df

    def _load_sector_mapping(self) -> Dict[str, str]:
        """Load ticker to sector mapping (simplified).

        This would typically be loaded from a data file or API.
        """
        return {
            "AAPL": "technology",
            "MSFT": "technology",
            "GOOGL": "technology",
            "GOOG": "technology",
            "AMZN": "consumer_discretionary",
            "TSLA": "consumer_discretionary",
            "META": "technology",
            "JPM": "financials",
            "BAC": "financials",
            "WFC": "financials",
            "XOM": "energy",
            "CVX": "energy",
            "JNJ": "healthcare",
            "PFE": "healthcare",
            "UNH": "healthcare",
        }
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
class TemporalFeatureExtractor:
    """Extracts temporal features for time series analysis.

    Computes rolling-window, lag, and trend features per politician.
    Requires the ``transaction_date_dt`` datetime column (created by
    ``PoliticianFeatureExtractor._extract_frequency_features``).
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the extractor.

        Args:
            config: Optional configuration; the ``lookback_periods`` key
                (list of day counts) overrides the default windows.
        """
        self.config = config or {}
        # BUG FIX: read from self.config — the original read the raw
        # ``config`` argument, which raised AttributeError when config=None.
        self.lookback_periods = self.config.get("lookback_periods", [7, 30, 90, 365])

    def extract_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract temporal features.

        Args:
            df: Input DataFrame; must contain ``transaction_date_dt`` for
                any features to be added.

        Returns:
            A copy of ``df`` with rolling, lag, and trend feature columns.
        """
        df_features = df.copy()

        if "transaction_date_dt" not in df_features.columns:
            return df_features

        # Sort by date
        df_features = df_features.sort_values("transaction_date_dt")

        # Rolling features
        df_features = self._extract_rolling_features(df_features)
        df_features = self._extract_lag_features(df_features)
        df_features = self._extract_trend_features(df_features)

        return df_features

    def _extract_rolling_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract time-based rolling window features per politician.

        BUG FIX: the original assigned the date-indexed groupby/rolling
        result back into the integer-indexed frame, which misaligned the
        indexes and produced all-NaN columns. We now sort the frame by
        (politician, date) — the same order groupby emits its results in —
        and assign positionally via ``to_numpy()``.
        """
        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])

        # Date-indexed per-politician amount series for time-window rolling
        grouped = df.set_index("transaction_date_dt").groupby("politician_name_cleaned")[
            "transaction_amount_cleaned"
        ]

        for period in self.lookback_periods:
            window = f"{period}D"

            # Rolling transaction counts
            df[f"transactions_last_{period}d"] = grouped.rolling(window).count().to_numpy()

            # Rolling volume
            df[f"volume_last_{period}d"] = grouped.rolling(window).sum().to_numpy()

            # Rolling average transaction size
            df[f"avg_transaction_last_{period}d"] = grouped.rolling(window).mean().to_numpy()

        return df

    def _extract_lag_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract lagged features (previous values within each politician)."""
        lag_periods = [1, 7, 30]

        for lag in lag_periods:
            # Lag transaction amounts
            df[f"transaction_amount_lag_{lag}"] = df.groupby("politician_name_cleaned")[
                "transaction_amount_cleaned"
            ].shift(lag)

            # Lag transaction types
            df[f"transaction_type_lag_{lag}"] = df.groupby("politician_name_cleaned")[
                "transaction_type_cleaned"
            ].shift(lag)

        return df

    def _extract_trend_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract trend features (percentage changes, moving averages)."""
        # Percentage changes within each politician's history
        df["amount_pct_change_1d"] = df.groupby("politician_name_cleaned")[
            "transaction_amount_cleaned"
        ].pct_change()

        df["amount_pct_change_7d"] = df.groupby("politician_name_cleaned")[
            "transaction_amount_cleaned"
        ].pct_change(periods=7)

        # Moving averages (row-count windows; index alignment is preserved
        # here because groupby/rolling keeps the original row index)
        df["amount_ma_7"] = (
            df.groupby("politician_name_cleaned")["transaction_amount_cleaned"]
            .rolling(window=7, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )

        df["amount_ma_30"] = (
            df.groupby("politician_name_cleaned")["transaction_amount_cleaned"]
            .rolling(window=30, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )

        # Trend indicators
        df["amount_above_ma_7"] = df["transaction_amount_cleaned"] > df["amount_ma_7"]
        df["amount_above_ma_30"] = df["transaction_amount_cleaned"] > df["amount_ma_30"]

        return df
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
class SentimentFeatureExtractor:
    """Extracts sentiment and text-based features.

    Counts occurrences of small positive/negative keyword lists in text
    columns and derives a normalized sentiment score in [-1, 1].
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the extractor.

        Args:
            config: Optional configuration mapping; defaults to empty.
        """
        self.config = config or {}
        self.positive_words = ["gain", "profit", "up", "rise", "bull", "growth", "strong"]
        self.negative_words = ["loss", "down", "bear", "decline", "weak", "fall", "drop"]
        # BUG FIX: pre-compiled whole-word patterns. The original substring
        # test ("up" in x) counted false positives such as "up" inside
        # "supermarket" or "group".
        self._positive_patterns = [
            re.compile(rf"\b{re.escape(word)}\b") for word in self.positive_words
        ]
        self._negative_patterns = [
            re.compile(rf"\b{re.escape(word)}\b") for word in self.negative_words
        ]

    def extract_sentiment_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract sentiment features from text fields.

        Args:
            df: Input DataFrame with cleaned trading records.

        Returns:
            A copy of ``df`` with sentiment word counts, a per-text sentiment
            score, and placeholder news-sentiment columns.
        """
        df_features = df.copy()

        # Asset name sentiment
        if "asset_name_cleaned" in df.columns:
            df_features = self._extract_text_sentiment(
                df_features, "asset_name_cleaned", "asset_name"
            )

        # News sentiment (placeholder for future news integration)
        df_features["news_sentiment_score"] = 0.0  # Neutral baseline
        df_features["news_volume"] = 0  # No news volume baseline

        return df_features

    def _extract_text_sentiment(
        self, df: pd.DataFrame, text_column: str, prefix: str
    ) -> pd.DataFrame:
        """Extract sentiment from a text column.

        Adds ``{prefix}_positive_words``, ``{prefix}_negative_words``, and
        ``{prefix}_sentiment_score`` ((pos - neg) / (pos + neg), or 0 when
        no sentiment words are found).
        """
        if text_column not in df.columns:
            return df

        text_series = df[text_column].fillna("").str.lower()

        # Count how many keywords appear as whole words in each text
        positive_count = text_series.apply(
            lambda x: sum(1 for pattern in self._positive_patterns if pattern.search(x))
        )
        negative_count = text_series.apply(
            lambda x: sum(1 for pattern in self._negative_patterns if pattern.search(x))
        )

        # Normalized sentiment score; 0 (neutral) when no keywords matched
        total_sentiment_words = positive_count + negative_count
        sentiment_score = np.where(
            total_sentiment_words > 0, (positive_count - negative_count) / total_sentiment_words, 0
        )

        df[f"{prefix}_positive_words"] = positive_count
        df[f"{prefix}_negative_words"] = negative_count
        df[f"{prefix}_sentiment_score"] = sentiment_score

        return df
|