mcli-framework 7.0.0 (mcli_framework-7.0.0-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/preprocessing/feature_extractors.py
@@ -0,0 +1,491 @@
+"""Feature extraction utilities for ML preprocessing"""
+
+import numpy as np
+import pandas as pd
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple, Union
+from dataclasses import dataclass
+import re
+import logging
+from collections import defaultdict, Counter
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FeatureExtractionStats:
+    """Statistics about feature extraction operations"""
+
+    total_records: int
+    features_extracted: int
+    failed_extractions: int
+    feature_counts: Dict[str, int]
+    extraction_time: float
+
+
+class PoliticianFeatureExtractor:
+    """Extracts features related to politicians"""
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        self.config = config or {}
+        self.politician_cache = {}
+        self.party_mapping = {
+            "democrat": "D",
+            "democratic": "D",
+            "republican": "R",
+            "independent": "I",
+            "libertarian": "L",
+        }
+
+    def extract_politician_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract politician-related features"""
+        df_features = df.copy()
+
+        # Basic politician features
+        df_features = self._extract_name_features(df_features)
+        df_features = self._extract_trading_patterns(df_features)
+        df_features = self._extract_frequency_features(df_features)
+        df_features = self._extract_timing_features(df_features)
+
+        return df_features
+
+    def _extract_name_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract features from politician names"""
+        if "politician_name_cleaned" not in df.columns:
+            return df
+
+        # Name length and word count
+        df["politician_name_length"] = df["politician_name_cleaned"].str.len()
+        df["politician_name_word_count"] = df["politician_name_cleaned"].str.split().str.len()
+
+        # Common prefixes/suffixes
+        df["has_jr_sr"] = df["politician_name_cleaned"].str.contains(
+            r"\b(Jr|Sr|III|IV|II)\b", case=False
+        )
+        df["has_hyphen"] = df["politician_name_cleaned"].str.contains("-")
+
+        # Name frequency encoding (politician trading frequency)
+        name_counts = df["politician_name_cleaned"].value_counts()
+        df["politician_trading_frequency"] = df["politician_name_cleaned"].map(name_counts)
+
+        return df
+
+    def _extract_trading_patterns(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract trading pattern features for each politician"""
+        if "politician_name_cleaned" not in df.columns:
+            return df
+
+        # Group by politician to calculate patterns
+        politician_stats = (
+            df.groupby("politician_name_cleaned")
+            .agg(
+                {
+                    "transaction_amount_cleaned": ["count", "sum", "mean", "std", "min", "max"],
+                    "transaction_type_cleaned": lambda x: x.value_counts().to_dict(),
+                }
+            )
+            .reset_index()
+        )
+
+        # Flatten column names
+        politician_stats.columns = [
+            "politician_name_cleaned",
+            "total_transactions",
+            "total_volume",
+            "avg_transaction_size",
+            "transaction_size_std",
+            "min_transaction_size",
+            "max_transaction_size",
+            "transaction_type_dist",
+        ]
+
+        # Calculate buy/sell ratios
+        def extract_buy_sell_ratio(type_dist):
+            if not isinstance(type_dist, dict):
+                return 0.5, 0, 0
+
+            buys = type_dist.get("buy", 0)
+            sells = type_dist.get("sell", 0)
+            total = buys + sells
+
+            if total == 0:
+                return 0.5, 0, 0
+
+            buy_ratio = buys / total
+            return buy_ratio, buys, sells
+
+        politician_stats[["buy_ratio", "total_buys", "total_sells"]] = pd.DataFrame(
+            politician_stats["transaction_type_dist"].apply(extract_buy_sell_ratio).tolist()
+        )
+
+        # Risk tolerance (std/mean of transaction sizes)
+        politician_stats["transaction_volatility"] = (
+            politician_stats["transaction_size_std"] / politician_stats["avg_transaction_size"]
+        ).fillna(0)
+
+        # Merge back to main dataframe
+        feature_cols = [
+            "total_transactions",
+            "total_volume",
+            "avg_transaction_size",
+            "transaction_size_std",
+            "min_transaction_size",
+            "max_transaction_size",
+            "buy_ratio",
+            "total_buys",
+            "total_sells",
+            "transaction_volatility",
+        ]
+
+        df = df.merge(
+            politician_stats[["politician_name_cleaned"] + feature_cols],
+            on="politician_name_cleaned",
+            how="left",
+        )
+
+        return df
+
+    def _extract_frequency_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract trading frequency features"""
+        if not all(
+            col in df.columns for col in ["politician_name_cleaned", "transaction_date_cleaned"]
+        ):
+            return df
+
+        # Convert date to datetime
+        df["transaction_date_dt"] = pd.to_datetime(df["transaction_date_cleaned"])
+
+        # Sort by politician and date
+        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])
+
+        # Calculate days between trades for each politician
+        df["days_since_last_trade"] = (
+            df.groupby("politician_name_cleaned")["transaction_date_dt"].diff().dt.days
+        )
+
+        # Trading frequency metrics
+        politician_freq = (
+            df.groupby("politician_name_cleaned")
+            .agg({"days_since_last_trade": ["mean", "std", "min", "max"]})
+            .reset_index()
+        )
+
+        politician_freq.columns = [
+            "politician_name_cleaned",
+            "avg_days_between_trades",
+            "days_between_trades_std",
+            "min_days_between_trades",
+            "max_days_between_trades",
+        ]
+
+        # Calculate trading consistency
+        politician_freq["trading_consistency"] = 1 / (
+            1 + politician_freq["days_between_trades_std"].fillna(0)
+        )
+
+        df = df.merge(politician_freq, on="politician_name_cleaned", how="left")
+
+        return df
+
+    def _extract_timing_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract timing-related features"""
+        if "transaction_date_dt" not in df.columns:
+            return df
+
+        # Day of week (Monday=0, Sunday=6)
+        df["transaction_day_of_week"] = df["transaction_date_dt"].dt.dayofweek
+
+        # Month
+        df["transaction_month"] = df["transaction_date_dt"].dt.month
+
+        # Quarter
+        df["transaction_quarter"] = df["transaction_date_dt"].dt.quarter
+
+        # Year
+        df["transaction_year"] = df["transaction_date_dt"].dt.year
+
+        # Is weekend
+        df["is_weekend"] = df["transaction_day_of_week"].isin([5, 6])
+
+        # Is end of month
+        df["is_end_of_month"] = df["transaction_date_dt"].dt.day >= 25
+
+        # Is end of quarter
+        df["is_end_of_quarter"] = (
+            df["transaction_date_dt"].dt.month.isin([3, 6, 9, 12]) & df["is_end_of_month"]
+        )
+
+        return df
+
+
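For orientation, a minimal usage sketch of the PoliticianFeatureExtractor above (not part of the package diff): the rows are invented, and the import path assumes the module shown above.

import pandas as pd

# Assumed module path, for illustration only
from mcli.ml.preprocessing.feature_extractors import PoliticianFeatureExtractor

# Hypothetical trades, using the *_cleaned column names the extractor expects
trades = pd.DataFrame(
    {
        "politician_name_cleaned": ["jane doe", "jane doe", "john roe"],
        "transaction_amount_cleaned": [15000.0, 45000.0, 250000.0],
        "transaction_type_cleaned": ["buy", "sell", "buy"],
        "transaction_date_cleaned": ["2024-01-05", "2024-02-20", "2024-03-27"],
    }
)

features = PoliticianFeatureExtractor().extract_politician_features(trades)
print(features[["politician_name_cleaned", "total_transactions", "buy_ratio", "is_end_of_quarter"]])

Per-politician aggregates (total_transactions, buy_ratio, transaction_volatility, the days-between-trades statistics) are merged back onto every transaction row, so the output keeps one row per trade.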
+class MarketFeatureExtractor:
+    """Extracts market-related features"""
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        self.config = config or {}
+        self.sector_mapping = self._load_sector_mapping()
+
+    def extract_market_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract market-related features"""
+        df_features = df.copy()
+
+        # Asset features
+        df_features = self._extract_asset_features(df_features)
+        df_features = self._extract_ticker_features(df_features)
+        df_features = self._extract_market_cap_features(df_features)
+
+        return df_features
+
+    def _extract_asset_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract features from asset names"""
+        if "asset_name_cleaned" not in df.columns:
+            return df
+
+        # Asset name characteristics
+        df["asset_name_length"] = df["asset_name_cleaned"].str.len()
+        df["asset_name_word_count"] = df["asset_name_cleaned"].str.split().str.len()
+
+        # Common asset types
+        df["is_tech_stock"] = df["asset_name_cleaned"].str.contains(
+            r"\b(tech|software|computer|data|digital|cyber|internet|online|cloud)\b", case=False
+        )
+
+        df["is_bank_stock"] = df["asset_name_cleaned"].str.contains(
+            r"\b(bank|financial|credit|capital|trust|investment)\b", case=False
+        )
+
+        df["is_pharma_stock"] = df["asset_name_cleaned"].str.contains(
+            r"\b(pharma|biotech|medical|health|drug|therapeutic)\b", case=False
+        )
+
+        df["is_energy_stock"] = df["asset_name_cleaned"].str.contains(
+            r"\b(energy|oil|gas|petroleum|renewable|solar|wind)\b", case=False
+        )
+
+        # Asset popularity (trading frequency)
+        asset_counts = df["asset_name_cleaned"].value_counts()
+        df["asset_trading_frequency"] = df["asset_name_cleaned"].map(asset_counts)
+
+        return df
+
+    def _extract_ticker_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract features from stock tickers"""
+        if "ticker_cleaned" not in df.columns:
+            return df
+
+        # Ticker characteristics
+        df["ticker_length"] = df["ticker_cleaned"].str.len()
+        df["ticker_has_numbers"] = df["ticker_cleaned"].str.contains(r"\d")
+
+        # Ticker popularity
+        ticker_counts = df["ticker_cleaned"].value_counts()
+        df["ticker_trading_frequency"] = df["ticker_cleaned"].map(ticker_counts)
+
+        # Map to sectors (simplified)
+        df["estimated_sector"] = df["ticker_cleaned"].map(self.sector_mapping).fillna("unknown")
+
+        return df
+
+    def _extract_market_cap_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract market cap related features (placeholder)"""
+        # This would typically connect to external APIs
+        # For now, create estimated features based on transaction amounts
+
+        if "transaction_amount_cleaned" not in df.columns:
+            return df
+
+        # Estimate market cap tier based on typical trading amounts
+        def estimate_market_cap_tier(amount):
+            if amount < 10000:
+                return "large_cap"  # Large institutions trade large caps in smaller amounts
+            elif amount < 50000:
+                return "mid_cap"
+            else:
+                return "small_cap"  # Large amounts might indicate smaller, riskier stocks
+
+        df["estimated_market_cap_tier"] = df["transaction_amount_cleaned"].apply(
+            estimate_market_cap_tier
+        )
+
+        return df
+
+    def _load_sector_mapping(self) -> Dict[str, str]:
+        """Load ticker to sector mapping (simplified)"""
+        # This would typically be loaded from a data file or API
+        return {
+            "AAPL": "technology",
+            "MSFT": "technology",
+            "GOOGL": "technology",
+            "GOOG": "technology",
+            "AMZN": "consumer_discretionary",
+            "TSLA": "consumer_discretionary",
+            "META": "technology",
+            "JPM": "financials",
+            "BAC": "financials",
+            "WFC": "financials",
+            "XOM": "energy",
+            "CVX": "energy",
+            "JNJ": "healthcare",
+            "PFE": "healthcare",
+            "UNH": "healthcare",
+        }
+
+
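A similar sketch for MarketFeatureExtractor, again on invented rows: tickers outside the hard-coded sector map fall back to "unknown", and the market-cap tier is only the amount-based heuristic defined above.

import pandas as pd

# Assumed module path, for illustration only
from mcli.ml.preprocessing.feature_extractors import MarketFeatureExtractor

# Hypothetical holdings; only the columns this extractor inspects
holdings = pd.DataFrame(
    {
        "asset_name_cleaned": ["apple inc", "exxon mobil oil corp", "first national bank"],
        "ticker_cleaned": ["AAPL", "XOM", "FNB"],
        "transaction_amount_cleaned": [5000.0, 25000.0, 120000.0],
    }
)

features = MarketFeatureExtractor().extract_market_features(holdings)
print(features[["ticker_cleaned", "estimated_sector", "is_energy_stock", "estimated_market_cap_tier"]])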
+class TemporalFeatureExtractor:
+    """Extracts temporal features for time series analysis"""
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        self.config = config or {}
+        self.lookback_periods = config.get("lookback_periods", [7, 30, 90, 365])
+
+    def extract_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract temporal features"""
+        df_features = df.copy()
+
+        if "transaction_date_dt" not in df.columns:
+            return df_features
+
+        # Sort by date
+        df_features = df_features.sort_values("transaction_date_dt")
+
+        # Rolling features
+        df_features = self._extract_rolling_features(df_features)
+        df_features = self._extract_lag_features(df_features)
+        df_features = self._extract_trend_features(df_features)
+
+        return df_features
+
+    def _extract_rolling_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract rolling window features"""
+        # Set date as index temporarily
+        df_indexed = df.set_index("transaction_date_dt")
+
+        for period in self.lookback_periods:
+            # Rolling transaction counts
+            df[f"transactions_last_{period}d"] = (
+                df_indexed.groupby("politician_name_cleaned")
+                .rolling(f"{period}D")["transaction_amount_cleaned"]
+                .count()
+                .reset_index(level=0, drop=True)
+            )
+
+            # Rolling volume
+            df[f"volume_last_{period}d"] = (
+                df_indexed.groupby("politician_name_cleaned")
+                .rolling(f"{period}D")["transaction_amount_cleaned"]
+                .sum()
+                .reset_index(level=0, drop=True)
+            )
+
+            # Rolling average transaction size
+            df[f"avg_transaction_last_{period}d"] = (
+                df_indexed.groupby("politician_name_cleaned")
+                .rolling(f"{period}D")["transaction_amount_cleaned"]
+                .mean()
+                .reset_index(level=0, drop=True)
+            )
+
+        return df
+
+    def _extract_lag_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract lagged features"""
+        lag_periods = [1, 7, 30]
+
+        for lag in lag_periods:
+            # Lag transaction amounts
+            df[f"transaction_amount_lag_{lag}"] = df.groupby("politician_name_cleaned")[
+                "transaction_amount_cleaned"
+            ].shift(lag)
+
+            # Lag transaction types
+            df[f"transaction_type_lag_{lag}"] = df.groupby("politician_name_cleaned")[
+                "transaction_type_cleaned"
+            ].shift(lag)
+
+        return df
+
+    def _extract_trend_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract trend features"""
+        # Calculate percentage changes
+        df["amount_pct_change_1d"] = df.groupby("politician_name_cleaned")[
+            "transaction_amount_cleaned"
+        ].pct_change()
+
+        df["amount_pct_change_7d"] = df.groupby("politician_name_cleaned")[
+            "transaction_amount_cleaned"
+        ].pct_change(periods=7)
+
+        # Moving averages
+        df["amount_ma_7"] = (
+            df.groupby("politician_name_cleaned")["transaction_amount_cleaned"]
+            .rolling(window=7, min_periods=1)
+            .mean()
+            .reset_index(level=0, drop=True)
+        )
+
+        df["amount_ma_30"] = (
+            df.groupby("politician_name_cleaned")["transaction_amount_cleaned"]
+            .rolling(window=30, min_periods=1)
+            .mean()
+            .reset_index(level=0, drop=True)
+        )
+
+        # Trend indicators
+        df["amount_above_ma_7"] = df["transaction_amount_cleaned"] > df["amount_ma_7"]
+        df["amount_above_ma_30"] = df["transaction_amount_cleaned"] > df["amount_ma_30"]
+
+        return df
+
+
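Two notes on TemporalFeatureExtractor: __init__ reads lookback_periods via config.get(...) on the raw argument rather than self.config, so it needs a config dict (even an empty one) to be constructed, and the rolling features build on pandas' time-based groupby rolling windows. A standalone sketch of that idiom, on invented data:

import pandas as pd

# Invented transactions, sorted by date as extract_temporal_features does
toy = pd.DataFrame(
    {
        "politician_name_cleaned": ["jane doe", "jane doe", "jane doe", "john roe"],
        "transaction_amount_cleaned": [15000.0, 32000.0, 8000.0, 60000.0],
        "transaction_date_dt": pd.to_datetime(
            ["2024-01-02", "2024-01-20", "2024-02-10", "2024-01-05"]
        ),
    }
).sort_values("transaction_date_dt")

# Trailing 30-day transaction count per politician, the same
# groupby(...).rolling("30D") idiom used in _extract_rolling_features
rolling_counts = (
    toy.set_index("transaction_date_dt")
    .groupby("politician_name_cleaned")["transaction_amount_cleaned"]
    .rolling("30D")
    .count()
)
print(rolling_counts)

The class computes the analogous count, sum, and mean for each configured lookback period (7, 30, 90, and 365 days by default).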
+class SentimentFeatureExtractor:
+    """Extracts sentiment and text-based features"""
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        self.config = config or {}
+        self.positive_words = ["gain", "profit", "up", "rise", "bull", "growth", "strong"]
+        self.negative_words = ["loss", "down", "bear", "decline", "weak", "fall", "drop"]
+
+    def extract_sentiment_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Extract sentiment features from text fields"""
+        df_features = df.copy()
+
+        # Asset name sentiment
+        if "asset_name_cleaned" in df.columns:
+            df_features = self._extract_text_sentiment(
+                df_features, "asset_name_cleaned", "asset_name"
+            )
+
+        # News sentiment (placeholder for future news integration)
+        df_features["news_sentiment_score"] = 0.0  # Neutral baseline
+        df_features["news_volume"] = 0  # No news volume baseline
+
+        return df_features
+
+    def _extract_text_sentiment(
+        self, df: pd.DataFrame, text_column: str, prefix: str
+    ) -> pd.DataFrame:
+        """Extract sentiment from text column"""
+        if text_column not in df.columns:
+            return df
+
+        text_series = df[text_column].fillna("").str.lower()
+
+        # Count positive and negative words
+        positive_count = text_series.apply(
+            lambda x: sum(1 for word in self.positive_words if word in x)
+        )
+        negative_count = text_series.apply(
+            lambda x: sum(1 for word in self.negative_words if word in x)
+        )
+
+        # Calculate sentiment score
+        total_sentiment_words = positive_count + negative_count
+        sentiment_score = np.where(
+            total_sentiment_words > 0, (positive_count - negative_count) / total_sentiment_words, 0
+        )
+
+        df[f"{prefix}_positive_words"] = positive_count
+        df[f"{prefix}_negative_words"] = negative_count
+        df[f"{prefix}_sentiment_score"] = sentiment_score
+
+        return df
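Finally, a minimal sketch of SentimentFeatureExtractor on invented asset names. The score is (positive - negative) / total over simple substring matches against the two word lists above, and the news_* columns are constant placeholders.

import pandas as pd

# Assumed module path, for illustration only
from mcli.ml.preprocessing.feature_extractors import SentimentFeatureExtractor

# Hypothetical asset names
assets = pd.DataFrame({"asset_name_cleaned": ["strong growth fund", "oil decline etf", "acme corp"]})

features = SentimentFeatureExtractor().extract_sentiment_features(assets)
print(
    features[
        [
            "asset_name_cleaned",
            "asset_name_positive_words",
            "asset_name_negative_words",
            "asset_name_sentiment_score",
        ]
    ]
)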