mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic. Click here for more details.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,852 @@
1
+ """
2
+ Main workflow orchestrator for politician trading data collection
3
+ """
4
+
5
+ import asyncio
6
+ import logging
7
+ import uuid
8
+ from datetime import datetime, timedelta
9
+ from typing import List, Dict, Any, Optional
10
+
11
+ from .config import WorkflowConfig
12
+ from .database import PoliticianTradingDB
13
+ from .models import DataPullJob, Politician, TradingDisclosure, PoliticianRole
14
+ from .scrapers import (
15
+ CongressTradingScraper,
16
+ QuiverQuantScraper,
17
+ EUParliamentScraper,
18
+ PoliticianMatcher,
19
+ run_uk_parliament_workflow,
20
+ run_california_workflow,
21
+ run_eu_member_states_workflow,
22
+ run_us_states_workflow,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class PoliticianTradingWorkflow:
29
+ """Main workflow for collecting politician trading data"""
30
+
31
    def __init__(self, config: Optional[WorkflowConfig] = None):
        """Initialize the workflow orchestrator.

        Args:
            config: Workflow configuration; falls back to
                ``WorkflowConfig.default()`` when omitted.
        """
        self.config = config or WorkflowConfig.default()
        # Database access layer used for politicians, disclosures, and jobs.
        self.db = PoliticianTradingDB(self.config)
        # Known politicians used for name matching; populated by
        # _load_politicians() at the start of a full collection run.
        self.politicians: List[Politician] = []
35
+
36
+ async def run_full_collection(self) -> Dict[str, Any]:
37
+ """Run complete data collection workflow"""
38
+ logger.info("Starting full politician trading data collection")
39
+
40
+ results = {
41
+ "started_at": datetime.utcnow().isoformat(),
42
+ "jobs": {},
43
+ "summary": {"total_new_disclosures": 0, "total_updated_disclosures": 0, "errors": []},
44
+ }
45
+
46
+ try:
47
+ # Ensure database schema
48
+ schema_ok = await self.db.ensure_schema()
49
+ if not schema_ok:
50
+ raise Exception("Database schema verification failed")
51
+
52
+ # Load existing politicians for matching
53
+ await self._load_politicians()
54
+
55
+ # Run US Congress collection
56
+ us_results = await self._collect_us_congress_data()
57
+ results["jobs"]["us_congress"] = us_results
58
+ results["summary"]["total_new_disclosures"] += us_results.get("new_disclosures", 0)
59
+ results["summary"]["total_updated_disclosures"] += us_results.get(
60
+ "updated_disclosures", 0
61
+ )
62
+
63
+ # Run EU Parliament collection
64
+ eu_results = await self._collect_eu_parliament_data()
65
+ results["jobs"]["eu_parliament"] = eu_results
66
+ results["summary"]["total_new_disclosures"] += eu_results.get("new_disclosures", 0)
67
+ results["summary"]["total_updated_disclosures"] += eu_results.get(
68
+ "updated_disclosures", 0
69
+ )
70
+
71
+ # Run California collection
72
+ ca_results = await self._collect_california_data()
73
+ results["jobs"]["california"] = ca_results
74
+ results["summary"]["total_new_disclosures"] += ca_results.get("new_disclosures", 0)
75
+ results["summary"]["total_updated_disclosures"] += ca_results.get(
76
+ "updated_disclosures", 0
77
+ )
78
+
79
+ # Run EU member states collection
80
+ eu_states_results = await self._collect_eu_member_states_data()
81
+ results["jobs"]["eu_member_states"] = eu_states_results
82
+ results["summary"]["total_new_disclosures"] += eu_states_results.get("new_disclosures", 0)
83
+ results["summary"]["total_updated_disclosures"] += eu_states_results.get(
84
+ "updated_disclosures", 0
85
+ )
86
+
87
+ # Run US states collection
88
+ us_states_results = await self._collect_us_states_data()
89
+ results["jobs"]["us_states"] = us_states_results
90
+ results["summary"]["total_new_disclosures"] += us_states_results.get("new_disclosures", 0)
91
+ results["summary"]["total_updated_disclosures"] += us_states_results.get(
92
+ "updated_disclosures", 0
93
+ )
94
+
95
+ results["completed_at"] = datetime.utcnow().isoformat()
96
+ results["status"] = "completed"
97
+
98
+ except Exception as e:
99
+ logger.error(f"Full collection workflow failed: {e}")
100
+ results["error"] = str(e)
101
+ results["status"] = "failed"
102
+ results["summary"]["errors"].append(str(e))
103
+
104
+ logger.info(f"Workflow completed: {results['summary']}")
105
+ return results
106
+
107
+ async def _load_politicians(self):
108
+ """Load politicians from database for matching"""
109
+ try:
110
+ # For now, create some sample politicians
111
+ # In production, you'd load from a politicians API or database
112
+ sample_politicians = [
113
+ Politician(
114
+ id=str(uuid.uuid4()),
115
+ first_name="Nancy",
116
+ last_name="Pelosi",
117
+ full_name="Nancy Pelosi",
118
+ role=PoliticianRole.US_HOUSE_REP,
119
+ party="Democratic",
120
+ state_or_country="CA",
121
+ district="5",
122
+ bioguide_id="P000197",
123
+ ),
124
+ Politician(
125
+ id=str(uuid.uuid4()),
126
+ first_name="Ted",
127
+ last_name="Cruz",
128
+ full_name="Ted Cruz",
129
+ role=PoliticianRole.US_SENATOR,
130
+ party="Republican",
131
+ state_or_country="TX",
132
+ bioguide_id="C001098",
133
+ ),
134
+ ]
135
+
136
+ # Store politicians in database
137
+ for politician in sample_politicians:
138
+ politician_id = await self.db.upsert_politician(politician)
139
+ politician.id = politician_id
140
+ self.politicians.append(politician)
141
+
142
+ logger.info(f"Loaded {len(self.politicians)} politicians for matching")
143
+
144
+ except Exception as e:
145
+ logger.error(f"Failed to load politicians: {e}")
146
+ self.politicians = []
147
+
148
    async def _collect_us_congress_data(self) -> Dict[str, Any]:
        """Collect US Congress trading data.

        Scrapes House/Senate official disclosures plus QuiverQuant as a
        backup source, matches each disclosure to a politician (creating one
        when no match exists), and inserts or updates disclosure rows.

        Returns:
            Job-result dict with "job_id", "status", "new_disclosures",
            "updated_disclosures", and "errors". A DataPullJob row mirroring
            these counters is also persisted via update_data_pull_job().
        """
        job_id = await self.db.create_data_pull_job("us_congress", self.config.to_serializable_dict())

        # Lightweight result returned to the caller; the DataPullJob below is
        # the persisted record — both track overlapping counters.
        job_result = {
            "job_id": job_id,
            "status": "running",
            "new_disclosures": 0,
            "updated_disclosures": 0,
            "errors": [],
        }

        job = DataPullJob(
            id=job_id, job_type="us_congress", status="running", started_at=datetime.utcnow()
        )

        try:
            logger.info("Starting US Congress data collection")

            # Initialize scrapers
            congress_scraper = CongressTradingScraper(self.config.scraping)
            quiver_scraper = QuiverQuantScraper(self.config.scraping)

            all_disclosures = []

            # Scrape official sources
            async with congress_scraper:
                house_disclosures = await congress_scraper.scrape_house_disclosures()
                senate_disclosures = await congress_scraper.scrape_senate_disclosures()
                all_disclosures.extend(house_disclosures)
                all_disclosures.extend(senate_disclosures)

            # Scrape backup sources; parse_quiver_trade may return a falsy
            # value for unparseable rows, which are silently dropped here.
            async with quiver_scraper:
                quiver_trades = await quiver_scraper.scrape_congress_trades()
                for trade_data in quiver_trades:
                    disclosure = quiver_scraper.parse_quiver_trade(trade_data)
                    if disclosure:
                        all_disclosures.append(disclosure)

            job.records_found = len(all_disclosures)

            # Process disclosures
            matcher = PoliticianMatcher(self.politicians)

            for disclosure in all_disclosures:
                try:
                    # Find matching politician by the scraped name; assumes
                    # raw_data carries a "politician_name" key — scrapers set
                    # this (TODO confirm for all sources).
                    politician_name = disclosure.raw_data.get("politician_name", "")
                    if not politician_name or politician_name.strip() == "":
                        logger.warning("Skipping disclosure with empty politician name")
                        job.records_failed += 1
                        continue

                    # Filter out obviously invalid politician names
                    # (_is_invalid_politician_name is defined elsewhere in
                    # this class — outside this view).
                    if self._is_invalid_politician_name(politician_name):
                        logger.warning(f"Skipping disclosure with invalid politician name: {politician_name}")
                        job.records_failed += 1
                        continue

                    politician = matcher.find_politician(politician_name)

                    if not politician:
                        # Create new politician with real name from scraper
                        logger.info(f"Creating new politician for: {politician_name}")

                        # Parse real name into first/last components: first
                        # token is the first name, the rest is the last name.
                        name_parts = politician_name.strip().split()
                        if len(name_parts) >= 2:
                            first_name = name_parts[0]
                            last_name = " ".join(name_parts[1:])
                        else:
                            first_name = politician_name.strip()
                            last_name = ""

                        # Create politician with real name - use generic role for now
                        new_politician = Politician(
                            first_name=first_name,
                            last_name=last_name,
                            full_name=politician_name.strip(),
                            role=PoliticianRole.US_HOUSE_REP,  # Default role
                        )
                        politician_id = await self.db.upsert_politician(new_politician)
                        disclosure.politician_id = politician_id
                    else:
                        disclosure.politician_id = politician.id

                    # Check if disclosure already exists (dedupe key:
                    # politician, transaction date, asset, transaction type).
                    existing = await self.db.find_disclosure_by_transaction(
                        disclosure.politician_id,
                        disclosure.transaction_date,
                        disclosure.asset_name,
                        disclosure.transaction_type.value,
                    )

                    if existing:
                        # Update existing record
                        disclosure.id = existing.id
                        if await self.db.update_disclosure(disclosure):
                            job.records_updated += 1
                            job_result["updated_disclosures"] += 1
                        else:
                            job.records_failed += 1
                    else:
                        # Insert new record
                        disclosure_id = await self.db.insert_disclosure(disclosure)
                        if disclosure_id:
                            job.records_new += 1
                            job_result["new_disclosures"] += 1
                        else:
                            job.records_failed += 1

                    # NOTE(review): records_processed also counts items whose
                    # insert/update failed above — confirm that's intended.
                    job.records_processed += 1

                except Exception as e:
                    # Per-item failures are logged and counted, never fatal.
                    logger.error(f"Failed to process disclosure: {e}")
                    job.records_failed += 1
                    job_result["errors"].append(str(e))

            job.status = "completed"
            job.completed_at = datetime.utcnow()
            job_result["status"] = "completed"

        except Exception as e:
            # Scrape/setup failures mark the whole job failed.
            logger.error(f"US Congress collection failed: {e}")
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            job_result["status"] = "failed"
            job_result["errors"].append(str(e))

        # Update job status (persisted regardless of success or failure).
        await self.db.update_data_pull_job(job)

        return job_result
283
+
284
    async def _collect_eu_parliament_data(self) -> Dict[str, Any]:
        """Collect EU Parliament trading/financial data.

        Scrapes MEP declarations and inserts them, attaching a placeholder
        "Sample MEP" politician to any disclosure that lacks one.

        Returns:
            Job-result dict with "job_id", "status", "new_disclosures",
            "updated_disclosures", and "errors".
        """
        job_id = await self.db.create_data_pull_job("eu_parliament", self.config.to_serializable_dict())

        job_result = {
            "job_id": job_id,
            "status": "running",
            "new_disclosures": 0,
            "updated_disclosures": 0,
            "errors": [],
        }

        job = DataPullJob(
            id=job_id, job_type="eu_parliament", status="running", started_at=datetime.utcnow()
        )

        try:
            logger.info("Starting EU Parliament data collection")

            scraper = EUParliamentScraper(self.config.scraping)

            async with scraper:
                disclosures = await scraper.scrape_mep_declarations()

            job.records_found = len(disclosures)

            # Process EU disclosures (similar to US processing)
            for disclosure in disclosures:
                try:
                    # For EU, we'd need a different politician matching strategy
                    # For now, create a sample politician
                    if not disclosure.politician_id:
                        # Create placeholder politician. NOTE(review): the
                        # placeholder is upserted per disclosure — presumably
                        # upsert_politician dedupes; verify to avoid duplicates.
                        eu_politician = Politician(
                            first_name="Sample",
                            last_name="MEP",
                            full_name="Sample MEP",
                            role=PoliticianRole.EU_MEP,
                            state_or_country="EU",
                        )
                        politician_id = await self.db.upsert_politician(eu_politician)
                        disclosure.politician_id = politician_id

                    # Insert disclosure (no duplicate check here, unlike the
                    # US Congress path).
                    disclosure_id = await self.db.insert_disclosure(disclosure)
                    if disclosure_id:
                        job.records_new += 1
                        job_result["new_disclosures"] += 1
                    else:
                        job.records_failed += 1

                    job.records_processed += 1

                except Exception as e:
                    # Per-item failures are logged and counted, never fatal.
                    logger.error(f"Failed to process EU disclosure: {e}")
                    job.records_failed += 1
                    job_result["errors"].append(str(e))

            job.status = "completed"
            job.completed_at = datetime.utcnow()
            job_result["status"] = "completed"

        except Exception as e:
            logger.error(f"EU Parliament collection failed: {e}")
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            job_result["status"] = "failed"
            job_result["errors"].append(str(e))

        # Update job status (persisted regardless of success or failure).
        await self.db.update_data_pull_job(job)

        return job_result
358
+
359
    async def _collect_uk_parliament_data(self) -> Dict[str, Any]:
        """Collect UK Parliament financial interests data.

        Runs the UK Parliament scraper workflow, resolves each disclosure to
        a politician (matching by name, creating a new UK MP, or falling back
        to a member-ID placeholder), then inserts the disclosures.

        NOTE(review): this collector is not invoked by run_full_collection()
        in this file — confirm whether that is intentional.

        Returns:
            Job-result dict with "job_id", "status", "new_disclosures",
            "updated_disclosures", and "errors".
        """
        job_id = await self.db.create_data_pull_job("uk_parliament", self.config.to_serializable_dict())

        job_result = {
            "job_id": job_id,
            "status": "running",
            "new_disclosures": 0,
            "updated_disclosures": 0,
            "errors": [],
        }

        job = DataPullJob(
            id=job_id,
            job_type="uk_parliament",
            status="running",
            started_at=datetime.utcnow(),
        )

        try:
            # Collect UK Parliament financial interests
            logger.info("Starting UK Parliament financial interests collection")
            uk_disclosures = await run_uk_parliament_workflow(self.config.scraping)

            job.records_found = len(uk_disclosures)

            # Process each disclosure
            matcher = PoliticianMatcher(self.politicians)

            for disclosure in uk_disclosures:
                try:
                    # For UK Parliament, find or create politician using real
                    # names from scrapers.
                    if not disclosure.politician_id:
                        # Extract real politician name from raw data; assumes
                        # the scraper populated "politician_name" and/or
                        # "uk_member_id" — TODO confirm against the scraper.
                        politician_name = disclosure.raw_data.get("politician_name", "")

                        if not politician_name or politician_name.strip() == "":
                            # Fallback to using member ID if no name available
                            if disclosure.raw_data.get("uk_member_id"):
                                logger.warning(f"Using member ID as fallback for UK disclosure: {disclosure.raw_data.get('uk_member_id')}")
                                uk_politician = Politician(
                                    first_name="UK",
                                    last_name="MP",
                                    full_name=f"UK MP {disclosure.raw_data.get('uk_member_id')}",
                                    role=PoliticianRole.UK_MP,
                                    state_or_country="UK",
                                )
                                politician_id = await self.db.upsert_politician(uk_politician)
                                disclosure.politician_id = politician_id
                            else:
                                # No name and no member ID: skip entirely.
                                logger.warning("Skipping UK disclosure with no politician name or member ID")
                                job.records_failed += 1
                                continue
                        else:
                            # Filter out obviously invalid politician names
                            # (_is_invalid_politician_name defined elsewhere
                            # in this class).
                            if self._is_invalid_politician_name(politician_name):
                                logger.warning(f"Skipping UK disclosure with invalid politician name: {politician_name}")
                                job.records_failed += 1
                                continue

                            # Try to find existing politician
                            politician = matcher.find_politician(politician_name)

                            if not politician:
                                # Create new politician with real name from scraper.
                                # Parse real name into first/last components:
                                # first token is the first name, rest is last.
                                name_parts = politician_name.strip().split()
                                if len(name_parts) >= 2:
                                    first_name = name_parts[0]
                                    last_name = " ".join(name_parts[1:])
                                else:
                                    first_name = politician_name.strip()
                                    last_name = ""

                                # Create politician with REAL name
                                uk_politician = Politician(
                                    first_name=first_name,
                                    last_name=last_name,
                                    full_name=politician_name.strip(),
                                    role=PoliticianRole.UK_MP,
                                    state_or_country="UK",
                                )
                                politician_id = await self.db.upsert_politician(uk_politician)
                                disclosure.politician_id = politician_id
                                logger.info(f"Created new UK MP: {politician_name}")
                            else:
                                disclosure.politician_id = politician.id

                    # Insert disclosure (no duplicate check on this path).
                    disclosure_id = await self.db.insert_disclosure(disclosure)
                    if disclosure_id:
                        job.records_new += 1
                        job_result["new_disclosures"] += 1
                    else:
                        job.records_failed += 1

                    job.records_processed += 1

                except Exception as e:
                    # Per-item failures are logged and counted, never fatal.
                    logger.error(f"Failed to process UK Parliament disclosure: {e}")
                    job.records_failed += 1
                    job_result["errors"].append(str(e))

            job.status = "completed"
            job.completed_at = datetime.utcnow()
            job_result["status"] = "completed"

        except Exception as e:
            logger.error(f"UK Parliament collection failed: {e}")
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            job_result["status"] = "failed"
            job_result["errors"].append(str(e))

        # Update job status (persisted regardless of success or failure).
        await self.db.update_data_pull_job(job)

        return job_result
478
+
479
    async def _collect_california_data(self) -> Dict[str, Any]:
        """Collect California NetFile and state disclosure data.

        Runs the California scraper workflow and inserts the resulting
        disclosures, creating a jurisdiction-labelled placeholder politician
        only when a disclosure has neither a politician_id nor a scraped name.

        Returns:
            Job-result dict with "job_id", "status", "new_disclosures",
            "updated_disclosures", and "errors".
        """
        job_id = await self.db.create_data_pull_job("california", self.config.to_serializable_dict())

        job_result = {
            "job_id": job_id,
            "status": "running",
            "new_disclosures": 0,
            "updated_disclosures": 0,
            "errors": [],
        }

        job = DataPullJob(
            id=job_id,
            job_type="california",
            status="running",
            started_at=datetime.utcnow(),
        )

        try:
            # Collect California financial disclosures
            logger.info("Starting California financial disclosures collection")
            california_disclosures = await run_california_workflow(self.config.scraping)

            job.records_found = len(california_disclosures)

            # Process each disclosure
            # NOTE(review): matcher is constructed but never used in this
            # method — named disclosures are not matched here.
            matcher = PoliticianMatcher(self.politicians)

            for disclosure in california_disclosures:
                try:
                    # For California, create politician if needed
                    if not disclosure.politician_id:
                        # Extract politician name from raw data or create placeholder
                        politician_name = disclosure.raw_data.get("politician_name", "")
                        if not politician_name:
                            # Create placeholder for California politician
                            ca_politician = Politician(
                                first_name="California",
                                last_name="Politician",
                                full_name=f"California Politician {disclosure.raw_data.get('jurisdiction', 'Unknown')}",
                                role=PoliticianRole.US_HOUSE_REP,  # Could be state-level role
                                state_or_country="CA",
                            )
                            politician_id = await self.db.upsert_politician(ca_politician)
                            disclosure.politician_id = politician_id
                        # NOTE(review): when a name IS present there is no
                        # else-branch, so the disclosure is inserted with a
                        # null politician_id — confirm this is intended.

                    # Insert disclosure (no duplicate check on this path).
                    disclosure_id = await self.db.insert_disclosure(disclosure)
                    if disclosure_id:
                        job.records_new += 1
                        job_result["new_disclosures"] += 1
                    else:
                        job.records_failed += 1

                    job.records_processed += 1

                except Exception as e:
                    # Per-item failures are logged and counted, never fatal.
                    logger.error(f"Failed to process California disclosure: {e}")
                    job.records_failed += 1
                    job_result["errors"].append(str(e))

            job.status = "completed"
            job.completed_at = datetime.utcnow()
            job_result["status"] = "completed"

        except Exception as e:
            logger.error(f"California collection failed: {e}")
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            job_result["status"] = "failed"
            job_result["errors"].append(str(e))

        # Update job status (persisted regardless of success or failure).
        await self.db.update_data_pull_job(job)

        return job_result
557
+
558
    async def _collect_eu_member_states_data(self) -> Dict[str, Any]:
        """Collect EU member states financial disclosure data.

        Runs the EU member-states scraper workflow and inserts the resulting
        disclosures, creating a country-labelled placeholder politician for
        any disclosure that lacks a politician_id.

        Returns:
            Job-result dict with "job_id", "status", "new_disclosures",
            "updated_disclosures", and "errors".
        """
        job_id = await self.db.create_data_pull_job("eu_member_states", self.config.to_serializable_dict())

        job_result = {
            "job_id": job_id,
            "status": "running",
            "new_disclosures": 0,
            "updated_disclosures": 0,
            "errors": [],
        }

        job = DataPullJob(
            id=job_id,
            job_type="eu_member_states",
            status="running",
            started_at=datetime.utcnow(),
        )

        try:
            # Collect EU member states financial disclosures
            logger.info("Starting EU member states financial disclosures collection")
            eu_states_disclosures = await run_eu_member_states_workflow(self.config.scraping)

            job.records_found = len(eu_states_disclosures)

            # Process each disclosure
            # NOTE(review): matcher is constructed but never used here.
            matcher = PoliticianMatcher(self.politicians)

            for disclosure in eu_states_disclosures:
                try:
                    # For EU member states, create politician if needed
                    if not disclosure.politician_id:
                        # Extract politician details from raw data; assumes
                        # scrapers set "country" and "source" keys.
                        country = disclosure.raw_data.get("country", "Unknown")
                        source = disclosure.raw_data.get("source", "unknown")

                        # Map country to appropriate role; unmapped countries
                        # default to EU_MEP below.
                        role_map = {
                            "Germany": PoliticianRole.GERMAN_BUNDESTAG,
                            "France": PoliticianRole.FRENCH_DEPUTY,
                            "Italy": PoliticianRole.ITALIAN_DEPUTY,
                            "Spain": PoliticianRole.SPANISH_DEPUTY,
                            "Netherlands": PoliticianRole.DUTCH_MP
                        }

                        politician_role = role_map.get(country, PoliticianRole.EU_MEP)

                        # Create placeholder politician
                        eu_politician = Politician(
                            first_name=country,
                            last_name="Politician",
                            full_name=f"{country} Politician ({source})",
                            role=politician_role,
                            state_or_country=country,
                        )
                        politician_id = await self.db.upsert_politician(eu_politician)
                        disclosure.politician_id = politician_id

                    # Insert disclosure (no duplicate check on this path).
                    disclosure_id = await self.db.insert_disclosure(disclosure)
                    if disclosure_id:
                        job.records_new += 1
                        job_result["new_disclosures"] += 1
                    else:
                        job.records_failed += 1

                    job.records_processed += 1

                except Exception as e:
                    # Per-item failures are logged and counted, never fatal.
                    logger.error(f"Failed to process EU member state disclosure: {e}")
                    job.records_failed += 1
                    job_result["errors"].append(str(e))

            job.status = "completed"
            job.completed_at = datetime.utcnow()
            job_result["status"] = "completed"

        except Exception as e:
            logger.error(f"EU member states collection failed: {e}")
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            job_result["status"] = "failed"
            job_result["errors"].append(str(e))

        # Update job status (persisted regardless of success or failure).
        await self.db.update_data_pull_job(job)

        return job_result
648
+
649
+ async def _collect_us_states_data(self) -> Dict[str, Any]:
650
+ """Collect US states financial disclosure data"""
651
+ job_id = await self.db.create_data_pull_job("us_states", self.config.to_serializable_dict())
652
+
653
+ job_result = {
654
+ "job_id": job_id,
655
+ "status": "running",
656
+ "new_disclosures": 0,
657
+ "updated_disclosures": 0,
658
+ "errors": [],
659
+ }
660
+
661
+ job = DataPullJob(
662
+ id=job_id,
663
+ job_type="us_states",
664
+ status="running",
665
+ started_at=datetime.utcnow(),
666
+ )
667
+
668
+ try:
669
+ # Collect US states financial disclosures
670
+ logger.info("Starting US states financial disclosures collection")
671
+ us_states_disclosures = await run_us_states_workflow(self.config.scraping)
672
+
673
+ job.records_found = len(us_states_disclosures)
674
+
675
+ # Process each disclosure
676
+ matcher = PoliticianMatcher(self.politicians)
677
+
678
+ for disclosure in us_states_disclosures:
679
+ try:
680
+ # For US states, find or create politician using real names from scrapers
681
+ if not disclosure.politician_id:
682
+ # Extract real politician name from raw data
683
+ politician_name = disclosure.raw_data.get("politician_name", "")
684
+ if not politician_name or politician_name.strip() == "":
685
+ logger.warning("Skipping US states disclosure with empty politician name")
686
+ job.records_failed += 1
687
+ continue
688
+
689
+ # Filter out obviously invalid politician names
690
+ if self._is_invalid_politician_name(politician_name):
691
+ logger.warning(f"Skipping US states disclosure with invalid politician name: {politician_name}")
692
+ job.records_failed += 1
693
+ continue
694
+
695
+ # Try to find existing politician
696
+ politician = matcher.find_politician(politician_name)
697
+
698
+ if not politician:
699
+ # Create new politician with real name from scraper
700
+ state = disclosure.raw_data.get("state", "Unknown")
701
+ source = disclosure.raw_data.get("source", "unknown")
702
+
703
+ # Map state to appropriate role
704
+ role_map = {
705
+ "Texas": PoliticianRole.TEXAS_STATE_OFFICIAL,
706
+ "New York": PoliticianRole.NEW_YORK_STATE_OFFICIAL,
707
+ "Florida": PoliticianRole.FLORIDA_STATE_OFFICIAL,
708
+ "Illinois": PoliticianRole.ILLINOIS_STATE_OFFICIAL,
709
+ "Pennsylvania": PoliticianRole.PENNSYLVANIA_STATE_OFFICIAL,
710
+ "Massachusetts": PoliticianRole.MASSACHUSETTS_STATE_OFFICIAL,
711
+ "California": PoliticianRole.CALIFORNIA_STATE_OFFICIAL
712
+ }
713
+
714
+ politician_role = role_map.get(state, PoliticianRole.US_HOUSE_REP)
715
+
716
+ # Parse real name into first/last components
717
+ name_parts = politician_name.strip().split()
718
+ if len(name_parts) >= 2:
719
+ first_name = name_parts[0]
720
+ last_name = " ".join(name_parts[1:])
721
+ else:
722
+ first_name = politician_name.strip()
723
+ last_name = ""
724
+
725
+ # Create politician with REAL name
726
+ state_politician = Politician(
727
+ first_name=first_name,
728
+ last_name=last_name,
729
+ full_name=politician_name.strip(),
730
+ role=politician_role,
731
+ state_or_country=state,
732
+ )
733
+ politician_id = await self.db.upsert_politician(state_politician)
734
+ disclosure.politician_id = politician_id
735
+ logger.info(f"Created new US state politician: {politician_name}")
736
+ else:
737
+ disclosure.politician_id = politician.id
738
+
739
+ # Insert disclosure
740
+ disclosure_id = await self.db.insert_disclosure(disclosure)
741
+ if disclosure_id:
742
+ job.records_new += 1
743
+ job_result["new_disclosures"] += 1
744
+ else:
745
+ job.records_failed += 1
746
+
747
+ job.records_processed += 1
748
+
749
+ except Exception as e:
750
+ logger.error(f"Failed to process US state disclosure: {e}")
751
+ job.records_failed += 1
752
+ job_result["errors"].append(str(e))
753
+
754
+ job.status = "completed"
755
+ job.completed_at = datetime.utcnow()
756
+ job_result["status"] = "completed"
757
+
758
+ except Exception as e:
759
+ logger.error(f"US states collection failed: {e}")
760
+ job.status = "failed"
761
+ job.error_message = str(e)
762
+ job.completed_at = datetime.utcnow()
763
+ job_result["status"] = "failed"
764
+ job_result["errors"].append(str(e))
765
+
766
+ # Update job status
767
+ await self.db.update_data_pull_job(job)
768
+
769
+ return job_result
770
+
771
+ async def get_status(self) -> Dict[str, Any]:
772
+ """Get current workflow status"""
773
+ try:
774
+ return await self.db.get_job_status()
775
+ except Exception as e:
776
+ logger.error(f"Failed to get status: {e}")
777
+ return {"error": str(e)}
778
+
779
+ async def run_quick_check(self) -> Dict[str, Any]:
780
+ """Run a quick status check without full data collection"""
781
+ try:
782
+ status = await self.get_status()
783
+
784
+ # Add some additional quick checks
785
+ status["database_connection"] = "ok" if self.db.client else "failed"
786
+ status["config_loaded"] = "ok" if self.config else "failed"
787
+ status["timestamp"] = datetime.utcnow().isoformat()
788
+
789
+ return status
790
+
791
+ except Exception as e:
792
+ return {"error": str(e), "timestamp": datetime.utcnow().isoformat(), "status": "failed"}
793
+
794
+ def _is_invalid_politician_name(self, name: str) -> bool:
795
+ """Check if a name is obviously not a politician name"""
796
+ if not name or len(name.strip()) < 2:
797
+ return True
798
+
799
+ # Check for proper name structure first (before converting to uppercase)
800
+ original_name = name.strip()
801
+ import re
802
+ if not re.search(r'[A-Za-z]', original_name): # Should have at least one letter
803
+ return True
804
+ if re.search(r'^\d+', original_name): # Starting with numbers
805
+ return True
806
+
807
+ # Now convert to uppercase for pattern matching
808
+ name = original_name.upper()
809
+
810
+ # Filter out obvious non-names
811
+ invalid_patterns = [
812
+ # Asset tickers and financial instruments
813
+ r'^-.*CT$', # -ETHEREUMCT, -DOGCT patterns
814
+ r'^[A-Z]{2,5}$', # Short all-caps (likely tickers)
815
+ r'^\$', # Starting with $
816
+ # Municipal and financial terms
817
+ r'MUNICIPAL',
818
+ r'BOND',
819
+ r'TRUST',
820
+ r'FUND',
821
+ r'CORP',
822
+ r'INC\.$',
823
+ r'LLC$',
824
+ r'LP$',
825
+ # Common non-name patterns
826
+ r'^UNKNOWN',
827
+ r'^TEST',
828
+ r'^SAMPLE',
829
+ # Crypto/financial asset patterns
830
+ r'ETHEREUM',
831
+ r'BITCOIN',
832
+ r'CRYPTO',
833
+ ]
834
+
835
+ for pattern in invalid_patterns:
836
+ if re.search(pattern, name):
837
+ return True
838
+
839
+ return False
840
+
841
+
842
+ # Standalone functions for cron job usage
843
async def run_politician_trading_collection() -> Dict[str, Any]:
    """Cron entry point: build a workflow and run the full data collection."""
    return await PoliticianTradingWorkflow().run_full_collection()
847
+
848
+
849
async def check_politician_trading_status() -> Dict[str, Any]:
    """Cron entry point: build a workflow and run its quick status check."""
    return await PoliticianTradingWorkflow().run_quick_check()