mcli-framework 7.0.0 (mcli_framework-7.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/workflow/model_service/download_and_run_efficient_models.py
@@ -0,0 +1,288 @@
+ #!/usr/bin/env python3
+ """
+ Script to download and run efficient models from Ollama using MCLI model service.
+
+ This script identifies the most efficient models in terms of compute and accuracy,
+ downloads them, and runs them using the MCLI model service.
+ """
+
+ import json
+ import os
+ import subprocess
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Optional
+
+ import click
+ import requests
+
+ # Add the parent directory to the path so we can import the model service
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+ from mcli.workflow.model_service.model_service import ModelManager, ModelService
+
+ # Efficient models from Ollama search results
+ EFFICIENT_MODELS = {
+     "phi3-mini": {
+         "name": "Phi-3 Mini",
+         "description": "Microsoft's lightweight 3.8B model with excellent reasoning",
+         "model_url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/pytorch_model.bin",
+         "tokenizer_url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/tokenizer.json",
+         "model_type": "text-generation",
+         "parameters": "3.8B",
+         "efficiency_score": 9.5,
+         "accuracy_score": 8.5,
+     },
+     "gemma3n-1b": {
+         "name": "Gemma3n 1B",
+         "description": "Google's efficient 1B model for everyday devices",
+         "model_url": "https://huggingface.co/google/gemma3n-1b/resolve/main/pytorch_model.bin",
+         "tokenizer_url": "https://huggingface.co/google/gemma3n-1b/resolve/main/tokenizer.json",
+         "model_type": "text-generation",
+         "parameters": "1B",
+         "efficiency_score": 9.8,
+         "accuracy_score": 7.5,
+     },
+     "tinyllama-1.1b": {
+         "name": "TinyLlama 1.1B",
+         "description": "Compact 1.1B model trained on 3 trillion tokens",
+         "model_url": "https://huggingface.co/jzhang38/TinyLlama-1.1B-Chat-v1.0/resolve/main/pytorch_model.bin",
+         "tokenizer_url": "https://huggingface.co/jzhang38/TinyLlama-1.1B-Chat-v1.0/resolve/main/tokenizer.json",
+         "model_type": "text-generation",
+         "parameters": "1.1B",
+         "efficiency_score": 9.7,
+         "accuracy_score": 7.0,
+     },
+     "phi4-mini-reasoning": {
+         "name": "Phi-4 Mini Reasoning",
+         "description": "Lightweight 3.8B model with advanced reasoning",
+         "model_url": "https://huggingface.co/microsoft/Phi-4-mini-reasoning/resolve/main/pytorch_model.bin",
+         "tokenizer_url": "https://huggingface.co/microsoft/Phi-4-mini-reasoning/resolve/main/tokenizer.json",
+         "model_type": "text-generation",
+         "parameters": "3.8B",
+         "efficiency_score": 9.3,
+         "accuracy_score": 8.8,
+     },
+ }
+
+
+ def get_system_info():
+     """Get system information for model selection"""
+     import psutil
+
+     # Get CPU info
+     cpu_count = psutil.cpu_count()
+     cpu_freq = psutil.cpu_freq()
+     memory_gb = psutil.virtual_memory().total / (1024**3)
+
+     # Check for GPU
+     try:
+         import torch
+
+         gpu_available = torch.cuda.is_available()
+         if gpu_available:
+             gpu_name = torch.cuda.get_device_name(0)
+             gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+         else:
+             gpu_name = "None"
+             gpu_memory = 0
+     except ImportError:
+         gpu_available = False
+         gpu_name = "PyTorch not available"
+         gpu_memory = 0
+
+     return {
+         "cpu_count": cpu_count,
+         "cpu_freq_mhz": cpu_freq.current if cpu_freq else 0,
+         "memory_gb": memory_gb,
+         "gpu_available": gpu_available,
+         "gpu_name": gpu_name,
+         "gpu_memory_gb": gpu_memory,
+     }
+
+
+ def recommend_model(system_info: Dict) -> str:
+     """Recommend the best model based on system capabilities"""
+     print("🔍 Analyzing system capabilities...")
+     print(f" CPU Cores: {system_info['cpu_count']}")
+     print(f" CPU Frequency: {system_info['cpu_freq_mhz']:.0f} MHz")
+     print(f" RAM: {system_info['memory_gb']:.1f} GB")
+     print(f" GPU: {system_info['gpu_name']}")
+     print(f" GPU Memory: {system_info['gpu_memory_gb']:.1f} GB")
+
+     # Simple recommendation logic
+     if system_info["gpu_available"] and system_info["gpu_memory_gb"] >= 4:
+         # Good GPU available
+         if system_info["memory_gb"] >= 16:
+             return "phi3-mini"  # Best balance for good hardware
+         else:
+             return "gemma3n-1b"  # More memory efficient
+     elif system_info["memory_gb"] >= 8:
+         # CPU-only with decent RAM
+         return "phi3-mini"
+     else:
+         # Limited resources
+         return "tinyllama-1.1b"
+
+
+ def download_and_setup_model(model_key: str, service: ModelService) -> Optional[str]:
+     """Download and setup a model using the MCLI service"""
+     model_info = EFFICIENT_MODELS[model_key]
+
+     print(f"\n🚀 Setting up {model_info['name']}...")
+     print(f" Description: {model_info['description']}")
+     print(f" Parameters: {model_info['parameters']}")
+     print(f" Efficiency Score: {model_info['efficiency_score']}/10")
+     print(f" Accuracy Score: {model_info['accuracy_score']}/10")
+
+     try:
+         # Add model to service
+         model_id = service.model_manager.add_model_from_url(
+             name=model_info["name"],
+             model_type=model_info["model_type"],
+             model_url=model_info["model_url"],
+             tokenizer_url=model_info["tokenizer_url"],
+             device="auto",
+             max_length=2048,
+             temperature=0.7,
+             top_p=0.9,
+             top_k=50,
+         )
+
+         print(f"✅ Model {model_info['name']} successfully added with ID: {model_id}")
+         return model_id
+
+     except Exception as e:
+         print(f"❌ Error setting up model {model_info['name']}: {e}")
+         return None
+
+
+ def test_model(service: ModelService, model_id: str, model_name: str):
+     """Test the model with sample prompts"""
+     print(f"\n🧪 Testing {model_name}...")
+
+     test_prompts = [
+         "Explain quantum computing in simple terms.",
+         "Write a Python function to calculate fibonacci numbers.",
+         "What are the benefits of renewable energy?",
+         "Translate 'Hello, how are you?' to Spanish.",
+     ]
+
+     for i, prompt in enumerate(test_prompts, 1):
+         print(f"\n📝 Test {i}: {prompt}")
+
+         try:
+             start_time = time.time()
+
+             # Generate response
+             response = service.model_manager.generate_text(
+                 model_id=model_id, prompt=prompt, max_length=512, temperature=0.7
+             )
+
+             execution_time = time.time() - start_time
+
+             print(f"⏱️ Response time: {execution_time:.2f} seconds")
+             print(f"🤖 Response: {response[:200]}{'...' if len(response) > 200 else ''}")
+
+         except Exception as e:
+             print(f"❌ Error generating response: {e}")
+
+
+ def start_model_service():
+     """Start the MCLI model service"""
+     print("🔧 Starting MCLI model service...")
+
+     try:
+         # Check if service is already running
+         service = ModelService()
+         status = service.status()
+
+         if status["running"]:
+             print(f"✅ Model service already running at {status['api_url']}")
+             return service
+         else:
+             print("🚀 Starting model service...")
+             # Start service in background
+             import threading
+
+             service_thread = threading.Thread(target=service.start, daemon=True)
+             service_thread.start()
+
+             # Wait for service to start
+             time.sleep(3)
+             print("✅ Model service started")
+             return service
+
+     except Exception as e:
+         print(f"❌ Error starting model service: {e}")
+         return None
+
+
+ @click.command()
+ @click.option(
+     "--model",
+     type=click.Choice(list(EFFICIENT_MODELS.keys())),
+     help="Specific model to download and run",
+ )
+ @click.option(
+     "--auto", is_flag=True, default=True, help="Automatically select best model for your system"
+ )
+ @click.option("--test", is_flag=True, default=True, help="Run test prompts after setup")
+ @click.option(
+     "--service-only", is_flag=True, help="Only start the model service without downloading models"
+ )
+ def main(model: Optional[str], auto: bool, test: bool, service_only: bool):
+     """Download and run efficient models from Ollama using MCLI"""
+
+     print("🚀 MCLI Efficient Model Runner")
+     print("=" * 50)
+
+     # Start model service
+     service = start_model_service()
+     if not service:
+         print("❌ Failed to start model service")
+         return 1
+
+     if service_only:
+         print("✅ Model service is running. Use the API or CLI to manage models.")
+         return 0
+
+     # Get system info and recommend model
+     system_info = get_system_info()
+
+     if model:
+         selected_model = model
+         print(f"🎯 Using specified model: {selected_model}")
+     elif auto:
+         selected_model = recommend_model(system_info)
+         print(f"🎯 Recommended model: {selected_model}")
+     else:
+         print("Available models:")
+         for key, info in EFFICIENT_MODELS.items():
+             print(f" {key}: {info['name']} ({info['parameters']})")
+         selected_model = click.prompt(
+             "Select model", type=click.Choice(list(EFFICIENT_MODELS.keys()))
+         )
+
+     # Download and setup model
+     model_id = download_and_setup_model(selected_model, service)
+     if not model_id:
+         print("❌ Failed to setup model")
+         return 1
+
+     # Test the model
+     if test:
+         model_name = EFFICIENT_MODELS[selected_model]["name"]
+         test_model(service, model_id, model_name)
+
+     print(f"\n🎉 Setup complete! Model {EFFICIENT_MODELS[selected_model]['name']} is ready to use.")
+     print(f"📊 Model ID: {model_id}")
+     print(f"🌐 API available at: http://localhost:8000")
+     print(f"📝 Use 'mcli model-service list-models' to see all models")
+
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
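The recommendation logic in this file can be exercised without going through the Click entry point. A minimal sketch, assuming the wheel is installed and psutil is importable; it uses only names defined in the hunk above:

    from mcli.workflow.model_service.download_and_run_efficient_models import (
        EFFICIENT_MODELS,
        get_system_info,
        recommend_model,
    )

    info = get_system_info()      # probes CPU/RAM via psutil, GPU via torch if installed
    key = recommend_model(info)   # returns an EFFICIENT_MODELS key, e.g. "tinyllama-1.1b"
    print(key, EFFICIENT_MODELS[key]["parameters"])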
mcli/workflow/model_service/lightweight_embedder.py
@@ -0,0 +1,397 @@
+ #!/usr/bin/env python3
+ """
+ Lightweight Text Embedder for MCLI Model Service
+
+ This module provides lightweight text embedding capabilities
+ that don't require heavy ML libraries like PyTorch or transformers.
+ """
+
+ import hashlib
+ import json
+ import logging
+ import os
+ import sys
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import numpy as np
+
+ # Try to import lightweight alternatives
+ try:
+     import sentence_transformers
+
+     HAS_SENTENCE_TRANSFORMERS = True
+ except ImportError:
+     HAS_SENTENCE_TRANSFORMERS = False
+
+ try:
+     import sklearn
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.metrics.pairwise import cosine_similarity
+
+     HAS_SKLEARN = True
+ except ImportError:
+     HAS_SKLEARN = False
+
+ logger = logging.getLogger(__name__)
+
+
+ class LightweightEmbedder:
+     """Lightweight text embedder with multiple fallback methods"""
+
+     def __init__(self, models_dir: str = "./models/embeddings"):
+         self.models_dir = Path(models_dir)
+         self.models_dir.mkdir(parents=True, exist_ok=True)
+         self.vectorizer = None
+         self.embedding_cache = {}
+
+     def get_embedding_method(self) -> str:
+         """Determine the best available embedding method"""
+         if HAS_SENTENCE_TRANSFORMERS:
+             return "sentence_transformers"
+         elif HAS_SKLEARN:
+             return "tfidf"
+         else:
+             return "simple_hash"
+
+     def embed_text(self, text: str, method: Optional[str] = None) -> Dict[str, Any]:
+         """Embed text using the specified or best available method"""
+         if not method:
+             method = self.get_embedding_method()
+
+         try:
+             if method == "sentence_transformers":
+                 return self._embed_with_sentence_transformers(text)
+             elif method == "tfidf":
+                 return self._embed_with_tfidf(text)
+             else:
+                 return self._embed_with_simple_hash(text)
+         except Exception as e:
+             logger.error(f"Error embedding text with {method}: {e}")
+             # Fallback to simple hash
+             return self._embed_with_simple_hash(text)
+
+     def _embed_with_sentence_transformers(self, text: str) -> Dict[str, Any]:
+         """Embed text using sentence-transformers"""
+         try:
+             from sentence_transformers import SentenceTransformer
+
+             # Use a lightweight model
+             model_name = "all-MiniLM-L6-v2"
+             model = SentenceTransformer(model_name)
+
+             # Generate embedding
+             embedding = model.encode(text)
+
+             return {
+                 "method": "sentence_transformers",
+                 "model": model_name,
+                 "embedding": embedding.tolist(),
+                 "dimensions": len(embedding),
+                 "text_length": len(text),
+                 "timestamp": datetime.now().isoformat(),
+             }
+         except Exception as e:
+             logger.error(f"Sentence transformers embedding failed: {e}")
+             raise
+
+     def _embed_with_tfidf(self, text: str) -> Dict[str, Any]:
+         """Embed text using TF-IDF"""
+         try:
+             from sklearn.feature_extraction.text import TfidfVectorizer
+
+             # Create or reuse vectorizer
+             if self.vectorizer is None:
+                 self.vectorizer = TfidfVectorizer(
+                     max_features=1000, stop_words="english", ngram_range=(1, 2)
+                 )
+                 # Fit on the current text
+                 self.vectorizer.fit([text])
+
+             # Transform text to TF-IDF vector
+             tfidf_vector = self.vectorizer.transform([text])
+             embedding = tfidf_vector.toarray()[0]
+
+             return {
+                 "method": "tfidf",
+                 "model": "sklearn_tfidf",
+                 "embedding": embedding.tolist(),
+                 "dimensions": len(embedding),
+                 "text_length": len(text),
+                 "timestamp": datetime.now().isoformat(),
+             }
+         except Exception as e:
+             logger.error(f"TF-IDF embedding failed: {e}")
+             raise
+
+     def _embed_with_simple_hash(self, text: str) -> Dict[str, Any]:
+         """Embed text using simple hash-based method"""
+         try:
+             # Create a simple hash-based embedding
+             words = text.lower().split()
+             word_freq = {}
+
+             # Count word frequencies
+             for word in words:
+                 if len(word) > 2:  # Skip very short words
+                     word_freq[word] = word_freq.get(word, 0) + 1
+
+             # Create a fixed-size embedding using hashes
+             embedding_size = 128
+             embedding = np.zeros(embedding_size)
+
+             for word, freq in word_freq.items():
+                 # Create hash of word
+                 word_hash = hash(word) % embedding_size
+                 embedding[word_hash] += freq
+
+             # Normalize the embedding
+             if np.sum(embedding) > 0:
+                 embedding = embedding / np.sum(embedding)
+
+             return {
+                 "method": "simple_hash",
+                 "model": "hash_based",
+                 "embedding": embedding.tolist(),
+                 "dimensions": len(embedding),
+                 "text_length": len(text),
+                 "word_count": len(words),
+                 "unique_words": len(word_freq),
+                 "timestamp": datetime.now().isoformat(),
+             }
+         except Exception as e:
+             logger.error(f"Simple hash embedding failed: {e}")
+             raise
+
+     def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
+         """Split text into overlapping chunks"""
+         chunks = []
+         start = 0
+
+         while start < len(text):
+             end = start + chunk_size
+             chunk = text[start:end]
+             chunks.append(chunk)
+             start = end - overlap
+
+             if start >= len(text):
+                 break
+
+         return chunks
+
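For intuition on the overlap arithmetic in chunk_text: successive chunk starts advance by chunk_size - overlap characters, so with the defaults each chunk begins 800 characters after the previous one and repeats the last 200 characters of its predecessor. A quick check with hypothetical input:

    embedder = LightweightEmbedder()
    chunks = embedder.chunk_text("x" * 2500, chunk_size=1000, overlap=200)
    print([len(c) for c in chunks])  # starts 0, 800, 1600, 2400 -> [1000, 1000, 900, 100]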
+     def embed_document(self, text: str, chunk_size: int = 1000) -> Dict[str, Any]:
+         """Embed a document by chunking and embedding each chunk"""
+         try:
+             # Split text into chunks
+             chunks = self.chunk_text(text, chunk_size)
+
+             # Embed each chunk
+             chunk_embeddings = []
+             for i, chunk in enumerate(chunks):
+                 embedding_result = self.embed_text(chunk)
+                 chunk_embeddings.append(
+                     {
+                         "chunk_index": i,
+                         "chunk_text": chunk[:100] + "..." if len(chunk) > 100 else chunk,
+                         "embedding": embedding_result,
+                     }
+                 )
+
+             # Create document-level summary
+             total_chunks = len(chunk_embeddings)
+             total_text_length = len(text)
+
+             return {
+                 "success": True,
+                 "document_embedding": {
+                     "total_chunks": total_chunks,
+                     "total_text_length": total_text_length,
+                     "chunk_embeddings": chunk_embeddings,
+                     "method": (
+                         chunk_embeddings[0]["embedding"]["method"]
+                         if chunk_embeddings
+                         else "unknown"
+                     ),
+                     "timestamp": datetime.now().isoformat(),
+                 },
+             }
+
+         except Exception as e:
+             logger.error(f"Error embedding document: {e}")
+             return {"success": False, "error": str(e)}
+
+     def search_similar(self, query: str, embeddings: List[Dict], top_k: int = 5) -> List[Dict]:
+         """Search for similar documents using embeddings"""
+         try:
+             # Embed the query
+             query_embedding = self.embed_text(query)
+             query_vector = np.array(query_embedding["embedding"])
+
+             results = []
+
+             for doc_embedding in embeddings:
+                 # Calculate similarity for each chunk
+                 similarities = []
+                 for chunk in doc_embedding.get("chunk_embeddings", []):
+                     chunk_vector = np.array(chunk["embedding"]["embedding"])
+
+                     # Calculate cosine similarity
+                     similarity = np.dot(query_vector, chunk_vector) / (
+                         np.linalg.norm(query_vector) * np.linalg.norm(chunk_vector)
+                     )
+                     similarities.append(similarity)
+
+                 # Use the best similarity score
+                 best_similarity = max(similarities) if similarities else 0
+
+                 results.append(
+                     {
+                         "document_id": doc_embedding.get("document_id"),
+                         "similarity": float(best_similarity),
+                         "chunk_count": len(doc_embedding.get("chunk_embeddings", [])),
+                         "text_length": doc_embedding.get("total_text_length", 0),
+                     }
+                 )
+
+             # Sort by similarity and return top_k
+             results.sort(key=lambda x: x["similarity"], reverse=True)
+             return results[:top_k]
+
+         except Exception as e:
+             logger.error(f"Error searching similar documents: {e}")
+             return []
+
+     def get_status(self) -> Dict[str, Any]:
+         """Get the status of the embedder"""
+         return {
+             "available_methods": {
+                 "sentence_transformers": HAS_SENTENCE_TRANSFORMERS,
+                 "tfidf": HAS_SKLEARN,
+                 "simple_hash": True,  # Always available
+             },
+             "current_method": self.get_embedding_method(),
+             "models_dir": str(self.models_dir),
+             "cache_size": len(self.embedding_cache),
+         }
+
+
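The class degrades gracefully: sentence-transformers if installed, else scikit-learn TF-IDF, else the pure-numpy hash fallback. A minimal usage sketch, assuming an environment where neither optional dependency is present (so simple_hash is selected):

    embedder = LightweightEmbedder()
    print(embedder.get_status()["current_method"])  # "simple_hash"
    vec = embedder.embed_text("politician trading disclosures")
    print(vec["method"], vec["dimensions"])         # simple_hash 128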
+ def create_embedder_api():
+     """Create a simple API for the embedder"""
+     import urllib.parse
+     from http.server import BaseHTTPRequestHandler, HTTPServer
+
+     class EmbedderHandler(BaseHTTPRequestHandler):
+         def __init__(self, *args, embedder=None, **kwargs):
+             self.embedder = embedder
+             super().__init__(*args, **kwargs)
+
+         def do_POST(self):
+             """Handle embedding requests"""
+             parsed_path = urllib.parse.urlparse(self.path)
+             path = parsed_path.path
+
+             if path == "/embed-text":
+                 self._handle_embed_text()
+             elif path == "/embed-document":
+                 self._handle_embed_document()
+             elif path == "/search":
+                 self._handle_search()
+             else:
+                 self._send_response(404, {"error": "Endpoint not found"})
+
+         def do_GET(self):
+             """Handle status requests"""
+             parsed_path = urllib.parse.urlparse(self.path)
+             path = parsed_path.path
+
+             if path == "/status":
+                 status = self.embedder.get_status()
+                 self._send_response(200, status)
+             else:
+                 self._send_response(404, {"error": "Endpoint not found"})
+
+         def _handle_embed_text(self):
+             """Handle text embedding requests"""
+             try:
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 post_data = self.rfile.read(content_length)
+                 request_data = json.loads(post_data.decode("utf-8"))
+
+                 text = request_data.get("text")
+                 method = request_data.get("method")
+
+                 if not text:
+                     self._send_response(400, {"error": "Text is required"})
+                     return
+
+                 result = self.embedder.embed_text(text, method)
+                 self._send_response(200, result)
+
+             except Exception as e:
+                 self._send_response(500, {"error": str(e)})
+
+         def _handle_embed_document(self):
+             """Handle document embedding requests"""
+             try:
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 post_data = self.rfile.read(content_length)
+                 request_data = json.loads(post_data.decode("utf-8"))
+
+                 text = request_data.get("text")
+                 chunk_size = request_data.get("chunk_size", 1000)
+
+                 if not text:
+                     self._send_response(400, {"error": "Text is required"})
+                     return
+
+                 result = self.embedder.embed_document(text, chunk_size)
+                 self._send_response(200, result)
+
+             except Exception as e:
+                 self._send_response(500, {"error": str(e)})
+
+         def _handle_search(self):
+             """Handle search requests"""
+             try:
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 post_data = self.rfile.read(content_length)
+                 request_data = json.loads(post_data.decode("utf-8"))
+
+                 query = request_data.get("query")
+                 embeddings = request_data.get("embeddings", [])
+                 top_k = request_data.get("top_k", 5)
+
+                 if not query:
+                     self._send_response(400, {"error": "Query is required"})
+                     return
+
+                 results = self.embedder.search_similar(query, embeddings, top_k)
+                 self._send_response(200, {"results": results})
+
+             except Exception as e:
+                 self._send_response(500, {"error": str(e)})
+
+         def _send_response(self, status_code, data):
+             """Send JSON response"""
+             self.send_response(status_code)
+             self.send_header("Content-Type", "application/json")
+             self.send_header("Access-Control-Allow-Origin", "*")
+             self.end_headers()
+             self.wfile.write(json.dumps(data).encode("utf-8"))
+
+     return EmbedderHandler
+
+
+ if __name__ == "__main__":
+     # Test the embedder
+     embedder = LightweightEmbedder()
+
+     # Test with sample text
+     test_text = "This is a sample text for testing the lightweight embedder."
+     result = embedder.embed_text(test_text)
+     print(json.dumps(result, indent=2))
+
+     # Test document embedding
+     doc_result = embedder.embed_document(test_text * 10)  # Longer text
+     print(json.dumps(doc_result, indent=2))
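Note that create_embedder_api() returns the handler class without starting a server, and EmbedderHandler.__init__ expects an embedder keyword that HTTPServer does not supply on its own. A minimal wiring sketch, assuming functools.partial is used to bind it (the port here is arbitrary):

    import functools
    from http.server import HTTPServer

    embedder = LightweightEmbedder()
    handler = functools.partial(create_embedder_api(), embedder=embedder)
    HTTPServer(("127.0.0.1", 8765), handler).serve_forever()
    # e.g. POST /embed-text with body {"text": "hello"} returns the embedding JSON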