isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in the public registry.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
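Beyond the new modules, the notable structural change in this release is that the Modal service modules move out of the old deployment/cloud/modal/ and deployment/services/ locations into modality-specific packages under isa_model/deployment/modal/services/ (entries 186–197 above). For downstream code that imported these modules by path, a compatibility shim along the following lines should cover both layouts. This is a minimal sketch: the paths come from the rename entries above, but the diff only shows file moves, not the import surface, so treat the assumption that the module names are otherwise unchanged as exactly that.

```python
# Sketch of a 0.4.0 -> 0.4.3 compatibility import for the relocated Modal
# vision OCR service. Paths are taken from the rename entries in the file
# list; that the module exposes the same names in both versions is an
# assumption.
try:
    # 0.4.3 layout: modality-specific subpackages under deployment/modal/services/
    from isa_model.deployment.modal.services.vision import isa_vision_ocr_service
except ImportError:
    # 0.4.0 layout: flat deployment/cloud/modal/ package
    from isa_model.deployment.cloud.modal import isa_vision_ocr_service
```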
--- a/isa_model/training/intelligent/task_classifier.py
+++ /dev/null
@@ -1,576 +0,0 @@
- """
- Task Classification System for Training Requests
-
- This module automatically classifies training tasks based on:
- - Natural language descriptions
- - Dataset characteristics
- - Model requirements
- - Domain-specific patterns
-
- Supports classification for LLM, CV, Audio, and multi-modal tasks.
- """
-
- import logging
- import re
- from typing import Dict, List, Optional, Any, Tuple
- from dataclasses import dataclass
- from pathlib import Path
- import json
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class TaskAnalysis:
-     """Results of task classification analysis."""
-
-     # Primary classification
-     task_type: str  # "chat", "classification", "summarization", "generation", etc.
-     domain: str  # "general", "medical", "legal", "technical", etc.
-     modality: str  # "text", "image", "audio", "multimodal"
-
-     # Training characteristics
-     training_type: str  # "sft", "rlhf", "dpo", "pretraining"
-     complexity: str  # "simple", "medium", "complex"
-
-     # Data characteristics
-     language: str = "english"
-     dataset_type: str = "instruction"  # "instruction", "conversational", "raw_text"
-     estimated_size: int = 0
-
-     # Confidence and metadata
-     confidence: float = 0.0
-     keywords: List[str] = None
-     reasoning: List[str] = None
-
-     def __post_init__(self):
-         if self.keywords is None:
-             self.keywords = []
-         if self.reasoning is None:
-             self.reasoning = []
-
-
- class TaskClassifier:
-     """
-     Intelligent task classification system.
-
-     Analyzes training requests and datasets to automatically determine:
-     - Task type (chat, classification, summarization, etc.)
-     - Domain (medical, legal, technical, etc.)
-     - Modality (text, image, audio, multimodal)
-     - Training approach (SFT, RLHF, DPO, etc.)
-     - Complexity level
-
-     Example:
-         ```python
-         classifier = TaskClassifier()
-
-         analysis = classifier.analyze_request(
-             "Fine-tune a model for medical question answering",
-             "medical_qa_dataset.json"
-         )
-
-         print(f"Task: {analysis.task_type}")
-         print(f"Domain: {analysis.domain}")
-         print(f"Training: {analysis.training_type}")
-         ```
-     """
-
-     def __init__(self):
-         """Initialize task classifier with pattern libraries."""
-         self.task_patterns = self._load_task_patterns()
-         self.domain_patterns = self._load_domain_patterns()
-         self.language_patterns = self._load_language_patterns()
-
-         logger.info("Task classifier initialized")
-
-     def analyze_request(self, description: str, dataset_source: str) -> TaskAnalysis:
-         """
-         Analyze training request and classify task.
-
-         Args:
-             description: Natural language description of training task
-             dataset_source: Path to dataset or dataset identifier
-
-         Returns:
-             Complete task analysis
-         """
-         logger.info(f"Classifying task: {description[:50]}...")
-
-         try:
-             # Step 1: Extract keywords and normalize text
-             keywords = self._extract_keywords(description)
-             normalized_desc = description.lower()
-
-             # Step 2: Classify task type
-             task_type, task_confidence = self._classify_task_type(normalized_desc, keywords)
-
-             # Step 3: Classify domain
-             domain, domain_confidence = self._classify_domain(normalized_desc, keywords)
-
-             # Step 4: Determine modality
-             modality = self._determine_modality(normalized_desc, keywords, dataset_source)
-
-             # Step 5: Determine training type
-             training_type = self._determine_training_type(normalized_desc, keywords)
-
-             # Step 6: Analyze complexity
-             complexity = self._analyze_complexity(normalized_desc, keywords, dataset_source)
-
-             # Step 7: Detect language
-             language = self._detect_language(normalized_desc, keywords)
-
-             # Step 8: Determine dataset type
-             dataset_type = self._determine_dataset_type(dataset_source, normalized_desc)
-
-             # Step 9: Generate reasoning
-             reasoning = self._generate_reasoning(
-                 task_type, domain, modality, training_type, complexity, keywords
-             )
-
-             # Step 10: Calculate overall confidence
-             overall_confidence = (task_confidence + domain_confidence) / 2
-
-             analysis = TaskAnalysis(
-                 task_type=task_type,
-                 domain=domain,
-                 modality=modality,
-                 training_type=training_type,
-                 complexity=complexity,
-                 language=language,
-                 dataset_type=dataset_type,
-                 confidence=overall_confidence,
-                 keywords=keywords,
-                 reasoning=reasoning
-             )
-
-             logger.info(f"Task classified: {task_type} ({domain}) - {training_type}")
-             return analysis
-
-         except Exception as e:
-             logger.error(f"Task classification failed: {e}")
-             # Return default analysis
-             return TaskAnalysis(
-                 task_type="sft",
-                 domain="general",
-                 modality="text",
-                 training_type="sft",
-                 complexity="medium",
-                 confidence=0.1,
-                 reasoning=["Classification failed, using defaults"]
-             )
-
-     def _load_task_patterns(self) -> Dict[str, Dict[str, Any]]:
-         """Load task type classification patterns."""
-         return {
-             "chat": {
-                 "keywords": ["chat", "conversation", "dialogue", "chatbot", "assistant", "qa", "question", "answer"],
-                 "patterns": [
-                     r"chat\s*(bot|assistant)",
-                     r"(conversation|dialogue)\s*model",
-                     r"question\s*answer",
-                     r"customer\s*service",
-                     r"virtual\s*assistant"
-                 ],
-                 "weight": 1.0
-             },
-             "classification": {
-                 "keywords": ["classify", "classification", "categorize", "category", "label", "sentiment", "emotion"],
-                 "patterns": [
-                     r"(text|document)\s*classification",
-                     r"sentiment\s*analysis",
-                     r"categoriz[ae]",
-                     r"label\s*prediction",
-                     r"emotion\s*detection"
-                 ],
-                 "weight": 1.0
-             },
-             "summarization": {
-                 "keywords": ["summarize", "summary", "summarization", "abstract", "brief", "condense"],
-                 "patterns": [
-                     r"summariz[ae]",
-                     r"abstract\s*generation",
-                     r"text\s*summary",
-                     r"document\s*summary"
-                 ],
-                 "weight": 1.0
-             },
-             "generation": {
-                 "keywords": ["generate", "generation", "creative", "story", "content", "write", "writing"],
-                 "patterns": [
-                     r"text\s*generation",
-                     r"content\s*generation",
-                     r"creative\s*writing",
-                     r"story\s*generation"
-                 ],
-                 "weight": 1.0
-             },
-             "translation": {
-                 "keywords": ["translate", "translation", "multilingual", "language", "cross-lingual"],
-                 "patterns": [
-                     r"translation",
-                     r"translate\s*between",
-                     r"multilingual",
-                     r"cross-lingual"
-                 ],
-                 "weight": 1.0
-             },
-             "reasoning": {
-                 "keywords": ["reasoning", "logic", "math", "mathematical", "problem", "solve"],
-                 "patterns": [
-                     r"mathematical\s*reasoning",
-                     r"logical\s*reasoning",
-                     r"problem\s*solving",
-                     r"math\s*problems"
-                 ],
-                 "weight": 1.0
-             },
-             "code": {
-                 "keywords": ["code", "programming", "python", "javascript", "sql", "development"],
-                 "patterns": [
-                     r"code\s*(generation|completion)",
-                     r"programming\s*assistance",
-                     r"software\s*development",
-                     r"(python|javascript|sql)\s*code"
-                 ],
-                 "weight": 1.0
-             }
-         }
-
-     def _load_domain_patterns(self) -> Dict[str, Dict[str, Any]]:
-         """Load domain classification patterns."""
-         return {
-             "medical": {
-                 "keywords": ["medical", "health", "healthcare", "clinical", "patient", "diagnosis", "treatment"],
-                 "patterns": [
-                     r"medical\s*(qa|question|diagnosis)",
-                     r"healthcare\s*assistant",
-                     r"clinical\s*notes",
-                     r"patient\s*records"
-                 ],
-                 "weight": 1.0
-             },
-             "legal": {
-                 "keywords": ["legal", "law", "lawyer", "court", "contract", "compliance", "regulation"],
-                 "patterns": [
-                     r"legal\s*(document|analysis)",
-                     r"law\s*assistant",
-                     r"contract\s*review",
-                     r"compliance\s*check"
-                 ],
-                 "weight": 1.0
-             },
-             "financial": {
-                 "keywords": ["financial", "finance", "trading", "investment", "banking", "economic"],
-                 "patterns": [
-                     r"financial\s*analysis",
-                     r"trading\s*assistant",
-                     r"investment\s*advice",
-                     r"banking\s*support"
-                 ],
-                 "weight": 1.0
-             },
-             "technical": {
-                 "keywords": ["technical", "engineering", "software", "programming", "development", "api"],
-                 "patterns": [
-                     r"technical\s*documentation",
-                     r"engineering\s*assistant",
-                     r"api\s*documentation",
-                     r"software\s*support"
-                 ],
-                 "weight": 1.0
-             },
-             "education": {
-                 "keywords": ["education", "learning", "teaching", "student", "tutor", "academic"],
-                 "patterns": [
-                     r"educational\s*assistant",
-                     r"tutoring\s*system",
-                     r"academic\s*support",
-                     r"learning\s*companion"
-                 ],
-                 "weight": 1.0
-             },
-             "ecommerce": {
-                 "keywords": ["ecommerce", "shopping", "product", "recommendation", "retail", "customer"],
-                 "patterns": [
-                     r"product\s*recommendation",
-                     r"shopping\s*assistant",
-                     r"ecommerce\s*support",
-                     r"retail\s*assistant"
-                 ],
-                 "weight": 1.0
-             },
-             "general": {
-                 "keywords": ["general", "assistant", "helper", "support", "chatbot"],
-                 "patterns": [
-                     r"general\s*purpose",
-                     r"personal\s*assistant",
-                     r"general\s*chatbot"
-                 ],
-                 "weight": 0.5  # Lower weight as fallback
-             }
-         }
-
-     def _load_language_patterns(self) -> Dict[str, List[str]]:
-         """Load language detection patterns."""
-         return {
-             "chinese": ["chinese", "中文", "汉语", "普通话", "mandarin", "cantonese", "zh"],
-             "japanese": ["japanese", "日本語", "nihongo", "ja"],
-             "korean": ["korean", "한국어", "hangul", "ko"],
-             "spanish": ["spanish", "español", "castellano", "es"],
-             "french": ["french", "français", "fr"],
-             "german": ["german", "deutsch", "de"],
-             "italian": ["italian", "italiano", "it"],
-             "portuguese": ["portuguese", "português", "pt"],
-             "russian": ["russian", "русский", "ru"],
-             "arabic": ["arabic", "العربية", "ar"],
-             "hindi": ["hindi", "हिंदी", "hi"],
-             "english": ["english", "en"]  # Default
-         }
-
-     def _extract_keywords(self, text: str) -> List[str]:
-         """Extract relevant keywords from text."""
-         # Simple keyword extraction
-         words = re.findall(r'\b\w+\b', text.lower())
-
-         # Filter out common stop words
-         stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}
-
-         keywords = [word for word in words if word not in stop_words and len(word) > 2]
-
-         return keywords[:20]  # Return top 20 keywords
-
-     def _classify_task_type(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-         """Classify the primary task type."""
-         scores = {}
-
-         for task_type, patterns in self.task_patterns.items():
-             score = 0.0
-
-             # Check keywords
-             for keyword in patterns["keywords"]:
-                 if keyword in text or keyword in keywords:
-                     score += 1.0
-
-             # Check regex patterns
-             for pattern in patterns["patterns"]:
-                 if re.search(pattern, text):
-                     score += 2.0
-
-             # Apply weight
-             score *= patterns["weight"]
-             scores[task_type] = score
-
-         # Find highest scoring task type
-         if scores:
-             best_task = max(scores, key=scores.get)
-             confidence = min(1.0, scores[best_task] / 3.0)  # Normalize confidence
-
-             if confidence > 0.3:
-                 return best_task, confidence
-
-         # Default to chat if no clear classification
-         return "chat", 0.5
-
-     def _classify_domain(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-         """Classify the domain/industry."""
-         scores = {}
-
-         for domain, patterns in self.domain_patterns.items():
-             score = 0.0
-
-             # Check keywords
-             for keyword in patterns["keywords"]:
-                 if keyword in text or keyword in keywords:
-                     score += 1.0
-
-             # Check regex patterns
-             for pattern in patterns["patterns"]:
-                 if re.search(pattern, text):
-                     score += 2.0
-
-             # Apply weight
-             score *= patterns["weight"]
-             scores[domain] = score
-
-         # Find highest scoring domain
-         if scores:
-             best_domain = max(scores, key=scores.get)
-             confidence = min(1.0, scores[best_domain] / 2.0)
-
-             if confidence > 0.3:
-                 return best_domain, confidence
-
-         # Default to general
-         return "general", 0.5
-
-     def _determine_modality(self, text: str, keywords: List[str], dataset_source: str) -> str:
-         """Determine the modality (text, image, audio, multimodal)."""
-         # Check for image-related keywords
-         image_keywords = ["image", "picture", "photo", "visual", "vision", "cnn", "resnet", "vit"]
-         if any(keyword in text for keyword in image_keywords):
-             return "image"
-
-         # Check for audio-related keywords
-         audio_keywords = ["audio", "speech", "voice", "sound", "whisper", "tts", "stt"]
-         if any(keyword in text for keyword in audio_keywords):
-             return "audio"
-
-         # Check for multimodal keywords
-         multimodal_keywords = ["multimodal", "vision-language", "clip", "blip", "image-text"]
-         if any(keyword in text for keyword in multimodal_keywords):
-             return "multimodal"
-
-         # Check dataset source for file extensions
-         if dataset_source:
-             if any(ext in dataset_source.lower() for ext in [".jpg", ".png", ".jpeg", ".gif", ".bmp"]):
-                 return "image"
-             elif any(ext in dataset_source.lower() for ext in [".wav", ".mp3", ".flac", ".m4a"]):
-                 return "audio"
-
-         # Default to text
-         return "text"
-
-     def _determine_training_type(self, text: str, keywords: List[str]) -> str:
-         """Determine the training approach."""
-         # Check for specific training types
-         if any(keyword in text for keyword in ["rlhf", "reinforcement", "human feedback"]):
-             return "rlhf"
-
-         if any(keyword in text for keyword in ["dpo", "direct preference", "preference optimization"]):
-             return "dpo"
-
-         if any(keyword in text for keyword in ["pretrain", "pretraining", "from scratch"]):
-             return "pretraining"
-
-         if any(keyword in text for keyword in ["instruction", "supervised", "fine-tune", "finetune"]):
-             return "sft"
-
-         # Default to SFT
-         return "sft"
-
-     def _analyze_complexity(self, text: str, keywords: List[str], dataset_source: str) -> str:
-         """Analyze task complexity."""
-         complexity_score = 0
-
-         # High complexity indicators
-         high_complexity_keywords = ["complex", "advanced", "sophisticated", "multi-step", "reasoning", "mathematical"]
-         if any(keyword in text for keyword in high_complexity_keywords):
-             complexity_score += 2
-
-         # Medium complexity indicators
-         medium_complexity_keywords = ["detailed", "comprehensive", "analysis", "professional"]
-         if any(keyword in text for keyword in medium_complexity_keywords):
-             complexity_score += 1
-
-         # Simple complexity indicators
-         simple_complexity_keywords = ["simple", "basic", "quick", "fast", "easy"]
-         if any(keyword in text for keyword in simple_complexity_keywords):
-             complexity_score -= 1
-
-         # Determine complexity level
-         if complexity_score >= 2:
-             return "complex"
-         elif complexity_score <= -1:
-             return "simple"
-         else:
-             return "medium"
-
-     def _detect_language(self, text: str, keywords: List[str]) -> str:
-         """Detect the target language."""
-         for language, patterns in self.language_patterns.items():
-             if any(pattern in text for pattern in patterns):
-                 return language
-
-         # Default to English
-         return "english"
-
-     def _determine_dataset_type(self, dataset_source: str, text: str) -> str:
-         """Determine the dataset type."""
-         if "alpaca" in dataset_source.lower() or "instruction" in text:
-             return "instruction"
-         elif "sharegpt" in dataset_source.lower() or "conversation" in text:
-             return "conversational"
-         elif "raw" in text or "text" in text:
-             return "raw_text"
-         else:
-             return "instruction"  # Default
-
-     def _generate_reasoning(
-         self,
-         task_type: str,
-         domain: str,
-         modality: str,
-         training_type: str,
-         complexity: str,
-         keywords: List[str]
-     ) -> List[str]:
-         """Generate human-readable reasoning for the classification."""
-         reasoning = []
-
-         reasoning.append(f"Classified as {task_type} task based on keywords: {', '.join(keywords[:3])}")
-
-         if domain != "general":
-             reasoning.append(f"Identified {domain} domain specialization")
-
-         if modality != "text":
-             reasoning.append(f"Detected {modality} modality requirements")
-
-         if training_type != "sft":
-             reasoning.append(f"Recommended {training_type} training approach")
-
-         reasoning.append(f"Estimated {complexity} complexity level")
-
-         return reasoning
-
-     def get_supported_tasks(self) -> List[str]:
-         """Get list of supported task types."""
-         return list(self.task_patterns.keys())
-
-     def get_supported_domains(self) -> List[str]:
-         """Get list of supported domains."""
-         return list(self.domain_patterns.keys())
-
-     def classify_dataset(self, dataset_path: str) -> Dict[str, Any]:
-         """Classify a dataset file directly."""
-         try:
-             if not Path(dataset_path).exists():
-                 return {"error": f"Dataset not found: {dataset_path}"}
-
-             # Analyze file extension
-             suffix = Path(dataset_path).suffix.lower()
-
-             analysis = {
-                 "file_type": suffix,
-                 "size": 0,
-                 "format": "unknown",
-                 "language": "unknown",
-                 "estimated_samples": 0
-             }
-
-             if suffix == ".json":
-                 with open(dataset_path, 'r', encoding='utf-8') as f:
-                     data = json.load(f)
-
-                 if isinstance(data, list):
-                     analysis["estimated_samples"] = len(data)
-                     analysis["format"] = "json_list"
-
-                     # Analyze first sample
-                     if data:
-                         sample = data[0]
-                         if isinstance(sample, dict):
-                             if "instruction" in sample and "output" in sample:
-                                 analysis["format"] = "alpaca"
-                             elif "messages" in sample:
-                                 analysis["format"] = "sharegpt"
-                             elif "conversations" in sample:
-                                 analysis["format"] = "conversational"
-
-             analysis["size"] = Path(dataset_path).stat().st_size
-
-             return analysis
-
-         except Exception as e:
-             logger.error(f"Failed to classify dataset {dataset_path}: {e}")
-             return {"error": str(e)}
--- a/isa_model/training/storage/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
- """
- Training Data Storage Module
-
- This module provides persistent storage for training-related data:
- - Training job records and history
- - Model training metadata and metrics
- - Cost tracking and billing information
- - Integration with core model management
- - Model version management and lineage tracking
-
- Works seamlessly with existing core storage infrastructure.
- """
-
- from .training_storage import TrainingStorage, TrainingJobRecord, TrainingMetrics
- from .training_repository import TrainingRepository
- from .core_integration import CoreModelIntegration
-
- __all__ = [
-     'TrainingStorage',
-     'TrainingJobRecord',
-     'TrainingMetrics',
-     'TrainingRepository',
-     'CoreModelIntegration'
- ]
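Since this `__init__` re-exported the whole storage API, 0.4.0 consumers typically imported it at the package level; those imports also break in 0.4.3:

```python
# 0.4.0-only imports; the isa_model.training.storage package is removed in 0.4.3.
from isa_model.training.storage import (
    TrainingStorage,
    TrainingJobRecord,
    TrainingMetrics,
    TrainingRepository,
    CoreModelIntegration,
)
```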