isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff shows the content changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
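Most of the relocations above (entries 176–187) move the Modal service modules out of isa_model/deployment/cloud/modal/ into modality-specific packages under isa_model/deployment/modal/services/. A minimal sketch of what that implies for downstream imports, assuming module paths mirror the file paths shown (the package's public re-exports are not visible in this diff):

```python
# Hypothetical import-path migration implied by the moves above.
# 0.4.0 layout: isa_model/deployment/cloud/modal/isa_vision_ocr_service.py
# 0.4.4 layout: isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py
try:
    # New location (0.4.4)
    from isa_model.deployment.modal.services.vision import isa_vision_ocr_service
except ImportError:
    # Fall back to the old location (0.4.0)
    from isa_model.deployment.cloud.modal import isa_vision_ocr_service
```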
isa_model/training/intelligent/task_classifier.py
@@ -1,576 +0,0 @@
- """
- Task Classification System for Training Requests
-
- This module automatically classifies training tasks based on:
- - Natural language descriptions
- - Dataset characteristics
- - Model requirements
- - Domain-specific patterns
-
- Supports classification for LLM, CV, Audio, and multi-modal tasks.
- """
-
- import logging
- import re
- from typing import Dict, List, Optional, Any, Tuple
- from dataclasses import dataclass
- from pathlib import Path
- import json
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class TaskAnalysis:
-     """Results of task classification analysis."""
-
-     # Primary classification
-     task_type: str # "chat", "classification", "summarization", "generation", etc.
-     domain: str # "general", "medical", "legal", "technical", etc.
-     modality: str # "text", "image", "audio", "multimodal"
-
-     # Training characteristics
-     training_type: str # "sft", "rlhf", "dpo", "pretraining"
-     complexity: str # "simple", "medium", "complex"
-
-     # Data characteristics
-     language: str = "english"
-     dataset_type: str = "instruction" # "instruction", "conversational", "raw_text"
-     estimated_size: int = 0
-
-     # Confidence and metadata
-     confidence: float = 0.0
-     keywords: List[str] = None
-     reasoning: List[str] = None
-
-     def __post_init__(self):
-         if self.keywords is None:
-             self.keywords = []
-         if self.reasoning is None:
-             self.reasoning = []
-
-
- class TaskClassifier:
-     """
-     Intelligent task classification system.
-
-     Analyzes training requests and datasets to automatically determine:
-     - Task type (chat, classification, summarization, etc.)
-     - Domain (medical, legal, technical, etc.)
-     - Modality (text, image, audio, multimodal)
-     - Training approach (SFT, RLHF, DPO, etc.)
-     - Complexity level
-
-     Example:
-         ```python
-         classifier = TaskClassifier()
-
-         analysis = classifier.analyze_request(
-             "Fine-tune a model for medical question answering",
-             "medical_qa_dataset.json"
-         )
-
-         print(f"Task: {analysis.task_type}")
-         print(f"Domain: {analysis.domain}")
-         print(f"Training: {analysis.training_type}")
-         ```
-     """
-
-     def __init__(self):
-         """Initialize task classifier with pattern libraries."""
-         self.task_patterns = self._load_task_patterns()
-         self.domain_patterns = self._load_domain_patterns()
-         self.language_patterns = self._load_language_patterns()
-
-         logger.info("Task classifier initialized")
-
-     def analyze_request(self, description: str, dataset_source: str) -> TaskAnalysis:
-         """
-         Analyze training request and classify task.
-
-         Args:
-             description: Natural language description of training task
-             dataset_source: Path to dataset or dataset identifier
-
-         Returns:
-             Complete task analysis
-         """
-         logger.info(f"Classifying task: {description[:50]}...")
-
-         try:
-             # Step 1: Extract keywords and normalize text
-             keywords = self._extract_keywords(description)
-             normalized_desc = description.lower()
-
-             # Step 2: Classify task type
-             task_type, task_confidence = self._classify_task_type(normalized_desc, keywords)
-
-             # Step 3: Classify domain
-             domain, domain_confidence = self._classify_domain(normalized_desc, keywords)
-
-             # Step 4: Determine modality
-             modality = self._determine_modality(normalized_desc, keywords, dataset_source)
-
-             # Step 5: Determine training type
-             training_type = self._determine_training_type(normalized_desc, keywords)
-
-             # Step 6: Analyze complexity
-             complexity = self._analyze_complexity(normalized_desc, keywords, dataset_source)
-
-             # Step 7: Detect language
-             language = self._detect_language(normalized_desc, keywords)
-
-             # Step 8: Determine dataset type
-             dataset_type = self._determine_dataset_type(dataset_source, normalized_desc)
-
-             # Step 9: Generate reasoning
-             reasoning = self._generate_reasoning(
-                 task_type, domain, modality, training_type, complexity, keywords
-             )
-
-             # Step 10: Calculate overall confidence
-             overall_confidence = (task_confidence + domain_confidence) / 2
-
-             analysis = TaskAnalysis(
-                 task_type=task_type,
-                 domain=domain,
-                 modality=modality,
-                 training_type=training_type,
-                 complexity=complexity,
-                 language=language,
-                 dataset_type=dataset_type,
-                 confidence=overall_confidence,
-                 keywords=keywords,
-                 reasoning=reasoning
-             )
-
-             logger.info(f"Task classified: {task_type} ({domain}) - {training_type}")
-             return analysis
-
-         except Exception as e:
-             logger.error(f"Task classification failed: {e}")
-             # Return default analysis
-             return TaskAnalysis(
-                 task_type="sft",
-                 domain="general",
-                 modality="text",
-                 training_type="sft",
-                 complexity="medium",
-                 confidence=0.1,
-                 reasoning=["Classification failed, using defaults"]
-             )
-
-     def _load_task_patterns(self) -> Dict[str, Dict[str, Any]]:
-         """Load task type classification patterns."""
-         return {
-             "chat": {
-                 "keywords": ["chat", "conversation", "dialogue", "chatbot", "assistant", "qa", "question", "answer"],
-                 "patterns": [
-                     r"chat\s*(bot|assistant)",
-                     r"(conversation|dialogue)\s*model",
-                     r"question\s*answer",
-                     r"customer\s*service",
-                     r"virtual\s*assistant"
-                 ],
-                 "weight": 1.0
-             },
-             "classification": {
-                 "keywords": ["classify", "classification", "categorize", "category", "label", "sentiment", "emotion"],
-                 "patterns": [
-                     r"(text|document)\s*classification",
-                     r"sentiment\s*analysis",
-                     r"categoriz[ae]",
-                     r"label\s*prediction",
-                     r"emotion\s*detection"
-                 ],
-                 "weight": 1.0
-             },
-             "summarization": {
-                 "keywords": ["summarize", "summary", "summarization", "abstract", "brief", "condense"],
-                 "patterns": [
-                     r"summariz[ae]",
-                     r"abstract\s*generation",
-                     r"text\s*summary",
-                     r"document\s*summary"
-                 ],
-                 "weight": 1.0
-             },
-             "generation": {
-                 "keywords": ["generate", "generation", "creative", "story", "content", "write", "writing"],
-                 "patterns": [
-                     r"text\s*generation",
-                     r"content\s*generation",
-                     r"creative\s*writing",
-                     r"story\s*generation"
-                 ],
-                 "weight": 1.0
-             },
-             "translation": {
-                 "keywords": ["translate", "translation", "multilingual", "language", "cross-lingual"],
-                 "patterns": [
-                     r"translation",
-                     r"translate\s*between",
-                     r"multilingual",
-                     r"cross-lingual"
-                 ],
-                 "weight": 1.0
-             },
-             "reasoning": {
-                 "keywords": ["reasoning", "logic", "math", "mathematical", "problem", "solve"],
-                 "patterns": [
-                     r"mathematical\s*reasoning",
-                     r"logical\s*reasoning",
-                     r"problem\s*solving",
-                     r"math\s*problems"
-                 ],
-                 "weight": 1.0
-             },
-             "code": {
-                 "keywords": ["code", "programming", "python", "javascript", "sql", "development"],
-                 "patterns": [
-                     r"code\s*(generation|completion)",
-                     r"programming\s*assistance",
-                     r"software\s*development",
-                     r"(python|javascript|sql)\s*code"
-                 ],
-                 "weight": 1.0
-             }
-         }
-
-     def _load_domain_patterns(self) -> Dict[str, Dict[str, Any]]:
-         """Load domain classification patterns."""
-         return {
-             "medical": {
-                 "keywords": ["medical", "health", "healthcare", "clinical", "patient", "diagnosis", "treatment"],
-                 "patterns": [
-                     r"medical\s*(qa|question|diagnosis)",
-                     r"healthcare\s*assistant",
-                     r"clinical\s*notes",
-                     r"patient\s*records"
-                 ],
-                 "weight": 1.0
-             },
-             "legal": {
-                 "keywords": ["legal", "law", "lawyer", "court", "contract", "compliance", "regulation"],
-                 "patterns": [
-                     r"legal\s*(document|analysis)",
-                     r"law\s*assistant",
-                     r"contract\s*review",
-                     r"compliance\s*check"
-                 ],
-                 "weight": 1.0
-             },
-             "financial": {
-                 "keywords": ["financial", "finance", "trading", "investment", "banking", "economic"],
-                 "patterns": [
-                     r"financial\s*analysis",
-                     r"trading\s*assistant",
-                     r"investment\s*advice",
-                     r"banking\s*support"
-                 ],
-                 "weight": 1.0
-             },
-             "technical": {
-                 "keywords": ["technical", "engineering", "software", "programming", "development", "api"],
-                 "patterns": [
-                     r"technical\s*documentation",
-                     r"engineering\s*assistant",
-                     r"api\s*documentation",
-                     r"software\s*support"
-                 ],
-                 "weight": 1.0
-             },
-             "education": {
-                 "keywords": ["education", "learning", "teaching", "student", "tutor", "academic"],
-                 "patterns": [
-                     r"educational\s*assistant",
-                     r"tutoring\s*system",
-                     r"academic\s*support",
-                     r"learning\s*companion"
-                 ],
-                 "weight": 1.0
-             },
-             "ecommerce": {
-                 "keywords": ["ecommerce", "shopping", "product", "recommendation", "retail", "customer"],
-                 "patterns": [
-                     r"product\s*recommendation",
-                     r"shopping\s*assistant",
-                     r"ecommerce\s*support",
-                     r"retail\s*assistant"
-                 ],
-                 "weight": 1.0
-             },
-             "general": {
-                 "keywords": ["general", "assistant", "helper", "support", "chatbot"],
-                 "patterns": [
-                     r"general\s*purpose",
-                     r"personal\s*assistant",
-                     r"general\s*chatbot"
-                 ],
-                 "weight": 0.5 # Lower weight as fallback
-             }
-         }
-
-     def _load_language_patterns(self) -> Dict[str, List[str]]:
-         """Load language detection patterns."""
-         return {
-             "chinese": ["chinese", "中文", "汉语", "普通话", "mandarin", "cantonese", "zh"],
-             "japanese": ["japanese", "日本語", "nihongo", "ja"],
-             "korean": ["korean", "한국어", "hangul", "ko"],
-             "spanish": ["spanish", "español", "castellano", "es"],
-             "french": ["french", "français", "fr"],
-             "german": ["german", "deutsch", "de"],
-             "italian": ["italian", "italiano", "it"],
-             "portuguese": ["portuguese", "português", "pt"],
-             "russian": ["russian", "русский", "ru"],
-             "arabic": ["arabic", "العربية", "ar"],
-             "hindi": ["hindi", "हिंदी", "hi"],
-             "english": ["english", "en"] # Default
-         }
-
-     def _extract_keywords(self, text: str) -> List[str]:
-         """Extract relevant keywords from text."""
-         # Simple keyword extraction
-         words = re.findall(r'\b\w+\b', text.lower())
-
-         # Filter out common stop words
-         stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}
-
-         keywords = [word for word in words if word not in stop_words and len(word) > 2]
-
-         return keywords[:20] # Return top 20 keywords
-
-     def _classify_task_type(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-         """Classify the primary task type."""
-         scores = {}
-
-         for task_type, patterns in self.task_patterns.items():
-             score = 0.0
-
-             # Check keywords
-             for keyword in patterns["keywords"]:
-                 if keyword in text or keyword in keywords:
-                     score += 1.0
-
-             # Check regex patterns
-             for pattern in patterns["patterns"]:
-                 if re.search(pattern, text):
-                     score += 2.0
-
-             # Apply weight
-             score *= patterns["weight"]
-             scores[task_type] = score
-
-         # Find highest scoring task type
-         if scores:
-             best_task = max(scores, key=scores.get)
-             confidence = min(1.0, scores[best_task] / 3.0) # Normalize confidence
-
-             if confidence > 0.3:
-                 return best_task, confidence
-
-         # Default to chat if no clear classification
-         return "chat", 0.5
-
-     def _classify_domain(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-         """Classify the domain/industry."""
-         scores = {}
-
-         for domain, patterns in self.domain_patterns.items():
-             score = 0.0
-
-             # Check keywords
-             for keyword in patterns["keywords"]:
-                 if keyword in text or keyword in keywords:
-                     score += 1.0
-
-             # Check regex patterns
-             for pattern in patterns["patterns"]:
-                 if re.search(pattern, text):
-                     score += 2.0
-
-             # Apply weight
-             score *= patterns["weight"]
-             scores[domain] = score
-
-         # Find highest scoring domain
-         if scores:
-             best_domain = max(scores, key=scores.get)
-             confidence = min(1.0, scores[best_domain] / 2.0)
-
-             if confidence > 0.3:
-                 return best_domain, confidence
-
-         # Default to general
-         return "general", 0.5
-
-     def _determine_modality(self, text: str, keywords: List[str], dataset_source: str) -> str:
-         """Determine the modality (text, image, audio, multimodal)."""
-         # Check for image-related keywords
-         image_keywords = ["image", "picture", "photo", "visual", "vision", "cnn", "resnet", "vit"]
-         if any(keyword in text for keyword in image_keywords):
-             return "image"
-
-         # Check for audio-related keywords
-         audio_keywords = ["audio", "speech", "voice", "sound", "whisper", "tts", "stt"]
-         if any(keyword in text for keyword in audio_keywords):
-             return "audio"
-
-         # Check for multimodal keywords
-         multimodal_keywords = ["multimodal", "vision-language", "clip", "blip", "image-text"]
-         if any(keyword in text for keyword in multimodal_keywords):
-             return "multimodal"
-
-         # Check dataset source for file extensions
-         if dataset_source:
-             if any(ext in dataset_source.lower() for ext in [".jpg", ".png", ".jpeg", ".gif", ".bmp"]):
-                 return "image"
-             elif any(ext in dataset_source.lower() for ext in [".wav", ".mp3", ".flac", ".m4a"]):
-                 return "audio"
-
-         # Default to text
-         return "text"
-
-     def _determine_training_type(self, text: str, keywords: List[str]) -> str:
-         """Determine the training approach."""
-         # Check for specific training types
-         if any(keyword in text for keyword in ["rlhf", "reinforcement", "human feedback"]):
-             return "rlhf"
-
-         if any(keyword in text for keyword in ["dpo", "direct preference", "preference optimization"]):
-             return "dpo"
-
-         if any(keyword in text for keyword in ["pretrain", "pretraining", "from scratch"]):
-             return "pretraining"
-
-         if any(keyword in text for keyword in ["instruction", "supervised", "fine-tune", "finetune"]):
-             return "sft"
-
-         # Default to SFT
-         return "sft"
-
-     def _analyze_complexity(self, text: str, keywords: List[str], dataset_source: str) -> str:
-         """Analyze task complexity."""
-         complexity_score = 0
-
-         # High complexity indicators
-         high_complexity_keywords = ["complex", "advanced", "sophisticated", "multi-step", "reasoning", "mathematical"]
-         if any(keyword in text for keyword in high_complexity_keywords):
-             complexity_score += 2
-
-         # Medium complexity indicators
-         medium_complexity_keywords = ["detailed", "comprehensive", "analysis", "professional"]
-         if any(keyword in text for keyword in medium_complexity_keywords):
-             complexity_score += 1
-
-         # Simple complexity indicators
-         simple_complexity_keywords = ["simple", "basic", "quick", "fast", "easy"]
-         if any(keyword in text for keyword in simple_complexity_keywords):
-             complexity_score -= 1
-
-         # Determine complexity level
-         if complexity_score >= 2:
-             return "complex"
-         elif complexity_score <= -1:
-             return "simple"
-         else:
-             return "medium"
-
-     def _detect_language(self, text: str, keywords: List[str]) -> str:
-         """Detect the target language."""
-         for language, patterns in self.language_patterns.items():
-             if any(pattern in text for pattern in patterns):
-                 return language
-
-         # Default to English
-         return "english"
-
-     def _determine_dataset_type(self, dataset_source: str, text: str) -> str:
-         """Determine the dataset type."""
-         if "alpaca" in dataset_source.lower() or "instruction" in text:
-             return "instruction"
-         elif "sharegpt" in dataset_source.lower() or "conversation" in text:
-             return "conversational"
-         elif "raw" in text or "text" in text:
-             return "raw_text"
-         else:
-             return "instruction" # Default
-
-     def _generate_reasoning(
-         self,
-         task_type: str,
-         domain: str,
-         modality: str,
-         training_type: str,
-         complexity: str,
-         keywords: List[str]
-     ) -> List[str]:
-         """Generate human-readable reasoning for the classification."""
-         reasoning = []
-
-         reasoning.append(f"Classified as {task_type} task based on keywords: {', '.join(keywords[:3])}")
-
-         if domain != "general":
-             reasoning.append(f"Identified {domain} domain specialization")
-
-         if modality != "text":
-             reasoning.append(f"Detected {modality} modality requirements")
-
-         if training_type != "sft":
-             reasoning.append(f"Recommended {training_type} training approach")
-
-         reasoning.append(f"Estimated {complexity} complexity level")
-
-         return reasoning
-
-     def get_supported_tasks(self) -> List[str]:
-         """Get list of supported task types."""
-         return list(self.task_patterns.keys())
-
-     def get_supported_domains(self) -> List[str]:
-         """Get list of supported domains."""
-         return list(self.domain_patterns.keys())
-
-     def classify_dataset(self, dataset_path: str) -> Dict[str, Any]:
-         """Classify a dataset file directly."""
-         try:
-             if not Path(dataset_path).exists():
-                 return {"error": f"Dataset not found: {dataset_path}"}
-
-             # Analyze file extension
-             suffix = Path(dataset_path).suffix.lower()
-
-             analysis = {
-                 "file_type": suffix,
-                 "size": 0,
-                 "format": "unknown",
-                 "language": "unknown",
-                 "estimated_samples": 0
-             }
-
-             if suffix == ".json":
-                 with open(dataset_path, 'r', encoding='utf-8') as f:
-                     data = json.load(f)
-
-                 if isinstance(data, list):
-                     analysis["estimated_samples"] = len(data)
-                     analysis["format"] = "json_list"
-
-                     # Analyze first sample
-                     if data:
-                         sample = data[0]
-                         if isinstance(sample, dict):
-                             if "instruction" in sample and "output" in sample:
-                                 analysis["format"] = "alpaca"
-                             elif "messages" in sample:
-                                 analysis["format"] = "sharegpt"
-                             elif "conversations" in sample:
-                                 analysis["format"] = "conversational"
-
-             analysis["size"] = Path(dataset_path).stat().st_size
-
-             return analysis
-
-         except Exception as e:
-             logger.error(f"Failed to classify dataset {dataset_path}: {e}")
-             return {"error": str(e)}
isa_model/training/storage/__init__.py
@@ -1,24 +0,0 @@
- """
- Training Data Storage Module
-
- This module provides persistent storage for training-related data:
- - Training job records and history
- - Model training metadata and metrics
- - Cost tracking and billing information
- - Integration with core model management
- - Model version management and lineage tracking
-
- Works seamlessly with existing core storage infrastructure.
- """
-
- from .training_storage import TrainingStorage, TrainingJobRecord, TrainingMetrics
- from .training_repository import TrainingRepository
- from .core_integration import CoreModelIntegration
-
- __all__ = [
-     'TrainingStorage',
-     'TrainingJobRecord',
-     'TrainingMetrics',
-     'TrainingRepository',
-     'CoreModelIntegration'
- ]
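Both deletions sit in the now-removed isa_model.training tree, so code that imported these names breaks on upgrade to 0.4.4. A minimal guard for consumers straddling both versions (names taken from the deleted __all__ above):

```python
# Defensive import for the training storage API removed in 0.4.4.
try:
    from isa_model.training.storage import TrainingStorage, TrainingRepository
except ImportError:
    # isa_model.training was dropped in 0.4.4; pin isa-model<0.4.4
    # or migrate off these classes.
    TrainingStorage = TrainingRepository = None
```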