isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
--- isa_model/training/intelligent/task_classifier.py
+++ /dev/null
@@ -1,576 +0,0 @@
-"""
-Task Classification System for Training Requests
-
-This module automatically classifies training tasks based on:
-- Natural language descriptions
-- Dataset characteristics
-- Model requirements
-- Domain-specific patterns
-
-Supports classification for LLM, CV, Audio, and multi-modal tasks.
-"""
-
-import logging
-import re
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass
-from pathlib import Path
-import json
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class TaskAnalysis:
-    """Results of task classification analysis."""
-
-    # Primary classification
-    task_type: str  # "chat", "classification", "summarization", "generation", etc.
-    domain: str  # "general", "medical", "legal", "technical", etc.
-    modality: str  # "text", "image", "audio", "multimodal"
-
-    # Training characteristics
-    training_type: str  # "sft", "rlhf", "dpo", "pretraining"
-    complexity: str  # "simple", "medium", "complex"
-
-    # Data characteristics
-    language: str = "english"
-    dataset_type: str = "instruction"  # "instruction", "conversational", "raw_text"
-    estimated_size: int = 0
-
-    # Confidence and metadata
-    confidence: float = 0.0
-    keywords: List[str] = None
-    reasoning: List[str] = None
-
-    def __post_init__(self):
-        if self.keywords is None:
-            self.keywords = []
-        if self.reasoning is None:
-            self.reasoning = []
-
-
-class TaskClassifier:
-    """
-    Intelligent task classification system.
-
-    Analyzes training requests and datasets to automatically determine:
-    - Task type (chat, classification, summarization, etc.)
-    - Domain (medical, legal, technical, etc.)
-    - Modality (text, image, audio, multimodal)
-    - Training approach (SFT, RLHF, DPO, etc.)
-    - Complexity level
-
-    Example:
-        ```python
-        classifier = TaskClassifier()
-
-        analysis = classifier.analyze_request(
-            "Fine-tune a model for medical question answering",
-            "medical_qa_dataset.json"
-        )
-
-        print(f"Task: {analysis.task_type}")
-        print(f"Domain: {analysis.domain}")
-        print(f"Training: {analysis.training_type}")
-        ```
-    """
-
-    def __init__(self):
-        """Initialize task classifier with pattern libraries."""
-        self.task_patterns = self._load_task_patterns()
-        self.domain_patterns = self._load_domain_patterns()
-        self.language_patterns = self._load_language_patterns()
-
-        logger.info("Task classifier initialized")
-
-    def analyze_request(self, description: str, dataset_source: str) -> TaskAnalysis:
-        """
-        Analyze training request and classify task.
-
-        Args:
-            description: Natural language description of training task
-            dataset_source: Path to dataset or dataset identifier
-
-        Returns:
-            Complete task analysis
-        """
-        logger.info(f"Classifying task: {description[:50]}...")
-
-        try:
-            # Step 1: Extract keywords and normalize text
-            keywords = self._extract_keywords(description)
-            normalized_desc = description.lower()
-
-            # Step 2: Classify task type
-            task_type, task_confidence = self._classify_task_type(normalized_desc, keywords)
-
-            # Step 3: Classify domain
-            domain, domain_confidence = self._classify_domain(normalized_desc, keywords)
-
-            # Step 4: Determine modality
-            modality = self._determine_modality(normalized_desc, keywords, dataset_source)
-
-            # Step 5: Determine training type
-            training_type = self._determine_training_type(normalized_desc, keywords)
-
-            # Step 6: Analyze complexity
-            complexity = self._analyze_complexity(normalized_desc, keywords, dataset_source)
-
-            # Step 7: Detect language
-            language = self._detect_language(normalized_desc, keywords)
-
-            # Step 8: Determine dataset type
-            dataset_type = self._determine_dataset_type(dataset_source, normalized_desc)
-
-            # Step 9: Generate reasoning
-            reasoning = self._generate_reasoning(
-                task_type, domain, modality, training_type, complexity, keywords
-            )
-
-            # Step 10: Calculate overall confidence
-            overall_confidence = (task_confidence + domain_confidence) / 2
-
-            analysis = TaskAnalysis(
-                task_type=task_type,
-                domain=domain,
-                modality=modality,
-                training_type=training_type,
-                complexity=complexity,
-                language=language,
-                dataset_type=dataset_type,
-                confidence=overall_confidence,
-                keywords=keywords,
-                reasoning=reasoning
-            )
-
-            logger.info(f"Task classified: {task_type} ({domain}) - {training_type}")
-            return analysis
-
-        except Exception as e:
-            logger.error(f"Task classification failed: {e}")
-            # Return default analysis
-            return TaskAnalysis(
-                task_type="sft",
-                domain="general",
-                modality="text",
-                training_type="sft",
-                complexity="medium",
-                confidence=0.1,
-                reasoning=["Classification failed, using defaults"]
-            )
-
-    def _load_task_patterns(self) -> Dict[str, Dict[str, Any]]:
-        """Load task type classification patterns."""
-        return {
-            "chat": {
-                "keywords": ["chat", "conversation", "dialogue", "chatbot", "assistant", "qa", "question", "answer"],
-                "patterns": [
-                    r"chat\s*(bot|assistant)",
-                    r"(conversation|dialogue)\s*model",
-                    r"question\s*answer",
-                    r"customer\s*service",
-                    r"virtual\s*assistant"
-                ],
-                "weight": 1.0
-            },
-            "classification": {
-                "keywords": ["classify", "classification", "categorize", "category", "label", "sentiment", "emotion"],
-                "patterns": [
-                    r"(text|document)\s*classification",
-                    r"sentiment\s*analysis",
-                    r"categoriz[ae]",
-                    r"label\s*prediction",
-                    r"emotion\s*detection"
-                ],
-                "weight": 1.0
-            },
-            "summarization": {
-                "keywords": ["summarize", "summary", "summarization", "abstract", "brief", "condense"],
-                "patterns": [
-                    r"summariz[ae]",
-                    r"abstract\s*generation",
-                    r"text\s*summary",
-                    r"document\s*summary"
-                ],
-                "weight": 1.0
-            },
-            "generation": {
-                "keywords": ["generate", "generation", "creative", "story", "content", "write", "writing"],
-                "patterns": [
-                    r"text\s*generation",
-                    r"content\s*generation",
-                    r"creative\s*writing",
-                    r"story\s*generation"
-                ],
-                "weight": 1.0
-            },
-            "translation": {
-                "keywords": ["translate", "translation", "multilingual", "language", "cross-lingual"],
-                "patterns": [
-                    r"translation",
-                    r"translate\s*between",
-                    r"multilingual",
-                    r"cross-lingual"
-                ],
-                "weight": 1.0
-            },
-            "reasoning": {
-                "keywords": ["reasoning", "logic", "math", "mathematical", "problem", "solve"],
-                "patterns": [
-                    r"mathematical\s*reasoning",
-                    r"logical\s*reasoning",
-                    r"problem\s*solving",
-                    r"math\s*problems"
-                ],
-                "weight": 1.0
-            },
-            "code": {
-                "keywords": ["code", "programming", "python", "javascript", "sql", "development"],
-                "patterns": [
-                    r"code\s*(generation|completion)",
-                    r"programming\s*assistance",
-                    r"software\s*development",
-                    r"(python|javascript|sql)\s*code"
-                ],
-                "weight": 1.0
-            }
-        }
-
-    def _load_domain_patterns(self) -> Dict[str, Dict[str, Any]]:
-        """Load domain classification patterns."""
-        return {
-            "medical": {
-                "keywords": ["medical", "health", "healthcare", "clinical", "patient", "diagnosis", "treatment"],
-                "patterns": [
-                    r"medical\s*(qa|question|diagnosis)",
-                    r"healthcare\s*assistant",
-                    r"clinical\s*notes",
-                    r"patient\s*records"
-                ],
-                "weight": 1.0
-            },
-            "legal": {
-                "keywords": ["legal", "law", "lawyer", "court", "contract", "compliance", "regulation"],
-                "patterns": [
-                    r"legal\s*(document|analysis)",
-                    r"law\s*assistant",
-                    r"contract\s*review",
-                    r"compliance\s*check"
-                ],
-                "weight": 1.0
-            },
-            "financial": {
-                "keywords": ["financial", "finance", "trading", "investment", "banking", "economic"],
-                "patterns": [
-                    r"financial\s*analysis",
-                    r"trading\s*assistant",
-                    r"investment\s*advice",
-                    r"banking\s*support"
-                ],
-                "weight": 1.0
-            },
-            "technical": {
-                "keywords": ["technical", "engineering", "software", "programming", "development", "api"],
-                "patterns": [
-                    r"technical\s*documentation",
-                    r"engineering\s*assistant",
-                    r"api\s*documentation",
-                    r"software\s*support"
-                ],
-                "weight": 1.0
-            },
-            "education": {
-                "keywords": ["education", "learning", "teaching", "student", "tutor", "academic"],
-                "patterns": [
-                    r"educational\s*assistant",
-                    r"tutoring\s*system",
-                    r"academic\s*support",
-                    r"learning\s*companion"
-                ],
-                "weight": 1.0
-            },
-            "ecommerce": {
-                "keywords": ["ecommerce", "shopping", "product", "recommendation", "retail", "customer"],
-                "patterns": [
-                    r"product\s*recommendation",
-                    r"shopping\s*assistant",
-                    r"ecommerce\s*support",
-                    r"retail\s*assistant"
-                ],
-                "weight": 1.0
-            },
-            "general": {
-                "keywords": ["general", "assistant", "helper", "support", "chatbot"],
-                "patterns": [
-                    r"general\s*purpose",
-                    r"personal\s*assistant",
-                    r"general\s*chatbot"
-                ],
-                "weight": 0.5  # Lower weight as fallback
-            }
-        }
-
-    def _load_language_patterns(self) -> Dict[str, List[str]]:
-        """Load language detection patterns."""
-        return {
-            "chinese": ["chinese", "中文", "汉语", "普通话", "mandarin", "cantonese", "zh"],
-            "japanese": ["japanese", "日本語", "nihongo", "ja"],
-            "korean": ["korean", "한국어", "hangul", "ko"],
-            "spanish": ["spanish", "español", "castellano", "es"],
-            "french": ["french", "français", "fr"],
-            "german": ["german", "deutsch", "de"],
-            "italian": ["italian", "italiano", "it"],
-            "portuguese": ["portuguese", "português", "pt"],
-            "russian": ["russian", "русский", "ru"],
-            "arabic": ["arabic", "العربية", "ar"],
-            "hindi": ["hindi", "हिंदी", "hi"],
-            "english": ["english", "en"]  # Default
-        }
-
-    def _extract_keywords(self, text: str) -> List[str]:
-        """Extract relevant keywords from text."""
-        # Simple keyword extraction
-        words = re.findall(r'\b\w+\b', text.lower())
-
-        # Filter out common stop words
-        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those'}
-
-        keywords = [word for word in words if word not in stop_words and len(word) > 2]
-
-        return keywords[:20]  # Return top 20 keywords
-
-    def _classify_task_type(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-        """Classify the primary task type."""
-        scores = {}
-
-        for task_type, patterns in self.task_patterns.items():
-            score = 0.0
-
-            # Check keywords
-            for keyword in patterns["keywords"]:
-                if keyword in text or keyword in keywords:
-                    score += 1.0
-
-            # Check regex patterns
-            for pattern in patterns["patterns"]:
-                if re.search(pattern, text):
-                    score += 2.0
-
-            # Apply weight
-            score *= patterns["weight"]
-            scores[task_type] = score
-
-        # Find highest scoring task type
-        if scores:
-            best_task = max(scores, key=scores.get)
-            confidence = min(1.0, scores[best_task] / 3.0)  # Normalize confidence
-
-            if confidence > 0.3:
-                return best_task, confidence
-
-        # Default to chat if no clear classification
-        return "chat", 0.5
-
-    def _classify_domain(self, text: str, keywords: List[str]) -> Tuple[str, float]:
-        """Classify the domain/industry."""
-        scores = {}
-
-        for domain, patterns in self.domain_patterns.items():
-            score = 0.0
-
-            # Check keywords
-            for keyword in patterns["keywords"]:
-                if keyword in text or keyword in keywords:
-                    score += 1.0
-
-            # Check regex patterns
-            for pattern in patterns["patterns"]:
-                if re.search(pattern, text):
-                    score += 2.0
-
-            # Apply weight
-            score *= patterns["weight"]
-            scores[domain] = score
-
-        # Find highest scoring domain
-        if scores:
-            best_domain = max(scores, key=scores.get)
-            confidence = min(1.0, scores[best_domain] / 2.0)
-
-            if confidence > 0.3:
-                return best_domain, confidence
-
-        # Default to general
-        return "general", 0.5
-
-    def _determine_modality(self, text: str, keywords: List[str], dataset_source: str) -> str:
-        """Determine the modality (text, image, audio, multimodal)."""
-        # Check for image-related keywords
-        image_keywords = ["image", "picture", "photo", "visual", "vision", "cnn", "resnet", "vit"]
-        if any(keyword in text for keyword in image_keywords):
-            return "image"
-
-        # Check for audio-related keywords
-        audio_keywords = ["audio", "speech", "voice", "sound", "whisper", "tts", "stt"]
-        if any(keyword in text for keyword in audio_keywords):
-            return "audio"
-
-        # Check for multimodal keywords
-        multimodal_keywords = ["multimodal", "vision-language", "clip", "blip", "image-text"]
-        if any(keyword in text for keyword in multimodal_keywords):
-            return "multimodal"
-
-        # Check dataset source for file extensions
-        if dataset_source:
-            if any(ext in dataset_source.lower() for ext in [".jpg", ".png", ".jpeg", ".gif", ".bmp"]):
-                return "image"
-            elif any(ext in dataset_source.lower() for ext in [".wav", ".mp3", ".flac", ".m4a"]):
-                return "audio"
-
-        # Default to text
-        return "text"
-
-    def _determine_training_type(self, text: str, keywords: List[str]) -> str:
-        """Determine the training approach."""
-        # Check for specific training types
-        if any(keyword in text for keyword in ["rlhf", "reinforcement", "human feedback"]):
-            return "rlhf"
-
-        if any(keyword in text for keyword in ["dpo", "direct preference", "preference optimization"]):
-            return "dpo"
-
-        if any(keyword in text for keyword in ["pretrain", "pretraining", "from scratch"]):
-            return "pretraining"
-
-        if any(keyword in text for keyword in ["instruction", "supervised", "fine-tune", "finetune"]):
-            return "sft"
-
-        # Default to SFT
-        return "sft"
-
-    def _analyze_complexity(self, text: str, keywords: List[str], dataset_source: str) -> str:
-        """Analyze task complexity."""
-        complexity_score = 0
-
-        # High complexity indicators
-        high_complexity_keywords = ["complex", "advanced", "sophisticated", "multi-step", "reasoning", "mathematical"]
-        if any(keyword in text for keyword in high_complexity_keywords):
-            complexity_score += 2
-
-        # Medium complexity indicators
-        medium_complexity_keywords = ["detailed", "comprehensive", "analysis", "professional"]
-        if any(keyword in text for keyword in medium_complexity_keywords):
-            complexity_score += 1
-
-        # Simple complexity indicators
-        simple_complexity_keywords = ["simple", "basic", "quick", "fast", "easy"]
-        if any(keyword in text for keyword in simple_complexity_keywords):
-            complexity_score -= 1
-
-        # Determine complexity level
-        if complexity_score >= 2:
-            return "complex"
-        elif complexity_score <= -1:
-            return "simple"
-        else:
-            return "medium"
-
-    def _detect_language(self, text: str, keywords: List[str]) -> str:
-        """Detect the target language."""
-        for language, patterns in self.language_patterns.items():
-            if any(pattern in text for pattern in patterns):
-                return language
-
-        # Default to English
-        return "english"
-
-    def _determine_dataset_type(self, dataset_source: str, text: str) -> str:
-        """Determine the dataset type."""
-        if "alpaca" in dataset_source.lower() or "instruction" in text:
-            return "instruction"
-        elif "sharegpt" in dataset_source.lower() or "conversation" in text:
-            return "conversational"
-        elif "raw" in text or "text" in text:
-            return "raw_text"
-        else:
-            return "instruction"  # Default
-
-    def _generate_reasoning(
-        self,
-        task_type: str,
-        domain: str,
-        modality: str,
-        training_type: str,
-        complexity: str,
-        keywords: List[str]
-    ) -> List[str]:
-        """Generate human-readable reasoning for the classification."""
-        reasoning = []
-
-        reasoning.append(f"Classified as {task_type} task based on keywords: {', '.join(keywords[:3])}")
-
-        if domain != "general":
-            reasoning.append(f"Identified {domain} domain specialization")
-
-        if modality != "text":
-            reasoning.append(f"Detected {modality} modality requirements")
-
-        if training_type != "sft":
-            reasoning.append(f"Recommended {training_type} training approach")
-
-        reasoning.append(f"Estimated {complexity} complexity level")
-
-        return reasoning
-
-    def get_supported_tasks(self) -> List[str]:
-        """Get list of supported task types."""
-        return list(self.task_patterns.keys())
-
-    def get_supported_domains(self) -> List[str]:
-        """Get list of supported domains."""
-        return list(self.domain_patterns.keys())
-
-    def classify_dataset(self, dataset_path: str) -> Dict[str, Any]:
-        """Classify a dataset file directly."""
-        try:
-            if not Path(dataset_path).exists():
-                return {"error": f"Dataset not found: {dataset_path}"}
-
-            # Analyze file extension
-            suffix = Path(dataset_path).suffix.lower()
-
-            analysis = {
-                "file_type": suffix,
-                "size": 0,
-                "format": "unknown",
-                "language": "unknown",
-                "estimated_samples": 0
-            }
-
-            if suffix == ".json":
-                with open(dataset_path, 'r', encoding='utf-8') as f:
-                    data = json.load(f)
-
-                if isinstance(data, list):
-                    analysis["estimated_samples"] = len(data)
-                    analysis["format"] = "json_list"
-
-                    # Analyze first sample
-                    if data:
-                        sample = data[0]
-                        if isinstance(sample, dict):
-                            if "instruction" in sample and "output" in sample:
-                                analysis["format"] = "alpaca"
-                            elif "messages" in sample:
-                                analysis["format"] = "sharegpt"
-                            elif "conversations" in sample:
-                                analysis["format"] = "conversational"
-
-            analysis["size"] = Path(dataset_path).stat().st_size
-
-            return analysis
-
-        except Exception as e:
-            logger.error(f"Failed to classify dataset {dataset_path}: {e}")
-            return {"error": str(e)}
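Note: the removed `classify_dataset()` above keys its format detection on the field names of the first JSON record. A minimal sketch of the three record shapes it recognizes (field names come from the deleted code; sample values are invented for illustration):

```python
# Record shapes the removed classify_dataset() maps to formats.
# Only the top-level field names are checked by the deleted code;
# the inner structures shown here are assumptions for illustration.
alpaca_record = {
    "instruction": "Summarize the clinical note.",  # "instruction" + "output" -> "alpaca"
    "output": "Patient presents with ...",
}
sharegpt_record = {
    "messages": [{"role": "user", "content": "Hello"}],  # "messages" -> "sharegpt"
}
conversational_record = {
    "conversations": ["Hello", "Hi there!"],  # "conversations" -> "conversational"
}
```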
--- isa_model/training/storage/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-Training Data Storage Module
-
-This module provides persistent storage for training-related data:
-- Training job records and history
-- Model training metadata and metrics
-- Cost tracking and billing information
-- Integration with core model management
-- Model version management and lineage tracking
-
-Works seamlessly with existing core storage infrastructure.
-"""
-
-from .training_storage import TrainingStorage, TrainingJobRecord, TrainingMetrics
-from .training_repository import TrainingRepository
-from .core_integration import CoreModelIntegration
-
-__all__ = [
-    'TrainingStorage',
-    'TrainingJobRecord',
-    'TrainingMetrics',
-    'TrainingRepository',
-    'CoreModelIntegration'
-]
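Since the file list above shows the entire `isa_model/training` tree deleted in 0.4.4, imports that previously resolved against this `__init__.py` no longer do. A sketch of the affected import surface, with names taken from the removed `__all__`:

```python
# Names exported by the removed isa_model.training.storage package.
# Against 0.4.4 this import presumably raises ModuleNotFoundError,
# since the isa_model/training tree no longer ships in the wheel.
from isa_model.training.storage import (
    TrainingStorage,
    TrainingJobRecord,
    TrainingMetrics,
    TrainingRepository,
    CoreModelIntegration,
)
```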