stratifyai-0.1.0-py3-none-any.whl
This diff shows the content of a publicly released package version as it appears in its public registry. It is provided for informational purposes only.
- cli/__init__.py +5 -0
- cli/stratifyai_cli.py +1753 -0
- stratifyai/__init__.py +113 -0
- stratifyai/api_key_helper.py +372 -0
- stratifyai/caching.py +279 -0
- stratifyai/chat/__init__.py +54 -0
- stratifyai/chat/builder.py +366 -0
- stratifyai/chat/stratifyai_anthropic.py +194 -0
- stratifyai/chat/stratifyai_bedrock.py +200 -0
- stratifyai/chat/stratifyai_deepseek.py +194 -0
- stratifyai/chat/stratifyai_google.py +194 -0
- stratifyai/chat/stratifyai_grok.py +194 -0
- stratifyai/chat/stratifyai_groq.py +195 -0
- stratifyai/chat/stratifyai_ollama.py +201 -0
- stratifyai/chat/stratifyai_openai.py +209 -0
- stratifyai/chat/stratifyai_openrouter.py +201 -0
- stratifyai/chunking.py +158 -0
- stratifyai/client.py +292 -0
- stratifyai/config.py +1273 -0
- stratifyai/cost_tracker.py +257 -0
- stratifyai/embeddings.py +245 -0
- stratifyai/exceptions.py +91 -0
- stratifyai/models.py +59 -0
- stratifyai/providers/__init__.py +5 -0
- stratifyai/providers/anthropic.py +330 -0
- stratifyai/providers/base.py +183 -0
- stratifyai/providers/bedrock.py +634 -0
- stratifyai/providers/deepseek.py +39 -0
- stratifyai/providers/google.py +39 -0
- stratifyai/providers/grok.py +39 -0
- stratifyai/providers/groq.py +39 -0
- stratifyai/providers/ollama.py +43 -0
- stratifyai/providers/openai.py +344 -0
- stratifyai/providers/openai_compatible.py +372 -0
- stratifyai/providers/openrouter.py +39 -0
- stratifyai/py.typed +2 -0
- stratifyai/rag.py +381 -0
- stratifyai/retry.py +185 -0
- stratifyai/router.py +643 -0
- stratifyai/summarization.py +179 -0
- stratifyai/utils/__init__.py +11 -0
- stratifyai/utils/bedrock_validator.py +136 -0
- stratifyai/utils/code_extractor.py +327 -0
- stratifyai/utils/csv_extractor.py +197 -0
- stratifyai/utils/file_analyzer.py +192 -0
- stratifyai/utils/json_extractor.py +219 -0
- stratifyai/utils/log_extractor.py +267 -0
- stratifyai/utils/model_selector.py +324 -0
- stratifyai/utils/provider_validator.py +442 -0
- stratifyai/utils/token_counter.py +186 -0
- stratifyai/vectordb.py +344 -0
- stratifyai-0.1.0.dist-info/METADATA +263 -0
- stratifyai-0.1.0.dist-info/RECORD +57 -0
- stratifyai-0.1.0.dist-info/WHEEL +5 -0
- stratifyai-0.1.0.dist-info/entry_points.txt +2 -0
- stratifyai-0.1.0.dist-info/licenses/LICENSE +21 -0
- stratifyai-0.1.0.dist-info/top_level.txt +2 -0
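Per top_level.txt (two entries), the wheel installs two importable top-level packages, cli and stratifyai, and the two utils modules diffed below are reachable as plain submodules. A minimal import sketch, assuming no re-exports beyond the paths listed above:

from stratifyai.utils import log_extractor, model_selector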
stratifyai/utils/log_extractor.py
@@ -0,0 +1,267 @@
"""Log file error extraction for intelligent file analysis.

This module extracts errors, warnings, and patterns from log files to reduce
token usage by 90%+ while preserving critical information.
"""

import re
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional


@dataclass
class LogEntry:
    """A single log entry."""
    timestamp: Optional[str]
    level: str  # ERROR, WARN, INFO, DEBUG, etc.
    message: str
    line_number: int


@dataclass
class LogSummary:
    """Summary of log file analysis."""
    file_path: str
    total_lines: int
    errors: List[LogEntry]
    warnings: List[LogEntry]
    error_patterns: Dict[str, int]
    warning_patterns: Dict[str, int]
    timestamp_range: Optional[tuple[str, str]]

    def to_text(self) -> str:
        """Convert summary to human-readable text.

        Returns:
            Formatted log summary
        """
        lines = [
            f"Log File: {self.file_path}",
            f"Total Lines: {self.total_lines:,}",
            f"Errors: {len(self.errors)}",
            f"Warnings: {len(self.warnings)}",
        ]

        if self.timestamp_range:
            lines.append(f"Time Range: {self.timestamp_range[0]} to {self.timestamp_range[1]}")

        # Error patterns
        if self.error_patterns:
            lines.append("\nTop Error Patterns:")
            for pattern, count in sorted(self.error_patterns.items(), key=lambda x: -x[1])[:10]:
                lines.append(f"  [{count}×] {pattern}")

        # Warning patterns
        if self.warning_patterns:
            lines.append("\nTop Warning Patterns:")
            for pattern, count in sorted(self.warning_patterns.items(), key=lambda x: -x[1])[:10]:
                lines.append(f"  [{count}×] {pattern}")

        # Recent errors (last 5)
        if self.errors:
            lines.append("\nRecent Errors:")
            for entry in self.errors[-5:]:
                ts = entry.timestamp or "??:??:??"
                lines.append(f"  [Line {entry.line_number}] {ts} - {entry.message[:100]}")

        # Recent warnings (last 5)
        if self.warnings:
            lines.append("\nRecent Warnings:")
            for entry in self.warnings[-5:]:
                ts = entry.timestamp or "??:??:??"
                lines.append(f"  [Line {entry.line_number}] {ts} - {entry.message[:100]}")

        return "\n".join(lines)


# Common log patterns
LOG_LEVEL_PATTERNS = [
    (r'\b(ERROR|FATAL|CRITICAL)\b', 'ERROR'),
    (r'\b(WARN|WARNING)\b', 'WARN'),
    (r'\b(INFO|INFORMATION)\b', 'INFO'),
    (r'\b(DEBUG|TRACE)\b', 'DEBUG'),
]

TIMESTAMP_PATTERNS = [
    r'\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}',  # ISO format
    r'\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2}',    # US format
    r'\d{2}:\d{2}:\d{2}',                        # Time only
]


def extract_timestamp(line: str) -> Optional[str]:
    """Extract timestamp from log line.

    Args:
        line: Log line text

    Returns:
        Extracted timestamp or None
    """
    for pattern in TIMESTAMP_PATTERNS:
        match = re.search(pattern, line)
        if match:
            return match.group(0)
    return None


def extract_log_level(line: str) -> Optional[str]:
    """Extract log level from log line.

    Args:
        line: Log line text

    Returns:
        Log level (ERROR, WARN, INFO, DEBUG) or None
    """
    for pattern, level in LOG_LEVEL_PATTERNS:
        if re.search(pattern, line, re.IGNORECASE):
            return level
    return None


def extract_error_pattern(message: str) -> str:
    """Extract pattern from error message by removing variable parts.

    Args:
        message: Error message

    Returns:
        Pattern string with variables replaced
    """
    # Remove timestamps first, before the <NUM> pass rewrites their digits
    pattern = message
    for ts_pattern in TIMESTAMP_PATTERNS:
        pattern = re.sub(ts_pattern, '<TIME>', pattern)

    # Remove numbers
    pattern = re.sub(r'\b\d+\b', '<NUM>', pattern)

    # Remove hex addresses
    pattern = re.sub(r'0x[0-9a-fA-F]+', '<ADDR>', pattern)

    # Remove quoted strings
    pattern = re.sub(r'"[^"]*"', '<STR>', pattern)
    pattern = re.sub(r"'[^']*'", '<STR>', pattern)

    # Remove file paths
    pattern = re.sub(r'(/[\w/.-]+|[A-Z]:\\[\w\\.-]+)', '<PATH>', pattern)

    return pattern[:200]  # Truncate long patterns


def analyze_log_file(
    file_path: Path,
    max_lines: Optional[int] = None,
    max_errors: int = 100,
    max_warnings: int = 100
) -> LogSummary:
    """Analyze a log file and extract errors/warnings.

    Args:
        file_path: Path to log file
        max_lines: Maximum lines to read (None = all)
        max_errors: Maximum errors to store
        max_warnings: Maximum warnings to store

    Returns:
        LogSummary with extracted information

    Raises:
        FileNotFoundError: If file doesn't exist
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Log file not found: {file_path}")

    errors: List[LogEntry] = []
    warnings: List[LogEntry] = []
    error_messages: List[str] = []
    warning_messages: List[str] = []
    timestamps: List[str] = []

    total_lines = 0

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line_num, line in enumerate(f, 1):
            if max_lines and line_num > max_lines:
                break

            total_lines += 1
            line = line.strip()
            if not line:
                continue

            # Extract timestamp
            timestamp = extract_timestamp(line)
            if timestamp:
                timestamps.append(timestamp)

            # Extract log level
            level = extract_log_level(line)

            if level == 'ERROR':
                if len(errors) < max_errors:
                    errors.append(LogEntry(
                        timestamp=timestamp,
                        level='ERROR',
                        message=line,
                        line_number=line_num
                    ))
                # Count every error toward pattern statistics, even past the cap
                error_messages.append(line)

            elif level == 'WARN':
                if len(warnings) < max_warnings:
                    warnings.append(LogEntry(
                        timestamp=timestamp,
                        level='WARN',
                        message=line,
                        line_number=line_num
                    ))
                warning_messages.append(line)

    # Extract patterns
    error_patterns = Counter(extract_error_pattern(msg) for msg in error_messages)
    warning_patterns = Counter(extract_error_pattern(msg) for msg in warning_messages)

    # Timestamp range
    timestamp_range = None
    if timestamps:
        timestamp_range = (timestamps[0], timestamps[-1])

    return LogSummary(
        file_path=str(file_path),
        total_lines=total_lines,
        errors=errors,
        warnings=warnings,
        error_patterns=dict(error_patterns.most_common(10)),
        warning_patterns=dict(warning_patterns.most_common(10)),
        timestamp_range=timestamp_range
    )


def extract_log_summary(file_path: Path) -> Dict[str, Any]:
    """Extract summary information from a log file.

    Args:
        file_path: Path to log file

    Returns:
        Dictionary with summary and metadata
    """
    summary = analyze_log_file(file_path)
    summary_text = summary.to_text()

    # Calculate sizes
    original_size = file_path.stat().st_size
    summary_size = len(summary_text)

    return {
        'summary': summary,
        'summary_text': summary_text,
        'original_size_bytes': original_size,
        'summary_size_bytes': summary_size,
        'token_reduction_pct': ((original_size - summary_size) / original_size * 100) if original_size > 0 else 0.0,
        'recommended_action': 'Use summary for LLM analysis instead of full log file'
    }
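In practice the module is used through extract_log_summary, which wraps analyze_log_file and reports how much smaller the summary is than the raw log. A minimal usage sketch (the log path is hypothetical; the function name and result keys are exactly those defined above):

from pathlib import Path

from stratifyai.utils.log_extractor import extract_log_summary

report = extract_log_summary(Path("server.log"))  # hypothetical log file
print(report["summary_text"])                     # LogSummary.to_text() output
print(f"Size reduction: {report['token_reduction_pct']:.1f}%")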
stratifyai/utils/model_selector.py
@@ -0,0 +1,324 @@
"""Intelligent model selection for file extraction tasks.

This module provides automatic model selection based on file type,
extraction mode, and task requirements, optimizing for quality in
structured data extraction scenarios.
"""

from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import List, Optional, Tuple

from .file_analyzer import FileType


class ExtractionMode(Enum):
    """Types of extraction tasks."""
    SCHEMA = "schema"        # CSV/JSON schema extraction
    ERRORS = "errors"        # Log file error extraction
    SUMMARY = "summary"      # General summarization
    STRUCTURE = "structure"  # Code structure extraction


@dataclass
class ModelRecommendation:
    """Model recommendation with reasoning."""
    provider: str
    model: str
    reasoning: str
    quality_score: float
    estimated_cost_per_1m: float


class ModelSelector:
    """Select optimal models for file extraction tasks."""

    def __init__(self):
        """Initialize model selector with provider preferences."""
        # Quality-focused models for schema extraction (ordered by preference)
        self.schema_extraction_models = [
            ("anthropic", "claude-3-5-sonnet-20241022", "Best structured output quality"),
            ("openai", "gpt-4.5-turbo-20250205", "Excellent for data analysis"),
            ("anthropic", "claude-sonnet-4-5-20250929", "High quality, latest model"),
            ("google", "gemini-2.5-pro", "Strong structured reasoning"),
        ]

        # Reasoning models for error/log analysis
        self.error_extraction_models = [
            ("deepseek", "deepseek-reasoner", "Excellent reasoning for error analysis"),
            ("openai", "o1-mini", "Fast reasoning model"),
            ("openai", "o3-mini", "Advanced reasoning"),
            ("anthropic", "claude-3-5-sonnet-20241022", "Reliable error detection"),
        ]

        # Code-optimized models for structure extraction
        self.code_extraction_models = [
            ("deepseek", "deepseek-chat", "Optimized for code understanding"),
            ("anthropic", "claude-3-5-sonnet-20241022", "Strong code analysis"),
            ("openai", "gpt-4.5-turbo-20250205", "Excellent code comprehension"),
            ("google", "gemini-2.5-pro", "Good at code structure"),
        ]

        # Fast models for general summarization
        self.summary_models = [
            ("google", "gemini-2.5-flash", "Fast and cost-effective"),
            ("groq", "llama-3.1-70b-versatile", "Very fast inference"),
            ("anthropic", "claude-3-5-haiku-20241022", "Quick, quality summaries"),
            ("openai", "gpt-4.1-mini", "Balanced speed/quality"),
        ]

    def select_for_file(
        self,
        file_path: Path,
        extraction_mode: Optional[ExtractionMode] = None,
        excluded_providers: Optional[List[str]] = None,
    ) -> ModelRecommendation:
        """Select optimal model for file extraction.

        Args:
            file_path: Path to file being analyzed
            extraction_mode: Specific extraction mode (auto-detected if None)
            excluded_providers: Providers to exclude from selection

        Returns:
            ModelRecommendation with selected provider/model and reasoning

        Examples:
            >>> selector = ModelSelector()
            >>> rec = selector.select_for_file(Path("data.csv"))
            >>> print(f"{rec.provider}/{rec.model}: {rec.reasoning}")
        """
        # Imported here rather than at module level to avoid a circular import
        from .file_analyzer import detect_file_type

        excluded = excluded_providers or []
        file_type = detect_file_type(file_path)

        # Auto-detect extraction mode if not specified
        if extraction_mode is None:
            extraction_mode = self._infer_extraction_mode(file_type, file_path)

        # Select model list based on extraction mode
        if extraction_mode == ExtractionMode.SCHEMA:
            candidates = self.schema_extraction_models
        elif extraction_mode == ExtractionMode.ERRORS:
            candidates = self.error_extraction_models
        elif extraction_mode == ExtractionMode.STRUCTURE:
            candidates = self.code_extraction_models
        elif extraction_mode == ExtractionMode.SUMMARY:
            candidates = self.summary_models
        else:
            # Fallback to schema extraction models (highest quality)
            candidates = self.schema_extraction_models

        # Filter out excluded providers
        available_candidates = [
            (provider, model, reason)
            for provider, model, reason in candidates
            if provider not in excluded
        ]

        if not available_candidates:
            # Fallback to any available provider
            all_models = (
                self.schema_extraction_models +
                self.error_extraction_models +
                self.code_extraction_models +
                self.summary_models
            )
            available_candidates = [
                (provider, model, reason)
                for provider, model, reason in all_models
                if provider not in excluded
            ]

        if not available_candidates:
            raise ValueError(
                "No available models after filtering excluded providers. "
                f"Excluded: {excluded}"
            )

        # Select first available (highest priority)
        provider, model, reasoning = available_candidates[0]

        # Get quality score and cost estimates
        quality_score = self._get_quality_score(provider, model)
        estimated_cost = self._get_estimated_cost(provider, model)

        # Enhance reasoning with file type
        full_reasoning = f"{reasoning} (for {file_type.value} {extraction_mode.value})"

        return ModelRecommendation(
            provider=provider,
            model=model,
            reasoning=full_reasoning,
            quality_score=quality_score,
            estimated_cost_per_1m=estimated_cost,
        )

    def select_for_extraction_mode(
        self,
        file_type: FileType,
        extraction_mode: ExtractionMode,
        excluded_providers: Optional[List[str]] = None,
    ) -> ModelRecommendation:
        """Select model based on file type and extraction mode.

        Args:
            file_type: Detected file type
            extraction_mode: Type of extraction task
            excluded_providers: Providers to exclude

        Returns:
            ModelRecommendation with selected provider/model
        """
        excluded = excluded_providers or []

        # Select candidates based on mode
        if extraction_mode == ExtractionMode.SCHEMA:
            candidates = self.schema_extraction_models
        elif extraction_mode == ExtractionMode.ERRORS:
            candidates = self.error_extraction_models
        elif extraction_mode == ExtractionMode.STRUCTURE:
            candidates = self.code_extraction_models
        else:
            candidates = self.summary_models

        # Filter and select
        available = [
            (p, m, r) for p, m, r in candidates
            if p not in excluded
        ]

        if not available:
            raise ValueError("No available models after filtering")

        provider, model, reasoning = available[0]

        return ModelRecommendation(
            provider=provider,
            model=model,
            reasoning=f"{reasoning} (for {file_type.value} {extraction_mode.value})",
            quality_score=self._get_quality_score(provider, model),
            estimated_cost_per_1m=self._get_estimated_cost(provider, model),
        )

    def _infer_extraction_mode(self, file_type: FileType, file_path: Path) -> ExtractionMode:
        """Infer extraction mode from file type and name.

        Args:
            file_type: Detected file type
            file_path: Path to file

        Returns:
            Inferred ExtractionMode
        """
        # Map file types to extraction modes
        if file_type in (FileType.CSV, FileType.JSON):
            return ExtractionMode.SCHEMA
        elif file_type == FileType.LOG:
            return ExtractionMode.ERRORS
        elif file_type in (FileType.PYTHON, FileType.JAVASCRIPT, FileType.JAVA, FileType.GO):
            return ExtractionMode.STRUCTURE
        else:
            # Check filename for hints
            filename_lower = file_path.name.lower()
            if 'log' in filename_lower or 'error' in filename_lower:
                return ExtractionMode.ERRORS
            return ExtractionMode.SUMMARY

    def _get_quality_score(self, provider: str, model: str) -> float:
        """Get quality score for a model.

        These are estimated scores based on benchmarks.
        Should match Router's quality_scores.
        """
        quality_scores = {
            # OpenAI
            "gpt-5": 0.98,
            "o3-mini": 0.95,
            "gpt-4.5-turbo-20250205": 0.93,
            "gpt-4.1-turbo": 0.90,
            "gpt-4.1-mini": 0.82,
            "o1-mini": 0.88,
            "o1": 0.96,

            # Anthropic
            "claude-sonnet-4-5-20250929": 0.94,
            "claude-3-5-sonnet-20241022": 0.92,
            "claude-3-5-haiku-20241022": 0.80,

            # Google
            "gemini-2.5-pro": 0.91,
            "gemini-2.5-flash": 0.85,
            "gemini-2.5-flash-lite": 0.78,

            # DeepSeek
            "deepseek-chat": 0.85,
            "deepseek-reasoner": 0.90,

            # Groq
            "llama-3.1-70b-versatile": 0.83,
            "llama-3.1-8b-instant": 0.75,
            "mixtral-8x7b-32768": 0.80,

            # Grok
            "grok-beta": 0.87,
        }

        return quality_scores.get(model, 0.75)

    def _get_estimated_cost(self, provider: str, model: str) -> float:
        """Get estimated cost per 1M tokens (average of input/output).

        Args:
            provider: Provider name
            model: Model name

        Returns:
            Estimated cost per 1M tokens in USD
        """
        # Rough estimates - should match MODEL_CATALOG
        cost_estimates = {
            "gpt-4.5-turbo-20250205": 2.50,
            "claude-3-5-sonnet-20241022": 3.00,
            "claude-sonnet-4-5-20250929": 3.00,
            "gemini-2.5-pro": 1.25,
            "gemini-2.5-flash": 0.15,
            "deepseek-chat": 0.14,
            "deepseek-reasoner": 0.55,
            "o1-mini": 3.00,
            "o3-mini": 1.10,
            "llama-3.1-70b-versatile": 0.08,
            "claude-3-5-haiku-20241022": 1.00,
            "gpt-4.1-mini": 0.15,
        }

        return cost_estimates.get(model, 1.0)


def select_model_for_file(
    file_path: Path,
    extraction_mode: Optional[ExtractionMode] = None,
    excluded_providers: Optional[List[str]] = None,
) -> Tuple[str, str, str]:
    """Convenience function to select model for file.

    Args:
        file_path: Path to file
        extraction_mode: Optional extraction mode (auto-detected if None)
        excluded_providers: Providers to exclude

    Returns:
        Tuple of (provider, model, reasoning)

    Examples:
        >>> provider, model, reason = select_model_for_file(Path("data.csv"))
        >>> print(f"Selected {provider}/{model}: {reason}")
    """
    selector = ModelSelector()
    recommendation = selector.select_for_file(file_path, extraction_mode, excluded_providers)
    return recommendation.provider, recommendation.model, recommendation.reasoning
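A short usage sketch mirroring the docstrings above (the file names are hypothetical, and it assumes file_analyzer's detect_file_type resolves in an installed environment):

from pathlib import Path

from stratifyai.utils.model_selector import ExtractionMode, select_model_for_file

# Mode is auto-detected: a .csv maps to SCHEMA extraction
provider, model, reason = select_model_for_file(Path("data.csv"))
print(f"Selected {provider}/{model}: {reason}")

# Explicit mode with one provider excluded; selection falls through the preference list
provider, model, reason = select_model_for_file(
    Path("app.log"),
    extraction_mode=ExtractionMode.ERRORS,
    excluded_providers=["deepseek"],
)
print(f"Selected {provider}/{model}: {reason}")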