stratifyai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. cli/__init__.py +5 -0
  2. cli/stratifyai_cli.py +1753 -0
  3. stratifyai/__init__.py +113 -0
  4. stratifyai/api_key_helper.py +372 -0
  5. stratifyai/caching.py +279 -0
  6. stratifyai/chat/__init__.py +54 -0
  7. stratifyai/chat/builder.py +366 -0
  8. stratifyai/chat/stratifyai_anthropic.py +194 -0
  9. stratifyai/chat/stratifyai_bedrock.py +200 -0
  10. stratifyai/chat/stratifyai_deepseek.py +194 -0
  11. stratifyai/chat/stratifyai_google.py +194 -0
  12. stratifyai/chat/stratifyai_grok.py +194 -0
  13. stratifyai/chat/stratifyai_groq.py +195 -0
  14. stratifyai/chat/stratifyai_ollama.py +201 -0
  15. stratifyai/chat/stratifyai_openai.py +209 -0
  16. stratifyai/chat/stratifyai_openrouter.py +201 -0
  17. stratifyai/chunking.py +158 -0
  18. stratifyai/client.py +292 -0
  19. stratifyai/config.py +1273 -0
  20. stratifyai/cost_tracker.py +257 -0
  21. stratifyai/embeddings.py +245 -0
  22. stratifyai/exceptions.py +91 -0
  23. stratifyai/models.py +59 -0
  24. stratifyai/providers/__init__.py +5 -0
  25. stratifyai/providers/anthropic.py +330 -0
  26. stratifyai/providers/base.py +183 -0
  27. stratifyai/providers/bedrock.py +634 -0
  28. stratifyai/providers/deepseek.py +39 -0
  29. stratifyai/providers/google.py +39 -0
  30. stratifyai/providers/grok.py +39 -0
  31. stratifyai/providers/groq.py +39 -0
  32. stratifyai/providers/ollama.py +43 -0
  33. stratifyai/providers/openai.py +344 -0
  34. stratifyai/providers/openai_compatible.py +372 -0
  35. stratifyai/providers/openrouter.py +39 -0
  36. stratifyai/py.typed +2 -0
  37. stratifyai/rag.py +381 -0
  38. stratifyai/retry.py +185 -0
  39. stratifyai/router.py +643 -0
  40. stratifyai/summarization.py +179 -0
  41. stratifyai/utils/__init__.py +11 -0
  42. stratifyai/utils/bedrock_validator.py +136 -0
  43. stratifyai/utils/code_extractor.py +327 -0
  44. stratifyai/utils/csv_extractor.py +197 -0
  45. stratifyai/utils/file_analyzer.py +192 -0
  46. stratifyai/utils/json_extractor.py +219 -0
  47. stratifyai/utils/log_extractor.py +267 -0
  48. stratifyai/utils/model_selector.py +324 -0
  49. stratifyai/utils/provider_validator.py +442 -0
  50. stratifyai/utils/token_counter.py +186 -0
  51. stratifyai/vectordb.py +344 -0
  52. stratifyai-0.1.0.dist-info/METADATA +263 -0
  53. stratifyai-0.1.0.dist-info/RECORD +57 -0
  54. stratifyai-0.1.0.dist-info/WHEEL +5 -0
  55. stratifyai-0.1.0.dist-info/entry_points.txt +2 -0
  56. stratifyai-0.1.0.dist-info/licenses/LICENSE +21 -0
  57. stratifyai-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,267 @@
1
+ """Log file error extraction for intelligent file analysis.
2
+
3
+ This module extracts errors, warnings, and patterns from log files to reduce
4
+ token usage by 90%+ while preserving critical information.
5
+ """
6
+
7
+ import re
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Dict, List, Optional, Any
11
+ from collections import Counter
12
+ from datetime import datetime
13
+
14
+
15
@dataclass
class LogEntry:
    """A single log entry parsed from a log file.

    Instances are produced by analyze_log_file; timestamp stays None when
    no pattern in TIMESTAMP_PATTERNS matches the line.
    """
    # Raw timestamp string exactly as matched (not parsed to a datetime).
    timestamp: Optional[str]
    level: str  # ERROR, WARN, INFO, DEBUG, etc.
    # Full stripped text of the log line.
    message: str
    # 1-based line number within the source file.
    line_number: int
22
+
23
+
24
@dataclass
class LogSummary:
    """Aggregated result of analyzing one log file."""
    file_path: str
    total_lines: int
    errors: List[LogEntry]
    warnings: List[LogEntry]
    error_patterns: Dict[str, int]
    warning_patterns: Dict[str, int]
    timestamp_range: Optional[tuple[str, str]]

    def to_text(self) -> str:
        """Render the summary as a compact, human-readable report.

        Returns:
            Newline-joined report: header counts, optional time range,
            top-10 pattern tallies, and the last five errors/warnings.
        """
        out = [
            f"Log File: {self.file_path}",
            f"Total Lines: {self.total_lines:,}",
            f"Errors: {len(self.errors)}",
            f"Warnings: {len(self.warnings)}",
        ]

        if self.timestamp_range:
            start, end = self.timestamp_range
            out.append(f"Time Range: {start} to {end}")

        def add_pattern_section(title: str, patterns: Dict[str, int]) -> None:
            # Emit the ten most frequent patterns, highest count first.
            if not patterns:
                return
            out.append(title)
            ranked = sorted(patterns.items(), key=lambda kv: -kv[1])
            for pat, count in ranked[:10]:
                out.append(f" [{count}×] {pat}")

        def add_entry_section(title: str, entries: List[LogEntry]) -> None:
            # Emit the last five entries, truncating messages to 100 chars.
            if not entries:
                return
            out.append(title)
            for entry in entries[-5:]:
                stamp = entry.timestamp or "??:??:??"
                out.append(f" [Line {entry.line_number}] {stamp} - {entry.message[:100]}")

        add_pattern_section("\nTop Error Patterns:", self.error_patterns)
        add_pattern_section("\nTop Warning Patterns:", self.warning_patterns)
        add_entry_section("\nRecent Errors:", self.errors)
        add_entry_section("\nRecent Warnings:", self.warnings)

        return "\n".join(out)
78
+
79
+
80
# Common log patterns: (regex, normalized level) pairs, checked in order
# so ERROR outranks WARN when both words appear on one line.
LOG_LEVEL_PATTERNS = [
    (r'\b(ERROR|FATAL|CRITICAL)\b', 'ERROR'),
    (r'\b(WARN|WARNING)\b', 'WARN'),
    (r'\b(INFO|INFORMATION)\b', 'INFO'),
    (r'\b(DEBUG|TRACE)\b', 'DEBUG'),
]

# Timestamp formats, most specific first (the time-only pattern would also
# match the tail of the full-date patterns).
TIMESTAMP_PATTERNS = [
    r'\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}',  # ISO format
    r'\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2}',  # US format
    r'\d{2}:\d{2}:\d{2}',  # Time only
]


def extract_timestamp(line: str) -> Optional[str]:
    """Extract timestamp from log line.

    Args:
        line: Log line text

    Returns:
        First matching timestamp substring, or None if no pattern matches
    """
    for pattern in TIMESTAMP_PATTERNS:
        match = re.search(pattern, line)
        if match:
            return match.group(0)
    return None


def extract_log_level(line: str) -> Optional[str]:
    """Extract log level from log line.

    Matching is case-insensitive; the first pattern that hits wins.

    Args:
        line: Log line text

    Returns:
        Normalized log level (ERROR, WARN, INFO, DEBUG) or None
    """
    for pattern, level in LOG_LEVEL_PATTERNS:
        if re.search(pattern, line, re.IGNORECASE):
            return level
    return None


def extract_error_pattern(message: str) -> str:
    """Extract pattern from error message by removing variable parts.

    Variable fragments (timestamps, hex addresses, quoted strings, file
    paths, numbers) are replaced with placeholders so that messages which
    differ only in those details collapse to one countable pattern.

    Args:
        message: Error message

    Returns:
        Pattern string with variables replaced, truncated to 200 chars
    """
    # Timestamps must be replaced BEFORE the generic <NUM> substitution:
    # they are built from digits, so replacing numbers first destroyed
    # them and the <TIME> pass could never match (previous bug).
    pattern = message
    for ts_pattern in TIMESTAMP_PATTERNS:
        pattern = re.sub(ts_pattern, '<TIME>', pattern)

    # Remove hex addresses
    pattern = re.sub(r'0x[0-9a-fA-F]+', '<ADDR>', pattern)

    # Remove quoted strings
    pattern = re.sub(r'"[^"]*"', '<STR>', pattern)
    pattern = re.sub(r"'[^']*'", '<STR>', pattern)

    # Remove file paths (before numbers, so "/etc/10/x" stays one <PATH>)
    pattern = re.sub(r'(/[\w/.-]+|[A-Z]:\\[\w\\.-]+)', '<PATH>', pattern)

    # Remove remaining standalone numbers
    pattern = re.sub(r'\b\d+\b', '<NUM>', pattern)

    return pattern[:200]  # Truncate long patterns
153
+
154
+
155
def analyze_log_file(
    file_path: Path,
    max_lines: Optional[int] = None,
    max_errors: int = 100,
    max_warnings: int = 100
) -> LogSummary:
    """Scan a log file and collect error/warning entries plus metadata.

    Args:
        file_path: Path to log file
        max_lines: Maximum lines to read (None = all)
        max_errors: Maximum error entries to store
        max_warnings: Maximum warning entries to store

    Returns:
        LogSummary with extracted information

    Raises:
        FileNotFoundError: If file doesn't exist
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Log file not found: {file_path}")

    errors: List[LogEntry] = []
    warnings: List[LogEntry] = []
    error_messages: List[str] = []
    warning_messages: List[str] = []
    timestamps: List[str] = []
    total_lines = 0

    # errors='ignore' keeps the scan alive on mixed or broken encodings.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for line_num, raw_line in enumerate(handle, 1):
            if max_lines and line_num > max_lines:
                break

            total_lines += 1
            text = raw_line.strip()
            if not text:
                continue

            # Timestamps are collected in file order for the range below.
            stamp = extract_timestamp(text)
            if stamp:
                timestamps.append(stamp)

            level = extract_log_level(text)

            if level == 'ERROR':
                if len(errors) < max_errors:
                    errors.append(LogEntry(
                        timestamp=stamp,
                        level='ERROR',
                        message=text,
                        line_number=line_num
                    ))
                error_messages.append(text)

            elif level == 'WARN':
                if len(warnings) < max_warnings:
                    warnings.append(LogEntry(
                        timestamp=stamp,
                        level='WARN',
                        message=text,
                        line_number=line_num
                    ))
                warning_messages.append(text)

    # Collapse variable parts so repeated failures tally together.
    error_counts = Counter(extract_error_pattern(m) for m in error_messages)
    warning_counts = Counter(extract_error_pattern(m) for m in warning_messages)

    # First and last timestamps seen, in file order (not min/max).
    span = (timestamps[0], timestamps[-1]) if timestamps else None

    return LogSummary(
        file_path=str(file_path),
        total_lines=total_lines,
        errors=errors,
        warnings=warnings,
        error_patterns=dict(error_counts.most_common(10)),
        warning_patterns=dict(warning_counts.most_common(10)),
        timestamp_range=span
    )
242
+
243
+
244
def extract_log_summary(file_path: Path) -> Dict[str, Any]:
    """Analyze a log file and package the result with size statistics.

    Args:
        file_path: Path to log file

    Returns:
        Dictionary with the LogSummary object, its rendered text, byte
        sizes of the original file and the summary, the percentage
        reduction, and a recommended action string
    """
    analysis = analyze_log_file(file_path)
    rendered = analysis.to_text()

    # Compare the on-disk file size against the rendered summary length.
    file_bytes = file_path.stat().st_size
    summary_bytes = len(rendered)
    if file_bytes > 0:
        reduction = (file_bytes - summary_bytes) / file_bytes * 100
    else:
        reduction = 0.0

    return {
        'summary': analysis,
        'summary_text': rendered,
        'original_size_bytes': file_bytes,
        'summary_size_bytes': summary_bytes,
        'token_reduction_pct': reduction,
        'recommended_action': 'Use summary for LLM analysis instead of full log file'
    }
@@ -0,0 +1,324 @@
1
+ """Intelligent model selection for file extraction tasks.
2
+
3
+ This module provides automatic model selection based on file type,
4
+ extraction mode, and task requirements, optimizing for quality in
5
+ structured data extraction scenarios.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from enum import Enum
10
+ from typing import Tuple, Optional, List, Dict
11
+ from pathlib import Path
12
+
13
+ from .file_analyzer import FileType
14
+
15
+
16
class ExtractionMode(Enum):
    """Types of extraction tasks.

    Used by ModelSelector to pick a candidate model list; inferred from
    the detected file type when the caller does not supply one.
    """
    SCHEMA = "schema"  # CSV/JSON schema extraction
    ERRORS = "errors"  # Log file error extraction
    SUMMARY = "summary"  # General summarization
    STRUCTURE = "structure"  # Code structure extraction
22
+
23
+
24
@dataclass
class ModelRecommendation:
    """Model recommendation with reasoning."""
    # Provider key, e.g. "anthropic", "openai", "deepseek".
    provider: str
    # Model identifier as passed to the provider's API.
    model: str
    # Human-readable explanation of why this model was chosen.
    reasoning: str
    # Estimated quality score (higher is better); see ModelSelector.
    quality_score: float
    # Rough USD cost per 1M tokens (averaged input/output estimate).
    estimated_cost_per_1m: float
32
+
33
+
34
class ModelSelector:
    """Select optimal models for file extraction tasks.

    Each task mode has a preference-ordered candidate list; selection
    returns the first candidate whose provider is not excluded.
    """

    def __init__(self):
        """Initialize model selector with provider preferences."""
        # Quality-focused models for schema extraction (ordered by preference)
        self.schema_extraction_models = [
            ("anthropic", "claude-3-5-sonnet-20241022", "Best structured output quality"),
            ("openai", "gpt-4.5-turbo-20250205", "Excellent for data analysis"),
            ("anthropic", "claude-sonnet-4-5-20250929", "High quality, latest model"),
            ("google", "gemini-2.5-pro", "Strong structured reasoning"),
        ]

        # Reasoning models for error/log analysis
        self.error_extraction_models = [
            ("deepseek", "deepseek-reasoner", "Excellent reasoning for error analysis"),
            ("openai", "o1-mini", "Fast reasoning model"),
            ("openai", "o3-mini", "Advanced reasoning"),
            ("anthropic", "claude-3-5-sonnet-20241022", "Reliable error detection"),
        ]

        # Code-optimized models for structure extraction
        self.code_extraction_models = [
            ("deepseek", "deepseek-chat", "Optimized for code understanding"),
            ("anthropic", "claude-3-5-sonnet-20241022", "Strong code analysis"),
            ("openai", "gpt-4.5-turbo-20250205", "Excellent code comprehension"),
            ("google", "gemini-2.5-pro", "Good at code structure"),
        ]

        # Fast models for general summarization
        self.summary_models = [
            ("google", "gemini-2.5-flash", "Fast and cost-effective"),
            ("groq", "llama-3.1-70b-versatile", "Very fast inference"),
            ("anthropic", "claude-3-5-haiku-20241022", "Quick, quality summaries"),
            ("openai", "gpt-4.1-mini", "Balanced speed/quality"),
        ]

    def _candidates_for_mode(
        self, extraction_mode: ExtractionMode
    ) -> Optional[List[Tuple[str, str, str]]]:
        """Map an extraction mode to its preference-ordered candidate list.

        Returns None for unrecognized modes so each caller can apply its
        own fallback. (This mapping was previously duplicated verbatim in
        select_for_file and select_for_extraction_mode.)
        """
        return {
            ExtractionMode.SCHEMA: self.schema_extraction_models,
            ExtractionMode.ERRORS: self.error_extraction_models,
            ExtractionMode.STRUCTURE: self.code_extraction_models,
            ExtractionMode.SUMMARY: self.summary_models,
        }.get(extraction_mode)

    def select_for_file(
        self,
        file_path: Path,
        extraction_mode: Optional[ExtractionMode] = None,
        excluded_providers: Optional[List[str]] = None,
    ) -> ModelRecommendation:
        """Select optimal model for file extraction.

        Args:
            file_path: Path to file being analyzed
            extraction_mode: Specific extraction mode (auto-detected if None)
            excluded_providers: Providers to exclude from selection

        Returns:
            ModelRecommendation with selected provider/model and reasoning

        Raises:
            ValueError: If every candidate's provider is excluded

        Examples:
            >>> selector = ModelSelector()
            >>> rec = selector.select_for_file(Path("data.csv"))
            >>> print(f"{rec.provider}/{rec.model}: {rec.reasoning}")
        """
        # Local import avoids a circular dependency with file_analyzer.
        from .file_analyzer import detect_file_type

        excluded = excluded_providers or []
        file_type = detect_file_type(file_path)

        # Auto-detect extraction mode if not specified
        if extraction_mode is None:
            extraction_mode = self._infer_extraction_mode(file_type, file_path)

        # Unknown modes fall back to schema models (highest quality).
        candidates = self._candidates_for_mode(extraction_mode) or self.schema_extraction_models

        # Filter out excluded providers
        available_candidates = [
            (provider, model, reason)
            for provider, model, reason in candidates
            if provider not in excluded
        ]

        if not available_candidates:
            # Fallback: consider every known model regardless of task fit.
            all_models = (
                self.schema_extraction_models +
                self.error_extraction_models +
                self.code_extraction_models +
                self.summary_models
            )
            available_candidates = [
                (provider, model, reason)
                for provider, model, reason in all_models
                if provider not in excluded
            ]

        if not available_candidates:
            raise ValueError(
                "No available models after filtering excluded providers. "
                f"Excluded: {excluded}"
            )

        # Select first available (highest priority)
        provider, model, reasoning = available_candidates[0]

        # Get quality score and cost estimates
        quality_score = self._get_quality_score(provider, model)
        estimated_cost = self._get_estimated_cost(provider, model)

        # Enhance reasoning with file type
        full_reasoning = f"{reasoning} (for {file_type.value} {extraction_mode.value})"

        return ModelRecommendation(
            provider=provider,
            model=model,
            reasoning=full_reasoning,
            quality_score=quality_score,
            estimated_cost_per_1m=estimated_cost,
        )

    def select_for_extraction_mode(
        self,
        file_type: FileType,
        extraction_mode: ExtractionMode,
        excluded_providers: Optional[List[str]] = None,
    ) -> ModelRecommendation:
        """Select model based on file type and extraction mode.

        Args:
            file_type: Detected file type
            extraction_mode: Type of extraction task
            excluded_providers: Providers to exclude

        Returns:
            ModelRecommendation with selected provider/model

        Raises:
            ValueError: If every candidate's provider is excluded
        """
        excluded = excluded_providers or []

        # Unknown modes fall back to summary models (historic behavior of
        # this method's else-branch; differs from select_for_file).
        candidates = self._candidates_for_mode(extraction_mode) or self.summary_models

        # Filter and select
        available = [
            (p, m, r) for p, m, r in candidates
            if p not in excluded
        ]

        if not available:
            raise ValueError("No available models after filtering")

        provider, model, reasoning = available[0]

        return ModelRecommendation(
            provider=provider,
            model=model,
            reasoning=f"{reasoning} (for {file_type.value} {extraction_mode.value})",
            quality_score=self._get_quality_score(provider, model),
            estimated_cost_per_1m=self._get_estimated_cost(provider, model),
        )

    def _infer_extraction_mode(self, file_type: FileType, file_path: Path) -> ExtractionMode:
        """Infer extraction mode from file type and name.

        Args:
            file_type: Detected file type
            file_path: Path to file

        Returns:
            Inferred ExtractionMode
        """
        # Map file types to extraction modes
        if file_type == FileType.CSV:
            return ExtractionMode.SCHEMA
        elif file_type == FileType.JSON:
            return ExtractionMode.SCHEMA
        elif file_type == FileType.LOG:
            return ExtractionMode.ERRORS
        elif file_type in [FileType.PYTHON, FileType.JAVASCRIPT, FileType.JAVA, FileType.GO]:
            return ExtractionMode.STRUCTURE
        else:
            # Check filename for hints
            filename_lower = file_path.name.lower()
            if 'log' in filename_lower or 'error' in filename_lower:
                return ExtractionMode.ERRORS
            else:
                return ExtractionMode.SUMMARY

    def _get_quality_score(self, provider: str, model: str) -> float:
        """Get quality score for a model (0.75 default for unknown models).

        These are estimated scores based on benchmarks.
        Should match Router's quality_scores.
        """
        quality_scores = {
            # OpenAI
            "gpt-5": 0.98,
            "o3-mini": 0.95,
            "gpt-4.5-turbo-20250205": 0.93,
            "gpt-4.1-turbo": 0.90,
            "gpt-4.1-mini": 0.82,
            "o1-mini": 0.88,
            "o1": 0.96,

            # Anthropic
            "claude-sonnet-4-5-20250929": 0.94,
            "claude-3-5-sonnet-20241022": 0.92,
            "claude-3-5-haiku-20241022": 0.80,

            # Google
            "gemini-2.5-pro": 0.91,
            "gemini-2.5-flash": 0.85,
            "gemini-2.5-flash-lite": 0.78,

            # DeepSeek
            "deepseek-chat": 0.85,
            "deepseek-reasoner": 0.90,

            # Groq
            "llama-3.1-70b-versatile": 0.83,
            "llama-3.1-8b-instant": 0.75,
            "mixtral-8x7b-32768": 0.80,

            # Grok
            "grok-beta": 0.87,
        }

        return quality_scores.get(model, 0.75)

    def _get_estimated_cost(self, provider: str, model: str) -> float:
        """Get estimated cost per 1M tokens (average of input/output).

        Args:
            provider: Provider name
            model: Model name

        Returns:
            Estimated cost per 1M tokens in USD (1.0 for unknown models)
        """
        # Rough estimates - should match MODEL_CATALOG
        cost_estimates = {
            "gpt-4.5-turbo-20250205": 2.50,
            "claude-3-5-sonnet-20241022": 3.00,
            "claude-sonnet-4-5-20250929": 3.00,
            "gemini-2.5-pro": 1.25,
            "gemini-2.5-flash": 0.15,
            "deepseek-chat": 0.14,
            "deepseek-reasoner": 0.55,
            "o1-mini": 3.00,
            "o3-mini": 1.10,
            "llama-3.1-70b-versatile": 0.08,
            "claude-3-5-haiku-20241022": 1.00,
            "gpt-4.1-mini": 0.15,
        }

        return cost_estimates.get(model, 1.0)
301
+
302
+
303
def select_model_for_file(
    file_path: Path,
    extraction_mode: Optional[ExtractionMode] = None,
    excluded_providers: Optional[List[str]] = None,
) -> Tuple[str, str, str]:
    """Convenience function to select model for file.

    Thin wrapper around ModelSelector.select_for_file that unpacks the
    recommendation into a plain tuple.

    Args:
        file_path: Path to file
        extraction_mode: Optional extraction mode (auto-detected if None)
        excluded_providers: Providers to exclude

    Returns:
        Tuple of (provider, model, reasoning)

    Examples:
        >>> provider, model, reason = select_model_for_file(Path("data.csv"))
        >>> print(f"Selected {provider}/{model}: {reason}")
    """
    rec = ModelSelector().select_for_file(file_path, extraction_mode, excluded_providers)
    return rec.provider, rec.model, rec.reasoning