featcopilot 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,595 @@
1
+ """LiteLLM client wrapper for feature engineering.
2
+
3
+ Provides a unified interface to 100+ LLM providers through LiteLLM,
4
+ enabling flexible model selection without vendor lock-in.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ from typing import Any, Optional
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from featcopilot.utils.logger import get_logger
14
+
15
+ logger = get_logger(__name__)
16
+
17
+
18
class LiteLLMConfig(BaseModel):
    """Configuration for the LiteLLM feature-engineering client.

    Every field has a usable default; when ``api_key`` is left unset the
    provider's own environment variable (e.g. ``OPENAI_API_KEY``) is used
    instead.
    """

    # Model identifier passed straight through to litellm.
    model: str = Field(default="gpt-4o", description="Model identifier (e.g., gpt-4o, claude-3-opus)")
    # Sampling temperature; bounded to the range accepted by most providers.
    temperature: float = Field(default=0.3, ge=0, le=2, description="Temperature for generation")
    # Upper bound on completion length.
    max_tokens: int = Field(default=4096, description="Maximum tokens in response")
    # Per-request timeout, in seconds.
    timeout: float = Field(default=60.0, description="Timeout in seconds")
    # Optional explicit credentials / endpoint overrides.
    api_key: Optional[str] = Field(default=None, description="API key (uses env var if not provided)")
    api_base: Optional[str] = Field(default=None, description="Custom API base URL")
27
+
28
+
29
class LiteLLMFeatureClient:
    """
    LiteLLM client wrapper for feature engineering.

    Provides a unified interface to 100+ LLM providers through LiteLLM,
    supporting OpenAI, Anthropic, Azure, Google, Cohere, and many more.
    If ``litellm`` is not installed (or fails to initialize), the client
    degrades to deterministic mock responses so callers keep working.

    Parameters
    ----------
    config : LiteLLMConfig, optional
        Configuration for the client; when given, the remaining keyword
        arguments are ignored.
    model : str, default='gpt-4o'
        Model to use for generation (e.g., 'gpt-4o', 'claude-3-opus', 'gemini-pro')
    api_key : str, optional
        API key for the provider (uses environment variable if not provided)
    api_base : str, optional
        Custom API base URL for self-hosted models

    Examples
    --------
    >>> client = LiteLLMFeatureClient(model='gpt-4o')
    >>> await client.start()
    >>> suggestions = await client.suggest_features(
    ...     column_info={'age': 'int', 'income': 'float'},
    ...     task_description='predict churn'
    ... )
    >>> await client.stop()

    Notes
    -----
    Supported model prefixes:
    - OpenAI: gpt-4, gpt-4o, gpt-3.5-turbo
    - Anthropic: claude-3-opus, claude-3-sonnet, claude-3-haiku
    - Azure: azure/deployment-name
    - Google: gemini-pro, gemini-ultra
    - AWS Bedrock: bedrock/model-id
    - Ollama: ollama/llama2, ollama/mistral
    - And many more...
    """

    def __init__(
        self,
        config: "Optional[LiteLLMConfig]" = None,
        model: str = "gpt-4o",
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        **kwargs,
    ):
        self.config = config or LiteLLMConfig(model=model, api_key=api_key, api_base=api_base, **kwargs)
        self._is_started = False  # set by start(), cleared by stop()
        self._litellm_available = False  # False -> serve mock responses
        self._litellm = None  # the imported litellm module, once started

    async def start(self) -> "LiteLLMFeatureClient":
        """
        Start the LiteLLM client.

        ``litellm`` is imported lazily so it stays an optional dependency;
        any failure here leaves the client started but in mock mode.

        Returns
        -------
        self : LiteLLMFeatureClient
        """
        try:
            import litellm

            self._litellm = litellm
            self._litellm_available = True
            self._is_started = True

            if self.config.api_key:
                # Export the key for the detected provider so litellm picks
                # it up from the environment. A single `import os` here
                # replaces the duplicated per-branch imports.
                import os

                model_lower = self.config.model.lower()
                if "gpt" in model_lower or "openai" in model_lower:
                    os.environ["OPENAI_API_KEY"] = self.config.api_key
                elif "claude" in model_lower or "anthropic" in model_lower:
                    os.environ["ANTHROPIC_API_KEY"] = self.config.api_key

            logger.info(f"LiteLLM client started with model: {self.config.model}")

        except ImportError:
            self._litellm_available = False
            self._is_started = True
            logger.warning("litellm not installed. Using mock LLM responses. Install with: pip install litellm")

        except Exception as e:
            self._litellm_available = False
            self._is_started = True
            logger.warning(f"Could not initialize LiteLLM: {e}. Using mock LLM responses.")

        return self

    async def stop(self) -> None:
        """Stop the LiteLLM client."""
        self._is_started = False

    async def send_prompt(self, prompt: str, system_prompt: Optional[str] = None) -> str:
        """
        Send a prompt and get a response.

        Starts the client on first use. Falls back to a mock response when
        LiteLLM is unavailable or the request fails.

        Parameters
        ----------
        prompt : str
            The prompt to send
        system_prompt : str, optional
            System prompt for the model

        Returns
        -------
        response : str
            The model's response
        """
        if not self._is_started:
            await self.start()

        if not self._litellm_available:
            return self._mock_response(prompt)

        try:
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": prompt})

            # Build kwargs for litellm
            kwargs: dict[str, Any] = {
                "model": self.config.model,
                "messages": messages,
                "temperature": self.config.temperature,
                "max_tokens": self.config.max_tokens,
                "timeout": self.config.timeout,
            }

            if self.config.api_base:
                kwargs["api_base"] = self.config.api_base

            # Use async completion
            response = await self._litellm.acompletion(**kwargs)

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: log and degrade to the mock rather than raising.
            logger.error(f"LiteLLM request failed: {e}")
            return self._mock_response(prompt)

    def _mock_response(self, prompt: str) -> str:
        """Generate a deterministic mock response when LiteLLM is unavailable.

        Column names are scraped from lines shaped like ``- name (dtype)``
        in the prompt (the format emitted by the prompt builders below).
        """
        import re

        columns = re.findall(r"- (\w+) \(", prompt)

        if ("suggest" in prompt.lower() or "feature" in prompt.lower()) and columns:
            features = []
            if len(columns) >= 2:
                col1, col2 = columns[0], columns[1]
                features.append(
                    {
                        "name": f"{col1}_{col2}_ratio",
                        "code": f"result = df['{col1}'] / (df['{col2}'] + 1e-8)",
                        "explanation": f"Ratio of {col1} to {col2}, captures relative relationship",
                        "source_columns": [col1, col2],
                    }
                )
                features.append(
                    {
                        "name": f"{col1}_{col2}_product",
                        "code": f"result = df['{col1}'] * df['{col2}']",
                        "explanation": f"Interaction between {col1} and {col2}",
                        "source_columns": [col1, col2],
                    }
                )
            if len(columns) >= 3:
                col3 = columns[2]
                features.append(
                    {
                        "name": f"{columns[0]}_normalized_by_{col3}",
                        "code": f"result = (df['{columns[0]}'] - df['{columns[0]}'].mean()) / (df['{col3}'] + 1e-8)",
                        "explanation": f"Normalized {columns[0]} adjusted by {col3}",
                        "source_columns": [columns[0], col3],
                    }
                )
            if len(columns) >= 1:
                features.append(
                    {
                        "name": f"{columns[0]}_zscore",
                        "code": f"result = (df['{columns[0]}'] - df['{columns[0]}'].mean()) / (df['{columns[0]}'].std() + 1e-8)",
                        "explanation": f"Z-score normalization of {columns[0]}",
                        "source_columns": [columns[0]],
                    }
                )
            return json.dumps({"features": features})
        elif "suggest" in prompt.lower() or "feature" in prompt.lower():
            return json.dumps(
                {
                    "features": [
                        {
                            "name": "feature_interaction",
                            "code": "result = df.iloc[:, 0] * df.iloc[:, 1]",
                            "explanation": "Interaction between first two features",
                        }
                    ]
                }
            )
        elif "explain" in prompt.lower():
            return "This feature captures the relationship between the input variables."
        elif "code" in prompt.lower():
            return "result = df.iloc[:, 0] * df.iloc[:, 1]"
        else:
            return "Mock response for: " + prompt[:100]

    async def suggest_features(
        self,
        column_info: dict[str, str],
        task_description: str,
        column_descriptions: Optional[dict[str, str]] = None,
        domain: Optional[str] = None,
        max_suggestions: int = 10,
    ) -> list[dict[str, Any]]:
        """
        Get LLM suggestions for new features.

        Parameters
        ----------
        column_info : dict
            Dictionary mapping column names to data types
        task_description : str
            Description of the ML task
        column_descriptions : dict, optional
            Human-readable descriptions of columns
        domain : str, optional
            Domain context (e.g., 'healthcare', 'finance')
        max_suggestions : int, default=10
            Maximum number of feature suggestions

        Returns
        -------
        suggestions : list
            List of feature suggestions with code and explanations
        """
        prompt = self._build_suggestion_prompt(
            column_info, task_description, column_descriptions, domain, max_suggestions
        )

        system_prompt = (
            "You are an expert data scientist specializing in feature engineering. "
            "Always respond with valid JSON only."
        )

        response = await self.send_prompt(prompt, system_prompt=system_prompt)
        return self._parse_suggestions(response)

    def _build_suggestion_prompt(
        self,
        column_info: dict[str, str],
        task_description: str,
        column_descriptions: Optional[dict[str, str]] = None,
        domain: Optional[str] = None,
        max_suggestions: int = 10,
    ) -> str:
        """Build the prompt for feature suggestions."""
        prompt = f"""Suggest {max_suggestions} new features for the following machine learning task.

## ML Task
{task_description}

## Available Columns
"""
        for col, dtype in column_info.items():
            desc = column_descriptions.get(col, "") if column_descriptions else ""
            prompt += f"- {col} ({dtype}): {desc}\n"

        if domain:
            prompt += f"\n## Domain Context\nThis is a {domain} problem.\n"

        prompt += """
## Requirements
1. Suggest features that would be predictive for this task
2. Provide Python code using pandas (assume df is the DataFrame)
3. Explain why each feature might be useful
4. Consider interactions, ratios, and domain-specific transformations

## Output Format
Return a JSON object with a "features" array, each element having:
- "name": feature name (snake_case)
- "code": Python code to compute the feature (single line, result assigned to variable)
- "explanation": why this feature might be predictive
- "source_columns": list of column names used

Example:
{
    "features": [
        {
            "name": "age_income_ratio",
            "code": "result = df['age'] / (df['income'] + 1)",
            "explanation": "Ratio of age to income may indicate life stage and financial maturity",
            "source_columns": ["age", "income"]
        }
    ]
}

Return ONLY the JSON object, no other text.
"""
        return prompt

    def _parse_suggestions(self, response: str) -> list[dict[str, Any]]:
        """Parse feature suggestions from an LLM response.

        Handles plain JSON, JSON wrapped in a markdown code fence (with or
        without a closing fence), and JSON embedded in surrounding prose.
        Returns an empty list when nothing parseable is found.
        """
        import re

        response = response.strip()
        # Strip a leading ``` / ```json fence. The previous implementation
        # dropped the final line unconditionally, losing data whenever the
        # closing fence was missing.
        fence = re.match(r"```[a-zA-Z]*\n(.*?)(\n```)?\s*$", response, re.DOTALL)
        if fence:
            response = fence.group(1)

        try:
            data = json.loads(response)
            # Guard: a non-object top level (e.g. a bare list) has no
            # "features" key and used to raise AttributeError.
            if isinstance(data, dict):
                return data.get("features", [])
            return []

        except json.JSONDecodeError:
            # Fall back to the first {...} span in the text.
            json_match = re.search(r"\{.*\}", response, re.DOTALL)
            if json_match:
                try:
                    data = json.loads(json_match.group())
                    if isinstance(data, dict):
                        return data.get("features", [])
                except json.JSONDecodeError:
                    pass

        return []

    async def explain_feature(
        self,
        feature_name: str,
        feature_code: str,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: Optional[str] = None,
    ) -> str:
        """
        Get a human-readable explanation of a feature.

        Parameters
        ----------
        feature_name : str
            Name of the feature
        feature_code : str
            Code that generates the feature
        column_descriptions : dict, optional
            Descriptions of source columns
        task_description : str, optional
            Description of the ML task

        Returns
        -------
        explanation : str
            Human-readable explanation
        """
        prompt = f"""Explain this feature in simple terms for a business stakeholder:

Feature Name: {feature_name}
Code: {feature_code}
"""
        if column_descriptions:
            prompt += "\nColumn Descriptions:\n"
            for col, desc in column_descriptions.items():
                prompt += f"- {col}: {desc}\n"

        if task_description:
            prompt += f"\nML Task: {task_description}\n"

        prompt += """
Provide a 2-3 sentence explanation of:
1. What this feature represents
2. Why it might be predictive for the task
"""
        return await self.send_prompt(prompt)

    async def generate_feature_code(
        self, description: str, column_info: dict[str, str], constraints: Optional[list[str]] = None
    ) -> str:
        """
        Generate Python code for a described feature.

        Parameters
        ----------
        description : str
            Natural language description of desired feature
        column_info : dict
            Available columns and their types
        constraints : list, optional
            Constraints on the generated code

        Returns
        -------
        code : str
            Python code to generate the feature
        """
        prompt = f"""Generate Python code to create this feature:

Description: {description}

Available Columns:
"""
        for col, dtype in column_info.items():
            prompt += f"- {col} ({dtype})\n"

        if constraints:
            prompt += "\nConstraints:\n"
            for c in constraints:
                prompt += f"- {c}\n"

        prompt += """
Requirements:
1. Use pandas operations (assume df is the DataFrame)
2. Assign the result to a variable called 'result'
3. Handle edge cases (division by zero, missing values)
4. Return ONLY the code, no explanations

Example output:
result = df['col1'] / (df['col2'] + 1e-8)
"""
        response = await self.send_prompt(prompt)

        # Strip markdown fences, keeping only the fenced code lines.
        code = response.strip()
        if "```" in code:
            lines = code.split("\n")
            code_lines = []
            in_code_block = False
            for line in lines:
                if line.startswith("```"):
                    in_code_block = not in_code_block
                elif in_code_block:
                    code_lines.append(line)
            code = "\n".join(code_lines)

        return code

    async def validate_feature_code(self, code: str, sample_data: Optional[dict[str, list]] = None) -> dict[str, Any]:
        """
        Validate generated feature code.

        Compiles the code for syntax, then (if sample data is supplied)
        executes it against a DataFrame with a restricted builtins table.
        NOTE(review): the restricted-exec is a guard rail for accidental
        misuse, not a security sandbox — do not run untrusted code here.

        Parameters
        ----------
        code : str
            Feature code to validate
        sample_data : dict, optional
            Sample data for testing

        Returns
        -------
        result : dict
            Validation result with 'valid', 'error', and 'warnings' keys
        """
        import numpy as np
        import pandas as pd

        result: dict[str, Any] = {"valid": True, "error": None, "warnings": []}

        try:
            compile(code, "<string>", "exec")
        except SyntaxError as e:
            result["valid"] = False
            result["error"] = f"Syntax error: {e}"
            return result

        if sample_data:
            try:
                df = pd.DataFrame(sample_data)
                local_vars: dict[str, Any] = {"df": df, "np": np, "pd": pd}
                exec(
                    code,
                    {
                        "__builtins__": {
                            "len": len,
                            "sum": sum,
                            "max": max,
                            "min": min,
                            "int": int,
                            "float": float,
                            "str": str,
                            "bool": bool,
                            "abs": abs,
                            "round": round,
                            "pow": pow,
                            "range": range,
                            "list": list,
                            "dict": dict,
                            "set": set,
                            "tuple": tuple,
                            "sorted": sorted,
                            "reversed": reversed,
                            "enumerate": enumerate,
                            "zip": zip,
                            "any": any,
                            "all": all,
                            "map": map,
                            "filter": filter,
                            "isinstance": isinstance,
                            "hasattr": hasattr,
                            "getattr": getattr,
                        }
                    },
                    local_vars,
                )

                # The convention is that feature code assigns to `result`.
                if "result" not in local_vars:
                    result["warnings"].append("Code does not assign to 'result' variable")

            except Exception as e:
                result["valid"] = False
                result["error"] = f"Runtime error: {e}"

        return result
542
+
543
+
544
class SyncLiteLLMFeatureClient:
    """Synchronous wrapper around :class:`LiteLLMFeatureClient`.

    Each method drives the corresponding coroutine of the async client to
    completion, so the wrapper works from plain scripts as well as from
    environments that already run an event loop (e.g. Jupyter).
    """

    def __init__(self, **kwargs):
        # All constructor arguments are forwarded verbatim to the async
        # client. (The old unused `_loop` attribute was dead state and has
        # been removed.)
        self._async_client = LiteLLMFeatureClient(**kwargs)

    def _run_async(self, coro):
        """Run *coro* to completion, handling nested event loops (e.g., Jupyter)."""
        try:
            # If a loop is already running we cannot call asyncio.run here.
            loop = asyncio.get_running_loop()
            try:
                # nest_asyncio patches the running loop to allow re-entry.
                import nest_asyncio

                nest_asyncio.apply()
                return loop.run_until_complete(coro)
            except ImportError:
                # nest_asyncio unavailable: run the coroutine on a fresh
                # loop inside a worker thread instead.
                import concurrent.futures

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(asyncio.run, coro)
                    return future.result()
        except RuntimeError:
            # No running event loop - safe to use asyncio.run
            return asyncio.run(coro)

    def start(self) -> "LiteLLMFeatureClient":
        """Start the underlying async client and return it."""
        return self._run_async(self._async_client.start())

    def stop(self) -> None:
        """Stop the underlying async client."""
        return self._run_async(self._async_client.stop())

    def suggest_features(self, **kwargs) -> list[dict[str, Any]]:
        """Get feature suggestions; see LiteLLMFeatureClient.suggest_features."""
        return self._run_async(self._async_client.suggest_features(**kwargs))

    def explain_feature(self, **kwargs) -> str:
        """Explain a feature; see LiteLLMFeatureClient.explain_feature."""
        return self._run_async(self._async_client.explain_feature(**kwargs))

    def generate_feature_code(self, **kwargs) -> str:
        """Generate feature code; see LiteLLMFeatureClient.generate_feature_code."""
        return self._run_async(self._async_client.generate_feature_code(**kwargs))

    def validate_feature_code(self, code: str, sample_data: Optional[dict[str, list]] = None) -> dict[str, Any]:
        """Validate feature code; see LiteLLMFeatureClient.validate_feature_code."""
        return self._run_async(self._async_client.validate_feature_code(code=code, sample_data=sample_data))