featcopilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,521 @@
1
+ """GitHub Copilot SDK client wrapper for feature engineering.
2
+
3
+ Provides a simplified interface to the Copilot SDK specifically
4
+ designed for feature engineering tasks.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ from typing import Any, Optional
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+
14
class CopilotConfig(BaseModel):
    """Configuration for Copilot client.

    Attributes
    ----------
    model : str
        Identifier of the model requested from Copilot.
    temperature : float
        Sampling temperature, constrained to the range [0, 1].
    max_tokens : int
        Upper bound on response length; must be strictly positive.
    timeout : float
        Seconds to wait for a response; must be strictly positive, since a
        zero or negative value would make every request time out at once.
    streaming : bool
        Whether streaming responses are requested.
    """

    model: str = Field(default="gpt-5", description="Model to use")
    temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
    # gt=0: a non-positive token budget can never produce a response.
    max_tokens: int = Field(default=4096, gt=0, description="Maximum tokens in response")
    # gt=0: a non-positive timeout expires before any response can arrive.
    timeout: float = Field(default=60.0, gt=0, description="Timeout in seconds")
    streaming: bool = Field(default=False, description="Enable streaming responses")
22
+
23
+
24
class CopilotFeatureClient:
    """
    GitHub Copilot SDK client wrapper for feature engineering.

    Provides high-level methods for:
    - Generating feature suggestions
    - Explaining features
    - Generating feature code
    - Validating features

    When the Copilot SDK cannot be imported, or connecting to it fails, the
    client switches to "mock mode" and answers prompts with canned responses
    so that calling code keeps working.

    Parameters
    ----------
    config : CopilotConfig, optional
        Configuration for the client. When given, ``model`` and any extra
        keyword arguments are ignored.
    model : str, default='gpt-5'
        Model to use for generation
    **kwargs
        Extra fields forwarded to ``CopilotConfig`` when ``config`` is omitted.

    Examples
    --------
    >>> client = CopilotFeatureClient(model='gpt-5')
    >>> await client.start()
    >>> suggestions = await client.suggest_features(
    ...     column_info={'age': 'int', 'income': 'float'},
    ...     task_description='predict churn'
    ... )
    >>> await client.stop()
    """

    # Builtins made available to code run by ``validate_feature_code``.
    # NOTE: this is a convenience allow-list, not a security boundary.
    _SAFE_BUILTINS = {
        "len": len,
        "sum": sum,
        "max": max,
        "min": min,
        "abs": abs,
        "round": round,
        "int": int,
        "float": float,
        "str": str,
        "bool": bool,
        "range": range,
        "list": list,
        "dict": dict,
        "set": set,
        "tuple": tuple,
        "zip": zip,
        "enumerate": enumerate,
        "sorted": sorted,
        "any": any,
        "all": all,
    }

    def __init__(self, config: Optional["CopilotConfig"] = None, model: str = "gpt-5", **kwargs):
        self.config = config if config is not None else CopilotConfig(model=model, **kwargs)
        self._client = None
        self._session = None
        self._is_started = False
        self._copilot_available = False

    async def __aenter__(self) -> "CopilotFeatureClient":
        """Start the client when used as ``async with CopilotFeatureClient() as c``."""
        return await self.start()

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Stop the client on leaving the ``async with`` block."""
        await self.stop()

    async def start(self) -> "CopilotFeatureClient":
        """
        Start the Copilot client.

        Calling ``start`` on an already started client is a no-op, so a
        second call cannot leak the first client/session pair. Any failure
        (SDK missing, connection error) switches the client to mock mode
        instead of raising.

        Returns
        -------
        self : CopilotFeatureClient
        """
        if self._is_started:
            return self

        client = None
        client_running = False
        try:
            from copilot import CopilotClient

            client = CopilotClient()
            await client.start()
            client_running = True
            # Only the model name and the streaming flag are forwarded to the
            # session; temperature and max_tokens are not sent here.
            session = await client.create_session(
                {
                    "model": self.config.model,
                    "streaming": self.config.streaming,
                }
            )
        except ImportError:
            # Copilot SDK not installed - use mock mode
            await self._fall_back_to_mock(client, client_running)
            print("Warning: copilot-sdk not installed. Using mock LLM responses.")
        except Exception as e:
            # Copilot not available - use mock mode
            await self._fall_back_to_mock(client, client_running)
            print(f"Warning: Could not connect to Copilot: {e}. Using mock LLM responses.")
        else:
            self._client = client
            self._session = session
            self._is_started = True
            self._copilot_available = True

        return self

    async def _fall_back_to_mock(self, client, client_running: bool) -> None:
        """Release a partially started SDK client and switch to mock mode."""
        if client is not None and client_running:
            try:
                await client.stop()
            except Exception:
                # Best-effort cleanup: the caller is already handling the
                # failure that brought us here and will report that one.
                pass
        self._client = None
        self._session = None
        self._copilot_available = False
        self._is_started = True

    async def stop(self) -> None:
        """Stop the Copilot client.

        The client state is reset before any teardown call is awaited, and
        the SDK client is stopped even if destroying the session raises.
        """
        session, client = self._session, self._client
        was_live = self._copilot_available

        self._session = None
        self._client = None
        self._copilot_available = False
        self._is_started = False

        if not was_live:
            return
        try:
            if session is not None:
                await session.destroy()
        finally:
            if client is not None:
                await client.stop()

    async def send_prompt(self, prompt: str) -> str:
        """
        Send a prompt and get a response.

        Starts the client on first use. In mock mode the canned response
        from ``_mock_response`` is returned.

        Parameters
        ----------
        prompt : str
            The prompt to send

        Returns
        -------
        response : str
            The content of the last assistant message, an empty string when
            no message arrived, or the literal text
            ``"Error: Request timed out"`` when ``config.timeout`` elapsed.
        """
        if not self._is_started:
            await self.start()

        if not self._copilot_available:
            return self._mock_response(prompt)

        # Use asyncio.Event to wait for completion
        done = asyncio.Event()
        response_content = []

        def on_event(event):
            kind = event.type.value
            if kind == "assistant.message":
                response_content.append(event.data.content)
            elif kind == "session.idle":
                done.set()

        # NOTE(review): ``on`` is presumed to return an unsubscribe callable;
        # confirm against the SDK. The ``callable`` guard below keeps this
        # safe if it returns something else.
        unsubscribe = self._session.on(on_event)
        try:
            await self._session.send({"prompt": prompt})
            try:
                await asyncio.wait_for(done.wait(), timeout=self.config.timeout)
            except asyncio.TimeoutError:
                return "Error: Request timed out"
        finally:
            # Without this, every call would leave one more handler attached
            # to the session, each holding on to its own response list.
            if callable(unsubscribe):
                unsubscribe()

        return response_content[-1] if response_content else ""

    def _mock_response(self, prompt: str) -> str:
        """Generate mock response when Copilot is unavailable.

        The kind of answer is chosen from the prompt text. The opening words
        are checked first, because the explanation and code-generation
        prompts built by this class both contain the word "feature" and
        would otherwise be answered with suggestion JSON.
        """
        import re

        explanation = "This feature captures the relationship between the input variables."
        generic_code = "result = df.iloc[:, 0] * df.iloc[:, 1]"

        lowered = prompt.lower()
        opening = lowered.lstrip()
        if opening.startswith("explain"):
            return explanation
        if opening.startswith("generate python code"):
            return generic_code

        wants_features = "suggest" in lowered or "feature" in lowered
        # Extract column names from prompt lines shaped like "- name (dtype"
        columns = re.findall(r"- (\w+) \(", prompt)

        if wants_features and columns:
            # Generate context-aware mock features based on actual columns
            features = []
            first = columns[0]
            if len(columns) >= 2:
                second = columns[1]
                features.append(
                    {
                        "name": f"{first}_{second}_ratio",
                        "code": f"result = df['{first}'] / (df['{second}'] + 1e-8)",
                        "explanation": f"Ratio of {first} to {second}, captures relative relationship",
                        "source_columns": [first, second],
                    }
                )
                features.append(
                    {
                        "name": f"{first}_{second}_product",
                        "code": f"result = df['{first}'] * df['{second}']",
                        "explanation": f"Interaction between {first} and {second}",
                        "source_columns": [first, second],
                    }
                )
            if len(columns) >= 3:
                third = columns[2]
                features.append(
                    {
                        "name": f"{first}_normalized_by_{third}",
                        "code": f"result = (df['{first}'] - df['{first}'].mean()) / (df['{third}'] + 1e-8)",
                        "explanation": f"Normalized {first} adjusted by {third}",
                        "source_columns": [first, third],
                    }
                )
            features.append(
                {
                    "name": f"{first}_zscore",
                    "code": f"result = (df['{first}'] - df['{first}'].mean()) / (df['{first}'].std() + 1e-8)",
                    "explanation": f"Z-score normalization of {first}",
                    "source_columns": [first],
                }
            )
            return json.dumps({"features": features})

        if wants_features:
            return json.dumps(
                {
                    "features": [
                        {
                            "name": "feature_interaction",
                            "code": "result = df.iloc[:, 0] * df.iloc[:, 1]",
                            "explanation": "Interaction between first two features",
                        }
                    ]
                }
            )
        if "explain" in lowered:
            return explanation
        if "code" in lowered:
            return generic_code
        return "Mock response for: " + prompt[:100]

    async def suggest_features(
        self,
        column_info: dict[str, str],
        task_description: str,
        column_descriptions: Optional[dict[str, str]] = None,
        domain: Optional[str] = None,
        max_suggestions: int = 10,
    ) -> list[dict[str, Any]]:
        """
        Get LLM suggestions for new features.

        Parameters
        ----------
        column_info : dict
            Dictionary mapping column names to data types
        task_description : str
            Description of the ML task
        column_descriptions : dict, optional
            Human-readable descriptions of columns
        domain : str, optional
            Domain context (e.g., 'healthcare', 'finance')
        max_suggestions : int, default=10
            Maximum number of feature suggestions

        Returns
        -------
        suggestions : list
            List of feature suggestions with code and explanations, holding
            at most ``max_suggestions`` entries.
        """
        prompt = self._build_suggestion_prompt(
            column_info, task_description, column_descriptions, domain, max_suggestions
        )

        response = await self.send_prompt(prompt)
        suggestions = self._parse_suggestions(response)
        # The prompt only asks for a count; enforce the documented maximum
        # here. max(..., 0) stops a negative limit slicing from the end.
        return suggestions[: max(max_suggestions, 0)]

    def _build_suggestion_prompt(
        self,
        column_info: dict[str, str],
        task_description: str,
        column_descriptions: Optional[dict[str, str]] = None,
        domain: Optional[str] = None,
        max_suggestions: int = 10,
    ) -> str:
        """Build the prompt for feature suggestions.

        The prompt lists every column as ``- name (dtype): description``;
        ``_mock_response`` relies on that shape to recover column names.
        """
        descriptions = column_descriptions or {}
        parts = [
            f"""You are an expert data scientist specializing in feature engineering.

TASK: Suggest {max_suggestions} new features for the following machine learning task.

## ML Task
{task_description}

## Available Columns
"""
        ]
        parts.extend(f"- {col} ({dtype}): {descriptions.get(col, '')}\n" for col, dtype in column_info.items())

        if domain:
            parts.append(f"\n## Domain Context\nThis is a {domain} problem.\n")

        parts.append(
            """
## Requirements
1. Suggest features that would be predictive for this task
2. Provide Python code using pandas (assume df is the DataFrame)
3. Explain why each feature might be useful
4. Consider interactions, ratios, and domain-specific transformations

## Output Format
Return a JSON object with a "features" array, each element having:
- "name": feature name (snake_case)
- "code": Python code to compute the feature (single line, result assigned to variable)
- "explanation": why this feature might be predictive
- "source_columns": list of column names used

Example:
{
"features": [
{
"name": "age_income_ratio",
"code": "result = df['age'] / (df['income'] + 1)",
"explanation": "Ratio of age to income may indicate life stage and financial maturity",
"source_columns": ["age", "income"]
}
]
}

Return ONLY the JSON object, no other text.
"""
        )
        return "".join(parts)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a leading markdown fence line and, if present, the closing one."""
        if not text.startswith("```"):
            return text
        lines = text.split("\n")[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        return "\n".join(lines)

    @staticmethod
    def _coerce_features(data: Any) -> list[dict[str, Any]]:
        """Turn decoded JSON into a list of feature dicts, or an empty list."""
        if isinstance(data, dict):
            data = data.get("features")
        if not isinstance(data, list):
            return []
        return [item for item in data if isinstance(item, dict)]

    def _parse_suggestions(self, response: str) -> list[dict[str, Any]]:
        """Parse feature suggestions from LLM response.

        Accepts a JSON object with a ``"features"`` array or a bare JSON
        array of features, either optionally wrapped in a markdown code
        fence, as well as a JSON object embedded in surrounding prose.
        Anything that cannot be decoded yields an empty list.
        """
        import re

        raw = (response or "").strip()
        candidates = [self._strip_code_fence(raw)]
        # Fallback: the outermost {...} span of the untouched response, so a
        # malformed fence cannot hide part of the JSON.
        embedded = re.search(r"\{.*\}", raw, re.DOTALL)
        if embedded:
            candidates.append(embedded.group())

        for candidate in candidates:
            try:
                data = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            return self._coerce_features(data)

        return []

    async def explain_feature(
        self,
        feature_name: str,
        feature_code: str,
        column_descriptions: Optional[dict[str, str]] = None,
        task_description: Optional[str] = None,
    ) -> str:
        """
        Get a human-readable explanation of a feature.

        Parameters
        ----------
        feature_name : str
            Name of the feature
        feature_code : str
            Code that generates the feature
        column_descriptions : dict, optional
            Descriptions of source columns
        task_description : str, optional
            Description of the ML task

        Returns
        -------
        explanation : str
            Human-readable explanation
        """
        parts = [
            f"""Explain this feature in simple terms for a business stakeholder:

Feature Name: {feature_name}
Code: {feature_code}
"""
        ]
        if column_descriptions:
            parts.append("\nColumn Descriptions:\n")
            parts.extend(f"- {col}: {desc}\n" for col, desc in column_descriptions.items())

        if task_description:
            parts.append(f"\nML Task: {task_description}\n")

        parts.append(
            """
Provide a 2-3 sentence explanation of:
1. What this feature represents
2. Why it might be predictive for the task
"""
        )
        return await self.send_prompt("".join(parts))

    @staticmethod
    def _extract_code(response: str) -> str:
        """Return the code carried by a response, without markdown fences."""
        code = response.strip()
        if "```" not in code:
            return code

        kept = []
        inside_block = False
        for line in code.split("\n"):
            if line.lstrip().startswith("```"):
                inside_block = not inside_block
            elif inside_block:
                kept.append(line)

        if not kept and "\n" not in code:
            # Fences on the same line as the code, e.g. "```result = 1```".
            return code.strip("`").strip()
        return "\n".join(kept)

    async def generate_feature_code(
        self, description: str, column_info: dict[str, str], constraints: Optional[list[str]] = None
    ) -> str:
        """
        Generate Python code for a described feature.

        Parameters
        ----------
        description : str
            Natural language description of desired feature
        column_info : dict
            Available columns and their types
        constraints : list, optional
            Constraints on the generated code

        Returns
        -------
        code : str
            Python code to generate the feature
        """
        parts = [
            f"""Generate Python code to create this feature:

Description: {description}

Available Columns:
"""
        ]
        parts.extend(f"- {col} ({dtype})\n" for col, dtype in column_info.items())

        if constraints:
            parts.append("\nConstraints:\n")
            parts.extend(f"- {c}\n" for c in constraints)

        parts.append(
            """
Requirements:
1. Use pandas operations (assume df is the DataFrame)
2. Assign the result to a variable called 'result'
3. Handle edge cases (division by zero, missing values)
4. Return ONLY the code, no explanations

Example output:
result = df['col1'] / (df['col2'] + 1e-8)
"""
        )
        response = await self.send_prompt("".join(parts))
        return self._extract_code(response)

    async def validate_feature_code(self, code: str, sample_data: Optional[dict[str, list]] = None) -> dict[str, Any]:
        """
        Validate generated feature code.

        The code is always compiled; it is executed only when ``sample_data``
        is provided (and non-empty).

        Parameters
        ----------
        code : str
            Feature code to validate
        sample_data : dict, optional
            Sample data for testing

        Returns
        -------
        result : dict
            Validation result with 'valid', 'error', and 'warnings' keys
        """
        import numpy as np
        import pandas as pd

        result = {"valid": True, "error": None, "warnings": []}

        # Syntax check
        try:
            compile(code, "<string>", "exec")
        except SyntaxError as e:
            result["valid"] = False
            result["error"] = f"Syntax error: {e}"
            return result

        # Runtime check with sample data
        if sample_data:
            try:
                df = pd.DataFrame(sample_data)
                # One namespace serves as both globals and locals: with two
                # separate dicts, names such as ``df`` are invisible inside
                # lambdas and other nested scopes of the executed code.
                namespace = {
                    "__builtins__": dict(self._SAFE_BUILTINS),
                    "df": df,
                    "np": np,
                    "pd": pd,
                }
                # SECURITY NOTE: restricting builtins does not sandbox this
                # call - the code still reaches pandas, numpy and object
                # introspection. Only validate code from a trusted source,
                # or run this in an isolated process.
                exec(code, namespace)

                if "result" not in namespace:
                    result["warnings"].append("Code does not assign to 'result' variable")

            except Exception as e:
                result["valid"] = False
                result["error"] = f"Runtime error: {e}"

        return result
484
+
485
+
486
+ # Synchronous wrapper for non-async contexts
487
class SyncCopilotFeatureClient:
    """Synchronous wrapper for CopilotFeatureClient.

    Each call runs the matching coroutine of an internal
    ``CopilotFeatureClient`` to completion on an event loop owned by this
    wrapper. The wrapper cannot be used from code that is already running
    inside an event loop; use ``CopilotFeatureClient`` with ``await`` there.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``CopilotFeatureClient``.

    Examples
    --------
    >>> with SyncCopilotFeatureClient(model='gpt-5') as client:
    ...     code = client.generate_feature_code(
    ...         description='ratio of a to b', column_info={'a': 'int', 'b': 'int'}
    ...     )
    """

    def __init__(self, **kwargs):
        self._async_client = CopilotFeatureClient(**kwargs)
        self._loop = None

    def __enter__(self):
        """Start the client and return this wrapper."""
        self.start()
        return self

    def __exit__(self, exc_type, exc, tb):
        """Stop the client and release the private event loop."""
        try:
            self.stop()
        finally:
            self.close()

    def _get_loop(self):
        """Return the wrapper's private event loop, creating it on demand.

        A dedicated loop is used instead of ``asyncio.get_event_loop()``, so
        the wrapper neither depends on the deprecated implicit creation of a
        global loop nor borrows a loop that other code owns.
        """
        if self._loop is None or self._loop.is_closed():
            self._loop = asyncio.new_event_loop()
        return self._loop

    def _run(self, coro):
        """Run ``coro`` to completion on the private loop and return its result.

        Raises
        ------
        RuntimeError
            If called from a thread that is already running an event loop.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: blocking on ours is safe.
            return self._get_loop().run_until_complete(coro)

        # Close the coroutine so it does not trigger a "never awaited" warning.
        coro.close()
        raise RuntimeError(
            "SyncCopilotFeatureClient cannot be used inside a running event loop; "
            "use CopilotFeatureClient with 'await' instead."
        )

    def close(self):
        """Close the private event loop. Safe to call more than once.

        This does not stop the underlying client; call ``stop`` first.
        """
        if self._loop is not None and not self._loop.is_closed():
            self._loop.close()
        self._loop = None

    def start(self):
        """Start the underlying client; returns whatever its ``start`` returns."""
        return self._run(self._async_client.start())

    def stop(self):
        """Stop the underlying client."""
        return self._run(self._async_client.stop())

    def suggest_features(self, *args, **kwargs):
        """Blocking counterpart of ``CopilotFeatureClient.suggest_features``."""
        return self._run(self._async_client.suggest_features(*args, **kwargs))

    def explain_feature(self, *args, **kwargs):
        """Blocking counterpart of ``CopilotFeatureClient.explain_feature``."""
        return self._run(self._async_client.explain_feature(*args, **kwargs))

    def generate_feature_code(self, *args, **kwargs):
        """Blocking counterpart of ``CopilotFeatureClient.generate_feature_code``."""
        return self._run(self._async_client.generate_feature_code(*args, **kwargs))

    def validate_feature_code(self, code: str, sample_data=None):
        """Blocking counterpart of ``CopilotFeatureClient.validate_feature_code``."""
        return self._run(self._async_client.validate_feature_code(code=code, sample_data=sample_data))