featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
+ """Persistent storage for transform rules.
+
+ Provides JSON-file based storage for saving, loading, and searching
+ reusable transform rules.
+ """
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Optional
+
+ from featcopilot.core.transform_rule import TransformRule
+ from featcopilot.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class TransformRuleStore:
+     """
+     Persistent storage for transform rules.
+
+     Stores rules in a JSON file for reuse across sessions and datasets.
+     Supports searching by tags, description similarity, and column patterns.
+
+     Parameters
+     ----------
+     path : str, optional
+         Path to the JSON file for storage. Defaults to ~/.featcopilot/rules.json
+
+     Examples
+     --------
+     >>> store = TransformRuleStore()
+     >>> store.save_rule(rule)
+     >>> matching = store.find_matching_rules(columns=["price", "quantity"])
+     >>> all_rules = store.list_rules()
+     """
+
+     DEFAULT_PATH = "~/.featcopilot/rules.json"
+
+     def __init__(self, path: Optional[str] = None):
+         self.path = Path(os.path.expanduser(path or self.DEFAULT_PATH))
+         self._rules: dict[str, TransformRule] = {}
+         self._ensure_directory()
+         self._load()
+
+     def _ensure_directory(self) -> None:
+         """Ensure the storage directory exists."""
+         self.path.parent.mkdir(parents=True, exist_ok=True)
+
+     def _load(self) -> None:
+         """Load rules from storage file."""
+         if self.path.exists():
+             try:
+                 with open(self.path, encoding="utf-8") as f:
+                     data = json.load(f)
+                 self._rules = {rule_id: TransformRule.from_dict(rule_data) for rule_id, rule_data in data.items()}
+                 logger.debug(f"Loaded {len(self._rules)} rules from {self.path}")
+             except (json.JSONDecodeError, KeyError) as e:
+                 logger.warning(f"Failed to load rules from {self.path}: {e}")
+                 self._rules = {}
+         else:
+             self._rules = {}
+
+     def _save(self) -> None:
+         """Save rules to storage file."""
+         try:
+             with open(self.path, "w", encoding="utf-8") as f:
+                 data = {rule_id: rule.to_dict() for rule_id, rule in self._rules.items()}
+                 json.dump(data, f, indent=2)
+             logger.debug(f"Saved {len(self._rules)} rules to {self.path}")
+         except OSError as e:
+             logger.error(f"Failed to save rules to {self.path}: {e}")
+             raise
+
+     def save_rule(self, rule: TransformRule) -> str:
+         """
+         Save a rule to the store.
+
+         Parameters
+         ----------
+         rule : TransformRule
+             The rule to save
+
+         Returns
+         -------
+         str
+             The rule's ID
+         """
+         self._rules[rule.id] = rule
+         self._save()
+         logger.info(f"Saved rule '{rule.name}' with ID {rule.id}")
+         return rule.id
+
+     def get_rule(self, rule_id: str) -> Optional[TransformRule]:
+         """
+         Get a rule by ID.
+
+         Parameters
+         ----------
+         rule_id : str
+             The rule's ID
+
+         Returns
+         -------
+         TransformRule or None
+             The rule if found, None otherwise
+         """
+         return self._rules.get(rule_id)
+
+     def get_rule_by_name(self, name: str) -> Optional[TransformRule]:
+         """
+         Get a rule by name.
+
+         Parameters
+         ----------
+         name : str
+             The rule's name
+
+         Returns
+         -------
+         TransformRule or None
+             The first rule matching the name, None if not found
+         """
+         for rule in self._rules.values():
+             if rule.name == name:
+                 return rule
+         return None
+
+     def delete_rule(self, rule_id: str) -> bool:
+         """
+         Delete a rule by ID.
+
+         Parameters
+         ----------
+         rule_id : str
+             The rule's ID
+
+         Returns
+         -------
+         bool
+             True if deleted, False if not found
+         """
+         if rule_id in self._rules:
+             del self._rules[rule_id]
+             self._save()
+             logger.info(f"Deleted rule {rule_id}")
+             return True
+         return False
+
+     def list_rules(self, tags: Optional[list[str]] = None) -> list[TransformRule]:
+         """
+         List all rules, optionally filtered by tags.
+
+         Parameters
+         ----------
+         tags : list[str], optional
+             Filter rules that have all specified tags
+
+         Returns
+         -------
+         list[TransformRule]
+             List of matching rules
+         """
+         rules = list(self._rules.values())
+
+         if tags:
+             rules = [r for r in rules if all(t in r.tags for t in tags)]
+
+         return rules
+
+     def find_matching_rules(
+         self,
+         columns: Optional[list[str]] = None,
+         description: Optional[str] = None,
+         tags: Optional[list[str]] = None,
+         min_usage: int = 0,
+     ) -> list[tuple[TransformRule, dict[str, str]]]:
+         """
+         Find rules that can be applied to the given context.
+
+         Parameters
+         ----------
+         columns : list[str], optional
+             Available column names to match against
+         description : str, optional
+             Description to search for (keyword matching)
+         tags : list[str], optional
+             Required tags
+         min_usage : int, default=0
+             Minimum usage count
+
+         Returns
+         -------
+         list[tuple[TransformRule, dict]]
+             List of (rule, column_mapping) tuples for applicable rules,
+             sorted by usage count (most used first)
+         """
+         results: list[tuple[TransformRule, dict[str, str]]] = []
+
+         for rule in self._rules.values():
+             # Filter by usage count
+             if rule.usage_count < min_usage:
+                 continue
+
+             # Filter by tags
+             if tags and not all(t in rule.tags for t in tags):
+                 continue
+
+             # Filter by description keywords
+             if description:
+                 keywords = description.lower().split()
+                 rule_text = f"{rule.name} {rule.description}".lower()
+                 if not any(kw in rule_text for kw in keywords):
+                     continue
+
+             # Check column compatibility
+             mapping: dict[str, str] = {}
+             if columns:
+                 matches, mapping = rule.matches_columns(columns)
+                 if not matches:
+                     continue
+
+             results.append((rule, mapping))
+
+         # Sort by usage count (descending)
+         results.sort(key=lambda x: x[0].usage_count, reverse=True)
+
+         return results
+
+     def search_by_description(self, query: str, limit: int = 10) -> list[TransformRule]:
+         """
+         Search rules by description similarity.
+
+         Parameters
+         ----------
+         query : str
+             Search query
+         limit : int, default=10
+             Maximum number of results
+
+         Returns
+         -------
+         list[TransformRule]
+             Matching rules sorted by relevance
+         """
+         query_words = set(query.lower().split())
+         scored_rules: list[tuple[float, TransformRule]] = []
+
+         for rule in self._rules.values():
+             rule_words = set(f"{rule.name} {rule.description}".lower().split())
+
+             # Simple word overlap scoring
+             overlap = len(query_words & rule_words)
+             if overlap > 0:
+                 score = overlap / len(query_words)
+                 scored_rules.append((score, rule))
+
+         # Sort by score descending
+         scored_rules.sort(key=lambda x: x[0], reverse=True)
+
+         return [rule for _, rule in scored_rules[:limit]]
+
+     def import_rules(self, path: str, merge: bool = True) -> int:
+         """
+         Import rules from another JSON file.
+
+         Parameters
+         ----------
+         path : str
+             Path to import from
+         merge : bool, default=True
+             If True, merge with existing rules. If False, replace all.
+
+         Returns
+         -------
+         int
+             Number of rules imported
+         """
+         import_path = Path(os.path.expanduser(path))
+
+         if not import_path.exists():
+             raise FileNotFoundError(f"Import file not found: {path}")
+
+         with open(import_path, encoding="utf-8") as f:
+             data = json.load(f)
+
+         if not merge:
+             self._rules = {}
+
+         count = 0
+         for _rule_id, rule_data in data.items():
+             rule = TransformRule.from_dict(rule_data)
+             self._rules[rule.id] = rule
+             count += 1
+
+         self._save()
+         logger.info(f"Imported {count} rules from {path}")
+
+         return count
+
+     def export_rules(self, path: str, tags: Optional[list[str]] = None) -> int:
+         """
+         Export rules to a JSON file.
+
+         Parameters
+         ----------
+         path : str
+             Path to export to
+         tags : list[str], optional
+             Only export rules with these tags
+
+         Returns
+         -------
+         int
+             Number of rules exported
+         """
+         export_path = Path(os.path.expanduser(path))
+         export_path.parent.mkdir(parents=True, exist_ok=True)
+
+         rules_to_export = self.list_rules(tags=tags)
+
+         with open(export_path, "w", encoding="utf-8") as f:
+             data = {r.id: r.to_dict() for r in rules_to_export}
+             json.dump(data, f, indent=2)
+
+         logger.info(f"Exported {len(rules_to_export)} rules to {path}")
+
+         return len(rules_to_export)
+
+     def clear(self) -> None:
+         """Remove all rules from the store."""
+         self._rules = {}
+         self._save()
+         logger.info("Cleared all rules")
+
+     def __len__(self) -> int:
+         return len(self._rules)
+
+     def __contains__(self, rule_id: str) -> bool:
+         return rule_id in self._rules
+
+     def __iter__(self):
+         return iter(self._rules.values())
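The search in `search_by_description` above is a simple word-overlap score between the query and each rule's name plus description. A minimal standalone sketch of that scoring logic, using made-up rule names and descriptions rather than real stored rules:

```python
# Standalone sketch of the word-overlap scoring used by search_by_description.
# The (name, description) pairs below are illustrative, not actual rules.
query = "normalize price columns"
rules = [
    ("log_price", "log transform of the price column"),
    ("order_ratio", "quantity per order"),
]

query_words = set(query.lower().split())
scored = []
for name, description in rules:
    rule_words = set(f"{name} {description}".lower().split())
    overlap = len(query_words & rule_words)  # number of shared words
    if overlap > 0:
        scored.append((overlap / len(query_words), name))

scored.sort(reverse=True)  # most relevant first
print(scored)  # [(0.3333333333333333, 'log_price')]
```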
@@ -15,6 +15,9 @@ from featcopilot.engines.tabular import TabularEngine
  from featcopilot.engines.text import TextEngine
  from featcopilot.engines.timeseries import TimeSeriesEngine
  from featcopilot.selection.unified import FeatureSelector
+ from featcopilot.utils.logger import get_logger
+
+ logger = get_logger(__name__)


  class FeatureEngineerTransformer(BaseEstimator, TransformerMixin):
@@ -87,7 +90,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
      Parameters
      ----------
      engines : list, default=['tabular']
-         Engines to use ('tabular', 'timeseries', 'text', 'llm')
+         Engines to use ('tabular', 'timeseries', 'relational', 'text', 'llm')
      max_features : int, optional
          Maximum features to generate/select
      selection_methods : list, default=['mutual_info', 'importance']
@@ -102,7 +105,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
      >>> engineer = AutoFeatureEngineer(
      ...     engines=['tabular', 'llm'],
      ...     max_features=100,
-     ...     llm_config={'model': 'gpt-5', 'enable_semantic': True}
+     ...     llm_config={'model': 'gpt-5.2', 'enable_semantic': True}
      ... )
      >>> X_transformed = engineer.fit_transform(X, y)
      """
@@ -183,7 +186,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
              self._engine_instances[engine_name] = engine

              if self.verbose:
-                 print(f"Fitted {engine_name} engine")
+                 logger.info(f"Fitted {engine_name} engine")

          self._is_fitted = True
          return self
@@ -196,11 +199,13 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
          return TimeSeriesEngine(max_features=self.max_features, verbose=self.verbose)
      elif engine_name == "text":
          return TextEngine(max_features=self.max_features, verbose=self.verbose)
+     elif engine_name == "relational":
+         return RelationalEngine(max_features=self.max_features, verbose=self.verbose)
      elif engine_name == "llm":
          from featcopilot.llm.semantic_engine import SemanticEngine

          return SemanticEngine(
-             model=self.llm_config.get("model", "gpt-5"),
+             model=self.llm_config.get("model", "gpt-5.2"),
              max_suggestions=self.llm_config.get("max_suggestions", 20),
              domain=self.llm_config.get("domain"),
              verbose=self.verbose,
@@ -242,7 +247,7 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
                  result[col] = transformed[col]

              if self.verbose:
-                 print(f"{engine_name}: Added {len(new_cols)} features")
+                 logger.info(f"{engine_name}: Added {len(new_cols)} features")

          # Handle infinities and NaNs
          result = result.replace([np.inf, -np.inf], np.nan)
@@ -291,18 +296,25 @@ class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
          self.fit(X, y, column_descriptions, task_description, **fit_params)
          result = self.transform(X)

+         # Track original features (input columns) vs derived features
+         if isinstance(X, np.ndarray):
+             original_features = {f"feature_{i}" for i in range(X.shape[1])}
+         else:
+             original_features = set(X.columns)
+
          # Apply feature selection if enabled and y is provided
          if apply_selection and y is not None and self.max_features:
              self._selector = FeatureSelector(
                  methods=self.selection_methods,
                  max_features=self.max_features,
                  correlation_threshold=self.correlation_threshold,
+                 original_features=original_features,
                  verbose=self.verbose,
              )
              result = self._selector.fit_transform(result, y)

              if self.verbose:
-                 print(f"Selected {len(self._selector.get_selected_features())} features")
+                 logger.info(f"Selected {len(self._selector.get_selected_features())} features")

          return result

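Taken together, `fit_transform` now fits the configured engines, transforms the input, records which feature names came from the input columns, and (when `y` and `max_features` are given) runs `FeatureSelector` over the result. A minimal usage sketch; the top-level import path and the sample data are assumptions for illustration:

```python
# Usage sketch of the fit_transform flow shown in this diff.
# Import path and DataFrame contents are illustrative assumptions.
import pandas as pd
from featcopilot import AutoFeatureEngineer

X = pd.DataFrame({"price": [9.5, 12.0, 7.25, 4.0], "quantity": [3, 1, 4, 2]})
y = [0, 1, 0, 1]

engineer = AutoFeatureEngineer(engines=["tabular"], max_features=20)
X_new = engineer.fit_transform(X, y)  # fit engines, transform, then select
print(X_new.shape)
```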
@@ -1,9 +1,23 @@
  """Utility functions and classes."""

  from featcopilot.utils.cache import FeatureCache
+ from featcopilot.utils.models import (
+     fetch_models,
+     get_default_model,
+     get_model_info,
+     get_model_names,
+     is_valid_model,
+     list_models,
+ )
  from featcopilot.utils.parallel import parallel_apply

  __all__ = [
      "parallel_apply",
      "FeatureCache",
+     "fetch_models",
+     "list_models",
+     "get_model_info",
+     "get_default_model",
+     "get_model_names",
+     "is_valid_model",
  ]
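The new `featcopilot.utils.models` helpers are re-exported from `featcopilot.utils`. Only the names appear in this diff, so the call signatures in the sketch below are assumptions:

```python
# Sketch only: the diff shows these exported names but not their signatures,
# so the argument-free calls here are assumptions.
from featcopilot.utils import get_default_model, is_valid_model, list_models

print(list_models())          # assumed: enumerate known models
model = get_default_model()   # assumed: no-arg default lookup
print(is_valid_model(model))  # expected True for the default model
```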
@@ -0,0 +1,47 @@
+ """Centralized logging configuration for featcopilot."""
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+
+ # Create the logger
+ logger = logging.getLogger("featcopilot")
+
+ # Default handler with line number format
+ _handler = logging.StreamHandler(sys.stderr)
+ _formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s")
+ _handler.setFormatter(_formatter)
+
+ # Only add handler if not already configured
+ if not logger.handlers:
+     logger.addHandler(_handler)
+     logger.setLevel(logging.INFO)
+
+
+ def get_logger(name: str | None = None) -> logging.Logger:
+     """Get a logger instance.
+
+     Args:
+         name: Optional name for the logger. If None, returns the root featcopilot logger.
+
+     Returns:
+         A configured logger instance.
+     """
+     if name:
+         # Strip 'featcopilot.' prefix if present to avoid duplication
+         if name.startswith("featcopilot."):
+             name = name[len("featcopilot.") :]
+         return logging.getLogger(f"featcopilot.{name}")
+     return logger
+
+
+ def set_level(level: int | str) -> None:
+     """Set the logging level.
+
+     Args:
+         level: Logging level (e.g., logging.DEBUG, logging.INFO, "DEBUG", "INFO")
+     """
+     if isinstance(level, str):
+         level = getattr(logging, level.upper())
+     logger.setLevel(level)
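Usage follows directly from the definitions above: `get_logger` namespaces loggers under the shared `featcopilot` logger (stripping an existing `featcopilot.` prefix to avoid `featcopilot.featcopilot.*` names), and `set_level` accepts either a `logging` constant or a level name:

```python
# Usage sketch for the logging helpers defined above.
import logging
from featcopilot.utils.logger import get_logger, set_level

log = get_logger(__name__)  # child of the shared 'featcopilot' logger
set_level("DEBUG")          # equivalent to set_level(logging.DEBUG)
log.debug("handler and level are configured on the 'featcopilot' logger")
```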