NEExT 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
NEExT/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # This file makes the directory a Python package
2
+ # Add any imports or code that should be available when someone imports NEExT
3
+
4
+ # Version of the package
5
+ __version__ = "0.1.0"
6
+
7
+ # Re-export the NEExT class at package level so callers can write `from NEExT import NEExT`
8
+ from .framework import NEExT
9
+
10
+ # This allows "from NEExT import *"
11
+ __all__ = ['NEExT']
NEExT/embeddings.py ADDED
@@ -0,0 +1,88 @@
1
+ from typing import List
2
+ import pandas as pd
3
+ from pydantic import BaseModel
4
+
5
class EmbeddingsConfig(BaseModel):
    """
    Pydantic schema describing a set of embeddings.

    Attributes:
        embedding_name (str): Name given to the embedding
        embedding_columns (List[str]): Names of the embedding columns
    """
    # Both fields are required: neither one declares a default value.
    embedding_name: str
    embedding_columns: List[str]
9
+
10
class Embeddings:
    """
    A class for managing graph embeddings data and operations.

    This class provides a container for embeddings data and operations like
    merging different types of embeddings.

    Attributes:
        embeddings_df (pd.DataFrame): DataFrame containing the embeddings
        embedding_name (str): Name of the embedding algorithm used
        embedding_columns (List[str]): List of embedding column names

    Example:
        >>> embeddings1 = Embeddings(df1, "wasserstein", ["emb_0", "emb_1"])
        >>> embeddings2 = Embeddings(df2, "approx_wasserstein", ["emb_0", "emb_1"])
        >>> merged = embeddings1 + embeddings2  # Combines embeddings with unique column names
    """

    def __init__(
        self,
        embeddings_df: pd.DataFrame,
        embedding_name: str,
        embedding_columns: List[str]
    ):
        """
        Initialize the Embeddings object.

        Args:
            embeddings_df: DataFrame holding the embedding values
            embedding_name: Name of the embedding algorithm used
            embedding_columns: Names of the columns of ``embeddings_df``
                that hold embedding values

        Raises:
            ValueError: If any name in ``embedding_columns`` is not a column
                of ``embeddings_df``
        """
        # Validate that all embedding columns exist in DataFrame
        missing_cols = [col for col in embedding_columns if col not in embeddings_df.columns]
        if missing_cols:
            raise ValueError(f"Embedding columns {missing_cols} not found in DataFrame")

        self.embeddings_df = embeddings_df
        self.embedding_name = embedding_name
        # Keep a private copy so later mutation of the caller's list cannot
        # silently desynchronise this object from its DataFrame.
        self.embedding_columns = list(embedding_columns)

    def __add__(self, other: 'Embeddings') -> 'Embeddings':
        """
        Merge two Embeddings objects.

        Every embedding column is renamed to ``<embedding_name>_<column>`` and
        the two DataFrames are outer-joined on ``graph_id``. Neither operand is
        modified. If a renamed column of ``other`` would clash with a column
        already present on the left side (for example when both operands share
        the same ``embedding_name``), a numeric suffix (``_2``, ``_3``, ...) is
        appended so the merged column names stay unique.

        Args:
            other: Another Embeddings object to merge with

        Returns:
            Embeddings: New Embeddings object containing merged data with unique column names

        Raises:
            TypeError: If ``other`` is not an Embeddings object
        """
        if not isinstance(other, Embeddings):
            raise TypeError("Can only add Embeddings objects together")

        # Rename columns to include embedding name prefix. rename() returns a
        # new DataFrame, so the operands' own frames are left untouched.
        left_names = {
            col: f"{self.embedding_name}_{col}"
            for col in self.embedding_columns
        }
        left_df = self.embeddings_df.rename(columns=left_names)

        # Without this de-duplication pd.merge would append _x/_y suffixes to
        # clashing names and the merged object could no longer find its columns.
        taken = set(left_df.columns)
        right_names = {}
        for col in other.embedding_columns:
            base = f"{other.embedding_name}_{col}"
            candidate = base
            counter = 2
            while candidate in taken:
                candidate = f"{base}_{counter}"
                counter += 1
            taken.add(candidate)
            right_names[col] = candidate
        right_df = other.embeddings_df.rename(columns=right_names)

        # Merge DataFrames on graph_id, keeping graphs present in either side
        merged_df = pd.merge(left_df, right_df, on='graph_id', how='outer')

        new_columns = list(left_names.values()) + list(right_names.values())
        new_name = f"{self.embedding_name}+{other.embedding_name}"

        return Embeddings(merged_df, new_name, new_columns)
@@ -0,0 +1,325 @@
1
+ from typing import List, Optional, Literal, Dict
2
+ import pandas as pd
3
+ import numpy as np
4
+ from pydantic import BaseModel, Field
5
+ import time
6
+ from tqdm import tqdm
7
+ from .graph_collection import GraphCollection
8
+ from .features import Features
9
+ from .graph_embeddings import GraphEmbeddings
10
+ from .ml_models import MLModels
11
+ from .embeddings import Embeddings
12
+
13
class FeatureImportanceConfig(BaseModel):
    """
    Pydantic schema holding the settings of a feature importance run.

    Only ``algorithm`` is required; every other field has a default.
    """
    # Importance strategy; limited to the three names listed in the Literal
    algorithm: Literal["supervised_greedy", "supervised_fast", "unsupervised"]
    # Name of the embedding algorithm
    embedding_algorithm: str = "approx_wasserstein"
    # Model family; limited to the two names listed in the Literal
    model_name: Literal["xgboost", "random_forest"] = "random_forest"
    # Seed value
    random_state: int = 42
    # Sample size setting
    sample_size: int = 5
20
+
21
class FeatureImportance:
    """
    A class for analyzing feature importance in graph embeddings.

    This class provides methods to determine the importance of node features
    based on their predictive power. Two supervised strategies are implemented
    ("supervised_greedy" and "supervised_fast"); the "unsupervised" strategy is
    accepted by the configuration but not implemented yet.

    Attributes:
        graph_collection (GraphCollection): Collection of graphs to analyze
        features (Features): Features object containing node features
        config (FeatureImportanceConfig): Configuration for importance analysis
        available_algorithms (dict): Maps algorithm names to the bound methods
            that implement them

    Example:
        >>> importance = FeatureImportance(
        ...     graph_collection=collection,
        ...     features=features,
        ...     algorithm="supervised_greedy",
        ...     embedding_algorithm="approx_wasserstein"
        ... )
        >>> results_df = importance.compute()
    """

    def __init__(
        self,
        graph_collection: "GraphCollection",
        features: "Features",
        algorithm: str,
        embedding_algorithm: str = "approx_wasserstein",
        random_state: int = 42,
        n_iterations: int = 5  # Keep for backward compatibility
    ):
        """
        Initialize the FeatureImportance analyzer.

        Args:
            graph_collection: Collection of graphs to analyze
            features: Features object containing node features
            algorithm: One of "supervised_greedy", "supervised_fast" or
                "unsupervised" (validated by FeatureImportanceConfig)
            embedding_algorithm: Name of the embedding algorithm to use
            random_state: Seed forwarded to the embedding and model steps
            n_iterations: Stored as ``config.sample_size`` and forwarded to
                the model step as ``sample_size``
        """
        self.config = FeatureImportanceConfig(
            algorithm=algorithm,
            embedding_algorithm=embedding_algorithm,
            random_state=random_state,
            sample_size=n_iterations  # Use n_iterations as sample_size
        )
        self.graph_collection = graph_collection
        self.features = features

        # Define available algorithms
        self.available_algorithms = {
            "supervised_greedy": self._supervised_greedy,
            "supervised_fast": self._supervised_fast,
            "unsupervised": self._unsupervised
        }

    def compute(self) -> pd.DataFrame:
        """
        Compute feature importance based on the configured algorithm.

        Returns:
            pd.DataFrame: DataFrame containing feature importance results, with
                a ``total_time`` column holding the wall-clock duration in
                seconds of the whole computation

        Raises:
            ValueError: If the configured algorithm is not registered
            NotImplementedError: If the configured algorithm is "unsupervised"
        """
        if self.config.algorithm not in self.available_algorithms:
            raise ValueError(f"Unknown algorithm: {self.config.algorithm}")

        start_time = time.time()
        results = self.available_algorithms[self.config.algorithm]()
        total_time = time.time() - start_time

        # Add total computation time to results (overwrites any value the
        # algorithm itself may have stored under the same column name)
        results['total_time'] = total_time

        return results

    def _model_type(self) -> str:
        """Return "classifier" when the first graph's label is an integer, else "regressor"."""
        first_label = self.graph_collection.graphs[0].graph_label
        if isinstance(first_label, (int, np.integer)):
            return "classifier"
        return "regressor"

    def _embed(self, feature_columns: List[str]):
        """Compute graph embeddings from the given features, one dimension per feature."""
        return GraphEmbeddings(
            graph_collection=self.graph_collection,
            features=self.features,
            embedding_algorithm=self.config.embedding_algorithm,
            embedding_dimension=len(feature_columns),
            feature_columns=feature_columns,
            random_state=self.config.random_state
        ).compute()

    def _fit(self, embeddings, model_type: str, **model_kwargs):
        """Train and evaluate a model on the embeddings and return the MLModels results."""
        return MLModels(
            graph_collection=self.graph_collection,
            embeddings=embeddings,
            model_type=model_type,
            random_state=self.config.random_state,
            sample_size=self.config.sample_size,
            **model_kwargs
        ).compute()

    @staticmethod
    def _signed_score(results) -> float:
        """Return a higher-is-better score: mean accuracy, or negated mean RMSE."""
        if results["model_type"] == "classifier":
            return np.mean(results["accuracy"])
        return -np.mean(results["rmse"])  # Negative RMSE for maximization

    def _supervised_greedy(self) -> pd.DataFrame:
        """
        Compute feature importance using supervised greedy algorithm.

        This method determines feature importance by iteratively selecting features
        that maximize model performance when combined with previously selected features.
        No ``model_name`` is passed to MLModels here, so its default model is used.

        Returns:
            pd.DataFrame: Results containing:
                - feature_name: Name of the feature, in selection order
                - avg_performance: Average model performance at each step
                  (mean accuracy, or mean RMSE as a positive number)
                - embedding_algorithm: Name of embedding algorithm used
        """
        available_features = self.features.feature_columns.copy()
        selected_features = []
        performance_scores = []

        # The label type cannot change between iterations, so resolve it once
        # (and only when there is at least one feature to evaluate).
        model_type = self._model_type() if available_features else None

        # Outer progress bar: one tick per selected feature. The context
        # manager closes the bar even if an iteration raises.
        with tqdm(
            total=len(available_features),
            desc="Selecting features",
            position=0,
            leave=True
        ) as pbar:
            while available_features:
                best_score = float('-inf')
                best_feature = None

                # Inner progress bar: one tick per candidate feature
                with tqdm(
                    available_features,
                    desc=f"Testing features (selected: {len(selected_features)})",
                    position=1,
                    leave=False
                ) as inner_pbar:
                    for feature in inner_pbar:
                        current_features = selected_features + [feature]
                        inner_pbar.set_postfix({'testing': feature}, refresh=True)

                        embeddings = self._embed(current_features)
                        avg_score = self._signed_score(self._fit(embeddings, model_type))

                        # The `is None` arm guarantees a pick even when every
                        # score is NaN (NaN never compares greater), so the
                        # loop always makes progress.
                        if best_feature is None or avg_score > best_score:
                            best_score = avg_score
                            best_feature = feature

                # Add best feature to selected features
                selected_features.append(best_feature)
                available_features.remove(best_feature)
                performance_scores.append(abs(best_score))  # Convert back to positive RMSE if needed

                pbar.update(1)
                pbar.set_postfix({'best_feature': best_feature, 'score': abs(best_score)}, refresh=True)

        return pd.DataFrame({
            'feature_name': selected_features,
            'avg_performance': performance_scores,
            'embedding_algorithm': self.config.embedding_algorithm
        })

    def _supervised_fast(self) -> pd.DataFrame:
        """
        Compute feature importance using supervised fast algorithm.

        This method:
        1. Determines feature importance order using Random Forest on 1D embeddings
        2. Evaluates performance by iteratively building models with increasing feature sets
        3. Returns results in same format as greedy method for consistency

        Returns:
            pd.DataFrame: Results containing:
                - feature_name: Name of the feature in order of importance
                - avg_performance: Performance using features up to this point
                - embedding_algorithm: Name of embedding algorithm used
                - total_time: Total computation time in seconds
        """
        start_time = time.time()
        feature_columns = self.features.feature_columns

        # Nothing to rank: return an empty result, as the greedy method does,
        # instead of failing on the first-element lookup below.
        if not feature_columns:
            return pd.DataFrame({
                'feature_name': [],
                'avg_performance': [],
                'embedding_algorithm': self.config.embedding_algorithm,
                'total_time': time.time() - start_time
            })

        model_type = self._model_type()
        feature_embeddings = []

        # Generate 1D embeddings for each feature
        with tqdm(
            feature_columns,
            desc="Computing initial embeddings",
            position=0,
            leave=True
        ) as pbar:
            for feature in pbar:
                pbar.set_postfix({'feature': feature}, refresh=True)
                embeddings = self._embed([feature])
                # Name the single embedding column after its source feature.
                # Assumes the 1D embedding column is called 'emb_0'.
                feature_embeddings.append(
                    embeddings.embeddings_df.rename(columns={'emb_0': feature})
                )

        # Merge all per-feature embeddings into one frame keyed by graph_id
        merged_df = feature_embeddings[0]
        for df in feature_embeddings[1:]:
            merged_df = pd.merge(merged_df, df, on='graph_id', how='outer')

        # Get feature importance order using Random Forest
        combined = Embeddings(
            embeddings_df=merged_df,
            embedding_name=self.config.embedding_algorithm,
            embedding_columns=feature_columns
        )
        results = self._fit(
            combined,
            model_type,
            model_name="random_forest",
            compute_feature_importance=True
        )
        # NOTE(review): relies on 'feature_importance' being indexed by feature
        # name in descending importance order; confirm against MLModels.
        ordered_features = results['feature_importance'].index.tolist()

        # Evaluate performance with the top-1, top-2, ... feature sets
        performance_scores = []
        with tqdm(
            range(len(ordered_features)),
            desc="Evaluating feature combinations",
            position=0,
            leave=True
        ) as pbar:
            for i in pbar:
                current_features = ordered_features[:i + 1]
                pbar.set_postfix({'n_features': len(current_features)}, refresh=True)

                embeddings = self._embed(current_features)
                results = self._fit(embeddings, model_type, model_name="random_forest")
                score = self._signed_score(results)
                performance_scores.append(abs(score))  # Convert back to positive RMSE if needed

        total_time = time.time() - start_time

        return pd.DataFrame({
            'feature_name': ordered_features,
            'avg_performance': performance_scores,
            'embedding_algorithm': self.config.embedding_algorithm,
            'total_time': total_time
        })

    def _unsupervised(self) -> pd.DataFrame:
        """
        Compute feature importance using unsupervised algorithm.

        Raises:
            NotImplementedError: Always; this algorithm has no implementation yet.
        """
        # Fail loudly: returning None here would make compute() crash with an
        # unrelated TypeError when it tries to attach the timing column.
        raise NotImplementedError(
            "The 'unsupervised' feature importance algorithm is not implemented yet"
        )
NEExT/features.py ADDED
@@ -0,0 +1,94 @@
1
+ from typing import List, Optional, Literal
2
+ import pandas as pd
3
+ from pydantic import BaseModel
4
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
5
+
6
class FeaturesConfig(BaseModel):
    """
    Pydantic schema selecting the scaler used for feature normalization.

    Attributes:
        scaler_type: One of "StandardScaler", "MinMaxScaler" or "RobustScaler"
    """
    # Defaults to "StandardScaler" when no scaler is named.
    scaler_type: Literal["StandardScaler", "MinMaxScaler", "RobustScaler"] = "StandardScaler"
9
+
10
class Features:
    """
    A class for managing feature data and operations.

    This class provides a container for feature data and operations like
    normalization and merging of features.

    Attributes:
        features_df (pd.DataFrame): DataFrame containing the features
        feature_columns (List[str]): List of feature column names

    Example:
        >>> features = Features(df, ["page_rank", "degree_centrality"])
        >>> features.normalize(type="StandardScaler")
        >>> merged = features1 + features2
    """

    def __init__(
        self,
        features_df: pd.DataFrame,
        feature_columns: List[str]
    ):
        """
        Initialize the Features object.

        Args:
            features_df: DataFrame containing the features
            feature_columns: Names of the columns of ``features_df`` that
                hold feature values

        Raises:
            ValueError: If any name in ``feature_columns`` is not a column of
                ``features_df``
        """
        # Validate that all feature columns exist in DataFrame
        missing_cols = [col for col in feature_columns if col not in features_df.columns]
        if missing_cols:
            raise ValueError(f"Features {missing_cols} not found in DataFrame")

        self.features_df = features_df
        # Keep a private copy so later mutation of the caller's list cannot
        # silently desynchronise this object from its DataFrame.
        self.feature_columns = list(feature_columns)

    def normalize(self, type: str = "StandardScaler") -> None:
        """
        Normalize features in place using the specified scaler.

        Every column of ``features_df`` other than ``node_id`` and
        ``graph_id`` is scaled, not only the columns in ``feature_columns``.
        Nothing happens when the DataFrame is empty or has no such columns.

        Args:
            type: Type of scaler to use ("StandardScaler", "MinMaxScaler", "RobustScaler");
                any other value is rejected by FeaturesConfig validation
        """
        config = FeaturesConfig(scaler_type=type)

        if self.features_df.empty:
            return

        # Everything except the identifier columns is treated as scalable
        feature_cols = [
            col for col in self.features_df.columns
            if col not in ('node_id', 'graph_id')
        ]
        if not feature_cols:
            return

        # Map names to classes so only the requested scaler is instantiated
        scalers = {
            "StandardScaler": StandardScaler,
            "MinMaxScaler": MinMaxScaler,
            "RobustScaler": RobustScaler
        }
        scaler = scalers[config.scaler_type]()

        # Fit and transform the feature columns
        self.features_df[feature_cols] = scaler.fit_transform(
            self.features_df[feature_cols]
        )

    def __add__(self, other: 'Features') -> 'Features':
        """
        Merge two Features objects.

        The two DataFrames are outer-joined on ``node_id`` and ``graph_id``.
        A feature column present in both DataFrames is kept once: the left
        operand's values win and gaps are filled from ``other``. The merged
        feature list keeps first-seen order (left operand first), so the
        result is the same on every run.

        Args:
            other: Another Features object to merge with

        Returns:
            Features: New Features object containing merged data

        Raises:
            TypeError: If ``other`` is not a Features object
        """
        if not isinstance(other, Features):
            raise TypeError("Can only add Features objects together")

        keys = ['node_id', 'graph_id']

        # Combine feature columns; dict.fromkeys de-duplicates while keeping
        # order, whereas a set would reorder the features between runs.
        merged_columns = list(dict.fromkeys(self.feature_columns + other.feature_columns))

        # Feature columns that exist in both frames would otherwise come out
        # of pd.merge as <col>_x / <col>_y and fail the constructor's check.
        # Park the right-hand copies under placeholder names for the merge.
        wanted = set(merged_columns)
        placeholders = {
            col: f"__neext_dup__{col}"
            for col in other.features_df.columns
            if col in wanted and col not in keys and col in self.features_df.columns
        }
        right_df = other.features_df.rename(columns=placeholders)

        # Merge DataFrames on node_id and graph_id
        merged_df = pd.merge(
            self.features_df,
            right_df,
            on=keys,
            how='outer'
        )

        # Coalesce each shared feature: left values first, gaps from the right
        for col, placeholder in placeholders.items():
            merged_df[col] = merged_df[col].fillna(merged_df[placeholder])
        if placeholders:
            merged_df = merged_df.drop(columns=list(placeholders.values()))

        return Features(merged_df, merged_columns)
94
+