NEExT 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- NEExT/__init__.py +11 -0
- NEExT/embeddings.py +88 -0
- NEExT/feature_importance.py +325 -0
- NEExT/features.py +94 -0
- NEExT/framework.py +309 -0
- NEExT/graph.py +224 -0
- NEExT/graph_collection.py +203 -0
- NEExT/graph_embeddings.py +230 -0
- NEExT/helper_functions.py +174 -0
- NEExT/io.py +182 -0
- NEExT/ml_models.py +479 -0
- NEExT/node_features.py +477 -0
- neext-0.1.0.dist-info/METADATA +318 -0
- neext-0.1.0.dist-info/RECORD +16 -0
- neext-0.1.0.dist-info/WHEEL +5 -0
- neext-0.1.0.dist-info/top_level.txt +1 -0
NEExT/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Package initializer for NEExT: defines the package version and re-exports
# the NEExT class from the sibling `framework` module.

# Version of the package
__version__ = "0.1.0"

# Re-export the class so callers can write `from NEExT import NEExT`
from .framework import NEExT

# Names exported by `from NEExT import *`
__all__ = ['NEExT']
|
NEExT/embeddings.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
class EmbeddingsConfig(BaseModel):
    """Pydantic model holding an embedding's name and its column names."""
    # Required string naming the embedding (no default)
    embedding_name: str
    # Required list of embedding column names (no default)
    embedding_columns: List[str]
|
|
9
|
+
|
|
10
|
+
class Embeddings:
    """
    A class for managing graph embeddings data and operations.

    This class provides a container for embeddings data and operations like
    merging different types of embeddings.

    Attributes:
        embeddings_df (pd.DataFrame): DataFrame containing the embeddings
        embedding_name (str): Name of the embedding algorithm used
        embedding_columns (List[str]): List of embedding column names

    Example:
        >>> embeddings1 = Embeddings(df1, "wasserstein", ["emb_0", "emb_1"])
        >>> embeddings2 = Embeddings(df2, "approx_wasserstein", ["emb_0", "emb_1"])
        >>> merged = embeddings1 + embeddings2  # Combines embeddings with unique column names
    """

    def __init__(
        self,
        embeddings_df: pd.DataFrame,
        embedding_name: str,
        embedding_columns: List[str]
    ):
        """
        Initialize the Embeddings object.

        Args:
            embeddings_df: DataFrame holding the embedding values
            embedding_name: Name of the embedding algorithm used
            embedding_columns: Names of the columns in ``embeddings_df`` that
                hold embedding values

        Raises:
            ValueError: If any name in ``embedding_columns`` is not a column
                of ``embeddings_df``
        """
        # Validate that all embedding columns exist in DataFrame
        missing_cols = [col for col in embedding_columns if col not in embeddings_df.columns]
        if missing_cols:
            raise ValueError(f"Embedding columns {missing_cols} not found in DataFrame")

        self.embeddings_df = embeddings_df
        self.embedding_name = embedding_name
        # Keep our own list so later mutation of the caller's list cannot
        # silently change this object's column set.
        self.embedding_columns = list(embedding_columns)

    def __add__(self, other: 'Embeddings') -> 'Embeddings':
        """
        Merge two Embeddings objects.

        Every embedding column is renamed to ``"<embedding_name>_<column>"``
        and the two DataFrames are outer-merged on ``graph_id``. Neither
        operand's DataFrame is modified.

        Args:
            other: Another Embeddings object to merge with

        Returns:
            Embeddings: New Embeddings object containing merged data with unique column names

        Raises:
            TypeError: If ``other`` is not an Embeddings object
            ValueError: If the prefixed column names of the two operands
                collide (e.g. both share the same ``embedding_name`` and
                column names)
        """
        if not isinstance(other, Embeddings):
            raise TypeError("Can only add Embeddings objects together")

        # Rename columns to include embedding name prefix
        rename_dict1 = {
            col: f"{self.embedding_name}_{col}"
            for col in self.embedding_columns
        }
        rename_dict2 = {
            col: f"{other.embedding_name}_{col}"
            for col in other.embedding_columns
        }

        # If both sides produce the same prefixed name, pandas would append
        # _x/_y suffixes and the result would fail validation with a
        # misleading "not found" message; report the real cause instead.
        clashes = sorted(set(rename_dict1.values()) & set(rename_dict2.values()))
        if clashes:
            raise ValueError(
                f"Cannot merge embeddings: prefixed column names {clashes} collide; "
                "use distinct embedding_name values for the two Embeddings objects"
            )

        # rename() without inplace returns new DataFrames, so the originals
        # are left untouched.
        df1 = self.embeddings_df.rename(columns=rename_dict1)
        df2 = other.embeddings_df.rename(columns=rename_dict2)

        # Merge DataFrames on graph_id
        merged_df = pd.merge(df1, df2, on='graph_id', how='outer')

        # Create new column names list and embedding name
        new_columns = list(rename_dict1.values()) + list(rename_dict2.values())
        new_name = f"{self.embedding_name}+{other.embedding_name}"

        return Embeddings(merged_df, new_name, new_columns)
|
NEExT/feature_importance.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
from typing import List, Optional, Literal, Dict
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
import time
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
from .graph_collection import GraphCollection
|
|
8
|
+
from .features import Features
|
|
9
|
+
from .graph_embeddings import GraphEmbeddings
|
|
10
|
+
from .ml_models import MLModels
|
|
11
|
+
from .embeddings import Embeddings
|
|
12
|
+
|
|
13
|
+
class FeatureImportanceConfig(BaseModel):
    """Configuration for feature importance analysis"""
    # Which analysis routine to run; restricted to these three names
    algorithm: Literal["supervised_greedy", "supervised_fast", "unsupervised"]
    # Name of the graph embedding algorithm
    embedding_algorithm: str = "approx_wasserstein"
    # NOTE(review): FeatureImportance does not appear to set or read this
    # field in the code visible in this file - confirm whether it is used.
    model_name: Literal["xgboost", "random_forest"] = "random_forest"
    # Random seed
    random_state: int = 42
    # FeatureImportance fills this from its `n_iterations` argument
    sample_size: int = 5
|
|
20
|
+
|
|
21
|
+
class FeatureImportance:
    """
    A class for analyzing feature importance in graph embeddings.

    This class provides methods to determine the importance of node features
    based on their predictive power in both supervised and unsupervised settings.

    Attributes:
        graph_collection (GraphCollection): Collection of graphs to analyze
        features (Features): Features object containing node features
        config (FeatureImportanceConfig): Configuration for importance analysis
        available_algorithms (dict): Maps algorithm name to the bound method
            implementing it

    Example:
        >>> importance = FeatureImportance(
        ...     graph_collection=collection,
        ...     features=features,
        ...     algorithm="supervised_greedy",
        ...     embedding_algorithm="approx_wasserstein"
        ... )
        >>> results_df = importance.compute()
    """

    def __init__(
        self,
        graph_collection: GraphCollection,
        features: Features,
        algorithm: str,
        embedding_algorithm: str = "approx_wasserstein",
        random_state: int = 42,
        n_iterations: int = 5  # Keep for backward compatibility
    ):
        """
        Initialize the FeatureImportance analyzer.

        Args:
            graph_collection: Collection of graphs to analyze
            features: Features object containing node features
            algorithm: One of "supervised_greedy", "supervised_fast" or
                "unsupervised" (validated by FeatureImportanceConfig)
            embedding_algorithm: Embedding algorithm name forwarded to
                GraphEmbeddings
            random_state: Seed forwarded to GraphEmbeddings and MLModels
            n_iterations: Stored as ``config.sample_size`` and forwarded to
                MLModels as ``sample_size``
        """
        self.config = FeatureImportanceConfig(
            algorithm=algorithm,
            embedding_algorithm=embedding_algorithm,
            random_state=random_state,
            sample_size=n_iterations  # Use n_iterations as sample_size
        )
        self.graph_collection = graph_collection
        self.features = features

        # Define available algorithms
        self.available_algorithms = {
            "supervised_greedy": self._supervised_greedy,
            "supervised_fast": self._supervised_fast,
            "unsupervised": self._unsupervised
        }

    def compute(self) -> pd.DataFrame:
        """
        Compute feature importance based on the configured algorithm.

        Returns:
            pd.DataFrame: DataFrame containing feature importance results,
            with a ``total_time`` column holding the wall-clock seconds spent

        Raises:
            ValueError: If the configured algorithm is unknown
            NotImplementedError: If the configured algorithm is "unsupervised"
        """
        if self.config.algorithm not in self.available_algorithms:
            raise ValueError(f"Unknown algorithm: {self.config.algorithm}")

        start_time = time.time()
        results = self.available_algorithms[self.config.algorithm]()
        total_time = time.time() - start_time

        # Add total computation time to results
        results['total_time'] = total_time

        return results

    def _model_type(self) -> str:
        """Return "classifier" when the first graph's label is an integer, else "regressor"."""
        first_label = self.graph_collection.graphs[0].graph_label
        if isinstance(first_label, (int, np.integer)):
            return "classifier"
        return "regressor"

    def _embed(self, feature_columns: List[str]):
        """Compute graph embeddings from the given feature columns (one dimension per feature)."""
        return GraphEmbeddings(
            graph_collection=self.graph_collection,
            features=self.features,
            embedding_algorithm=self.config.embedding_algorithm,
            embedding_dimension=len(feature_columns),
            feature_columns=feature_columns,
            random_state=self.config.random_state
        ).compute()

    def _score(self, feature_columns: List[str], model_name: Optional[str] = None) -> float:
        """
        Embed with ``feature_columns``, train a model and return a score to maximize.

        The score is the mean accuracy for classifiers and the negated mean
        RMSE for regressors. ``model_name`` is forwarded to MLModels only
        when given, so MLModels' own default applies otherwise.
        """
        model_kwargs = {
            "graph_collection": self.graph_collection,
            "embeddings": self._embed(feature_columns),
            "model_type": self._model_type(),
            "random_state": self.config.random_state,
            "sample_size": self.config.sample_size,
        }
        if model_name is not None:
            model_kwargs["model_name"] = model_name

        results = MLModels(**model_kwargs).compute()

        if results["model_type"] == "classifier":
            return np.mean(results["accuracy"])
        return -np.mean(results["rmse"])  # Negative RMSE for maximization

    def _supervised_greedy(self) -> pd.DataFrame:
        """
        Compute feature importance using supervised greedy algorithm.

        This method determines feature importance by iteratively selecting features
        that maximize model performance when combined with previously selected features.

        Returns:
            pd.DataFrame: Results containing:
                - feature_name: Name of the feature
                - avg_performance: Average model performance at each step
                - embedding_algorithm: Name of embedding algorithm used

        Raises:
            ValueError: If in some round no candidate feature yields a
                comparable score (e.g. every score is NaN)
        """
        available_features = self.features.feature_columns.copy()
        selected_features = []
        performance_scores = []

        # Context managers guarantee the progress bars are closed even if
        # embedding or model training raises.
        with tqdm(
            total=len(available_features),
            desc="Selecting features",
            position=0,
            leave=True
        ) as pbar:
            while available_features:
                best_score = float('-inf')
                best_feature = None

                with tqdm(
                    available_features,
                    desc=f"Testing features (selected: {len(selected_features)})",
                    position=1,
                    leave=False
                ) as inner_pbar:
                    # Try each remaining feature on top of those already selected
                    for feature in inner_pbar:
                        inner_pbar.set_postfix({'testing': feature}, refresh=True)
                        avg_score = self._score(selected_features + [feature])
                        if avg_score > best_score:
                            best_score = avg_score
                            best_feature = feature

                # NaN never compares greater than -inf, so without this
                # guard None would be recorded as a selected feature.
                if best_feature is None:
                    raise ValueError(
                        "Greedy selection failed: no candidate feature produced "
                        "a comparable score (all scores were NaN or -inf)"
                    )

                # Add best feature to selected features
                selected_features.append(best_feature)
                available_features.remove(best_feature)
                performance_scores.append(abs(best_score))  # Convert back to positive RMSE if needed

                # Update outer progress bar
                pbar.update(1)
                pbar.set_postfix({'best_feature': best_feature, 'score': abs(best_score)}, refresh=True)

        # Create results DataFrame
        return pd.DataFrame({
            'feature_name': selected_features,
            'avg_performance': performance_scores,
            'embedding_algorithm': self.config.embedding_algorithm
        })

    def _supervised_fast(self) -> pd.DataFrame:
        """
        Compute feature importance using supervised fast algorithm.

        This method:
        1. Determines feature importance order using Random Forest on 1D embeddings
        2. Evaluates performance by iteratively building models with increasing feature sets
        3. Returns results in same format as greedy method for consistency

        Returns:
            pd.DataFrame: Results containing:
                - feature_name: Name of the feature in order of importance
                - avg_performance: Performance using features up to this point
                - embedding_algorithm: Name of embedding algorithm used
                - total_time: Total computation time in seconds
        """
        start_time = time.time()
        feature_columns = self.features.feature_columns

        # Nothing to rank: return an empty frame instead of failing on
        # feature_embeddings[0] below.
        if not feature_columns:
            return pd.DataFrame(
                columns=['feature_name', 'avg_performance', 'embedding_algorithm', 'total_time']
            )

        # Generate 1D embeddings for each feature
        feature_embeddings = []
        with tqdm(
            feature_columns,
            desc="Computing initial embeddings",
            position=0,
            leave=True
        ) as pbar:
            for feature in pbar:
                pbar.set_postfix({'feature': feature}, refresh=True)
                # Name the single embedding column after its source feature
                embedding_df = self._embed([feature]).embeddings_df.rename(
                    columns={'emb_0': feature}
                )
                feature_embeddings.append(embedding_df)

        # Merge all embeddings
        merged_df = feature_embeddings[0]
        for df in feature_embeddings[1:]:
            merged_df = pd.merge(merged_df, df, on='graph_id', how='outer')

        # Get feature importance order using Random Forest
        embeddings = Embeddings(
            embeddings_df=merged_df,
            embedding_name=self.config.embedding_algorithm,
            embedding_columns=feature_columns
        )

        ml_model = MLModels(
            graph_collection=self.graph_collection,
            embeddings=embeddings,
            model_type=self._model_type(),
            model_name="random_forest",
            compute_feature_importance=True,
            sample_size=self.config.sample_size,
            random_state=self.config.random_state
        )

        results = ml_model.compute()
        # NOTE(review): assumes the index of results['feature_importance'] lists
        # feature names from most to least important - confirm against MLModels.
        ordered_features = results['feature_importance'].index.tolist()

        # Evaluate performance with growing prefixes of the ordered features
        performance_scores = []
        with tqdm(
            range(1, len(ordered_features) + 1),
            desc="Evaluating feature combinations",
            position=0,
            leave=True
        ) as pbar:
            for n_features in pbar:
                current_features = ordered_features[:n_features]
                pbar.set_postfix({'n_features': len(current_features)}, refresh=True)
                score = self._score(current_features, model_name="random_forest")
                performance_scores.append(abs(score))  # Convert back to positive RMSE if needed

        total_time = time.time() - start_time

        # Create results DataFrame
        return pd.DataFrame({
            'feature_name': ordered_features,
            'avg_performance': performance_scores,
            'embedding_algorithm': self.config.embedding_algorithm,
            'total_time': total_time
        })

    def _unsupervised(self) -> pd.DataFrame:
        """
        Compute feature importance using unsupervised algorithm.

        Raises:
            NotImplementedError: Always; this algorithm has no implementation yet
        """
        # Returning None here made compute() fail with an opaque TypeError
        # when it tried to attach the total_time column.
        raise NotImplementedError(
            "The 'unsupervised' feature importance algorithm is not implemented yet"
        )
|
NEExT/features.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from typing import List, Optional, Literal
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
|
|
5
|
+
|
|
6
|
+
class FeaturesConfig(BaseModel):
    """Configuration for features normalization"""
    # Name of the scaler to use; restricted to these three names
    scaler_type: Literal["StandardScaler", "MinMaxScaler", "RobustScaler"] = "StandardScaler"
|
|
9
|
+
|
|
10
|
+
class Features:
    """
    A class for managing feature data and operations.

    This class provides a container for feature data and operations like
    normalization and merging of features.

    Attributes:
        features_df (pd.DataFrame): DataFrame containing the features
        feature_columns (List[str]): List of feature column names

    Example:
        >>> features = Features(df, ["page_rank", "degree_centrality"])
        >>> features.normalize(type="StandardScaler")
        >>> merged = features1 + features2
    """

    def __init__(
        self,
        features_df: pd.DataFrame,
        feature_columns: List[str]
    ):
        """
        Initialize the Features object.

        Args:
            features_df: DataFrame containing the features
            feature_columns: Names of the feature columns in ``features_df``

        Raises:
            ValueError: If any name in ``feature_columns`` is not a column of
                ``features_df``
        """
        # Validate that all feature columns exist in DataFrame
        missing_cols = [col for col in feature_columns if col not in features_df.columns]
        if missing_cols:
            raise ValueError(f"Features {missing_cols} not found in DataFrame")

        self.features_df = features_df
        # Keep our own list so later mutation of the caller's list cannot
        # silently change this object's column set.
        self.feature_columns = list(feature_columns)

    def normalize(self, type: str = "StandardScaler") -> None:
        """
        Normalize features in place using the specified scaler.

        Every column of ``features_df`` other than ``node_id`` and
        ``graph_id`` is scaled, whether or not it is listed in
        ``feature_columns``. Nothing happens when the DataFrame is empty or
        has no such columns.

        Args:
            type: Type of scaler to use ("StandardScaler", "MinMaxScaler", "RobustScaler");
                any other value is rejected by FeaturesConfig validation
        """
        config = FeaturesConfig(scaler_type=type)

        if self.features_df.empty:
            return

        # Get feature columns (exclude node_id and graph_id)
        feature_cols = [
            col for col in self.features_df.columns
            if col not in ('node_id', 'graph_id')
        ]
        if not feature_cols:
            return

        # Map names to classes so only the requested scaler is instantiated
        scalers = {
            "StandardScaler": StandardScaler,
            "MinMaxScaler": MinMaxScaler,
            "RobustScaler": RobustScaler
        }
        scaler = scalers[config.scaler_type]()

        # Fit and transform the feature columns
        self.features_df[feature_cols] = scaler.fit_transform(
            self.features_df[feature_cols]
        )

    def __add__(self, other: 'Features') -> 'Features':
        """
        Merge two Features objects.

        The DataFrames are outer-merged on ``node_id`` and ``graph_id``. The
        resulting feature column list is this object's columns followed by
        ``other``'s, in their original order.

        Args:
            other: Another Features object to merge with

        Returns:
            Features: New Features object containing merged data

        Raises:
            TypeError: If ``other`` is not a Features object
            ValueError: If both operands declare a feature column with the
                same name
        """
        if not isinstance(other, Features):
            raise TypeError("Can only add Features objects together")

        # A feature present on both sides would be suffixed _x/_y by pandas
        # and then fail validation with a misleading "not found" message;
        # report the real cause instead.
        other_names = set(other.feature_columns)
        shared = [col for col in self.feature_columns if col in other_names]
        if shared:
            raise ValueError(
                f"Cannot merge features: columns {shared} are present in both Features objects"
            )

        # Merge DataFrames on node_id and graph_id
        merged_df = pd.merge(
            self.features_df,
            other.features_df,
            on=['node_id', 'graph_id'],
            how='outer'
        )

        # Combine feature columns; dict.fromkeys drops duplicates while
        # keeping first-seen order, unlike set(), whose order varies per run.
        merged_columns = list(dict.fromkeys(self.feature_columns + other.feature_columns))

        return Features(merged_df, merged_columns)
|
|
94
|
+
|