tsadmetrics 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- entorno/bin/activate_this.py +32 -0
- entorno/bin/rst2html.py +23 -0
- entorno/bin/rst2html4.py +26 -0
- entorno/bin/rst2html5.py +33 -0
- entorno/bin/rst2latex.py +26 -0
- entorno/bin/rst2man.py +27 -0
- entorno/bin/rst2odt.py +28 -0
- entorno/bin/rst2odt_prepstyles.py +20 -0
- entorno/bin/rst2pseudoxml.py +23 -0
- entorno/bin/rst2s5.py +24 -0
- entorno/bin/rst2xetex.py +27 -0
- entorno/bin/rst2xml.py +23 -0
- entorno/bin/rstpep2html.py +25 -0
- experiments/scripts/compute_metrics.py +187 -0
- experiments/scripts/metrics_complexity_analysis.py +109 -0
- experiments/scripts/metro_experiment.py +133 -0
- experiments/scripts/opt_metro_experiment.py +343 -0
- tests/__init__.py +0 -0
- tests/test_binary.py +759 -0
- tests/test_non_binary.py +371 -0
- tsadmetrics/_tsadeval/affiliation/__init__.py +0 -0
- tsadmetrics/_tsadeval/affiliation/_affiliation_zone.py +86 -0
- tsadmetrics/_tsadeval/affiliation/_integral_interval.py +464 -0
- tsadmetrics/_tsadeval/affiliation/_single_ground_truth_event.py +68 -0
- tsadmetrics/_tsadeval/affiliation/generics.py +135 -0
- tsadmetrics/_tsadeval/affiliation/metrics.py +114 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/DataManage/File_IO.py +175 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/DataManage/Range.py +50 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/DataManage/Time_Plot.py +184 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/DataManage/__init__.py +0 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/__init__.py +0 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/etapr.py +386 -0
- tsadmetrics/_tsadeval/eTaPR_pkg/tapr.py +362 -0
- tsadmetrics/_tsadeval/prts/__init__.py +0 -0
- tsadmetrics/_tsadeval/prts/base/__init__.py +0 -0
- tsadmetrics/_tsadeval/prts/base/time_series_metrics.py +165 -0
- tsadmetrics/_tsadeval/prts/basic_metrics_ts.py +121 -0
- tsadmetrics/_tsadeval/prts/time_series_metrics/__init__.py +0 -0
- tsadmetrics/_tsadeval/prts/time_series_metrics/fscore.py +61 -0
- tsadmetrics/_tsadeval/prts/time_series_metrics/precision.py +86 -0
- tsadmetrics/_tsadeval/prts/time_series_metrics/precision_recall.py +21 -0
- tsadmetrics/_tsadeval/prts/time_series_metrics/recall.py +85 -0
- {tsadmetrics-0.1.4.dist-info → tsadmetrics-0.1.5.dist-info}/METADATA +1 -1
- tsadmetrics-0.1.5.dist-info/RECORD +62 -0
- tsadmetrics-0.1.5.dist-info/top_level.txt +4 -0
- tsadmetrics-0.1.4.dist-info/RECORD +0 -20
- tsadmetrics-0.1.4.dist-info/top_level.txt +0 -1
- {tsadmetrics-0.1.4.dist-info → tsadmetrics-0.1.5.dist-info}/WHEEL +0 -0
experiments/scripts/metro_experiment.py
ADDED
@@ -0,0 +1,133 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tsadmetrics as tm
import time
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


df_analog = pd.read_csv('../preprocessed_data/MetroPT3_analogic.csv')
df_analog = pd.DataFrame(df_analog).set_index('timestamp')

# Split the features (X) and the target variable (y)
X = df_analog.drop(columns='anomaly')  # Features
y = df_analog['anomaly']  # Target variable

# Normalize the features to [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
X_normalized = scaler.fit_transform(X)

# Convert the normalized result back into a DataFrame
X_normalized = pd.DataFrame(X_normalized, columns=X.columns, index=X.index)

# Split the normalized dataset into training and test sets
train_df_analog, test_df_analog = train_test_split(
    X_normalized.join(y),  # Join the normalized features with the target variable
    test_size=0.4,
    random_state=42
)

X_train_analog = train_df_analog.drop(columns='anomaly')
y_train_analog = train_df_analog['anomaly']
X_test_analog = test_df_analog.drop(columns='anomaly')
y_test_analog = test_df_analog['anomaly']


# Distance-based models
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.abod import ABOD

modelos_distancia = [
    LOF(n_neighbors=35, contamination=np.sum(y_train_analog)/len(y_train_analog), n_jobs=-1),
    #COF(contamination=np.sum(y_train_analog)/len(y_train_analog), method='memory'),
    CBLOF(contamination=np.sum(y_train_analog)/len(y_train_analog), n_jobs=-1),
    KNN(n_neighbors=35, contamination=np.sum(y_train_analog)/len(y_train_analog), n_jobs=-1),
    ABOD(contamination=np.sum(y_train_analog)/len(y_train_analog))
]

# Models based on isolation trees
from pyod.models.iforest import IForest
modelos_arboles = [
    IForest(contamination=np.sum(y_train_analog)/len(y_train_analog), n_jobs=-1, random_state=42)
]

# Reconstruction-based models
from pyod.models.ae1svm import AE1SVM
from pyod.models.alad import ALAD
from pyod.models.auto_encoder import AutoEncoder

modelos_reconstruccion = [
    AE1SVM(contamination=np.sum(y_train_analog)/len(y_train_analog)),
    ALAD(contamination=np.sum(y_train_analog)/len(y_train_analog)),
    AutoEncoder(contamination=np.sum(y_train_analog)/len(y_train_analog))
]


# Model execution

distancia_results = pd.DataFrame(columns=['nombre_modelo', 'f1_score', 'segment_wise_f_score', 'tiempo_entrenamiento'])
for modelo in modelos_distancia:

    nombre_modelo = modelo.__class__.__name__

    inicio = time.time()
    try:
        modelo.fit(X_train_analog)
        t = time.time() - inicio
        y_pred = modelo.predict(X_test_analog)
        f1 = f1_score(y_test_analog, y_pred)
        sw_f1 = tm.segment_wise_f_score(y_test_analog, y_pred)
        print(f'Modelo: {nombre_modelo} - F1: {f1} - Segment-wise F1: {sw_f1} - Tiempo: {t}')
    except Exception as e:
        print(f'Error en el modelo {nombre_modelo}: {e}')
    # Append the results to the DataFrame
    distancia_results.loc[len(distancia_results)] = [nombre_modelo, f1, sw_f1, t]

distancia_results.to_csv('../results/distancia_results.csv')

arbol_results = pd.DataFrame(columns=['nombre_modelo', 'f1_score', 'segment_wise_f_score', 'tiempo_entrenamiento'])
for modelo in modelos_arboles:

    nombre_modelo = modelo.__class__.__name__

    inicio = time.time()
    try:
        modelo.fit(X_train_analog)
        t = time.time() - inicio
        y_pred = modelo.predict(X_test_analog)
        f1 = f1_score(y_test_analog, y_pred)
        sw_f1 = tm.segment_wise_f_score(y_test_analog, y_pred)
        print(f'Modelo: {nombre_modelo} - F1: {f1} - Segment-wise F1: {sw_f1} - Tiempo: {t}')
    except Exception as e:
        print(f'Error en el modelo {nombre_modelo}: {e}')
    # Append the results to the DataFrame
    arbol_results.loc[len(arbol_results)] = [nombre_modelo, f1, sw_f1, t]

arbol_results.to_csv('../results/arbol_results.csv')

reconstruccion_results = pd.DataFrame(columns=['nombre_modelo', 'f1_score', 'segment_wise_f_score', 'tiempo_entrenamiento'])
for modelo in modelos_reconstruccion:

    nombre_modelo = modelo.__class__.__name__

    inicio = time.time()
    try:
        modelo.fit(X_train_analog[y_train_analog == 0])
        t = time.time() - inicio
        y_pred = modelo.predict(X_test_analog)
        f1 = f1_score(y_test_analog, y_pred)
        sw_f1 = tm.segment_wise_f_score(y_test_analog, y_pred)
        print(f'Modelo: {nombre_modelo} - F1: {f1} - Segment-wise F1: {sw_f1} - Tiempo: {t}')
    except Exception as e:
        print(f'Error en el modelo {nombre_modelo}: {e}')
    # Append the results to the DataFrame
    reconstruccion_results.loc[len(reconstruccion_results)] = [nombre_modelo, f1, sw_f1, t]

reconstruccion_results.to_csv('../results/reconstruccion_results.csv')
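For context, the script above reports both the standard point-wise F1 from scikit-learn and the segment-wise F1 from tsadmetrics, which scores detection per contiguous anomalous segment rather than per timestamp. A minimal sketch of that comparison on a toy label series, using only the tm.segment_wise_f_score(y_true, y_pred) call already present in the script (the precise segment-matching rules are the library's own):

import numpy as np
from sklearn.metrics import f1_score
import tsadmetrics as tm

# Toy ground truth with two anomalous segments; the prediction only
# covers part of the first segment but all of the second.
y_true = np.array([0, 0, 1, 1, 1, 0, 0, 1, 1, 0])
y_pred = np.array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0])

# Point-wise F1 counts every timestamp individually, so missed points
# inside a detected segment lower the score.
print('point-wise F1:', f1_score(y_true, y_pred))

# Segment-wise F1 (the tsadmetrics call used in metro_experiment.py)
# evaluates whole anomalous segments; consult the library for its
# exact overlap rules.
print('segment-wise F1:', tm.segment_wise_f_score(y_true, y_pred))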
experiments/scripts/opt_metro_experiment.py
ADDED
@@ -0,0 +1,343 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tsadmetrics as tm
import time
from sklearn.metrics import f1_score, recall_score, precision_score
import optuna
from optuna.samplers import TPESampler
from functools import partial
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report

def simplify_dataset(
    data: pd.DataFrame,
    window_size: int = 2,
    time_col: str = None,
    anomaly_col: str = 'anomaly',
    agg_func: str = 'mean'
) -> pd.DataFrame:
    """
    Shrink a dataset by aggregating it over temporal windows.
    """
    simplified_data = data.rolling(window_size, step=window_size).mean()
    simplified_data = simplified_data.dropna()
    simplified_data[anomaly_col] = (simplified_data[anomaly_col] > 0.1).astype(int)
    return simplified_data.reset_index(drop=True)

# Initial PyTorch configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def guardar_prediccion(modelo_nombre, y_true, y_pred_binario, y_pred_continuo=None, timestamps=None):
    """
    Save the predictions sorted by timestamp.
    """
    os.makedirs('../results/predictions', exist_ok=True)

    resultados = pd.DataFrame({
        'timestamp': timestamps if timestamps is not None else np.arange(len(y_true)),
        'ground_truth': np.array(y_true).flatten(),
        'prediction_binary': np.array(y_pred_binario).flatten()
    })

    if y_pred_continuo is not None:
        resultados['prediction_continuous'] = np.array(y_pred_continuo).flatten()

    if 'timestamp' in resultados.columns:
        resultados = resultados.sort_values('timestamp')

    nombre_archivo = f'../results/predictions/{modelo_nombre}_pred.csv'
    resultados.to_csv(nombre_archivo, index=False)
    return nombre_archivo

# -------------------------------
# Load and prepare the data
df_analog = pd.read_csv('../preprocessed_data/MetroPT3_analogic.csv')
df_analog = pd.DataFrame(df_analog).set_index('timestamp')
df_analog = df_analog.sort_index()

# Size reduction
print('Tamaño inicial del dataset:', df_analog.shape)
print(f'Proporción de anomalías: {df_analog["anomaly"].mean():.2f}')
df_analog = simplify_dataset(df_analog, window_size=10, time_col='timestamp')
print('Tamaño del dataset:', df_analog.shape)
print(f'Proporción de anomalías: {df_analog["anomaly"].mean():.2f}')

# Separate and normalize the data
X = df_analog.drop(columns='anomaly')
y = df_analog['anomaly']
scaler = MinMaxScaler(feature_range=(0, 1))
X_normalized = scaler.fit_transform(X)
X_normalized = pd.DataFrame(X_normalized, columns=X.columns, index=X.index)

# -------------------------------
# DATA SPLITTING
# 1. For non-LSTM models (shuffle=True)
train_df_shuf, test_df_shuf = train_test_split(
    X_normalized.join(y),
    test_size=0.4,
    random_state=42,
    shuffle=True
)

# 2. For the LSTM (shuffle=False to preserve temporal order)
train_df_noshuf, test_df_noshuf = train_test_split(
    X_normalized.join(y),
    test_size=0.4,
    random_state=42,
    shuffle=False
)

# Prepare data for non-LSTM models
X_train_shuf = train_df_shuf.drop(columns='anomaly')
y_train_shuf = train_df_shuf['anomaly']
X_test_shuf = test_df_shuf.drop(columns='anomaly')
y_test_shuf = test_df_shuf['anomaly']

# Prepare data for the LSTM
X_train_noshuf = train_df_noshuf.drop(columns='anomaly')
y_train_noshuf = train_df_noshuf['anomaly']
X_test_noshuf = test_df_noshuf.drop(columns='anomaly')
y_test_noshuf = test_df_noshuf['anomaly']

contamination = np.sum(y_train_shuf)/len(y_train_shuf)

# -------------------------------
# LSTM model definition
class AnomalyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.2):
        super(AnomalyLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_time_step = lstm_out[:, -1, :]
        output = self.fc(last_time_step)
        return self.sigmoid(output)

def train_lstm(X_train, y_train, X_test, y_test, timestamps_test, sequence_length=10, epochs=20, batch_size=16):
    # Prepare sequential data
    def create_sequences(data, targets, seq_length):
        xs, ys = [], []
        for i in range(len(data)-seq_length):
            xs.append(data[i:(i+seq_length)])
            ys.append(targets[i+seq_length])
        return np.array(xs), np.array(ys)

    X_train_seq, y_train_seq = create_sequences(X_train.values, y_train.values, sequence_length)
    X_test_seq, y_test_seq = create_sequences(X_test.values, y_test.values, sequence_length)

    # Convert to PyTorch tensors
    train_data = TensorDataset(
        torch.FloatTensor(X_train_seq),
        torch.FloatTensor(y_train_seq).unsqueeze(1)
    )
    test_data = TensorDataset(
        torch.FloatTensor(X_test_seq),
        torch.FloatTensor(y_test_seq).unsqueeze(1)
    )

    # IMPORTANT: shuffle=False for the DataLoader
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    # Initialize the model
    model = AnomalyLSTM(input_size=X_train.shape[1]).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training
    train_start = time.time()
    for epoch in range(epochs):
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    test_preds, test_true, test_scores = [], [], []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            predicted = (outputs > 0.5).float()
            test_preds.extend(predicted.cpu().numpy())
            test_scores.extend(outputs.cpu().numpy())
            test_true.extend(batch_y.cpu().numpy())

    train_time = time.time() - train_start

    # Pad the predictions back to the original length and sort by timestamp
    full_preds = np.concatenate([np.zeros(sequence_length), np.array(test_preds).flatten()])
    full_scores = np.concatenate([np.zeros(sequence_length), np.array(test_scores).flatten()])
    full_preds = full_preds[:len(y_test)]
    full_scores = full_scores[:len(y_test)]

    # Build a DataFrame with timestamps for sorting
    pred_df = pd.DataFrame({
        'timestamp': timestamps_test[-len(full_preds):],
        'y_true': y_test[-len(full_preds):],
        'y_pred': full_preds,
        'y_scores': full_scores
    }).sort_values('timestamp')

    # Compute metrics on the sorted data
    f1 = f1_score(pred_df['y_true'], pred_df['y_pred'])
    sw_f1 = tm.segment_wise_f_score(pred_df['y_true'], pred_df['y_pred'])

    guardar_prediccion("LSTM", pred_df['y_true'], pred_df['y_pred'], pred_df['y_scores'], pred_df['timestamp'])

    return model, f1, sw_f1, train_time, pred_df['y_pred']

# [The remaining functions (objective, optimize_model, evaluate_models) are unchanged...]

# -------------------------------
# Models
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.ae1svm import AE1SVM
from pyod.models.auto_encoder import AutoEncoder

modelos_distancia = [LOF, CBLOF, KNN]
modelos_arboles = [IForest]
modelos_machine_learning = []
modelos_reconstruccion = [AE1SVM, AutoEncoder]

# -------------------------------
# Modified evaluate_models function
# -------------------------------
# Modified evaluate_models function, sorting all predictions by timestamp
def evaluate_models(model_classes, best_params_dict, results_filename, include_lstm=False):
    results_df = pd.DataFrame(columns=[
        'nombre_modelo', 'f1_score', 'segment_wise_f_score', 'tiempo_entrenamiento', 'best_params'
    ])

    # Evaluate non-LSTM models (using the shuffled split)
    for model_class in model_classes:
        nombre_modelo = model_class.__name__
        params = best_params_dict.get(nombre_modelo, {})
        params['contamination'] = contamination

        if model_class.__name__ in ['LOF', 'CBLOF', 'KNN', 'IForest']:
            params['n_jobs'] = -1

        inicio = time.time()
        try:
            model = model_class(**params)

            if nombre_modelo in ['AutoEncoder', 'AE1SVM']:
                model.fit(X_train_shuf[y_train_shuf == 0])
            else:
                model.fit(X_train_shuf)

            t = time.time() - inicio
            y_pred = model.predict(X_test_shuf)
            y_scores = model.decision_function(X_test_shuf) if hasattr(model, 'decision_function') else None

            # Build a temporary DataFrame with timestamps for sorting
            pred_df = pd.DataFrame({
                'timestamp': X_test_shuf.index,
                'y_true': y_test_shuf,
                'y_pred': y_pred,
                'y_scores': y_scores if y_scores is not None else np.nan
            }).sort_values('timestamp')

            # Compute metrics on the sorted data
            f1 = f1_score(pred_df['y_true'], pred_df['y_pred'])
            sw_f1 = tm.segment_wise_f_score(pred_df['y_true'], pred_df['y_pred'])

            # Save the sorted predictions
            guardar_prediccion(
                nombre_modelo,
                pred_df['y_true'],
                pred_df['y_pred'],
                pred_df['y_scores'] if 'y_scores' in pred_df.columns else None,
                pred_df['timestamp']
            )

            print(f'Modelo: {nombre_modelo} - F1: {f1:.4f} - Segment-wise F1: {sw_f1:.4f} - Tiempo: {t:.2f}s')

            results_df.loc[len(results_df)] = [
                nombre_modelo, f1, sw_f1, t, json.dumps(params, ensure_ascii=False)
            ]
        except Exception as e:
            print(f'Error en el modelo {nombre_modelo}: {e}')

    # Evaluate the LSTM (using the unshuffled split)
    if include_lstm:
        inicio_lstm = time.time()
        print("\nEntrenando modelo LSTM (sin shuffle)...")

        lstm_model, lstm_f1, lstm_sw_f1, lstm_time, lstm_preds = train_lstm(
            X_train_noshuf, y_train_noshuf,
            X_test_noshuf, y_test_noshuf,
            timestamps_test=X_test_noshuf.index
        )

        print(f'Modelo: LSTM - F1: {lstm_f1:.4f} - Segment-wise F1: {lstm_sw_f1:.4f} - Tiempo: {lstm_time:.2f}s')

        results_df.loc[len(results_df)] = [
            "LSTM", lstm_f1, lstm_sw_f1, lstm_time,
            json.dumps({
                "sequence_length": 10,
                "epochs": 20,
                "batch_size": 16,
                "hidden_size": 64,
                "num_layers": 2,
                "dropout": 0.2
            }, ensure_ascii=False)
        ]

    # Save results
    os.makedirs('../results', exist_ok=True)
    results_df.to_csv(f'../results/{results_filename}', index=False)
    print(f'Resultados guardados en {results_filename}')
    return results_df

# -------------------------------
# Parameters and execution
best_params = {
    'LOF': {"n_neighbors": 62, "metric": "minkowski", "contamination": contamination, "n_jobs": -1},
    'CBLOF': {"n_clusters": 8, "alpha": 0.87571, "beta": 6, "contamination": contamination, "n_jobs": -1},
    'KNN': {"n_neighbors": 5, "method": "mean", "contamination": contamination, "n_jobs": -1},
    'IForest': {'n_jobs': -1, "contamination": contamination},
    'AutoEncoder': {},
    'AE1SVM': {}
}

print("\nEvaluando modelos basados en distancia...")
distancia_results = evaluate_models(modelos_distancia, best_params, 'distancia_results.csv')

print("\nEvaluando modelos basados en árboles...")
arbol_results = evaluate_models(modelos_arboles, best_params, 'arbol_results.csv')

print("\nEvaluando modelos de reconstrucción...")
reconstruccion_results = evaluate_models(modelos_reconstruccion, best_params, 'reconstruccion_results.csv')

print("\nEvaluando LSTM...")
ml_results = evaluate_models(modelos_machine_learning, best_params, 'ml_results.csv', include_lstm=True)
tests/__init__.py
ADDED
Empty file (no content to display).