InsideForest 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/trees.py +124 -59
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest.egg-info/PKG-INFO +1 -1
- {insideforest-0.2.3 → insideforest-0.2.4}/PKG-INFO +1 -1
- {insideforest-0.2.3 → insideforest-0.2.4}/setup.py +1 -1
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/__init__.py +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/descrip.py +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/labels.py +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/models.py +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest/regions.py +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest.egg-info/SOURCES.txt +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest.egg-info/dependency_links.txt +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/InsideForest.egg-info/top_level.txt +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/README.md +0 -0
- {insideforest-0.2.3 → insideforest-0.2.4}/setup.cfg +0 -0
|
@@ -2,9 +2,8 @@ import re
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import numpy as np
|
|
4
4
|
|
|
5
|
-
from sklearn import tree
|
|
6
5
|
from sklearn.tree import export_text
|
|
7
|
-
|
|
6
|
+
from tqdm import tqdm
|
|
8
7
|
|
|
9
8
|
class trees:
|
|
10
9
|
|
|
@@ -75,59 +74,96 @@ class trees:
|
|
|
75
74
|
estructura_iter.pop(i-1)
|
|
76
75
|
return estructura_iter, camino
|
|
77
76
|
|
|
78
|
-
def get_rangos(self, regr, data1):
|
|
79
|
-
# print('Obteniendo Rangos')
|
|
80
77
|
|
|
81
|
-
|
|
78
|
+
def get_rangos(self, regr, data1, verbose=0):
|
|
79
|
+
# Esta función puede tardar mucho, añadimos tqdm para el bucle principal.
|
|
80
|
+
|
|
81
|
+
if self.lang == 'pyspark':
|
|
82
82
|
arboles_estimadores = self.transform_tree_structure(regr.toDebugString)
|
|
83
83
|
arboles_estimadores = [a for a in arboles_estimadores.values()]
|
|
84
|
-
# print(arboles_estimadores[0])
|
|
85
84
|
else:
|
|
86
85
|
arboles_estimadores = regr.estimators_
|
|
87
|
-
# print(export_text(arboles_estimadores[0]))
|
|
88
86
|
|
|
89
87
|
df_info = []
|
|
90
88
|
n_estimador = 0
|
|
91
|
-
|
|
92
|
-
|
|
89
|
+
|
|
90
|
+
# Usamos tqdm en el for, si verbose=0, disable=True, si verbose=1, disable=False
|
|
91
|
+
for arbol_individual in tqdm(arboles_estimadores, disable=(verbose == 0), desc="Procesando árboles"):
|
|
92
|
+
if self.lang == 'pyspark':
|
|
93
93
|
r = arbol_individual
|
|
94
|
-
# print('lalalang')
|
|
95
94
|
else:
|
|
96
95
|
r = export_text(arbol_individual)
|
|
97
96
|
|
|
98
97
|
columnas_nombres = list(data1.columns)
|
|
99
98
|
columnas_nombres.reverse()
|
|
99
|
+
|
|
100
|
+
# Reemplazo de feature indices por nombres
|
|
100
101
|
for i, feat in enumerate(columnas_nombres):
|
|
101
|
-
r = r.replace('feature_'+str(len(columnas_nombres)-i-1), feat)
|
|
102
|
+
r = r.replace('feature_' + str(len(columnas_nombres) - i - 1), feat)
|
|
103
|
+
|
|
102
104
|
estructura = r.split('\n')
|
|
103
105
|
estructura_iter = estructura.copy()
|
|
104
106
|
paths = []
|
|
107
|
+
|
|
105
108
|
for i, valor in enumerate(estructura):
|
|
106
|
-
if not(('value: ' in valor) or ('class: ' in valor)):
|
|
109
|
+
if not (('value: ' in valor) or ('class: ' in valor)):
|
|
107
110
|
continue
|
|
108
111
|
estructura_iter, path_ = self.get_path(estructura_iter)
|
|
109
112
|
estructura_rep = [v.count('|') for v in path_[0]]
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
113
|
+
|
|
114
|
+
# if len(estructura_rep) != len(set(estructura_rep)):
|
|
115
|
+
# posiciones_ = []
|
|
116
|
+
# for i_pos, valor_pos in enumerate(estructura_rep):
|
|
117
|
+
# posiciones = [k for k, v in enumerate(estructura_rep) if v == valor_pos]
|
|
118
|
+
# posiciones_ += [x for x in posiciones if x != max(posiciones)]
|
|
119
|
+
# path_aux = [val for j, val in enumerate(path_[0]) if j not in set(posiciones_)]
|
|
120
|
+
# path_[0] = path_aux
|
|
121
|
+
|
|
122
|
+
if len(estructura_rep) != len(set(estructura_rep)):
|
|
123
|
+
seen = set()
|
|
124
|
+
new_path = []
|
|
125
|
+
# Recorremos path_[0] en orden inverso
|
|
126
|
+
for elem in reversed(path_[0]):
|
|
127
|
+
# Conteo de '|' en el elemento actual
|
|
128
|
+
bc = elem.count('|')
|
|
129
|
+
|
|
130
|
+
# Si no lo hemos visto aún, lo añadimos (porque es la última vez que aparece)
|
|
131
|
+
if bc not in seen:
|
|
132
|
+
new_path.append(elem)
|
|
133
|
+
seen.add(bc)
|
|
134
|
+
|
|
135
|
+
# new_path está invertido, lo devolvemos a su orden natural
|
|
136
|
+
new_path.reverse()
|
|
137
|
+
|
|
138
|
+
# Reemplazamos el path original
|
|
139
|
+
path_[0] = new_path
|
|
140
|
+
|
|
117
141
|
paths.append([x for x in path_ if x != ''])
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
142
|
+
|
|
143
|
+
valores = [float(path[1].split(': ')[1].replace(']', '')) for path in paths]
|
|
144
|
+
percent_ = np.percentile(valores, 90)
|
|
145
|
+
estructuras_maximizadoras = [[pa[0], val] for pa, val in zip(paths, valores) if val >= percent_]
|
|
146
|
+
|
|
121
147
|
importanc = []
|
|
122
148
|
for n_path in range(len(estructuras_maximizadoras)):
|
|
123
|
-
importanc += [
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
149
|
+
importanc += [
|
|
150
|
+
[
|
|
151
|
+
v.replace('|---', '').replace('| ', '')[1:],
|
|
152
|
+
2 / ((v.count('|')) + 1),
|
|
153
|
+
n_path,
|
|
154
|
+
n_estimador
|
|
155
|
+
]
|
|
156
|
+
for v in estructuras_maximizadoras[n_path][0]
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
asdf = pd.DataFrame(importanc, columns=['Regla', 'Importancia', 'N_regla', 'N_arbol'])
|
|
127
160
|
asdf['Va_Obj_minima'] = percent_
|
|
128
161
|
df_info.append(asdf)
|
|
129
162
|
|
|
130
|
-
n_estimador+=1
|
|
163
|
+
n_estimador += 1
|
|
164
|
+
|
|
165
|
+
if verbose == 1 and n_estimador % 10 == 0:
|
|
166
|
+
print(f"Procesados {n_estimador} árboles")
|
|
131
167
|
|
|
132
168
|
return pd.concat(df_info)
|
|
133
169
|
|
|
@@ -160,47 +196,59 @@ class trees:
|
|
|
160
196
|
return df_full_arboles
|
|
161
197
|
|
|
162
198
|
|
|
163
|
-
def get_summary(self, data1, df_full_arboles, var_obj, verbose):
|
|
164
|
-
agrupacion = pd.pivot_table(
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
199
|
+
def get_summary(self, data1, df_full_arboles, var_obj, verbose=0):
|
|
200
|
+
agrupacion = pd.pivot_table(
|
|
201
|
+
df_full_arboles,
|
|
202
|
+
index=['N_regla', 'N_arbol', 'feature', 'operador'],
|
|
203
|
+
values=['rangos', 'Importancia'],
|
|
204
|
+
aggfunc=['min', 'max', 'mean']
|
|
205
|
+
)
|
|
168
206
|
|
|
169
207
|
agrupacion_min = agrupacion['min'].reset_index()
|
|
170
|
-
agrupacion_min = agrupacion_min[agrupacion_min['operador']=='<=']
|
|
208
|
+
agrupacion_min = agrupacion_min[agrupacion_min['operador'] == '<=']
|
|
171
209
|
agrupacion_max = agrupacion['max'].reset_index()
|
|
172
|
-
agrupacion_max = agrupacion_max[agrupacion_max['operador']=='>']
|
|
210
|
+
agrupacion_max = agrupacion_max[agrupacion_max['operador'] == '>']
|
|
173
211
|
agrupacion_mean = agrupacion['mean'].reset_index()
|
|
174
|
-
|
|
212
|
+
|
|
213
|
+
agrupacion = pd.concat([agrupacion_min, agrupacion_max]).sort_values(['N_arbol', 'N_regla'])
|
|
175
214
|
top_100_ramas = agrupacion.N_arbol.unique()[:100]
|
|
215
|
+
|
|
176
216
|
reglas = []
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
217
|
+
|
|
218
|
+
# Añadimos tqdm sobre top_100_ramas para ver progreso
|
|
219
|
+
for arbol_num in tqdm(top_100_ramas, disable=(verbose == 0), desc="Procesando ramas"):
|
|
220
|
+
# Mantenemos este print pero lo hacemos condicional
|
|
221
|
+
if arbol_num % 50 == 0 and verbose == 1:
|
|
222
|
+
print(f"Procesando rama del árbol: {arbol_num}")
|
|
223
|
+
|
|
224
|
+
ag_arbol = agrupacion[(agrupacion['N_arbol'] == arbol_num)]
|
|
181
225
|
for regla_num in ag_arbol.N_regla.unique():
|
|
182
226
|
data1_ = data1.copy()
|
|
183
|
-
ag_regla = ag_arbol[(ag_arbol['N_regla']==regla_num)]
|
|
184
|
-
men_ = ag_regla[(ag_regla['operador']=='<=')][['feature','rangos']].values
|
|
185
|
-
may_ = ag_regla[(ag_regla['operador']=='>')][['feature','rangos']].values
|
|
186
|
-
if len(men_)>0:
|
|
227
|
+
ag_regla = ag_arbol[(ag_arbol['N_regla'] == regla_num)]
|
|
228
|
+
men_ = ag_regla[(ag_regla['operador'] == '<=')][['feature', 'rangos']].values
|
|
229
|
+
may_ = ag_regla[(ag_regla['operador'] == '>')][['feature', 'rangos']].values
|
|
230
|
+
if len(men_) > 0:
|
|
187
231
|
for col, valor in men_:
|
|
188
|
-
data1_ = data1_.loc[data1_[col]<=valor
|
|
189
|
-
for
|
|
190
|
-
data1_ = data1_.loc[data1_[
|
|
232
|
+
data1_ = data1_.loc[data1_[col] <= valor, :]
|
|
233
|
+
for col2, valor2 in may_:
|
|
234
|
+
data1_ = data1_.loc[data1_[col2] > valor2, :]
|
|
191
235
|
else:
|
|
192
236
|
for col, valor in may_:
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
237
|
+
data1_ = data1_.loc[data1_[col] > valor, :]
|
|
238
|
+
for col2, valor2 in men_:
|
|
239
|
+
data1_ = data1_.loc[data1_[col2] <= valor2, :]
|
|
240
|
+
|
|
196
241
|
ag_regla_copy = ag_regla.copy()
|
|
197
242
|
ag_regla_copy.loc[:, 'n_sample'] = len(data1_)
|
|
198
243
|
ag_regla_copy.loc[:, 'ef_sample'] = data1_[var_obj].mean()
|
|
199
244
|
reglas.append(ag_regla_copy)
|
|
245
|
+
|
|
200
246
|
agrupacion = pd.concat(reglas)
|
|
201
|
-
agrupacion = agrupacion.sort_values(by=['ef_sample','n_sample'], ascending=False)
|
|
247
|
+
agrupacion = agrupacion.sort_values(by=['ef_sample', 'n_sample'], ascending=False)
|
|
202
248
|
return agrupacion
|
|
203
249
|
|
|
250
|
+
|
|
251
|
+
|
|
204
252
|
def get_rect_coords(self, df):
|
|
205
253
|
limits = {}
|
|
206
254
|
for i, row in df.iterrows():
|
|
@@ -274,18 +322,35 @@ class trees:
|
|
|
274
322
|
separacion_dim = self.get_dfs_dim(rectangles_)
|
|
275
323
|
return separacion_dim
|
|
276
324
|
|
|
277
|
-
def get_branches(self,df, var_obj, regr):
|
|
325
|
+
def get_branches(self, df, var_obj, regr, verbose=0):
|
|
326
|
+
"""
|
|
327
|
+
Función principal para extraer los rectángulos (reglas) de los árboles.
|
|
328
|
+
:param df: DataFrame original
|
|
329
|
+
:param var_obj: Nombre de la columna objetivo
|
|
330
|
+
:param regr: Modelo (RandomForest u otro) entrenado
|
|
331
|
+
:param verbose: 0 = sin prints ni barra de progreso, 1 = con prints y tqdm
|
|
332
|
+
:return: Lista de DataFrames con los rectángulos separados por dimensión
|
|
333
|
+
"""
|
|
334
|
+
# Separamos X e ignoramos la columna objetivo
|
|
278
335
|
X = df.drop(columns=[var_obj]).fillna(0)
|
|
279
|
-
# y = df[var_obj]
|
|
280
|
-
df_full_arboles = self.get_rangos(regr, X) # A prueba de Spark
|
|
281
336
|
|
|
282
|
-
|
|
283
|
-
|
|
337
|
+
if verbose==1:
|
|
338
|
+
print("Llamamos a get_rangos para extraer limites de los arboles")
|
|
339
|
+
df_full_arboles = self.get_rangos(regr, X, verbose)
|
|
340
|
+
|
|
341
|
+
if verbose==1:
|
|
342
|
+
print("Extraer las reglas con regex")
|
|
343
|
+
|
|
284
344
|
df_full_arboles = self.get_fro(df_full_arboles)
|
|
285
|
-
# print('get fro sobrepasado')
|
|
286
|
-
df_summ = self.get_summary(df, df_full_arboles,var_obj, False)
|
|
287
|
-
# print('get Summ sobrepasado')
|
|
288
345
|
|
|
346
|
+
if verbose==1:
|
|
347
|
+
print("Obtenemos un resumen de los árboles")
|
|
348
|
+
|
|
349
|
+
df_summ = self.get_summary(df, df_full_arboles, var_obj, verbose)
|
|
350
|
+
|
|
351
|
+
if verbose==1:
|
|
352
|
+
print("Generamos el df final con forma de rectángulo")
|
|
353
|
+
# Extraemos las reglas (extract_rectangles)
|
|
289
354
|
separacion_dim = self.extract_rectangles(df_summ)
|
|
290
355
|
|
|
291
|
-
return separacion_dim
|
|
356
|
+
return separacion_dim
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: InsideForest
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A comprehensive library for describing and analyzing data insights via AI
|
|
5
5
|
Home-page: https://github.com/jcval94/InsideForest.git
|
|
6
6
|
Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: InsideForest
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A comprehensive library for describing and analyzing data insights via AI
|
|
5
5
|
Home-page: https://github.com/jcval94/InsideForest.git
|
|
6
6
|
Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|