InsideForest 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- insideforest-0.2.2/InsideForest/descrip.py +245 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest/regions.py +1 -1
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest.egg-info/PKG-INFO +1 -1
- {insideforest-0.2.0 → insideforest-0.2.2}/PKG-INFO +1 -1
- {insideforest-0.2.0 → insideforest-0.2.2}/README.md +1 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/setup.py +1 -1
- insideforest-0.2.0/InsideForest/descrip.py +0 -102
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest/__init__.py +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest/labels.py +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest/models.py +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest/trees.py +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest.egg-info/SOURCES.txt +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest.egg-info/dependency_links.txt +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/InsideForest.egg-info/top_level.txt +0 -0
- {insideforest-0.2.0 → insideforest-0.2.2}/setup.cfg +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from openai import OpenAI
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import copy
|
|
5
|
+
import numpy as np
|
|
6
|
+
from scipy.signal import savgol_filter
|
|
7
|
+
from sklearn.preprocessing import StandardScaler
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def primer_punto_inflexion_decreciente(data, bins=10, window_length=5, polyorder=2):
    """
    Find the first decreasing inflection point of a histogram.

    The histogram of ``data`` is smoothed with a Savitzky-Golay filter and the
    second derivative of the smoothed counts is scanned for the first position
    where its sign drops (for example from positive to negative).

    Parameters:
    - data: array-like, values used to build the histogram.
    - bins: int or sequence, number of bins or the bin edges.
    - window_length: int, window length for the Savitzky-Golay filter.
    - polyorder: int, polynomial order for the Savitzky-Golay filter.

    Returns:
    - The center of the bin where the first decreasing inflection occurs, or
      None when no such point exists or the histogram has fewer than 3 bins.
    """
    # Build the histogram and the center of every bin.
    counts, bin_edges = np.histogram(data, bins=bins)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    n_bins = len(counts)

    # With fewer than three bins the second derivative cannot change sign
    # (and np.gradient rejects a single point), so there is nothing to find.
    if n_bins < 3:
        return None

    # Keep the window odd and no longer than the number of bins.
    if window_length >= n_bins:
        window_length = n_bins - 1 if n_bins % 2 == 0 else n_bins
    if window_length % 2 == 0:
        window_length += 1
    if window_length < polyorder + 2:
        window_length = polyorder + 2 if (polyorder + 2) % 2 != 0 else polyorder + 3

    # The adjustment above may push the window past the histogram size, which
    # savgol_filter rejects. Fall back to the largest odd window that fits and
    # lower the polynomial order so it stays below the window length.
    if window_length > n_bins:
        window_length = n_bins if n_bins % 2 == 1 else n_bins - 1
        polyorder = min(polyorder, window_length - 1)

    # Smooth the histogram to reduce noise.
    counts_smooth = savgol_filter(counts, window_length=window_length, polyorder=polyorder)

    # Second derivative of the smoothed counts.
    second_derivative = np.gradient(np.gradient(counts_smooth))

    # A negative step in the sign of the second derivative marks a change
    # towards downward concavity.
    sign_changes = np.diff(np.sign(second_derivative))
    # +1 compensates for the one-position shift introduced by np.diff.
    inflection_indices = np.where(sign_changes < 0)[0] + 1

    if len(inflection_indices) == 0:
        return None  # No decreasing inflection point was found.

    return bin_centers[inflection_indices[0]]
|
|
56
|
+
|
|
57
|
+
def replace_with_dict(df, columns, var_rename):
    """
    Replace values in the given DataFrame columns using a dictionary.

    Every occurrence of a dictionary key is replaced, whether it is the whole
    cell or only a substring of it. The selected columns are converted to
    strings before the replacement is applied.

    Parameters
    ----------
    df : pd.DataFrame
        The original DataFrame; it is not modified.
    columns : list of str
        Names of the columns where replacements are applied. Names missing
        from the DataFrame are reported with a printed warning and skipped.
    var_rename : dict
        Maps each text to replace (key) to its replacement (value). An empty
        dictionary performs no replacement.

    Returns
    -------
    df_replaced : pd.DataFrame
        Copy of ``df`` with the replacements applied to the selected columns.
    replace_info : dict
        For every processed column, a dict holding a copy of ``var_rename``
        under the key ``'var_rename'`` (information needed to revert).
    """
    df_replaced = df.copy()
    replace_info = {}

    # Longest keys first so a key that is a prefix of another cannot shadow it.
    sorted_keys = sorted(var_rename.keys(), key=len, reverse=True)

    # An empty alternation compiles to a pattern that matches the empty string
    # everywhere, which would make the lookup below fail with KeyError('').
    pattern = None
    if sorted_keys:
        pattern = re.compile('|'.join(re.escape(k) for k in sorted_keys))

    def repl(match):
        return var_rename[match.group(0)]

    for col in columns:
        if col not in df_replaced.columns:
            print(f"Advertencia: La columna '{col}' no se encontró en el DataFrame.")
            continue

        # Store the replacement information per column.
        replace_info[col] = {
            'var_rename': var_rename.copy()
        }

        as_text = df_replaced[col].astype(str)
        if pattern is not None:
            as_text = as_text.str.replace(pattern, repl, regex=True)
        df_replaced[col] = as_text

    return df_replaced, replace_info
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_descripciones_valiosas(df_datos_descript,df_datos_clusterizados, TARGETS, var_rename):
    """
    Select the cluster descriptions whose share of target label 1 beats the
    overall share, and attach per-cluster statistics to them.

    Parameters
    ----------
    df_datos_descript : pd.DataFrame
        One description per cluster. The columns 'cluster' and
        'cluster_ponderador' are required; 'cluster_descripcion' is the column
        whose text is rewritten with ``var_rename``.
    df_datos_clusterizados : pd.DataFrame
        Observations with a 'cluster' column and the target column.
    TARGETS : sequence
        Only ``TARGETS[0]`` is used, as the name of the target column. The
        label ``1`` must be present among its values.
    var_rename : dict
        Text replacements applied to 'cluster_descripcion'.

    Returns
    -------
    tuple
        ``(descriptions, stacked_data)``: the filtered descriptions without
        'cluster_ponderador' and with the columns 'Probabilidad',
        'N_probabilidad' and 'Soporte' added, and the target-by-cluster table
        of observation counts.
    """
    target = TARGETS[0]

    df_datos_descript = df_datos_descript.sort_values('cluster_ponderador', ascending=False)
    df_datos_clusterizados_desc = df_datos_clusterizados.merge(df_datos_descript, on='cluster', how='left')
    stacked_data = df_datos_clusterizados_desc.groupby([target, 'cluster']).size().unstack(fill_value=0)

    # Overall share of label 1 and the same share inside every cluster.
    proporcion_real = df_datos_clusterizados_desc[target].value_counts(normalize=True).loc[1]
    stacked_data_total = stacked_data.sum(axis=0)
    proprcin_ = (stacked_data / stacked_data_total).loc[1]

    # NOTE(review): presumably pd.concat keeps the name 1 for the ratio Series
    # and assigns 0 to the unnamed totals Series, so column 1 is the ratio to
    # the overall share and column 0 is the cluster size - confirm.
    los_custers = pd.concat([proprcin_ / proporcion_real, stacked_data_total], axis=1).sort_values(0, ascending=False)
    los_custers_valiosos = los_custers[los_custers[1] > 1]

    # The former standardised 'importancia' score was computed and sorted, but
    # the sorted frame was discarded and the score never reached the returned
    # values, so that dead computation is no longer performed.
    columna_0 = los_custers_valiosos[0]
    punto = primer_punto_inflexion_decreciente(columna_0, bins=20, window_length=5, polyorder=2)
    punto_1 = primer_punto_inflexion_decreciente(los_custers_valiosos[1], bins=20, window_length=5, polyorder=2)

    # A threshold is None when its histogram has no decreasing inflection
    # point; such a criterion is skipped instead of raising TypeError. When no
    # threshold is available every cluster selected above is kept.
    criterios = []
    if punto is not None:
        criterios.append(columna_0 > punto * .4)
    if punto_1 is not None:
        # NOTE(review): punto_1 is derived from column 1 but compared against
        # column 0, as in the original code - confirm this is intended.
        criterios.append(columna_0 > punto_1)
    if criterios:
        seleccion = criterios[0]
        for criterio in criterios[1:]:
            seleccion = seleccion | criterio
        los_custers_valiosos = los_custers_valiosos[seleccion]

    df_datos_descript_valiosas = df_datos_descript[df_datos_descript['cluster'].isin(los_custers_valiosos.index.tolist())]

    df_datos_descript_valiosas,_ = replace_with_dict(df_datos_descript_valiosas, ['cluster_descripcion'], var_rename)
    df_datos_descript_valiosas = df_datos_descript_valiosas.merge(proprcin_.reset_index(), on='cluster', how='left')
    df_datos_descript_valiosas = df_datos_descript_valiosas.merge(los_custers.reset_index(), on='cluster', how='left')
    # Both merged frames carry a column labelled 1, so the second merge
    # suffixes them as '1_x' / '1_y'.
    df_datos_descript_valiosas = df_datos_descript_valiosas.rename(columns={'1_x':'Probabilidad','1_y':'N_probabilidad',0:'Soporte'})
    return df_datos_descript_valiosas.drop(columns=['cluster_ponderador']), stacked_data
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def generate_descriptions(condition_list, language='en', OPENAI_API_KEY=None, default_params=None):
    """
    Ask an OpenAI chat model for a simple description of each condition.

    All conditions are sent in one numbered prompt and the reply is split into
    lines.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to describe. When it is empty no request is made.
    language : str
        Language requested for the descriptions.
    OPENAI_API_KEY : str or None
        Key handed to the ``OpenAI`` client.
    default_params : dict or None
        Keyword arguments forwarded to ``client.chat.completions.create``.
        When None, 'gpt-4-turbo' is used with temperature 0.5, max_tokens 1500,
        n 1 and no stop sequence.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with the non-empty, stripped lines of the
        first choice returned by the model.
    """
    condition_list = list(condition_list)
    if not condition_list:
        # Nothing to describe: skip the client and the API request entirely.
        return {'respuestas': []}

    client = OpenAI(api_key=OPENAI_API_KEY)

    if default_params is None:
        default_params = {
            'model': 'gpt-4-turbo',
            'temperature': 0.5,
            'max_tokens': 1500,
            'n': 1,
            'stop': None,
        }

    # Build a single numbered message holding every condition.
    conditions_text = "\n".join(
        f"{i}. {condition}" for i, condition in enumerate(condition_list, start=1)
    )

    system_prompt = "You are an assistant that helps to describe dataset groups in very simple terms."
    # Each fragment ends with a space so that adjacent sentences do not run
    # together once the literals are concatenated.
    user_prompt = (
        "Generate a very simple description for each of the following conditions. "
        "Use everyday language. Avoid specific numbers and ranges; instead, "
        "use general groups like 'elderly people', 'classic cars', etc. "
        "Make each description visually friendly highlight what makes that condition unique and using emojis. Structure: 'EMOJI': 'RESPONSE' "
        f"Only respond with the descriptions in {language}. Conditions:\n\n{conditions_text}"
    )

    mensajes = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    respuesta = client.chat.completions.create(
        messages=mensajes,
        **default_params
    )

    # The content may be None; treat that as an empty reply.
    contenido = respuesta.choices[0].message.content or ""
    descriptions = [linea.strip() for linea in contenido.strip().split("\n") if linea.strip()]

    return {'respuestas': descriptions}
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def categorize_conditions(condition_list, df=None):
    """
    Turn range conditions into coarse 'BAJO' / 'MEDIO' / 'ALTO' descriptions.

    Every ``<min> <= <feature> <= <max>`` fragment found in a condition is
    categorized by comparing the midpoint of its range with the 0.33 and 0.66
    quantiles of the matching column of ``df``.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions containing zero or more ``min <= feature <= max`` fragments.
    df : pd.DataFrame or None
        Data used to compute the quantile thresholds. Only numeric columns are
        considered. When None, or when a feature has no numeric column, the
        feature is reported as 'N/A'.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one description per condition, built as
        ``'<feature> es <category>'`` parts joined by ', ' and closed with '.'.
    """
    # Always defined, so conditions can be processed without a DataFrame.
    thresholds = {}
    if df is not None:
        # Quantiles are only computed for numeric columns.
        for column in df.select_dtypes(include='number').columns:
            thresholds[column] = {
                'low': df[column].quantile(0.33),
                'high': df[column].quantile(0.66),
            }

    # A bound may carry a sign and an exponent (e.g. -1.5 or 2e-3).
    number = r'-?\d+\.?\d*(?:[eE][-+]?\d+)?'
    range_pattern = re.compile(rf'({number}) <= (\w+) <= ({number})')

    descriptions = []
    for condition in condition_list:
        features = {}
        for min_value, feature_name, max_value in range_pattern.findall(condition):
            limits = thresholds.get(feature_name)
            if limits is None:
                features[feature_name] = 'N/A'
                continue

            # The midpoint of the range decides the category.
            avg_value = (float(min_value) + float(max_value)) / 2
            if avg_value <= limits['low']:
                features[feature_name] = 'BAJO'
            elif avg_value <= limits['high']:
                features[feature_name] = 'MEDIO'
            else:
                features[feature_name] = 'ALTO'

        description = ', '.join(
            f"{feature} es {category}" for feature, category in features.items()
        ) + '.'
        descriptions.append(description)

    return {'respuestas': descriptions}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: InsideForest
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: A comprehensive library for describing and analyzing data insights via AI
|
|
5
5
|
Home-page: https://github.com/jcval94/InsideForest.git
|
|
6
6
|
Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: InsideForest
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: A comprehensive library for describing and analyzing data insights via AI
|
|
5
5
|
Home-page: https://github.com/jcval94/InsideForest.git
|
|
6
6
|
Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
|
|
@@ -19,6 +19,7 @@ You could use our library to:
|
|
|
19
19
|
|
|
20
20
|
By using our library to build and analyze a random forest, you can gain deep insights into the patterns and relationships within your data. This can help you identify hidden trends and make better-informed decisions, leading to more successful outcomes for your business.
|
|
21
21
|
|
|
22
|
+
[CASO DE USO](https://colab.research.google.com/drive/11VGeB0V6PLMlQ8Uhba91fJ4UN1Bfbs90?usp=sharing)
|
|
22
23
|
|
|
23
24
|
## Installation
|
|
24
25
|
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
from openai import OpenAI
|
|
4
|
-
import pandas as pd
|
|
5
|
-
|
|
6
|
-
def generate_descriptions(condition_list, language='en', OPENAI_API_KEY=None, default_params=None):
    """
    Ask an OpenAI chat model for a simple description of each condition.

    All conditions are sent in one numbered prompt and the reply is split into
    lines.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to describe. When it is empty no request is made.
    language : str
        Language requested for the descriptions.
    OPENAI_API_KEY : str or None
        Key handed to the ``OpenAI`` client.
    default_params : dict or None
        Keyword arguments forwarded to ``client.chat.completions.create``.
        When None, 'gpt-4-turbo' is used with temperature 0.5, max_tokens 1500,
        n 1 and no stop sequence.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with the non-empty, stripped lines of the
        first choice returned by the model.
    """
    condition_list = list(condition_list)
    if not condition_list:
        # Nothing to describe: skip the client and the API request entirely.
        return {'respuestas': []}

    client = OpenAI(api_key=OPENAI_API_KEY)

    if default_params is None:
        default_params = {
            'model': 'gpt-4-turbo',
            'temperature': 0.5,
            'max_tokens': 1500,
            'n': 1,
            'stop': None,
        }

    # Build a single numbered message holding every condition.
    conditions_text = "\n".join(
        f"{i}. {condition}" for i, condition in enumerate(condition_list, start=1)
    )

    system_prompt = "You are an assistant that helps to describe dataset groups in very simple terms."
    # Each fragment ends with a space so that adjacent sentences do not run
    # together once the literals are concatenated.
    user_prompt = (
        "Generate a very simple description for each of the following conditions. "
        "Use everyday language. Avoid specific numbers and ranges; instead, "
        "use general groups like 'elderly people', 'classic cars', etc. "
        "Make each description visually friendly highlight what makes that condition unique and using emojis. Structure: 'EMOJI': 'RESPONSE' "
        f"Only respond with the descriptions in {language}. Conditions:\n\n{conditions_text}"
    )

    mensajes = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    respuesta = client.chat.completions.create(
        messages=mensajes,
        **default_params
    )

    # The content may be None; treat that as an empty reply.
    contenido = respuesta.choices[0].message.content or ""
    descriptions = [linea.strip() for linea in contenido.strip().split("\n") if linea.strip()]

    return {'respuestas': descriptions}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def categorize_conditions(condition_list, df=None):
    """
    Turn range conditions into coarse 'BAJO' / 'MEDIO' / 'ALTO' descriptions.

    Every ``<min> <= <feature> <= <max>`` fragment found in a condition is
    categorized by comparing the midpoint of its range with the 0.33 and 0.66
    quantiles of the matching column of ``df``.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions containing zero or more ``min <= feature <= max`` fragments.
    df : pd.DataFrame or None
        Data used to compute the quantile thresholds. Only numeric columns are
        considered. When None, or when a feature has no numeric column, the
        feature is reported as 'N/A'.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one description per condition, built as
        ``'<feature> es <category>'`` parts joined by ', ' and closed with '.'.
    """
    # Always defined, so conditions can be processed without a DataFrame.
    thresholds = {}
    if df is not None:
        # Quantiles are only computed for numeric columns.
        for column in df.select_dtypes(include='number').columns:
            thresholds[column] = {
                'low': df[column].quantile(0.33),
                'high': df[column].quantile(0.66),
            }

    # A bound may carry a sign and an exponent (e.g. -1.5 or 2e-3).
    number = r'-?\d+\.?\d*(?:[eE][-+]?\d+)?'
    range_pattern = re.compile(rf'({number}) <= (\w+) <= ({number})')

    descriptions = []
    for condition in condition_list:
        features = {}
        for min_value, feature_name, max_value in range_pattern.findall(condition):
            limits = thresholds.get(feature_name)
            if limits is None:
                features[feature_name] = 'N/A'
                continue

            # The midpoint of the range decides the category.
            avg_value = (float(min_value) + float(max_value)) / 2
            if avg_value <= limits['low']:
                features[feature_name] = 'BAJO'
            elif avg_value <= limits['high']:
                features[feature_name] = 'MEDIO'
            else:
                features[feature_name] = 'ALTO'

        description = ', '.join(
            f"{feature} es {category}" for feature, category in features.items()
        ) + '.'
        descriptions.append(description)

    return {'respuestas': descriptions}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|