InsideForest 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ import re
2
+ from openai import OpenAI
3
+ import pandas as pd
4
+ import copy
5
+ import numpy as np
6
+ from scipy.signal import savgol_filter
7
+ from sklearn.preprocessing import StandardScaler
8
+ import re
9
+
10
+
11
def primer_punto_inflexion_decreciente(data, bins=10, window_length=5, polyorder=2):
    """
    Find the first decreasing inflection point of a histogram.

    The histogram counts are smoothed with a Savitzky-Golay filter, the
    second discrete derivative of the smoothed counts is computed, and the
    first bin where the sign of that derivative drops (concave-up to
    concave-down) is reported.

    Parameters
    ----------
    data : array-like
        Values used to build the histogram.
    bins : int or sequence
        Number of bins, or the explicit bin edges (forwarded to
        ``np.histogram``).
    window_length : int
        Requested Savitzky-Golay window. It is made odd, raised so the
        polynomial fits when the histogram is long enough, and finally
        clamped to the largest odd size that does not exceed the number
        of bins.
    polyorder : int
        Requested polynomial order. It is lowered when necessary so that it
        stays strictly below the effective window length.

    Returns
    -------
    float or None
        Centre of the bin where the first decreasing inflection occurs, or
        ``None`` when the histogram has fewer than three bins or no such
        sign change exists.
    """
    counts, bin_edges = np.histogram(data, bins=bins)
    n_bins = len(counts)

    # A second derivative cannot change sign with fewer than three samples.
    if n_bins < 3:
        return None

    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # The filter needs an odd window that is no longer than the signal.
    largest_odd_window = n_bins if n_bins % 2 == 1 else n_bins - 1

    window_length = int(window_length)
    if window_length % 2 == 0:
        window_length += 1
    if window_length < polyorder + 2:
        # Smallest odd window that leaves at least one degree of freedom.
        window_length = polyorder + 2 if (polyorder + 2) % 2 != 0 else polyorder + 3
    # Clamp last: the adjustments above must never outgrow the histogram.
    window_length = max(3, min(window_length, largest_odd_window))
    # savgol_filter requires polyorder < window_length.
    polyorder = max(0, min(int(polyorder), window_length - 1))

    counts_smooth = savgol_filter(counts, window_length=window_length, polyorder=polyorder)

    # Second discrete derivative of the smoothed histogram.
    second_derivative = np.gradient(np.gradient(counts_smooth))

    # A negative step in the sign sequence marks a drop in concavity.
    sign_changes = np.diff(np.sign(second_derivative))
    # +1 compensates for the one-element shift introduced by np.diff.
    inflection_indices = np.where(sign_changes < 0)[0] + 1

    if len(inflection_indices) == 0:
        return None

    return bin_centers[inflection_indices[0]]
56
+
57
def replace_with_dict(df, columns, var_rename):
    """
    Replace dictionary keys found in the given DataFrame columns.

    Every occurrence of a key of ``var_rename`` (as a whole value or as a
    substring) is replaced by its mapped value. Column values are converted
    to ``str`` before matching.

    Parameters
    ----------
    df : pd.DataFrame
        Source DataFrame; it is not modified.
    columns : list of str
        Names of the columns to process. Names missing from ``df`` are
        reported with a printed warning and skipped.
    var_rename : dict
        Maps the text to look for to its replacement. Empty keys are
        ignored because they would match at every position.

    Returns
    -------
    df_replaced : pd.DataFrame
        Copy of ``df`` with the replacements applied.
    replace_info : dict
        For each processed column, ``{'var_rename': <copy of var_rename>}``.
    """
    df_replaced = df.copy()
    replace_info = {}

    # Longest keys first, so a key that is a prefix of another one cannot
    # shadow it inside the alternation.
    sorted_keys = sorted((k for k in var_rename if k), key=len, reverse=True)
    # With no usable key there is nothing to match; an empty pattern would
    # match everywhere and the lookup below would fail with KeyError('').
    pattern = (
        re.compile('|'.join(re.escape(k) for k in sorted_keys))
        if sorted_keys
        else None
    )

    def repl(match):
        # re.sub needs a string, whatever type the mapping stores.
        return str(var_rename[match.group(0)])

    for col in columns:
        if col not in df_replaced.columns:
            print(f"Advertencia: La columna '{col}' no se encontró en el DataFrame.")
            continue

        replace_info[col] = {'var_rename': var_rename.copy()}

        as_text = df_replaced[col].astype(str)
        if pattern is not None:
            as_text = as_text.str.replace(pattern, repl, regex=True)
        df_replaced[col] = as_text

    return df_replaced, replace_info
104
+
105
+
106
+
107
def get_descripciones_valiosas(df_datos_descript, df_datos_clusterizados, TARGETS, var_rename):
    """
    Select the cluster descriptions whose target rate beats the global rate.

    Parameters
    ----------
    df_datos_descript : pd.DataFrame
        One row per cluster description. The code reads the columns
        'cluster', 'cluster_ponderador' and 'cluster_descripcion'.
    df_datos_clusterizados : pd.DataFrame
        Observations with a 'cluster' column and the target column.
    TARGETS : sequence
        Only ``TARGETS[0]`` is used, as the name of the target column. The
        value ``1`` must be present in that column (it is looked up with
        ``.loc[1]``).
    var_rename : dict
        Text replacements applied to 'cluster_descripcion' through
        ``replace_with_dict``.

    Returns
    -------
    df_datos_descript_valiosas : pd.DataFrame
        Descriptions of the retained clusters, with the added columns
        'Probabilidad' (share of target == 1 inside the cluster),
        'N_probabilidad' (that share divided by the global share) and
        'Soporte' (number of rows in the cluster); 'cluster_ponderador'
        is dropped.
    stacked_data : pd.DataFrame
        Row counts per target value (index) and cluster (columns).
    """
    target = TARGETS[0]

    df_datos_descript = df_datos_descript.sort_values('cluster_ponderador', ascending=False)
    df_datos_clusterizados_desc = df_datos_clusterizados.merge(df_datos_descript, on='cluster', how='left')
    stacked_data = df_datos_clusterizados_desc.groupby([target, 'cluster']).size().unstack(fill_value=0)

    # Global share of target == 1 and the same share inside each cluster.
    proporcion_real = df_datos_clusterizados_desc[target].value_counts(normalize=True).loc[1]
    stacked_data_total = stacked_data.sum(axis=0)
    proprcin_ = (stacked_data / stacked_data_total).loc[1]

    # pd.concat labels the ratio column 1 (the Series keeps the name it got
    # from .loc[1]) and the unnamed totals column 0; the final rename relies
    # on exactly those labels.
    los_custers = pd.concat(
        [proprcin_ / proporcion_real, stacked_data_total], axis=1
    ).sort_values(0, ascending=False)

    # Keep only the clusters whose ratio over the global share exceeds 1.
    los_custers_valiosos = los_custers[los_custers[1] > 1]

    punto = primer_punto_inflexion_decreciente(los_custers_valiosos[0], bins=20, window_length=5, polyorder=2)
    punto_1 = primer_punto_inflexion_decreciente(los_custers_valiosos[1], bins=20, window_length=5, polyorder=2)

    # The helper returns None when it finds no inflection point; such a
    # threshold is skipped instead of crashing on None arithmetic. If both
    # are missing, every cluster above is kept.
    criterios = []
    if punto is not None:
        criterios.append(los_custers_valiosos[0] > punto * .4)
    if punto_1 is not None:
        # NOTE(review): punto_1 is derived from column 1 but is compared
        # against column 0, as in the original code — confirm the intent.
        criterios.append(los_custers_valiosos[0] > punto_1)
    if criterios:
        seleccion = criterios[0]
        for criterio in criterios[1:]:
            seleccion = seleccion | criterio
        los_custers_valiosos = los_custers_valiosos[seleccion]

    df_datos_descript_valiosas = df_datos_descript[
        df_datos_descript['cluster'].isin(los_custers_valiosos.index.tolist())
    ]

    df_datos_descript_valiosas, _ = replace_with_dict(df_datos_descript_valiosas, ['cluster_descripcion'], var_rename)
    df_datos_descript_valiosas = df_datos_descript_valiosas.merge(proprcin_.reset_index(), on='cluster', how='left')
    df_datos_descript_valiosas = df_datos_descript_valiosas.merge(los_custers.reset_index(), on='cluster', how='left')
    # Both merged frames carry a column labelled 1, hence the '1_x'/'1_y' suffixes.
    df_datos_descript_valiosas = df_datos_descript_valiosas.rename(
        columns={'1_x': 'Probabilidad', '1_y': 'N_probabilidad', 0: 'Soporte'}
    )
    return df_datos_descript_valiosas.drop(columns=['cluster_ponderador']), stacked_data
147
+
148
+
149
def generate_descriptions(condition_list, language='en', OPENAI_API_KEY=None, default_params=None):
    """
    Ask an OpenAI chat model for a plain-language description of each condition.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to describe. They are numbered and sent in one request.
    language : str
        Language requested for the answers (inserted into the prompt).
    OPENAI_API_KEY : str or None
        Key handed to the ``OpenAI`` client.
    default_params : dict or None
        Keyword arguments for ``client.chat.completions.create`` (besides
        ``messages``). When ``None``, a 'gpt-4-turbo' configuration with
        temperature 0.5 and 1500 max tokens is used.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one entry per non-empty line of the
        model answer. For an empty ``condition_list`` the list is empty and
        no request is made.
    """
    condition_list = list(condition_list)
    # Nothing to describe: avoid building a client and paying for a request.
    if not condition_list:
        return {'respuestas': []}

    client = OpenAI(api_key=OPENAI_API_KEY)

    if default_params is None:
        default_params = {
            'model': 'gpt-4-turbo',
            'temperature': 0.5,
            'max_tokens': 1500,
            'n': 1,
            'stop': None,
        }

    # One numbered line per condition so a single request covers them all.
    conditions_text = "\n".join(
        f"{i+1}. {condition}" for i, condition in enumerate(condition_list)
    )

    system_prompt = "You are an assistant that helps to describe dataset groups in very simple terms."
    # Every fragment ends with a space so adjacent sentences do not run
    # together once the literals are concatenated.
    user_prompt = (
        "Generate a very simple description for each of the following conditions. "
        "Use everyday language. Avoid specific numbers and ranges; instead, "
        "use general groups like 'elderly people', 'classic cars', etc. "
        "Make each description visually friendly highlight what makes that condition unique and using emojis. Structure: 'EMOJI': 'RESPONSE' "
        f"Only respond with the descriptions in {language}. Conditions:\n\n{conditions_text}"
    )

    mensajes = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    respuesta = client.chat.completions.create(
        messages=mensajes,
        **default_params
    )

    # The message content may be None; treat that as an empty answer.
    contenido = respuesta.choices[0].message.content or ""
    descriptions = [
        linea.strip() for linea in contenido.strip().split("\n") if linea.strip()
    ]

    return {'respuestas': descriptions}
195
+
196
+
197
def categorize_conditions(condition_list, df=None):
    """
    Turn numeric range conditions into BAJO / MEDIO / ALTO labels.

    Each condition is scanned for fragments of the form
    ``<min> <= <feature> <= <max>``. The midpoint of the range is compared
    with the 0.33 and 0.66 quantiles of the matching ``df`` column.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to categorize.
    df : pd.DataFrame or None
        Reference data used to compute the quantile thresholds. Only
        numeric columns are considered. When ``None``, or when a feature
        has no numeric column in ``df``, the feature is labelled 'N/A'.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one sentence per condition, in the
        form ``"<feature> es <label>, ... ."``. A condition without any
        recognizable range yields ``"."``.
    """
    # Always defined, so the lookup below is safe when df is None.
    thresholds = {}
    if df is not None:
        # quantile() is only meaningful (and only safe) on numeric columns.
        for column in df.select_dtypes(include='number').columns:
            thresholds[column] = {
                'low': df[column].quantile(0.33),
                'high': df[column].quantile(0.66),
            }

    # Signed decimal with optional exponent, so negative bounds keep their sign.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    range_pattern = re.compile(rf'({number}) <= (\w+) <= ({number})')

    descriptions = []
    for condition in condition_list:
        features = {}
        for min_text, feature_name, max_text in range_pattern.findall(condition):
            limits = thresholds.get(feature_name)
            if limits is None:
                features[feature_name] = 'N/A'
                continue

            # The midpoint of the range decides the category.
            avg_value = (float(min_text) + float(max_text)) / 2
            if avg_value <= limits['low']:
                features[feature_name] = 'BAJO'
            elif avg_value <= limits['high']:
                features[feature_name] = 'MEDIO'
            else:
                features[feature_name] = 'ALTO'

        description = ', '.join(
            f"{feature} es {category}" for feature, category in features.items()
        ) + '.'
        descriptions.append(description)

    return {'respuestas': descriptions}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: InsideForest
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: A comprehensive library for describing and analyzing data insights via AI
5
5
  Home-page: https://github.com/jcval94/InsideForest.git
6
6
  Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: InsideForest
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: A comprehensive library for describing and analyzing data insights via AI
5
5
  Home-page: https://github.com/jcval94/InsideForest.git
6
6
  Author: [('Jose Carlos Del Valle', 'jcval94@gmail.com'), ('ChatGPT', 'chat.openai.com/chat')]
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='InsideForest',
5
- version='0.2.1',
5
+ version='0.2.2',
6
6
  packages=find_packages(),
7
7
  license='MIT',
8
8
  author=[('Jose Carlos Del Valle', 'jcval94@gmail.com'),
@@ -1,102 +0,0 @@
1
- import os
2
- import re
3
- from openai import OpenAI
4
- import pandas as pd
5
-
6
def generate_descriptions(condition_list, language='en', OPENAI_API_KEY=None, default_params=None):
    """
    Ask an OpenAI chat model for a plain-language description of each condition.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to describe. They are numbered and sent in one request.
    language : str
        Language requested for the answers (inserted into the prompt).
    OPENAI_API_KEY : str or None
        Key handed to the ``OpenAI`` client.
    default_params : dict or None
        Keyword arguments for ``client.chat.completions.create`` (besides
        ``messages``). When ``None``, a 'gpt-4-turbo' configuration with
        temperature 0.5 and 1500 max tokens is used.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one entry per non-empty line of the
        model answer. For an empty ``condition_list`` the list is empty and
        no request is made.
    """
    condition_list = list(condition_list)
    # Nothing to describe: avoid building a client and paying for a request.
    if not condition_list:
        return {'respuestas': []}

    client = OpenAI(api_key=OPENAI_API_KEY)

    if default_params is None:
        default_params = {
            'model': 'gpt-4-turbo',
            'temperature': 0.5,
            'max_tokens': 1500,
            'n': 1,
            'stop': None,
        }

    # One numbered line per condition so a single request covers them all.
    conditions_text = "\n".join(
        f"{i+1}. {condition}" for i, condition in enumerate(condition_list)
    )

    system_prompt = "You are an assistant that helps to describe dataset groups in very simple terms."
    # Every fragment ends with a space so adjacent sentences do not run
    # together once the literals are concatenated.
    user_prompt = (
        "Generate a very simple description for each of the following conditions. "
        "Use everyday language. Avoid specific numbers and ranges; instead, "
        "use general groups like 'elderly people', 'classic cars', etc. "
        "Make each description visually friendly highlight what makes that condition unique and using emojis. Structure: 'EMOJI': 'RESPONSE' "
        f"Only respond with the descriptions in {language}. Conditions:\n\n{conditions_text}"
    )

    mensajes = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    respuesta = client.chat.completions.create(
        messages=mensajes,
        **default_params
    )

    # The message content may be None; treat that as an empty answer.
    contenido = respuesta.choices[0].message.content or ""
    descriptions = [
        linea.strip() for linea in contenido.strip().split("\n") if linea.strip()
    ]

    return {'respuestas': descriptions}
52
-
53
-
54
def categorize_conditions(condition_list, df=None):
    """
    Turn numeric range conditions into BAJO / MEDIO / ALTO labels.

    Each condition is scanned for fragments of the form
    ``<min> <= <feature> <= <max>``. The midpoint of the range is compared
    with the 0.33 and 0.66 quantiles of the matching ``df`` column.

    Parameters
    ----------
    condition_list : iterable of str
        Conditions to categorize.
    df : pd.DataFrame or None
        Reference data used to compute the quantile thresholds. Only
        numeric columns are considered. When ``None``, or when a feature
        has no numeric column in ``df``, the feature is labelled 'N/A'.

    Returns
    -------
    dict
        ``{'respuestas': [...]}`` with one sentence per condition, in the
        form ``"<feature> es <label>, ... ."``. A condition without any
        recognizable range yields ``"."``.
    """
    # Always defined, so the lookup below is safe when df is None.
    thresholds = {}
    if df is not None:
        # quantile() is only meaningful (and only safe) on numeric columns.
        for column in df.select_dtypes(include='number').columns:
            thresholds[column] = {
                'low': df[column].quantile(0.33),
                'high': df[column].quantile(0.66),
            }

    # Signed decimal with optional exponent, so negative bounds keep their sign.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    range_pattern = re.compile(rf'({number}) <= (\w+) <= ({number})')

    descriptions = []
    for condition in condition_list:
        features = {}
        for min_text, feature_name, max_text in range_pattern.findall(condition):
            limits = thresholds.get(feature_name)
            if limits is None:
                features[feature_name] = 'N/A'
                continue

            # The midpoint of the range decides the category.
            avg_value = (float(min_text) + float(max_text)) / 2
            if avg_value <= limits['low']:
                features[feature_name] = 'BAJO'
            elif avg_value <= limits['high']:
                features[feature_name] = 'MEDIO'
            else:
                features[feature_name] = 'ALTO'

        description = ', '.join(
            f"{feature} es {category}" for feature, category in features.items()
        ) + '.'
        descriptions.append(description)

    return {'respuestas': descriptions}
File without changes
File without changes