sarapy 2.1.1__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sarapy/analysis/FeaturesResume.py +618 -0
- sarapy/analysis/__init__.py +3 -0
- sarapy/dataProcessing/OpsProcessor.py +49 -25
- sarapy/mlProcessors/PlantinClassifier.py +79 -32
- sarapy/mlProcessors/PlantinFMCreator.py +8 -11
- sarapy/mlProcessors/__init__.py +11 -0
- sarapy/preprocessing/TransformInputData.py +2 -2
- sarapy/preprocessing/__init__.py +11 -2
- sarapy/stats/__init__.py +13 -1
- sarapy/stats/stats.py +5 -6
- sarapy/utils/__init__.py +3 -0
- sarapy/utils/utils.py +172 -0
- sarapy/version.py +2 -2
- {sarapy-2.1.1.dist-info → sarapy-2.3.0.dist-info}/METADATA +10 -1
- sarapy-2.3.0.dist-info/RECORD +29 -0
- sarapy/utils/amg_decoder.py +0 -125
- sarapy/utils/amg_ppk.py +0 -38
- sarapy/utils/getRawOperations.py +0 -20
- sarapy-2.1.1.dist-info/RECORD +0 -29
- {sarapy-2.1.1.dist-info → sarapy-2.3.0.dist-info}/LICENCE +0 -0
- {sarapy-2.1.1.dist-info → sarapy-2.3.0.dist-info}/WHEEL +0 -0
- {sarapy-2.1.1.dist-info → sarapy-2.3.0.dist-info}/top_level.txt +0 -0
sarapy/dataProcessing/OpsProcessor.py
CHANGED

@@ -29,7 +29,12 @@ class OpsProcessor():
         kwargs: Diccionario con los argumentos necesarios instanciar algunas clases.
         """
 
+        self.classifications_probas = None
         plclass_map = {"classifier_file"}
+        self._operationsDict = {} ##diccionario de operarios con sus operaciones
+        self._platin_classifiedOperations = np.array([]) ##array con las operaciones clasificadas para plantin
+        self._fertilizer_classifiedOperations = np.array([]) ##array con las operaciones clasificadas para plantin
+        self._last_row_db = 0 ##indicador de la última fila de los datos extraidos de la base de datos histórica
 
         kwargs_plclass = {}
         ##recorro kwargs y usando plclass_map creo un nuevo diccionario con los valores que se pasaron
@@ -45,8 +50,8 @@
             if key in fmcreator_map:
                 fmcreator_kargs[key] = value
 
-        self._plantin_classifier = PlantinClassifier
-        self.plantinFMCreator = PlantinFMCreator
+        self._plantin_classifier = PlantinClassifier(**kwargs_plclass)
+        self.plantinFMCreator = PlantinFMCreator(**fmcreator_kargs)
 
         ##mapa de argumentos para FertilizerTransformer
         ft_map = {"regresor_file", "poly_features_file"}
@@ -56,16 +61,11 @@
             if key in ft_map:
                 ft_kwargs[key] = value
 
-        self._ftfmcreator = FertilizerFMCreator
-        self._fertilizer_transformer = FertilizerTransformer
-
-        self.
-
-        self._fertilizer_classifiedOperations = np.array([]) ##array con las operaciones clasificadas para plantin
-        self._last_row_db = 0 ##indicador de la última fila de los datos extraidos de la base de datos histórica
-        self.transformInputData = TransformInputData.TransformInputData()
-        self.transformToOutputData = TransformToOutputData.TransformToOutputData()
-
+        self._ftfmcreator = FertilizerFMCreator()
+        self._fertilizer_transformer = FertilizerTransformer(**ft_kwargs)
+        self.transformInputData = TransformInputData()
+        self.transformToOutputData = TransformToOutputData()
+
     def processOperations(self, data, **kwargs):
         """Método para procesar las operaciones de los operarios.
 
@@ -94,14 +94,14 @@
 
         #Si tenemos nuevas operaciones, actualizamos el diccionario de operaciones
         self.updateOperationsDict(newSample) #actualizamos diccionario interno de la clase
-        pl_clas = self.classifyForPlantin(**kwargs) #clasificamos las operaciones para plantín
+        pl_clas, self.classifications_probas = self.classifyForPlantin(**kwargs) #clasificamos las operaciones para plantín
 
         #estimamos los gramos de fertilizante
         ft_grams = self._fertilizer_transformer.transform(newSample)
         logging.debug(f"Fertilizer grams shape: {ft_grams.shape}")
         id_db_h_nums, id_db_dw_nums = self.getActualOperationsNumbers() #obtenemos los números de operaciones desde el diccionario de operaciones
         logging.debug(f"ID_DB_H shape: {id_db_h_nums.shape}, ID_DB_DW shape: {id_db_dw_nums.shape}")
-        date_oprc = pd.DataFrame(newSample)["date_oprc"].values.reshape(-1, 1) ##extraigo las fechas de operación de la muestra
+        # date_oprc = pd.DataFrame(newSample)["date_oprc"].values.reshape(-1, 1) ##extraigo las fechas de operación de la muestra
         timestamps = pd.DataFrame(newSample)["timestamp"].values.reshape(-1, 1) ##extraigo los timestamps de la muestra
 
         return self.transformToOutputData.fit_transform(np.column_stack((timestamps,
@@ -167,7 +167,8 @@
 
         key_classify_map = {"feature_matrix", "update_samePlace",
                             "useRatioStats", "std_weight", "useDistancesStats",
-                            "ratio_dcdp_umbral", "dist_umbral"
+                            "ratio_dcdp_umbral", "dist_umbral",
+                            "umbral_bajo_dstpt", "umbral_proba_dstpt"}
 
         ##recorro kwargs y usando key_classify_map creo un nuevo diccionario con los valores que se pasaron
         classify_kwargs = {}
@@ -187,7 +188,7 @@
             logging.debug(f"Número de operaciones para el nodo {ID_NPDP}: {len(operations)}")
             features, dst_pt, inest_pt = self.plantinFMCreator.fit_transform(operations)
             logging.debug(f"Features shape for {ID_NPDP}: {features.shape}")
-            classified_ops = self._plantin_classifier.classify(features, dst_pt, inest_pt, **classify_kwargs)
+            classified_ops, classifications_probas = self._plantin_classifier.classify(features, dst_pt, inest_pt, **classify_kwargs)
             logging.debug(f"Classified operations shape for {ID_NPDP}: {classified_ops.shape}")
 
             ##chequeo si first_day_op_classified es True, si es así, no se considera la primera fila de las classified_ops
@@ -201,7 +202,7 @@
 
             self._operationsDict[ID_NPDP]["first_day_op_classified"] = True
 
-        return plantinClassifications
+        return plantinClassifications, classifications_probas
 
     def updateLastOperations(self, ID_NPDPs_newOperations):
         """Método para actualizar la última operación de una muestra de operaciones en el diccionario de operaciones
@@ -303,16 +304,39 @@ if __name__ == "__main__":
     import pandas as pd
     import json
    import logging
-
 
-
+    ## argumentos de PlantinFMCreator
+    kwargs_constructor = {"imputeDistances":False, "distanciaMedia":1.8, "umbral_precision":0.3,
+                          "dist_mismo_lugar":0.0, "max_dist":100,
+                          "umbral_ratio_dCdP":0.5, "deltaO_medio":4,}
+
+
+    ##argumentos del método PlantinClassifier.clasiffy()
+    kwargs_classifier = {"proba_threshold":0.85,
+                         "update_samePlace":False,
+                         "update_dstpt":False,
+                         "useRatioStats":False,
+                         "std_weight":1.,
+                         "useDistancesStats":False,
+                         "ratio_dcdp_umbral":0.3,
+                         "dist_umbral":0.5,
+                         "umbral_bajo_dstpt":4,
+                         "umbral_proba_dstpt":0.85}
+
+
+    historical_data_path = "examples\\2025-09-04\\UPM039N\\historical-data.json"
     with open(historical_data_path, 'r') as file:
         samples = json.load(file)
 
-
+    op = OpsProcessor(classifier_file='modelos\\pipeline_rf.pkl',
+                      regresor_file='modelos\\regresor.pkl', poly_features_file='modelos\\poly_features.pkl',
+                      **kwargs_constructor)
+
+    ops_clasificadas = op.processOperations(samples, **kwargs_classifier)
+    probas = op.classifications_probas
+    print(probas[:3])
+    print(ops_clasificadas[:3])
+    df_ops_clasificadas = pd.DataFrame(ops_clasificadas)
+
+    print(df_ops_clasificadas.describe())
 
-    op = OpsProcessor(classifier_file='modelos\\pipeline_rf.pkl', imputeDistances = False,
-                      regresor_file='modelos\\regresor.pkl', poly_features_file='modelos\\poly_features.pkl')
-
-    print(op.processOperations(samples[:20]))
-    # op.processOperations(samples2)
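Net effect of the OpsProcessor changes: class probabilities now travel with the labels, and processOperations stores them in self.classifications_probas. A minimal sketch of flagging low-confidence operations after a run; the import path and the shape assumption (a scikit-learn predict_proba output of shape (n, 2) with column 1 the plantín probability, as the thresholding code in PlantinClassifier suggests) are assumptions, not shown verbatim in this diff:

    import json
    import numpy as np
    from sarapy.dataProcessing.OpsProcessor import OpsProcessor  # assumed import path

    with open("examples\\2025-09-04\\UPM039N\\historical-data.json") as f:
        samples = json.load(f)

    op = OpsProcessor(classifier_file="modelos\\pipeline_rf.pkl",
                      regresor_file="modelos\\regresor.pkl",
                      poly_features_file="modelos\\poly_features.pkl")
    ops = op.processOperations(samples)
    probas = np.asarray(op.classifications_probas)  # one row per operation
    low_conf = np.where(probas[:, 1] < 0.85)[0]     # below the new proba_threshold default
    print(f"{low_conf.size} operations with p(plantín) < 0.85")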
sarapy/mlProcessors/PlantinClassifier.py
CHANGED

@@ -1,4 +1,5 @@
 ###Documentación en https://github.com/lucasbaldezzari/sarapy/blob/main/docs/Docs.md
+import logging
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
@@ -15,17 +16,24 @@ class PlantinClassifier(BaseEstimator, TransformerMixin):
         - classifier_file: String con el nombre del archivo que contiene el clasificador entrenado. El archivo a cargar es un archivo .pkl.
         """
 
+        self.logger = logging.getLogger("PlantinClassifier")
+
+        self.classifications_probas = None
+        self.clasificaciones = None
+
         #cargo el clasificador con pickle. Usamos try para capturar el error FileNotFoundError
         try:
             with open(classifier_file, 'rb') as file:
                 self._pipeline = pickle.load(file)
-
+            self.logger.info("Clasificador cargado con éxito.")
         except FileNotFoundError:
-
-
-    def classify(self, feature_matrix, dst_pt, inest_pt,
+            self.logger.error("El archivo no se encuentra en el directorio actual.")
+
+    def classify(self, feature_matrix, dst_pt, inest_pt, proba_threshold = 0.85,
                  update_samePlace:bool = True, update_dstpt: bool = True,
-
+                 umbral_proba_dstpt = 0.85, umbral_bajo_dstpt = 4,
+                 use_ma = False, ma_window = 104,
+                 use_min_dstpt = False, factor = 0.1, **kwargs):
         """Genera la clasificación de las operaciones para plantines.
 
         - feature_matrix: Es un array con los datos (strings) provenientes de la base de datos histórica.
@@ -41,18 +49,26 @@
         NOTA: Estas características son necesarias en base a la última versión del modelo de clasificación.
         """
 
-
-
+        if use_ma:
+            dst_pt = self.get_dstpt_MA(dst_pt, window_size=ma_window, mode='same')
+
+        self.clasificaciones = self._pipeline.predict(feature_matrix)
+        self.classifications_probas = self._pipeline.predict_proba(feature_matrix)
+
+        # Si la probabilidad de ser plantín es menor al umbral, entonces la clasificación es 0 (no plantín)
+        self.clasificaciones[self.classifications_probas[:,1] < proba_threshold] = 0
 
         if update_samePlace:
             self.grouped_ops = self.groupOpsSamePlace(feature_matrix, **kwargs)
-            self.
+            self.clasificaciones = self.updateLabelsSamePlace(self.clasificaciones, self.grouped_ops)
 
         if update_dstpt:
-            self.
+            self.clasificaciones = self.updateLabelsFromDSTPT(self.clasificaciones, dst_pt, inest_pt,
+                                                              umbral_bajo_dstpt, umbral_proba_dstpt,
+                                                              use_min_dstpt, factor)
+
+        return self.clasificaciones, self.classifications_probas
 
-        return self.clasificaiones
-
     def groupOpsSamePlace(self, X, useRatioStats = True, std_weight=1, useDistancesStats = True,
                           ratio_dcdp_umbral=0.1, dist_umbral=0.5):
         """
@@ -123,20 +139,32 @@
             new_labels[indexes[1:]] = 0
 
         return new_labels
-
-    def updateLabelsFromDSTPT(self, labels, dst_pt, inest_pt,
+
+    def updateLabelsFromDSTPT(self, labels, dst_pt, inest_pt,
+                              umbral_bajo_dstpt = 4, umbral_proba_dstpt = 0.85,
+                              use_min_dstpt = False, factor = 0.1):
         """
         Función para actualizar las etiquetas de las operaciones que tengan distorsiones de plantín.
         """
         new_labels = labels.copy()
+
+        umbral_bajo_dstpt = min(dst_pt)*(1+factor) if use_min_dstpt else umbral_bajo_dstpt
 
-        ##filtro
-        new_labels[(dst_pt <
+        ##filtro
+        new_labels[(dst_pt < umbral_bajo_dstpt) & (inest_pt == 0)] = 0
 
-        ##si inest_pt 1 es y
-        new_labels[(inest_pt == 1) & (self.
+        ##si inest_pt 1 es y las probs son menores a umbral_proba_dstpt, entonces la operación es 0
+        new_labels[(inest_pt == 1) & (self.classifications_probas[:,1] < umbral_proba_dstpt)] = 0
 
         return new_labels
+
+    def get_dstpt_MA(self, dst_pt, window_size=104, mode='same'):
+        """
+        Función para calcular la media móvil de una serie temporal.
+        data: numpy array con los datos de la serie temporal
+        window_size: tamaño de la ventana para calcular la media móvil
+        """
+        return np.convolve(dst_pt, np.ones(window_size)/window_size, mode=mode)
 
 if __name__ == "__main__":
     import os
@@ -144,25 +172,44 @@ if __name__ == "__main__":
     import numpy as np
     from sarapy.preprocessing import TransformInputData
     from sarapy.mlProcessors import PlantinFMCreator
-    import sarapy.utils.getRawOperations as getRawOperations
     from sarapy.mlProcessors import PlantinClassifier
+    import json
 
-    fmcreator = PlantinFMCreator.PlantinFMCreator(imputeDistances=False)
-    tindata = TransformInputData.TransformInputData()
 
-
-
-
-
-
-
+    kwargs_fmcreator = {"imputeDistances":False, "distanciaMedia":1.8, "umbral_precision":0.3,
+                        "dist_mismo_lugar":0.0, "max_dist":100,
+                        "umbral_ratio_dCdP":2, "deltaO_medio":4,}
+
+    kwargs_classifier = {"proba_threshold":0.85,
+                         "update_samePlace":False,
+                         "update_dstpt":False,
+                         "useRatioStats":False,
+                         "std_weight":1.,
+                         "useDistancesStats":False,
+                         "ratio_dcdp_umbral":0.3,
+                         "dist_umbral":0.5,
+                         "umbral_bajo_dstpt":4,
+                         "umbral_proba_dstpt":0.85}
+
+    historical_data_path = "examples\\2025-09-04\\UPM039N\\historical-data.json"
+    with open(historical_data_path, 'r') as file:
+        samples = json.load(file)
+
+    fmcreator = PlantinFMCreator(**kwargs_fmcreator)
+    tindata = TransformInputData()
+    raw_X = tindata.transform(samples)
 
     X, dst_pt, inest_pt = fmcreator.fit_transform(raw_X)
 
-
-
+    rf_clf_wu = PlantinClassifier(classifier_file='modelos\\pipeline_rf.pkl')
+
+    clasificaciones, probas = rf_clf_wu.classify(X, dst_pt, inest_pt, **kwargs_classifier)
+    print("media de clasificaciones", clasificaciones.mean())
+    print("media de probabilidades", probas.mean(axis=0), probas.std(axis=0), np.median(probas, axis=0))
+    print("primeras clasificaciones", clasificaciones[100:105])
+    print("primeras probabilidades", probas[100:105])
+    print("primeras distorsiones", dst_pt[100:105])
+    print("primeras inestabilidades", inest_pt[100:105])
+
+    # print(rf_clf_wu.classify(X, dst_pt, inest_pt, **kwargs_classifier))
 
-    print(rf_clf_nu.classify(X, dst_pt, inest_pt, update_samePlace = False, update_dstpt=False).mean())
-    print(rf_clf_wu.classify(X, dst_pt, inest_pt, update_samePlace=True, update_dstpt=True,
-                             useRatioStats=True, useDistancesStats=True,umbral_proba=0.8).mean())
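The new use_ma path pre-smooths dst_pt with get_dstpt_MA before the distortion-based relabelling. One caveat for callers: np.convolve with mode='same' implicitly zero-pads beyond the edges, so the first and last roughly window_size/2 averages are biased low. A tiny check with a toy series (window shrunk from the 104 default for readability):

    import numpy as np

    def get_dstpt_MA(dst_pt, window_size=104, mode='same'):
        # same computation as the method added above
        return np.convolve(dst_pt, np.ones(window_size)/window_size, mode=mode)

    x = np.full(6, 4.0)
    print(get_dstpt_MA(x, window_size=4))  # [2. 3. 4. 4. 4. 3.]: interior is exact, edges sag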
sarapy/mlProcessors/PlantinFMCreator.py
CHANGED

@@ -20,7 +20,7 @@ class PlantinFMCreator(BaseEstimator, TransformerMixin):
 
     def __init__(self, imputeDistances = True, distanciaMedia:float = 1.8,
                  umbral_precision:float = 0.3, dist_mismo_lugar = 0.0, max_dist = 100,
-                 umbral_ratio_dCdP:float = 0.5, deltaO_medio = 4):
+                 umbral_ratio_dCdP:float = 0.5, deltaO_medio = 4, baseDeltaP = 10):
         """Inicializa la clase FMCreator.
 
         Args:
@@ -39,6 +39,7 @@
         self.max_dist = max_dist
         self.umbral_ratio_dCdP = umbral_ratio_dCdP
         self.deltaO_medio = deltaO_medio
+        self.baseDeltaP = baseDeltaP
 
         ##creamos un diccionario para saber la posición de cada dato dentro del array devuelto por transform()
         self._dataPositions = {"DST_PT": 0, "deltaO": 2, "ratio_dCdP": 3, "distances": 4}
@@ -74,12 +75,12 @@
 
 
         date_oprc = self.tlmDataProcessor["date_oprc",:] #datos de fecha y hora de operación
-        time_ac = self.tlmDataProcessor["TIME_AC",:] #datos de fecha y hora de operación en formato timestamp
+        time_ac = self.tlmDataProcessor["TIME_AC",:]/self.baseDeltaP #datos de fecha y hora de operación en formato timestamp
         lats = self.tlmDataProcessor["latitud",:] #latitudes de las operaciones
        longs = self.tlmDataProcessor["longitud",:] #longitudes de las operaciones
         self.dst_pt = self.tlmDataProcessor["SC_PT",:] #distorsión del plantín
         self.inest_pt = self.tlmDataProcessor["INST_PT",:] #inest
-
+
 
         ##***** OBTENEMOS LOS DATOS PARA FITEAR LOS OBJETOS Y ASÍ PROCESAR LA FM *****
 
@@ -87,7 +88,8 @@
         timeData = np.hstack((date_oprc.reshape(-1,1),time_ac.reshape(-1, 1)))
 
         self._timeDeltas = timeProcessor.fit_transform(timeData)
-
+        # print(np.median(self._timeDeltas[:,tpDP["ratio_dCdP"]]))
+
         ##fitteamos geoprocessor con las latitudes y longitudes
         points = np.hstack((lats.reshape(-1,1),longs.reshape(-1,1)))
         self._distances = geoprocessor.fit_transform(points)
@@ -113,11 +115,6 @@
         self.fit(X)
         return self.transform(X)
 
-    # @property
-    # def tlmExtracted(self):
-    #     """Devuelve los datos de telemetría extraídos."""
-    #     return self.tlmExtracted
-
     @property
     def tlmdeDP(self):
         """Devuelve el diccionario con la posición de los datos dentro del array devuelto por transform()."""
@@ -144,7 +141,7 @@ if __name__ == "__main__":
     import json
     from sarapy.preprocessing import TransformInputData
 
-    historical_data_path = "examples
+    historical_data_path = "examples\\2025-08-04\\UPM003N\\historical-data.json"
     with open(historical_data_path, 'r') as file:
         historical_data = json.load(file)
     df = pd.DataFrame(historical_data)
@@ -157,4 +154,4 @@
     fmcreator = PlantinFMCreator(imputeDistances=False)
 
     fm, dst_pt, inest_pt = fmcreator.fit_transform(X)
-    print(
+    print(np.median(fm,axis=0))
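The one behavioural change in this file is the new baseDeltaP divisor: TIME_AC is rescaled before the time deltas are fitted, so the deltas shrink by the same factor. A one-line illustration under the default baseDeltaP = 10 (the TIME_AC values are illustrative, not from the package):

    import numpy as np

    baseDeltaP = 10                         # new constructor default
    time_ac = np.array([1200, 1240, 1310])  # hypothetical raw TIME_AC values
    scaled = time_ac / baseDeltaP           # [120. 124. 131.]
    print(np.diff(scaled))                  # [4. 7.] instead of [40. 70.]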
sarapy/mlProcessors/__init__.py
CHANGED

@@ -0,0 +1,11 @@
+from .FertilizerFMCreator import FertilizerFMCreator
+from .FertilizerTransformer import FertilizerTransformer
+from .PlantinClassifier import PlantinClassifier
+from .PlantinFMCreator import PlantinFMCreator
+
+__all__ = [
+    "FertilizerFMCreator",
+    "FertilizerTransformer",
+    "PlantinClassifier",
+    "PlantinFMCreator",
+]
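These re-exports are what let the updated __main__ blocks above drop the module-qualified names: the same import line now binds the class instead of the submodule.

    # before 2.3.0 the imported name resolved to the submodule:
    from sarapy.mlProcessors import PlantinFMCreator
    fmcreator = PlantinFMCreator.PlantinFMCreator(imputeDistances=False)

    # from 2.3.0 the package __init__ re-exports the class under the same name:
    from sarapy.mlProcessors import PlantinFMCreator
    fmcreator = PlantinFMCreator(imputeDistances=False)

sarapy/preprocessing/__init__.py below gets the same treatment.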
sarapy/preprocessing/TransformInputData.py
CHANGED

@@ -137,7 +137,7 @@ if __name__ == "__main__":
     import pandas as pd
     import json
 
-    historical_data_path = "examples
+    historical_data_path = "examples\\2025-08-04\\UPM006N\\historical-data.json"
     with open(historical_data_path, 'r') as file:
         historical_data = json.load(file)
     df = pd.DataFrame(historical_data)
@@ -146,4 +146,4 @@
     data_positions = json.load(open("sarapy/preprocessing/telemetriaDataPosition.json", 'r'))
     transform_input_data = TransformInputData()
     transformed_data = transform_input_data.transform(historical_data)
-    print(transformed_data[
+    print(transformed_data[:2])
sarapy/preprocessing/__init__.py
CHANGED

@@ -1,2 +1,11 @@
-
-
+from .DistancesImputer import DistancesImputer
+from .FertilizerImputer import FertilizerImputer
+from .TransformInputData import TransformInputData
+from .TransformToOutputData import TransformToOutputData
+
+__all__ = [
+    "DistancesImputer",
+    "FertilizerImputer",
+    "TransformInputData",
+    "TransformToOutputData"
+]
sarapy/stats/__init__.py
CHANGED

@@ -1 +1,13 @@
-
+from .stats import *
+
+__all__ = [
+    "getMA",
+    "probabilidadEmpirica",
+    "penalizacion",
+    "probSaturacion",
+    "estimarKDE",
+    "saturationProbability",
+    "movingProbability",
+    "resumen_sensor",
+    "detectar_secuencia_saturada"
+]
sarapy/stats/stats.py
CHANGED

@@ -2,8 +2,7 @@ import numpy as np
 from scipy.stats import skew, kurtosis, gaussian_kde
 import pandas as pd
 import logging
-
-logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__) # ← "sarapy.stats"
 
 def getMA(data, window_size=104, mode='same'):
     """
@@ -63,7 +62,7 @@ def saturationProbability(distorsion_data, saturation_mode = "alto", umbrales =
     if distorsion_data.shape[0] == 0:
         raise ValueError("La distorsion_data no puede estar vacía.")
     if distorsion_data.shape[0] < 50:
-
+        logger.warning("La distorsion_data tiene menos de 50 elementos. Los resultados pueden no ser representativos.")
 
     ventana_filtered = distorsion_data.copy()
     if saturation_mode == "bajo":
@@ -75,7 +74,7 @@
 
     ##chequeo si la ventana filtrada está vacía
     if ventana_filtered.shape[0] == 0:
-
+        logger.warning("Ventana filtrada vacía. Se retornará 0.0.")
         return 0.0
 
     skew_val = skew(ventana_filtered)
@@ -84,13 +83,13 @@
     pena = penalizacion(alpha, skew_val, beta, kurt_val)
     ##chequeo que pena no sea nan, sino reemplazo por 1
     if np.isnan(pena):
-
+        logger.warning("La penalización es NaN. Se reemplazará por 1.")
         pena = 1.0
     # Probabilidad
     proba_empirica = ventana_filtered.shape[0]/distorsion_data.shape[0]
     prob_saturacion = proba_empirica * pena
 
-
+    logger.debug(f"Ventana filtrada: {ventana_filtered.shape[0]}, {distorsion_data.shape[0]}, {proba_empirica}, {pena}")
     return prob_saturacion
 
 def movingProbability(distorsion_data, window_size=104, **kwargs):
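Dropping the import-time basicConfig(level=logging.DEBUG) is the conventional fix for a library: importing sarapy.stats no longer reconfigures global logging, and the warning/debug lines above only appear if the application opts in. A sketch of restoring the old verbosity from calling code, relying on the standard logger hierarchy (whatever exact name getLogger(__name__) resolves to, configuring the "sarapy" parent covers it):

    import logging
    import sarapy.stats  # the import no longer touches logging configuration

    logging.basicConfig(level=logging.DEBUG)               # now the application's decision
    logging.getLogger("sarapy").setLevel(logging.WARNING)  # or dial the package back down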
sarapy/utils/__init__.py
CHANGED
sarapy/utils/utils.py
ADDED

@@ -0,0 +1,172 @@
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from sarapy.analysis.FeaturesResume import FeaturesResume
+
+def dataMerging(historical_data, post_processing_data, raw_data, nodoName = None, newColumns = False, asDF = False):
+    """
+    Función para tomar historical_data y post_processing_data y formar una
+    sóla lista de diccionarios (json)
+
+    Si newColumns es False la función reemplaza los valores de tag_seedling y tag_fertilizer de historical_data,
+    sino genera dos nuevos campos llamados tag_seedling_classified y tag_fertilizer_estimated en historical_data.
+
+    Args:
+        - historical_data (list): Lista de diccionarios con datos históricos (tipo json)
+        - post_processing_data (list): Lista de diccionarios con datos de post-procesamiento (tipo json)
+        - nodoName (str|None): Nombre del nodo al que pertenecen los datos. Por defecto es None
+        - newColumns (bool): Indica si se deben crear nuevas columnas en lugar de reemplazar las existentes.
+        - asDF (bool): Indica si se debe retornar como un dataframe o no
+    """
+    #chequeo que historical_data y post_processing_data sean del mismo tamaño, sino rais
+    if len(historical_data) != len(post_processing_data):
+        raise ValueError("Las listas de datos históricos y de post-procesamiento no son del mismo tamaño.")
+
+    final_data = pd.DataFrame(historical_data)
+    post_data = pd.DataFrame(post_processing_data)
+    raw_data = pd.DataFrame(raw_data)
+
+    final_data['raw_tag_seedling'] = raw_data['raw_tag_seedling']
+    final_data['raw_tag_fertilizer'] = raw_data['raw_tag_fertilizer']
+
+    if not newColumns:
+        final_data['tag_seedling'] = post_data['tag_seedling']
+        final_data['tag_fertilizer'] = post_data['tag_fertilizer']
+    else:
+        final_data['tag_seedling_classified'] = post_data['tag_seedling']
+        final_data['tag_fertilizer_estimated'] = post_data['tag_fertilizer']
+
+    if nodoName:
+        final_data['nodo'] = nodoName
+
+    #retorno como lista de diccionarios (json)
+    if not asDF:
+        return final_data.to_dict(orient='records')
+    else:
+        return final_data
+
+def getOutliersThresholds(data, q1 = 0.25, q3 = 0.75, k = 1.5):
+    """Cálculo de los límites para detectar outliers a partir del rango intercuartil
+
+    data: array con los datos
+    q1: primer cuartil
+    q3: tercer cuartil
+    k: factor de escala
+    """
+    # Calculo del rango intercuartil
+    q1 = np.quantile(data, q1)
+    q3 = np.quantile(data, q3)
+    iqr = q3 - q1
+
+    # Cálculo de los límites
+    lower = q1 - k * iqr
+    upper = q3 + k * iqr
+
+    return lower, upper
+
+
+def countingZeros(array: List[int], minimos_seguidos: int = 3) -> List[Tuple[int, int]]:
+    """
+    Cuenta ceros consecutivos en un array binario (0s y 1s), retornando una lista de tuplas.
+    Cada tupla (n, k) indica que se encontraron 'n' secuencias de 'k' ceros consecutivos,
+    siempre que k >= minimos_seguidos.
+
+    Parameters:
+        array (List[int]): Lista binaria de 0s y 1s.
+        minimos_seguidos (int): Mínimo de ceros consecutivos a considerar.
+
+    Returns:
+        List[Tuple[int, int]]: Lista de tuplas (n, k), ordenadas por k.
+    """
+    contador = 0
+    resultados = {}
+    indexes = []
+    for i, val in enumerate(array):
+        if val == 0:
+            contador += 1
+            indexes.append(i)
+        else:
+            if contador >= minimos_seguidos:
+                if contador in resultados.keys():
+                    resultados[contador][0] += 1
+                    resultados[contador][1] += (indexes,)
+                    indexes = []
+                else:
+                    resultados[contador] = [1, (indexes,)]
+                    indexes = []
+            contador = 0
+
+    # Por si la secuencia termina en ceros
+    if contador >= minimos_seguidos:
+        if contador in resultados.keys():
+            resultados[contador][0] += 1
+            resultados[contador][1] += (indexes,)
+            indexes = []
+        else:
+            resultados[contador] = [1, (indexes,)]
+
+    # retorna [cantidad de ocurrencias, longitud de ceros, indices de ocurrencias]
+    return sorted([(v[0], k, v[1]) for k, v in resultados.items()])
+
+def get_lat_long_from_indices(df: pd.DataFrame, indices: List[List[int]]) -> Tuple[float, float]:
+    """
+    Obtiene la latitud y longitud a partir de una lista de índices en un DataFrame.
+
+    Parameters:
+        df (pd.DataFrame): DataFrame que contiene las columnas 'latitude' y 'longitude'.
+        indices (List[int]): Lista de listas de índices para buscar las coordenadas.
+
+    Returns:
+        Tuple[float, float]: Tupla con la latitud y longitud correspondientes.
+    """
+    latitudes = []
+    longitudes = []
+    nodos = []
+    for index_list in indices:
+        for index in index_list:
+            latitudes.append(df.iloc[index]["latitude"])
+            longitudes.append(df.iloc[index]["longitude"])
+            nodos.append(df.iloc[index]["nodo"])
+    return [nodos, latitudes, longitudes]
+
+
+def readingFolders(raiz: str | Path, ignorar_ocultas: bool = True, ordenar: bool = True) -> list[str]:
+    raiz = Path(raiz)
+    if not raiz.is_dir():
+        raise NotADirectoryError(f"La ruta no es una carpeta: {raiz}")
+
+    nombres = [p.name for p in raiz.iterdir() if p.is_dir()]
+    if ignorar_ocultas:
+        nombres = [n for n in nombres if not n.startswith(".")]
+    if ordenar:
+        nombres.sort()
+    return nombres
+
+def computar_resumenes_por_filtro(nodos_ok, merged_cache, filtro, outliers):
+    """
+    Función para computar resúmenes filtrados por un criterio específico.
+    """
+    conteos, resumenes, dstp_ptmas, delta_dcdp, time_ac = {}, {}, {}, {}, {}
+    for nodo in nodos_ok:
+        fr = FeaturesResume(merged_cache[nodo], info=nodo, filtrar=filtro)
+        fr.removeOutliers(outliers)
+        conteos[nodo] = fr.data["tag_seedling"].value_counts(normalize=True)
+        resumenes[nodo] = fr.getResume(to="all")
+        dstp_ptmas[nodo] = fr.getSensorMA()
+        delta_dcdp[nodo] = fr.data["ratio_dCdP"]
+        time_ac[nodo] = fr.data["time_ac"]
+    return conteos, resumenes
+
+def metricas_desde_resumenes(nodos_ok, resumenes, stats):
+    """Devuelve dict nombre_metrica -> vector numpy en el orden de nodos_ok."""
+    return {
+        "nodo": [n for n in nodos_ok],
+        "time_ac": np.array([resumenes[n]["time_ac"][stats] for n in nodos_ok]),
+        "deltaO": np.array([resumenes[n]["deltaO"][stats] for n in nodos_ok]),
+        "ratio_dCdP":np.array([resumenes[n]["ratio_dCdP"][stats]for n in nodos_ok]),
+        "precision": np.array([resumenes[n]["precision"][stats] for n in nodos_ok]),
+        "distances": np.array([resumenes[n]["distances"][stats] for n in nodos_ok]),
+        "dst_pt": np.array([resumenes[n]["dst_pt"][stats] for n in nodos_ok]),
+    }
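Two of the new helpers are easy to sanity-check by hand. getOutliersThresholds is the standard IQR fence; countingZeros, note, actually returns a triple per run length (occurrences, run length, tuple of index lists), one element more than the (n, k) pair its docstring promises. A quick check, assuming the code behaves as written above:

    import numpy as np
    from sarapy.utils.utils import countingZeros, getOutliersThresholds

    # IQR fences with the defaults q1=0.25, q3=0.75, k=1.5
    lower, upper = getOutliersThresholds(np.arange(1, 11))
    print(lower, upper)  # -3.5 14.5

    print(countingZeros([0, 0, 0, 1, 0, 0, 0, 0, 1], minimos_seguidos=3))
    # [(1, 3, ([0, 1, 2],)), (1, 4, ([4, 5, 6, 7],))]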
sarapy/version.py
CHANGED

@@ -1,2 +1,2 @@
-## Version of the package
-__version__ = "2.1.1"
+## Version of the package
+__version__ = "2.3.0"
|