frogml-core 0.0.72__py3-none-any.whl → 0.0.74__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. frogml_core/__init__.py +1 -1
  2. frogml_core/clients/feature_store/offline_serving_client.py +31 -29
  3. frogml_core/clients/location_discovery/__init__.py +1 -0
  4. frogml_core/clients/location_discovery/client.py +73 -0
  5. frogml_core/feature_store/_common/functions.py +0 -19
  6. frogml_core/feature_store/offline/__init__.py +1 -2
  7. frogml_core/inner/model_loggers_utils.py +8 -22
  8. frogml_core/model_loggers/artifact_logger.py +7 -2
  9. frogml_core/model_loggers/data_logger.py +11 -6
  10. {frogml_core-0.0.72.dist-info → frogml_core-0.0.74.dist-info}/METADATA +1 -1
  11. {frogml_core-0.0.72.dist-info → frogml_core-0.0.74.dist-info}/RECORD +21 -16
  12. frogml_proto/qwak/service_discovery/service_discovery_location_pb2.py +65 -0
  13. frogml_proto/qwak/service_discovery/service_discovery_location_pb2.pyi +73 -0
  14. frogml_proto/qwak/service_discovery/service_discovery_location_pb2_grpc.py +4 -0
  15. frogml_proto/qwak/service_discovery/service_discovery_location_service_pb2.py +49 -0
  16. frogml_proto/qwak/service_discovery/service_discovery_location_service_pb2.pyi +41 -0
  17. frogml_proto/qwak/service_discovery/service_discovery_location_service_pb2_grpc.py +231 -0
  18. frogml_services_mock/mocks/frogml_mocks.py +4 -0
  19. frogml_services_mock/mocks/location_discovery_service_api.py +104 -0
  20. frogml_services_mock/services_mock.py +16 -4
  21. frogml_core/feature_store/offline/_query_engine.py +0 -32
  22. frogml_core/feature_store/offline/athena/__init__.py +0 -0
  23. frogml_core/feature_store/offline/athena/athena_query_engine.py +0 -154
  24. frogml_core/feature_store/offline/client.py +0 -721
  25. {frogml_core-0.0.72.dist-info → frogml_core-0.0.74.dist-info}/WHEEL +0 -0
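The headline change in this diff is the removal of the deprecated v1 offline client (`frogml_core/feature_store/offline/client.py`, deleted in full below) along with its Athena query engine. The deleted code's own deprecation warnings point to `OfflineClientV2`. As a migration sketch only: the import path and signature below are assumptions inferred from those warnings and the v1 docstring examples, not confirmed by this diff.

```python
# Hypothetical migration sketch -- OfflineClientV2's import path and exact
# signature are assumed from the deprecation warnings in the deleted client.
import pandas as pd

from frogml_core.feature_store.offline import OfflineClientV2  # assumed path

# Population frame mirrors the example in the deleted v1 docstring.
population_df = pd.DataFrame(
    columns=["uuid", "timestamp"],
    data=[["1", "2021-01-02 17:00:00"],
          ["2", "2021-01-01 12:00:00"]],
)
key_to_features = {"uuid": ["user_purchases.number_of_purchases"]}

client = OfflineClientV2()
train_df = client.get_feature_values(
    entity_key_to_features=key_to_features,
    population=population_df,
    point_in_time_column_name="timestamp",
)
```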
@@ -1,721 +0,0 @@
- from collections import defaultdict
- from datetime import date, datetime
- from functools import reduce
- from typing import DefaultDict, Dict, List, Tuple, Union
-
- from frogml_core.clients.administration.eco_system.client import EcosystemClient
-
- try:
-     import pandas as pd
- except ImportError:
-     pass
-
- from warnings import warn
-
- from dateutil.parser import ParserError
-
- from frogml_core.exceptions import FrogmlException
- from frogml_core.feature_store._common.functions import normalize_cols
- from frogml_core.feature_store.offline.athena.athena_query_engine import (
-     AthenaQueryEngine,
- )
-
-
- class OfflineClient:
-     """
-     A class used to retrieve data from the offline store - mainly used to get training data for models.
-     It requires frogml configuration and AWS access.
-     @deprecated
-     """
-
-     # Default SCD parameters of the feature store ETL
-     ANALYTICS_DB_PREFIX = "qwak_analytics_feature_store"
-     FS_DB_PREFIX = "qwak_feature_store"
-     FS_TABLE_NAME_PREFIX = "offline_feature_store"
-     FS_START_TIME_COLUMN = "start_timestamp"
-     FS_END_TIME_COLUMN = "end_timestamp"
-     FEATURE_STORE_PREFIX = "feature_store"
-
-     DEFAULT_NUMBER_OF_SAMPLE_DATA_ROWS = "100"
-
-     def __init__(
-         self,
-         query_engine=None,
-         environment_id=None,
-     ):
-         warn(
-             "This client will be deprecated soon; please use OfflineClientV2 instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         self.query_engine = (
-             query_engine if query_engine is not None else AthenaQueryEngine()
-         )
-         self.quotes = self.query_engine.get_quotes()
-         if environment_id is None:
-             user_context = EcosystemClient().get_authenticated_user_context().user
-             environment_id = (
-                 user_context.account_details.default_environment_id.replace("-", "_")
-             )
-
-         self.environment_id = environment_id.replace("-", "_")
-         self.FS_DB_NAME = self.FS_DB_PREFIX + "_" + self.environment_id
-         self.FS_ANALYTICS_DB_NAME = self.ANALYTICS_DB_PREFIX + "_" + self.environment_id
-
-     def get_feature_range_values(
-         self,
-         entity_key_to_features: dict,
-         start_date: Union[datetime, date],
-         end_date: Union[datetime, date],
-     ):
-         """
-         :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-         :param start_date: the start of the requested date range
-         :param end_date: the end of the requested date range
-         :return: a pandas dataframe or a list of dataframes (a dataframe for every entity_key) - all feature values
-                  for all entities under the given date range
-         @deprecated
-
-         Each row in the returned data-frame is constructed by retrieving the requested features of the entity
-         key(s) for all entity values within the defined date range.
-
-         Features should be named [Feature Set Name].[Feature Name],
-         e.g. user_purchases.number_of_purchases.
-
-         Examples:
-         >>> from datetime import datetime
-         >>> from frogml_core.feature_store.offline import OfflineClient
-         >>>
-         >>> start_date = datetime(year=2021, month=1, day=1)
-         >>> end_date = datetime(year=2021, month=1, day=3)
-         >>>
-         >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-         >>>                             'user_purchases.avg_purchase_amount']}
-         >>>
-         >>> offline_feature_store = OfflineClient()
-         >>>
-         >>> train_df = offline_feature_store.get_feature_range_values(
-         >>>     entity_key_to_features=key_to_features,
-         >>>     start_date=start_date,
-         >>>     end_date=end_date)
-         >>>
-         >>> print(train_df.head())
-         >>> #   uuid            timestamp  user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-         >>> # 0    1  2021-01-02 17:00:00                                   76                             4.796842
-         >>> # 1    1  2021-01-01 12:00:00                                    5                             1.548000
-         >>> # 2    2  2021-01-02 12:00:00                                    5                             5.548000
-         >>> # 3    2  2021-01-01 18:00:00                                    5                             2.788000
-         """
-         warn(
-             "This method will be deprecated soon; please use OfflineClientV2.get_feature_range_values() instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         try:
-             from frogml_core.feature_store._common.featureset_asterisk_handler import (
-                 unpack_asterisk_features_from_key_mapping,
-             )
-
-             entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                 entity_key_to_features, lambda: self
-             )
-             self._validate_range_query_inputs(
-                 entity_key_to_features, start_date, end_date
-             )
-
-             feature_set_name_to_feature_list = (
-                 self._partition_feature_set_by_entity_feature(entity_key_to_features)
-             )
-             feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                 feature_set_name_to_feature_list
-             )
-             self._validate_features_exist(
-                 feature_set_name_to_feature_list, feature_set_to_dtypes
-             )
-
-             df = self._run_ranges_query(
-                 feature_set_name_to_feature_list, start_date, end_date
-             )
-
-             return self._normalize_df(df)
-
-         except FrogmlException as qwak_exception:
-             raise FrogmlException(
-                 f"Got the following Qwak generated exception: {qwak_exception}"
-             )
-         except Exception as e:
-             raise FrogmlException(f"Got the following run-time exception: {e}")
-         finally:
-             try:
-                 self.query_engine.cleanup()
-             except Exception as e:
-                 print(f"Got the following run-time exception during cleanup: {e}")
-
-     def get_feature_values(
-         self,
-         entity_key_to_features: dict,
-         population: "pd.DataFrame",
-         point_in_time_column_name: str = "timestamp",
-     ):
-         """
-         :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-         :param population: a pandas data-frame with a point in time column
-                and a column for each entity key defined at entity_key_to_features.
-         :param point_in_time_column_name: the column name of the point in time column (default - timestamp)
-         :return: a pandas data-frame - the population joined with the feature values for all
-                  the requested entities and features.
-         @deprecated
-
-         Each row in the returned data-frame is constructed by retrieving the requested features of the entity key(s) for
-         the specific entity value(s) in the population and on the specific point in time defined.
-
-         Features should be named [Feature Set Name].[Feature Name],
-         e.g. user_purchases.number_of_purchases.
-
-         Examples:
-         >>> import pandas as pd
-         >>> from frogml_core.feature_store.offline import OfflineClient
-         >>>
-         >>> population_df = pd.DataFrame(
-         >>>     columns=['uuid', 'timestamp'],
-         >>>     data=[['1', '2021-01-02 17:00:00'],
-         >>>           ['2', '2021-01-01 12:00:00']])
-         >>>
-         >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-         >>>                             'user_purchases.avg_purchase_amount']}
-         >>>
-         >>> offline_feature_store = OfflineClient()
-         >>>
-         >>> train_df = offline_feature_store.get_feature_values(
-         >>>     entity_key_to_features=key_to_features,
-         >>>     population=population_df,
-         >>>     point_in_time_column_name='timestamp')
-         >>>
-         >>> print(train_df.head())
-         >>> #   uuid            timestamp  user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-         >>> # 0    1  2021-01-02 17:00:00                                   76                             4.796842
-         >>> # 1    2  2021-01-01 12:00:00                                    5                             1.548000
-         """
-         warn(
-             "This method will be deprecated soon; please use OfflineClientV2.get_feature_values() instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         import pandas as pd
-
-         try:
-             from frogml_core.feature_store._common.featureset_asterisk_handler import (
-                 unpack_asterisk_features_from_key_mapping,
-             )
-
-             population = population.copy()
-
-             entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                 entity_key_to_features, lambda: self
-             )
-
-             self._validate_point_in_time_query_inputs(
-                 entity_key_to_features, population, point_in_time_column_name
-             )
-
-             feature_set_name_to_feature_dict = (
-                 self._partition_feature_set_by_entity_feature(entity_key_to_features)
-             )
-
-             feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                 feature_set_name_to_feature_dict
-             )
-
-             self._validate_features_exist(
-                 feature_set_name_to_feature_dict,
-                 feature_set_to_dtypes,
-             )
-
-             population = self._align_entity_key_dtype(
-                 feature_set_to_dtypes, entity_key_to_features, population
-             )
-
-             uploaded_population_path = self.query_engine.upload_table(population)
-
-             df = pd.DataFrame()
-
-             if feature_set_name_to_feature_dict:
-                 df = self._run_point_in_time_query(
-                     feature_set_name_to_feature_dict,
-                     uploaded_population_path,
-                     point_in_time_column_name,
-                     [column.lower() for column in population.columns],
-                 )
-
-             return self._normalize_df(df)
-
-         except FrogmlException as qwak_exception:
-             raise FrogmlException(
-                 f"Got the following Qwak generated exception: {qwak_exception}"
-             )
-         except Exception as e:
-             raise FrogmlException(f"Got the following run-time exception: {e}")
-         finally:
-             try:
-                 self.query_engine.cleanup()
-             except Exception as e:
-                 print(f"Got the following run-time exception during cleanup: {e}")
-
-     @staticmethod
-     def _normalize_df(df: "pd.DataFrame") -> "pd.DataFrame":
-         columns = df.columns.values.tolist()
-         new_columns = normalize_cols(columns)
-         df.columns = new_columns
-         return df
-
-     @staticmethod
-     def _validate_range_query_inputs(
-         entity_key_to_features: dict, start_date: datetime, end_date: datetime
-     ):
-         missing_features_entity_keys = [
-             entity_key
-             for entity_key, features in entity_key_to_features.items()
-             if not features
-         ]
-
-         if missing_features_entity_keys:
-             raise FrogmlException(
-                 f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-             )
-
-         if (end_date - start_date).total_seconds() < 0:
-             raise FrogmlException("Invalid date range - end date is before start date")
-
-     @staticmethod
-     def _validate_point_in_time_query_inputs(
-         entity_key_to_features: dict,
-         population: "pd.DataFrame",
-         point_in_time_column_name: str,
-     ):
-         """
-         Validates the entity keys, point in time column, and requested features against the population dataframe
-         """
-         missing_keys = [
-             entity_key
-             for entity_key in entity_key_to_features.keys()
-             if entity_key not in population
-         ]
-         if missing_keys:
-             raise FrogmlException(
-                 f"The entity keys must be in population_df columns, missing: [{missing_keys}]"
-             )
-
-         missing_features_entity_keys = [
-             entity_key
-             for entity_key, features in entity_key_to_features.items()
-             if not features
-         ]
-
-         if missing_features_entity_keys:
-             raise FrogmlException(
-                 f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-             )
-
-         if point_in_time_column_name not in population:
-             raise FrogmlException(
-                 "The point in time column must be part of the population dataframe"
-             )
-
-         from pandas.api.types import is_datetime64_any_dtype
-
-         if not is_datetime64_any_dtype(population[point_in_time_column_name]):
-             try:
-                 population[point_in_time_column_name] = pd.to_datetime(
-                     population[point_in_time_column_name]
-                 )
-             except ParserError as e:
-                 raise FrogmlException(
-                     f"It was not possible to cast the provided point in time column to datetime"
-                     f"\nError message: {e}"
-                 )
-
-     @staticmethod
-     def _partition_feature_set_by_entity_feature(
-         entity_key_to_features,
-     ) -> DefaultDict[Tuple[str, str], List[str]]:
-         """
-         Partition features by entity key and feature set name
-         Args:
-             entity_key_to_features: dict of entity_key -> full feature name
-         Returns:
-             dict of (entity_key, featureset_name) -> list of feature names
-         """
-         feature_name_to_feature_list = defaultdict(list)
-
-         for entity_key, feature_list in entity_key_to_features.items():
-             for feature in feature_list:
-                 split_feature_set_and_feature = feature.lower().split(".")
-                 if len(split_feature_set_and_feature) != 2:
-                     raise FrogmlException(
-                         f"Failed to verify features. Name should be: <feature set name>.<feature name>. "
-                         f"Current name is: {feature}"
-                     )
-                 feature_set_name = split_feature_set_and_feature[0]
-                 feature_name_to_feature_list[(entity_key, feature_set_name)].append(
-                     feature
-                 )
-
-         return feature_name_to_feature_list
-
-     def _validate_database_and_get_feature_set_dtypes(
-         self, feature_name_to_feature_list: DefaultDict[Tuple[str, str], List[str]]
-     ) -> Dict[Tuple[str, str], List[Tuple[str, str]]]:
-         """
-         Args:
-             feature_name_to_feature_list: dictionary from feature set name to its features
-
-         Returns:
-             dictionary from feature set name and entity key to a list of (feature name, feature type)
-         """
-         if self.FS_DB_NAME not in self._fs_db_names():
-             raise FrogmlException("Offline feature store does not contain any data")
-
-         feature_set_to_dtypes = {}
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_list in feature_name_to_feature_list.items():
-             table_name = self._get_offline_feature_store_full_name(feature_set_name)
-             if table_name not in self._fs_tables_names():
-                 raise FrogmlException(
-                     f"[{feature_set_name}] feature set does not contain any data"
-                 )
-
-             columns_query_result = self.query_engine.run_query(
-                 f"SELECT * FROM INFORMATION_SCHEMA.COLUMNS "  # nosec B608
-                 f"WHERE TABLE_SCHEMA = '{self.FS_DB_NAME}' "
-                 f"AND TABLE_NAME = '{table_name}'"
-             )
-
-             feature_set_to_dtypes[(entity_key, feature_set_name)] = [
-                 (column_tup[3], column_tup[7]) for column_tup in columns_query_result
-             ]
-         return feature_set_to_dtypes
-
-     @staticmethod
-     def _validate_features_exist(
-         feature_name_to_feature_list: defaultdict,
-         feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-     ):
-         """
-         Args:
-             feature_name_to_feature_list: dictionary from feature set name to its features
-             feature_set_to_dtypes: dictionary from feature set name and entity key
-                                    to a list of (feature name, feature type)
-         """
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_list in feature_name_to_feature_list.items():
-             columns = [
-                 column_tuple[0].lower()
-                 for column_tuple in feature_set_to_dtypes[
-                     (entity_key, feature_set_name)
-                 ]
-             ]
-             absent_features = [
-                 feature for feature in feature_list if feature not in columns
-             ]
-             if absent_features:
-                 raise FrogmlException(
-                     f"Missing the following features for the feature set "
-                     f"[{feature_set_name}]:"
-                     f"\n{absent_features}"
-                 )
-
-     def _align_entity_key_dtype(
-         self,
-         feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-         entity_key_to_features: Dict[str, List[str]],
-         population: "pd.DataFrame",
-     ) -> "pd.DataFrame":
-         """
-         Args:
-             feature_set_to_dtypes: dictionary from feature set name and entity key
-                                    to a list of (feature name, feature type)
-             entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-             population: a pandas data-frame with a point in time column
-                         and a column for each entity key defined at entity_key_to_features.
-         Returns:
-             entity type aligned population df
-         """
-         sql_type_to_pandas = {
-             "string": "string",
-             "integer": "int",
-             "varchar": "string",
-             "text": "string",
-             "bigint": "int",
-         }
-
-         entity_key_to_dtype = self._validate_and_get_entity_keys_dtypes(
-             entity_key_to_features, population
-         )
-
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_dtypes_list in feature_set_to_dtypes.items():
-             given_entity_key_dtype = entity_key_to_dtype[entity_key]
-             entity_column_tuple = [
-                 column_tuple
-                 for column_tuple in feature_dtypes_list
-                 if column_tuple[0] == entity_key
-             ]
-             if not entity_column_tuple:
-                 raise FrogmlException(
-                     f"Did not find entity key [{entity_key}] in the table of [{feature_set_name}] "
-                     f"- existing columns are: "
-                     f"{[column_tuple[0] for column_tuple in feature_dtypes_list]}"
-                 )
-             actual_entity_type = entity_column_tuple[0][1]
-             if actual_entity_type == given_entity_key_dtype:
-                 continue
-             else:
-                 try:
-                     population[entity_key] = population[entity_key].astype(
-                         sql_type_to_pandas[actual_entity_type]
-                     )
-                     print(
-                         f"Entity [{entity_key}] given type [{given_entity_key_dtype}] "
-                         f"was not aligned with actual type [{actual_entity_type}] - cast to the correct type"
-                     )
-                 except ValueError as e:
-                     raise FrogmlException(
-                         f"Mismatched entity type for [{entity_key}] - [{given_entity_key_dtype}] "
-                         f"- failed to cast to actual type [{actual_entity_type}], Error: {e}"
-                     )
-
-         return population
-
-     def _validate_and_get_entity_keys_dtypes(
-         self,
-         entity_key_to_features: Dict[str, List[str]],
-         population_df: "pd.DataFrame",
-     ) -> Dict[str, str]:
-         """
-         Args:
-             entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-             population_df: a pandas data-frame with a point in time column
-                            and a column for each entity key defined at entity_key_to_features.
-
-         Returns:
-             dictionary of entity key to its dtype
-         """
-         supported_dtypes_to_actual_type = {
-             "object": "string",
-             "int32": "integer",
-             "int64": "integer",
-         }
-         entity_key_to_dtype = {}
-         for entity_key in entity_key_to_features.keys():
-             entity_pandas_dtype = population_df.dtypes[entity_key].name
-             if entity_pandas_dtype not in supported_dtypes_to_actual_type:
-                 raise FrogmlException(
-                     f"Got an unsupported dtype for the entity key "
-                     f"[{entity_key}] - [{entity_pandas_dtype}]"
-                 )
-             entity_key_to_dtype[entity_key] = supported_dtypes_to_actual_type[
-                 entity_pandas_dtype
-             ]
-         return entity_key_to_dtype
-
-     def _run_ranges_query(
-         self,
-         feature_name_to_feature_list: defaultdict,
-         start_date: datetime,
-         end_date: datetime,
-     ):
-         result_dfs = []
-         features_set_by_entity = defaultdict(lambda: defaultdict(set))
-         for (
-             (entity_key, feature_set_name),
-             feature_list,
-         ) in feature_name_to_feature_list.items():
-             for feature in feature_list:
-                 feature_set_name = feature.split(".")[0]
-                 features_set_by_entity[entity_key][feature_set_name].add(feature)
-
-         for entity_key, features_dict in features_set_by_entity.items():
-             entity_dfs = []
-             for feature_set_name, feature_list in features_dict.items():
-                 offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                     feature_set_name
-                 )
-
-                 features = ", ".join(
-                     [
-                         f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes}"
-                         for feature in feature_list
-                     ]
-                 )
-
-                 where_part = (
-                     "WHERE "
-                     f"{table_path}.{self.FS_START_TIME_COLUMN} >= timestamp '{start_date}' "
-                     f"AND ({self.FS_END_TIME_COLUMN} <= "
-                     f"timestamp '{end_date}' OR {table_path}.{self.FS_END_TIME_COLUMN} IS NULL) "
-                     f"AND {table_path}.{self.FS_START_TIME_COLUMN} < timestamp '{end_date}'"
-                 )
-
-                 full_sql = (
-                     f"SELECT {offline_feature_store_full_path}.{self.quotes}{entity_key}{self.quotes}, "  # nosec B608
-                     f"{offline_feature_store_full_path}.{self.quotes}{self.FS_START_TIME_COLUMN}{self.quotes}, "
-                     f"{features} "
-                     f"FROM {offline_feature_store_full_path} "
-                     f"{where_part}"
-                 )
-
-                 entity_dfs.append(self.query_engine.read_pandas_from_query(full_sql))
-
-             entity_final_df = reduce(
-                 lambda left, right: pd.merge(
-                     left, right, on=[entity_key, self.FS_START_TIME_COLUMN], how="outer"
-                 ),
-                 entity_dfs,
-             )
-             result_dfs.append(entity_final_df.reset_index(drop=True))
-
-         return result_dfs[0] if len(result_dfs) == 1 else result_dfs
-
-     def _run_point_in_time_query(
-         self,
-         feature_name_to_feature_list: defaultdict,
-         uploaded_population_path: str,
-         point_in_time_column_name: str,
-         population_list: list,
-     ) -> "pd.DataFrame":
-         """
-         Creates the SQL query for point in time feature fetching based on the population and requested features
-         """
-         dfs = []
-
-         for index, ((entity_key, feature_set_name), feature_list) in enumerate(
-             feature_name_to_feature_list.items()
-         ):
-             offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                 feature_set_name
-             )
-
-             join_part = self._get_join_population_sql(
-                 entity_key, offline_feature_store_full_path, uploaded_population_path
-             )
-
-             point_in_time_column_full_path = (
-                 f"{uploaded_population_path}.{point_in_time_column_name}"
-             )
-
-             where_part = (
-                 "WHERE "
-                 f"{point_in_time_column_full_path} >= "
-                 f"{table_path}.{self.FS_START_TIME_COLUMN} "
-                 f"AND ({point_in_time_column_full_path} < "
-                 f"{table_path}.{self.FS_END_TIME_COLUMN} OR "
-                 f"{table_path}.{self.FS_END_TIME_COLUMN} IS NULL)"
-             )
-
-             features = ", ".join(
-                 [
-                     f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                     for feature in feature_list
-                 ]
-             )
-
-             final_query_features = ", ".join(
-                 [
-                     f"filtered_features.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                     for feature in feature_list
-                 ]
-             )
-
-             full_sql = (
-                 f"WITH "  # nosec B608
-                 "filtered_features AS ( "
-                 f"SELECT {uploaded_population_path}.*, {features} "
-                 f"FROM {uploaded_population_path} "
-                 f"{join_part} "
-                 f"{where_part} "
-                 ") "
-                 f"SELECT population.*, {final_query_features} "
-                 f"FROM {uploaded_population_path} population "
-                 f"LEFT JOIN filtered_features "
-                 f"ON population.{entity_key} = filtered_features.{entity_key} "
-                 f"AND population.{point_in_time_column_name} = filtered_features.{point_in_time_column_name}"
-             )
-
-             dfs.append(
-                 self.query_engine.read_pandas_from_query(
-                     full_sql, [point_in_time_column_name]
-                 )
-             )
-         return self._merge_query_dataframes_results(dfs, population_list)
-
-     @staticmethod
-     def _merge_query_dataframes_results(
-         dfs: List["pd.DataFrame"], population_list
-     ) -> "pd.DataFrame":
-         """
-         Merges query result dataframes according to the population list columns
-         """
-         if dfs:
-             df_final = reduce(
-                 lambda left, right: pd.merge(
-                     left, right, on=population_list, how="outer"
-                 ),
-                 dfs,
-             )
-
-             ordered_cols = population_list + (
-                 df_final.columns.drop(population_list).tolist()
-             )
-
-             return df_final[ordered_cols].reset_index(drop=True)
-         else:
-             return pd.DataFrame()
-
-     def _get_join_population_sql(
-         self, entity_key, offline_feature_store_full_path, uploaded_population_path
-     ):
-         """
-         Returns a JOIN clause for the uploaded population table
-         """
-         join_part = (
-             f"LEFT JOIN {offline_feature_store_full_path} ON "
-             f"{offline_feature_store_full_path}.{entity_key} = "
-             f"{uploaded_population_path}.{entity_key} "
-         )
-         return join_part
-
-     def get_fs_full_path(self, feature_set_name: str) -> Tuple[str, str]:
-         offline_feature_store_table_name = self._get_offline_feature_store_full_name(
-             feature_set_name
-         )
-         table_path = f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-         offline_feature_store_full_path = (
-             f"{self.quotes}{self.FS_DB_NAME}{self.quotes}."
-             f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-         )
-         return offline_feature_store_full_path, table_path
-
-     def _fs_db_names(self) -> List[str]:
-         return [
-             database_tuple[0]
-             for database_tuple in self.query_engine.run_query("SHOW SCHEMAS")
-         ]
-
-     def _fs_tables_names(self) -> List[str]:
-         return [
-             table_tuple[0]
-             for table_tuple in self.query_engine.run_query(
-                 f"SHOW TABLES IN {self.FS_DB_NAME}"
-             )
-         ]
-
-     def _get_offline_feature_store_full_name(self, feature_set_name: str) -> str:
-         return f"{self.FS_TABLE_NAME_PREFIX}_{feature_set_name}".lower().replace(
-             "-", "_"
-         )
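For readers tracing the deleted logic: `_run_point_in_time_query` joined the uploaded population against each feature-set table and kept only rows whose point in time fell inside the SCD validity window (`start_timestamp` to `end_timestamp`). Below is a reconstruction of the generated SQL's shape, with placeholder identifiers standing in for the database, table, and population paths the client derived at runtime.

```python
# Reconstructed shape of the point-in-time query; "population", the database
# name, and the feature names are placeholders, not literal runtime values.
point_in_time_sql = """
WITH filtered_features AS (
    SELECT population.*,
           fs."user_purchases.number_of_purchases" AS "user_purchases.number_of_purchases"
    FROM population
    LEFT JOIN "qwak_feature_store_<env>"."offline_feature_store_user_purchases" fs
        ON fs.uuid = population.uuid
    WHERE population.timestamp >= fs.start_timestamp
      AND (population.timestamp < fs.end_timestamp OR fs.end_timestamp IS NULL)
)
SELECT population.*,
       filtered_features."user_purchases.number_of_purchases"
FROM population
LEFT JOIN filtered_features
    ON population.uuid = filtered_features.uuid
   AND population.timestamp = filtered_features.timestamp
"""
```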