qwak-core 0.4.246__py3-none-any.whl → 0.4.248__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. _qwak_proto/qwak/service_discovery/service_discovery_location_pb2.py +65 -0
  2. _qwak_proto/qwak/service_discovery/service_discovery_location_pb2.pyi +73 -0
  3. _qwak_proto/qwak/service_discovery/service_discovery_location_pb2_grpc.py +4 -0
  4. _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2.py +49 -0
  5. _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2.pyi +41 -0
  6. _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2_grpc.py +231 -0
  7. qwak/__init__.py +1 -1
  8. qwak/clients/feature_store/offline_serving_client.py +29 -4
  9. qwak/clients/location_discovery/__init__.py +1 -0
  10. qwak/clients/location_discovery/client.py +73 -0
  11. qwak/feature_store/_common/functions.py +0 -19
  12. qwak/feature_store/offline/__init__.py +1 -2
  13. qwak/inner/model_loggers_utils.py +8 -20
  14. qwak/model_loggers/artifact_logger.py +7 -2
  15. qwak/model_loggers/data_logger.py +11 -6
  16. {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/METADATA +1 -1
  17. {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/RECORD +21 -17
  18. qwak_services_mock/mocks/location_discovery_service_api.py +104 -0
  19. qwak_services_mock/mocks/qwak_mocks.py +4 -0
  20. qwak_services_mock/services_mock.py +13 -0
  21. qwak/feature_store/_common/featureset_asterisk_handler.py +0 -115
  22. qwak/feature_store/offline/_query_engine.py +0 -32
  23. qwak/feature_store/offline/athena/__init__.py +0 -0
  24. qwak/feature_store/offline/athena/athena_query_engine.py +0 -153
  25. qwak/feature_store/offline/client.py +0 -718
  26. {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/WHEEL +0 -0
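
The headline change in 0.4.248 is the removal of the legacy Athena-backed offline feature store client: the deprecated OfflineClient, its Athena query engine, and the asterisk-feature handler are deleted (items 21-25), while new service-discovery protos and a location_discovery client are added (items 1-6, 9-10, 18-20). Callers should move to OfflineClientV2. The sketch below is hypothetical: the removed code's deprecation warnings only name OfflineClientV2 and get_feature_values(), so the import path and keyword arguments shown here simply mirror the removed client's signature and must be verified against the 0.4.248 sources.

# Hypothetical migration sketch. OfflineClientV2's import path and exact
# signature are assumptions inferred from the deprecation warnings in the
# removed OfflineClient; verify against the installed package.
import pandas as pd

from qwak.feature_store.offline import OfflineClientV2  # assumed export

population_df = pd.DataFrame(
    columns=["uuid", "timestamp"],
    data=[["1", "2021-01-02 17:00:00"],
          ["2", "2021-01-01 12:00:00"]],
)

client = OfflineClientV2()
# Assumed to accept the same { entity_key -> [featureset.feature, ...] }
# mapping and population dataframe as the removed client.
train_df = client.get_feature_values(
    entity_key_to_features={"uuid": ["user_purchases.number_of_purchases"]},
    population=population_df,
    point_in_time_column_name="timestamp",
)
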
qwak/feature_store/offline/client.py (deleted)
@@ -1,718 +0,0 @@
- from collections import defaultdict
- from datetime import date, datetime
- from functools import reduce
- from typing import DefaultDict, Dict, List, Tuple, Union
-
- from qwak.clients.administration.eco_system.client import EcosystemClient
-
- try:
-     import pandas as pd
- except ImportError:
-     pass
-
- from warnings import warn
-
- from dateutil.parser import ParserError
- from qwak.exceptions import QwakException
- from qwak.feature_store._common.functions import normalize_cols
- from qwak.feature_store.offline.athena.athena_query_engine import AthenaQueryEngine
-
-
- class OfflineClient:
-     """
-     A class used to retrieve data from the offline store - mainly used to get train data for models.
-     It requires qwak configure and aws access.
-     @deprecated
-     """
-
-     # Default SCD parameters of the feature store etl
-     ANALYTICS_DB_PREFIX = "qwak_analytics_feature_store"
-     FS_DB_PREFIX = "qwak_feature_store"
-     FS_TABLE_NAME_PREFIX = "offline_feature_store"
-     FS_START_TIME_COLUMN = "start_timestamp"
-     FS_END_TIME_COLUMN = "end_timestamp"
-     FEATURE_STORE_PREFIX = "feature_store"
-
-     DEFAULT_NUMBER_OF_SAMPLE_DATA_ROWS = "100"
-
-     def __init__(
-         self,
-         query_engine=None,
-         environment_id=None,
-     ):
-         warn(
-             "This Client will be deprecated soon, Please use OfflineClientV2 Instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         self.query_engine = (
-             query_engine if query_engine is not None else AthenaQueryEngine()
-         )
-         self.quotes = self.query_engine.get_quotes()
-         if environment_id is None:
-             user_context = EcosystemClient().get_authenticated_user_context().user
-             environment_id = (
-                 user_context.account_details.default_environment_id.replace("-", "_")
-             )
-
-         self.environment_id = environment_id.replace("-", "_")
-         self.FS_DB_NAME = self.FS_DB_PREFIX + "_" + self.environment_id
-         self.FS_ANALYTICS_DB_NAME = self.ANALYTICS_DB_PREFIX + "_" + self.environment_id
-
-     def get_feature_range_values(
-         self,
-         entity_key_to_features: dict,
-         start_date: Union[datetime, date],
-         end_date: Union[datetime, date],
-     ):
-         """
-         :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-         :param start_date: the start of the date range to fetch feature values for
-         :param end_date: the end of the date range to fetch feature values for
-         :return: a pandas dataframe or a list of dataframes (a dataframe for every entity_key) - all feature values
-                  for all entities under the given date range
-         @deprecated
-
-         each row in the returned data-frame is constructed by retrieving the requested features of the entity
-         key(s) for all entity values within the defined date range.
-
-         Feature sets should be named [Feature Set Name].[Feature Name],
-         i.e: user_purchases.number_of_purchases.
-
-         Examples:
-         >>> from datetime import datetime
-         >>> from qwak.feature_store.offline import OfflineClient
-         >>>
-         >>> start_date = datetime(year=2021, month=1, day=1)
-         >>> end_date = datetime(year=2021, month=1, day=3)
-         >>>
-         >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-         >>>                             'user_purchases.avg_purchase_amount']}
-         >>>
-         >>> offline_feature_store = OfflineClient()
-         >>>
-         >>> train_df = offline_feature_store.get_feature_range_values(
-         >>>     entity_key_to_features=key_to_features,
-         >>>     start_date=start_date,
-         >>>     end_date=end_date)
-         >>>
-         >>> print(train_df.head())
-         >>> #   uuid  timestamp            user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-         >>> # 0    1  2021-01-02 17:00:00  76                                  4.796842
-         >>> # 1    1  2021-01-01 12:00:00  5                                   1.548000
-         >>> # 2    2  2021-01-02 12:00:00  5                                   5.548000
-         >>> # 3    2  2021-01-01 18:00:00  5                                   2.788000
-         """
-         warn(
-             "This method will be deprecated soon, Please use OfflineClientV2: get_feature_values() Instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         try:
-             from qwak.feature_store._common.featureset_asterisk_handler import (
-                 unpack_asterisk_features_from_key_mapping,
-             )
-
-             entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                 entity_key_to_features, lambda: self
-             )
-             self._validate_range_query_inputs(
-                 entity_key_to_features, start_date, end_date
-             )
-
-             feature_set_name_to_feature_list = (
-                 self._partition_feature_set_by_entity_feature(entity_key_to_features)
-             )
-             feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                 feature_set_name_to_feature_list
-             )
-             self._validate_features_exist(
-                 feature_set_name_to_feature_list, feature_set_to_dtypes
-             )
-
-             df = self._run_ranges_query(
-                 feature_set_name_to_feature_list, start_date, end_date
-             )
-
-             return self._normalize_df(df)
-
-         except QwakException as qwak_exception:
-             raise QwakException(
-                 f"Got the following Qwak generated exception: {qwak_exception}"
-             )
-         except Exception as e:
-             raise QwakException(f"Got the following run-time exception: {e}")
-         finally:
-             try:
-                 self.query_engine.cleanup()
-             except Exception as e:
-                 print(f"Got the following run-time exception during cleanup: {e}")
-
-     def get_feature_values(
-         self,
-         entity_key_to_features: dict,
-         population: "pd.DataFrame",
-         point_in_time_column_name: str = "timestamp",
-     ):
-         """
-         :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-         :param population: a pandas data-frame with a point in time column
-                and a column for each entity key defined at entity_key_to_features.
-         :param point_in_time_column_name: the column name of the point in time column (default - timestamp)
-         :return: a pandas data-frame - the population joined with the feature values for all
-                  the requested entities and features.
-         @deprecated
-
-         each row in the returned data-frame is constructed by retrieving the requested features of the entity key(s) for
-         the specific entity value(s) in the population and on the specific point in time defined.
-
-         Feature sets should be named [Feature Set Name].[Feature Name],
-         i.e: user_purchases.number_of_purchases.
-
-         Examples:
-         >>> import pandas as pd
-         >>> from qwak.feature_store.offline import OfflineClient
-         >>>
-         >>> population_df = pd.DataFrame(
-         >>>     columns=['uuid', 'timestamp'],
-         >>>     data=[['1', '2021-01-02 17:00:00'],
-         >>>           ['2', '2021-01-01 12:00:00']])
-         >>>
-         >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-         >>>                             'user_purchases.avg_purchase_amount']}
-         >>>
-         >>> offline_feature_store = OfflineClient()
-         >>>
-         >>> train_df = offline_feature_store.get_feature_values(
-         >>>     entity_key_to_features=key_to_features,
-         >>>     population=population_df,
-         >>>     point_in_time_column_name='timestamp')
-         >>>
-         >>> print(train_df.head())
-         >>> #   uuid  timestamp            user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-         >>> # 0    1  2021-04-24 17:00:00  76                                  4.796842
-         >>> # 1    2  2021-04-24 12:00:00  5                                   1.548000
-         """
-         warn(
-             "This method will be deprecated soon, Please use OfflineClientV2: get_feature_range_values() Instead",
-             DeprecationWarning,
-             stacklevel=2,
-         )
-         import pandas as pd
-
-         try:
-             from qwak.feature_store._common.featureset_asterisk_handler import (
-                 unpack_asterisk_features_from_key_mapping,
-             )
-
-             population = population.copy()
-
-             entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                 entity_key_to_features, lambda: self
-             )
-
-             self._validate_point_in_time_query_inputs(
-                 entity_key_to_features, population, point_in_time_column_name
-             )
-
-             feature_set_name_to_feature_dict = (
-                 self._partition_feature_set_by_entity_feature(entity_key_to_features)
-             )
-
-             feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                 feature_set_name_to_feature_dict
-             )
-
-             self._validate_features_exist(
-                 feature_set_name_to_feature_dict,
-                 feature_set_to_dtypes,
-             )
-
-             population = self._align_entity_key_dtype(
-                 feature_set_to_dtypes, entity_key_to_features, population
-             )
-
-             uploaded_population_path = self.query_engine.upload_table(population)
-
-             df = pd.DataFrame()
-
-             if feature_set_name_to_feature_dict:
-                 df = self._run_point_in_time_query(
-                     feature_set_name_to_feature_dict,
-                     uploaded_population_path,
-                     point_in_time_column_name,
-                     [column.lower() for column in population.columns],
-                 )
-
-             return self._normalize_df(df)
-
-         except QwakException as qwak_exception:
-             raise QwakException(
-                 f"Got the following Qwak generated exception: {qwak_exception}"
-             )
-         except Exception as e:
-             raise QwakException(f"Got the following run-time exception: {e}")
-         finally:
-             try:
-                 self.query_engine.cleanup()
-             except Exception as e:
-                 print(f"Got the following run-time exception during cleanup: {e}")
-
-     @staticmethod
-     def _normalize_df(df: "pd.DataFrame") -> "pd.DataFrame":
-         columns = df.columns.values.tolist()
-         new_columns = normalize_cols(columns)
-         df.columns = new_columns
-         return df
-
-     @staticmethod
-     def _validate_range_query_inputs(
-         entity_key_to_features: dict, start_date: datetime, end_date: datetime
-     ):
-         missing_features_entity_keys = [
-             entity_key
-             for entity_key, features in entity_key_to_features.items()
-             if not features
-         ]
-
-         if missing_features_entity_keys:
-             raise QwakException(
-                 f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-             )
-
-         if (end_date - start_date).total_seconds() < 0:
-             raise QwakException("Invalid date range - end date is before start date")
-
-     @staticmethod
-     def _validate_point_in_time_query_inputs(
-         entity_key_to_features: dict,
-         population: "pd.DataFrame",
-         point_in_time_column_name: str,
-     ):
-         """
-         Validates that the entity keys, timestamp cols and features exist in DB
-         """
-         missing_keys = [
-             entity_key
-             for entity_key in entity_key_to_features.keys()
-             if entity_key not in population
-         ]
-         if missing_keys:
-             raise QwakException(
-                 f"The entity keys must be in population_df columns, missing: [{missing_keys}]"
-             )
-
-         missing_features_entity_keys = [
-             entity_key
-             for entity_key, features in entity_key_to_features.items()
-             if not features
-         ]
-
-         if missing_features_entity_keys:
-             raise QwakException(
-                 f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-             )
-
-         if point_in_time_column_name not in population:
-             raise QwakException(
-                 "The point in time column must be part of the population dataframe"
-             )
-
-         from pandas.api.types import is_datetime64_any_dtype
-
-         if not is_datetime64_any_dtype(population[point_in_time_column_name]):
-             try:
-                 population[point_in_time_column_name] = pd.to_datetime(
-                     population[point_in_time_column_name]
-                 )
-             except ParserError as e:
-                 raise QwakException(
-                     f"It was not possible to cast provided point in time column to datetime"
-                     f"\nError message: {e}"
-                 )
-
-     @staticmethod
-     def _partition_feature_set_by_entity_feature(
-         entity_key_to_features,
-     ) -> DefaultDict[Tuple[str, str], List[str]]:
-         """
-         Partition feature by entity key and featureset name
-         Args:
-             entity_key_to_features: dict of entity_key -> full feature name
-         Returns:
-             dict of (entity_key, featureset_name) -> list of feature names
-         """
-         feature_name_to_feature_list = defaultdict(list)
-
-         for entity_key, feature_list in entity_key_to_features.items():
-             for feature in feature_list:
-                 split_feature_set_and_feature = feature.lower().split(".")
-                 if len(split_feature_set_and_feature) != 2:
-                     raise QwakException(
-                         f"Failed to verify features. Name should be: <feature set name>.<feature name>. "
-                         f"Current name is: {feature}"
-                     )
-                 feature_set_name = split_feature_set_and_feature[0]
-                 feature_name_to_feature_list[(entity_key, feature_set_name)].append(
-                     feature
-                 )
-
-         return feature_name_to_feature_list
-
-     def _validate_database_and_get_feature_set_dtypes(
-         self, feature_name_to_feature_list: DefaultDict[Tuple[str, str], List[str]]
-     ) -> Dict[Tuple[str, str], List[Tuple[str, str]]]:
-         """
-         Args:
-             feature_name_to_feature_list: dictionary from feature set name to its features
-
-         Returns:
-             dictionary from feature set name and entity key to a list of feature name, feature type
-         """
-         if self.FS_DB_NAME not in self._fs_db_names():
-             raise QwakException("Offline feature store does not contain any data")
-
-         feature_set_to_dtypes = {}
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_list in feature_name_to_feature_list.items():
-             table_name = self._get_offline_feature_store_full_name(feature_set_name)
-             if table_name not in self._fs_tables_names():
-                 raise QwakException(
-                     f"[{feature_set_name}] feature set does not contain any data"
-                 )
-
-             columns_query_result = self.query_engine.run_query(
-                 f"SELECT * FROM INFORMATION_SCHEMA.COLUMNS "  # nosec B608
-                 f"WHERE TABLE_SCHEMA = '{self.FS_DB_NAME}' "
-                 f"AND TABLE_NAME = '{table_name}'"
-             )
-
-             feature_set_to_dtypes[(entity_key, feature_set_name)] = [
-                 (column_tup[3], column_tup[7]) for column_tup in columns_query_result
-             ]
-         return feature_set_to_dtypes
-
-     @staticmethod
-     def _validate_features_exist(
-         feature_name_to_feature_list: defaultdict,
-         feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-     ):
-         """
-         Args:
-             feature_name_to_feature_list: dictionary from feature set name to its features
-             feature_set_to_dtypes: dictionary from feature set name and entity key
-                                    to a list of feature name, feature type
-         """
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_list in feature_name_to_feature_list.items():
-             columns = [
-                 column_tuple[0].lower()
-                 for column_tuple in feature_set_to_dtypes[
-                     (entity_key, feature_set_name)
-                 ]
-             ]
-             absent_features = [
-                 feature for feature in feature_list if feature not in columns
-             ]
-             if absent_features:
-                 raise QwakException(
-                     f"Missing the following features for the feature set "
-                     f"[{feature_set_name}]:"
-                     f"\n{absent_features}"
-                 )
-
-     def _align_entity_key_dtype(
-         self,
-         feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-         entity_key_to_features: Dict[str, List[str]],
-         population: "pd.DataFrame",
-     ) -> "pd.DataFrame":
-         """
-         Args:
-             feature_set_to_dtypes: dictionary from feature set name and entity key
-                                    to a list of feature name, feature type
-             entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-             population: a pandas data-frame with a point in time column
-                         and a column for each entity key defined at entity_key_to_features.
-         Returns:
-             entity type aligned population df
-         """
-         sql_type_to_pandas = {
-             "string": "string",
-             "integer": "int",
-             "varchar": "string",
-             "text": "string",
-             "bigint": "int",
-         }
-
-         entity_key_to_dtype = self._validate_and_get_entity_keys_dtypes(
-             entity_key_to_features, population
-         )
-
-         for (
-             entity_key,
-             feature_set_name,
-         ), feature_dtypes_list in feature_set_to_dtypes.items():
-             given_entity_key_dtype = entity_key_to_dtype[entity_key]
-             entity_column_tuple = [
-                 column_tuple
-                 for column_tuple in feature_dtypes_list
-                 if column_tuple[0] == entity_key
-             ]
-             if not entity_column_tuple:
-                 raise QwakException(
-                     f"Did not find entity key [{entity_key}] in the table of [{feature_set_name}] "
-                     f"- existing columns are: "
-                     f"{[column_tuple[0] for column_tuple in feature_dtypes_list]}"
-                 )
-             actual_entity_type = entity_column_tuple[0][1]
-             if actual_entity_type == given_entity_key_dtype:
-                 continue
-             else:
-                 try:
-                     population[entity_key] = population[entity_key].astype(
-                         sql_type_to_pandas[actual_entity_type]
-                     )
-                     print(
-                         f"Entity [{entity_key}] given type [{given_entity_key_dtype}] "
-                         f"was not aligned with actual type [{actual_entity_type}] - casted to correct type"
-                     )
-                 except ValueError as e:
-                     raise QwakException(
-                         f"Mismatched entity type for [{entity_key}] - [{given_entity_key_dtype}] "
-                         f"- failed to cast to actual type [{actual_entity_type}], Error: {e}"
-                     )
-
-         return population
-
-     def _validate_and_get_entity_keys_dtypes(
-         self,
-         entity_key_to_features: Dict[str, List[str]],
-         population_df: "pd.DataFrame",
-     ) -> Dict[str, str]:
-         """
-         Args:
-             entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-             population_df: a pandas data-frame with a point in time column
-                            and a column for each entity key defined at entity_key_to_features.
-
-         Returns:
-             dictionary of entity key to its dtype
-         """
-         supported_dtypes_to_actual_type = {
-             "object": "string",
-             "int32": "integer",
-             "int64": "integer",
-         }
-         entity_key_to_dtype = {}
-         for entity_key in entity_key_to_features.keys():
-             entity_pandas_dtype = population_df.dtypes[entity_key].name
-             if entity_pandas_dtype not in supported_dtypes_to_actual_type:
-                 raise QwakException(
-                     f"Got an unsupported dtype for the entity key "
-                     f"[{entity_key}] - [{entity_pandas_dtype}]"
-                 )
-             entity_key_to_dtype[entity_key] = supported_dtypes_to_actual_type[
-                 entity_pandas_dtype
-             ]
-         return entity_key_to_dtype
-
-     def _run_ranges_query(
-         self,
-         feature_name_to_feature_list: defaultdict,
-         start_date: datetime,
-         end_date: datetime,
-     ):
-         result_dfs = []
-         features_set_by_entity = defaultdict(lambda: defaultdict(set))
-         for (
-             (entity_key, feature_set_name),
-             feature_list,
-         ) in feature_name_to_feature_list.items():
-             for feature in feature_list:
-                 feature_set_name = feature.split(".")[0]
-                 features_set_by_entity[entity_key][feature_set_name].add(feature)
-
-         for entity_key, features_dict in features_set_by_entity.items():
-             entity_dfs = []
-             for feature_set_name, feature_list in features_dict.items():
-                 offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                     feature_set_name
-                 )
-
-                 features = ", ".join(
-                     [
-                         f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes}"
-                         for feature in feature_list
-                     ]
-                 )
-
-                 where_part = (
-                     "WHERE "
-                     f"{table_path}.{self.FS_START_TIME_COLUMN} >= timestamp '{start_date}' "
-                     f"AND ({self.FS_END_TIME_COLUMN} <= "
-                     f"timestamp '{end_date}' OR {table_path}.{self.FS_END_TIME_COLUMN} IS NULL) "
-                     f"AND {table_path}.{self.FS_START_TIME_COLUMN} < timestamp '{end_date}'"
-                 )
-
-                 full_sql = (
-                     f"SELECT {offline_feature_store_full_path}.{self.quotes}{entity_key}{self.quotes}, "  # nosec B608
-                     f"{offline_feature_store_full_path}.{self.quotes}{self.FS_START_TIME_COLUMN}{self.quotes}, "
-                     f"{features} "
-                     f"FROM {offline_feature_store_full_path} "
-                     f"{where_part}"
-                 )
-
-                 entity_dfs.append(self.query_engine.read_pandas_from_query(full_sql))
-
-             entity_final_df = reduce(
-                 lambda left, right: pd.merge(
-                     left, right, on=[entity_key, self.FS_START_TIME_COLUMN], how="outer"
-                 ),
-                 entity_dfs,
-             )
-             result_dfs.append(entity_final_df.reset_index(drop=True))
-
-         return result_dfs[0] if len(result_dfs) == 1 else result_dfs
-
-     def _run_point_in_time_query(
-         self,
-         feature_name_to_feature_list: defaultdict,
-         uploaded_population_path: str,
-         point_in_time_column_name: str,
-         population_list: list,
-     ) -> "pd.DataFrame":
-         """
-         creates a SQL query for point in time feature fetching based on the population and requested features
-         """
-         dfs = []
-
-         for index, ((entity_key, feature_set_name), feature_list) in enumerate(
-             feature_name_to_feature_list.items()
-         ):
-             offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                 feature_set_name
-             )
-
-             join_part = self._get_join_population_sql(
-                 entity_key, offline_feature_store_full_path, uploaded_population_path
-             )
-
-             point_in_time_column_full_path = (
-                 f"{uploaded_population_path}.{point_in_time_column_name}"
-             )
-
-             where_part = (
-                 "WHERE "
-                 f"{point_in_time_column_full_path} >= "
-                 f"{table_path}.{self.FS_START_TIME_COLUMN} "
-                 f"AND ({point_in_time_column_full_path} < "
-                 f"{table_path}.{self.FS_END_TIME_COLUMN} OR "
-                 f"{table_path}.{self.FS_END_TIME_COLUMN} IS NULL)"
-             )
-
-             features = ", ".join(
-                 [
-                     f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                     for feature in feature_list
-                 ]
-             )
-
-             final_query_features = ", ".join(
-                 [
-                     f"filtered_features.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                     for feature in feature_list
-                 ]
-             )
-
-             full_sql = (
-                 f"WITH "  # nosec B608
-                 "filtered_features AS ( "
-                 f"SELECT {uploaded_population_path}.*, {features} "
-                 f"FROM {uploaded_population_path} "
-                 f"{join_part} "
-                 f"{where_part} "
-                 ") "
-                 f"SELECT population.*, {final_query_features} "
-                 f"FROM {uploaded_population_path} population "
-                 f"LEFT JOIN filtered_features "
-                 f"ON population.{entity_key} = filtered_features.{entity_key} "
-                 f"AND population.{point_in_time_column_name} = filtered_features.{point_in_time_column_name}"
-             )
-
-             dfs.append(
-                 self.query_engine.read_pandas_from_query(
-                     full_sql, [point_in_time_column_name]
-                 )
-             )
-         return self._merge_query_dataframes_results(dfs, population_list)
-
-     @staticmethod
-     def _merge_query_dataframes_results(
-         dfs: List["pd.DataFrame"], population_list
-     ) -> "pd.DataFrame":
-         """
-         merges query result dataframes according to population list cols
-         """
-         if dfs:
-             df_final = reduce(
-                 lambda left, right: pd.merge(
-                     left, right, on=population_list, how="outer"
-                 ),
-                 dfs,
-             )
-
-             ordered_cols = population_list + (
-                 df_final.columns.drop(population_list).tolist()
-             )
-
-             return df_final[ordered_cols].reset_index(drop=True)
-         else:
-             return pd.DataFrame()
-
-     def _get_join_population_sql(
-         self, entity_key, offline_feature_store_full_path, uploaded_population_path
-     ):
-         """
-         return a join sql query for uploaded population table
-         """
-         join_part = (
-             f"LEFT JOIN {offline_feature_store_full_path} ON "
-             f"{offline_feature_store_full_path}.{entity_key} = "
-             f"{uploaded_population_path}.{entity_key} "
-         )
-         return join_part
-
-     def get_fs_full_path(self, feature_set_name: str) -> Tuple[str, str]:
-         offline_feature_store_table_name = self._get_offline_feature_store_full_name(
-             feature_set_name
-         )
-         table_path = f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-         offline_feature_store_full_path = (
-             f"{self.quotes}{self.FS_DB_NAME}{self.quotes}."
-             f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-         )
-         return offline_feature_store_full_path, table_path
-
-     def _fs_db_names(self) -> List[str]:
-         return [
-             database_tuple[0]
-             for database_tuple in self.query_engine.run_query("SHOW SCHEMAS")
-         ]
-
-     def _fs_tables_names(self) -> List[str]:
-         return [
-             table_tuple[0]
-             for table_tuple in self.query_engine.run_query(
-                 f"SHOW TABLES IN {self.FS_DB_NAME}"
-             )
-         ]
-
-     def _get_offline_feature_store_full_name(self, feature_set_name: str) -> str:
-         return f"{self.FS_TABLE_NAME_PREFIX}_{feature_set_name}".lower().replace(
-             "-", "_"
-         )
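
For reference, the heart of the removed client is the point-in-time join that _run_point_in_time_query builds in SQL: a population row matches a feature row when its point in time falls inside the feature's [start_timestamp, end_timestamp) validity window, and a NULL end_timestamp means the value is still valid. The following is a minimal pandas sketch of that join on toy data; it illustrates the algorithm only and is not part of the package.

# Minimal pandas sketch of the removed point-in-time join (toy data only;
# the start_timestamp / end_timestamp SCD columns follow the removed
# client's conventions).
import pandas as pd

population = pd.DataFrame(
    {
        "uuid": ["1", "2"],
        "timestamp": pd.to_datetime(["2021-01-02 17:00:00", "2021-01-01 12:00:00"]),
    }
)
features = pd.DataFrame(
    {
        "uuid": ["1", "1", "2"],
        "start_timestamp": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-01"]),
        "end_timestamp": pd.to_datetime(["2021-01-02", None, None]),
        "user_purchases.number_of_purchases": [5, 76, 5],
    }
)

# LEFT JOIN on the entity key, mirroring _get_join_population_sql.
joined = population.merge(features, on="uuid", how="left")

# Keep rows whose point in time falls inside the validity window,
# mirroring the WHERE clause in _run_point_in_time_query (an open-ended
# window, i.e. NULL end_timestamp, always matches).
in_window = (joined["timestamp"] >= joined["start_timestamp"]) & (
    (joined["timestamp"] < joined["end_timestamp"]) | joined["end_timestamp"].isna()
)

# Join the surviving feature rows back onto the full population,
# mirroring the final SELECT over the filtered_features CTE.
result = population.merge(
    joined[in_window].drop(columns=["start_timestamp", "end_timestamp"]),
    on=["uuid", "timestamp"],
    how="left",
)
print(result)
# Expected: uuid 1 matches the open-ended window (76), uuid 2 matches its window (5).
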