qwak-core 0.4.246__py3-none-any.whl → 0.4.248__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _qwak_proto/qwak/service_discovery/service_discovery_location_pb2.py +65 -0
- _qwak_proto/qwak/service_discovery/service_discovery_location_pb2.pyi +73 -0
- _qwak_proto/qwak/service_discovery/service_discovery_location_pb2_grpc.py +4 -0
- _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2.py +49 -0
- _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2.pyi +41 -0
- _qwak_proto/qwak/service_discovery/service_discovery_location_service_pb2_grpc.py +231 -0
- qwak/__init__.py +1 -1
- qwak/clients/feature_store/offline_serving_client.py +29 -4
- qwak/clients/location_discovery/__init__.py +1 -0
- qwak/clients/location_discovery/client.py +73 -0
- qwak/feature_store/_common/functions.py +0 -19
- qwak/feature_store/offline/__init__.py +1 -2
- qwak/inner/model_loggers_utils.py +8 -20
- qwak/model_loggers/artifact_logger.py +7 -2
- qwak/model_loggers/data_logger.py +11 -6
- {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/METADATA +1 -1
- {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/RECORD +21 -17
- qwak_services_mock/mocks/location_discovery_service_api.py +104 -0
- qwak_services_mock/mocks/qwak_mocks.py +4 -0
- qwak_services_mock/services_mock.py +13 -0
- qwak/feature_store/_common/featureset_asterisk_handler.py +0 -115
- qwak/feature_store/offline/_query_engine.py +0 -32
- qwak/feature_store/offline/athena/__init__.py +0 -0
- qwak/feature_store/offline/athena/athena_query_engine.py +0 -153
- qwak/feature_store/offline/client.py +0 -718
- {qwak_core-0.4.246.dist-info → qwak_core-0.4.248.dist-info}/WHEEL +0 -0
qwak/feature_store/offline/client.py
@@ -1,718 +0,0 @@
-from collections import defaultdict
-from datetime import date, datetime
-from functools import reduce
-from typing import DefaultDict, Dict, List, Tuple, Union
-
-from qwak.clients.administration.eco_system.client import EcosystemClient
-
-try:
-    import pandas as pd
-except ImportError:
-    pass
-
-from warnings import warn
-
-from dateutil.parser import ParserError
-from qwak.exceptions import QwakException
-from qwak.feature_store._common.functions import normalize_cols
-from qwak.feature_store.offline.athena.athena_query_engine import AthenaQueryEngine
-
-
-class OfflineClient:
-    """
-    A class used to retrieve data from the offline store - mainly used to get train data for models.
-    It requires `qwak configure` and AWS access.
-    @deprecated
-    """
-
-    # Default SCD parameters of the feature store ETL
-    ANALYTICS_DB_PREFIX = "qwak_analytics_feature_store"
-    FS_DB_PREFIX = "qwak_feature_store"
-    FS_TABLE_NAME_PREFIX = "offline_feature_store"
-    FS_START_TIME_COLUMN = "start_timestamp"
-    FS_END_TIME_COLUMN = "end_timestamp"
-    FEATURE_STORE_PREFIX = "feature_store"
-
-    DEFAULT_NUMBER_OF_SAMPLE_DATA_ROWS = "100"
-
-    def __init__(
-        self,
-        query_engine=None,
-        environment_id=None,
-    ):
-        warn(
-            "This Client will be deprecated soon, Please use OfflineClientV2 Instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        self.query_engine = (
-            query_engine if query_engine is not None else AthenaQueryEngine()
-        )
-        self.quotes = self.query_engine.get_quotes()
-        if environment_id is None:
-            user_context = EcosystemClient().get_authenticated_user_context().user
-            environment_id = (
-                user_context.account_details.default_environment_id.replace("-", "_")
-            )
-
-        self.environment_id = environment_id.replace("-", "_")
-        self.FS_DB_NAME = self.FS_DB_PREFIX + "_" + self.environment_id
-        self.FS_ANALYTICS_DB_NAME = self.ANALYTICS_DB_PREFIX + "_" + self.environment_id
-
-    def get_feature_range_values(
-        self,
-        entity_key_to_features: dict,
-        start_date: Union[datetime, date],
-        end_date: Union[datetime, date],
-    ):
-        """
-        :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-        :param start_date: the start of the requested date range
-        :param end_date: the end of the requested date range
-        :return: a pandas dataframe or a list of dataframes (a dataframe for every entity_key) - all feature values for
-        all entities under the given date range
-        @deprecated
-
-        each row in the returned data-frame is constructed by retrieving the requested features of the entity
-        key(s) for all entity values within the defined date range.
-
-        Feature sets should be named [Feature Set Name].[Feature Name],
-        e.g.: user_purchases.number_of_purchases.
-
-        Examples:
-        >>> from datetime import datetime
-        >>> from qwak.feature_store.offline import OfflineClient
-        >>>
-        >>> start_date = datetime(year=2021, month=1, day=1)
-        >>> end_date = datetime(year=2021, month=1, day=3)
-        >>>
-        >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-        >>>                             'user_purchases.avg_purchase_amount']}
-        >>>
-        >>> offline_feature_store = OfflineClient()
-        >>>
-        >>> train_df = offline_feature_store.get_feature_range_values(
-        >>>     entity_key_to_features=key_to_features,
-        >>>     start_date=start_date,
-        >>>     end_date=end_date)
-        >>>
-        >>> print(train_df.head())
-        >>> #   uuid            timestamp  user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-        >>> # 0    1  2021-01-02 17:00:00                                  76                             4.796842
-        >>> # 1    1  2021-01-01 12:00:00                                   5                             1.548000
-        >>> # 2    2  2021-01-02 12:00:00                                   5                             5.548000
-        >>> # 3    2  2021-01-01 18:00:00                                   5                             2.788000
-        """
-        warn(
-            "This method will be deprecated soon, Please use OfflineClientV2: get_feature_range_values() Instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        try:
-            from qwak.feature_store._common.featureset_asterisk_handler import (
-                unpack_asterisk_features_from_key_mapping,
-            )
-
-            entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                entity_key_to_features, lambda: self
-            )
-            self._validate_range_query_inputs(
-                entity_key_to_features, start_date, end_date
-            )
-
-            feature_set_name_to_feature_list = (
-                self._partition_feature_set_by_entity_feature(entity_key_to_features)
-            )
-            feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                feature_set_name_to_feature_list
-            )
-            self._validate_features_exist(
-                feature_set_name_to_feature_list, feature_set_to_dtypes
-            )
-
-            df = self._run_ranges_query(
-                feature_set_name_to_feature_list, start_date, end_date
-            )
-
-            return self._normalize_df(df)
-
-        except QwakException as qwak_exception:
-            raise QwakException(
-                f"Got the following Qwak generated exception: {qwak_exception}"
-            )
-        except Exception as e:
-            raise QwakException(f"Got the following run-time exception: {e}")
-        finally:
-            try:
-                self.query_engine.cleanup()
-            except Exception as e:
-                print(f"Got the following run-time exception during cleanup: {e}")
-
-    def get_feature_values(
-        self,
-        entity_key_to_features: dict,
-        population: "pd.DataFrame",
-        point_in_time_column_name: str = "timestamp",
-    ):
-        """
-        :param entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-        :param population: a pandas data-frame with a point in time column
-        and a column for each entity key defined at entity_key_to_features.
-        :param point_in_time_column_name: the column name of the point in time column (default - timestamp)
-        :return: a pandas data-frame - the population joined with the feature values for all
-        the requested entities and features.
-        @deprecated
-
-        each row in the returned data-frame is constructed by retrieving the requested features of the entity key(s) for
-        the specific entity value(s) in the population and on the specific point in time defined.
-
-        Feature sets should be named [Feature Set Name].[Feature Name],
-        e.g.: user_purchases.number_of_purchases.
-
-        Examples:
-        >>> import pandas as pd
-        >>> from qwak.feature_store.offline import OfflineClient
-        >>>
-        >>> population_df = pd.DataFrame(
-        >>>     columns=['uuid', 'timestamp'],
-        >>>     data=[['1', '2021-01-02 17:00:00'],
-        >>>           ['2', '2021-01-01 12:00:00']])
-        >>>
-        >>> key_to_features = {'uuid': ['user_purchases.number_of_purchases',
-        >>>                             'user_purchases.avg_purchase_amount']}
-        >>>
-        >>> offline_feature_store = OfflineClient()
-        >>>
-        >>> train_df = offline_feature_store.get_feature_values(
-        >>>     entity_key_to_features=key_to_features,
-        >>>     population=population_df,
-        >>>     point_in_time_column_name='timestamp')
-        >>>
-        >>> print(train_df.head())
-        >>> #   uuid            timestamp  user_purchases.number_of_purchases  user_purchases.avg_purchase_amount
-        >>> # 0    1  2021-04-24 17:00:00                                  76                             4.796842
-        >>> # 1    2  2021-04-24 12:00:00                                   5                             1.548000
-        """
-        warn(
-            "This method will be deprecated soon, Please use OfflineClientV2: get_feature_values() Instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        import pandas as pd
-
-        try:
-            from qwak.feature_store._common.featureset_asterisk_handler import (
-                unpack_asterisk_features_from_key_mapping,
-            )
-
-            population = population.copy()
-
-            entity_key_to_features = unpack_asterisk_features_from_key_mapping(
-                entity_key_to_features, lambda: self
-            )
-
-            self._validate_point_in_time_query_inputs(
-                entity_key_to_features, population, point_in_time_column_name
-            )
-
-            feature_set_name_to_feature_dict = (
-                self._partition_feature_set_by_entity_feature(entity_key_to_features)
-            )
-
-            feature_set_to_dtypes = self._validate_database_and_get_feature_set_dtypes(
-                feature_set_name_to_feature_dict
-            )
-
-            self._validate_features_exist(
-                feature_set_name_to_feature_dict,
-                feature_set_to_dtypes,
-            )
-
-            population = self._align_entity_key_dtype(
-                feature_set_to_dtypes, entity_key_to_features, population
-            )
-
-            uploaded_population_path = self.query_engine.upload_table(population)
-
-            df = pd.DataFrame()
-
-            if feature_set_name_to_feature_dict:
-                df = self._run_point_in_time_query(
-                    feature_set_name_to_feature_dict,
-                    uploaded_population_path,
-                    point_in_time_column_name,
-                    [column.lower() for column in population.columns],
-                )
-
-            return self._normalize_df(df)
-
-        except QwakException as qwak_exception:
-            raise QwakException(
-                f"Got the following Qwak generated exception: {qwak_exception}"
-            )
-        except Exception as e:
-            raise QwakException(f"Got the following run-time exception: {e}")
-        finally:
-            try:
-                self.query_engine.cleanup()
-            except Exception as e:
-                print(f"Got the following run-time exception during cleanup: {e}")
-
-    @staticmethod
-    def _normalize_df(df: "pd.DataFrame") -> "pd.DataFrame":
-        columns = df.columns.values.tolist()
-        new_columns = normalize_cols(columns)
-        df.columns = new_columns
-        return df
-
-    @staticmethod
-    def _validate_range_query_inputs(
-        entity_key_to_features: dict, start_date: datetime, end_date: datetime
-    ):
-        missing_features_entity_keys = [
-            entity_key
-            for entity_key, features in entity_key_to_features.items()
-            if not features
-        ]
-
-        if missing_features_entity_keys:
-            raise QwakException(
-                f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-            )
-
-        if (end_date - start_date).total_seconds() < 0:
-            raise QwakException("Invalid date range - end date is before start date")
-
-    @staticmethod
-    def _validate_point_in_time_query_inputs(
-        entity_key_to_features: dict,
-        population: "pd.DataFrame",
-        point_in_time_column_name: str,
-    ):
-        """
-        Validates that the entity keys, timestamp columns and features exist in the DB
-        """
-        missing_keys = [
-            entity_key
-            for entity_key in entity_key_to_features.keys()
-            if entity_key not in population
-        ]
-        if missing_keys:
-            raise QwakException(
-                f"The entity keys must be in population_df columns, missing: [{missing_keys}]"
-            )
-
-        missing_features_entity_keys = [
-            entity_key
-            for entity_key, features in entity_key_to_features.items()
-            if not features
-        ]
-
-        if missing_features_entity_keys:
-            raise QwakException(
-                f"Features of an entity key must exist, missing features for: [{missing_features_entity_keys}]"
-            )
-
-        if point_in_time_column_name not in population:
-            raise QwakException(
-                "The point in time column must be part of the population dataframe"
-            )
-
-        from pandas.api.types import is_datetime64_any_dtype
-
-        if not is_datetime64_any_dtype(population[point_in_time_column_name]):
-            try:
-                population[point_in_time_column_name] = pd.to_datetime(
-                    population[point_in_time_column_name]
-                )
-            except ParserError as e:
-                raise QwakException(
-                    f"It was not possible to cast provided point in time column to datetime"
-                    f"\nError message: {e}"
-                )
-
-    @staticmethod
-    def _partition_feature_set_by_entity_feature(
-        entity_key_to_features,
-    ) -> DefaultDict[Tuple[str, str], List[str]]:
-        """
-        Partition features by entity key and feature set name
-        Args:
-            entity_key_to_features: dict of entity_key -> full feature name
-        Returns:
-            dict of (entity_key, featureset_name) -> list of feature names
-        """
-        feature_name_to_feature_list = defaultdict(list)
-
-        for entity_key, feature_list in entity_key_to_features.items():
-            for feature in feature_list:
-                split_feature_set_and_feature = feature.lower().split(".")
-                if len(split_feature_set_and_feature) != 2:
-                    raise QwakException(
-                        f"Failed to verify features. Name should be: <feature set name>.<feature name>. "
-                        f"Current name is: {feature}"
-                    )
-                feature_set_name = split_feature_set_and_feature[0]
-                feature_name_to_feature_list[(entity_key, feature_set_name)].append(
-                    feature
-                )
-
-        return feature_name_to_feature_list
-
-    def _validate_database_and_get_feature_set_dtypes(
-        self, feature_name_to_feature_list: DefaultDict[Tuple[str, str], List[str]]
-    ) -> Dict[Tuple[str, str], List[Tuple[str, str]]]:
-        """
-        Args:
-            feature_name_to_feature_list: dictionary from feature set name to its features
-
-        Returns:
-            dictionary from feature set name and entity key to a list of (feature name, feature type)
-        """
-        if self.FS_DB_NAME not in self._fs_db_names():
-            raise QwakException("Offline feature store does not contain any data")
-
-        feature_set_to_dtypes = {}
-        for (
-            entity_key,
-            feature_set_name,
-        ), feature_list in feature_name_to_feature_list.items():
-            table_name = self._get_offline_feature_store_full_name(feature_set_name)
-            if table_name not in self._fs_tables_names():
-                raise QwakException(
-                    f"[{feature_set_name}] feature set does not contain any data"
-                )
-
-            columns_query_result = self.query_engine.run_query(
-                f"SELECT * FROM INFORMATION_SCHEMA.COLUMNS "  # nosec B608
-                f"WHERE TABLE_SCHEMA = '{self.FS_DB_NAME}' "
-                f"AND TABLE_NAME = '{table_name}'"
-            )
-
-            feature_set_to_dtypes[(entity_key, feature_set_name)] = [
-                (column_tup[3], column_tup[7]) for column_tup in columns_query_result
-            ]
-        return feature_set_to_dtypes
-
-    @staticmethod
-    def _validate_features_exist(
-        feature_name_to_feature_list: defaultdict,
-        feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-    ):
-        """
-        Args:
-            feature_name_to_feature_list: dictionary from feature set name to its features
-            feature_set_to_dtypes: dictionary from feature set name and entity key
-                to a list of (feature name, feature type)
-        """
-        for (
-            entity_key,
-            feature_set_name,
-        ), feature_list in feature_name_to_feature_list.items():
-            columns = [
-                column_tuple[0].lower()
-                for column_tuple in feature_set_to_dtypes[
-                    (entity_key, feature_set_name)
-                ]
-            ]
-            absent_features = [
-                feature for feature in feature_list if feature not in columns
-            ]
-            if absent_features:
-                raise QwakException(
-                    f"Missing the following features for the feature set "
-                    f"[{feature_set_name}]:"
-                    f"\n{absent_features}"
-                )
-
-    def _align_entity_key_dtype(
-        self,
-        feature_set_to_dtypes: Dict[Tuple[str, str], List[Tuple[str, str]]],
-        entity_key_to_features: Dict[str, List[str]],
-        population: "pd.DataFrame",
-    ) -> "pd.DataFrame":
-        """
-        Args:
-            feature_set_to_dtypes: dictionary from feature set name and entity key
-                to a list of (feature name, feature type)
-            entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-            population: a pandas data-frame with a point in time column
-                and a column for each entity key defined at entity_key_to_features.
-        Returns:
-            entity type aligned population df
-        """
-        sql_type_to_pandas = {
-            "string": "string",
-            "integer": "int",
-            "varchar": "string",
-            "text": "string",
-            "bigint": "int",
-        }
-
-        entity_key_to_dtype = self._validate_and_get_entity_keys_dtypes(
-            entity_key_to_features, population
-        )
-
-        for (
-            entity_key,
-            feature_set_name,
-        ), feature_dtypes_list in feature_set_to_dtypes.items():
-            given_entity_key_dtype = entity_key_to_dtype[entity_key]
-            entity_column_tuple = [
-                column_tuple
-                for column_tuple in feature_dtypes_list
-                if column_tuple[0] == entity_key
-            ]
-            if not entity_column_tuple:
-                raise QwakException(
-                    f"Did not find entity key [{entity_key}] in the table of [{feature_set_name}] "
-                    f"- existing columns are: "
-                    f"{[column_tuple[0] for column_tuple in feature_dtypes_list]}"
-                )
-            actual_entity_type = entity_column_tuple[0][1]
-            if actual_entity_type == given_entity_key_dtype:
-                continue
-            else:
-                try:
-                    population[entity_key] = population[entity_key].astype(
-                        sql_type_to_pandas[actual_entity_type]
-                    )
-                    print(
-                        f"Entity [{entity_key}] given type [{given_entity_key_dtype}] "
-                        f"was not aligned with actual type [{actual_entity_type}] - cast to the correct type"
-                    )
-                except ValueError as e:
-                    raise QwakException(
-                        f"Mismatched entity type for [{entity_key}] - [{given_entity_key_dtype}] "
-                        f"- failed to cast to actual type [{actual_entity_type}], Error: {e}"
-                    )
-
-        return population
-
-    def _validate_and_get_entity_keys_dtypes(
-        self,
-        entity_key_to_features: Dict[str, List[str]],
-        population_df: "pd.DataFrame",
-    ) -> Dict[str, str]:
-        """
-        Args:
-            entity_key_to_features: a dictionary { entity_key(s) -> features list }.
-            population_df: a pandas data-frame with a point in time column
-                and a column for each entity key defined at entity_key_to_features.
-
-        Returns:
-            dictionary of entity key to its dtype
-        """
-        supported_dtypes_to_actual_type = {
-            "object": "string",
-            "int32": "integer",
-            "int64": "integer",
-        }
-        entity_key_to_dtype = {}
-        for entity_key in entity_key_to_features.keys():
-            entity_pandas_dtype = population_df.dtypes[entity_key].name
-            if entity_pandas_dtype not in supported_dtypes_to_actual_type:
-                raise QwakException(
-                    f"Got an unsupported dtype for the entity key "
-                    f"[{entity_key}] - [{entity_pandas_dtype}]"
-                )
-            entity_key_to_dtype[entity_key] = supported_dtypes_to_actual_type[
-                entity_pandas_dtype
-            ]
-        return entity_key_to_dtype
-
-    def _run_ranges_query(
-        self,
-        feature_name_to_feature_list: defaultdict,
-        start_date: datetime,
-        end_date: datetime,
-    ):
-        result_dfs = []
-        features_set_by_entity = defaultdict(lambda: defaultdict(set))
-        for (
-            (entity_key, feature_set_name),
-            feature_list,
-        ) in feature_name_to_feature_list.items():
-            for feature in feature_list:
-                feature_set_name = feature.split(".")[0]
-                features_set_by_entity[entity_key][feature_set_name].add(feature)
-
-        for entity_key, features_dict in features_set_by_entity.items():
-            entity_dfs = []
-            for feature_set_name, feature_list in features_dict.items():
-                offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                    feature_set_name
-                )
-
-                features = ", ".join(
-                    [
-                        f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes}"
-                        for feature in feature_list
-                    ]
-                )
-
-                where_part = (
-                    "WHERE "
-                    f"{table_path}.{self.FS_START_TIME_COLUMN} >= timestamp '{start_date}' "
-                    f"AND ({self.FS_END_TIME_COLUMN} <= "
-                    f"timestamp '{end_date}' OR {table_path}.{self.FS_END_TIME_COLUMN} IS NULL) "
-                    f"AND {table_path}.{self.FS_START_TIME_COLUMN} < timestamp '{end_date}'"
-                )
-
-                full_sql = (
-                    f"SELECT {offline_feature_store_full_path}.{self.quotes}{entity_key}{self.quotes}, "  # nosec B608
-                    f"{offline_feature_store_full_path}.{self.quotes}{self.FS_START_TIME_COLUMN}{self.quotes}, "
-                    f"{features} "
-                    f"FROM {offline_feature_store_full_path} "
-                    f"{where_part}"
-                )
-
-                entity_dfs.append(self.query_engine.read_pandas_from_query(full_sql))
-
-            entity_final_df = reduce(
-                lambda left, right: pd.merge(
-                    left, right, on=[entity_key, self.FS_START_TIME_COLUMN], how="outer"
-                ),
-                entity_dfs,
-            )
-            result_dfs.append(entity_final_df.reset_index(drop=True))
-
-        return result_dfs[0] if len(result_dfs) == 1 else result_dfs
-
-    def _run_point_in_time_query(
-        self,
-        feature_name_to_feature_list: defaultdict,
-        uploaded_population_path: str,
-        point_in_time_column_name: str,
-        population_list: list,
-    ) -> "pd.DataFrame":
-        """
-        creates the SQL query for point in time feature fetching based on population and requested features
-        """
-        dfs = []
-
-        for index, ((entity_key, feature_set_name), feature_list) in enumerate(
-            feature_name_to_feature_list.items()
-        ):
-            offline_feature_store_full_path, table_path = self.get_fs_full_path(
-                feature_set_name
-            )
-
-            join_part = self._get_join_population_sql(
-                entity_key, offline_feature_store_full_path, uploaded_population_path
-            )
-
-            point_in_time_column_full_path = (
-                f"{uploaded_population_path}.{point_in_time_column_name}"
-            )
-
-            where_part = (
-                "WHERE "
-                f"{point_in_time_column_full_path} >= "
-                f"{table_path}.{self.FS_START_TIME_COLUMN} "
-                f"AND ({point_in_time_column_full_path} < "
-                f"{table_path}.{self.FS_END_TIME_COLUMN} OR "
-                f"{table_path}.{self.FS_END_TIME_COLUMN} IS NULL)"
-            )
-
-            features = ", ".join(
-                [
-                    f"{offline_feature_store_full_path}.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                    for feature in feature_list
-                ]
-            )
-
-            final_query_features = ", ".join(
-                [
-                    f"filtered_features.{self.quotes}{feature}{self.quotes} as {self.quotes}{feature}{self.quotes}"
-                    for feature in feature_list
-                ]
-            )
-
-            full_sql = (
-                f"WITH "  # nosec B608
-                "filtered_features AS ( "
-                f"SELECT {uploaded_population_path}.*, {features} "
-                f"FROM {uploaded_population_path} "
-                f"{join_part} "
-                f"{where_part} "
-                ") "
-                f"SELECT population.*, {final_query_features} "
-                f"FROM {uploaded_population_path} population "
-                f"LEFT JOIN filtered_features "
-                f"ON population.{entity_key} = filtered_features.{entity_key} "
-                f"AND population.{point_in_time_column_name} = filtered_features.{point_in_time_column_name}"
-            )
-
-            dfs.append(
-                self.query_engine.read_pandas_from_query(
-                    full_sql, [point_in_time_column_name]
-                )
-            )
-        return self._merge_query_dataframes_results(dfs, population_list)
-
-    @staticmethod
-    def _merge_query_dataframes_results(
-        dfs: List["pd.DataFrame"], population_list
-    ) -> "pd.DataFrame":
-        """
-        merges query result dataframes according to the population list columns
-        """
-        if dfs:
-            df_final = reduce(
-                lambda left, right: pd.merge(
-                    left, right, on=population_list, how="outer"
-                ),
-                dfs,
-            )
-
-            ordered_cols = population_list + (
-                df_final.columns.drop(population_list).tolist()
-            )
-
-            return df_final[ordered_cols].reset_index(drop=True)
-        else:
-            return pd.DataFrame()
-
-    def _get_join_population_sql(
-        self, entity_key, offline_feature_store_full_path, uploaded_population_path
-    ):
-        """
-        returns a JOIN SQL clause for the uploaded population table
-        """
-        join_part = (
-            f"LEFT JOIN {offline_feature_store_full_path} ON "
-            f"{offline_feature_store_full_path}.{entity_key} = "
-            f"{uploaded_population_path}.{entity_key} "
-        )
-        return join_part
-
-    def get_fs_full_path(self, feature_set_name: str) -> Tuple[str, str]:
-        offline_feature_store_table_name = self._get_offline_feature_store_full_name(
-            feature_set_name
-        )
-        table_path = f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-        offline_feature_store_full_path = (
-            f"{self.quotes}{self.FS_DB_NAME}{self.quotes}."
-            f"{self.quotes}{offline_feature_store_table_name}{self.quotes}"
-        )
-        return offline_feature_store_full_path, table_path
-
-    def _fs_db_names(self) -> List[str]:
-        return [
-            database_tuple[0]
-            for database_tuple in self.query_engine.run_query("SHOW SCHEMAS")
-        ]
-
-    def _fs_tables_names(self) -> List[str]:
-        return [
-            table_tuple[0]
-            for table_tuple in self.query_engine.run_query(
-                f"SHOW TABLES IN {self.FS_DB_NAME}"
-            )
-        ]
-
-    def _get_offline_feature_store_full_name(self, feature_set_name: str) -> str:
-        return f"{self.FS_TABLE_NAME_PREFIX}_{feature_set_name}".lower().replace(
-            "-", "_"
-        )
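
Migration note: the deprecation warnings in the removed client point callers at OfflineClientV2 and its get_feature_values() / get_feature_range_values() methods. Below is a minimal sketch of the point-in-time call after migration, assuming OfflineClientV2 is exported from qwak.feature_store.offline and accepts arguments shaped like the deprecated API; the import path and keyword arguments are assumptions, since this diff only confirms the class and method names.

    import pandas as pd

    # Assumed import path - only the OfflineClientV2 name is confirmed by the
    # deprecation warnings above; verify against the installed qwak-core.
    from qwak.feature_store.offline import OfflineClientV2

    population_df = pd.DataFrame(
        columns=['uuid', 'timestamp'],
        data=[['1', '2021-01-02 17:00:00'],
              ['2', '2021-01-01 12:00:00']])

    client = OfflineClientV2()
    # Hypothetical keyword arguments, mirroring the deprecated get_feature_values().
    train_df = client.get_feature_values(
        entity_key_to_features={'uuid': ['user_purchases.number_of_purchases']},
        population=population_df,
        point_in_time_column_name='timestamp')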