qwak-core 0.5.4__py3-none-any.whl → 0.5.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _qwak_proto/qwak/builds/build_pb2.py +40 -40
- _qwak_proto/qwak/builds/build_pb2.pyi +7 -1
- _qwak_proto/qwak/builds/build_values_pb2.py +38 -38
- _qwak_proto/qwak/builds/build_values_pb2.pyi +7 -1
- _qwak_proto/qwak/model_group/model_group_repository_details_pb2.py +16 -12
- _qwak_proto/qwak/model_group/model_group_repository_details_pb2.pyi +44 -6
- qwak/__init__.py +1 -1
- qwak/clients/feature_store/execution_management_client.py +28 -0
- qwak/feature_store/execution/streaming_backfill.py +48 -0
- qwak/feature_store/feature_sets/streaming.py +84 -63
- qwak/feature_store/feature_sets/streaming_backfill.py +88 -124
- {qwak_core-0.5.4.dist-info → qwak_core-0.5.16.dist-info}/METADATA +1 -1
- {qwak_core-0.5.4.dist-info → qwak_core-0.5.16.dist-info}/RECORD +15 -14
- qwak_services_mock/mocks/execution_management_service.py +9 -1
- {qwak_core-0.5.4.dist-info → qwak_core-0.5.16.dist-info}/WHEEL +0 -0
--- a/qwak/feature_store/feature_sets/streaming.py
+++ b/qwak/feature_store/feature_sets/streaming.py
@@ -2,7 +2,7 @@ import collections
 import functools
 import inspect
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 from _qwak_proto.qwak.feature_store.features.execution_pb2 import (
@@ -21,6 +21,7 @@ from _qwak_proto.qwak.feature_store.sources.streaming_pb2 import (
     StreamingSource,
     StreamingSource as ProtoStreamingSource,
 )
+from google.protobuf.timestamp_pb2 import Timestamp as ProtoTimestamp
 from qwak.clients.feature_store import FeatureRegistryClient
 from qwak.exceptions import QwakException
 from qwak.feature_store._common.artifact_utils import ArtifactSpec, ArtifactsUploader
@@ -34,7 +35,7 @@ from qwak.feature_store.feature_sets.metadata import (
     set_metadata_on_function,
 )
 from qwak.feature_store.feature_sets.streaming_backfill import (
-    DataSourceBackfillSpec,
+    BackfillDataSource,
     StreamingBackfill,
 )
 from qwak.feature_store.feature_sets.transformations import (
@@ -75,6 +76,7 @@ def feature_set(
     key: Optional[str] = None,
     auxiliary_sinks: List[BaseSink] = [],
     repository: Optional[str] = None,
+    backfill_max_timestamp: Optional[datetime] = None,
 ):
     """
     Creates a streaming feature set for the specified entity using the given streaming data sources.
@@ -110,6 +112,11 @@ def feature_set(
     """
 
     def decorator(function):
+        if isinstance(function, StreamingBackfill):
+            raise QwakException(
+                "Backfill can no longer be defined as a decorator on the feature set, it must be triggered after feature set creation."
+            )
+
         user_transformation = function()
         FeaturesetUtils.validate_base_featureset_decorator(
             user_transformation=user_transformation, entity=entity, key=key
@@ -120,10 +127,6 @@ def feature_set(
             offline_scheduling_policy=offline_scheduling_policy,
         )
 
-        streaming_backfill: Optional[StreamingBackfill] = (
-            StreamingBackfill.get_streaming_backfill_from_function(function=function)
-        )
-
         fs_name = name or function.__name__
         streaming_feature_set = StreamingFeatureSet(
             name=fs_name,
@@ -150,7 +153,7 @@ def feature_set(
             online_cluster_template=getattr(
                 function, _ONLINE_CLUSTER_SPEC, ClusterTemplate.SMALL
            ),
-            backfill=streaming_backfill,
+            backfill_max_timestamp=backfill_max_timestamp,
             __instance_module_path__=inspect.stack()[1].filename,
             auxiliary_sinks=auxiliary_sinks,
         )
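Taken together, the hunks above replace the old decorator-attached backfill with a single `backfill_max_timestamp` argument on the feature-set decorator. A minimal sketch of how the new argument might be passed; the `entity`, `data_sources`, and `timestamp_column_name` parameters and the transformation body are illustrative assumptions, not taken from this diff:

from datetime import datetime, timezone

from qwak.feature_store.feature_sets import streaming
from qwak.feature_store.feature_sets.transformations import SparkSqlTransformation

@streaming.feature_set(
    entity="user",                               # assumed parameter
    data_sources=["users_registration_stream"],  # assumed parameter
    timestamp_column_name="reg_date",            # assumed parameter
    # New in 0.5.16: events older than this cutoff are left to explicit
    # backfills; the value must be tile-aligned (see the validation below).
    backfill_max_timestamp=datetime(2023, 9, 1, tzinfo=timezone.utc),
)
def user_streaming_agg_features():
    return SparkSqlTransformation(
        "SELECT user_id, reg_date FROM users_registration_stream"
    )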
@@ -197,55 +200,68 @@ def execution_specification(
 @typechecked
 def backfill(
     *,
-    start_date: datetime,
-    end_date: datetime,
-    data_sources: Union[List[str], List[DataSourceBackfillSpec]],
-    backfill_transformation: SparkSqlTransformation,
+    feature_set_name: str,
+    start_date: Optional[datetime],
+    end_date: Optional[datetime],
+    data_sources: Union[List[str], List[BackfillDataSource]],
     backfill_cluster_template: Optional[ClusterTemplate] = ClusterTemplate.SMALL,
 ):
     """
-
+    Triggers a backfill execution for an existing streaming featureset. Currently available for streaming
     aggregation featuresets only.
 
-    :
-
-
-
-
-
-
-
-
-
-
-
-        data_sources=["users_registration_stream"],
-        timestamp_column_name="reg_date"
-    )
+    Args:
+        feature_set_name (str): Name of the FeatureSet to trigger a backfill for.
+        start_date (datetime): Backfill start date, on Streaming Aggregation Feature Sets,
+            needs to align with the FeatureSet tiles.
+        end_date (datetime): Backfill end date, on Streaming Aggregation Feature Sets,
+            needs to align with the FeatureSet tiles and be smaller than the Feature Set's backfill_max_timestamp.
+        data_sources (list[BackfillDataSource] | list[str]): A list of BackfillDataSource objects containing
+            batch source name and optional time range, or a list of batch source names (with no time range limits).
+        backfill_cluster_template (ClusterTemplate, optional): An optional cluster specification for the backfill job.
+            Defaults to SMALL.
+
+    Examples:
         @streaming.backfill(
-
-
-
-
+            feature_set_name="user_streaming_agg_features",
+            start_date=datetime(2022,1,1,0,0,0),
+            end_date=datetime(2023,9,1,0,0,0),
+            data_sources=[BackfillDataSource(data_source_name="backfill_data_source",
+                                             start_datetime=datetime(2023,1,1,0,0,0),
+                                             end_datetime=datetime(2023,8,1,0,0,0))],
             backfill_cluster_template=ClusterTemplate.SMALL
-            backfill_transformation=SparkSqlTransformation("SELECT user_id, reg_country, reg_date FROM backfill_data_source")
         )
-        def
-            return SparkSqlTransformation("SELECT user_id, reg_country, reg_date FROM
+        def backfill_transformation():
+            return SparkSqlTransformation("SELECT user_id, reg_country, reg_date FROM backfill_data_source")
     """
 
     def decorator(function):
-
-
-
-
-
-
-
-
+        if isinstance(function, StreamingFeatureSet):
+            raise QwakException(
+                "Backfill can no longer be defined as a decorator on the feature set, it must be triggered after feature set creation."
+            )
+
+        backfill_transformation: SparkSqlTransformation = function()
+
+        if not isinstance(backfill_transformation, SparkSqlTransformation):
+            raise QwakException(
+                "Backfill must defined on a method returning a SparkSqlTransformation"
+            )
+
+        streaming_backfill = StreamingBackfill(
+            featureset_name=feature_set_name,
+            start_datetime=start_date,
+            end_datetime=end_date,
+            data_sources=StreamingBackfill._get_normalized_backfill_sources_spec(
+                data_sources
+            ),
+            transform=backfill_transformation,
+            cluster_template=backfill_cluster_template,
         )
 
-
+        functools.update_wrapper(streaming_backfill, backfill_transformation)
+
+        return streaming_backfill
 
     return decorator
 
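For reference, a self-contained version of the docstring example above, showing the new trigger-after-creation API; the import locations follow the modules touched elsewhere in this diff, and the featureset and source names are placeholders:

from datetime import datetime

from qwak.feature_store.feature_sets import streaming
from qwak.feature_store.feature_sets.execution_spec import ClusterTemplate
from qwak.feature_store.feature_sets.streaming_backfill import BackfillDataSource
from qwak.feature_store.feature_sets.transformations import SparkSqlTransformation

# Backfill is now triggered for an existing feature set, not declared on it.
@streaming.backfill(
    feature_set_name="user_streaming_agg_features",
    start_date=datetime(2022, 1, 1),
    end_date=datetime(2023, 9, 1),
    data_sources=[
        BackfillDataSource(
            data_source_name="backfill_data_source",
            start_datetime=datetime(2023, 1, 1),
            end_datetime=datetime(2023, 8, 1),
        )
    ],
    backfill_cluster_template=ClusterTemplate.SMALL,
)
def backfill_transformation():
    # The decorated function must return a SparkSqlTransformation.
    return SparkSqlTransformation(
        "SELECT user_id, reg_country, reg_date FROM backfill_data_source"
    )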
@@ -316,7 +332,7 @@ class StreamingFeatureSet(BaseFeatureSet):
     offline_cluster_template: Optional[ClusterTemplate] = None
     online_cluster_template: Optional[ClusterTemplate] = None
     metadata: Optional[Metadata] = None
-    backfill: Optional[StreamingBackfill] = None
+    backfill_max_timestamp: Optional[datetime] = None
     auxiliary_sinks: List[BaseSink] = field(default_factory=lambda: [])
 
     def __post_init__(self):
@@ -399,7 +415,6 @@ class StreamingFeatureSet(BaseFeatureSet):
         proto_featureset_type = self._get_streaming_aggregation_featureset_proto(
             artifact_url=artifact_url,
             streaming_sources=data_sources,
-            feature_registry=feature_registry,
             initial_tile_size=maybe_initial_tile_size,
         )
 
@@ -453,10 +468,9 @@ class StreamingFeatureSet(BaseFeatureSet):
                 "Auxiliary Sinks Are not supported in Streaming Aggregation Feature Sets"
             )
 
-
-        if self.backfill and not is_streaming_agg:
+        if self.backfill_max_timestamp and not is_streaming_agg:
             raise QwakException(
-                "
+                "backfill_max_timestamp can only be set for Streaming Aggregation FeatureSet."
             )
 
         # Validate transformation is PySpark when multiple data sources are used
@@ -515,18 +529,29 @@ class StreamingFeatureSet(BaseFeatureSet):
             )
             raise QwakException(error_message_str)
 
-
-
-
+        if not self.backfill_max_timestamp:
+            raise QwakException(
+                """
+                backfill_max_timestamp must be set for Streaming Aggregation FeatureSet.
+                Events earlier than this timestamp can only be processed by triggering backfill,
+                the Streaming job will not process events that are earlier than this timestamp.
+                """
+            )
+
+        self._validate_streaming_aggregation_backfill_max_timestamp()
 
         return initial_tile_size
 
-    def
+    def _validate_streaming_aggregation_backfill_max_timestamp(self):
         initial_tile_size, _ = StreamingFeatureSet._get_default_slide_period(
             self.transformation.windows
         )
 
-        self.
+        if self.backfill_max_timestamp.timestamp() % initial_tile_size != 0:
+            raise QwakException(
+                f"Chosen backfill max timestamp is invalid,"
+                f" it has to be exactly dividable by slice size of {initial_tile_size} seconds."
+            )
 
     @staticmethod
     def _get_default_slide_period(
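The modulo check above is the whole alignment rule: the chosen timestamp's epoch seconds must divide evenly by the slice (tile) size. A small standalone sketch of picking an aligned value, with `tile_size_seconds` standing in for the featureset's computed slide period:

from datetime import datetime, timezone

def floor_to_tile(ts: datetime, tile_size_seconds: int) -> datetime:
    # Round down to the previous tile boundary so that
    # ts.timestamp() % tile_size_seconds == 0, as the validation requires.
    epoch = int(ts.timestamp())
    return datetime.fromtimestamp(epoch - epoch % tile_size_seconds, tz=timezone.utc)

# With a 5-minute tile, 12:07:42 floors to 12:05:00.
raw = datetime(2023, 9, 1, 12, 7, 42, tzinfo=timezone.utc)
aligned = floor_to_tile(raw, tile_size_seconds=300)
assert aligned.timestamp() % 300 == 0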
@@ -596,9 +621,12 @@ class StreamingFeatureSet(BaseFeatureSet):
         self,
         artifact_url: Optional[str],
         streaming_sources: List[StreamingSource],
-        feature_registry: FeatureRegistryClient,
         initial_tile_size: int,
     ) -> ProtoFeatureSetType:
+        backfill_max_timestamp = ProtoTimestamp()
+        backfill_max_timestamp.FromDatetime(
+            self.backfill_max_timestamp.astimezone(timezone.utc)
+        )
         return ProtoFeatureSetType(
             streaming_aggregation_feature_set=ProtoStreamingAggregationFeatureSet(
                 transformation=self.transformation._to_proto(
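The datetime-to-protobuf conversion added here is the stock google.protobuf pattern; pinning UTC via astimezone(timezone.utc) before FromDatetime keeps the wire value independent of the local timezone. A standalone sketch:

from datetime import datetime, timezone
from google.protobuf.timestamp_pb2 import Timestamp

ts = Timestamp()
ts.FromDatetime(datetime(2023, 9, 1).astimezone(timezone.utc))
print(ts.seconds, ts.nanos)  # epoch seconds, fractional part in nanos
restored = ts.ToDatetime()   # back to a naive UTC datetime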
@@ -621,14 +649,7 @@ class StreamingFeatureSet(BaseFeatureSet):
                     allowed_late_arrival_seconds=60 * 10,
                     aggregations=self.transformation._get_aggregations_proto(),
                 ),
-                backfill_spec=(
-                    self.backfill._to_proto(
-                        feature_registry=feature_registry,
-                        original_instance_module_path=self.__instance_module_path__,
-                        featureset_name=self.name,
-                    )
-                    if self.backfill
-                    else None
-                ),
+                backfill_spec=None,
+                backfill_max_timestamp=backfill_max_timestamp,
             )
         )
--- a/qwak/feature_store/feature_sets/streaming_backfill.py
+++ b/qwak/feature_store/feature_sets/streaming_backfill.py
@@ -1,24 +1,18 @@
-from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import List, Optional, Set, Union
 
 from _qwak_proto.qwak.feature_store.features.execution_pb2 import (
-
+    ExecutionSpec as ProtoExecutionSpec,
 )
-from _qwak_proto.qwak.
-
-
-
-)
-from _qwak_proto.qwak.feature_store.sources.batch_pb2 import (
-    BatchSource as ProtoBatchSource,
+from _qwak_proto.qwak.execution.v1.streaming_aggregation_pb2 import (
+    StreamingAggregationBackfillIngestion as ProtoStreamingAggregationBackfillIngestion,
+    BackfillDataSource as ProtoBackfillDataSource,
+    TimeRange as ProtoTimeRange,
 )
 from google.protobuf.timestamp_pb2 import Timestamp as ProtoTimestamp
-from qwak.clients.feature_store import FeatureRegistryClient
 from qwak.exceptions import QwakException
 from qwak.feature_store._common.artifact_utils import ArtifactSpec, ArtifactsUploader
-from qwak.feature_store._common.feature_set_utils import get_batch_source_for_featureset
 from qwak.feature_store.feature_sets.execution_spec import ClusterTemplate
 from qwak.feature_store.feature_sets.transformations import SparkSqlTransformation
 
@@ -26,36 +20,15 @@ _BACKFILL_ = "_qwak_backfill_specification"
 
 
 @dataclass
-class DataSourceBackfillSpec(ABC):
+class BackfillDataSource:
     data_source_name: str
-
-    @abstractmethod
-    def _to_proto(self, feature_registry: FeatureRegistryClient):
-        pass
-
-    @classmethod
-    def _from_proto(cls, proto: ProtoBackfillDataSourceSpec):
-        function_mapping = {"batch_data_source_spec": BackfillBatchDataSourceSpec}
-
-        backfill_source_type: str = proto.WhichOneof("type")
-
-        if backfill_source_type in function_mapping:
-            function_class = function_mapping.get(backfill_source_type)
-            return function_class._from_proto(proto)
-
-        raise QwakException(
-            f"Got unsupported backfill source type {backfill_source_type} for streaming backfill"
-        )
-
-
-@dataclass
-class BackfillBatchDataSourceSpec(DataSourceBackfillSpec):
     start_datetime: Optional[datetime] = None
     end_datetime: Optional[datetime] = None
 
-    def
-        self
-
+    def __post_init__(self):
+        self._validate()
+
+    def _to_proto(self) -> ProtoBackfillDataSource:
         start_timestamp: Optional[ProtoTimestamp] = None
         end_timestamp: Optional[ProtoTimestamp] = None
 
@@ -67,63 +40,94 @@ class BackfillBatchDataSourceSpec(DataSourceBackfillSpec):
             start_timestamp = ProtoTimestamp()
             start_timestamp.FromDatetime(self.start_datetime.astimezone(timezone.utc))
 
-        proto_data_source = get_batch_source_for_featureset(
-            batch_ds_name=self.data_source_name, feature_registry=feature_registry
-        )
-
-        return ProtoBackfillBatchDataSourceSpec(
-            data_source=proto_data_source,
+        time_range = ProtoTimeRange(
             start_timestamp=start_timestamp,
             end_timestamp=end_timestamp,
         )
 
+        return ProtoBackfillDataSource(
+            data_source_name=self.data_source_name,
+            time_range=time_range,
+        )
+
     @classmethod
-    def _from_proto(
-        cls, proto: ProtoBackfillDataSourceSpec
-    ) -> "BackfillBatchDataSourceSpec":
+    def _from_proto(cls, proto: ProtoBackfillDataSource) -> "BackfillDataSource":
         start_datetime: Optional[datetime] = None
         end_datetime: Optional[datetime] = None
 
-
-            proto.batch_data_source_spec
-        )
+        time_range: ProtoTimeRange = proto.time_range
 
-        proto_start_timestamp: ProtoTimestamp =
-
+        proto_start_timestamp: Optional[ProtoTimestamp] = (
+            time_range.start_timestamp if time_range.start_timestamp else None
+        )
+        proto_end_timestamp: Optional[ProtoTimestamp] = (
+            time_range.end_timestamp if time_range.end_timestamp else None
+        )
 
-        start_datetime =
-
+        start_datetime = (
+            datetime.fromtimestamp(
+                proto_start_timestamp.seconds + proto_start_timestamp.nanos / 1e9
+            )
+            if proto_start_timestamp
+            else None
         )
 
-        end_datetime =
-
+        end_datetime = (
+            datetime.fromtimestamp(
+                proto_end_timestamp.seconds + proto_end_timestamp.nanos / 1e9
+            )
+            if proto_end_timestamp
+            else None
         )
 
         return cls(
-            data_source_name=
+            data_source_name=proto.data_source_name,
             start_datetime=start_datetime,
             end_datetime=end_datetime,
         )
 
+    def _validate(self):
+        if self.start_datetime and self.end_datetime:
+            if self.start_datetime >= self.end_datetime:
+                raise QwakException(
+                    f"Backfill data source {self.data_source_name} has invalid time range: "
+                    f"start_datetime {self.start_datetime} is after or equal end_datetime {self.end_datetime}."
+                )
+
+        if not self.data_source_name:
+            raise QwakException(
+                "Backfill data source must have a valid data source name."
+            )
+
 
 @dataclass
 class StreamingBackfill:
+    featureset_name: str
     start_datetime: datetime
     end_datetime: datetime
-    data_sources_specs: List[DataSourceBackfillSpec]
+    data_sources: List[BackfillDataSource]
     transform: "SparkSqlTransformation"
     cluster_template: Optional[ClusterTemplate] = ClusterTemplate.SMALL
 
     def __post_init__(self):
-        if not self.data_sources_specs:
+        if not self.featureset_name:
+            raise QwakException("featureset_name must be provided for backfill.")
+
+        if not self.start_datetime or not self.end_datetime:
             raise QwakException(
-                "
-                "At least one data source has to be provided when trying to create a streaming backfill."
+                "For Streaming backfill, start_datetime and end_datetime are mandatory fields."
             )
 
-        if
+        if self.start_datetime >= self.end_datetime:
             raise QwakException(
-                "
+                f"Backfill has invalid time range: "
+                f"start_datetime {self.start_datetime} is after or equal end_datetime {self.end_datetime}."
+            )
+
+        if not self.data_sources:
+            raise QwakException(
+                "Trying to create a streaming backfill with no data sources. "
+                "At least one data source has to be provided when trying to create a streaming backfill."
             )
 
         if type(self.transform) is not SparkSqlTransformation:
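Because __post_init__ now calls _validate(), a bad time range fails at construction rather than at submission. A short usage sketch of the dataclass as defined above:

from datetime import datetime

from qwak.exceptions import QwakException
from qwak.feature_store.feature_sets.streaming_backfill import BackfillDataSource

# Valid: a source limited to a seven-month window.
windowed = BackfillDataSource(
    data_source_name="backfill_data_source",
    start_datetime=datetime(2023, 1, 1),
    end_datetime=datetime(2023, 8, 1),
)

# Invalid: an inverted range raises immediately from __post_init__.
try:
    BackfillDataSource(
        data_source_name="backfill_data_source",
        start_datetime=datetime(2023, 8, 1),
        end_datetime=datetime(2023, 1, 1),
    )
except QwakException as e:
    print(e)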
@@ -135,7 +139,7 @@ class StreamingBackfill:
 
     def _validate_unique_sources(self):
         source_names: List[str] = [
-            data_source.data_source_name for data_source in self.data_sources_specs
+            data_source.data_source_name for data_source in self.data_sources
         ]
         duplicates: Set[str] = {
             item for item in source_names if source_names.count(item) > 1
@@ -146,23 +150,14 @@ class StreamingBackfill:
                 f"Found these duplicates: {', '.join(set(duplicates))}"
             )
 
-    def _validate_tile_size(self, initial_tile_size: int):
-        if self.end_datetime.timestamp() % initial_tile_size != 0:
-            raise QwakException(
-                f"Chosen backfill end datetime is invalid,"
-                f" it has to be exactly dividable by slice size of {initial_tile_size} seconds."
-            )
-
     def _to_proto(
         self,
-        feature_registry: FeatureRegistryClient,
-        featureset_name: str,
         original_instance_module_path: str,
-    ) ->
+    ) -> ProtoStreamingAggregationBackfillIngestion:
         artifact_url: Optional[str] = None
         artifact_spec: Optional[ArtifactSpec] = ArtifactsUploader.get_artifact_spec(
             transformation=self.transform,
-            featureset_name=f"{featureset_name}-backfill",
+            featureset_name=f"{self.featureset_name}-backfill",
             __instance_module_path__=original_instance_module_path,
         )
 
@@ -175,85 +170,54 @@ class StreamingBackfill:
         start_timestamp = ProtoTimestamp()
         start_timestamp.FromDatetime(self.start_datetime.astimezone(timezone.utc))
 
-        return
+        return ProtoStreamingAggregationBackfillIngestion(
+            featureset_name=self.featureset_name,
             start_timestamp=start_timestamp,
             end_timestamp=end_timestamp,
-            execution_spec=
-
+            execution_spec=ProtoExecutionSpec(
+                cluster_template=ClusterTemplate.to_proto(self.cluster_template)
             ),
             transformation=self.transform._to_proto(artifact_path=artifact_url),
             data_source_specs=[
-
-                    batch_data_source_spec=data_source_spec._to_proto(
-                        feature_registry=feature_registry
-                    )
-                )
-                for data_source_spec in self.data_sources_specs
+                data_source._to_proto() for data_source in self.data_sources
             ],
         )
 
     @classmethod
-    def _from_proto(cls, proto:
-
-
-    )
-
-        data_sources_specs = [
-            BackfillBatchDataSourceSpec._from_proto(ds)
-            for ds in proto.data_source_specs
+    def _from_proto(cls, proto: ProtoStreamingAggregationBackfillIngestion):
+        backfill_data_sources = [
+            BackfillDataSource._from_proto(ds) for ds in proto.data_source_specs
         ]
 
         return cls(
+            featureset_name=proto.featureset_name,
             start_datetime=datetime.fromtimestamp(
                 proto.start_timestamp.seconds + proto.start_timestamp.nanos / 1e9
             ),
             end_datetime=datetime.fromtimestamp(
                 proto.end_timestamp.seconds + proto.end_timestamp.nanos / 1e9
             ),
-            data_sources_specs=data_sources_specs,
+            data_sources=backfill_data_sources,
             transform=SparkSqlTransformation._from_proto(
                 proto.transformation.sql_transformation
             ),
+            cluster_template=(
+                ClusterTemplate.from_proto(proto.execution_spec.cluster_template)
+                if proto.execution_spec.cluster_template
+                else None
+            ),
         )
 
     @staticmethod
     def _get_normalized_backfill_sources_spec(
-        data_sources: Union[List[str], List[DataSourceBackfillSpec]],
-    ) -> List[
-        # reformat all data source
+        data_sources: Union[List[str], List[BackfillDataSource]],
+    ) -> List[BackfillDataSource]:
+        # reformat all data source names to 'BackfillDataSource'
         return [
             (
-
+                BackfillDataSource(data_source_name=data_source)
                 if isinstance(data_source, str)
                 else data_source
             )
             for data_source in data_sources
         ]
-
-    @classmethod
-    def set_streaming_backfill_on_function(
-        cls,
-        function,
-        start_date: datetime,
-        end_date: datetime,
-        data_sources: Union[List[str], List[DataSourceBackfillSpec]],
-        backfill_transformation: SparkSqlTransformation,
-        backfill_cluster_template: Optional[ClusterTemplate] = ClusterTemplate.SMALL,
-    ):
-        setattr(
-            function,
-            _BACKFILL_,
-            cls(
-                start_datetime=start_date,
-                end_datetime=end_date,
-                data_sources_specs=StreamingBackfill._get_normalized_backfill_sources_spec(
-                    data_sources
-                ),
-                transform=backfill_transformation,
-                cluster_template=backfill_cluster_template,
-            ),
-        )
-
-    @staticmethod
-    def get_streaming_backfill_from_function(function):
-        return getattr(function, _BACKFILL_, None)