airbyte-cdk 6.23.0__py3-none-any.whl → 6.23.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +80 -16
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +23 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
- airbyte_cdk/sources/declarative/incremental/__init__.py +6 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +334 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +3 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +15 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +18 -6
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +163 -16
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +51 -57
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- {airbyte_cdk-6.23.0.dist-info → airbyte_cdk-6.23.0.dev1.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.23.0.dist-info → airbyte_cdk-6.23.0.dev1.dist-info}/RECORD +18 -15
- {airbyte_cdk-6.23.0.dist-info → airbyte_cdk-6.23.0.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.23.0.dist-info → airbyte_cdk-6.23.0.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.23.0.dist-info → airbyte_cdk-6.23.0.dev1.dist-info}/entry_points.txt +0 -0
@@ -8,7 +8,6 @@ import datetime
|
|
8
8
|
import importlib
|
9
9
|
import inspect
|
10
10
|
import re
|
11
|
-
import sys
|
12
11
|
from functools import partial
|
13
12
|
from typing import (
|
14
13
|
Any,
|
@@ -88,6 +87,8 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
|
|
88
87
|
)
|
89
88
|
from airbyte_cdk.sources.declarative.incremental import (
|
90
89
|
ChildPartitionResumableFullRefreshCursor,
|
90
|
+
ConcurrentCursorFactory,
|
91
|
+
ConcurrentPerPartitionCursor,
|
91
92
|
CursorFactory,
|
92
93
|
DatetimeBasedCursor,
|
93
94
|
DeclarativeCursor,
|
@@ -102,6 +103,7 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
|
|
102
103
|
LegacyToPerPartitionStateMigration,
|
103
104
|
)
|
104
105
|
from airbyte_cdk.sources.declarative.models import (
|
106
|
+
Clamping,
|
105
107
|
CustomStateMigration,
|
106
108
|
)
|
107
109
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
@@ -461,6 +463,16 @@ from airbyte_cdk.sources.message import (
|
|
461
463
|
InMemoryMessageRepository,
|
462
464
|
LogAppenderMessageRepositoryDecorator,
|
463
465
|
MessageRepository,
|
466
|
+
NoopMessageRepository,
|
467
|
+
)
|
468
|
+
from airbyte_cdk.sources.streams.concurrent.clamping import (
|
469
|
+
ClampingEndProvider,
|
470
|
+
ClampingStrategy,
|
471
|
+
DayClampingStrategy,
|
472
|
+
MonthClampingStrategy,
|
473
|
+
NoClamping,
|
474
|
+
WeekClampingStrategy,
|
475
|
+
Weekday,
|
464
476
|
)
|
465
477
|
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
|
466
478
|
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
|
@@ -917,6 +929,8 @@ class ModelToComponentFactory:
|
|
917
929
|
stream_namespace: Optional[str],
|
918
930
|
config: Config,
|
919
931
|
stream_state: MutableMapping[str, Any],
|
932
|
+
message_repository: Optional[MessageRepository] = None,
|
933
|
+
runtime_lookback_window: Optional[datetime.timedelta] = None,
|
920
934
|
**kwargs: Any,
|
921
935
|
) -> ConcurrentCursor:
|
922
936
|
component_type = component_definition.get("type")
|
@@ -978,10 +992,22 @@ class ModelToComponentFactory:
|
|
978
992
|
connector_state_converter = CustomFormatConcurrentStreamStateConverter(
|
979
993
|
datetime_format=datetime_format,
|
980
994
|
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
|
981
|
-
is_sequential_state=True,
|
995
|
+
is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state
|
982
996
|
cursor_granularity=cursor_granularity,
|
983
997
|
)
|
984
998
|
|
999
|
+
# Adjusts the stream state by applying the runtime lookback window.
|
1000
|
+
# This is used to ensure correct state handling in case of failed partitions.
|
1001
|
+
stream_state_value = stream_state.get(cursor_field.cursor_field_key)
|
1002
|
+
if runtime_lookback_window and stream_state_value:
|
1003
|
+
new_stream_state = (
|
1004
|
+
connector_state_converter.parse_timestamp(stream_state_value)
|
1005
|
+
- runtime_lookback_window
|
1006
|
+
)
|
1007
|
+
stream_state[cursor_field.cursor_field_key] = connector_state_converter.output_format(
|
1008
|
+
new_stream_state
|
1009
|
+
)
|
1010
|
+
|
985
1011
|
start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
|
986
1012
|
if isinstance(datetime_based_cursor_model.start_datetime, MinMaxDatetimeModel):
|
987
1013
|
start_date_runtime_value = self.create_min_max_datetime(
|
@@ -1048,11 +1074,58 @@ class ModelToComponentFactory:
|
|
1048
1074
|
if evaluated_step:
|
1049
1075
|
step_length = parse_duration(evaluated_step)
|
1050
1076
|
|
1077
|
+
clamping_strategy: ClampingStrategy = NoClamping()
|
1078
|
+
if datetime_based_cursor_model.clamping:
|
1079
|
+
# While it is undesirable to interpolate within the model factory (as opposed to at runtime),
|
1080
|
+
# it is still better than shifting the low-code concept of interpolation into the ConcurrentCursor runtime
|
1081
|
+
# object which we want to keep agnostic of being low-code
|
1082
|
+
target = InterpolatedString(
|
1083
|
+
string=datetime_based_cursor_model.clamping.target,
|
1084
|
+
parameters=datetime_based_cursor_model.parameters or {},
|
1085
|
+
)
|
1086
|
+
evaluated_target = target.eval(config=config)
|
1087
|
+
match evaluated_target:
|
1088
|
+
case "DAY":
|
1089
|
+
clamping_strategy = DayClampingStrategy()
|
1090
|
+
end_date_provider = ClampingEndProvider(
|
1091
|
+
DayClampingStrategy(is_ceiling=False),
|
1092
|
+
end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
|
1093
|
+
granularity=cursor_granularity or datetime.timedelta(seconds=1),
|
1094
|
+
)
|
1095
|
+
case "WEEK":
|
1096
|
+
if (
|
1097
|
+
not datetime_based_cursor_model.clamping.target_details
|
1098
|
+
or "weekday" not in datetime_based_cursor_model.clamping.target_details
|
1099
|
+
):
|
1100
|
+
raise ValueError(
|
1101
|
+
"Given WEEK clamping, weekday needs to be provided as target_details"
|
1102
|
+
)
|
1103
|
+
weekday = self._assemble_weekday(
|
1104
|
+
datetime_based_cursor_model.clamping.target_details["weekday"]
|
1105
|
+
)
|
1106
|
+
clamping_strategy = WeekClampingStrategy(weekday)
|
1107
|
+
end_date_provider = ClampingEndProvider(
|
1108
|
+
WeekClampingStrategy(weekday, is_ceiling=False),
|
1109
|
+
end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
|
1110
|
+
granularity=cursor_granularity or datetime.timedelta(days=1),
|
1111
|
+
)
|
1112
|
+
case "MONTH":
|
1113
|
+
clamping_strategy = MonthClampingStrategy()
|
1114
|
+
end_date_provider = ClampingEndProvider(
|
1115
|
+
MonthClampingStrategy(is_ceiling=False),
|
1116
|
+
end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
|
1117
|
+
granularity=cursor_granularity or datetime.timedelta(days=1),
|
1118
|
+
)
|
1119
|
+
case _:
|
1120
|
+
raise ValueError(
|
1121
|
+
f"Invalid clamping target {evaluated_target}, expected DAY, WEEK, MONTH"
|
1122
|
+
)
|
1123
|
+
|
1051
1124
|
return ConcurrentCursor(
|
1052
1125
|
stream_name=stream_name,
|
1053
1126
|
stream_namespace=stream_namespace,
|
1054
1127
|
stream_state=stream_state,
|
1055
|
-
message_repository=self._message_repository,
|
1128
|
+
message_repository=message_repository or self._message_repository,
|
1056
1129
|
connector_state_manager=state_manager,
|
1057
1130
|
connector_state_converter=connector_state_converter,
|
1058
1131
|
cursor_field=cursor_field,
|
@@ -1062,6 +1135,83 @@ class ModelToComponentFactory:
|
|
1062
1135
|
lookback_window=lookback_window,
|
1063
1136
|
slice_range=step_length,
|
1064
1137
|
cursor_granularity=cursor_granularity,
|
1138
|
+
clamping_strategy=clamping_strategy,
|
1139
|
+
)
|
1140
|
+
|
1141
|
+
def _assemble_weekday(self, weekday: str) -> Weekday:
|
1142
|
+
match weekday:
|
1143
|
+
case "MONDAY":
|
1144
|
+
return Weekday.MONDAY
|
1145
|
+
case "TUESDAY":
|
1146
|
+
return Weekday.TUESDAY
|
1147
|
+
case "WEDNESDAY":
|
1148
|
+
return Weekday.WEDNESDAY
|
1149
|
+
case "THURSDAY":
|
1150
|
+
return Weekday.THURSDAY
|
1151
|
+
case "FRIDAY":
|
1152
|
+
return Weekday.FRIDAY
|
1153
|
+
case "SATURDAY":
|
1154
|
+
return Weekday.SATURDAY
|
1155
|
+
case "SUNDAY":
|
1156
|
+
return Weekday.SUNDAY
|
1157
|
+
case _:
|
1158
|
+
raise ValueError(f"Unknown weekday {weekday}")
|
1159
|
+
|
1160
|
+
def create_concurrent_cursor_from_perpartition_cursor(
|
1161
|
+
self,
|
1162
|
+
state_manager: ConnectorStateManager,
|
1163
|
+
model_type: Type[BaseModel],
|
1164
|
+
component_definition: ComponentDefinition,
|
1165
|
+
stream_name: str,
|
1166
|
+
stream_namespace: Optional[str],
|
1167
|
+
config: Config,
|
1168
|
+
stream_state: MutableMapping[str, Any],
|
1169
|
+
partition_router: PartitionRouter,
|
1170
|
+
**kwargs: Any,
|
1171
|
+
) -> ConcurrentPerPartitionCursor:
|
1172
|
+
component_type = component_definition.get("type")
|
1173
|
+
if component_definition.get("type") != model_type.__name__:
|
1174
|
+
raise ValueError(
|
1175
|
+
f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
|
1176
|
+
)
|
1177
|
+
|
1178
|
+
datetime_based_cursor_model = model_type.parse_obj(component_definition)
|
1179
|
+
|
1180
|
+
if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
|
1181
|
+
raise ValueError(
|
1182
|
+
f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
|
1183
|
+
)
|
1184
|
+
|
1185
|
+
interpolated_cursor_field = InterpolatedString.create(
|
1186
|
+
datetime_based_cursor_model.cursor_field,
|
1187
|
+
parameters=datetime_based_cursor_model.parameters or {},
|
1188
|
+
)
|
1189
|
+
cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
|
1190
|
+
|
1191
|
+
# Create the cursor factory
|
1192
|
+
cursor_factory = ConcurrentCursorFactory(
|
1193
|
+
partial(
|
1194
|
+
self.create_concurrent_cursor_from_datetime_based_cursor,
|
1195
|
+
state_manager=state_manager,
|
1196
|
+
model_type=model_type,
|
1197
|
+
component_definition=component_definition,
|
1198
|
+
stream_name=stream_name,
|
1199
|
+
stream_namespace=stream_namespace,
|
1200
|
+
config=config,
|
1201
|
+
message_repository=NoopMessageRepository(),
|
1202
|
+
)
|
1203
|
+
)
|
1204
|
+
|
1205
|
+
# Return the per-partition concurrent cursor
|
1206
|
+
return ConcurrentPerPartitionCursor(
|
1207
|
+
cursor_factory=cursor_factory,
|
1208
|
+
partition_router=partition_router,
|
1209
|
+
stream_name=stream_name,
|
1210
|
+
stream_namespace=stream_namespace,
|
1211
|
+
stream_state=stream_state,
|
1212
|
+
message_repository=self._message_repository, # type: ignore
|
1213
|
+
connector_state_manager=state_manager,
|
1214
|
+
cursor_field=cursor_field,
|
1065
1215
|
)
|
1066
1216
|
|
1067
1217
|
@staticmethod
|
@@ -1369,18 +1519,15 @@ class ModelToComponentFactory:
|
|
1369
1519
|
raise ValueError(
|
1370
1520
|
"Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
|
1371
1521
|
)
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1378
|
-
|
1379
|
-
|
1380
|
-
|
1381
|
-
else None
|
1382
|
-
),
|
1383
|
-
}
|
1522
|
+
cursor = (
|
1523
|
+
combined_slicers
|
1524
|
+
if isinstance(
|
1525
|
+
combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
|
1526
|
+
)
|
1527
|
+
else self._create_component_from_model(model=model.incremental_sync, config=config)
|
1528
|
+
)
|
1529
|
+
|
1530
|
+
client_side_incremental_sync = {"cursor": cursor}
|
1384
1531
|
|
1385
1532
|
if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
|
1386
1533
|
cursor_model = model.incremental_sync
|
@@ -2227,7 +2374,7 @@ class ModelToComponentFactory:
|
|
2227
2374
|
if (
|
2228
2375
|
not isinstance(stream_slicer, DatetimeBasedCursor)
|
2229
2376
|
or type(stream_slicer) is not DatetimeBasedCursor
|
2230
|
-
):
|
2377
|
+
) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
|
2231
2378
|
# Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
|
2232
2379
|
# Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
|
2233
2380
|
# their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
|
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
|
|
160
160
|
stream_slice,
|
161
161
|
next_page_token,
|
162
162
|
self._paginator.get_request_headers,
|
163
|
-
self.
|
163
|
+
self.request_option_provider.get_request_headers,
|
164
164
|
)
|
165
165
|
if isinstance(headers, str):
|
166
166
|
raise ValueError("Request headers cannot be a string")
|
@@ -0,0 +1,99 @@
|
|
1
|
+
from abc import ABC
|
2
|
+
from datetime import datetime, timedelta
|
3
|
+
from enum import Enum
|
4
|
+
from typing import Callable
|
5
|
+
|
6
|
+
from airbyte_cdk.sources.streams.concurrent.cursor_types import CursorValueType
|
7
|
+
|
8
|
+
|
9
|
+
class ClampingStrategy(ABC):
|
10
|
+
def clamp(self, value: CursorValueType) -> CursorValueType:
|
11
|
+
raise NotImplementedError()
|
12
|
+
|
13
|
+
|
14
|
+
class NoClamping(ClampingStrategy):
|
15
|
+
def clamp(self, value: CursorValueType) -> CursorValueType:
|
16
|
+
return value
|
17
|
+
|
18
|
+
|
19
|
+
class ClampingEndProvider:
|
20
|
+
def __init__(
|
21
|
+
self,
|
22
|
+
clamping_strategy: ClampingStrategy,
|
23
|
+
end_provider: Callable[[], CursorValueType],
|
24
|
+
granularity: timedelta,
|
25
|
+
) -> None:
|
26
|
+
self._clamping_strategy = clamping_strategy
|
27
|
+
self._end_provider = end_provider
|
28
|
+
self._granularity = granularity
|
29
|
+
|
30
|
+
def __call__(self) -> CursorValueType:
|
31
|
+
return self._clamping_strategy.clamp(self._end_provider()) - self._granularity
|
32
|
+
|
33
|
+
|
34
|
+
class DayClampingStrategy(ClampingStrategy):
|
35
|
+
def __init__(self, is_ceiling: bool = True) -> None:
|
36
|
+
self._is_ceiling = is_ceiling
|
37
|
+
|
38
|
+
def clamp(self, value: datetime) -> datetime: # type: ignore # datetime implements method from CursorValueType
|
39
|
+
return_value = value.replace(hour=0, minute=0, second=0, microsecond=0)
|
40
|
+
if self._is_ceiling:
|
41
|
+
return return_value + timedelta(days=1)
|
42
|
+
return return_value
|
43
|
+
|
44
|
+
|
45
|
+
class MonthClampingStrategy(ClampingStrategy):
|
46
|
+
def __init__(self, is_ceiling: bool = True) -> None:
|
47
|
+
self._is_ceiling = is_ceiling
|
48
|
+
|
49
|
+
def clamp(self, value: datetime) -> datetime: # type: ignore # datetime implements method from CursorValueType
|
50
|
+
return_value = value.replace(hour=0, minute=0, second=0, microsecond=0)
|
51
|
+
needs_to_round = value.day != 1
|
52
|
+
if not needs_to_round:
|
53
|
+
return return_value
|
54
|
+
|
55
|
+
return self._ceil(return_value) if self._is_ceiling else return_value.replace(day=1)
|
56
|
+
|
57
|
+
def _ceil(self, value: datetime) -> datetime:
|
58
|
+
return value.replace(
|
59
|
+
year=value.year + 1 if value.month == 12 else value.year,
|
60
|
+
month=(value.month % 12) + 1,
|
61
|
+
day=1,
|
62
|
+
hour=0,
|
63
|
+
minute=0,
|
64
|
+
second=0,
|
65
|
+
microsecond=0,
|
66
|
+
)
|
67
|
+
|
68
|
+
|
69
|
+
class Weekday(Enum):
|
70
|
+
"""
|
71
|
+
These integer values map to the same ones used by the datetime.date.weekday() implementation
|
72
|
+
"""
|
73
|
+
|
74
|
+
MONDAY = 0
|
75
|
+
TUESDAY = 1
|
76
|
+
WEDNESDAY = 2
|
77
|
+
THURSDAY = 3
|
78
|
+
FRIDAY = 4
|
79
|
+
SATURDAY = 5
|
80
|
+
SUNDAY = 6
|
81
|
+
|
82
|
+
|
83
|
+
class WeekClampingStrategy(ClampingStrategy):
|
84
|
+
def __init__(self, day_of_week: Weekday, is_ceiling: bool = True) -> None:
|
85
|
+
self._day_of_week = day_of_week.value
|
86
|
+
self._is_ceiling = is_ceiling
|
87
|
+
|
88
|
+
def clamp(self, value: datetime) -> datetime: # type: ignore # datetime implements method from CursorValueType
|
89
|
+
days_diff_to_ceiling = (
|
90
|
+
7 - (value.weekday() - self._day_of_week)
|
91
|
+
if value.weekday() > self._day_of_week
|
92
|
+
else abs(value.weekday() - self._day_of_week)
|
93
|
+
)
|
94
|
+
delta = (
|
95
|
+
timedelta(days_diff_to_ceiling)
|
96
|
+
if self._is_ceiling
|
97
|
+
else timedelta(days_diff_to_ceiling - 7)
|
98
|
+
)
|
99
|
+
return value.replace(hour=0, minute=0, second=0, microsecond=0) + delta
|
@@ -13,7 +13,6 @@ from typing import (
|
|
13
13
|
Mapping,
|
14
14
|
MutableMapping,
|
15
15
|
Optional,
|
16
|
-
Protocol,
|
17
16
|
Tuple,
|
18
17
|
Union,
|
19
18
|
)
|
@@ -21,6 +20,8 @@ from typing import (
|
|
21
20
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
22
21
|
from airbyte_cdk.sources.message import MessageRepository
|
23
22
|
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
23
|
+
from airbyte_cdk.sources.streams.concurrent.clamping import ClampingStrategy, NoClamping
|
24
|
+
from airbyte_cdk.sources.streams.concurrent.cursor_types import CursorValueType, GapType
|
24
25
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
25
26
|
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
|
26
27
|
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
|
@@ -35,36 +36,6 @@ def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
|
|
35
36
|
return functools.reduce(lambda a, b: a[b], path, mapping)
|
36
37
|
|
37
38
|
|
38
|
-
class GapType(Protocol):
|
39
|
-
"""
|
40
|
-
This is the representation of gaps between two cursor values. Examples:
|
41
|
-
* if cursor values are datetimes, GapType is timedelta
|
42
|
-
* if cursor values are integer, GapType will also be integer
|
43
|
-
"""
|
44
|
-
|
45
|
-
pass
|
46
|
-
|
47
|
-
|
48
|
-
class CursorValueType(Protocol):
|
49
|
-
"""Protocol for annotating comparable types."""
|
50
|
-
|
51
|
-
@abstractmethod
|
52
|
-
def __lt__(self: "CursorValueType", other: "CursorValueType") -> bool:
|
53
|
-
pass
|
54
|
-
|
55
|
-
@abstractmethod
|
56
|
-
def __ge__(self: "CursorValueType", other: "CursorValueType") -> bool:
|
57
|
-
pass
|
58
|
-
|
59
|
-
@abstractmethod
|
60
|
-
def __add__(self: "CursorValueType", other: GapType) -> "CursorValueType":
|
61
|
-
pass
|
62
|
-
|
63
|
-
@abstractmethod
|
64
|
-
def __sub__(self: "CursorValueType", other: GapType) -> "CursorValueType":
|
65
|
-
pass
|
66
|
-
|
67
|
-
|
68
39
|
class CursorField:
|
69
40
|
def __init__(self, cursor_field_key: str) -> None:
|
70
41
|
self.cursor_field_key = cursor_field_key
|
@@ -172,6 +143,7 @@ class ConcurrentCursor(Cursor):
|
|
172
143
|
lookback_window: Optional[GapType] = None,
|
173
144
|
slice_range: Optional[GapType] = None,
|
174
145
|
cursor_granularity: Optional[GapType] = None,
|
146
|
+
clamping_strategy: ClampingStrategy = NoClamping(),
|
175
147
|
) -> None:
|
176
148
|
self._stream_name = stream_name
|
177
149
|
self._stream_namespace = stream_namespace
|
@@ -193,10 +165,13 @@ class ConcurrentCursor(Cursor):
|
|
193
165
|
self._cursor_granularity = cursor_granularity
|
194
166
|
# Flag to track if the logger has been triggered (per stream)
|
195
167
|
self._should_be_synced_logger_triggered = False
|
168
|
+
self._clamping_strategy = clamping_strategy
|
196
169
|
|
197
170
|
@property
|
198
171
|
def state(self) -> MutableMapping[str, Any]:
|
199
|
-
return self.
|
172
|
+
return self._connector_state_converter.convert_to_state_message(
|
173
|
+
self.cursor_field, self._concurrent_state
|
174
|
+
)
|
200
175
|
|
201
176
|
@property
|
202
177
|
def cursor_field(self) -> CursorField:
|
@@ -241,10 +216,10 @@ class ConcurrentCursor(Cursor):
|
|
241
216
|
return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
|
242
217
|
|
243
218
|
def close_partition(self, partition: Partition) -> None:
|
244
|
-
slice_count_before = len(self.
|
219
|
+
slice_count_before = len(self._concurrent_state.get("slices", []))
|
245
220
|
self._add_slice_to_state(partition)
|
246
221
|
if slice_count_before < len(
|
247
|
-
self.
|
222
|
+
self._concurrent_state["slices"]
|
248
223
|
): # only emit if at least one slice has been processed
|
249
224
|
self._merge_partitions()
|
250
225
|
self._emit_state_message()
|
@@ -256,11 +231,11 @@ class ConcurrentCursor(Cursor):
|
|
256
231
|
)
|
257
232
|
|
258
233
|
if self._slice_boundary_fields:
|
259
|
-
if "slices" not in self.
|
234
|
+
if "slices" not in self._concurrent_state:
|
260
235
|
raise RuntimeError(
|
261
236
|
f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
|
262
237
|
)
|
263
|
-
self.
|
238
|
+
self._concurrent_state["slices"].append(
|
264
239
|
{
|
265
240
|
self._connector_state_converter.START_KEY: self._extract_from_slice(
|
266
241
|
partition, self._slice_boundary_fields[self._START_BOUNDARY]
|
@@ -288,7 +263,7 @@ class ConcurrentCursor(Cursor):
|
|
288
263
|
"expected. Please contact the Airbyte team."
|
289
264
|
)
|
290
265
|
|
291
|
-
self.
|
266
|
+
self._concurrent_state["slices"].append(
|
292
267
|
{
|
293
268
|
self._connector_state_converter.START_KEY: self.start,
|
294
269
|
self._connector_state_converter.END_KEY: most_recent_cursor_value,
|
@@ -300,9 +275,7 @@ class ConcurrentCursor(Cursor):
|
|
300
275
|
self._connector_state_manager.update_state_for_stream(
|
301
276
|
self._stream_name,
|
302
277
|
self._stream_namespace,
|
303
|
-
self.
|
304
|
-
self._cursor_field, self.state
|
305
|
-
),
|
278
|
+
self.state,
|
306
279
|
)
|
307
280
|
state_message = self._connector_state_manager.create_state_message(
|
308
281
|
self._stream_name, self._stream_namespace
|
@@ -310,7 +283,9 @@ class ConcurrentCursor(Cursor):
|
|
310
283
|
self._message_repository.emit_message(state_message)
|
311
284
|
|
312
285
|
def _merge_partitions(self) -> None:
|
313
|
-
self.
|
286
|
+
self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
|
287
|
+
self._concurrent_state["slices"]
|
288
|
+
)
|
314
289
|
|
315
290
|
def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
|
316
291
|
try:
|
@@ -347,36 +322,42 @@ class ConcurrentCursor(Cursor):
|
|
347
322
|
if self._start is not None and self._is_start_before_first_slice():
|
348
323
|
yield from self._split_per_slice_range(
|
349
324
|
self._start,
|
350
|
-
self.
|
325
|
+
self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
|
351
326
|
False,
|
352
327
|
)
|
353
328
|
|
354
|
-
if len(self.
|
329
|
+
if len(self._concurrent_state["slices"]) == 1:
|
355
330
|
yield from self._split_per_slice_range(
|
356
331
|
self._calculate_lower_boundary_of_last_slice(
|
357
|
-
self.
|
332
|
+
self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
|
358
333
|
),
|
359
334
|
self._end_provider(),
|
360
335
|
True,
|
361
336
|
)
|
362
|
-
elif len(self.
|
363
|
-
for i in range(len(self.
|
337
|
+
elif len(self._concurrent_state["slices"]) > 1:
|
338
|
+
for i in range(len(self._concurrent_state["slices"]) - 1):
|
364
339
|
if self._cursor_granularity:
|
365
340
|
yield from self._split_per_slice_range(
|
366
|
-
self.
|
341
|
+
self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
|
367
342
|
+ self._cursor_granularity,
|
368
|
-
self.
|
343
|
+
self._concurrent_state["slices"][i + 1][
|
344
|
+
self._connector_state_converter.START_KEY
|
345
|
+
],
|
369
346
|
False,
|
370
347
|
)
|
371
348
|
else:
|
372
349
|
yield from self._split_per_slice_range(
|
373
|
-
self.
|
374
|
-
|
350
|
+
self._concurrent_state["slices"][i][
|
351
|
+
self._connector_state_converter.END_KEY
|
352
|
+
],
|
353
|
+
self._concurrent_state["slices"][i + 1][
|
354
|
+
self._connector_state_converter.START_KEY
|
355
|
+
],
|
375
356
|
False,
|
376
357
|
)
|
377
358
|
yield from self._split_per_slice_range(
|
378
359
|
self._calculate_lower_boundary_of_last_slice(
|
379
|
-
self.
|
360
|
+
self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
|
380
361
|
),
|
381
362
|
self._end_provider(),
|
382
363
|
True,
|
@@ -387,7 +368,8 @@ class ConcurrentCursor(Cursor):
|
|
387
368
|
def _is_start_before_first_slice(self) -> bool:
|
388
369
|
return (
|
389
370
|
self._start is not None
|
390
|
-
and self._start
|
371
|
+
and self._start
|
372
|
+
< self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
|
391
373
|
)
|
392
374
|
|
393
375
|
def _calculate_lower_boundary_of_last_slice(
|
@@ -408,10 +390,12 @@ class ConcurrentCursor(Cursor):
|
|
408
390
|
|
409
391
|
lower = max(lower, self._start) if self._start else lower
|
410
392
|
if not self._slice_range or self._evaluate_upper_safely(lower, self._slice_range) >= upper:
|
393
|
+
clamped_lower = self._clamping_strategy.clamp(lower)
|
394
|
+
clamped_upper = self._clamping_strategy.clamp(upper)
|
411
395
|
start_value, end_value = (
|
412
|
-
(
|
396
|
+
(clamped_lower, clamped_upper - self._cursor_granularity)
|
413
397
|
if self._cursor_granularity and not upper_is_end
|
414
|
-
else (
|
398
|
+
else (clamped_lower, clamped_upper)
|
415
399
|
)
|
416
400
|
yield StreamSlice(
|
417
401
|
partition={},
|
@@ -433,11 +417,21 @@ class ConcurrentCursor(Cursor):
|
|
433
417
|
)
|
434
418
|
has_reached_upper_boundary = current_upper_boundary >= upper
|
435
419
|
|
420
|
+
clamped_upper = (
|
421
|
+
self._clamping_strategy.clamp(current_upper_boundary)
|
422
|
+
if current_upper_boundary != upper
|
423
|
+
else current_upper_boundary
|
424
|
+
)
|
425
|
+
clamped_lower = self._clamping_strategy.clamp(current_lower_boundary)
|
426
|
+
if clamped_lower >= clamped_upper:
|
427
|
+
# clamping collapsed both values which means that it is time to stop processing
|
428
|
+
# FIXME should this be replaced by a proper end_provider?
|
429
|
+
break
|
436
430
|
start_value, end_value = (
|
437
|
-
(
|
431
|
+
(clamped_lower, clamped_upper - self._cursor_granularity)
|
438
432
|
if self._cursor_granularity
|
439
433
|
and (not upper_is_end or not has_reached_upper_boundary)
|
440
|
-
else (
|
434
|
+
else (clamped_lower, clamped_upper)
|
441
435
|
)
|
442
436
|
yield StreamSlice(
|
443
437
|
partition={},
|
@@ -450,7 +444,7 @@ class ConcurrentCursor(Cursor):
|
|
450
444
|
]: self._connector_state_converter.output_format(end_value),
|
451
445
|
},
|
452
446
|
)
|
453
|
-
current_lower_boundary =
|
447
|
+
current_lower_boundary = clamped_upper
|
454
448
|
if current_upper_boundary >= upper:
|
455
449
|
stop_processing = True
|
456
450
|
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from abc import abstractmethod
|
2
|
+
from typing import Protocol
|
3
|
+
|
4
|
+
|
5
|
+
class GapType(Protocol):
|
6
|
+
"""
|
7
|
+
This is the representation of gaps between two cursor values. Examples:
|
8
|
+
* if cursor values are datetimes, GapType is timedelta
|
9
|
+
* if cursor values are integer, GapType will also be integer
|
10
|
+
"""
|
11
|
+
|
12
|
+
pass
|
13
|
+
|
14
|
+
|
15
|
+
class CursorValueType(Protocol):
|
16
|
+
"""Protocol for annotating comparable types."""
|
17
|
+
|
18
|
+
@abstractmethod
|
19
|
+
def __lt__(self: "CursorValueType", other: "CursorValueType") -> bool:
|
20
|
+
pass
|
21
|
+
|
22
|
+
@abstractmethod
|
23
|
+
def __ge__(self: "CursorValueType", other: "CursorValueType") -> bool:
|
24
|
+
pass
|
25
|
+
|
26
|
+
@abstractmethod
|
27
|
+
def __add__(self: "CursorValueType", other: GapType) -> "CursorValueType":
|
28
|
+
pass
|
29
|
+
|
30
|
+
@abstractmethod
|
31
|
+
def __sub__(self: "CursorValueType", other: GapType) -> "CursorValueType":
|
32
|
+
pass
|