airbyte-cdk 6.23.0__py3-none-any.whl → 6.23.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,9 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
+from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
+    PerPartitionWithGlobalCursor,
+)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -32,7 +35,7 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
     ModelToComponentFactory,
 )
 from airbyte_cdk.sources.declarative.requesters import HttpRequester
-from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever
+from airbyte_cdk.sources.declarative.retrievers import Retriever, SimpleRetriever
 from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
     DeclarativePartitionFactory,
     StreamSlicerPartitionGenerator,
@@ -231,21 +234,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         stream_state=stream_state,
                     )
 
-                    retriever = declarative_stream.retriever
-
-                    # This is an optimization so that we don't invoke any cursor or state management flows within the
-                    # low-code framework because state management is handled through the ConcurrentCursor.
-                    if declarative_stream and isinstance(retriever, SimpleRetriever):
-                        # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                        # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                        # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                        # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                        # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                        if retriever.cursor:
-                            retriever.cursor.set_initial_state(stream_state=stream_state)
-                        # We zero it out here, but since this is a cursor reference, the state is still properly
-                        # instantiated for the other components that reference it
-                        retriever.cursor = None
+                    retriever = self._get_retriever(declarative_stream, stream_state)
 
                     partition_generator = StreamSlicerPartitionGenerator(
                         DeclarativePartitionFactory(
@@ -305,6 +294,60 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                             cursor=final_state_cursor,
                         )
                     )
+                elif (
+                    incremental_sync_component_definition
+                    and incremental_sync_component_definition.get("type", "")
+                    == DatetimeBasedCursorModel.__name__
+                    and self._stream_supports_concurrent_partition_processing(
+                        declarative_stream=declarative_stream
+                    )
+                    and hasattr(declarative_stream.retriever, "stream_slicer")
+                    and isinstance(
+                        declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
+                    )
+                ):
+                    stream_state = state_manager.get_stream_state(
+                        stream_name=declarative_stream.name, namespace=declarative_stream.namespace
+                    )
+                    partition_router = declarative_stream.retriever.stream_slicer._partition_router
+
+                    perpartition_cursor = (
+                        self._constructor.create_concurrent_cursor_from_perpartition_cursor(
+                            state_manager=state_manager,
+                            model_type=DatetimeBasedCursorModel,
+                            component_definition=incremental_sync_component_definition,
+                            stream_name=declarative_stream.name,
+                            stream_namespace=declarative_stream.namespace,
+                            config=config or {},
+                            stream_state=stream_state,
+                            partition_router=partition_router,
+                        )
+                    )
+
+                    retriever = self._get_retriever(declarative_stream, stream_state)
+
+                    partition_generator = StreamSlicerPartitionGenerator(
+                        DeclarativePartitionFactory(
+                            declarative_stream.name,
+                            declarative_stream.get_json_schema(),
+                            retriever,
+                            self.message_repository,
+                        ),
+                        perpartition_cursor,
+                    )
+
+                    concurrent_streams.append(
+                        DefaultStream(
+                            partition_generator=partition_generator,
+                            name=declarative_stream.name,
+                            json_schema=declarative_stream.get_json_schema(),
+                            availability_strategy=AlwaysAvailableAvailabilityStrategy(),
+                            primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
+                            cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
+                            logger=self.logger,
+                            cursor=perpartition_cursor,
+                        )
+                    )
                 else:
                     synchronous_streams.append(declarative_stream)
             else:
@@ -395,6 +438,27 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                 return False
         return True
 
+    def _get_retriever(
+        self, declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any]
+    ) -> Retriever:
+        retriever = declarative_stream.retriever
+
+        # This is an optimization so that we don't invoke any cursor or state management flows within the
+        # low-code framework because state management is handled through the ConcurrentCursor.
+        if declarative_stream and isinstance(retriever, SimpleRetriever):
+            # Also a temporary hack. In the legacy Stream implementation, as part of the read,
+            # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
+            # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
+            # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
+            # still rely on a DatetimeBasedCursor that is properly initialized with state.
+            if retriever.cursor:
+                retriever.cursor.set_initial_state(stream_state=stream_state)
+            # We zero it out here, but since this is a cursor reference, the state is still properly
+            # instantiated for the other components that reference it
+            retriever.cursor = None
+
+        return retriever
+
     @staticmethod
     def _select_streams(
         streams: List[AbstractStream], configured_catalog: ConfiguredAirbyteCatalog
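
The trailing comment in _get_retriever leans on Python reference semantics: rebinding retriever.cursor to None drops only the retriever's reference, while components that captured the same cursor object still see the state set by set_initial_state(). A minimal sketch of that aliasing behavior (toy class, not a CDK type):

class TinyCursor:
    def __init__(self) -> None:
        self.state: dict = {}

    def set_initial_state(self, stream_state: dict) -> None:
        self.state = dict(stream_state)

cursor = TinyCursor()
paginator_ref = cursor  # e.g. a pagination strategy holding the same cursor object
cursor.set_initial_state({"updated_at": "2024-01-01"})

retriever_cursor = cursor
retriever_cursor = None  # rebinding one reference ...
assert paginator_ref.state == {"updated_at": "2024-01-01"}  # ... leaves the shared object intact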
@@ -784,6 +784,29 @@ definitions:
       type:
        type: string
        enum: [DatetimeBasedCursor]
+      clamping:
+        title: Date Range Clamping
+        description: This option is used to adjust the upper and lower boundaries of each datetime window to the beginning and end of the provided target period (day, week, month)
+        type: object
+        required:
+          - target
+        properties:
+          target:
+            title: Target
+            description: The period of time that datetime windows will be clamped by
+            # This should ideally be an enum. However, we don't use an enum because we want to allow for connectors
+            # to support interpolation on the connector config to get the target, which is an arbitrary string
+            type: string
+            interpolation_context:
+              - config
+            examples:
+              - "DAY"
+              - "WEEK"
+              - "MONTH"
+              - "{{ config['target'] }}"
+          target_details:
+            type: object
+            additionalProperties: true
      cursor_field:
        title: Cursor Field
        description: The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.
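
As a hedged illustration of the schema above, here is what an incremental_sync component definition could look like once clamping is configured, written as the Python mapping the component factory receives (the cursor_field, datetime_format, and start_datetime values are made up for the example):

incremental_sync_component_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
    "start_datetime": "{{ config['start_date'] }}",
    "clamping": {
        # target accepts "DAY", "WEEK", "MONTH", or an interpolated string such as "{{ config['target'] }}"
        "target": "WEEK",
        # target_details is a free-form object per the schema (additionalProperties: true)
        "target_details": {},
    },
}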
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        date_time_based_cursor: DatetimeBasedCursor,
-        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
+        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._date_time_based_cursor = date_time_based_cursor
-        self._substream_cursor = substream_cursor
+        self._cursor = cursor
 
     def filter_records(
         self,
@@ -77,7 +75,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
+            if self._cursor.should_be_synced(
                 # Record is created on the fly to align with the cursors' interface; the stream name is ignored
                 # because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
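
Collapsing the two constructor arguments into a single cursor means the filter now only needs an object exposing should_be_synced. A duck-typed sketch of the new contract (the stub cursor and sample records are illustrative, not CDK code):

from airbyte_cdk.sources.types import Record

class StubCursor:
    """Stands in for DatetimeBasedCursor, PerPartitionWithGlobalCursor, or GlobalSubstreamCursor."""

    def should_be_synced(self, record: Record) -> bool:
        # Keep only records at or after the bookmark.
        return record.data.get("updated_at", "") >= "2024-01-01"

records = [{"id": 1, "updated_at": "2023-12-31"}, {"id": 2, "updated_at": "2024-02-01"}]
cursor = StubCursor()
kept = [
    r
    for r in records
    if cursor.should_be_synced(Record(data=r, associated_slice=None, stream_name=""))
]
assert kept == [{"id": 2, "updated_at": "2024-02-01"}]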
@@ -2,6 +2,10 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
+    ConcurrentCursorFactory,
+    ConcurrentPerPartitionCursor,
+)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -21,6 +25,8 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
+    "ConcurrentCursorFactory",
+    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
@@ -0,0 +1,334 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import copy
+import logging
+import threading
+from collections import OrderedDict
+from copy import deepcopy
+from datetime import timedelta
+from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
+
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
+    Timer,
+    iterate_with_last_flag_and_state,
+)
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
+from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentCursorFactory:
+    def __init__(self, create_function: Callable[..., ConcurrentCursor]):
+        self._create_function = create_function
+
+    def create(
+        self, stream_state: Mapping[str, Any], runtime_lookback_window: Optional[timedelta]
+    ) -> ConcurrentCursor:
+        return self._create_function(
+            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
+        )
+
+
+class ConcurrentPerPartitionCursor(Cursor):
+    """
+    Manages state per partition when a stream has many partitions, preventing data loss or duplication.
+
+    Attributes:
+        DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
+
+    - **Partition Limitation Logic**
+      Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. The oldest partitions are removed when the limit is reached.
+
+    - **Global Cursor Fallback**
+      New partitions use the global state as their initial state, which allows state to progress even for deleted or newly appearing partitions. Historical data added after the initial sync will be missing.
+
+    ConcurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
+    """
+
+    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    _NO_STATE: Mapping[str, Any] = {}
+    _NO_CURSOR_STATE: Mapping[str, Any] = {}
+    _GLOBAL_STATE_KEY = "state"
+    _PERPARTITION_STATE_KEY = "states"
+    _KEY = 0
+    _VALUE = 1
+
+    def __init__(
+        self,
+        cursor_factory: ConcurrentCursorFactory,
+        partition_router: PartitionRouter,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        stream_state: Any,
+        message_repository: MessageRepository,
+        connector_state_manager: ConnectorStateManager,
+        cursor_field: CursorField,
+    ) -> None:
+        self._global_cursor: Optional[StreamState] = {}
+        self._stream_name = stream_name
+        self._stream_namespace = stream_namespace
+        self._message_repository = message_repository
+        self._connector_state_manager = connector_state_manager
+        self._cursor_field = cursor_field
+
+        self._cursor_factory = cursor_factory
+        self._partition_router = partition_router
+
+        # The dict is ordered to ensure that once the maximum number of partitions is reached,
+        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
+        self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
+        self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+        self._finished_partitions: set[str] = set()
+        self._lock = threading.Lock()
+        self._timer = Timer()
+        self._new_global_cursor: Optional[StreamState] = None
+        self._lookback_window: int = 0
+        self._parent_state: Optional[StreamState] = None
+        self._over_limit: int = 0
+        self._partition_serializer = PerPartitionKeySerializer()
+
+        self._set_initial_state(stream_state)
+
+    @property
+    def cursor_field(self) -> CursorField:
+        return self._cursor_field
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        states = []
+        for partition_tuple, cursor in self._cursor_per_partition.items():
+            if cursor.state:
+                states.append(
+                    {
+                        "partition": self._to_dict(partition_tuple),
+                        "cursor": copy.deepcopy(cursor.state),
+                    }
+                )
+        state: dict[str, Any] = {self._PERPARTITION_STATE_KEY: states}
+
+        if self._global_cursor:
+            state[self._GLOBAL_STATE_KEY] = self._global_cursor
+        if self._lookback_window is not None:
+            state["lookback_window"] = self._lookback_window
+        if self._parent_state is not None:
+            state["parent_state"] = self._parent_state
+        return state
+
+    def close_partition(self, partition: Partition) -> None:
+        # Attempt to retrieve the stream slice
+        stream_slice: Optional[StreamSlice] = partition.to_slice()  # type: ignore[assignment]
+
+        # Ensure stream_slice is not None
+        if stream_slice is None:
+            raise ValueError("stream_slice cannot be None")
+
+        partition_key = self._to_partition_key(stream_slice.partition)
+        self._cursor_per_partition[partition_key].close_partition(partition=partition)
+        with self._lock:
+            self._semaphore_per_partition[partition_key].acquire()
+            cursor = self._cursor_per_partition[partition_key]
+            if (
+                partition_key in self._finished_partitions
+                and self._semaphore_per_partition[partition_key]._value == 0
+            ):
+                if (
+                    self._new_global_cursor is None
+                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
+                    < cursor.state[self.cursor_field.cursor_field_key]
+                ):
+                    self._new_global_cursor = copy.deepcopy(cursor.state)
+        self._emit_state_message()
+
+    def ensure_at_least_one_state_emitted(self) -> None:
+        """
+        The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        called.
+        """
+        if not any(
+            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
+        ):
+            self._global_cursor = self._new_global_cursor
+            self._lookback_window = self._timer.finish()
+            self._parent_state = self._partition_router.get_stream_state()
+        self._emit_state_message()
+
+    def _emit_state_message(self) -> None:
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name,
+            self._stream_namespace,
+            self.state,
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
+        self._message_repository.emit_message(state_message)
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        if self._timer.is_running():
+            raise RuntimeError("stream_slices has been executed more than once.")
+
+        slices = self._partition_router.stream_slices()
+        self._timer.start()
+        for partition in slices:
+            yield from self._generate_slices_from_partition(partition)
+
+    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+        # Ensure the maximum number of partitions is not exceeded
+        self._ensure_partition_limit()
+
+        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
+        if not cursor:
+            cursor = self._create_cursor(
+                self._global_cursor,
+                self._lookback_window if self._global_cursor else 0,
+            )
+            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
+                threading.Semaphore(0)
+            )
+
+        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
+            cursor.stream_slices(),
+            lambda: None,
+        ):
+            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            if is_last_slice:
+                self._finished_partitions.add(self._to_partition_key(partition.partition))
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
+
+    def _ensure_partition_limit(self) -> None:
+        """
+        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        """
+        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+            self._over_limit += 1
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
+            logger.warning(
+                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+            )
+
+    def _set_initial_state(self, stream_state: StreamState) -> None:
+        """
+        Initialize the cursor's state using the provided `stream_state`.
+
+        This method supports global and per-partition state initialization.
+
+        - **Global State**: If `states` is missing, the `state` is treated as global and applied to all partitions.
+          The global state holds a single cursor position representing the latest processed record across all partitions.
+
+        - **Lookback Window**: Configured via `lookback_window`, it defines the period (in seconds) for reprocessing records.
+          This ensures robustness in case of upstream data delays or reordering. If not specified, it defaults to 0.
+
+        - **Per-Partition State**: If `states` is present, each partition's cursor state is initialized separately.
+
+        - **Parent State**: (if available) Used to initialize partition routers based on parent streams.
+
+        Args:
+            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
+                {
+                    "states": [
+                        {
+                            "partition": {
+                                "partition_key": "value"
+                            },
+                            "cursor": {
+                                "last_updated": "2023-05-27T00:00:00Z"
+                            }
+                        }
+                    ],
+                    "state": {
+                        "last_updated": "2023-05-27T00:00:00Z"
+                    },
+                    "lookback_window": 10,
+                    "parent_state": {
+                        "parent_stream_name": {
+                            "last_updated": "2023-05-27T00:00:00Z"
+                        }
+                    }
+                }
+        """
+        if not stream_state:
+            return
+
+        if self._PERPARTITION_STATE_KEY not in stream_state:
+            # We assume that `stream_state` is in a global format that can be applied to all partitions.
+            # Example: {"global_state_format_key": "global_state_format_value"}
+            self._global_cursor = deepcopy(stream_state)
+            self._new_global_cursor = deepcopy(stream_state)
+
+        else:
+            self._lookback_window = int(stream_state.get("lookback_window", 0))
+
+            for state in stream_state[self._PERPARTITION_STATE_KEY]:
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
+                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
+                    threading.Semaphore(0)
+                )
+
+            # Set the default state for missing partitions when the format is per-partition with a global fallback
+            if self._GLOBAL_STATE_KEY in stream_state:
+                self._global_cursor = deepcopy(stream_state[self._GLOBAL_STATE_KEY])
+                self._new_global_cursor = deepcopy(stream_state[self._GLOBAL_STATE_KEY])
+
+        # Set initial parent state
+        if stream_state.get("parent_state"):
+            self._parent_state = stream_state["parent_state"]
+
+        # Set parent state for partition routers based on parent streams
+        self._partition_router.set_initial_state(stream_state)
+
+    def observe(self, record: Record) -> None:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        self._cursor_per_partition[
+            self._to_partition_key(record.associated_slice.partition)
+        ].observe(record)
+
+    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
+        return self._partition_serializer.to_partition_key(partition)
+
+    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
+        return self._partition_serializer.to_partition(partition_key)
+
+    def _create_cursor(
+        self, cursor_state: Any, runtime_lookback_window: int = 0
+    ) -> ConcurrentCursor:
+        cursor = self._cursor_factory.create(
+            stream_state=deepcopy(cursor_state),
+            runtime_lookback_window=timedelta(seconds=runtime_lookback_window),
+        )
+        return cursor
+
+    def should_be_synced(self, record: Record) -> bool:
+        return self._get_cursor(record).should_be_synced(record)
+
+    def _get_cursor(self, record: Record) -> ConcurrentCursor:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        partition_key = self._to_partition_key(record.associated_slice.partition)
+        if partition_key not in self._cursor_per_partition:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        cursor = self._cursor_per_partition[partition_key]
+        return cursor
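
For orientation, the composite state that the state property above assembles (and that _set_initial_state consumes) looks like the following; the keys come from the docstring above, while the partition and cursor values are illustrative:

stream_state = {
    "states": [
        {
            "partition": {"partition_key": "value"},
            "cursor": {"last_updated": "2023-05-27T00:00:00Z"},
        },
    ],
    "state": {"last_updated": "2023-05-27T00:00:00Z"},  # global fallback for unseen partitions
    "lookback_window": 10,                              # seconds re-synced on the next run
    "parent_state": {"parent_stream_name": {"last_updated": "2023-05-27T00:00:00Z"}},
}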
@@ -64,6 +64,9 @@ class Timer:
         else:
             raise RuntimeError("Global substream cursor timer not started")
 
+    def is_running(self) -> bool:
+        return self._start is not None
+
 
 class GlobalSubstreamCursor(DeclarativeCursor):
     """
@@ -303,6 +303,21 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
+        if (
+            record.associated_slice
+            and self._to_partition_key(record.associated_slice.partition)
+            not in self._cursor_per_partition
+        ):
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+
+            self._cursor_per_partition[
+                self._to_partition_key(record.associated_slice.partition)
+            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
@@ -328,6 +328,16 @@ class LegacyToPerPartitionStateMigration(BaseModel):
     type: Optional[Literal["LegacyToPerPartitionStateMigration"]] = None
 
 
+class Clamping(BaseModel):
+    target: str = Field(
+        ...,
+        description="The period of time that datetime windows will be clamped by",
+        examples=["DAY", "WEEK", "MONTH", "{{ config['target'] }}"],
+        title="Target",
+    )
+    target_details: Optional[Dict[str, Any]] = None
+
+
 class Algorithm(Enum):
     HS256 = "HS256"
     HS384 = "HS384"
@@ -719,7 +729,7 @@ class HttpResponseFilter(BaseModel):
 class TypesMap(BaseModel):
     target_type: Union[str, List[str]]
     current_type: Union[str, List[str]]
-    condition: Optional[str]
+    condition: Optional[str] = None
 
 
 class SchemaTypeIdentifier(BaseModel):
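
Giving condition an explicit = None default matters under pydantic v2, where an Optional annotation alone no longer implies a default and the field would otherwise be required; the explicit defaults being added throughout these generated models suggest that is the behavior being targeted. A minimal sketch of the difference (standalone models, not the CDK's):

from typing import Optional
from pydantic import BaseModel, ValidationError

class Strict(BaseModel):
    condition: Optional[str]  # required: may be None, but must be passed explicitly

class Relaxed(BaseModel):
    condition: Optional[str] = None  # truly optional

Relaxed()  # fine
try:
    Strict()  # raises: field required
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # "missing"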
@@ -797,14 +807,11 @@ class DpathFlattenFields(BaseModel):
     field_path: List[str] = Field(
         ...,
         description="A path to field that needs to be flattened.",
-        examples=[
-            ["data"],
-            ["data", "*", "field"],
-        ],
+        examples=[["data"], ["data", "*", "field"]],
         title="Field Path",
     )
     delete_origin_value: Optional[bool] = Field(
-        False,
+        None,
         description="Whether to delete the origin value or keep it. Default is False.",
         title="Delete Origin Value",
     )
@@ -1454,6 +1461,11 @@ class AuthFlow(BaseModel):
 
 class DatetimeBasedCursor(BaseModel):
     type: Literal["DatetimeBasedCursor"]
+    clamping: Optional[Clamping] = Field(
+        None,
+        description="This option is used to adjust the upper and lower boundaries of each datetime window to the beginning and end of the provided target period (day, week, month)",
+        title="Date Range Clamping",
+    )
     cursor_field: str = Field(
         ...,
         description="The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.",