airbyte-cdk 6.37.0.dev1__py3-none-any.whl → 6.37.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/models.py +16 -14
- airbyte_cdk/connector_builder/test_reader/helpers.py +120 -22
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +16 -3
- airbyte_cdk/connector_builder/test_reader/types.py +9 -1
- airbyte_cdk/sources/declarative/auth/token_provider.py +1 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +43 -7
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +7 -1
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +67 -46
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +13 -2
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +1 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +83 -17
- airbyte_cdk/sources/declarative/interpolation/macros.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +30 -45
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +18 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +171 -70
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/requesters/README.md +5 -5
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +60 -17
- airbyte_cdk/sources/declarative/requesters/http_requester.py +7 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +10 -3
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +2 -2
- airbyte_cdk/sources/http_logger.py +3 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +2 -1
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +1 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/RECORD +31 -31
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -136
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.1.dist-info}/entry_points.txt +0 -0
@@ -777,6 +777,44 @@ definitions:
|
|
777
777
|
type:
|
778
778
|
type: string
|
779
779
|
enum: [LegacyToPerPartitionStateMigration]
|
780
|
+
IncrementingCountCursor:
|
781
|
+
title: Incrementing Count Cursor
|
782
|
+
description: Cursor that allows for incremental sync according to a continuously increasing integer.
|
783
|
+
type: object
|
784
|
+
required:
|
785
|
+
- type
|
786
|
+
- cursor_field
|
787
|
+
properties:
|
788
|
+
type:
|
789
|
+
type: string
|
790
|
+
enum: [IncrementingCountCursor]
|
791
|
+
cursor_field:
|
792
|
+
title: Cursor Field
|
793
|
+
description: The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.
|
794
|
+
type: string
|
795
|
+
interpolation_context:
|
796
|
+
- config
|
797
|
+
examples:
|
798
|
+
- "created_at"
|
799
|
+
- "{{ config['record_cursor'] }}"
|
800
|
+
start_value:
|
801
|
+
title: Start Value
|
802
|
+
description: The value that determines the earliest record that should be synced.
|
803
|
+
anyOf:
|
804
|
+
- type: string
|
805
|
+
- type: integer
|
806
|
+
interpolation_context:
|
807
|
+
- config
|
808
|
+
examples:
|
809
|
+
- 0
|
810
|
+
- "{{ config['start_value'] }}"
|
811
|
+
start_value_option:
|
812
|
+
title: Inject Start Value Into Outgoing HTTP Request
|
813
|
+
description: Optionally configures how the start value will be sent in requests to the source API.
|
814
|
+
"$ref": "#/definitions/RequestOption"
|
815
|
+
$parameters:
|
816
|
+
type: object
|
817
|
+
additionalProperties: true
|
780
818
|
DatetimeBasedCursor:
|
781
819
|
title: Datetime Based Cursor
|
782
820
|
description: Cursor to provide incremental capabilities over datetime.
|
@@ -844,6 +882,7 @@ definitions:
|
|
844
882
|
* **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`
|
845
883
|
* **%S**: Second (zero-padded) - `00`, `01`, ..., `59`
|
846
884
|
* **%f**: Microsecond (zero-padded to 6 digits) - `000000`
|
885
|
+
* **%_ms**: Millisecond (zero-padded to 3 digits) - `000`
|
847
886
|
* **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`
|
848
887
|
* **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`
|
849
888
|
* **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`
|
@@ -1318,6 +1357,7 @@ definitions:
|
|
1318
1357
|
anyOf:
|
1319
1358
|
- "$ref": "#/definitions/CustomIncrementalSync"
|
1320
1359
|
- "$ref": "#/definitions/DatetimeBasedCursor"
|
1360
|
+
- "$ref": "#/definitions/IncrementingCountCursor"
|
1321
1361
|
name:
|
1322
1362
|
title: Name
|
1323
1363
|
description: The stream name.
|
@@ -1490,7 +1530,11 @@ definitions:
|
|
1490
1530
|
limit:
|
1491
1531
|
title: Limit
|
1492
1532
|
description: The maximum number of calls allowed within the interval.
|
1493
|
-
|
1533
|
+
anyOf:
|
1534
|
+
- type: integer
|
1535
|
+
- type: string
|
1536
|
+
interpolation_context:
|
1537
|
+
- config
|
1494
1538
|
interval:
|
1495
1539
|
title: Interval
|
1496
1540
|
description: The time interval for the rate limit.
|
@@ -1775,6 +1819,9 @@ definitions:
|
|
1775
1819
|
- stream_interval
|
1776
1820
|
- stream_partition
|
1777
1821
|
- stream_slice
|
1822
|
+
- creation_response
|
1823
|
+
- polling_response
|
1824
|
+
- download_target
|
1778
1825
|
examples:
|
1779
1826
|
- "/products"
|
1780
1827
|
- "/quotes/{{ stream_partition['id'] }}/quote_line_groups"
|
@@ -2394,6 +2441,7 @@ definitions:
|
|
2394
2441
|
* **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`
|
2395
2442
|
* **%S**: Second (zero-padded) - `00`, `01`, ..., `59`
|
2396
2443
|
* **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999`
|
2444
|
+
* **%_ms**: Millisecond (zero-padded to 3 digits) - `000`, `001`, ..., `999`
|
2397
2445
|
* **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`
|
2398
2446
|
* **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`
|
2399
2447
|
* **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`
|
@@ -3130,14 +3178,12 @@ definitions:
|
|
3130
3178
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3131
3179
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3132
3180
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3133
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3134
3181
|
- type: array
|
3135
3182
|
items:
|
3136
3183
|
anyOf:
|
3137
3184
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3138
3185
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3139
3186
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3140
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3141
3187
|
decoder:
|
3142
3188
|
title: Decoder
|
3143
3189
|
description: Component decoding the response so records can be extracted.
|
@@ -3221,7 +3267,7 @@ definitions:
|
|
3221
3267
|
- polling_requester
|
3222
3268
|
- download_requester
|
3223
3269
|
- status_extractor
|
3224
|
-
-
|
3270
|
+
- download_target_extractor
|
3225
3271
|
properties:
|
3226
3272
|
type:
|
3227
3273
|
type: string
|
@@ -3238,7 +3284,7 @@ definitions:
|
|
3238
3284
|
anyOf:
|
3239
3285
|
- "$ref": "#/definitions/CustomRecordExtractor"
|
3240
3286
|
- "$ref": "#/definitions/DpathExtractor"
|
3241
|
-
|
3287
|
+
download_target_extractor:
|
3242
3288
|
description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.
|
3243
3289
|
anyOf:
|
3244
3290
|
- "$ref": "#/definitions/CustomRecordExtractor"
|
@@ -3259,7 +3305,7 @@ definitions:
|
|
3259
3305
|
anyOf:
|
3260
3306
|
- "$ref": "#/definitions/CustomRequester"
|
3261
3307
|
- "$ref": "#/definitions/HttpRequester"
|
3262
|
-
|
3308
|
+
download_target_requester:
|
3263
3309
|
description: Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.
|
3264
3310
|
anyOf:
|
3265
3311
|
- "$ref": "#/definitions/CustomRequester"
|
@@ -3292,14 +3338,12 @@ definitions:
|
|
3292
3338
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3293
3339
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3294
3340
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3295
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3296
3341
|
- type: array
|
3297
3342
|
items:
|
3298
3343
|
anyOf:
|
3299
3344
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3300
3345
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3301
3346
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3302
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3303
3347
|
decoder:
|
3304
3348
|
title: Decoder
|
3305
3349
|
description: Component decoding the response so records can be extracted.
|
@@ -3416,44 +3460,6 @@ definitions:
|
|
3416
3460
|
$parameters:
|
3417
3461
|
type: object
|
3418
3462
|
additionalProperties: true
|
3419
|
-
GroupingPartitionRouter:
|
3420
|
-
title: Grouping Partition Router
|
3421
|
-
description: >
|
3422
|
-
A decorator on top of a partition router that groups partitions into batches of a specified size.
|
3423
|
-
This is useful for APIs that support filtering by multiple partition keys in a single request.
|
3424
|
-
Note that per-partition incremental syncs may not work as expected because the grouping
|
3425
|
-
of partitions might change between syncs, potentially leading to inconsistent state tracking.
|
3426
|
-
type: object
|
3427
|
-
required:
|
3428
|
-
- type
|
3429
|
-
- group_size
|
3430
|
-
- underlying_partition_router
|
3431
|
-
properties:
|
3432
|
-
type:
|
3433
|
-
type: string
|
3434
|
-
enum: [GroupingPartitionRouter]
|
3435
|
-
group_size:
|
3436
|
-
title: Group Size
|
3437
|
-
description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
|
3438
|
-
type: integer
|
3439
|
-
examples:
|
3440
|
-
- 10
|
3441
|
-
- 50
|
3442
|
-
underlying_partition_router:
|
3443
|
-
title: Underlying Partition Router
|
3444
|
-
description: The partition router whose output will be grouped. This can be any valid partition router component.
|
3445
|
-
anyOf:
|
3446
|
-
- "$ref": "#/definitions/CustomPartitionRouter"
|
3447
|
-
- "$ref": "#/definitions/ListPartitionRouter"
|
3448
|
-
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3449
|
-
deduplicate:
|
3450
|
-
title: Deduplicate Partitions
|
3451
|
-
description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
|
3452
|
-
type: boolean
|
3453
|
-
default: true
|
3454
|
-
$parameters:
|
3455
|
-
type: object
|
3456
|
-
additionalProperties: true
|
3457
3463
|
WaitUntilTimeFromHeader:
|
3458
3464
|
title: Wait Until Time Defined In Response Header
|
3459
3465
|
description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
|
@@ -3705,6 +3711,21 @@ interpolation:
|
|
3705
3711
|
self: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=
|
3706
3712
|
next: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=0236d6d2
|
3707
3713
|
count: 82
|
3714
|
+
- title: creation_response
|
3715
|
+
description: The response received from the creation_requester in the AsyncRetriever component.
|
3716
|
+
type: object
|
3717
|
+
examples:
|
3718
|
+
- id: "1234"
|
3719
|
+
- title: polling_response
|
3720
|
+
description: The response received from the polling_requester in the AsyncRetriever component.
|
3721
|
+
type: object
|
3722
|
+
examples:
|
3723
|
+
- id: "1234"
|
3724
|
+
- title: download_target
|
3725
|
+
description: The `URL` received from the polling_requester in the AsyncRetriever with jobStatus as `COMPLETED`.
|
3726
|
+
type: string
|
3727
|
+
examples:
|
3728
|
+
- "https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=0236d6d2&filename=xxx_yyy_zzz.csv"
|
3708
3729
|
- title: stream_interval
|
3709
3730
|
description: The current stream interval being processed. The keys are defined by the incremental sync component. Default keys are `start_time` and `end_time`.
|
3710
3731
|
type: object
|
@@ -107,6 +107,16 @@ class CsvParser(Parser):
|
|
107
107
|
encoding: Optional[str] = "utf-8"
|
108
108
|
delimiter: Optional[str] = ","
|
109
109
|
|
110
|
+
def _get_delimiter(self) -> Optional[str]:
|
111
|
+
"""
|
112
|
+
Get delimiter from the configuration. Check for the escape character and decode it.
|
113
|
+
"""
|
114
|
+
if self.delimiter is not None:
|
115
|
+
if self.delimiter.startswith("\\"):
|
116
|
+
self.delimiter = self.delimiter.encode("utf-8").decode("unicode_escape")
|
117
|
+
|
118
|
+
return self.delimiter
|
119
|
+
|
110
120
|
def parse(
|
111
121
|
self,
|
112
122
|
data: BufferedIOBase,
|
@@ -115,8 +125,9 @@ class CsvParser(Parser):
|
|
115
125
|
Parse CSV data from decompressed bytes.
|
116
126
|
"""
|
117
127
|
text_data = TextIOWrapper(data, encoding=self.encoding) # type: ignore
|
118
|
-
reader = csv.DictReader(text_data, delimiter=self.
|
119
|
-
|
128
|
+
reader = csv.DictReader(text_data, delimiter=self._get_delimiter() or ",")
|
129
|
+
for row in reader:
|
130
|
+
yield row
|
120
131
|
|
121
132
|
|
122
133
|
@dataclass
|
@@ -136,6 +136,7 @@ class ResponseToFileExtractor(RecordExtractor):
|
|
136
136
|
"""
|
137
137
|
|
138
138
|
try:
|
139
|
+
# TODO: Add support for other file types, like `json`, with `pd.read_json()`
|
139
140
|
with open(path, "r", encoding=file_encoding) as data:
|
140
141
|
chunks = pd.read_csv(
|
141
142
|
data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object
|
@@ -95,6 +95,10 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
95
95
|
# the oldest partitions can be efficiently removed, maintaining the most recent partitions.
|
96
96
|
self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
|
97
97
|
self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
|
98
|
+
|
99
|
+
# Parent-state tracking: store each partition’s parent state in creation order
|
100
|
+
self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
|
101
|
+
|
98
102
|
self._finished_partitions: set[str] = set()
|
99
103
|
self._lock = threading.Lock()
|
100
104
|
self._timer = Timer()
|
@@ -155,11 +159,62 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
155
159
|
and self._semaphore_per_partition[partition_key]._value == 0
|
156
160
|
):
|
157
161
|
self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
|
158
|
-
|
162
|
+
|
163
|
+
self._check_and_update_parent_state()
|
164
|
+
|
165
|
+
self._emit_state_message()
|
166
|
+
|
167
|
+
def _check_and_update_parent_state(self) -> None:
|
168
|
+
"""
|
169
|
+
Pop the leftmost partition state from _partition_parent_state_map only if
|
170
|
+
*all partitions* up to (and including) that partition key in _semaphore_per_partition
|
171
|
+
are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
|
172
|
+
Additionally, delete finished semaphores with a value of 0 to free up memory,
|
173
|
+
as they are only needed to track errors and completion status.
|
174
|
+
"""
|
175
|
+
last_closed_state = None
|
176
|
+
|
177
|
+
while self._partition_parent_state_map:
|
178
|
+
# Look at the earliest partition key in creation order
|
179
|
+
earliest_key = next(iter(self._partition_parent_state_map))
|
180
|
+
|
181
|
+
# Verify ALL partitions from the left up to earliest_key are finished
|
182
|
+
all_left_finished = True
|
183
|
+
for p_key, sem in list(
|
184
|
+
self._semaphore_per_partition.items()
|
185
|
+
): # Use list to allow modification during iteration
|
186
|
+
# If any earlier partition is still not finished, we must stop
|
187
|
+
if p_key not in self._finished_partitions or sem._value != 0:
|
188
|
+
all_left_finished = False
|
189
|
+
break
|
190
|
+
# Once we've reached earliest_key in the semaphore order, we can stop checking
|
191
|
+
if p_key == earliest_key:
|
192
|
+
break
|
193
|
+
|
194
|
+
# If the partitions up to earliest_key are not all finished, break the while-loop
|
195
|
+
if not all_left_finished:
|
196
|
+
break
|
197
|
+
|
198
|
+
# Pop the leftmost entry from parent-state map
|
199
|
+
_, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
|
200
|
+
last_closed_state = closed_parent_state
|
201
|
+
|
202
|
+
# Clean up finished semaphores with value 0 up to and including earliest_key
|
203
|
+
for p_key in list(self._semaphore_per_partition.keys()):
|
204
|
+
sem = self._semaphore_per_partition[p_key]
|
205
|
+
if p_key in self._finished_partitions and sem._value == 0:
|
206
|
+
del self._semaphore_per_partition[p_key]
|
207
|
+
logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
|
208
|
+
if p_key == earliest_key:
|
209
|
+
break
|
210
|
+
|
211
|
+
# Update _parent_state if we popped at least one partition
|
212
|
+
if last_closed_state is not None:
|
213
|
+
self._parent_state = last_closed_state
|
159
214
|
|
160
215
|
def ensure_at_least_one_state_emitted(self) -> None:
|
161
216
|
"""
|
162
|
-
The platform
|
217
|
+
The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
|
163
218
|
called.
|
164
219
|
"""
|
165
220
|
if not any(
|
@@ -201,13 +256,19 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
201
256
|
|
202
257
|
slices = self._partition_router.stream_slices()
|
203
258
|
self._timer.start()
|
204
|
-
for partition in
|
205
|
-
|
259
|
+
for partition, last, parent_state in iterate_with_last_flag_and_state(
|
260
|
+
slices, self._partition_router.get_stream_state
|
261
|
+
):
|
262
|
+
yield from self._generate_slices_from_partition(partition, parent_state)
|
206
263
|
|
207
|
-
def _generate_slices_from_partition(
|
264
|
+
def _generate_slices_from_partition(
|
265
|
+
self, partition: StreamSlice, parent_state: Mapping[str, Any]
|
266
|
+
) -> Iterable[StreamSlice]:
|
208
267
|
# Ensure the maximum number of partitions is not exceeded
|
209
268
|
self._ensure_partition_limit()
|
210
269
|
|
270
|
+
partition_key = self._to_partition_key(partition.partition)
|
271
|
+
|
211
272
|
cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
|
212
273
|
if not cursor:
|
213
274
|
cursor = self._create_cursor(
|
@@ -216,18 +277,26 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
216
277
|
)
|
217
278
|
with self._lock:
|
218
279
|
self._number_of_partitions += 1
|
219
|
-
self._cursor_per_partition[
|
220
|
-
|
221
|
-
|
222
|
-
|
280
|
+
self._cursor_per_partition[partition_key] = cursor
|
281
|
+
self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
|
282
|
+
|
283
|
+
with self._lock:
|
284
|
+
if (
|
285
|
+
len(self._partition_parent_state_map) == 0
|
286
|
+
or self._partition_parent_state_map[
|
287
|
+
next(reversed(self._partition_parent_state_map))
|
288
|
+
]
|
289
|
+
!= parent_state
|
290
|
+
):
|
291
|
+
self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
|
223
292
|
|
224
293
|
for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
|
225
294
|
cursor.stream_slices(),
|
226
295
|
lambda: None,
|
227
296
|
):
|
228
|
-
self._semaphore_per_partition[
|
297
|
+
self._semaphore_per_partition[partition_key].release()
|
229
298
|
if is_last_slice:
|
230
|
-
self._finished_partitions.add(
|
299
|
+
self._finished_partitions.add(partition_key)
|
231
300
|
yield StreamSlice(
|
232
301
|
partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
|
233
302
|
)
|
@@ -257,9 +326,9 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
257
326
|
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
|
258
327
|
# Try removing finished partitions first
|
259
328
|
for partition_key in list(self._cursor_per_partition.keys()):
|
260
|
-
if (
|
261
|
-
partition_key in self.
|
262
|
-
|
329
|
+
if partition_key in self._finished_partitions and (
|
330
|
+
partition_key not in self._semaphore_per_partition
|
331
|
+
or self._semaphore_per_partition[partition_key]._value == 0
|
263
332
|
):
|
264
333
|
oldest_partition = self._cursor_per_partition.pop(
|
265
334
|
partition_key
|
@@ -338,9 +407,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
338
407
|
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
|
339
408
|
self._create_cursor(state["cursor"])
|
340
409
|
)
|
341
|
-
self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
|
342
|
-
threading.Semaphore(0)
|
343
|
-
)
|
344
410
|
|
345
411
|
# set default state for missing partitions if it is per partition with fallback to global
|
346
412
|
if self._GLOBAL_STATE_KEY in stream_state:
|
@@ -646,7 +646,7 @@ class Rate(BaseModel):
|
|
646
646
|
class Config:
|
647
647
|
extra = Extra.allow
|
648
648
|
|
649
|
-
limit: int = Field(
|
649
|
+
limit: Union[int, str] = Field(
|
650
650
|
...,
|
651
651
|
description="The maximum number of calls allowed within the interval.",
|
652
652
|
title="Limit",
|
@@ -1508,6 +1508,28 @@ class AuthFlow(BaseModel):
|
|
1508
1508
|
oauth_config_specification: Optional[OAuthConfigSpecification] = None
|
1509
1509
|
|
1510
1510
|
|
1511
|
+
class IncrementingCountCursor(BaseModel):
|
1512
|
+
type: Literal["IncrementingCountCursor"]
|
1513
|
+
cursor_field: str = Field(
|
1514
|
+
...,
|
1515
|
+
description="The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.",
|
1516
|
+
examples=["created_at", "{{ config['record_cursor'] }}"],
|
1517
|
+
title="Cursor Field",
|
1518
|
+
)
|
1519
|
+
start_value: Optional[Union[str, int]] = Field(
|
1520
|
+
None,
|
1521
|
+
description="The value that determines the earliest record that should be synced.",
|
1522
|
+
examples=[0, "{{ config['start_value'] }}"],
|
1523
|
+
title="Start Value",
|
1524
|
+
)
|
1525
|
+
start_value_option: Optional[RequestOption] = Field(
|
1526
|
+
None,
|
1527
|
+
description="Optionally configures how the start value will be sent in requests to the source API.",
|
1528
|
+
title="Inject Start Value Into Outgoing HTTP Request",
|
1529
|
+
)
|
1530
|
+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
1531
|
+
|
1532
|
+
|
1511
1533
|
class DatetimeBasedCursor(BaseModel):
|
1512
1534
|
type: Literal["DatetimeBasedCursor"]
|
1513
1535
|
clamping: Optional[Clamping] = Field(
|
@@ -1948,7 +1970,9 @@ class DeclarativeStream(BaseModel):
|
|
1948
1970
|
description="Component used to coordinate how records are extracted across stream slices and request pages.",
|
1949
1971
|
title="Retriever",
|
1950
1972
|
)
|
1951
|
-
incremental_sync: Optional[
|
1973
|
+
incremental_sync: Optional[
|
1974
|
+
Union[CustomIncrementalSync, DatetimeBasedCursor, IncrementingCountCursor]
|
1975
|
+
] = Field(
|
1952
1976
|
None,
|
1953
1977
|
description="Component used to fetch data incrementally based on a time field in the data.",
|
1954
1978
|
title="Incremental Sync",
|
@@ -2225,15 +2249,7 @@ class SimpleRetriever(BaseModel):
|
|
2225
2249
|
CustomPartitionRouter,
|
2226
2250
|
ListPartitionRouter,
|
2227
2251
|
SubstreamPartitionRouter,
|
2228
|
-
|
2229
|
-
List[
|
2230
|
-
Union[
|
2231
|
-
CustomPartitionRouter,
|
2232
|
-
ListPartitionRouter,
|
2233
|
-
SubstreamPartitionRouter,
|
2234
|
-
GroupingPartitionRouter,
|
2235
|
-
]
|
2236
|
-
],
|
2252
|
+
List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
|
2237
2253
|
]
|
2238
2254
|
] = Field(
|
2239
2255
|
[],
|
@@ -2271,7 +2287,7 @@ class AsyncRetriever(BaseModel):
|
|
2271
2287
|
status_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
|
2272
2288
|
..., description="Responsible for fetching the actual status of the async job."
|
2273
2289
|
)
|
2274
|
-
|
2290
|
+
download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
|
2275
2291
|
...,
|
2276
2292
|
description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.",
|
2277
2293
|
)
|
@@ -2286,7 +2302,7 @@ class AsyncRetriever(BaseModel):
|
|
2286
2302
|
...,
|
2287
2303
|
description="Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.",
|
2288
2304
|
)
|
2289
|
-
|
2305
|
+
download_target_requester: Optional[Union[CustomRequester, HttpRequester]] = Field(
|
2290
2306
|
None,
|
2291
2307
|
description="Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.",
|
2292
2308
|
)
|
@@ -2311,15 +2327,7 @@ class AsyncRetriever(BaseModel):
|
|
2311
2327
|
CustomPartitionRouter,
|
2312
2328
|
ListPartitionRouter,
|
2313
2329
|
SubstreamPartitionRouter,
|
2314
|
-
|
2315
|
-
List[
|
2316
|
-
Union[
|
2317
|
-
CustomPartitionRouter,
|
2318
|
-
ListPartitionRouter,
|
2319
|
-
SubstreamPartitionRouter,
|
2320
|
-
GroupingPartitionRouter,
|
2321
|
-
]
|
2322
|
-
],
|
2330
|
+
List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
|
2323
2331
|
]
|
2324
2332
|
] = Field(
|
2325
2333
|
[],
|
@@ -2371,29 +2379,6 @@ class SubstreamPartitionRouter(BaseModel):
|
|
2371
2379
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2372
2380
|
|
2373
2381
|
|
2374
|
-
class GroupingPartitionRouter(BaseModel):
|
2375
|
-
type: Literal["GroupingPartitionRouter"]
|
2376
|
-
group_size: int = Field(
|
2377
|
-
...,
|
2378
|
-
description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
|
2379
|
-
examples=[10, 50],
|
2380
|
-
title="Group Size",
|
2381
|
-
)
|
2382
|
-
underlying_partition_router: Union[
|
2383
|
-
CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
|
2384
|
-
] = Field(
|
2385
|
-
...,
|
2386
|
-
description="The partition router whose output will be grouped. This can be any valid partition router component.",
|
2387
|
-
title="Underlying Partition Router",
|
2388
|
-
)
|
2389
|
-
deduplicate: Optional[bool] = Field(
|
2390
|
-
True,
|
2391
|
-
description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
|
2392
|
-
title="Deduplicate Partitions",
|
2393
|
-
)
|
2394
|
-
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2395
|
-
|
2396
|
-
|
2397
2382
|
class HttpComponentsResolver(BaseModel):
|
2398
2383
|
type: Literal["HttpComponentsResolver"]
|
2399
2384
|
retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(
|
@@ -45,7 +45,7 @@ class AirbyteCustomCodeNotPermittedError(Exception):
|
|
45
45
|
def _hash_text(input_text: str, hash_type: str = "md5") -> str:
|
46
46
|
"""Return the hash of the input text using the specified hash type."""
|
47
47
|
if not input_text:
|
48
|
-
raise ValueError("
|
48
|
+
raise ValueError("Hash input text cannot be empty.")
|
49
49
|
|
50
50
|
hash_object = CHECKSUM_FUNCTIONS[hash_type]()
|
51
51
|
hash_object.update(input_text.encode())
|
@@ -68,6 +68,10 @@ def validate_python_code(
|
|
68
68
|
|
69
69
|
Currently we fail if no checksums are provided, although this may change in the future.
|
70
70
|
"""
|
71
|
+
if not code_text:
|
72
|
+
# No code provided, nothing to validate.
|
73
|
+
return
|
74
|
+
|
71
75
|
if not checksums:
|
72
76
|
raise ValueError(f"A checksum is required to validate the code. Received: {checksums}")
|
73
77
|
|
@@ -77,8 +81,18 @@ def validate_python_code(
|
|
77
81
|
f"Unsupported checksum type: {checksum_type}. Supported checksum types are: {CHECKSUM_FUNCTIONS.keys()}"
|
78
82
|
)
|
79
83
|
|
80
|
-
|
81
|
-
|
84
|
+
calculated_checksum = _hash_text(code_text, checksum_type)
|
85
|
+
if calculated_checksum != checksum:
|
86
|
+
raise AirbyteCodeTamperedError(
|
87
|
+
f"{checksum_type} checksum does not match."
|
88
|
+
+ str(
|
89
|
+
{
|
90
|
+
"expected_checksum": checksum,
|
91
|
+
"actual_checksum": calculated_checksum,
|
92
|
+
"code_text": code_text,
|
93
|
+
}
|
94
|
+
),
|
95
|
+
)
|
82
96
|
|
83
97
|
|
84
98
|
def get_registered_components_module(
|
@@ -94,7 +108,7 @@ def get_registered_components_module(
|
|
94
108
|
|
95
109
|
Returns `None` if no components is provided and the `components` module is not found.
|
96
110
|
"""
|
97
|
-
if config and INJECTED_COMPONENTS_PY
|
111
|
+
if config and config.get(INJECTED_COMPONENTS_PY, None):
|
98
112
|
if not custom_code_execution_permitted():
|
99
113
|
raise AirbyteCustomCodeNotPermittedError
|
100
114
|
|