airbyte-cdk 6.37.0.dev1__py3-none-any.whl → 6.37.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/models.py +16 -14
- airbyte_cdk/connector_builder/test_reader/helpers.py +120 -22
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +16 -3
- airbyte_cdk/connector_builder/test_reader/types.py +9 -1
- airbyte_cdk/sources/declarative/auth/token_provider.py +1 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +43 -7
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +7 -1
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +77 -48
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +13 -2
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +1 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +83 -17
- airbyte_cdk/sources/declarative/interpolation/macros.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +37 -50
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +18 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +171 -70
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/requesters/README.md +5 -5
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +60 -17
- airbyte_cdk/sources/declarative/requesters/http_requester.py +49 -17
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +25 -4
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +6 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +7 -2
- airbyte_cdk/sources/declarative/requesters/requester.py +7 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +10 -3
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +21 -4
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +2 -2
- airbyte_cdk/sources/http_logger.py +3 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +2 -1
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +3 -3
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +1 -0
- airbyte_cdk/sources/types.py +1 -0
- airbyte_cdk/utils/mapping_helpers.py +18 -1
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/METADATA +4 -4
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/RECORD +39 -44
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -136
- airbyte_cdk/sources/embedded/__init__.py +0 -3
- airbyte_cdk/sources/embedded/base_integration.py +0 -61
- airbyte_cdk/sources/embedded/catalog.py +0 -57
- airbyte_cdk/sources/embedded/runner.py +0 -57
- airbyte_cdk/sources/embedded/tools.py +0 -27
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dist-info}/entry_points.txt +0 -0
@@ -777,6 +777,44 @@ definitions:
|
|
777
777
|
type:
|
778
778
|
type: string
|
779
779
|
enum: [LegacyToPerPartitionStateMigration]
|
780
|
+
IncrementingCountCursor:
|
781
|
+
title: Incrementing Count Cursor
|
782
|
+
description: Cursor that allows for incremental sync according to a continuously increasing integer.
|
783
|
+
type: object
|
784
|
+
required:
|
785
|
+
- type
|
786
|
+
- cursor_field
|
787
|
+
properties:
|
788
|
+
type:
|
789
|
+
type: string
|
790
|
+
enum: [IncrementingCountCursor]
|
791
|
+
cursor_field:
|
792
|
+
title: Cursor Field
|
793
|
+
description: The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.
|
794
|
+
type: string
|
795
|
+
interpolation_context:
|
796
|
+
- config
|
797
|
+
examples:
|
798
|
+
- "created_at"
|
799
|
+
- "{{ config['record_cursor'] }}"
|
800
|
+
start_value:
|
801
|
+
title: Start Value
|
802
|
+
description: The value that determines the earliest record that should be synced.
|
803
|
+
anyOf:
|
804
|
+
- type: string
|
805
|
+
- type: integer
|
806
|
+
interpolation_context:
|
807
|
+
- config
|
808
|
+
examples:
|
809
|
+
- 0
|
810
|
+
- "{{ config['start_value'] }}"
|
811
|
+
start_value_option:
|
812
|
+
title: Inject Start Value Into Outgoing HTTP Request
|
813
|
+
description: Optionally configures how the start value will be sent in requests to the source API.
|
814
|
+
"$ref": "#/definitions/RequestOption"
|
815
|
+
$parameters:
|
816
|
+
type: object
|
817
|
+
additionalProperties: true
|
780
818
|
DatetimeBasedCursor:
|
781
819
|
title: Datetime Based Cursor
|
782
820
|
description: Cursor to provide incremental capabilities over datetime.
|
@@ -844,6 +882,7 @@ definitions:
|
|
844
882
|
* **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`
|
845
883
|
* **%S**: Second (zero-padded) - `00`, `01`, ..., `59`
|
846
884
|
* **%f**: Microsecond (zero-padded to 6 digits) - `000000`
|
885
|
+
* **%_ms**: Millisecond (zero-padded to 3 digits) - `000`
|
847
886
|
* **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`
|
848
887
|
* **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`
|
849
888
|
* **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`
|
@@ -1318,6 +1357,7 @@ definitions:
|
|
1318
1357
|
anyOf:
|
1319
1358
|
- "$ref": "#/definitions/CustomIncrementalSync"
|
1320
1359
|
- "$ref": "#/definitions/DatetimeBasedCursor"
|
1360
|
+
- "$ref": "#/definitions/IncrementingCountCursor"
|
1321
1361
|
name:
|
1322
1362
|
title: Name
|
1323
1363
|
description: The stream name.
|
@@ -1490,7 +1530,11 @@ definitions:
|
|
1490
1530
|
limit:
|
1491
1531
|
title: Limit
|
1492
1532
|
description: The maximum number of calls allowed within the interval.
|
1493
|
-
|
1533
|
+
anyOf:
|
1534
|
+
- type: integer
|
1535
|
+
- type: string
|
1536
|
+
interpolation_context:
|
1537
|
+
- config
|
1494
1538
|
interval:
|
1495
1539
|
title: Interval
|
1496
1540
|
description: The time interval for the rate limit.
|
@@ -1750,7 +1794,6 @@ definitions:
|
|
1750
1794
|
type: object
|
1751
1795
|
required:
|
1752
1796
|
- type
|
1753
|
-
- path
|
1754
1797
|
- url_base
|
1755
1798
|
properties:
|
1756
1799
|
type:
|
@@ -1762,9 +1805,18 @@ definitions:
|
|
1762
1805
|
type: string
|
1763
1806
|
interpolation_context:
|
1764
1807
|
- config
|
1808
|
+
- next_page_token
|
1809
|
+
- stream_interval
|
1810
|
+
- stream_partition
|
1811
|
+
- stream_slice
|
1812
|
+
- creation_response
|
1813
|
+
- polling_response
|
1814
|
+
- download_target
|
1765
1815
|
examples:
|
1766
1816
|
- "https://connect.squareup.com/v2"
|
1767
|
-
- "{{ config['base_url'] or 'https://app.posthog.com'}}/api
|
1817
|
+
- "{{ config['base_url'] or 'https://app.posthog.com'}}/api"
|
1818
|
+
- "https://connect.squareup.com/v2/quotes/{{ stream_partition['id'] }}/quote_line_groups"
|
1819
|
+
- "https://example.com/api/v1/resource/{{ next_page_token['id'] }}"
|
1768
1820
|
path:
|
1769
1821
|
title: URL Path
|
1770
1822
|
description: Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.
|
@@ -1775,6 +1827,9 @@ definitions:
|
|
1775
1827
|
- stream_interval
|
1776
1828
|
- stream_partition
|
1777
1829
|
- stream_slice
|
1830
|
+
- creation_response
|
1831
|
+
- polling_response
|
1832
|
+
- download_target
|
1778
1833
|
examples:
|
1779
1834
|
- "/products"
|
1780
1835
|
- "/quotes/{{ stream_partition['id'] }}/quote_line_groups"
|
@@ -2394,6 +2449,7 @@ definitions:
|
|
2394
2449
|
* **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`
|
2395
2450
|
* **%S**: Second (zero-padded) - `00`, `01`, ..., `59`
|
2396
2451
|
* **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999`
|
2452
|
+
* **%_ms**: Millisecond (zero-padded to 3 digits) - `000`, `001`, ..., `999`
|
2397
2453
|
* **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`
|
2398
2454
|
* **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`
|
2399
2455
|
* **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`
|
@@ -3130,14 +3186,12 @@ definitions:
|
|
3130
3186
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3131
3187
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3132
3188
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3133
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3134
3189
|
- type: array
|
3135
3190
|
items:
|
3136
3191
|
anyOf:
|
3137
3192
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3138
3193
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3139
3194
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3140
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3141
3195
|
decoder:
|
3142
3196
|
title: Decoder
|
3143
3197
|
description: Component decoding the response so records can be extracted.
|
@@ -3221,7 +3275,7 @@ definitions:
|
|
3221
3275
|
- polling_requester
|
3222
3276
|
- download_requester
|
3223
3277
|
- status_extractor
|
3224
|
-
-
|
3278
|
+
- download_target_extractor
|
3225
3279
|
properties:
|
3226
3280
|
type:
|
3227
3281
|
type: string
|
@@ -3238,7 +3292,7 @@ definitions:
|
|
3238
3292
|
anyOf:
|
3239
3293
|
- "$ref": "#/definitions/CustomRecordExtractor"
|
3240
3294
|
- "$ref": "#/definitions/DpathExtractor"
|
3241
|
-
|
3295
|
+
download_target_extractor:
|
3242
3296
|
description: Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.
|
3243
3297
|
anyOf:
|
3244
3298
|
- "$ref": "#/definitions/CustomRecordExtractor"
|
@@ -3259,7 +3313,7 @@ definitions:
|
|
3259
3313
|
anyOf:
|
3260
3314
|
- "$ref": "#/definitions/CustomRequester"
|
3261
3315
|
- "$ref": "#/definitions/HttpRequester"
|
3262
|
-
|
3316
|
+
download_target_requester:
|
3263
3317
|
description: Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.
|
3264
3318
|
anyOf:
|
3265
3319
|
- "$ref": "#/definitions/CustomRequester"
|
@@ -3292,14 +3346,12 @@ definitions:
|
|
3292
3346
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3293
3347
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3294
3348
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3295
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3296
3349
|
- type: array
|
3297
3350
|
items:
|
3298
3351
|
anyOf:
|
3299
3352
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3300
3353
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3301
3354
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3302
|
-
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3303
3355
|
decoder:
|
3304
3356
|
title: Decoder
|
3305
3357
|
description: Component decoding the response so records can be extracted.
|
@@ -3416,44 +3468,6 @@ definitions:
|
|
3416
3468
|
$parameters:
|
3417
3469
|
type: object
|
3418
3470
|
additionalProperties: true
|
3419
|
-
GroupingPartitionRouter:
|
3420
|
-
title: Grouping Partition Router
|
3421
|
-
description: >
|
3422
|
-
A decorator on top of a partition router that groups partitions into batches of a specified size.
|
3423
|
-
This is useful for APIs that support filtering by multiple partition keys in a single request.
|
3424
|
-
Note that per-partition incremental syncs may not work as expected because the grouping
|
3425
|
-
of partitions might change between syncs, potentially leading to inconsistent state tracking.
|
3426
|
-
type: object
|
3427
|
-
required:
|
3428
|
-
- type
|
3429
|
-
- group_size
|
3430
|
-
- underlying_partition_router
|
3431
|
-
properties:
|
3432
|
-
type:
|
3433
|
-
type: string
|
3434
|
-
enum: [GroupingPartitionRouter]
|
3435
|
-
group_size:
|
3436
|
-
title: Group Size
|
3437
|
-
description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
|
3438
|
-
type: integer
|
3439
|
-
examples:
|
3440
|
-
- 10
|
3441
|
-
- 50
|
3442
|
-
underlying_partition_router:
|
3443
|
-
title: Underlying Partition Router
|
3444
|
-
description: The partition router whose output will be grouped. This can be any valid partition router component.
|
3445
|
-
anyOf:
|
3446
|
-
- "$ref": "#/definitions/CustomPartitionRouter"
|
3447
|
-
- "$ref": "#/definitions/ListPartitionRouter"
|
3448
|
-
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3449
|
-
deduplicate:
|
3450
|
-
title: Deduplicate Partitions
|
3451
|
-
description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
|
3452
|
-
type: boolean
|
3453
|
-
default: true
|
3454
|
-
$parameters:
|
3455
|
-
type: object
|
3456
|
-
additionalProperties: true
|
3457
3471
|
WaitUntilTimeFromHeader:
|
3458
3472
|
title: Wait Until Time Defined In Response Header
|
3459
3473
|
description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
|
@@ -3705,6 +3719,21 @@ interpolation:
|
|
3705
3719
|
self: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=
|
3706
3720
|
next: https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=0236d6d2
|
3707
3721
|
count: 82
|
3722
|
+
- title: creation_response
|
3723
|
+
description: The response received from the creation_requester in the AsyncRetriever component.
|
3724
|
+
type: object
|
3725
|
+
examples:
|
3726
|
+
- id: "1234"
|
3727
|
+
- title: polling_response
|
3728
|
+
description: The response received from the polling_requester in the AsyncRetriever component.
|
3729
|
+
type: object
|
3730
|
+
examples:
|
3731
|
+
- id: "1234"
|
3732
|
+
- title: download_target
|
3733
|
+
description: The `URL` received from the polling_requester in the AsyncRetriever with jobStatus as `COMPLETED`.
|
3734
|
+
type: string
|
3735
|
+
examples:
|
3736
|
+
- "https://api.sendgrid.com/v3/marketing/lists?page_size=1&page_token=0236d6d2&filename=xxx_yyy_zzz.csv"
|
3708
3737
|
- title: stream_interval
|
3709
3738
|
description: The current stream interval being processed. The keys are defined by the incremental sync component. Default keys are `start_time` and `end_time`.
|
3710
3739
|
type: object
|
@@ -107,6 +107,16 @@ class CsvParser(Parser):
|
|
107
107
|
encoding: Optional[str] = "utf-8"
|
108
108
|
delimiter: Optional[str] = ","
|
109
109
|
|
110
|
+
def _get_delimiter(self) -> Optional[str]:
|
111
|
+
"""
|
112
|
+
Get delimiter from the configuration. Check for the escape character and decode it.
|
113
|
+
"""
|
114
|
+
if self.delimiter is not None:
|
115
|
+
if self.delimiter.startswith("\\"):
|
116
|
+
self.delimiter = self.delimiter.encode("utf-8").decode("unicode_escape")
|
117
|
+
|
118
|
+
return self.delimiter
|
119
|
+
|
110
120
|
def parse(
|
111
121
|
self,
|
112
122
|
data: BufferedIOBase,
|
@@ -115,8 +125,9 @@ class CsvParser(Parser):
|
|
115
125
|
Parse CSV data from decompressed bytes.
|
116
126
|
"""
|
117
127
|
text_data = TextIOWrapper(data, encoding=self.encoding) # type: ignore
|
118
|
-
reader = csv.DictReader(text_data, delimiter=self.
|
119
|
-
|
128
|
+
reader = csv.DictReader(text_data, delimiter=self._get_delimiter() or ",")
|
129
|
+
for row in reader:
|
130
|
+
yield row
|
120
131
|
|
121
132
|
|
122
133
|
@dataclass
|
@@ -136,6 +136,7 @@ class ResponseToFileExtractor(RecordExtractor):
|
|
136
136
|
"""
|
137
137
|
|
138
138
|
try:
|
139
|
+
# TODO: Add support for other file types, like `json`, with `pd.read_json()`
|
139
140
|
with open(path, "r", encoding=file_encoding) as data:
|
140
141
|
chunks = pd.read_csv(
|
141
142
|
data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object
|
@@ -95,6 +95,10 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
95
95
|
# the oldest partitions can be efficiently removed, maintaining the most recent partitions.
|
96
96
|
self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
|
97
97
|
self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
|
98
|
+
|
99
|
+
# Parent-state tracking: store each partition’s parent state in creation order
|
100
|
+
self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
|
101
|
+
|
98
102
|
self._finished_partitions: set[str] = set()
|
99
103
|
self._lock = threading.Lock()
|
100
104
|
self._timer = Timer()
|
@@ -155,11 +159,62 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
155
159
|
and self._semaphore_per_partition[partition_key]._value == 0
|
156
160
|
):
|
157
161
|
self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
|
158
|
-
|
162
|
+
|
163
|
+
self._check_and_update_parent_state()
|
164
|
+
|
165
|
+
self._emit_state_message()
|
166
|
+
|
167
|
+
def _check_and_update_parent_state(self) -> None:
|
168
|
+
"""
|
169
|
+
Pop the leftmost partition state from _partition_parent_state_map only if
|
170
|
+
*all partitions* up to (and including) that partition key in _semaphore_per_partition
|
171
|
+
are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
|
172
|
+
Additionally, delete finished semaphores with a value of 0 to free up memory,
|
173
|
+
as they are only needed to track errors and completion status.
|
174
|
+
"""
|
175
|
+
last_closed_state = None
|
176
|
+
|
177
|
+
while self._partition_parent_state_map:
|
178
|
+
# Look at the earliest partition key in creation order
|
179
|
+
earliest_key = next(iter(self._partition_parent_state_map))
|
180
|
+
|
181
|
+
# Verify ALL partitions from the left up to earliest_key are finished
|
182
|
+
all_left_finished = True
|
183
|
+
for p_key, sem in list(
|
184
|
+
self._semaphore_per_partition.items()
|
185
|
+
): # Use list to allow modification during iteration
|
186
|
+
# If any earlier partition is still not finished, we must stop
|
187
|
+
if p_key not in self._finished_partitions or sem._value != 0:
|
188
|
+
all_left_finished = False
|
189
|
+
break
|
190
|
+
# Once we've reached earliest_key in the semaphore order, we can stop checking
|
191
|
+
if p_key == earliest_key:
|
192
|
+
break
|
193
|
+
|
194
|
+
# If the partitions up to earliest_key are not all finished, break the while-loop
|
195
|
+
if not all_left_finished:
|
196
|
+
break
|
197
|
+
|
198
|
+
# Pop the leftmost entry from parent-state map
|
199
|
+
_, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
|
200
|
+
last_closed_state = closed_parent_state
|
201
|
+
|
202
|
+
# Clean up finished semaphores with value 0 up to and including earliest_key
|
203
|
+
for p_key in list(self._semaphore_per_partition.keys()):
|
204
|
+
sem = self._semaphore_per_partition[p_key]
|
205
|
+
if p_key in self._finished_partitions and sem._value == 0:
|
206
|
+
del self._semaphore_per_partition[p_key]
|
207
|
+
logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
|
208
|
+
if p_key == earliest_key:
|
209
|
+
break
|
210
|
+
|
211
|
+
# Update _parent_state if we popped at least one partition
|
212
|
+
if last_closed_state is not None:
|
213
|
+
self._parent_state = last_closed_state
|
159
214
|
|
160
215
|
def ensure_at_least_one_state_emitted(self) -> None:
|
161
216
|
"""
|
162
|
-
The platform
|
217
|
+
The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
|
163
218
|
called.
|
164
219
|
"""
|
165
220
|
if not any(
|
@@ -201,13 +256,19 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
201
256
|
|
202
257
|
slices = self._partition_router.stream_slices()
|
203
258
|
self._timer.start()
|
204
|
-
for partition in
|
205
|
-
|
259
|
+
for partition, last, parent_state in iterate_with_last_flag_and_state(
|
260
|
+
slices, self._partition_router.get_stream_state
|
261
|
+
):
|
262
|
+
yield from self._generate_slices_from_partition(partition, parent_state)
|
206
263
|
|
207
|
-
def _generate_slices_from_partition(
|
264
|
+
def _generate_slices_from_partition(
|
265
|
+
self, partition: StreamSlice, parent_state: Mapping[str, Any]
|
266
|
+
) -> Iterable[StreamSlice]:
|
208
267
|
# Ensure the maximum number of partitions is not exceeded
|
209
268
|
self._ensure_partition_limit()
|
210
269
|
|
270
|
+
partition_key = self._to_partition_key(partition.partition)
|
271
|
+
|
211
272
|
cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
|
212
273
|
if not cursor:
|
213
274
|
cursor = self._create_cursor(
|
@@ -216,18 +277,26 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
216
277
|
)
|
217
278
|
with self._lock:
|
218
279
|
self._number_of_partitions += 1
|
219
|
-
self._cursor_per_partition[
|
220
|
-
|
221
|
-
|
222
|
-
|
280
|
+
self._cursor_per_partition[partition_key] = cursor
|
281
|
+
self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
|
282
|
+
|
283
|
+
with self._lock:
|
284
|
+
if (
|
285
|
+
len(self._partition_parent_state_map) == 0
|
286
|
+
or self._partition_parent_state_map[
|
287
|
+
next(reversed(self._partition_parent_state_map))
|
288
|
+
]
|
289
|
+
!= parent_state
|
290
|
+
):
|
291
|
+
self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
|
223
292
|
|
224
293
|
for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
|
225
294
|
cursor.stream_slices(),
|
226
295
|
lambda: None,
|
227
296
|
):
|
228
|
-
self._semaphore_per_partition[
|
297
|
+
self._semaphore_per_partition[partition_key].release()
|
229
298
|
if is_last_slice:
|
230
|
-
self._finished_partitions.add(
|
299
|
+
self._finished_partitions.add(partition_key)
|
231
300
|
yield StreamSlice(
|
232
301
|
partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
|
233
302
|
)
|
@@ -257,9 +326,9 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
257
326
|
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
|
258
327
|
# Try removing finished partitions first
|
259
328
|
for partition_key in list(self._cursor_per_partition.keys()):
|
260
|
-
if (
|
261
|
-
partition_key in self.
|
262
|
-
|
329
|
+
if partition_key in self._finished_partitions and (
|
330
|
+
partition_key not in self._semaphore_per_partition
|
331
|
+
or self._semaphore_per_partition[partition_key]._value == 0
|
263
332
|
):
|
264
333
|
oldest_partition = self._cursor_per_partition.pop(
|
265
334
|
partition_key
|
@@ -338,9 +407,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
338
407
|
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
|
339
408
|
self._create_cursor(state["cursor"])
|
340
409
|
)
|
341
|
-
self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
|
342
|
-
threading.Semaphore(0)
|
343
|
-
)
|
344
410
|
|
345
411
|
# set default state for missing partitions if it is per partition with fallback to global
|
346
412
|
if self._GLOBAL_STATE_KEY in stream_state:
|
@@ -646,7 +646,7 @@ class Rate(BaseModel):
|
|
646
646
|
class Config:
|
647
647
|
extra = Extra.allow
|
648
648
|
|
649
|
-
limit: int = Field(
|
649
|
+
limit: Union[int, str] = Field(
|
650
650
|
...,
|
651
651
|
description="The maximum number of calls allowed within the interval.",
|
652
652
|
title="Limit",
|
@@ -939,7 +939,7 @@ class MinMaxDatetime(BaseModel):
|
|
939
939
|
)
|
940
940
|
datetime_format: Optional[str] = Field(
|
941
941
|
"",
|
942
|
-
description='Format of the datetime value. Defaults to "%Y-%m-%dT%H:%M:%S.%f%z" if left empty. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (Sunday as first day) - `00`, `01`, ..., `53`\n * **%W**: Week number of the year (Monday as first day) - `00`, `01`, ..., `53`\n * **%c**: Date and time representation - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date representation - `08/16/1988`\n * **%X**: Time representation - `21:30:00`\n * **%%**: Literal \'%\' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n',
|
942
|
+
description='Format of the datetime value. Defaults to "%Y-%m-%dT%H:%M:%S.%f%z" if left empty. Use placeholders starting with "%" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`, `000001`, ..., `999999`\n * **%_ms**: Millisecond (zero-padded to 3 digits) - `000`, `001`, ..., `999`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (Sunday as first day) - `00`, `01`, ..., `53`\n * **%W**: Week number of the year (Monday as first day) - `00`, `01`, ..., `53`\n * **%c**: Date and time representation - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date representation - `08/16/1988`\n * **%X**: Time representation - `21:30:00`\n * **%%**: Literal \'%\' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n',
|
943
943
|
examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s"],
|
944
944
|
title="Datetime Format",
|
945
945
|
)
|
@@ -1508,6 +1508,28 @@ class AuthFlow(BaseModel):
|
|
1508
1508
|
oauth_config_specification: Optional[OAuthConfigSpecification] = None
|
1509
1509
|
|
1510
1510
|
|
1511
|
+
class IncrementingCountCursor(BaseModel):
|
1512
|
+
type: Literal["IncrementingCountCursor"]
|
1513
|
+
cursor_field: str = Field(
|
1514
|
+
...,
|
1515
|
+
description="The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.",
|
1516
|
+
examples=["created_at", "{{ config['record_cursor'] }}"],
|
1517
|
+
title="Cursor Field",
|
1518
|
+
)
|
1519
|
+
start_value: Optional[Union[str, int]] = Field(
|
1520
|
+
None,
|
1521
|
+
description="The value that determines the earliest record that should be synced.",
|
1522
|
+
examples=[0, "{{ config['start_value'] }}"],
|
1523
|
+
title="Start Value",
|
1524
|
+
)
|
1525
|
+
start_value_option: Optional[RequestOption] = Field(
|
1526
|
+
None,
|
1527
|
+
description="Optionally configures how the start value will be sent in requests to the source API.",
|
1528
|
+
title="Inject Start Value Into Outgoing HTTP Request",
|
1529
|
+
)
|
1530
|
+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
1531
|
+
|
1532
|
+
|
1511
1533
|
class DatetimeBasedCursor(BaseModel):
|
1512
1534
|
type: Literal["DatetimeBasedCursor"]
|
1513
1535
|
clamping: Optional[Clamping] = Field(
|
@@ -1523,7 +1545,7 @@ class DatetimeBasedCursor(BaseModel):
|
|
1523
1545
|
)
|
1524
1546
|
datetime_format: str = Field(
|
1525
1547
|
...,
|
1526
|
-
description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp (milliseconds) - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
|
1548
|
+
description="The datetime format used to format the datetime values that are sent in outgoing requests to the API. Use placeholders starting with \"%\" to describe the format the API is using. The following placeholders are available:\n * **%s**: Epoch unix timestamp - `1686218963`\n * **%s_as_float**: Epoch unix timestamp in seconds as float with microsecond precision - `1686218963.123456`\n * **%ms**: Epoch unix timestamp (milliseconds) - `1686218963123`\n * **%a**: Weekday (abbreviated) - `Sun`\n * **%A**: Weekday (full) - `Sunday`\n * **%w**: Weekday (decimal) - `0` (Sunday), `6` (Saturday)\n * **%d**: Day of the month (zero-padded) - `01`, `02`, ..., `31`\n * **%b**: Month (abbreviated) - `Jan`\n * **%B**: Month (full) - `January`\n * **%m**: Month (zero-padded) - `01`, `02`, ..., `12`\n * **%y**: Year (without century, zero-padded) - `00`, `01`, ..., `99`\n * **%Y**: Year (with century) - `0001`, `0002`, ..., `9999`\n * **%H**: Hour (24-hour, zero-padded) - `00`, `01`, ..., `23`\n * **%I**: Hour (12-hour, zero-padded) - `01`, `02`, ..., `12`\n * **%p**: AM/PM indicator\n * **%M**: Minute (zero-padded) - `00`, `01`, ..., `59`\n * **%S**: Second (zero-padded) - `00`, `01`, ..., `59`\n * **%f**: Microsecond (zero-padded to 6 digits) - `000000`\n * **%_ms**: Millisecond (zero-padded to 3 digits) - `000`\n * **%z**: UTC offset - `(empty)`, `+0000`, `-04:00`\n * **%Z**: Time zone name - `(empty)`, `UTC`, `GMT`\n * **%j**: Day of the year (zero-padded) - `001`, `002`, ..., `366`\n * **%U**: Week number of the year (starting Sunday) - `00`, ..., `53`\n * **%W**: Week number of the year (starting Monday) - `00`, ..., `53`\n * **%c**: Date and time - `Tue Aug 16 21:30:00 1988`\n * **%x**: Date standard format - `08/16/1988`\n * **%X**: Time standard format - `21:30:00`\n * **%%**: Literal '%' character\n\n Some placeholders depend on the locale of the underlying system - in most cases this locale is configured as en/US. For more information see the [Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes).\n",
|
1527
1549
|
examples=["%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%d", "%s", "%ms", "%s_as_float"],
|
1528
1550
|
title="Outgoing Datetime Format",
|
1529
1551
|
)
|
@@ -1948,7 +1970,9 @@ class DeclarativeStream(BaseModel):
|
|
1948
1970
|
description="Component used to coordinate how records are extracted across stream slices and request pages.",
|
1949
1971
|
title="Retriever",
|
1950
1972
|
)
|
1951
|
-
incremental_sync: Optional[
|
1973
|
+
incremental_sync: Optional[
|
1974
|
+
Union[CustomIncrementalSync, DatetimeBasedCursor, IncrementingCountCursor]
|
1975
|
+
] = Field(
|
1952
1976
|
None,
|
1953
1977
|
description="Component used to fetch data incrementally based on a time field in the data.",
|
1954
1978
|
title="Incremental Sync",
|
@@ -2048,12 +2072,14 @@ class HttpRequester(BaseModel):
|
|
2048
2072
|
description="Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.",
|
2049
2073
|
examples=[
|
2050
2074
|
"https://connect.squareup.com/v2",
|
2051
|
-
"{{ config['base_url'] or 'https://app.posthog.com'}}/api
|
2075
|
+
"{{ config['base_url'] or 'https://app.posthog.com'}}/api",
|
2076
|
+
"https://connect.squareup.com/v2/quotes/{{ stream_partition['id'] }}/quote_line_groups",
|
2077
|
+
"https://example.com/api/v1/resource/{{ next_page_token['id'] }}",
|
2052
2078
|
],
|
2053
2079
|
title="API Base URL",
|
2054
2080
|
)
|
2055
|
-
path: str = Field(
|
2056
|
-
|
2081
|
+
path: Optional[str] = Field(
|
2082
|
+
None,
|
2057
2083
|
description="Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.",
|
2058
2084
|
examples=[
|
2059
2085
|
"/products",
|
@@ -2225,15 +2251,7 @@ class SimpleRetriever(BaseModel):
|
|
2225
2251
|
CustomPartitionRouter,
|
2226
2252
|
ListPartitionRouter,
|
2227
2253
|
SubstreamPartitionRouter,
|
2228
|
-
|
2229
|
-
List[
|
2230
|
-
Union[
|
2231
|
-
CustomPartitionRouter,
|
2232
|
-
ListPartitionRouter,
|
2233
|
-
SubstreamPartitionRouter,
|
2234
|
-
GroupingPartitionRouter,
|
2235
|
-
]
|
2236
|
-
],
|
2254
|
+
List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
|
2237
2255
|
]
|
2238
2256
|
] = Field(
|
2239
2257
|
[],
|
@@ -2271,7 +2289,7 @@ class AsyncRetriever(BaseModel):
|
|
2271
2289
|
status_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
|
2272
2290
|
..., description="Responsible for fetching the actual status of the async job."
|
2273
2291
|
)
|
2274
|
-
|
2292
|
+
download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
|
2275
2293
|
...,
|
2276
2294
|
description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.",
|
2277
2295
|
)
|
@@ -2286,7 +2304,7 @@ class AsyncRetriever(BaseModel):
|
|
2286
2304
|
...,
|
2287
2305
|
description="Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.",
|
2288
2306
|
)
|
2289
|
-
|
2307
|
+
download_target_requester: Optional[Union[CustomRequester, HttpRequester]] = Field(
|
2290
2308
|
None,
|
2291
2309
|
description="Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.",
|
2292
2310
|
)
|
@@ -2311,15 +2329,7 @@ class AsyncRetriever(BaseModel):
|
|
2311
2329
|
CustomPartitionRouter,
|
2312
2330
|
ListPartitionRouter,
|
2313
2331
|
SubstreamPartitionRouter,
|
2314
|
-
|
2315
|
-
List[
|
2316
|
-
Union[
|
2317
|
-
CustomPartitionRouter,
|
2318
|
-
ListPartitionRouter,
|
2319
|
-
SubstreamPartitionRouter,
|
2320
|
-
GroupingPartitionRouter,
|
2321
|
-
]
|
2322
|
-
],
|
2332
|
+
List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
|
2323
2333
|
]
|
2324
2334
|
] = Field(
|
2325
2335
|
[],
|
@@ -2371,29 +2381,6 @@ class SubstreamPartitionRouter(BaseModel):
|
|
2371
2381
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2372
2382
|
|
2373
2383
|
|
2374
|
-
class GroupingPartitionRouter(BaseModel):
|
2375
|
-
type: Literal["GroupingPartitionRouter"]
|
2376
|
-
group_size: int = Field(
|
2377
|
-
...,
|
2378
|
-
description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
|
2379
|
-
examples=[10, 50],
|
2380
|
-
title="Group Size",
|
2381
|
-
)
|
2382
|
-
underlying_partition_router: Union[
|
2383
|
-
CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
|
2384
|
-
] = Field(
|
2385
|
-
...,
|
2386
|
-
description="The partition router whose output will be grouped. This can be any valid partition router component.",
|
2387
|
-
title="Underlying Partition Router",
|
2388
|
-
)
|
2389
|
-
deduplicate: Optional[bool] = Field(
|
2390
|
-
True,
|
2391
|
-
description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
|
2392
|
-
title="Deduplicate Partitions",
|
2393
|
-
)
|
2394
|
-
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2395
|
-
|
2396
|
-
|
2397
2384
|
class HttpComponentsResolver(BaseModel):
|
2398
2385
|
type: Literal["HttpComponentsResolver"]
|
2399
2386
|
retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(
|