airbyte-cdk 6.36.0.dev0__py3-none-any.whl → 6.37.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/models.py +16 -14
- airbyte_cdk/connector_builder/test_reader/helpers.py +120 -22
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +16 -3
- airbyte_cdk/connector_builder/test_reader/types.py +9 -1
- airbyte_cdk/entrypoint.py +7 -7
- airbyte_cdk/sources/declarative/auth/token_provider.py +1 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +15 -75
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +15 -16
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +13 -2
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +0 -1
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/__init__.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/filters.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +1 -1
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +14 -1
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1 -1
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +52 -30
- airbyte_cdk/sources/declarative/requesters/http_requester.py +0 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +1 -4
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +0 -3
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -47
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +0 -2
- airbyte_cdk/sources/declarative/transformations/add_fields.py +4 -4
- airbyte_cdk/sources/http_logger.py +3 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +1 -0
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/RECORD +36 -36
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.36.0.dev0.dist-info → airbyte_cdk-6.37.0.dev0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/models.py
CHANGED
@@ -21,20 +21,6 @@ class HttpRequest:
     body: Optional[str] = None


-@dataclass
-class StreamReadPages:
-    records: List[object]
-    request: Optional[HttpRequest] = None
-    response: Optional[HttpResponse] = None
-
-
-@dataclass
-class StreamReadSlices:
-    pages: List[StreamReadPages]
-    slice_descriptor: Optional[Dict[str, Any]]
-    state: Optional[List[Dict[str, Any]]] = None
-
-
 @dataclass
 class LogMessage:
     message: str
@@ -46,11 +32,27 @@ class LogMessage:
 @dataclass
 class AuxiliaryRequest:
     title: str
+    type: str
     description: str
     request: HttpRequest
     response: HttpResponse


+@dataclass
+class StreamReadPages:
+    records: List[object]
+    request: Optional[HttpRequest] = None
+    response: Optional[HttpResponse] = None
+
+
+@dataclass
+class StreamReadSlices:
+    pages: List[StreamReadPages]
+    slice_descriptor: Optional[Dict[str, Any]]
+    state: Optional[List[Dict[str, Any]]] = None
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
+
+
 @dataclass
 class StreamRead(object):
     logs: List[LogMessage]
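Taken together, these two hunks reorder the module: StreamReadPages and StreamReadSlices now sit below AuxiliaryRequest, which is what lets the new auxiliary_requests field reference that class directly. A minimal construction sketch (field values are made up; the HttpRequest/HttpResponse field layout is assumed from this file):

from airbyte_cdk.connector_builder.models import (
    AuxiliaryRequest,
    HttpRequest,
    HttpResponse,
    StreamReadPages,
    StreamReadSlices,
)

# A slice that carries the async-job traffic observed while it was read.
slice_ = StreamReadSlices(
    pages=[StreamReadPages(records=[{"id": 1}, {"id": 2}])],
    slice_descriptor={"start_time": "2024-01-01"},
    state=[],
    auxiliary_requests=[
        AuxiliaryRequest(
            title="Async job -- create",
            type="ASYNC_CREATE",  # see ASYNC_AUXILIARY_REQUEST_TYPES in types.py below
            description="Creates the export job",
            request=HttpRequest(url="https://api.example.com/jobs", headers=None, http_method="POST"),
            response=HttpResponse(status=202),
        )
    ],
)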
airbyte_cdk/connector_builder/test_reader/helpers.py
CHANGED
@@ -28,7 +28,7 @@ from airbyte_cdk.utils.schema_inferrer import (
     SchemaInferrer,
 )

-from .types import LOG_MESSAGES_OUTPUT_TYPE
+from .types import ASYNC_AUXILIARY_REQUEST_TYPES, LOG_MESSAGES_OUTPUT_TYPE

 # -------
 # Parsers
@@ -226,7 +226,8 @@ def should_close_page(
         at_least_one_page_in_group
         and is_log_message(message)
         and (
-            is_page_http_request(json_message)
+            is_page_http_request(json_message)
+            or message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
         )
     )

@@ -330,6 +331,10 @@ def is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool:
     return is_http_log(message) and message.get("http", {}).get("is_auxiliary", False)


+def is_async_auxiliary_request(message: AuxiliaryRequest) -> bool:
+    return message.type in ASYNC_AUXILIARY_REQUEST_TYPES
+
+
 def is_log_message(message: AirbyteMessage) -> bool:
     """
     Determines whether the provided message is of type LOG.
@@ -413,6 +418,7 @@ def handle_current_slice(
     current_slice_pages: List[StreamReadPages],
     current_slice_descriptor: Optional[Dict[str, Any]] = None,
     latest_state_message: Optional[Dict[str, Any]] = None,
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None,
 ) -> StreamReadSlices:
     """
     Handles the current slice by packaging its pages, descriptor, and state into a StreamReadSlices instance.
@@ -421,6 +427,7 @@
         current_slice_pages (List[StreamReadPages]): The pages to be included in the slice.
         current_slice_descriptor (Optional[Dict[str, Any]]): Descriptor for the current slice, optional.
         latest_state_message (Optional[Dict[str, Any]]): The latest state message, optional.
+        auxiliary_requests (Optional[List[AuxiliaryRequest]]): The auxiliary requests to include, optional.

     Returns:
         StreamReadSlices: An object containing the current slice's pages, descriptor, and state.
@@ -429,6 +436,7 @@
         pages=current_slice_pages,
         slice_descriptor=current_slice_descriptor,
         state=[latest_state_message] if latest_state_message else [],
+        auxiliary_requests=auxiliary_requests if auxiliary_requests else [],
     )


@@ -486,29 +494,24 @@ def handle_auxiliary_request(json_message: Dict[str, JsonType]) -> AuxiliaryRequest:
     Raises:
         ValueError: If any of the "airbyte_cdk", "stream", or "http" fields is not a dictionary.
     """
-    airbyte_cdk = json_message.get("airbyte_cdk", {})
-
-    if not isinstance(airbyte_cdk, dict):
-        raise ValueError(
-            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
-        )
-
-    stream = airbyte_cdk.get("stream", {})

-    if not isinstance(stream, dict):
-        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+    airbyte_cdk = get_airbyte_cdk_from_message(json_message)
+    stream = get_stream_from_airbyte_cdk(airbyte_cdk)
+    title_prefix = get_auxiliary_request_title_prefix(stream)
+    http = get_http_property_from_message(json_message)
+    request_type = get_auxiliary_request_type(stream, http)

-    title_prefix = "Parent stream: " if stream.get("is_substream", False) else ""
-
-    http = json_message.get("http", {})
-    if not isinstance(http, dict):
-        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+    title = title_prefix + str(http.get("title", None))
+    description = str(http.get("description", None))
+    request = create_request_from_log_message(json_message)
+    response = create_response_from_log_message(json_message)

     return AuxiliaryRequest(
-        title=title_prefix + str(http.get("title", None)),
-        description=str(http.get("description", None)),
-        request=create_request_from_log_message(json_message),
-        response=create_response_from_log_message(json_message),
+        title=title,
+        type=request_type,
+        description=description,
+        request=request,
+        response=response,
     )


@@ -558,7 +561,8 @@ def handle_log_message(
         at_least_one_page_in_group,
         current_page_request,
         current_page_response,
-        auxiliary_request
+        auxiliary_request,
+        log_message,
     )


@@ -589,3 +593,97 @@ def handle_record_message(
         datetime_format_inferrer.accumulate(message.record)  # type: ignore

     return records_count
+
+
+# -------
+# Reusable Getters
+# -------
+
+
+def get_airbyte_cdk_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "airbyte_cdk" dictionary from the provided JSON message.
+
+    This function validates that the extracted "airbyte_cdk" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "airbyte_cdk" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "airbyte_cdk" field is not a dictionary.
+    """
+    airbyte_cdk = json_message.get("airbyte_cdk", {})
+
+    if not isinstance(airbyte_cdk, dict):
+        raise ValueError(
+            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
+        )
+
+    return airbyte_cdk
+
+
+def get_stream_from_airbyte_cdk(airbyte_cdk: dict) -> dict:  # type: ignore
+    """
+    Retrieves the "stream" dictionary from the provided "airbyte_cdk" dictionary.
+
+    This function ensures that the extracted "stream" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        airbyte_cdk (dict): The dictionary representing the Airbyte CDK data.
+
+    Returns:
+        dict: The "stream" dictionary extracted from the Airbyte CDK data.
+
+    Raises:
+        ValueError: If the "stream" field is not a dictionary.
+    """
+
+    stream = airbyte_cdk.get("stream", {})
+
+    if not isinstance(stream, dict):
+        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+
+    return stream
+
+
+def get_auxiliary_request_title_prefix(stream: dict) -> str:  # type: ignore
+    """
+    Generates a title prefix based on the stream type.
+    """
+    return "Parent stream: " if stream.get("is_substream", False) else ""
+
+
+def get_http_property_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "http" dictionary from the provided JSON message.
+
+    This function validates that the extracted "http" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "http" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "http" field is not a dictionary.
+    """
+    http = json_message.get("http", {})
+
+    if not isinstance(http, dict):
+        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+
+    return http
+
+
+def get_auxiliary_request_type(stream: dict, http: dict) -> str:  # type: ignore
+    """
+    Determines the type of the auxiliary request based on the stream and HTTP properties.
+    """
+    return "PARENT_STREAM" if stream.get("is_substream", False) else str(http.get("type", None))
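How the refactored handle_auxiliary_request composes these getters, as a small walkthrough. The message shape below is an assumption based on the fields the helpers read, not a verbatim CDK log:

from airbyte_cdk.connector_builder.test_reader.helpers import (
    get_airbyte_cdk_from_message,
    get_auxiliary_request_title_prefix,
    get_auxiliary_request_type,
    get_http_property_from_message,
    get_stream_from_airbyte_cdk,
)

json_message = {
    "airbyte_cdk": {"stream": {"is_substream": True}},
    "http": {
        "title": "Fetch parents",
        "description": "Requesting parent records",
        "is_auxiliary": True,
    },
}

airbyte_cdk = get_airbyte_cdk_from_message(json_message)  # {"stream": {...}}, or ValueError if not a dict
stream = get_stream_from_airbyte_cdk(airbyte_cdk)         # {"is_substream": True}
assert get_auxiliary_request_title_prefix(stream) == "Parent stream: "
http = get_http_property_from_message(json_message)
assert get_auxiliary_request_type(stream, http) == "PARENT_STREAM"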
airbyte_cdk/connector_builder/test_reader/message_grouper.py
CHANGED
@@ -6,6 +6,7 @@
 from typing import Any, Dict, Iterator, List, Mapping, Optional

 from airbyte_cdk.connector_builder.models import (
+    AuxiliaryRequest,
     HttpRequest,
     HttpResponse,
     StreamReadPages,
@@ -24,6 +25,7 @@ from .helpers import (
     handle_current_slice,
     handle_log_message,
     handle_record_message,
+    is_async_auxiliary_request,
     is_config_update_message,
     is_log_message,
     is_record_message,
@@ -89,6 +91,7 @@ def get_message_groups(
     current_page_request: Optional[HttpRequest] = None
     current_page_response: Optional[HttpResponse] = None
     latest_state_message: Optional[Dict[str, Any]] = None
+    slice_auxiliary_requests: List[AuxiliaryRequest] = []

     while records_count < limit and (message := next(messages, None)):
         json_message = airbyte_message_to_json(message)
@@ -106,6 +109,7 @@
                     current_slice_pages,
                     current_slice_descriptor,
                     latest_state_message,
+                    slice_auxiliary_requests,
                 )
             current_slice_descriptor = parse_slice_description(message.log.message)  # type: ignore
             current_slice_pages = []
@@ -118,7 +122,8 @@
                 at_least_one_page_in_group,
                 current_page_request,
                 current_page_response,
-                log_message,
+                auxiliary_request,
+                log_message,
             ) = handle_log_message(
                 message,
                 json_message,
@@ -126,8 +131,15 @@
                 current_page_request,
                 current_page_response,
             )
-            if log_message:
-                yield log_message
+
+            if auxiliary_request:
+                if is_async_auxiliary_request(auxiliary_request):
+                    slice_auxiliary_requests.append(auxiliary_request)
+                else:
+                    yield auxiliary_request
+
+            if log_message:
+                yield log_message
         elif is_trace_with_error(message):
             if message.trace is not None:
                 yield message.trace
@@ -157,4 +169,5 @@
         current_slice_pages,
         current_slice_descriptor,
         latest_state_message,
+        slice_auxiliary_requests,
     )
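The net behavior of the grouping changes, as a self-contained sketch (toy values; handle_current_slice is the helper amended above): async auxiliary requests accumulate on the slice and are flushed with it, instead of surfacing as standalone message groups.

from airbyte_cdk.connector_builder.test_reader.helpers import handle_current_slice

slice_auxiliary_requests = []  # filled with ASYNC_* AuxiliaryRequests while the slice is read
closed_slice = handle_current_slice(
    current_slice_pages=[],
    current_slice_descriptor={"partition": "A"},
    latest_state_message=None,
    auxiliary_requests=slice_auxiliary_requests,
)
# Buffered async requests travel with the slice instead of being yielded inline.
assert closed_slice.auxiliary_requests == []  # nothing was buffered in this toy run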
airbyte_cdk/connector_builder/test_reader/types.py
CHANGED
@@ -71,5 +71,13 @@ LOG_MESSAGES_OUTPUT_TYPE = tuple[
    bool,
    HttpRequest | None,
    HttpResponse | None,
-    AuxiliaryRequest | AirbyteLogMessage | None,
+    AuxiliaryRequest | None,
+    AirbyteLogMessage | None,
+]
+
+ASYNC_AUXILIARY_REQUEST_TYPES = [
+    "ASYNC_CREATE",
+    "ASYNC_POLL",
+    "ASYNC_ABORT",
+    "ASYNC_DELETE",
 ]
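How these constants are consumed (sketch; the "HTTP" string is a made-up non-async example):

from airbyte_cdk.connector_builder.test_reader.types import ASYNC_AUXILIARY_REQUEST_TYPES

# The grouper buffers these four per slice; any other type string still surfaces
# as its own auxiliary message group.
assert "ASYNC_POLL" in ASYNC_AUXILIARY_REQUEST_TYPES
assert "HTTP" not in ASYNC_AUXILIARY_REQUEST_TYPES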
airbyte_cdk/entrypoint.py
CHANGED
@@ -37,8 +37,8 @@ from airbyte_cdk.sources import Source
 from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor
 from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit, split_config

-# from airbyte_cdk.utils import PrintBuffer, is_cloud_environment, message_utils  # add PrintBuffer back once fixed
-from airbyte_cdk.utils import is_cloud_environment, message_utils
+from airbyte_cdk.utils import PrintBuffer, is_cloud_environment, message_utils  # add PrintBuffer back once fixed
+# from airbyte_cdk.utils import is_cloud_environment, message_utils
 from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets, update_secrets
 from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -337,11 +337,11 @@ def launch(source: Source, args: List[str]) -> None:
     parsed_args = source_entrypoint.parse_args(args)
     # temporarily removes the PrintBuffer because we're seeing weird print behavior for concurrent syncs
     # Refer to: https://github.com/airbytehq/oncall/issues/6235
-    # with PrintBuffer():
-    #     for message in source_entrypoint.run(parsed_args):
-    #         print(f"{message}\n", end="", flush=True)
-    for message in source_entrypoint.run(parsed_args):
-        print(f"{message}\n", end="", flush=True)
+    with PrintBuffer():
+        for message in source_entrypoint.run(parsed_args):
+            # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
+            # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
+            print(f"{message}\n", end="", flush=True)


 def _init_internal_request_filter() -> None:
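For context on the restored block: the PrintBuffer import and the with-statement come back together, and the newline-in-message trick addresses output interleaving. A sketch of the failure mode it guards against (PrintBuffer's own implementation is not shown in this diff; assume it buffers and serializes writes):

message = '{"type": "RECORD", "record": {"stream": "users"}}'

# Plain print() performs two separate writes -- the message, then the newline --
# so two concurrent writers can interleave their halves on stdout.
print(message)

# What launch() emits instead: the newline is folded into one formatted string,
# giving a single write call per message, flushed immediately.
print(f"{message}\n", end="", flush=True)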
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
CHANGED
@@ -24,7 +24,6 @@ from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
     PerPartitionWithGlobalCursor,
 )
-from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ConcurrencyLevel as ConcurrencyLevelModel,
@@ -36,17 +35,16 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
     ModelToComponentFactory,
 )
 from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter
-from airbyte_cdk.sources.declarative.requesters import HttpRequester
 from airbyte_cdk.sources.declarative.retrievers import AsyncRetriever, Retriever, SimpleRetriever
 from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
     DeclarativePartitionFactory,
     StreamSlicerPartitionGenerator,
 )
-from airbyte_cdk.sources.declarative.transformations.add_fields import AddFields
 from airbyte_cdk.sources.declarative.types import ConnectionDefinition
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
     AlwaysAvailableAvailabilityStrategy,
 )
@@ -121,6 +119,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             message_repository=self.message_repository,
         )

+    # TODO: Remove this. This property is necessary to safely migrate Stripe during the transition state.
+    @property
+    def is_partially_declarative(self) -> bool:
+        """This flag used to avoid unexpected AbstractStreamFacade processing as concurrent streams."""
+        return False
+
     def read(
         self,
         logger: logging.Logger,
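A sketch of how the new hook and the elif added below are meant to compose for a partially migrated source (the subclass and override here are illustrative, not CDK code): streams that are still AbstractStreamFacade instances get routed back into the concurrent list.

from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
    ConcurrentDeclarativeSource,
)

class MyPartiallyDeclarativeSource(ConcurrentDeclarativeSource):  # hypothetical subclass
    @property
    def is_partially_declarative(self) -> bool:
        # Opt in: remaining AbstractStreamFacade streams are unwrapped via
        # get_underlying_stream() and processed concurrently (see the elif hunk below).
        return True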
@@ -321,9 +325,6 @@
                 incremental_sync_component_definition
                 and incremental_sync_component_definition.get("type", "")
                 == DatetimeBasedCursorModel.__name__
-                and self._stream_supports_concurrent_partition_processing(
-                    declarative_stream=declarative_stream
-                )
                 and hasattr(declarative_stream.retriever, "stream_slicer")
                 and isinstance(
                     declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
@@ -375,6 +376,14 @@
                     )
                 else:
                     synchronous_streams.append(declarative_stream)
+            # TODO: Remove this. This check is necessary to safely migrate Stripe during the transition state.
+            # Condition below needs to ensure that concurrent support is not lost for sources that already support
+            # it before migration, but now are only partially migrated to declarative implementation (e.g., Stripe).
+            elif (
+                isinstance(declarative_stream, AbstractStreamFacade)
+                and self.is_partially_declarative
+            ):
+                concurrent_streams.append(declarative_stream.get_underlying_stream())
             else:
                 synchronous_streams.append(declarative_stream)

@@ -390,9 +399,6 @@
             and bool(incremental_sync_component_definition)
             and incremental_sync_component_definition.get("type", "")
             == DatetimeBasedCursorModel.__name__
-            and self._stream_supports_concurrent_partition_processing(
-                declarative_stream=declarative_stream
-            )
             and hasattr(declarative_stream.retriever, "stream_slicer")
             and (
                 isinstance(declarative_stream.retriever.stream_slicer, DatetimeBasedCursor)
@@ -400,72 +406,6 @@
             )
         )

-    def _stream_supports_concurrent_partition_processing(
-        self, declarative_stream: DeclarativeStream
-    ) -> bool:
-        """
-        Many connectors make use of stream_state during interpolation on a per-partition basis under the assumption that
-        state is updated sequentially. Because the concurrent CDK engine processes different partitions in parallel,
-        stream_state is no longer a thread-safe interpolation context. It would be a race condition because a cursor's
-        stream_state can be updated in any order depending on which stream partition's finish first.
-
-        We should start to move away from depending on the value of stream_state for low-code components that operate
-        per-partition, but we need to gate this otherwise some connectors will be blocked from publishing. See the
-        cdk-migrations.md for the full list of connectors.
-        """
-
-        if isinstance(declarative_stream.retriever, SimpleRetriever) and isinstance(
-            declarative_stream.retriever.requester, HttpRequester
-        ):
-            http_requester = declarative_stream.retriever.requester
-            if "stream_state" in http_requester._path.string:
-                self.logger.warning(
-                    f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the HttpRequester which is not thread-safe. Defaulting to synchronous processing"
-                )
-                return False
-
-            request_options_provider = http_requester._request_options_provider
-            if request_options_provider.request_options_contain_stream_state():
-                self.logger.warning(
-                    f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the HttpRequester which is not thread-safe. Defaulting to synchronous processing"
-                )
-                return False
-
-        record_selector = declarative_stream.retriever.record_selector
-        if isinstance(record_selector, RecordSelector):
-            if (
-                record_selector.record_filter
-                and not isinstance(
-                    record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator
-                )
-                and "stream_state" in record_selector.record_filter.condition
-            ):
-                self.logger.warning(
-                    f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the RecordFilter which is not thread-safe. Defaulting to synchronous processing"
-                )
-                return False
-
-            for add_fields in [
-                transformation
-                for transformation in record_selector.transformations
-                if isinstance(transformation, AddFields)
-            ]:
-                for field in add_fields.fields:
-                    if isinstance(field.value, str) and "stream_state" in field.value:
-                        self.logger.warning(
-                            f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the AddFields which is not thread-safe. Defaulting to synchronous processing"
-                        )
-                        return False
-                    if (
-                        isinstance(field.value, InterpolatedString)
-                        and "stream_state" in field.value.string
-                    ):
-                        self.logger.warning(
-                            f"Low-code stream '{declarative_stream.name}' uses interpolation of stream_state in the AddFields which is not thread-safe. Defaulting to synchronous processing"
-                        )
-                        return False
-        return True
-
     @staticmethod
     def _get_retriever(
         declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any]
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
CHANGED
@@ -82,7 +82,6 @@ definitions:
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - "{{ record['updates'] }}"
        - "{{ record['MetaData']['LastUpdatedTime'] }}"
@@ -1491,7 +1490,11 @@
      limit:
        title: Limit
        description: The maximum number of calls allowed within the interval.
-        type: integer
+        anyOf:
+          - type: integer
+          - type: string
+        interpolation_context:
+          - config
      interval:
        title: Interval
        description: The time interval for the rate limit.
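With limit widened to accept an integer or a string interpolated against config, a manifest can express the rate limit as a Jinja expression. A rough sketch of what evaluating such a value looks like with the CDK's InterpolatedString (whether the component factory uses exactly this path is an assumption; the CDK's Jinja layer normally literal-evals rendered results back into Python types):

from airbyte_cdk.sources.declarative.interpolation import InterpolatedString

limit = InterpolatedString.create("{{ config['api_budget']['call_limit'] }}", parameters={})
# With literal-eval of the rendered result, this comes back as the integer 100.
assert limit.eval(config={"api_budget": {"call_limit": 100}}) == 100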
@@ -1776,7 +1779,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - "/products"
        - "/quotes/{{ stream_partition['id'] }}/quote_line_groups"
@@ -1826,7 +1828,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - |
          [{"clause": {"type": "timestamp", "operator": 10, "parameters":
@@ -1844,7 +1845,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - sort_order: "ASC"
          sort_field: "CREATED_AT"
@@ -1865,7 +1865,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - Output-Format: JSON
        - Version: "{{ config['version'] }}"
@@ -1882,7 +1881,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - unit: "day"
        - query: 'last_event_time BETWEEN TIMESTAMP "{{ stream_interval.start_time }}" AND TIMESTAMP "{{ stream_interval.end_time }}"'
@@ -2237,7 +2235,6 @@
      interpolation_context:
        - config
        - record
-        - stream_state
        - stream_slice
      new:
        type: string
@@ -2251,7 +2248,6 @@
      interpolation_context:
        - config
        - record
-        - stream_state
        - stream_slice
      $parameters:
        type: object
@@ -2901,7 +2897,6 @@
        - stream_interval
        - stream_partition
        - stream_slice
-        - stream_state
      examples:
        - "{{ record['created_at'] >= stream_interval['start_time'] }}"
        - "{{ record.status in ['active', 'expired'] }}"
@@ -3689,12 +3684,6 @@ interpolation:
    - title: stream_slice
      description: This variable is deprecated. Use stream_interval or stream_partition instead.
      type: object
-    - title: stream_state
-      description: The current state of the stream. The object's keys are defined by the incremental sync's cursor_field the and partition router's values.
-      type: object
-      examples:
-        - created_at: "2020-01-01 00:00:00.000+00:00"
-        - updated_at: "2020-01-02 00:00:00.000+00:00"
  macros:
    - title: now_utc
      description: Returns the current date and time in the UTC timezone.
@@ -3759,6 +3748,16 @@
        - "{{ format_datetime(config['start_time'], '%Y-%m-%d') }}"
        - "{{ format_datetime(config['start_date'], '%Y-%m-%dT%H:%M:%S.%fZ') }}"
        - "{{ format_datetime(config['start_date'], '%Y-%m-%dT%H:%M:%S.%fZ', '%a, %d %b %Y %H:%M:%S %z') }}"
+    - title: str_to_datetime
+      description: Converts a string to a datetime object with UTC timezone.
+      arguments:
+        s: The string to convert.
+      return_type: datetime.datetime
+      examples:
+        - "{{ str_to_datetime('2022-01-14') }}"
+        - "{{ str_to_datetime('2022-01-01 13:45:30') }}"
+        - "{{ str_to_datetime('2022-01-01T13:45:30+00:00') }}"
+        - "{{ str_to_datetime('2022-01-01T13:45:30.123456Z') }}"
  filters:
    - title: hash
      description: Convert the specified value to a hashed string.
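A Python approximation of what the new macro does, inferred from the description and examples above (this mirrors, but is not guaranteed to be, the macros.py implementation):

from datetime import datetime, timezone

from dateutil import parser

def str_to_datetime(s: str) -> datetime:
    # Parse ISO-style inputs; assume UTC when no timezone is given, then pin to UTC.
    parsed = parser.isoparse(s)
    if not parsed.tzinfo:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)

assert str_to_datetime("2022-01-14") == datetime(2022, 1, 14, tzinfo=timezone.utc)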
airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py
CHANGED
@@ -107,6 +107,16 @@ class CsvParser(Parser):
     encoding: Optional[str] = "utf-8"
     delimiter: Optional[str] = ","

+    def _get_delimiter(self) -> Optional[str]:
+        """
+        Get delimiter from the configuration. Check for the escape character and decode it.
+        """
+        if self.delimiter is not None:
+            if self.delimiter.startswith("\\"):
+                self.delimiter = self.delimiter.encode("utf-8").decode("unicode_escape")
+
+        return self.delimiter
+
     def parse(
         self,
         data: BufferedIOBase,
@@ -115,8 +125,9 @@ class CsvParser(Parser):
         Parse CSV data from decompressed bytes.
         """
         text_data = TextIOWrapper(data, encoding=self.encoding)  # type: ignore
-        reader = csv.DictReader(text_data, delimiter=self.delimiter or ",")
-        yield from reader
+        reader = csv.DictReader(text_data, delimiter=self._get_delimiter() or ",")
+        for row in reader:
+            yield row


 @dataclass
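Why the unicode_escape round-trip in _get_delimiter matters, in isolation: a manifest delimiter written as a backslash escape can reach the parser as the two literal characters backslash and t, and decoding restores the real control character before csv.DictReader sees it.

raw = "\\t"  # the two characters backslash + t, as an escaped delimiter can arrive from config
decoded = raw.encode("utf-8").decode("unicode_escape")
assert decoded == "\t"  # a real tab, suitable for csv.DictReader(..., delimiter=decoded)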
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
CHANGED
@@ -251,7 +251,6 @@ class ConcurrentPerPartitionCursor(Cursor):
         self._message_repository.emit_message(state_message)

     def stream_slices(self) -> Iterable[StreamSlice]:
-        print("stream_slices")
         if self._timer.is_running():
             raise RuntimeError("stream_slices has been executed more than once.")
