airbyte-cdk 6.37.0.dev1__py3-none-any.whl → 6.37.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/models.py +16 -14
- airbyte_cdk/connector_builder/test_reader/helpers.py +120 -22
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +16 -3
- airbyte_cdk/connector_builder/test_reader/types.py +9 -1
- airbyte_cdk/sources/declarative/auth/token_provider.py +1 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +15 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +5 -43
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +16 -4
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +1 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +83 -17
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -42
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +52 -63
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +42 -4
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +10 -3
- airbyte_cdk/sources/http_logger.py +3 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +1 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/RECORD +23 -24
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -136
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/models.py

@@ -21,20 +21,6 @@ class HttpRequest:
     body: Optional[str] = None
 
 
-@dataclass
-class StreamReadPages:
-    records: List[object]
-    request: Optional[HttpRequest] = None
-    response: Optional[HttpResponse] = None
-
-
-@dataclass
-class StreamReadSlices:
-    pages: List[StreamReadPages]
-    slice_descriptor: Optional[Dict[str, Any]]
-    state: Optional[List[Dict[str, Any]]] = None
-
-
 @dataclass
 class LogMessage:
     message: str

@@ -46,11 +32,27 @@ class LogMessage:
 @dataclass
 class AuxiliaryRequest:
     title: str
+    type: str
     description: str
     request: HttpRequest
     response: HttpResponse
 
 
+@dataclass
+class StreamReadPages:
+    records: List[object]
+    request: Optional[HttpRequest] = None
+    response: Optional[HttpResponse] = None
+
+
+@dataclass
+class StreamReadSlices:
+    pages: List[StreamReadPages]
+    slice_descriptor: Optional[Dict[str, Any]]
+    state: Optional[List[Dict[str, Any]]] = None
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
+
+
 @dataclass
 class StreamRead(object):
     logs: List[LogMessage]
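The two hunks above are one move: StreamReadPages and StreamReadSlices now sit below AuxiliaryRequest so the new auxiliary_requests field can reference it directly. A standalone sketch of the resulting shape (CDK-internal types replaced with Any; sample values are made up):

    from dataclasses import dataclass
    from typing import Any, Dict, List, Optional

    @dataclass
    class AuxiliaryRequest:
        title: str
        type: str
        description: str
        request: Any   # HttpRequest in the CDK
        response: Any  # HttpResponse in the CDK

    @dataclass
    class StreamReadSlices:
        pages: List[Any]  # List[StreamReadPages] in the CDK
        slice_descriptor: Optional[Dict[str, Any]]
        state: Optional[List[Dict[str, Any]]] = None
        auxiliary_requests: Optional[List[AuxiliaryRequest]] = None

    aux = AuxiliaryRequest(
        title="Create export job",  # hypothetical values
        type="ASYNC_CREATE",
        description="POST /exports",
        request=None,
        response=None,
    )
    slice_ = StreamReadSlices(pages=[], slice_descriptor=None, auxiliary_requests=[aux])
    print(slice_.auxiliary_requests[0].type)  # ASYNC_CREATE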
airbyte_cdk/connector_builder/test_reader/helpers.py

@@ -28,7 +28,7 @@ from airbyte_cdk.utils.schema_inferrer import (
     SchemaInferrer,
 )
 
-from .types import LOG_MESSAGES_OUTPUT_TYPE
+from .types import ASYNC_AUXILIARY_REQUEST_TYPES, LOG_MESSAGES_OUTPUT_TYPE
 
 # -------
 # Parsers

@@ -226,7 +226,8 @@ def should_close_page(
         at_least_one_page_in_group
         and is_log_message(message)
         and (
-            is_page_http_request(json_message)
+            is_page_http_request(json_message)
+            or message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
        )
    )
 
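The extra clause makes a slice boundary close the current page group even when no page request follows it. A self-contained sketch of the predicate, assuming SliceLogger.SLICE_LOG_PREFIX is the CDK's "slice:" prefix:

    SLICE_LOG_PREFIX = "slice:"  # assumed value of SliceLogger.SLICE_LOG_PREFIX

    def should_close_page(at_least_one_page_in_group: bool, is_log: bool,
                          is_page_http_request: bool, log_text: str) -> bool:
        return (
            at_least_one_page_in_group
            and is_log
            and (is_page_http_request or log_text.startswith(SLICE_LOG_PREFIX))
        )

    print(should_close_page(True, True, False, 'slice:{"start": "2024-01-01"}'))  # True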
@@ -330,6 +331,10 @@ def is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool:
     return is_http_log(message) and message.get("http", {}).get("is_auxiliary", False)
 
 
+def is_async_auxiliary_request(message: AuxiliaryRequest) -> bool:
+    return message.type in ASYNC_AUXILIARY_REQUEST_TYPES
+
+
 def is_log_message(message: AirbyteMessage) -> bool:
     """
     Determines whether the provided message is of type LOG.

@@ -413,6 +418,7 @@ def handle_current_slice(
     current_slice_pages: List[StreamReadPages],
     current_slice_descriptor: Optional[Dict[str, Any]] = None,
     latest_state_message: Optional[Dict[str, Any]] = None,
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None,
 ) -> StreamReadSlices:
     """
     Handles the current slice by packaging its pages, descriptor, and state into a StreamReadSlices instance.

@@ -421,6 +427,7 @@ def handle_current_slice(
         current_slice_pages (List[StreamReadPages]): The pages to be included in the slice.
         current_slice_descriptor (Optional[Dict[str, Any]]): Descriptor for the current slice, optional.
         latest_state_message (Optional[Dict[str, Any]]): The latest state message, optional.
+        auxiliary_requests (Optional[List[AuxiliaryRequest]]): The auxiliary requests to include, optional.
 
     Returns:
         StreamReadSlices: An object containing the current slice's pages, descriptor, and state.

@@ -429,6 +436,7 @@ def handle_current_slice(
         pages=current_slice_pages,
         slice_descriptor=current_slice_descriptor,
         state=[latest_state_message] if latest_state_message else [],
+        auxiliary_requests=auxiliary_requests if auxiliary_requests else [],
     )
 
 

@@ -486,29 +494,24 @@ def handle_auxiliary_request(json_message: Dict[str, JsonType]) -> AuxiliaryRequest:
     Raises:
         ValueError: If any of the "airbyte_cdk", "stream", or "http" fields is not a dictionary.
     """
-    airbyte_cdk = json_message.get("airbyte_cdk", {})
-
-    if not isinstance(airbyte_cdk, dict):
-        raise ValueError(
-            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
-        )
-
-    stream = airbyte_cdk.get("stream", {})
 
-    if not isinstance(stream, dict):
-        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+    airbyte_cdk = get_airbyte_cdk_from_message(json_message)
+    stream = get_stream_from_airbyte_cdk(airbyte_cdk)
+    title_prefix = get_auxiliary_request_title_prefix(stream)
+    http = get_http_property_from_message(json_message)
+    request_type = get_auxiliary_request_type(stream, http)
 
-    title_prefix = "Parent stream: " if stream.get("is_substream", False) else ""
-
-    http = json_message.get("http", {})
-
-    if not isinstance(http, dict):
-        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+    title = title_prefix + str(http.get("title", None))
+    description = str(http.get("description", None))
+    request = create_request_from_log_message(json_message)
+    response = create_response_from_log_message(json_message)
 
     return AuxiliaryRequest(
-        title=title_prefix + str(http.get("title", None)),
-        description=str(http.get("description", None)),
-        request=create_request_from_log_message(json_message),
-        response=create_response_from_log_message(json_message),
+        title=title,
+        type=request_type,
+        description=description,
+        request=request,
+        response=response,
     )
 
 

@@ -558,7 +561,8 @@ def handle_log_message(
         at_least_one_page_in_group,
         current_page_request,
         current_page_response,
-        auxiliary_request
+        auxiliary_request,
+        log_message,
     )
 
 

@@ -589,3 +593,97 @@ def handle_record_message(
         datetime_format_inferrer.accumulate(message.record)  # type: ignore
 
     return records_count
+
+
+# -------
+# Reusable Getters
+# -------
+
+
+def get_airbyte_cdk_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "airbyte_cdk" dictionary from the provided JSON message.
+
+    This function validates that the extracted "airbyte_cdk" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "airbyte_cdk" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "airbyte_cdk" field is not a dictionary.
+    """
+    airbyte_cdk = json_message.get("airbyte_cdk", {})
+
+    if not isinstance(airbyte_cdk, dict):
+        raise ValueError(
+            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
+        )
+
+    return airbyte_cdk
+
+
+def get_stream_from_airbyte_cdk(airbyte_cdk: dict) -> dict:  # type: ignore
+    """
+    Retrieves the "stream" dictionary from the provided "airbyte_cdk" dictionary.
+
+    This function ensures that the extracted "stream" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        airbyte_cdk (dict): The dictionary representing the Airbyte CDK data.
+
+    Returns:
+        dict: The "stream" dictionary extracted from the Airbyte CDK data.
+
+    Raises:
+        ValueError: If the "stream" field is not a dictionary.
+    """
+
+    stream = airbyte_cdk.get("stream", {})
+
+    if not isinstance(stream, dict):
+        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+
+    return stream
+
+
+def get_auxiliary_request_title_prefix(stream: dict) -> str:  # type: ignore
+    """
+    Generates a title prefix based on the stream type.
+    """
+    return "Parent stream: " if stream.get("is_substream", False) else ""
+
+
+def get_http_property_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "http" dictionary from the provided JSON message.
+
+    This function validates that the extracted "http" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "http" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "http" field is not a dictionary.
+    """
+    http = json_message.get("http", {})
+
+    if not isinstance(http, dict):
+        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+
+    return http
+
+
+def get_auxiliary_request_type(stream: dict, http: dict) -> str:  # type: ignore
+    """
+    Determines the type of the auxiliary request based on the stream and HTTP properties.
+    """
+    return "PARENT_STREAM" if stream.get("is_substream", False) else str(http.get("type", None))
airbyte_cdk/connector_builder/test_reader/message_grouper.py

@@ -6,6 +6,7 @@
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from airbyte_cdk.connector_builder.models import (
+    AuxiliaryRequest,
     HttpRequest,
     HttpResponse,
     StreamReadPages,

@@ -24,6 +25,7 @@ from .helpers import (
     handle_current_slice,
     handle_log_message,
     handle_record_message,
+    is_async_auxiliary_request,
     is_config_update_message,
     is_log_message,
     is_record_message,

@@ -89,6 +91,7 @@ def get_message_groups(
     current_page_request: Optional[HttpRequest] = None
     current_page_response: Optional[HttpResponse] = None
     latest_state_message: Optional[Dict[str, Any]] = None
+    slice_auxiliary_requests: List[AuxiliaryRequest] = []
 
     while records_count < limit and (message := next(messages, None)):
         json_message = airbyte_message_to_json(message)

@@ -106,6 +109,7 @@ def get_message_groups(
                     current_slice_pages,
                     current_slice_descriptor,
                     latest_state_message,
+                    slice_auxiliary_requests,
                 )
                 current_slice_descriptor = parse_slice_description(message.log.message)  # type: ignore
                 current_slice_pages = []

@@ -118,7 +122,8 @@ def get_message_groups(
                 at_least_one_page_in_group,
                 current_page_request,
                 current_page_response,
-
+                auxiliary_request,
+                log_message,
             ) = handle_log_message(
                 message,
                 json_message,

@@ -126,8 +131,15 @@ def get_message_groups(
                 current_page_request,
                 current_page_response,
             )
-
-
+
+            if auxiliary_request:
+                if is_async_auxiliary_request(auxiliary_request):
+                    slice_auxiliary_requests.append(auxiliary_request)
+                else:
+                    yield auxiliary_request
+
+            if log_message:
+                yield log_message
         elif is_trace_with_error(message):
             if message.trace is not None:
                 yield message.trace

@@ -157,4 +169,5 @@ def get_message_groups(
         current_slice_pages,
         current_slice_descriptor,
         latest_state_message,
+        slice_auxiliary_requests,
     )
airbyte_cdk/connector_builder/test_reader/types.py

@@ -71,5 +71,13 @@ LOG_MESSAGES_OUTPUT_TYPE = tuple[
     bool,
     HttpRequest | None,
     HttpResponse | None,
-    AuxiliaryRequest | AirbyteLogMessage | None,
+    AuxiliaryRequest | None,
+    AirbyteLogMessage | None,
+]
+
+ASYNC_AUXILIARY_REQUEST_TYPES = [
+    "ASYNC_CREATE",
+    "ASYNC_POLL",
+    "ASYNC_ABORT",
+    "ASYNC_DELETE",
 ]
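Splitting the combined tuple element and adding the type list enables the routing seen in message_grouper.py above: async auxiliary requests ride along with their slice, while everything else is emitted immediately. A runnable sketch of that rule, with plain dicts standing in for AuxiliaryRequest:

    ASYNC_AUXILIARY_REQUEST_TYPES = ["ASYNC_CREATE", "ASYNC_POLL", "ASYNC_ABORT", "ASYNC_DELETE"]

    def route(auxiliary_requests):
        """Yield non-async requests immediately; attach async ones to the slice."""
        slice_auxiliary_requests = []
        for request in auxiliary_requests:
            if request["type"] in ASYNC_AUXILIARY_REQUEST_TYPES:
                slice_auxiliary_requests.append(request)
            else:
                yield request
        yield {"auxiliary_requests": slice_auxiliary_requests}  # stands in for StreamReadSlices

    print(list(route([{"type": "AUTH"}, {"type": "ASYNC_CREATE"}, {"type": "ASYNC_POLL"}])))
    # [{'type': 'AUTH'}, {'auxiliary_requests': [{'type': 'ASYNC_CREATE'}, {'type': 'ASYNC_POLL'}]}]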
airbyte_cdk/sources/declarative/concurrent_declarative_source.py

@@ -44,6 +44,7 @@ from airbyte_cdk.sources.declarative.types import ConnectionDefinition
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
     AlwaysAvailableAvailabilityStrategy,
 )

@@ -118,6 +119,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             message_repository=self.message_repository,
         )
 
+    # TODO: Remove this. This property is necessary to safely migrate Stripe during the transition state.
+    @property
+    def is_partially_declarative(self) -> bool:
+        """This flag used to avoid unexpected AbstractStreamFacade processing as concurrent streams."""
+        return False
+
     def read(
         self,
         logger: logging.Logger,

@@ -369,6 +376,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                 )
             else:
                 synchronous_streams.append(declarative_stream)
+        # TODO: Remove this. This check is necessary to safely migrate Stripe during the transition state.
+        # Condition below needs to ensure that concurrent support is not lost for sources that already support
+        # it before migration, but now are only partially migrated to declarative implementation (e.g., Stripe).
+        elif (
+            isinstance(declarative_stream, AbstractStreamFacade)
+            and self.is_partially_declarative
+        ):
+            concurrent_streams.append(declarative_stream.get_underlying_stream())
         else:
             synchronous_streams.append(declarative_stream)
 
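A partially-migrated source opts in by overriding the property; a minimal sketch (the subclass name is hypothetical):

    from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
        ConcurrentDeclarativeSource,
    )

    class PartiallyMigratedSource(ConcurrentDeclarativeSource):  # hypothetical connector
        @property
        def is_partially_declarative(self) -> bool:
            # Remaining AbstractStreamFacade streams keep running concurrently
            # via get_underlying_stream() instead of falling into the sync set.
            return True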
airbyte_cdk/sources/declarative/declarative_component_schema.yaml

@@ -1490,7 +1490,11 @@ definitions:
       limit:
         title: Limit
         description: The maximum number of calls allowed within the interval.
-        type: integer
+        anyOf:
+          - type: integer
+          - type: string
+        interpolation_context:
+          - config
       interval:
         title: Interval
         description: The time interval for the rate limit.
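This lets a manifest take the rate limit from the connector config instead of hardcoding it. A hypothetical fragment, written here as the equivalent Python mapping; the `Rate` component name and the config key are assumptions, not taken from this diff:

    rate = {
        "type": "Rate",  # assumed component name for this limit/interval pair
        "limit": "{{ config['max_calls_per_minute'] }}",  # now interpolable from config
        "interval": "PT60S",
    }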
@@ -3130,14 +3134,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.

@@ -3292,14 +3294,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.

@@ -3416,44 +3416,6 @@ definitions:
       $parameters:
         type: object
         additionalProperties: true
-  GroupingPartitionRouter:
-    title: Grouping Partition Router
-    description: >
-      A decorator on top of a partition router that groups partitions into batches of a specified size.
-      This is useful for APIs that support filtering by multiple partition keys in a single request.
-      Note that per-partition incremental syncs may not work as expected because the grouping
-      of partitions might change between syncs, potentially leading to inconsistent state tracking.
-    type: object
-    required:
-      - type
-      - group_size
-      - underlying_partition_router
-    properties:
-      type:
-        type: string
-        enum: [GroupingPartitionRouter]
-      group_size:
-        title: Group Size
-        description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
-        type: integer
-        examples:
-          - 10
-          - 50
-      underlying_partition_router:
-        title: Underlying Partition Router
-        description: The partition router whose output will be grouped. This can be any valid partition router component.
-        anyOf:
-          - "$ref": "#/definitions/CustomPartitionRouter"
-          - "$ref": "#/definitions/ListPartitionRouter"
-          - "$ref": "#/definitions/SubstreamPartitionRouter"
-      deduplicate:
-        title: Deduplicate Partitions
-        description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
-        type: boolean
-        default: true
-      $parameters:
-        type: object
-        additionalProperties: true
   WaitUntilTimeFromHeader:
     title: Wait Until Time Defined In Response Header
     description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py

@@ -5,7 +5,7 @@ import json
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from io import BufferedIOBase, TextIOWrapper
+from io import BufferedIOBase, BytesIO, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional
 
 import orjson

@@ -107,6 +107,16 @@ class CsvParser(Parser):
     encoding: Optional[str] = "utf-8"
     delimiter: Optional[str] = ","
 
+    def _get_delimiter(self) -> Optional[str]:
+        """
+        Get delimiter from the configuration. Check for the escape character and decode it.
+        """
+        if self.delimiter is not None:
+            if self.delimiter.startswith("\\"):
+                self.delimiter = self.delimiter.encode("utf-8").decode("unicode_escape")
+
+        return self.delimiter
+
     def parse(
         self,
         data: BufferedIOBase,

@@ -114,9 +124,11 @@ class CsvParser(Parser):
         """
         Parse CSV data from decompressed bytes.
         """
-        text_data = TextIOWrapper(data, encoding=self.encoding)  # type: ignore
-        reader = csv.DictReader(text_data, delimiter=self.delimiter or ",")
-        yield from reader
+        bytes_data = BytesIO(data.read())
+        text_data = TextIOWrapper(bytes_data, encoding=self.encoding)  # type: ignore
+        reader = csv.DictReader(text_data, delimiter=self._get_delimiter() or ",")
+        for row in reader:
+            yield row
 
 
 @dataclass
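Two behaviors land here: the raw bytes are buffered through BytesIO before text decoding, and an escaped delimiter such as a literal "\t" arriving from a config is decoded to the real tab character. A self-contained sketch of both (the sample data is made up):

    import csv
    from io import BytesIO, TextIOWrapper

    delimiter = "\\t"  # a tab delimiter as it typically arrives from a JSON/YAML config
    if delimiter.startswith("\\"):
        delimiter = delimiter.encode("utf-8").decode("unicode_escape")  # now a real "\t"

    data = BytesIO(b"id\tname\n1\tapple\n")  # stands in for the decompressed response bytes
    reader = csv.DictReader(TextIOWrapper(data, encoding="utf-8"), delimiter=delimiter)
    print(list(reader))  # [{'id': '1', 'name': 'apple'}]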
airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py

@@ -136,6 +136,7 @@ class ResponseToFileExtractor(RecordExtractor):
         """
 
         try:
+            # TODO: Add support for other file types, like `json`, with `pd.read_json()`
             with open(path, "r", encoding=file_encoding) as data:
                 chunks = pd.read_csv(
                     data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

@@ -95,6 +95,10 @@ class ConcurrentPerPartitionCursor(Cursor):
         # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
         self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
         self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+
+        # Parent-state tracking: store each partition’s parent state in creation order
+        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
+
         self._finished_partitions: set[str] = set()
         self._lock = threading.Lock()
         self._timer = Timer()

@@ -155,11 +159,62 @@ class ConcurrentPerPartitionCursor(Cursor):
             and self._semaphore_per_partition[partition_key]._value == 0
         ):
             self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
-        self._emit_state_message()
+
+        self._check_and_update_parent_state()
+
+        self._emit_state_message()
+
+    def _check_and_update_parent_state(self) -> None:
+        """
+        Pop the leftmost partition state from _partition_parent_state_map only if
+        *all partitions* up to (and including) that partition key in _semaphore_per_partition
+        are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
+        Additionally, delete finished semaphores with a value of 0 to free up memory,
+        as they are only needed to track errors and completion status.
+        """
+        last_closed_state = None
+
+        while self._partition_parent_state_map:
+            # Look at the earliest partition key in creation order
+            earliest_key = next(iter(self._partition_parent_state_map))
+
+            # Verify ALL partitions from the left up to earliest_key are finished
+            all_left_finished = True
+            for p_key, sem in list(
+                self._semaphore_per_partition.items()
+            ):  # Use list to allow modification during iteration
+                # If any earlier partition is still not finished, we must stop
+                if p_key not in self._finished_partitions or sem._value != 0:
+                    all_left_finished = False
+                    break
+                # Once we've reached earliest_key in the semaphore order, we can stop checking
+                if p_key == earliest_key:
+                    break
+
+            # If the partitions up to earliest_key are not all finished, break the while-loop
+            if not all_left_finished:
+                break
+
+            # Pop the leftmost entry from parent-state map
+            _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
+            last_closed_state = closed_parent_state
+
+            # Clean up finished semaphores with value 0 up to and including earliest_key
+            for p_key in list(self._semaphore_per_partition.keys()):
+                sem = self._semaphore_per_partition[p_key]
+                if p_key in self._finished_partitions and sem._value == 0:
+                    del self._semaphore_per_partition[p_key]
+                    logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
+                if p_key == earliest_key:
+                    break
+
+        # Update _parent_state if we popped at least one partition
+        if last_closed_state is not None:
+            self._parent_state = last_closed_state
 
     def ensure_at_least_one_state_emitted(self) -> None:
         """
-        The platform
+        The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
         called.
         """
         if not any(
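A standalone model of the pop rule above (plain sets, OrderedDict, and Semaphore, not the CDK classes): p1's parent state commits because every partition up to p1 is finished, while the state recorded at p3 stays pending:

    from collections import OrderedDict
    import threading

    finished = {"p1", "p2"}  # p3 is still emitting slices
    semaphores = OrderedDict((k, threading.Semaphore(0)) for k in ("p1", "p2", "p3"))
    parent_state_map = OrderedDict([("p1", {"parent": 1}), ("p3", {"parent": 3})])

    committed = None
    while parent_state_map:
        earliest = next(iter(parent_state_map))
        blocked = False
        for key, sem in semaphores.items():
            if key not in finished or sem._value != 0:
                blocked = True  # an earlier partition is unfinished
                break
            if key == earliest:
                break  # everything up to the earliest tracked key is done
        if blocked:
            break
        _, committed = parent_state_map.popitem(last=False)

    print(committed)  # {'parent': 1}; p3's state waits until p3 finishes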
@@ -201,13 +256,19 @@ class ConcurrentPerPartitionCursor(Cursor):
 
         slices = self._partition_router.stream_slices()
         self._timer.start()
-        for partition in slices:
-            yield from self._generate_slices_from_partition(partition)
+        for partition, last, parent_state in iterate_with_last_flag_and_state(
+            slices, self._partition_router.get_stream_state
+        ):
+            yield from self._generate_slices_from_partition(partition, parent_state)
 
-    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+    def _generate_slices_from_partition(
+        self, partition: StreamSlice, parent_state: Mapping[str, Any]
+    ) -> Iterable[StreamSlice]:
         # Ensure the maximum number of partitions is not exceeded
         self._ensure_partition_limit()
 
+        partition_key = self._to_partition_key(partition.partition)
+
         cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
         if not cursor:
             cursor = self._create_cursor(

@@ -216,18 +277,26 @@ class ConcurrentPerPartitionCursor(Cursor):
             )
         with self._lock:
             self._number_of_partitions += 1
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
+            self._cursor_per_partition[partition_key] = cursor
+            self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
+
+        with self._lock:
+            if (
+                len(self._partition_parent_state_map) == 0
+                or self._partition_parent_state_map[
+                    next(reversed(self._partition_parent_state_map))
+                ]
+                != parent_state
+            ):
+                self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
 
         for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
             cursor.stream_slices(),
             lambda: None,
         ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            self._semaphore_per_partition[partition_key].release()
             if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
+                self._finished_partitions.add(partition_key)
             yield StreamSlice(
                 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
             )
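The guard that writes into the map only when the parent state changed is what keeps it small: consecutive partitions sharing the same parent state are deduplicated. A runnable sketch of just that rule:

    from collections import OrderedDict
    from copy import deepcopy

    partition_parent_state_map = OrderedDict()

    def record(partition_key, parent_state):
        # Only record when the state differs from the most recently recorded one
        if (
            len(partition_parent_state_map) == 0
            or partition_parent_state_map[next(reversed(partition_parent_state_map))]
            != parent_state
        ):
            partition_parent_state_map[partition_key] = deepcopy(parent_state)

    record("p1", {"parent_cursor": "2024-01-01"})
    record("p2", {"parent_cursor": "2024-01-01"})  # unchanged, so deduplicated
    record("p3", {"parent_cursor": "2024-02-01"})
    print(list(partition_parent_state_map))  # ['p1', 'p3']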
@@ -257,9 +326,9 @@ class ConcurrentPerPartitionCursor(Cursor):
         while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
             # Try removing finished partitions first
             for partition_key in list(self._cursor_per_partition.keys()):
-                if (
-                    partition_key in self._finished_partitions
-                    and self._semaphore_per_partition[partition_key]._value == 0
+                if partition_key in self._finished_partitions and (
+                    partition_key not in self._semaphore_per_partition
+                    or self._semaphore_per_partition[partition_key]._value == 0
                 ):
                     oldest_partition = self._cursor_per_partition.pop(
                         partition_key
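The eviction check is relaxed because _check_and_update_parent_state may already have deleted a finished partition's semaphore; a minimal sketch of the new condition:

    finished_partitions = {"p1"}
    semaphore_per_partition = {}  # semaphore already removed by the cleanup pass

    partition_key = "p1"
    evictable = partition_key in finished_partitions and (
        partition_key not in semaphore_per_partition
        or semaphore_per_partition[partition_key]._value == 0
    )
    print(evictable)  # True: eviction no longer requires a live semaphore entry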
@@ -338,9 +407,6 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                 self._create_cursor(state["cursor"])
             )
-            self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                threading.Semaphore(0)
-            )
 
         # set default state for missing partitions if it is per partition with fallback to global
         if self._GLOBAL_STATE_KEY in stream_state:
|