airbyte-cdk 6.37.0.dev1__py3-none-any.whl → 6.37.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. airbyte_cdk/connector_builder/models.py +16 -14
  2. airbyte_cdk/connector_builder/test_reader/helpers.py +120 -22
  3. airbyte_cdk/connector_builder/test_reader/message_grouper.py +16 -3
  4. airbyte_cdk/connector_builder/test_reader/types.py +9 -1
  5. airbyte_cdk/sources/declarative/auth/token_provider.py +1 -0
  6. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +15 -0
  7. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +5 -43
  8. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +16 -4
  9. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +1 -0
  10. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +83 -17
  11. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -42
  12. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +52 -63
  13. airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
  14. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +42 -4
  15. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +10 -3
  16. airbyte_cdk/sources/http_logger.py +3 -0
  17. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +1 -0
  18. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/METADATA +1 -1
  19. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/RECORD +23 -24
  20. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -136
  21. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/LICENSE_SHORT +0 -0
  23. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/WHEEL +0 -0
  24. {airbyte_cdk-6.37.0.dev1.dist-info → airbyte_cdk-6.37.2.dev1.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/models.py
@@ -21,20 +21,6 @@ class HttpRequest:
     body: Optional[str] = None
 
 
-@dataclass
-class StreamReadPages:
-    records: List[object]
-    request: Optional[HttpRequest] = None
-    response: Optional[HttpResponse] = None
-
-
-@dataclass
-class StreamReadSlices:
-    pages: List[StreamReadPages]
-    slice_descriptor: Optional[Dict[str, Any]]
-    state: Optional[List[Dict[str, Any]]] = None
-
-
 @dataclass
 class LogMessage:
     message: str
@@ -46,11 +32,27 @@ class LogMessage:
 @dataclass
 class AuxiliaryRequest:
     title: str
+    type: str
     description: str
     request: HttpRequest
     response: HttpResponse
 
 
+@dataclass
+class StreamReadPages:
+    records: List[object]
+    request: Optional[HttpRequest] = None
+    response: Optional[HttpResponse] = None
+
+
+@dataclass
+class StreamReadSlices:
+    pages: List[StreamReadPages]
+    slice_descriptor: Optional[Dict[str, Any]]
+    state: Optional[List[Dict[str, Any]]] = None
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
+
+
 @dataclass
 class StreamRead(object):
     logs: List[LogMessage]
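For orientation, a minimal sketch (values invented, not taken from this diff) of how the reshaped models compose after this change: StreamReadSlices now also carries the auxiliary requests recorded while reading the slice.

    from airbyte_cdk.connector_builder.models import StreamReadPages, StreamReadSlices

    slice_ = StreamReadSlices(
        pages=[StreamReadPages(records=[{"id": 1}])],
        slice_descriptor={"start": "2024-01-01"},  # illustrative descriptor
        state=[],
        auxiliary_requests=[],  # new field: async/auth side requests attached to this slice
    )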
airbyte_cdk/connector_builder/test_reader/helpers.py
@@ -28,7 +28,7 @@ from airbyte_cdk.utils.schema_inferrer import (
     SchemaInferrer,
 )
 
-from .types import LOG_MESSAGES_OUTPUT_TYPE
+from .types import ASYNC_AUXILIARY_REQUEST_TYPES, LOG_MESSAGES_OUTPUT_TYPE
 
 # -------
 # Parsers
@@ -226,7 +226,8 @@ def should_close_page(
         at_least_one_page_in_group
         and is_log_message(message)
         and (
-            is_page_http_request(json_message) or message.log.message.startswith("slice:")  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
+            is_page_http_request(json_message)
+            or message.log.message.startswith(SliceLogger.SLICE_LOG_PREFIX)  # type: ignore[union-attr] # AirbyteMessage with MessageType.LOG has log.message
         )
     )
 
@@ -330,6 +331,10 @@ def is_auxiliary_http_request(message: Optional[Dict[str, Any]]) -> bool:
     return is_http_log(message) and message.get("http", {}).get("is_auxiliary", False)
 
 
+def is_async_auxiliary_request(message: AuxiliaryRequest) -> bool:
+    return message.type in ASYNC_AUXILIARY_REQUEST_TYPES
+
+
 def is_log_message(message: AirbyteMessage) -> bool:
     """
     Determines whether the provided message is of type LOG.
@@ -413,6 +418,7 @@ def handle_current_slice(
     current_slice_pages: List[StreamReadPages],
     current_slice_descriptor: Optional[Dict[str, Any]] = None,
     latest_state_message: Optional[Dict[str, Any]] = None,
+    auxiliary_requests: Optional[List[AuxiliaryRequest]] = None,
 ) -> StreamReadSlices:
     """
     Handles the current slice by packaging its pages, descriptor, and state into a StreamReadSlices instance.
@@ -421,6 +427,7 @@ def handle_current_slice(
         current_slice_pages (List[StreamReadPages]): The pages to be included in the slice.
         current_slice_descriptor (Optional[Dict[str, Any]]): Descriptor for the current slice, optional.
         latest_state_message (Optional[Dict[str, Any]]): The latest state message, optional.
+        auxiliary_requests (Optional[List[AuxiliaryRequest]]): The auxiliary requests to include, optional.
 
     Returns:
         StreamReadSlices: An object containing the current slice's pages, descriptor, and state.
@@ -429,6 +436,7 @@ def handle_current_slice(
         pages=current_slice_pages,
         slice_descriptor=current_slice_descriptor,
         state=[latest_state_message] if latest_state_message else [],
+        auxiliary_requests=auxiliary_requests if auxiliary_requests else [],
    )
 
 
@@ -486,29 +494,24 @@ def handle_auxiliary_request(json_message: Dict[str, JsonType]) -> AuxiliaryRequ
     Raises:
         ValueError: If any of the "airbyte_cdk", "stream", or "http" fields is not a dictionary.
     """
-    airbyte_cdk = json_message.get("airbyte_cdk", {})
-
-    if not isinstance(airbyte_cdk, dict):
-        raise ValueError(
-            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
-        )
-
-    stream = airbyte_cdk.get("stream", {})
 
-    if not isinstance(stream, dict):
-        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+    airbyte_cdk = get_airbyte_cdk_from_message(json_message)
+    stream = get_stream_from_airbyte_cdk(airbyte_cdk)
+    title_prefix = get_auxiliary_request_title_prefix(stream)
+    http = get_http_property_from_message(json_message)
+    request_type = get_auxiliary_request_type(stream, http)
 
-    title_prefix = "Parent stream: " if stream.get("is_substream", False) else ""
-    http = json_message.get("http", {})
-
-    if not isinstance(http, dict):
-        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+    title = title_prefix + str(http.get("title", None))
+    description = str(http.get("description", None))
+    request = create_request_from_log_message(json_message)
+    response = create_response_from_log_message(json_message)
 
     return AuxiliaryRequest(
-        title=title_prefix + str(http.get("title", None)),
-        description=str(http.get("description", None)),
-        request=create_request_from_log_message(json_message),
-        response=create_response_from_log_message(json_message),
+        title=title,
+        type=request_type,
+        description=description,
+        request=request,
+        response=response,
    )
 
 
@@ -558,7 +561,8 @@ def handle_log_message(
         at_least_one_page_in_group,
         current_page_request,
         current_page_response,
-        auxiliary_request or log_message,
+        auxiliary_request,
+        log_message,
    )
 
 
@@ -589,3 +593,97 @@ def handle_record_message(
         datetime_format_inferrer.accumulate(message.record)  # type: ignore
 
     return records_count
+
+
+# -------
+# Reusable Getters
+# -------
+
+
+def get_airbyte_cdk_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "airbyte_cdk" dictionary from the provided JSON message.
+
+    This function validates that the extracted "airbyte_cdk" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "airbyte_cdk" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "airbyte_cdk" field is not a dictionary.
+    """
+    airbyte_cdk = json_message.get("airbyte_cdk", {})
+
+    if not isinstance(airbyte_cdk, dict):
+        raise ValueError(
+            f"Expected airbyte_cdk to be a dict, got {airbyte_cdk} of type {type(airbyte_cdk)}"
+        )
+
+    return airbyte_cdk
+
+
+def get_stream_from_airbyte_cdk(airbyte_cdk: dict) -> dict:  # type: ignore
+    """
+    Retrieves the "stream" dictionary from the provided "airbyte_cdk" dictionary.
+
+    This function ensures that the extracted "stream" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        airbyte_cdk (dict): The dictionary representing the Airbyte CDK data.
+
+    Returns:
+        dict: The "stream" dictionary extracted from the Airbyte CDK data.
+
+    Raises:
+        ValueError: If the "stream" field is not a dictionary.
+    """
+
+    stream = airbyte_cdk.get("stream", {})
+
+    if not isinstance(stream, dict):
+        raise ValueError(f"Expected stream to be a dict, got {stream} of type {type(stream)}")
+
+    return stream
+
+
+def get_auxiliary_request_title_prefix(stream: dict) -> str:  # type: ignore
+    """
+    Generates a title prefix based on the stream type.
+    """
+    return "Parent stream: " if stream.get("is_substream", False) else ""
+
+
+def get_http_property_from_message(json_message: Dict[str, JsonType]) -> dict:  # type: ignore
+    """
+    Retrieves the "http" dictionary from the provided JSON message.
+
+    This function validates that the extracted "http" is of type dict,
+    raising a ValueError if the validation fails.
+
+    Parameters:
+        json_message (Dict[str, JsonType]): A dictionary representing the JSON message.
+
+    Returns:
+        dict: The "http" dictionary extracted from the JSON message.
+
+    Raises:
+        ValueError: If the "http" field is not a dictionary.
+    """
+    http = json_message.get("http", {})
+
+    if not isinstance(http, dict):
+        raise ValueError(f"Expected http to be a dict, got {http} of type {type(http)}")
+
+    return http
+
+
+def get_auxiliary_request_type(stream: dict, http: dict) -> str:  # type: ignore
+    """
+    Determines the type of the auxiliary request based on the stream and HTTP properties.
+    """
+    return "PARENT_STREAM" if stream.get("is_substream", False) else str(http.get("type", None))
airbyte_cdk/connector_builder/test_reader/message_grouper.py
@@ -6,6 +6,7 @@
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from airbyte_cdk.connector_builder.models import (
+    AuxiliaryRequest,
     HttpRequest,
     HttpResponse,
     StreamReadPages,
@@ -24,6 +25,7 @@ from .helpers import (
     handle_current_slice,
     handle_log_message,
     handle_record_message,
+    is_async_auxiliary_request,
     is_config_update_message,
     is_log_message,
     is_record_message,
@@ -89,6 +91,7 @@ def get_message_groups(
     current_page_request: Optional[HttpRequest] = None
     current_page_response: Optional[HttpResponse] = None
     latest_state_message: Optional[Dict[str, Any]] = None
+    slice_auxiliary_requests: List[AuxiliaryRequest] = []
 
     while records_count < limit and (message := next(messages, None)):
         json_message = airbyte_message_to_json(message)
@@ -106,6 +109,7 @@ def get_message_groups(
                     current_slice_pages,
                     current_slice_descriptor,
                     latest_state_message,
+                    slice_auxiliary_requests,
                )
            current_slice_descriptor = parse_slice_description(message.log.message)  # type: ignore
            current_slice_pages = []
@@ -118,7 +122,8 @@ def get_message_groups(
                at_least_one_page_in_group,
                current_page_request,
                current_page_response,
-                log_or_auxiliary_request,
+                auxiliary_request,
+                log_message,
            ) = handle_log_message(
                message,
                json_message,
@@ -126,8 +131,15 @@ def get_message_groups(
                current_page_request,
                current_page_response,
            )
-            if log_or_auxiliary_request:
-                yield log_or_auxiliary_request
+
+            if auxiliary_request:
+                if is_async_auxiliary_request(auxiliary_request):
+                    slice_auxiliary_requests.append(auxiliary_request)
+                else:
+                    yield auxiliary_request
+
+            if log_message:
+                yield log_message
         elif is_trace_with_error(message):
             if message.trace is not None:
                 yield message.trace
@@ -157,4 +169,5 @@ def get_message_groups(
         current_slice_pages,
         current_slice_descriptor,
         latest_state_message,
+        slice_auxiliary_requests,
    )
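The net effect of the split return value: async job requests are collected per slice while every other auxiliary request is still yielded immediately. A simplified, standalone sketch of that routing rule (plain dicts stand in for AuxiliaryRequest objects; the type set mirrors ASYNC_AUXILIARY_REQUEST_TYPES shown below):

    ASYNC_TYPES = {"ASYNC_CREATE", "ASYNC_POLL", "ASYNC_ABORT", "ASYNC_DELETE"}

    slice_auxiliary_requests = []
    emitted = []
    for aux in [{"type": "AUTH"}, {"type": "ASYNC_CREATE"}, {"type": "ASYNC_POLL"}]:
        if aux["type"] in ASYNC_TYPES:
            slice_auxiliary_requests.append(aux)  # attached to the current slice
        else:
            emitted.append(aux)  # still yielded right away, as before

    print(len(slice_auxiliary_requests), len(emitted))  # 2 1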
airbyte_cdk/connector_builder/test_reader/types.py
@@ -71,5 +71,13 @@ LOG_MESSAGES_OUTPUT_TYPE = tuple[
     bool,
     HttpRequest | None,
     HttpResponse | None,
-    AuxiliaryRequest | AirbyteLogMessage | None,
+    AuxiliaryRequest | None,
+    AirbyteLogMessage | None,
+]
+
+ASYNC_AUXILIARY_REQUEST_TYPES = [
+    "ASYNC_CREATE",
+    "ASYNC_POLL",
+    "ASYNC_ABORT",
+    "ASYNC_DELETE",
 ]
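A quick check of the classification helper against the new constant. None stands in for real HttpRequest/HttpResponse objects here (dataclasses do not validate field types at runtime), and the module paths follow the files-changed list.

    from airbyte_cdk.connector_builder.models import AuxiliaryRequest
    from airbyte_cdk.connector_builder.test_reader.helpers import is_async_auxiliary_request

    aux = AuxiliaryRequest(
        title="Async job polling", type="ASYNC_POLL", description="...", request=None, response=None
    )
    is_async_auxiliary_request(aux)  # True; a type of "AUTH" would return False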
airbyte_cdk/sources/declarative/auth/token_provider.py
@@ -58,6 +58,7 @@ class SessionTokenProvider(TokenProvider):
                 "Obtains session token",
                 None,
                 is_auxiliary=True,
+                type="AUTH",
             ),
         )
         if response is None:
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
@@ -44,6 +44,7 @@ from airbyte_cdk.sources.declarative.types import ConnectionDefinition
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
+from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
     AlwaysAvailableAvailabilityStrategy,
 )
@@ -118,6 +119,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             message_repository=self.message_repository,
         )
 
+    # TODO: Remove this. This property is necessary to safely migrate Stripe during the transition state.
+    @property
+    def is_partially_declarative(self) -> bool:
+        """This flag used to avoid unexpected AbstractStreamFacade processing as concurrent streams."""
+        return False
+
     def read(
         self,
         logger: logging.Logger,
@@ -369,6 +376,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                     )
                 else:
                     synchronous_streams.append(declarative_stream)
+            # TODO: Remove this. This check is necessary to safely migrate Stripe during the transition state.
+            # Condition below needs to ensure that concurrent support is not lost for sources that already support
+            # it before migration, but now are only partially migrated to declarative implementation (e.g., Stripe).
+            elif (
+                isinstance(declarative_stream, AbstractStreamFacade)
+                and self.is_partially_declarative
+            ):
+                concurrent_streams.append(declarative_stream.get_underlying_stream())
             else:
                 synchronous_streams.append(declarative_stream)
 
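A connector that is only partially migrated to the declarative framework (the Stripe case referenced in the TODOs) would opt in by overriding the new property; a minimal sketch, assuming the usual constructor arguments are supplied as for any other ConcurrentDeclarativeSource subclass.

    from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
        ConcurrentDeclarativeSource,
    )

    class PartiallyDeclarativeSource(ConcurrentDeclarativeSource):
        @property
        def is_partially_declarative(self) -> bool:
            # AbstractStreamFacade streams keep their concurrent implementation
            # instead of falling through to the synchronous bucket.
            return True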
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
@@ -1490,7 +1490,11 @@ definitions:
       limit:
         title: Limit
         description: The maximum number of calls allowed within the interval.
-        type: integer
+        anyOf:
+          - type: integer
+          - type: string
+        interpolation_context:
+          - config
       interval:
         title: Interval
         description: The time interval for the rate limit.
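Illustrative only: with the schema change above, a manifest can interpolate the call limit from the connector config instead of hard-coding an integer. Shown as a Python dict whose keys mirror the limit/interval fields above; the config key name and interval value are made up.

    rate = {
        "limit": "{{ config['api_call_limit'] }}",  # previously had to be a plain integer
        "interval": "PT1M",  # illustrative interval value
    }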
@@ -3130,14 +3134,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.
@@ -3292,14 +3294,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.
@@ -3416,44 +3416,6 @@ definitions:
       $parameters:
         type: object
         additionalProperties: true
-  GroupingPartitionRouter:
-    title: Grouping Partition Router
-    description: >
-      A decorator on top of a partition router that groups partitions into batches of a specified size.
-      This is useful for APIs that support filtering by multiple partition keys in a single request.
-      Note that per-partition incremental syncs may not work as expected because the grouping
-      of partitions might change between syncs, potentially leading to inconsistent state tracking.
-    type: object
-    required:
-      - type
-      - group_size
-      - underlying_partition_router
-    properties:
-      type:
-        type: string
-        enum: [GroupingPartitionRouter]
-      group_size:
-        title: Group Size
-        description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
-        type: integer
-        examples:
-          - 10
-          - 50
-      underlying_partition_router:
-        title: Underlying Partition Router
-        description: The partition router whose output will be grouped. This can be any valid partition router component.
-        anyOf:
-          - "$ref": "#/definitions/CustomPartitionRouter"
-          - "$ref": "#/definitions/ListPartitionRouter"
-          - "$ref": "#/definitions/SubstreamPartitionRouter"
-      deduplicate:
-        title: Deduplicate Partitions
-        description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
-        type: boolean
-        default: true
-      $parameters:
-        type: object
-        additionalProperties: true
   WaitUntilTimeFromHeader:
     title: Wait Until Time Defined In Response Header
     description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py
@@ -5,7 +5,7 @@ import json
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from io import BufferedIOBase, TextIOWrapper
+from io import BufferedIOBase, BytesIO, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional
 
 import orjson
@@ -107,6 +107,16 @@ class CsvParser(Parser):
     encoding: Optional[str] = "utf-8"
     delimiter: Optional[str] = ","
 
+    def _get_delimiter(self) -> Optional[str]:
+        """
+        Get delimiter from the configuration. Check for the escape character and decode it.
+        """
+        if self.delimiter is not None:
+            if self.delimiter.startswith("\\"):
+                self.delimiter = self.delimiter.encode("utf-8").decode("unicode_escape")
+
+        return self.delimiter
+
     def parse(
         self,
         data: BufferedIOBase,
@@ -114,9 +124,11 @@
         """
         Parse CSV data from decompressed bytes.
         """
-        text_data = TextIOWrapper(data, encoding=self.encoding)  # type: ignore
-        reader = csv.DictReader(text_data, delimiter=self.delimiter or ",")
-        yield from reader
+        bytes_data = BytesIO(data.read())
+        text_data = TextIOWrapper(bytes_data, encoding=self.encoding)  # type: ignore
+        reader = csv.DictReader(text_data, delimiter=self._get_delimiter() or ",")
+        for row in reader:
+            yield row
 
 
 @dataclass
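A quick sketch of the escaped-delimiter handling, assuming the two dataclass fields shown above are CsvParser's only constructor arguments: a delimiter configured as the two-character string "\\t" is decoded to a real tab before parsing.

    from io import BytesIO

    from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import CsvParser

    parser = CsvParser(delimiter="\\t")  # arrives as backslash + "t"
    rows = list(parser.parse(BytesIO(b"id\tname\n1\tapple\n")))
    # rows == [{"id": "1", "name": "apple"}]; _get_delimiter() turned "\\t" into "\t"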
airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py
@@ -136,6 +136,7 @@ class ResponseToFileExtractor(RecordExtractor):
         """
 
         try:
+            # TODO: Add support for other file types, like `json`, with `pd.read_json()`
             with open(path, "r", encoding=file_encoding) as data:
                 chunks = pd.read_csv(
                     data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
@@ -95,6 +95,10 @@ class ConcurrentPerPartitionCursor(Cursor):
         # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
         self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
         self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+
+        # Parent-state tracking: store each partition’s parent state in creation order
+        self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
+
         self._finished_partitions: set[str] = set()
         self._lock = threading.Lock()
         self._timer = Timer()
@@ -155,11 +159,62 @@ class ConcurrentPerPartitionCursor(Cursor):
                 and self._semaphore_per_partition[partition_key]._value == 0
             ):
                 self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
-        self._emit_state_message()
+
+        self._check_and_update_parent_state()
+
+        self._emit_state_message()
+
+    def _check_and_update_parent_state(self) -> None:
+        """
+        Pop the leftmost partition state from _partition_parent_state_map only if
+        *all partitions* up to (and including) that partition key in _semaphore_per_partition
+        are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
+        Additionally, delete finished semaphores with a value of 0 to free up memory,
+        as they are only needed to track errors and completion status.
+        """
+        last_closed_state = None
+
+        while self._partition_parent_state_map:
+            # Look at the earliest partition key in creation order
+            earliest_key = next(iter(self._partition_parent_state_map))
+
+            # Verify ALL partitions from the left up to earliest_key are finished
+            all_left_finished = True
+            for p_key, sem in list(
+                self._semaphore_per_partition.items()
+            ):  # Use list to allow modification during iteration
+                # If any earlier partition is still not finished, we must stop
+                if p_key not in self._finished_partitions or sem._value != 0:
+                    all_left_finished = False
+                    break
+                # Once we've reached earliest_key in the semaphore order, we can stop checking
+                if p_key == earliest_key:
+                    break
+
+            # If the partitions up to earliest_key are not all finished, break the while-loop
+            if not all_left_finished:
+                break
+
+            # Pop the leftmost entry from parent-state map
+            _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
+            last_closed_state = closed_parent_state
+
+            # Clean up finished semaphores with value 0 up to and including earliest_key
+            for p_key in list(self._semaphore_per_partition.keys()):
+                sem = self._semaphore_per_partition[p_key]
+                if p_key in self._finished_partitions and sem._value == 0:
+                    del self._semaphore_per_partition[p_key]
+                    logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
+                if p_key == earliest_key:
+                    break
+
+        # Update _parent_state if we popped at least one partition
+        if last_closed_state is not None:
+            self._parent_state = last_closed_state
 
     def ensure_at_least_one_state_emitted(self) -> None:
         """
-        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
         called.
         """
         if not any(
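A standalone illustration (hypothetical partition keys and states, semaphores omitted) of the pop-from-the-left rule enforced above: parent state is only promoted once every earlier partition has fully closed, so an interrupted sync never records parent state that skips unfinished partitions.

    from collections import OrderedDict

    parent_state_map = OrderedDict(
        [("p1", {"cursor": 1}), ("p2", {"cursor": 2}), ("p3", {"cursor": 3})]
    )
    finished = {"p2", "p3"}  # p1 is still in flight
    promoted = None
    while parent_state_map:
        earliest = next(iter(parent_state_map))
        if earliest not in finished:  # an earlier partition is unfinished -> stop
            break
        _, promoted = parent_state_map.popitem(last=False)

    print(promoted)  # None: nothing is promoted until p1 closes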
@@ -201,13 +256,19 @@ class ConcurrentPerPartitionCursor(Cursor):
 
         slices = self._partition_router.stream_slices()
         self._timer.start()
-        for partition in slices:
-            yield from self._generate_slices_from_partition(partition)
+        for partition, last, parent_state in iterate_with_last_flag_and_state(
+            slices, self._partition_router.get_stream_state
+        ):
+            yield from self._generate_slices_from_partition(partition, parent_state)
 
-    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+    def _generate_slices_from_partition(
+        self, partition: StreamSlice, parent_state: Mapping[str, Any]
+    ) -> Iterable[StreamSlice]:
         # Ensure the maximum number of partitions is not exceeded
         self._ensure_partition_limit()
 
+        partition_key = self._to_partition_key(partition.partition)
+
         cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
         if not cursor:
             cursor = self._create_cursor(
@@ -216,18 +277,26 @@ class ConcurrentPerPartitionCursor(Cursor):
             )
             with self._lock:
                 self._number_of_partitions += 1
-                self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-                self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                    threading.Semaphore(0)
-                )
+                self._cursor_per_partition[partition_key] = cursor
+                self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
+
+        with self._lock:
+            if (
+                len(self._partition_parent_state_map) == 0
+                or self._partition_parent_state_map[
+                    next(reversed(self._partition_parent_state_map))
+                ]
+                != parent_state
+            ):
+                self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
 
         for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
             cursor.stream_slices(),
             lambda: None,
         ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            self._semaphore_per_partition[partition_key].release()
             if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
+                self._finished_partitions.add(partition_key)
             yield StreamSlice(
                 partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
             )
@@ -257,9 +326,9 @@ class ConcurrentPerPartitionCursor(Cursor):
         while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
             # Try removing finished partitions first
             for partition_key in list(self._cursor_per_partition.keys()):
-                if (
-                    partition_key in self._finished_partitions
-                    and self._semaphore_per_partition[partition_key]._value == 0
+                if partition_key in self._finished_partitions and (
+                    partition_key not in self._semaphore_per_partition
+                    or self._semaphore_per_partition[partition_key]._value == 0
                 ):
                     oldest_partition = self._cursor_per_partition.pop(
                         partition_key
@@ -338,9 +407,6 @@ class ConcurrentPerPartitionCursor(Cursor):
             self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                 self._create_cursor(state["cursor"])
             )
-            self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                threading.Semaphore(0)
-            )
 
             # set default state for missing partitions if it is per partition with fallback to global
             if self._GLOBAL_STATE_KEY in stream_state: