airbyte-cdk 6.41.9.dev4101__py3-none-any.whl → 6.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +25 -0
  2. airbyte_cdk/connector_builder/main.py +3 -0
  3. airbyte_cdk/models/__init__.py +0 -1
  4. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +0 -1
  5. airbyte_cdk/sources/declarative/async_job/job.py +6 -0
  6. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
  7. airbyte_cdk/sources/declarative/async_job/job_tracker.py +22 -6
  8. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -22
  9. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +71 -39
  10. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +2 -1
  11. airbyte_cdk/sources/declarative/manifest_declarative_source.py +17 -2
  12. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +48 -25
  13. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +45 -24
  14. airbyte_cdk/sources/declarative/partition_routers/__init__.py +4 -0
  15. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +150 -0
  16. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +5 -1
  17. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -17
  18. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  19. airbyte_cdk/sources/streams/concurrent/default_stream.py +0 -3
  20. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +4 -0
  21. airbyte_cdk/sources/types.py +0 -11
  22. airbyte_cdk/sources/utils/record_helper.py +1 -8
  23. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/METADATA +2 -2
  24. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/RECORD +28 -29
  25. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +0 -61
  26. airbyte_cdk/sources/utils/files_directory.py +0 -15
  27. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/LICENSE.txt +0 -0
  28. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/LICENSE_SHORT +0 -0
  29. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/WHEEL +0 -0
  30. {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/entry_points.txt +0 -0

airbyte_cdk/connector_builder/connector_builder_handler.py

@@ -117,5 +117,30 @@ def resolve_manifest(source: ManifestDeclarativeSource) -> AirbyteMessage:
         return error.as_airbyte_message()


+def full_resolve_manifest(source: ManifestDeclarativeSource) -> AirbyteMessage:
+    try:
+        manifest = {**source.resolved_manifest}
+        streams = manifest.get("streams", [])
+        for stream in streams:
+            stream["dynamic_stream_name"] = None
+        streams.extend(source.dynamic_streams)
+        manifest["streams"] = streams
+        return AirbyteMessage(
+            type=Type.RECORD,
+            record=AirbyteRecordMessage(
+                data={"manifest": manifest},
+                emitted_at=_emitted_at(),
+                stream="full_resolve_manifest",
+            ),
+        )
+    except AirbyteTracedException as exc:
+        return exc.as_airbyte_message()
+    except Exception as exc:
+        error = AirbyteTracedException.from_exception(
+            exc, message=f"Error full resolving manifest: {str(exc)}"
+        )
+        return error.as_airbyte_message()
+
+
 def _emitted_at() -> int:
     return ab_datetime_now().to_epoch_millis()
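A rough, hedged sketch of exercising the new handler in-process is shown below. The tiny manifest is a hypothetical placeholder (any valid declarative manifest would do), and it assumes the connector builder's existing convention of injecting the manifest through the `__injected_declarative_manifest` config key; `create_source` and `TestReadLimits` come from the same module. The command-level wiring for `full_resolve_manifest` appears in main.py right after this.

# Hedged usage sketch of the new full_resolve_manifest handler.
# The manifest below is a hypothetical placeholder, not part of this release,
# and may need adjustment to pass schema validation.
from airbyte_cdk.connector_builder.connector_builder_handler import (
    TestReadLimits,
    create_source,
    full_resolve_manifest,
)

manifest = {
    "version": "6.42.0",
    "type": "DeclarativeSource",
    "check": {"type": "CheckStream", "stream_names": ["items"]},
    "streams": [
        {
            "type": "DeclarativeStream",
            "name": "items",
            "retriever": {
                "type": "SimpleRetriever",
                "requester": {
                    "type": "HttpRequester",
                    "url_base": "https://api.example.com",
                    "path": "/items",
                },
                "record_selector": {
                    "type": "RecordSelector",
                    "extractor": {"type": "DpathExtractor", "field_path": []},
                },
            },
        }
    ],
}

source = create_source({"__injected_declarative_manifest": manifest}, TestReadLimits())
message = full_resolve_manifest(source)
# A RECORD message on the "full_resolve_manifest" stream: statically defined
# streams carry dynamic_stream_name=None, and streams generated from any
# dynamic_streams definitions (none here) are appended after them.
print(message.record.data["manifest"]["streams"])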

airbyte_cdk/connector_builder/main.py

@@ -12,6 +12,7 @@ from airbyte_cdk.connector import BaseConnector
 from airbyte_cdk.connector_builder.connector_builder_handler import (
     TestReadLimits,
     create_source,
+    full_resolve_manifest,
     get_limits,
     read_stream,
     resolve_manifest,
@@ -81,6 +82,8 @@ def handle_connector_builder_request(
             catalog is not None
         ), "`test_read` requires a valid `ConfiguredAirbyteCatalog`, got None."
         return read_stream(source, config, catalog, state, limits)
+    elif command == "full_resolve_manifest":
+        return full_resolve_manifest(source)
     else:
         raise ValueError(f"Unrecognized command {command}.")


airbyte_cdk/models/__init__.py

@@ -19,7 +19,6 @@ from .airbyte_protocol import (
     AirbyteMessage,
     AirbyteProtocol,
     AirbyteRecordMessage,
-    AirbyteRecordMessageFileReference,
     AirbyteStateBlob,
     AirbyteStateMessage,
     AirbyteStateStats,

airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py

@@ -150,7 +150,6 @@ class ConcurrentReadProcessor:
                 stream_name=record.stream_name,
                 data_or_message=record.data,
                 is_file_transfer_message=record.is_file_transfer_message,
-                file_reference=record.file_reference,
             )
             stream = self._stream_name_to_instance[record.stream_name]


airbyte_cdk/sources/declarative/async_job/job.py

@@ -34,6 +34,12 @@ class AsyncJob:

     def status(self) -> AsyncJobStatus:
         if self._timer.has_timed_out():
+            # TODO: we should account the fact that,
+            # certain APIs could send the `Timeout` status,
+            # thus we should not return `Timeout` in that case,
+            # but act based on the scenario.
+
+            # the default behavior is to return `Timeout` status and retry.
             return AsyncJobStatus.TIMED_OUT
         return self._status

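The comment above states that the default behavior is to return the `Timeout` status and retry; the retry budget itself is introduced in job_orchestrator.py below (a `job_max_retry` knob, with timed-out jobs now replaced like failed ones). The standalone illustration that follows mirrors only that attempt accounting, using plain strings as stand-ins for the CDK's AsyncJob objects rather than the real classes.

# Standalone illustration of the job_max_retry accounting added in
# job_orchestrator.py below. Plain strings stand in for AsyncJob instances.
from typing import Dict, Iterable, Optional


class PartitionRetryBudget:
    _DEFAULT_MAX_JOB_RETRY = 3  # same default as AsyncPartition

    def __init__(self, job_ids: Iterable[str], job_max_retry: Optional[int] = None) -> None:
        # Attempts start at 1 per job, exactly as in AsyncPartition.
        self._attempts: Dict[str, int] = {job_id: 1 for job_id in job_ids}
        self._max = job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY

    def has_reached_max_attempt(self) -> bool:
        return any(count >= self._max for count in self._attempts.values())

    def replace(self, old_id: str, new_id: str) -> None:
        count = self._attempts.pop(old_id)
        if count >= self._max:
            raise ValueError("Max attempt reached")
        self._attempts[new_id] = count + 1


budget = PartitionRetryBudget(["job-1"], job_max_retry=2)
budget.replace("job-1", "job-2")         # a FAILED or TIMED_OUT job gets one replacement
print(budget.has_reached_max_attempt())  # True: the replacement will not be retried again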

airbyte_cdk/sources/declarative/async_job/job_orchestrator.py

@@ -44,16 +44,21 @@ class AsyncPartition:
     This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs
     """

-    _MAX_NUMBER_OF_ATTEMPTS = 3
+    _DEFAULT_MAX_JOB_RETRY = 3

-    def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
+    def __init__(
+        self, jobs: List[AsyncJob], stream_slice: StreamSlice, job_max_retry: Optional[int] = None
+    ) -> None:
         self._attempts_per_job = {job: 1 for job in jobs}
         self._stream_slice = stream_slice
+        self._job_max_retry = (
+            job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY
+        )

     def has_reached_max_attempt(self) -> bool:
         return any(
             map(
-                lambda attempt_count: attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS,
+                lambda attempt_count: attempt_count >= self._job_max_retry,
                 self._attempts_per_job.values(),
             )
         )
@@ -62,7 +67,7 @@ class AsyncPartition:
         current_attempt_count = self._attempts_per_job.pop(job_to_replace, None)
         if current_attempt_count is None:
             raise ValueError("Could not find job to replace")
-        elif current_attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS:
+        elif current_attempt_count >= self._job_max_retry:
             raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")

         new_attempt_count = current_attempt_count + 1
@@ -155,6 +160,7 @@ class AsyncJobOrchestrator:
         message_repository: MessageRepository,
         exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
         has_bulk_parent: bool = False,
+        job_max_retry: Optional[int] = None,
     ) -> None:
         """
         If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
@@ -175,11 +181,12 @@ class AsyncJobOrchestrator:
         self._message_repository = message_repository
         self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
         self._has_bulk_parent = has_bulk_parent
+        self._job_max_retry = job_max_retry

         self._non_breaking_exceptions: List[Exception] = []

     def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
-        failed_status_jobs = (AsyncJobStatus.FAILED,)
+        failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
         jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
         for job in jobs_to_replace:
             new_job = self._start_job(job.job_parameters(), job.api_job_id())
@@ -214,7 +221,7 @@ class AsyncJobOrchestrator:
             for _slice in self._slice_iterator:
                 at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
                 job = self._start_job(_slice)
-                self._running_partitions.append(AsyncPartition([job], _slice))
+                self._running_partitions.append(AsyncPartition([job], _slice, self._job_max_retry))
                 if self._has_bulk_parent and self._slice_iterator.has_next():
                     break
         except ConcurrentJobLimitReached:
@@ -363,7 +370,7 @@ class AsyncJobOrchestrator:
                 self._reallocate_partition(current_running_partitions, partition)

             # We only remove completed / timeout jobs jobs as we want failed jobs to be re-allocated in priority
-            self._remove_completed_or_timed_out_jobs(partition)
+            self._remove_completed_jobs(partition)

         # update the referenced list with running partitions
         self._running_partitions = current_running_partitions
@@ -378,11 +385,7 @@ class AsyncJobOrchestrator:
     def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
         for job in partition.jobs:
             if job.status() == AsyncJobStatus.TIMED_OUT:
-                self._abort_job(job, free_job_allocation=True)
-                raise AirbyteTracedException(
-                    internal_message=f"Job {job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
-                    failure_type=FailureType.config_error,
-                )
+                self._abort_job(job, free_job_allocation=False)

     def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
         try:
@@ -392,7 +395,7 @@ class AsyncJobOrchestrator:
         except Exception as exception:
             LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")

-    def _remove_completed_or_timed_out_jobs(self, partition: AsyncPartition) -> None:
+    def _remove_completed_jobs(self, partition: AsyncPartition) -> None:
         """
         Remove completed or timed out jobs from the partition.

@@ -400,7 +403,7 @@ class AsyncJobOrchestrator:
             partition (AsyncPartition): The partition to process.
         """
         for job in partition.jobs:
-            if job.status() in [AsyncJobStatus.COMPLETED, AsyncJobStatus.TIMED_OUT]:
+            if job.status() == AsyncJobStatus.COMPLETED:
                 self._job_tracker.remove_job(job.api_job_id())

     def _reallocate_partition(
@@ -415,10 +418,7 @@ class AsyncJobOrchestrator:
             current_running_partitions (list): The list of currently running partitions.
             partition (AsyncPartition): The partition to reallocate.
         """
-        for job in partition.jobs:
-            if job.status() != AsyncJobStatus.TIMED_OUT:
-                # allow the FAILED jobs to be re-allocated for partition
-                current_running_partitions.insert(0, partition)
+        current_running_partitions.insert(0, partition)

     def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
         """

airbyte_cdk/sources/declarative/async_job/job_tracker.py

@@ -3,9 +3,11 @@
 import logging
 import threading
 import uuid
-from typing import Set
+from dataclasses import dataclass, field
+from typing import Any, Mapping, Set, Union

 from airbyte_cdk.logger import lazy_log
+from airbyte_cdk.sources.declarative.interpolation import InterpolatedString

 LOGGER = logging.getLogger("airbyte")

@@ -14,15 +16,29 @@ class ConcurrentJobLimitReached(Exception):
     pass


+@dataclass
 class JobTracker:
-    def __init__(self, limit: int):
+    limit: Union[int, str]
+    config: Mapping[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
         self._jobs: Set[str] = set()
-        if limit < 1:
+        self._lock = threading.Lock()
+        if isinstance(self.limit, str):
+            try:
+                self.limit = int(
+                    InterpolatedString(self.limit, parameters={}).eval(config=self.config)
+                )
+            except Exception as e:
+                LOGGER.warning(
+                    f"Error interpolating max job count: {self.limit}. Setting to 1. {e}"
+                )
+                self.limit = 1
+        if self.limit < 1:
             LOGGER.warning(
-                f"The `max_concurrent_async_job_count` property is less than 1: {limit}. Setting to 1. Please update the source manifest to set a valid value."
+                f"The `max_concurrent_async_job_count` property is less than 1: {self.limit}. Setting to 1. Please update the source manifest to set a valid value."
             )
-        self._limit = 1 if limit < 1 else limit
-        self._lock = threading.Lock()
+        self._limit = self.limit if self.limit >= 1 else 1

     def try_to_get_intent(self) -> str:
         lazy_log(
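With the dataclass rewrite above, the job limit can be given either as an integer or as an interpolated string that is resolved against the connector config, falling back to 1 with a warning when interpolation fails or the value is below 1. A small sketch:

# Sketch: JobTracker now accepts an interpolated limit resolved from the config.
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker

tracker = JobTracker(
    limit="{{ config['max_concurrent_async_job_count'] }}",
    config={"max_concurrent_async_job_count": 5},
)
intent = tracker.try_to_get_intent()  # reserves one of the 5 concurrent job slots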

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

@@ -25,7 +25,6 @@ from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
     PerPartitionWithGlobalCursor,
 )
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
-from airbyte_cdk.sources.declarative.models import FileUploader
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ConcurrencyLevel as ConcurrencyLevelModel,
 )
@@ -207,20 +206,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
            # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
            # so we need to treat them as synchronous

-            file_uploader = None
-            if isinstance(declarative_stream, DeclarativeStream):
-                file_uploader = (
-                    self._constructor.create_component(
-                        model_type=FileUploader,
-                        component_definition=name_to_stream_mapping[declarative_stream.name][
-                            "file_uploader"
-                        ],
-                        config=config,
-                    )
-                    if "file_uploader" in name_to_stream_mapping[declarative_stream.name]
-                    else None
-                )
-
            if (
                isinstance(declarative_stream, DeclarativeStream)
                and name_to_stream_mapping[declarative_stream.name]["type"]
@@ -288,7 +273,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        declarative_stream.get_json_schema(),
                        retriever,
                        self.message_repository,
-                        file_uploader,
                    ),
                    stream_slicer=declarative_stream.retriever.stream_slicer,
                )
@@ -319,7 +303,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        declarative_stream.get_json_schema(),
                        retriever,
                        self.message_repository,
-                        file_uploader,
                    ),
                    stream_slicer=cursor,
                )
@@ -339,7 +322,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        else None,
                        logger=self.logger,
                        cursor=cursor,
-                        supports_file_transfer=bool(file_uploader),
                    )
                )
            elif (
@@ -351,7 +333,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        declarative_stream.get_json_schema(),
                        declarative_stream.retriever,
                        self.message_repository,
-                        file_uploader,
                    ),
                    declarative_stream.retriever.stream_slicer,
                )
@@ -372,7 +353,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        cursor_field=None,
                        logger=self.logger,
                        cursor=final_state_cursor,
-                        supports_file_transfer=bool(file_uploader),
                    )
                )
            elif (
@@ -412,7 +392,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        declarative_stream.get_json_schema(),
                        retriever,
                        self.message_repository,
-                        file_uploader,
                    ),
                    perpartition_cursor,
                )
@@ -427,7 +406,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
                        logger=self.logger,
                        cursor=perpartition_cursor,
-                        supports_file_transfer=bool(file_uploader),
                    )
                )
            else:

airbyte_cdk/sources/declarative/declarative_component_schema.yaml

@@ -47,7 +47,12 @@ properties:
   max_concurrent_async_job_count:
     title: Maximum Concurrent Asynchronous Jobs
     description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.
-    type: integer
+    type:
+      - integer
+      - string
+    examples:
+      - 3
+      - "{{ config['max_concurrent_async_job_count'] }}"
   metadata:
     type: object
     description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.
@@ -1422,33 +1427,6 @@ definitions:
           - "$ref": "#/definitions/LegacyToPerPartitionStateMigration"
           - "$ref": "#/definitions/CustomStateMigration"
         default: []
-      file_uploader:
-        title: File Uploader
-        description: (experimental) Describes how to fetch a file
-        type: object
-        required:
-          - type
-          - requester
-          - download_target_extractor
-        properties:
-          type:
-            type: string
-            enum: [ FileUploader ]
-          requester:
-            description: Requester component that describes how to prepare HTTP requests to send to the source API.
-            anyOf:
-              - "$ref": "#/definitions/CustomRequester"
-              - "$ref": "#/definitions/HttpRequester"
-          download_target_extractor:
-            description: Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response
-            anyOf:
-              - "$ref": "#/definitions/CustomRecordExtractor"
-              - "$ref": "#/definitions/DpathExtractor"
-          file_extractor:
-            description: Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content
-            anyOf:
-              - "$ref": "#/definitions/CustomRecordExtractor"
-              - "$ref": "#/definitions/DpathExtractor"
       $parameters:
         type: object
         additional_properties: true
@@ -2219,7 +2197,8 @@ definitions:
        type: object
        additionalProperties: true
    JsonDecoder:
-      title: Json Decoder
+      title: JSON
+      description: Select 'JSON' if the response is formatted as a JSON object.
      type: object
      required:
        - type
@@ -2228,8 +2207,8 @@ definitions:
          type: string
          enum: [JsonDecoder]
    JsonlDecoder:
-      title: JSONL Decoder
-      description: Use this if the response consists of JSON objects separated by new lines (`\n`) in JSONL format.
+      title: JSON Lines
+      description: Select 'JSON Lines' if the response consists of JSON objects separated by new lines ('\n') in JSONL format.
      type: object
      required:
        - type
@@ -2354,8 +2333,8 @@ definitions:
      type: object
      additionalProperties: true
    IterableDecoder:
-      title: Iterable Decoder
-      description: Use this if the response consists of strings separated by new lines (`\n`). The Decoder will wrap each row into a JSON object with the `record` key.
+      title: Iterable
+      description: Select 'Iterable' if the response consists of strings separated by new lines (`\n`). The string will then be wrapped into a JSON object with the `record` key.
      type: object
      required:
        - type
@@ -2364,8 +2343,8 @@ definitions:
          type: string
          enum: [IterableDecoder]
    XmlDecoder:
-      title: XML Decoder
-      description: Use this if the response is XML.
+      title: XML
+      description: Select 'XML' if the response consists of XML-formatted data.
      type: object
      required:
        - type
@@ -2396,8 +2375,8 @@ definitions:
      type: object
      additionalProperties: true
    ZipfileDecoder:
-      title: Zipfile Decoder
-      description: Decoder for response data that is returned as zipfile(s).
+      title: ZIP File
+      description: Select 'ZIP file' for response data that is returned as a zipfile. Requires specifying an inner data type/decoder to parse the unzipped data.
      type: object
      additionalProperties: true
      required:
@@ -2921,7 +2900,7 @@ definitions:
        title: Lazy Read Pointer
        description: If set, this will enable lazy reading, using the initial read of parent records to extract child records.
        type: array
-        default: [ ]
+        default: []
        items:
          - type: string
            interpolation_context:
@@ -3226,7 +3205,7 @@ definitions:
      properties:
        type:
          type: string
-          enum: [ StateDelegatingStream ]
+          enum: [StateDelegatingStream]
        name:
          title: Name
          description: The stream name.
@@ -3281,12 +3260,14 @@ definitions:
          - "$ref": "#/definitions/CustomPartitionRouter"
          - "$ref": "#/definitions/ListPartitionRouter"
          - "$ref": "#/definitions/SubstreamPartitionRouter"
+          - "$ref": "#/definitions/GroupingPartitionRouter"
          - type: array
            items:
              anyOf:
                - "$ref": "#/definitions/CustomPartitionRouter"
                - "$ref": "#/definitions/ListPartitionRouter"
                - "$ref": "#/definitions/SubstreamPartitionRouter"
+                - "$ref": "#/definitions/GroupingPartitionRouter"
      decoder:
        title: Decoder
        description: Component decoding the response so records can be extracted.
@@ -3303,6 +3284,8 @@ definitions:
      type: object
      additionalProperties: true
    GzipDecoder:
+      title: gzip
+      description: Select 'gzip' for response data that is compressed with gzip. Requires specifying an inner data type/decoder to parse the decompressed data.
      type: object
      required:
        - type
@@ -3318,6 +3301,8 @@ definitions:
          - "$ref": "#/definitions/JsonDecoder"
          - "$ref": "#/definitions/JsonlDecoder"
    CsvDecoder:
+      title: CSV
+      description: "Select 'CSV' for response data that is formatted as CSV (comma-separated values). Can specify an encoding (default: 'utf-8') and a delimiter (default: ',')."
      type: object
      required:
        - type
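The new gzip and CSV descriptions above both point at nested decoding. As an illustration, a hypothetical decoder configuration for a gzip-compressed JSON Lines response might look like the fragment below, shown as the Python dict the manifest parses into; the nested field name `decoder` is an assumption based on the published schema and is not shown in this diff.

# Hypothetical decoder fragment: gzip-compressed JSON Lines payload.
# The nested "decoder" key is assumed; verify against your
# declarative_component_schema.yaml version before using it.
decoder_config = {
    "type": "GzipDecoder",
    "decoder": {"type": "JsonlDecoder"},
}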
@@ -3448,12 +3433,14 @@ definitions:
          - "$ref": "#/definitions/CustomPartitionRouter"
          - "$ref": "#/definitions/ListPartitionRouter"
          - "$ref": "#/definitions/SubstreamPartitionRouter"
+          - "$ref": "#/definitions/GroupingPartitionRouter"
          - type: array
            items:
              anyOf:
                - "$ref": "#/definitions/CustomPartitionRouter"
                - "$ref": "#/definitions/ListPartitionRouter"
                - "$ref": "#/definitions/SubstreamPartitionRouter"
+                - "$ref": "#/definitions/GroupingPartitionRouter"
      decoder:
        title: Decoder
        description: Component decoding the response so records can be extracted.
@@ -3570,6 +3557,44 @@ definitions:
      $parameters:
        type: object
        additionalProperties: true
+    GroupingPartitionRouter:
+      title: Grouping Partition Router
+      description: >
+        A decorator on top of a partition router that groups partitions into batches of a specified size.
+        This is useful for APIs that support filtering by multiple partition keys in a single request.
+        Note that per-partition incremental syncs may not work as expected because the grouping
+        of partitions might change between syncs, potentially leading to inconsistent state tracking.
+      type: object
+      required:
+        - type
+        - group_size
+        - underlying_partition_router
+      properties:
+        type:
+          type: string
+          enum: [GroupingPartitionRouter]
+        group_size:
+          title: Group Size
+          description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
+          type: integer
+          examples:
+            - 10
+            - 50
+        underlying_partition_router:
+          title: Underlying Partition Router
+          description: The partition router whose output will be grouped. This can be any valid partition router component.
+          anyOf:
+            - "$ref": "#/definitions/CustomPartitionRouter"
+            - "$ref": "#/definitions/ListPartitionRouter"
+            - "$ref": "#/definitions/SubstreamPartitionRouter"
+        deduplicate:
+          title: Deduplicate Partitions
+          description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
+          type: boolean
+          default: true
+      $parameters:
+        type: object
+        additionalProperties: true
    WaitUntilTimeFromHeader:
      title: Wait Until Time Defined In Response Header
      description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
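To make the new component concrete, a hypothetical `partition_router` fragment that batches list partitions into groups of 10 is sketched below, as the Python dict form of a manifest fragment; the cursor field and values are placeholders. Each emitted slice would then carry up to `group_size` partition values from the underlying router.

# Hypothetical manifest fragment using the new GroupingPartitionRouter.
partition_router = {
    "type": "GroupingPartitionRouter",
    "group_size": 10,        # up to 10 partition values per slice
    "deduplicate": True,     # drop duplicate partition values within a group
    "underlying_partition_router": {
        "type": "ListPartitionRouter",
        "cursor_field": "account_id",
        "values": ["1001", "1002", "1003"],
    },
}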
@@ -3741,6 +3766,13 @@ definitions:
        type:
          type: string
          enum: [DynamicDeclarativeStream]
+        name:
+          title: Name
+          description: The dynamic stream name.
+          type: string
+          default: ""
+          example:
+            - "Tables"
        stream_template:
          title: Stream Template
          description: Reference to the stream template.

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

@@ -79,6 +79,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         connector_state_manager: ConnectorStateManager,
         connector_state_converter: AbstractStreamStateConverter,
         cursor_field: CursorField,
+        use_global_cursor: bool = False,
     ) -> None:
         self._global_cursor: Optional[StreamState] = {}
         self._stream_name = stream_name
@@ -106,7 +107,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         self._lookback_window: int = 0
         self._parent_state: Optional[StreamState] = None
         self._number_of_partitions: int = 0
-        self._use_global_cursor: bool = False
+        self._use_global_cursor: bool = use_global_cursor
         self._partition_serializer = PerPartitionKeySerializer()
         # Track the last time a state message was emitted
         self._last_emission_time: float = 0.0

airbyte_cdk/sources/declarative/manifest_declarative_source.py

@@ -106,6 +106,7 @@ class ManifestDeclarativeSource(DeclarativeSource):
             AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger()
         )

+        self._config = config or {}
         self._validate_source()

     @property
@@ -116,6 +117,12 @@ class ManifestDeclarativeSource(DeclarativeSource):
     def message_repository(self) -> MessageRepository:
         return self._message_repository

+    @property
+    def dynamic_streams(self) -> List[Dict[str, Any]]:
+        return self._dynamic_stream_configs(
+            manifest=self._source_config, config=self._config, with_dynamic_stream_name=True
+        )
+
     @property
     def connection_checker(self) -> ConnectionChecker:
         check = self._source_config["check"]
@@ -348,13 +355,16 @@ class ManifestDeclarativeSource(DeclarativeSource):
         return stream_configs

     def _dynamic_stream_configs(
-        self, manifest: Mapping[str, Any], config: Mapping[str, Any]
+        self,
+        manifest: Mapping[str, Any],
+        config: Mapping[str, Any],
+        with_dynamic_stream_name: Optional[bool] = None,
     ) -> List[Dict[str, Any]]:
         dynamic_stream_definitions: List[Dict[str, Any]] = manifest.get("dynamic_streams", [])
         dynamic_stream_configs: List[Dict[str, Any]] = []
         seen_dynamic_streams: Set[str] = set()

-        for dynamic_definition in dynamic_stream_definitions:
+        for dynamic_definition_index, dynamic_definition in enumerate(dynamic_stream_definitions):
             components_resolver_config = dynamic_definition["components_resolver"]

             if not components_resolver_config:
@@ -393,6 +403,11 @@ class ManifestDeclarativeSource(DeclarativeSource):
                 # Ensure that each stream is created with a unique name
                 name = dynamic_stream.get("name")

+                if with_dynamic_stream_name:
+                    dynamic_stream["dynamic_stream_name"] = dynamic_definition.get(
+                        "name", f"dynamic_stream_{dynamic_definition_index}"
+                    )
+
                 if not isinstance(name, str):
                     raise ValueError(
                         f"Expected stream name {name} to be a string, got {type(name)}."
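The tagging added above means every generated stream config records which `dynamic_streams` definition produced it, falling back to `dynamic_stream_` plus the definition's index when the definition is unnamed (matching the new optional `name` field added to DynamicDeclarativeStream in the schema). The fallback can be seen in isolation below; the definitions are placeholders and only the `.get(...)` call mirrors the code above.

# Standalone illustration of the dynamic_stream_name fallback used above.
dynamic_stream_definitions = [
    {"name": "Tables", "components_resolver": {"placeholder": True}},
    {"components_resolver": {"placeholder": True}},  # unnamed definition
]
for index, definition in enumerate(dynamic_stream_definitions):
    print(definition.get("name", f"dynamic_stream_{index}"))
# -> Tables
# -> dynamic_stream_1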